/* * Copyright (C) 2000 Helix Code Inc. * * Authors: Michael Zucchi <notzed@helixcode.com> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU Library General Public License * as published by the Free Software Foundation; either version 2 of * the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Library General Public License for more details. * * You should have received a copy of the GNU Library General Public * License along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ /* What should hopefully be a fast mail parser */ /* Do not change this code without asking me (Michael Zucchi) first There is almost always a reason something was done a certain way. */ #include <sys/types.h> #include <sys/stat.h> #include <fcntl.h> #include <unistd.h> #include <string.h> #include <stdio.h> #include <errno.h> #include <unicode.h> #include <regex.h> #include <ctype.h> #include <glib.h> #include "camel-mime-parser.h" #include "camel-mime-utils.h" #include "camel-mime-filter.h" #include "camel-stream.h" #include "camel-seekable-stream.h" #define r(x) #define h(x) #define c(x) #define d(x) /*#define PURIFY*/ #define MEMPOOL #define STRUCT_ALIGN 4 #ifdef PURIFY int inend_id = -1, inbuffer_id = -1; #endif #if 0 extern int strdup_count; extern int malloc_count; extern int free_count; #define g_strdup(x) (strdup_count++, g_strdup(x)) #define g_malloc(x) (malloc_count++, g_malloc(x)) #define g_free(x) (free_count++, g_free(x)) #endif #ifdef MEMPOOL typedef struct _MemPoolNode { struct _MemPoolNode *next; int free; char data[1]; } MemPoolNode; typedef struct _MemPoolThresholdNode { struct _MemPoolThresholdNode *next; char data[1]; } MemPoolThresholdNode; typedef struct 
_MemPool { int blocksize; int threshold; struct _MemPoolNode *blocks; struct _MemPoolThresholdNode *threshold_blocks; } MemPool; MemPool *mempool_new(int blocksize, int threshold); void *mempool_alloc(MemPool *pool, int size); void mempool_flush(MemPool *pool, int freeall); void mempool_free(MemPool *pool); MemPool *mempool_new(int blocksize, int threshold) { MemPool *pool; pool = g_malloc(sizeof(*pool)); if (threshold >= blocksize) threshold = blocksize * 2 / 3; pool->blocksize = blocksize; pool->threshold = threshold; pool->blocks = NULL; pool->threshold_blocks = NULL; return pool; } void *mempool_alloc(MemPool *pool, int size) { size = (size + STRUCT_ALIGN) & (~(STRUCT_ALIGN-1)); if (size>=pool->threshold) { MemPoolThresholdNode *n; n = g_malloc(sizeof(*n) - sizeof(char) + size); n->next = pool->threshold_blocks; pool->threshold_blocks = n; return &n->data[0]; } else { MemPoolNode *n; n = pool->blocks; while (n) { if (n->free >= size) { n->free -= size; return &n->data[n->free]; } n = n->next; } n = g_malloc(sizeof(*n) - sizeof(char) + pool->blocksize); n->next = pool->blocks; pool->blocks = n; n->free = pool->blocksize - size; return &n->data[n->free]; } } void mempool_flush(MemPool *pool, int freeall) { MemPoolThresholdNode *tn, *tw; MemPoolNode *pw, *pn; tw = pool->threshold_blocks; while (tw) { tn = tw->next; g_free(tw); tw = tn; } pool->threshold_blocks = NULL; if (freeall) { pw = pool->blocks; while (pw) { pn = pw->next; g_free(pw); pw = pn; } pool->blocks = NULL; } else { pw = pool->blocks; while (pw) { pw->free = pool->blocksize; pw = pw->next; } } } void mempool_free(MemPool *pool) { if (pool) { mempool_flush(pool, 1); g_free(pool); } } #endif #define SCAN_BUF 4096 /* size of read buffer */ #define SCAN_HEAD 128 /* headroom guaranteed to be before each read buffer */ /* a little hacky, but i couldn't be bothered renaming everything */ #define _header_scan_state _CamelMimeParserPrivate #define _PRIVATE(o) (((CamelMimeParser *)(o))->priv) struct 
_header_scan_state { /* global state */ enum _header_state state; /* for building headers during scanning */ char *outbuf; char *outptr; char *outend; int fd; /* input for a fd input */ CamelStream *stream; /* or for a stream */ /* for scanning input buffers */ char *realbuf; /* the real buffer, SCAN_HEAD*2 + SCAN_BUF bytes */ char *inbuf; /* points to a subset of the allocated memory, the underflow */ char *inptr; /* (upto SCAN_HEAD) is for use by filters so they dont copy all data */ char *inend; int atleast; int seek; /* current offset to start of buffer */ int unstep; /* how many states to 'unstep' (repeat the current state) */ unsigned int midline:1; /* are we mid-line interrupted? */ unsigned int scan_from:1; /* do we care about From lines? */ unsigned int scan_pre_from:1; /* do we return pre-from data? */ int start_of_from; /* where from started */ int start_of_headers; /* where headers started from the last scan */ int header_start; /* start of last header, or -1 */ /* filters to apply to all content before output */ int filterid; /* id of next filter */ struct _header_scan_filter *filters; /* per message/part info */ struct _header_scan_stack *parts; }; struct _header_scan_stack { struct _header_scan_stack *parent; enum _header_state savestate; /* state at invocation of this part */ #ifdef MEMPOOL MemPool *pool; /* memory pool to keep track of headers/etc at this level */ #endif struct _header_raw *headers; /* headers for this part */ struct _header_content_type *content_type; /* I dont use GString's casue you can't efficiently append a buffer to them */ GByteArray *pretext; /* for multipart types, save the pre-boundary data here */ GByteArray *posttext; /* for multipart types, save the post-boundary data here */ int prestage; /* used to determine if it is a pre-boundary or post-boundary data segment */ GByteArray *from_line; /* the from line */ char *boundary; /* for multipart/ * boundaries, including leading -- and trailing -- for the final part */ int 
boundarylen; /* actual length of boundary, including leading -- if there is one */ int boundarylenfinal; /* length of boundary, including trailing -- if there is one */ int atleast; /* the biggest boundary from here to the parent */ }; struct _header_scan_filter { struct _header_scan_filter *next; int id; CamelMimeFilter *filter; }; static void folder_scan_step(struct _header_scan_state *s, char **databuffer, int *datalength); static void folder_scan_drop_step(struct _header_scan_state *s); static int folder_scan_init_with_fd(struct _header_scan_state *s, int fd); static int folder_scan_init_with_stream(struct _header_scan_state *s, CamelStream *stream); static struct _header_scan_state *folder_scan_init(void); static void folder_scan_close(struct _header_scan_state *s); static struct _header_scan_stack *folder_scan_content(struct _header_scan_state *s, int *lastone, char **data, int *length); static struct _header_scan_stack *folder_scan_header(struct _header_scan_state *s, int *lastone); static int folder_scan_skip_line(struct _header_scan_state *s, GByteArray *save); static off_t folder_seek(struct _header_scan_state *s, off_t offset, int whence); static off_t folder_tell(struct _header_scan_state *s); static int folder_read(struct _header_scan_state *s); #ifdef MEMPOOL static void header_append_mempool(struct _header_scan_state *s, struct _header_scan_stack *h, char *header, int offset); #endif static void camel_mime_parser_class_init (CamelMimeParserClass *klass); static void camel_mime_parser_init (CamelMimeParser *obj); static char *states[] = { "HSCAN_INITIAL", "HSCAN_PRE_FROM", /* pre-from data */ "HSCAN_FROM", /* got 'From' line */ "HSCAN_HEADER", /* toplevel header */ "HSCAN_BODY", /* scanning body of message */ "HSCAN_MULTIPART", /* got multipart header */ "HSCAN_MESSAGE", /* rfc822/news message */ "HSCAN_PART", /* part of a multipart */ "HSCAN_EOF", /* end of file */ "HSCAN_PRE_FROM_END", "HSCAN_FROM_END", "HSCAN_HEAER_END", "HSCAN_BODY_END", 
"HSCAN_MULTIPART_END", "HSCAN_MESSAGE_END", }; static CamelObjectClass *camel_mime_parser_parent; static void camel_mime_parser_class_init (CamelMimeParserClass *klass) { camel_mime_parser_parent = camel_type_get_global_classfuncs (camel_object_get_type ()); } static void camel_mime_parser_init (CamelMimeParser *obj) { struct _header_scan_state *s; s = folder_scan_init(); _PRIVATE(obj) = s; } static void camel_mime_parser_finalise(CamelObject *o) { struct _header_scan_state *s = _PRIVATE(o); #ifdef PURIFY purify_watch_remove_all(); #endif folder_scan_close(s); } CamelType camel_mime_parser_get_type (void) { static CamelType type = CAMEL_INVALID_TYPE; if (type == CAMEL_INVALID_TYPE) { type = camel_type_register (camel_object_get_type (), "CamelMimeParser", sizeof (CamelMimeParser), sizeof (CamelMimeParserClass), (CamelObjectClassInitFunc) camel_mime_parser_class_init, NULL, (CamelObjectInitFunc) camel_mime_parser_init, (CamelObjectFinalizeFunc) camel_mime_parser_finalise); } return type; } /** * camel_mime_parser_new: * * Create a new CamelMimeParser object. * * Return value: A new CamelMimeParser widget. **/ CamelMimeParser * camel_mime_parser_new (void) { CamelMimeParser *new = CAMEL_MIME_PARSER ( camel_object_new (camel_mime_parser_get_type ())); return new; } /** * camel_mime_parser_filter_add: * @m: * @mf: * * Add a filter that will be applied to any body content before it is passed * to the caller. Filters may be pipelined to perform multi-pass operations * on the content, and are applied in the order they were added. * * Note that filters are only applied to the body content of messages, and once * a filter has been set, all content returned by a filter_step() with a state * of HSCAN_BODY will have passed through the filter. * * Return value: An id that may be passed to filter_remove() to remove * the filter, or -1 if the operation failed. 
**/ int camel_mime_parser_filter_add(CamelMimeParser *m, CamelMimeFilter *mf) { struct _header_scan_state *s = _PRIVATE(m); struct _header_scan_filter *f, *new; new = g_malloc(sizeof(*new)); new->filter = mf; new->id = s->filterid++; if (s->filterid == -1) s->filterid++; new->next = 0; camel_object_ref((CamelObject *)mf); /* yes, this is correct, since 'next' is the first element of the struct */ f = (struct _header_scan_filter *)&s->filters; while (f->next) f = f->next; f->next = new; return new->id; } /** * camel_mime_parser_filter_remove: * @m: * @id: * * Remove a processing filter from the pipeline. There is no * restriction on the order the filters can be removed. **/ void camel_mime_parser_filter_remove(CamelMimeParser *m, int id) { struct _header_scan_state *s = _PRIVATE(m); struct _header_scan_filter *f, *old; f = (struct _header_scan_filter *)&s->filters; while (f && f->next) { old = f->next; if (old->id == id) { camel_object_unref((CamelObject *)old->filter); f->next = old->next; g_free(old); /* there should only be a single matching id, but scan the whole lot anyway */ } f = f->next; } } /** * camel_mime_parser_header: * @m: * @name: Name of header. * @offset: Pointer that can receive the offset of the header in * the stream from the start of parsing. * * Lookup a header by name. * * Return value: The header value, or NULL if the header is not * defined. **/ const char * camel_mime_parser_header(CamelMimeParser *m, const char *name, int *offset) { struct _header_scan_state *s = _PRIVATE(m); if (s->parts && s->parts->headers) { return header_raw_find(&s->parts->headers, name, offset); } return NULL; } /** * camel_mime_parser_headers_raw: * @m: * * Get the list of the raw headers which are defined for the * current state of the parser. These headers are valid * until the next call to parser_step(), or parser_drop_step(). * * Return value: The raw headers, or NULL if there are no headers * defined for the current part or state. These are READ ONLY. 
**/ struct _header_raw * camel_mime_parser_headers_raw(CamelMimeParser *m) { struct _header_scan_state *s = _PRIVATE(m); if (s->parts) return s->parts->headers; return NULL; } static const char * byte_array_to_string(GByteArray *array) { if (array == NULL) return NULL; if (array->len == 0 || array->data[array->len-1] != '\0') g_byte_array_append(array, "", 1); return array->data; } /** * camel_mime_parser_preface: * @m: * * Retrieve the preface text for the current multipart. * Can only be used when the state is HSCAN_MULTIPART_END. * * Return value: The preface text, or NULL if there wasn't any. **/ const char * camel_mime_parser_preface(CamelMimeParser *m) { struct _header_scan_state *s = _PRIVATE(m); if (s->parts) return byte_array_to_string(s->parts->pretext); return NULL; } /** * camel_mime_parser_postface: * @m: * * Retrieve the postface text for the current multipart. * Only returns valid data when the current state if * HSCAN_MULTIPART_END. * * Return value: The postface text, or NULL if there wasn't any. **/ const char * camel_mime_parser_postface(CamelMimeParser *m) { struct _header_scan_state *s = _PRIVATE(m); if (s->parts) return byte_array_to_string(s->parts->posttext); return NULL; } /** * camel_mime_parser_from_line: * @m: * * Get the last scanned "From " line, from a recently scanned from. * This should only be called in the HSCAN_FROM state. The * from line will include the closing \n found (if there was one). * * The return value will remain valid while in the HSCAN_FROM * state, or any deeper state. * * Return value: The From line, or NULL if called out of context. **/ const char * camel_mime_parser_from_line(CamelMimeParser *m) { struct _header_scan_state *s = _PRIVATE(m); if (s->parts) return byte_array_to_string(s->parts->from_line); return NULL; } /** * camel_mime_parser_init_with_fd: * @m: * @fd: A valid file descriptor. * * Initialise the scanner with an fd. 
The scanner's offsets * will be relative to the current file position of the file * descriptor. As a result, seekable descritors should * be seeked using the parser seek functions. * * An initial buffer will be read from the file descriptor * immediately, although no parsing will occur. * * Return value: Returns -1 on error. **/ int camel_mime_parser_init_with_fd(CamelMimeParser *m, int fd) { struct _header_scan_state *s = _PRIVATE(m); return folder_scan_init_with_fd(s, fd); } /** * camel_mime_parser_init_with_stream: * @m: * @stream: * * Initialise the scanner with a source stream. The scanner's * offsets will be relative to the current file position of * the stream. As a result, seekable streams should only * be seeked using the parser seek function. * * An initial buffer will be read from the stream * immediately, although no parsing will occur. * * Return value: -1 on error. **/ int camel_mime_parser_init_with_stream(CamelMimeParser *m, CamelStream *stream) { struct _header_scan_state *s = _PRIVATE(m); return folder_scan_init_with_stream(s, stream); } /** * camel_mime_parser_scan_from: * @m: * @scan_from: #TRUE if the scanner should scan From lines. * * Tell the scanner if it should scan "^From " lines or not. * * If the scanner is scanning from lines, two additional * states HSCAN_FROM and HSCAN_FROM_END will be returned * to the caller during parsing. * * This may also be preceeded by an optional * HSCAN_PRE_FROM state which contains the scanned data * found before the From line is encountered. See also * scan_pre_from(). **/ void camel_mime_parser_scan_from(CamelMimeParser *m, int scan_from) { struct _header_scan_state *s = _PRIVATE(m); s->scan_from = scan_from; } /** * camel_mime_parser_scan_pre_from: * @: * @scan_pre_from: #TRUE if we want to get pre-from data. * * Tell the scanner whether we want to know abou the pre-from * data during a scan. If we do, then we may get an additional * state HSCAN_PRE_FROM which returns the specified data. 
**/ void camel_mime_parser_scan_pre_from(CamelMimeParser *m, int scan_pre_from) { struct _header_scan_state *s = _PRIVATE(m); s->scan_pre_from = scan_pre_from; } /** * camel_mime_parser_content_type: * @m: * * Get the content type defined in the current part. * * Return value: A content_type structure, or NULL if there * is no content-type defined for this part of state of the * parser. **/ struct _header_content_type * camel_mime_parser_content_type(CamelMimeParser *m) { struct _header_scan_state *s = _PRIVATE(m); /* FIXME: should this search up until it's found the 'right' content-type? can it? */ if (s->parts) return s->parts->content_type; return NULL; } /** * camel_mime_parser_unstep: * @m: * * Cause the last step operation to repeat itself. If this is * called repeated times, then the same step will be repeated * that many times. * * Note that it is not possible to scan back using this function, * only to have a way of peeking the next state. **/ void camel_mime_parser_unstep(CamelMimeParser *m) { struct _header_scan_state *s = _PRIVATE(m); s->unstep++; } /** * camel_mime_parser_drop_step: * @m: * * Drop the last step call. This should only be used * in conjunction with seeking of the stream as the * stream may be in an undefined state relative to the * state of the parser. * * Use this call with care. **/ void camel_mime_parser_drop_step(CamelMimeParser *m) { struct _header_scan_state *s = _PRIVATE(m); s->unstep = 0; folder_scan_drop_step(s); } /** * camel_mime_parser_step: * @m: * @databuffer: Pointer to accept a pointer to the data * associated with this step (if any). May be #NULL, * in which case datalength is also ingored. * @datalength: Pointer to accept a pointer to the data * length associated with this step (if any). * * Parse the next part of the MIME message. If _unstep() * has been called, then continue to return the same state * for that many calls. 
*
 * If the step is HSCAN_BODY then the databuffer and datalength
 * pointers will be setup to point to the internal data buffer
 * of the scanner and may be processed as required.  Any
 * filters will have already been applied to this data.
 *
 * Refer to the state diagram elsewhere for a full listing of
 * the states an application is guaranteed to get from the
 * scanner.
 *
 * Return value: The current new state of the parser
 * is returned.
 **/
enum _header_state
camel_mime_parser_step(CamelMimeParser *m, char **databuffer, int *datalength)
{
	struct _header_scan_state *s = _PRIVATE(m);

	d(printf("OLD STATE: '%s' :\n", states[s->state]));

	if (s->unstep <= 0) {
		char *dummy;
		int dummylength;

		/* the scanner always writes through both pointers, so give it
		   somewhere harmless to write when the caller is not interested */
		if (databuffer == NULL) {
			databuffer = &dummy;
			datalength = &dummylength;
		}
		/* a caller may also ask for the data pointer only */
		if (datalength == NULL)
			datalength = &dummylength;

		folder_scan_step(s, databuffer, datalength);
	} else
		/* replay the current state; the out parameters are left untouched */
		s->unstep--;

	d(printf("NEW STATE: '%s' :\n", states[s->state]));

	return s->state;
}

/**
 * camel_mime_parser_read:
 * @m:
 * @databuffer:
 * @len:
 *
 * Read at most @len bytes from the internal mime parser buffer.
 *
 * Returns the address of the internal buffer in @databuffer,
 * and the length of useful data.
 *
 * @len may be specified as INT_MAX, in which case you will
 * get the full remainder of the buffer at each call.
 *
 * Note that no parsing of the data read through this function
 * occurs, so no state changes occur, but the seek position
 * is updated appropriately.
 *
 * Return value: The number of bytes available, or -1 on error.
**/ int camel_mime_parser_read(CamelMimeParser *m, const char **databuffer, int len) { struct _header_scan_state *s = _PRIVATE(m); int there; if (len == 0) return 0; d(printf("parser::read() reading %d bytes\n", len)); there = MIN(s->inend - s->inptr, len); d(printf("parser::read() there = %d bytes\n", there)); if (there > 0) { *databuffer = s->inptr; s->inptr += there; return there; } if (folder_read(s) == -1) return -1; there = MIN(s->inend - s->inptr, len); d(printf("parser::read() had to re-read, now there = %d bytes\n", there)); *databuffer = s->inptr; s->inptr += there; return there; } /** * camel_mime_parser_tell: * @m: * * Return the current scanning offset. The meaning of this * value will depend on the current state of the parser. * * An incomplete listing of the states: * * HSCAN_INITIAL, The start of the current message. * HSCAN_HEADER, HSCAN_MESSAGE, HSCAN_MULTIPART, the character * position immediately after the end of the header. * HSCAN_BODY, Position within the message of the start * of the current data block. * HSCAN_*_END, The position of the character starting * the next section of the scan (the last position + 1 of * the respective current state). * * Return value: See above. **/ off_t camel_mime_parser_tell(CamelMimeParser *m) { struct _header_scan_state *s = _PRIVATE(m); return folder_tell(s); } /** * camel_mime_parser_tell_start_headers: * @m: * * Find out the position within the file of where the * headers started, this is cached by the parser * at the time. * * Return value: The header start position, or -1 if * no headers were scanned in the current state. **/ off_t camel_mime_parser_tell_start_headers(CamelMimeParser *m) { struct _header_scan_state *s = _PRIVATE(m); return s->start_of_headers; } /** * camel_mime_parser_tell_start_from: * @m: * * If the parser is scanning From lines, then this returns * the position of the start of the From line. 
* * Return value: The start of the from line, or -1 if there * was no From line, or From lines are not being scanned. **/ off_t camel_mime_parser_tell_start_from(CamelMimeParser *m) { struct _header_scan_state *s = _PRIVATE(m); return s->start_of_from; } /** * camel_mime_parser_seek: * @m: * @off: Number of bytes to offset the seek by. * @whence: SEEK_SET, SEEK_CUR, SEEK_END * * Reset the source position to a known value. * * Note that if the source stream/descriptor was not * positioned at 0 to begin with, and an absolute seek * is specified (whence != SEEK_CUR), then the seek * position may not match the desired seek position. * * Return value: The new seek offset, or -1 on * an error (for example, trying to seek on a non-seekable * stream or file descriptor). **/ off_t camel_mime_parser_seek(CamelMimeParser *m, off_t off, int whence) { struct _header_scan_state *s = _PRIVATE(m); return folder_seek(s, off, whence); } /** * camel_mime_parser_state: * @m: * * Get the current parser state. * * Return value: The current parser state. **/ enum _header_state camel_mime_parser_state(CamelMimeParser *m) { struct _header_scan_state *s = _PRIVATE(m); return s->state; } /** * camel_mime_parser_stream: * @m: * * Get the stream, if any, the parser has been initialised * with. May be used to setup sub-streams, but should not * be read from directly (without saving and restoring * the seek position in between). * * Return value: The stream from _init_with_stream(), or NULL * if the parser is reading from a file descriptor or is * uninitialised. **/ CamelStream *camel_mime_parser_stream(CamelMimeParser *m) { struct _header_scan_state *s = _PRIVATE(m); return s->stream; } /** * camel_mime_parser_fd: * @m: * * Return the file descriptor, if any, the parser has been * initialised with. * * Should not be read from unless the parser it to terminate, * or the seek offset can be reset before the next parse * step. 
* * Return value: The file descriptor or -1 if the parser * is reading from a stream or has not been initialised. **/ int camel_mime_parser_fd(CamelMimeParser *m) { struct _header_scan_state *s = _PRIVATE(m); return s->fd; } /* ********************************************************************** */ /* Implementation */ /* ********************************************************************** */ /* read the next bit of data, ensure there is enough room 'atleast' bytes */ static int folder_read(struct _header_scan_state *s) { int len; int inoffset; if (s->inptr<s->inend-s->atleast) return s->inend-s->inptr; #ifdef PURIFY purify_watch_remove(inend_id); purify_watch_remove(inbuffer_id); #endif /* check for any remaning bytes (under the atleast limit( */ inoffset = s->inend - s->inptr; if (inoffset>0) { memcpy(s->inbuf, s->inptr, inoffset); } if (s->stream) { len = camel_stream_read(s->stream, s->inbuf+inoffset, SCAN_BUF-inoffset); } else { len = read(s->fd, s->inbuf+inoffset, SCAN_BUF-inoffset); } r(printf("read %d bytes, offset = %d\n", len, inoffset)); if (len>=0) { /* add on the last read block */ s->seek += s->inptr - s->inbuf; s->inptr = s->inbuf; s->inend = s->inbuf+len+inoffset; r(printf("content = %d '%.*s'\n",s->inend - s->inptr, s->inend - s->inptr, s->inptr)); } g_assert(s->inptr<=s->inend); #ifdef PURIFY inend_id = purify_watch(&s->inend); inbuffer_id = purify_watch_n(s->inend+1, SCAN_HEAD-1, "rw"); #endif r(printf("content = %d '%.*s'\n", s->inend - s->inptr, s->inend - s->inptr, s->inptr)); /* set a sentinal, for the inner loops to check against */ s->inend[0] = '\n'; return s->inend-s->inptr; } /* return the current absolute position of the data pointer */ static off_t folder_tell(struct _header_scan_state *s) { return s->seek + (s->inptr - s->inbuf); } /* need some way to prime the parser state, so this actually works for other than top-level messages */ static off_t folder_seek(struct _header_scan_state *s, off_t offset, int whence) { off_t 
newoffset; int len; if (s->stream) { if (CAMEL_IS_SEEKABLE_STREAM(s->stream)) { /* NOTE: assumes whence seekable stream == whence libc, which is probably the case (or bloody well should've been) */ newoffset = camel_seekable_stream_seek((CamelSeekableStream *)s->stream, offset, whence); } else { newoffset = -1; errno = EINVAL; } } else { newoffset = lseek(s->fd, offset, whence); } #ifdef PURIFY purify_watch_remove(inend_id); purify_watch_remove(inbuffer_id); #endif if (newoffset != -1) { s->seek = newoffset; s->inptr = s->inbuf; s->inend = s->inbuf; if (s->stream) len = camel_stream_read(s->stream, s->inbuf, SCAN_BUF); else len = read(s->fd, s->inbuf, SCAN_BUF); if (len>=0) { s->inend = s->inbuf+len; s->inend[0] = '\n'; } else newoffset = -1; } #ifdef PURIFY inend_id = purify_watch(&s->inend); inbuffer_id = purify_watch_n(s->inend+1, SCAN_HEAD-1, "rw"); #endif return newoffset; } static void folder_push_part(struct _header_scan_state *s, struct _header_scan_stack *h) { if (s->parts && s->parts->atleast > h->boundarylenfinal) h->atleast = s->parts->atleast; else h->atleast = MAX(h->boundarylenfinal, 1); h->parent = s->parts; s->parts = h; } static void folder_pull_part(struct _header_scan_state *s) { struct _header_scan_stack *h; h = s->parts; if (h) { s->parts = h->parent; g_free(h->boundary); #ifdef MEMPOOL mempool_free(h->pool); #else header_raw_clear(&h->headers); #endif header_content_type_unref(h->content_type); if (h->pretext) g_byte_array_free(h->pretext, TRUE); if (h->posttext) g_byte_array_free(h->posttext, TRUE); if (h->from_line) g_byte_array_free(h->from_line, TRUE); g_free(h); } else { g_warning("Header stack underflow!\n"); } } static int folder_scan_skip_line(struct _header_scan_state *s, GByteArray *save) { int atleast = s->atleast; register char *inptr, *inend, c; int len; s->atleast = 1; while ( (len = folder_read(s)) > 0 && len > s->atleast) { /* ensure we have at least enough room here */ inptr = s->inptr; inend = s->inend-1; c = -1; while 
(inptr<inend && (c = *inptr++)!='\n') ; if (save) g_byte_array_append(save, s->inptr, inptr-s->inptr); s->inptr = inptr; if (c=='\n') { s->atleast = atleast; return 0; } } s->atleast = atleast; return -1; /* not found */ } /* TODO: Is there any way to make this run faster? It gets called a lot ... */ static struct _header_scan_stack * folder_boundary_check(struct _header_scan_state *s, const char *boundary, int *lastone) { struct _header_scan_stack *part; int len = s->atleast; /* make sure we dont access past the buffer */ h(printf("checking boundary marker upto %d bytes\n", len)); part = s->parts; while (part) { h(printf(" boundary: %s\n", part->boundary)); h(printf(" against: '%.*s'\n", s->atleast, boundary)); if (part->boundary && part->boundarylen <= len && memcmp(boundary, part->boundary, part->boundarylen)==0) { h(printf("matched boundary: %s\n", part->boundary)); /* again, make sure we're in range */ if (part->boundarylenfinal <= len) { int extra = part->boundarylenfinal - part->boundarylen; /* check the extra stuff on an final boundary, normally -- for mime parts */ if (extra>0) { *lastone = memcmp(&boundary[part->boundarylen], &part->boundary[part->boundarylen], extra) == 0; } else { *lastone = TRUE; } h(printf("checking lastone = %s\n", *lastone?"TRUE":"FALSE")); } else { h(printf("not enough room to check last one?\n")); *lastone = FALSE; } /*printf("ok, we found it! 
: %s \n", (*lastone)?"Last one":"More to come?");*/ return part; } part = part->parent; } return NULL; } #ifdef MEMPOOL static void header_append_mempool(struct _header_scan_state *s, struct _header_scan_stack *h, char *header, int offset) { struct _header_raw *l, *n; char *content; content = strchr(header, ':'); if (content) { register int len; n = mempool_alloc(h->pool, sizeof(*n)); n->next = NULL; len = content-header; n->name = mempool_alloc(h->pool, len+1); memcpy(n->name, header, len); n->name[len] = 0; content++; len = s->outptr - content; n->value = mempool_alloc(h->pool, len+1); memcpy(n->value, content, len); n->value[len] = 0; n->offset = offset; l = (struct _header_raw *)&h->headers; while (l->next) { l = l->next; } l->next = n; } } #define header_raw_append_parse(a, b, c) (header_append_mempool(s, h, b, c)) #endif /* Copy the string start->inptr into the header buffer (s->outbuf), grow if necessary remove trailing \r chars (\n's assumed already removed) and track the start offset of the header */ /* Basically an optimised version of g_byte_array_append() */ #define header_append(s, start, inptr) \ { \ register int headerlen = inptr-start; \ \ if (headerlen > 0) { \ if (headerlen >= (s->outend - s->outptr)) { \ register char *outnew; \ register int len = ((s->outend - s->outbuf)+headerlen)*2+1; \ outnew = g_realloc(s->outbuf, len); \ s->outptr = s->outptr - s->outbuf + outnew; \ s->outbuf = outnew; \ s->outend = outnew + len; \ } \ if (start[headerlen-1] == '\r') \ headerlen--; \ memcpy(s->outptr, start, headerlen); \ s->outptr += headerlen; \ } \ if (s->header_start == -1) \ s->header_start = (start-s->inbuf) + s->seek; \ } static struct _header_scan_stack * folder_scan_header(struct _header_scan_state *s, int *lastone) { int atleast = s->atleast, newatleast; char *start = NULL; int len; struct _header_scan_stack *h; char *inend; register char *inptr; h(printf("scanning first bit\n")); h = g_malloc0(sizeof(*h)); #ifdef MEMPOOL h->pool = 
mempool_new(8192, 4096); #endif if (s->parts) newatleast = s->parts->atleast; else newatleast = 1; *lastone = FALSE; do { s->atleast = newatleast; h(printf("atleast = %d\n", s->atleast)); while ((len = folder_read(s))>0 && len >= s->atleast) { /* ensure we have at least enough room here */ inptr = s->inptr; inend = s->inend-s->atleast+1; while (inptr<inend) { if (!s->midline) { if (folder_boundary_check(s, inptr, lastone)) { if ((s->outptr>s->outbuf)) goto header_truncated; /* may not actually be truncated */ goto header_done; } } start = inptr; /* goto next line/sentinal */ while ((*inptr++)!='\n') ; g_assert(inptr<=s->inend+1); /* check for sentinal or real end of line */ if (inptr > inend) { h(printf("not at end of line yet, going further\n")); /* didn't find end of line within our allowed area */ inptr = inend; s->midline = TRUE; header_append(s, start, inptr); } else { h(printf("got line part: '%.*s'\n", inptr-1-start, start)); /* got a line, strip and add it, process it */ s->midline = FALSE; header_append(s, start, inptr-1); /* check for end of headers */ if (s->outbuf == s->outptr) goto header_done; /* check for continuation/compress headers, we have atleast 1 char here to work with */ if (inptr[0] == ' ' || inptr[0] == '\t') { h(printf("continuation\n")); /* TODO: this wont catch multiple space continuation across a read boundary, but that is assumed rare, and not fatal anyway */ do inptr++; while (*inptr == ' ' || *inptr == '\t'); inptr--; *inptr = ' '; } else { /* otherwise, complete header, add it */ s->outptr[0] = 0; h(printf("header '%.20s' at %d\n", s->outbuf, s->header_start)); header_raw_append_parse(&h->headers, s->outbuf, s->header_start); s->outptr = s->outbuf; s->header_start = -1; } } } s->inptr = inptr; } h(printf("end of file? 
read %d bytes\n", len)); newatleast = 1; } while (s->atleast > 1); if ((s->outptr > s->outbuf) || s->inend > s->inptr) { start = s->inptr; inptr = s->inend; if (inptr > start) { if (inptr[-1] == '\n') inptr--; } goto header_truncated; } s->atleast = atleast; return h; header_truncated: header_append(s, start, inptr); s->outptr[0] = 0; if (s->outbuf == s->outptr) goto header_done; header_raw_append_parse(&h->headers, s->outbuf, s->header_start); s->outptr = s->outbuf; header_done: s->inptr = inptr; s->atleast = atleast; s->header_start = -1; return h; } static struct _header_scan_stack * folder_scan_content(struct _header_scan_state *s, int *lastone, char **data, int *length) { int atleast = s->atleast, newatleast; register char *inptr; char *inend; char *start; int len; struct _header_scan_stack *part; int onboundary = FALSE; c(printf("scanning content\n")); part = s->parts; if (part) newatleast = part->atleast; else newatleast = 1; *lastone = FALSE; c(printf("atleast = %d\n", s->atleast)); do { s->atleast = newatleast; while ((len = folder_read(s))>0 && len >= s->atleast) { /* ensure we have at least enough room here */ inptr = s->inptr; inend = s->inend-s->atleast+1; start = inptr; c(printf("inptr = %p, inend = %p\n", inptr, inend)); while (inptr<inend) { if (!s->midline && (part = folder_boundary_check(s, inptr, lastone))) { onboundary = TRUE; /* since we truncate the boundary data, we need at least 1 char here spare, to remain in the same state */ if ( (inptr-start) > 1) goto content; /* otherwise, jump to the state of the boundary we actually found */ goto normal_exit; } /* goto the next line */ while ((*inptr++)!='\n') ; /* check the sentinal, if we went past the atleast limit, and reset it to there */ if (inptr > inend) { s->midline = TRUE; inptr = inend; } else { s->midline = FALSE; } } c(printf("ran out of input, dumping what i have (%d) bytes midline = %s\n", inptr-start, s->midline?"TRUE":"FALSE")); goto content; } newatleast = 1; } while (s->atleast > 
1); c(printf("length read = %d\n", len)); if (s->inend > s->inptr) { start = s->inptr; inptr = s->inend; goto content; } *length = 0; s->atleast = atleast; return NULL; content: part = s->parts; normal_exit: s->atleast = atleast; s->inptr = inptr; *data = start; /* if we hit a boundary, we should not include the closing \n */ if (onboundary && (inptr-start)>0) *length = inptr-start-1; else *length = inptr-start; /*printf("got %scontent: '%.*s'\n", s->midline?"partial ":"", inptr-start, start);*/ return part; } static void folder_scan_close(struct _header_scan_state *s) { g_free(s->realbuf); g_free(s->outbuf); while (s->parts) folder_pull_part(s); if (s->fd != -1) close(s->fd); if (s->stream) { camel_object_unref((CamelObject *)s->stream); } g_free(s); } static struct _header_scan_state * folder_scan_init(void) { struct _header_scan_state *s; s = g_malloc(sizeof(*s)); s->fd = -1; s->stream = NULL; s->outbuf = g_malloc(1024); s->outptr = s->outbuf; s->outend = s->outbuf+1024; s->realbuf = g_malloc(SCAN_BUF + SCAN_HEAD*2); s->inbuf = s->realbuf + SCAN_HEAD; s->inptr = s->inbuf; s->inend = s->inbuf; s->atleast = 0; s->seek = 0; /* current character position in file of the last read block */ s->unstep = 0; s->header_start = -1; s->start_of_from = -1; s->start_of_headers = -1; s->midline = FALSE; s->scan_from = FALSE; s->scan_pre_from = FALSE; s->filters = NULL; s->filterid = 1; s->parts = NULL; s->state = HSCAN_INITIAL; return s; } static int folder_scan_init_with_fd(struct _header_scan_state *s, int fd) { int len; len = read(fd, s->inbuf, SCAN_BUF); if (len>=0) { s->inend = s->inbuf+len; s->inptr = s->inbuf; s->inend[0] = '\n'; if (s->fd != -1) close(s->fd); s->fd = fd; if (s->stream) { camel_object_unref((CamelObject *)s->stream); s->stream = NULL; } return 0; } else { return -1; } } static int folder_scan_init_with_stream(struct _header_scan_state *s, CamelStream *stream) { int len; len = camel_stream_read(stream, s->inbuf, SCAN_BUF); if (len >= 0) { s->inend = 
s->inbuf+len; s->inptr = s->inbuf; s->inend[0] = '\n'; if (s->stream) camel_object_unref((CamelObject *)s->stream); s->stream = stream; camel_object_ref((CamelObject *)stream); if (s->fd != -1) { close(s->fd); s->fd = -1; } return 0; } else { return -1; } } #define USE_FROM static void folder_scan_step(struct _header_scan_state *s, char **databuffer, int *datalength) { struct _header_scan_stack *h, *hb; const char *content; const char *bound; int type; int state; struct _header_content_type *ct = NULL; struct _header_scan_filter *f; size_t presize; /* printf("\nSCAN PASS: state = %d '%s'\n", s->state, states[s->state]);*/ tail_recurse: d({ printf("\nSCAN STACK:\n"); printf(" '%s' :\n", states[s->state]); hb = s->parts; while (hb) { printf(" '%s' : %s ", states[hb->savestate], hb->boundary); if (hb->content_type) { printf("(%s/%s)", hb->content_type->type, hb->content_type->subtype); } else { printf("(default)"); } printf("\n"); hb = hb->parent; } printf("\n"); }); switch (s->state) { #ifdef USE_FROM case HSCAN_INITIAL: if (s->scan_from) { h = g_malloc0(sizeof(*h)); h->boundary = g_strdup("From "); h->boundarylen = strlen(h->boundary); h->boundarylenfinal = h->boundarylen; h->from_line = g_byte_array_new(); folder_push_part(s, h); s->state = HSCAN_PRE_FROM; } else { s->start_of_from = -1; goto scan_header; } case HSCAN_PRE_FROM: h = s->parts; do { hb = folder_scan_content(s, &state, databuffer, datalength); if (s->scan_pre_from && *datalength > 0) { d(printf("got pre-from content %d bytes\n", *datalength)); return; } } while (hb==h && *datalength>0); if (*datalength==0 && hb==h) { d(printf("found 'From '\n")); s->start_of_from = folder_tell(s); folder_scan_skip_line(s, h->from_line); h->savestate = HSCAN_INITIAL; s->state = HSCAN_FROM; } else { folder_pull_part(s); s->state = HSCAN_EOF; } return; #else case HSCAN_INITIAL: case HSCAN_PRE_FROM: #endif /* !USE_FROM */ scan_header: case HSCAN_FROM: s->start_of_headers = folder_tell(s); h = folder_scan_header(s, &state); 
#ifdef USE_FROM if (s->scan_from) h->savestate = HSCAN_FROM_END; else #endif h->savestate = HSCAN_EOF; /* FIXME: should this check for MIME-Version: 1.0 as well? */ type = HSCAN_HEADER; if ( (content = header_raw_find(&h->headers, "Content-Type", NULL)) && (ct = header_content_type_decode(content))) { if (!strcasecmp(ct->type, "multipart")) { bound = header_content_type_param(ct, "boundary"); if (bound) { d(printf("multipart, boundary = %s\n", bound)); h->boundarylen = strlen(bound)+2; h->boundarylenfinal = h->boundarylen+2; h->boundary = g_malloc(h->boundarylen+3); sprintf(h->boundary, "--%s--", bound); type = HSCAN_MULTIPART; } else { header_content_type_unref(ct); ct = header_content_type_decode("text/plain"); /* We can't quite do this, as it will mess up all the offsets ... */ /* header_raw_replace(&h->headers, "Content-Type", "text/plain", offset);*/ g_warning("Multipart with no boundary, treating as text/plain"); } } else if (!strcasecmp(ct->type, "message")) { if (!strcasecmp(ct->subtype, "rfc822") || !strcasecmp(ct->subtype, "news") /*|| !strcasecmp(ct->subtype, "partial")*/) { type = HSCAN_MESSAGE; } } } else { /* make the default type for multipart/digest be message/rfc822 */ if ((s->parts && header_content_type_is(s->parts->content_type, "multipart", "digest"))) { ct = header_content_type_decode("message/rfc822"); type = HSCAN_MESSAGE; d(printf("parent was multipart/digest, autoupgrading to message/rfc822?\n")); /* maybe we should do this too? 
header_raw_append_parse(&h->headers, "Content-Type: message/rfc822", -1);*/ } } h->content_type = ct; folder_push_part(s, h); s->state = type; return; case HSCAN_HEADER: s->state = HSCAN_BODY; case HSCAN_BODY: h = s->parts; *datalength = 0; presize = SCAN_HEAD; f = s->filters; do { hb = folder_scan_content (s, &state, databuffer, datalength); d(printf ("\n\nOriginal content: '")); d(fwrite(*databuffer, sizeof(char), *datalength, stdout)); d(printf("'\n")); if (*datalength > 0) { while (f) { camel_mime_filter_filter(f->filter, *databuffer, *datalength, presize, databuffer, datalength, &presize); d(printf ("Filtered content (%s): '", camel_type_to_name(((CamelObject *)f->filter)->s.type))); d(fwrite(*databuffer, sizeof(char), *datalength, stdout)); d(printf("'\n")); f = f->next; } return; } } while (hb == h && *datalength > 0); /* check for any filter completion data */ if (*datalength > 0) { while (f) { camel_mime_filter_complete(f->filter, *databuffer, *datalength, presize, databuffer, datalength, &presize); f = f->next; } } if (*datalength > 0) return; s->state = HSCAN_BODY_END; break; case HSCAN_MULTIPART: h = s->parts; do { do { hb = folder_scan_content(s, &state, databuffer, datalength); if (*datalength>0) { /* instead of a new state, we'll just store it locally and provide an accessor function */ d(printf("Multipart %s Content %p: '%.*s'\n", h->prestage>0?"post":"pre", h, *datalength, *databuffer)); if (h->prestage > 0) { if (h->posttext == NULL) h->posttext = g_byte_array_new(); g_byte_array_append(h->posttext, *databuffer, *datalength); } else { if (h->pretext == NULL) h->pretext = g_byte_array_new(); g_byte_array_append(h->pretext, *databuffer, *datalength); } } } while (hb==h && *datalength>0); h->prestage++; if (*datalength==0 && hb==h) { d(printf("got boundary: %s\n", hb->boundary)); folder_scan_skip_line(s, NULL); if (!state) { s->state = HSCAN_FROM; folder_scan_step(s, databuffer, datalength); s->parts->savestate = HSCAN_MULTIPART; /* set return state 
for the new head part */ return; } } else { break; } } while (1); s->state = HSCAN_MULTIPART_END; break; case HSCAN_MESSAGE: s->state = HSCAN_FROM; folder_scan_step(s, databuffer, datalength); s->parts->savestate = HSCAN_MESSAGE_END; break; case HSCAN_FROM_END: case HSCAN_BODY_END: case HSCAN_MULTIPART_END: case HSCAN_MESSAGE_END: s->state = s->parts->savestate; folder_pull_part(s); if (s->state & HSCAN_END) return; goto tail_recurse; case HSCAN_EOF: return; default: g_warning("Invalid state in camel-mime-parser: %d", s->state); break; } return; } /* drops the current state back one */ static void folder_scan_drop_step(struct _header_scan_state *s) { switch (s->state) { case HSCAN_INITIAL: case HSCAN_EOF: return; case HSCAN_FROM: case HSCAN_PRE_FROM: s->state = HSCAN_INITIAL; folder_pull_part(s); return; case HSCAN_MESSAGE: case HSCAN_HEADER: case HSCAN_MULTIPART: case HSCAN_FROM_END: case HSCAN_BODY_END: case HSCAN_MULTIPART_END: case HSCAN_MESSAGE_END: s->state = s->parts->savestate; folder_pull_part(s); if (s->state & HSCAN_END) { s->state &= ~HSCAN_END; } return; default: /* FIXME: not sure if this is entirely right */ } } #ifdef STANDALONE int main(int argc, char **argv) { int fd; struct _header_scan_state *s; char *data; int len; int state; char *name = "/tmp/evmail/Inbox"; struct _header_scan_stack *h; int i; int attach = 0; if (argc==2) name = argv[1]; printf("opening: %s", name); for (i=1;i<argc;i++) { const char *encoding = NULL, *charset = NULL; char *attachname; name = argv[i]; printf("opening: %s", name); fd = open(name, O_RDONLY); if (fd==-1) { perror("Cannot open mailbox"); exit(1); } s = folder_scan_init(); folder_scan_init_with_fd(s, fd); s->scan_from = FALSE; #if 0 h = g_malloc0(sizeof(*h)); h->savestate = HSCAN_EOF; folder_push_part(s, h); #endif while (s->state != HSCAN_EOF) { folder_scan_step(s, &data, &len); printf("\n -- PARSER STEP RETURN -- %d '%s'\n\n", s->state, states[s->state]); switch (s->state) { case HSCAN_HEADER: if 
(s->parts->content_type && (charset = header_content_type_param(s->parts->content_type, "charset"))) { if (strcasecmp(charset, "us-ascii")) { #if 0 folder_push_filter_charset(s, "UTF-8", charset); #endif } else { charset = NULL; } } else { charset = NULL; } encoding = header_raw_find(&s->parts->headers, "Content-transfer-encoding", 0); printf("encoding = '%s'\n", encoding); if (encoding && !strncasecmp(encoding, " base64", 7)) { printf("adding base64 filter\n"); attachname = g_strdup_printf("attach.%d.%d", i, attach++); #if 0 folder_push_filter_save(s, attachname); #endif g_free(attachname); #if 0 folder_push_filter_mime(s, 0); #endif } if (encoding && !strncasecmp(encoding, " quoted-printable", 17)) { printf("adding quoted-printable filter\n"); attachname = g_strdup_printf("attach.%d.%d", i, attach++); #if 0 folder_push_filter_save(s, attachname); #endif g_free(attachname); #if 0 folder_push_filter_mime(s, 1); #endif } break; case HSCAN_BODY: printf("got body %d '%.*s'\n", len, len, data); break; case HSCAN_BODY_END: printf("end body %d '%.*s'\n", len, len, data); if (encoding && !strncasecmp(encoding, " base64", 7)) { printf("removing filters\n"); #if 0 folder_filter_pull(s); folder_filter_pull(s); #endif } if (encoding && !strncasecmp(encoding, " quoted-printable", 17)) { printf("removing filters\n"); #if 0 folder_filter_pull(s); folder_filter_pull(s); #endif } if (charset) { #if 0 folder_filter_pull(s); #endif charset = NULL; } encoding = NULL; break; default: break; } } folder_scan_close(s); close(fd); } return 0; } #endif /* STANDALONE */