/* * Copyright (C) 2000 Helix Code Inc. * * Authors: Michael Zucchi <notzed@helixcode.com> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU Library General Public License * as published by the Free Software Foundation; either version 2 of * the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Library General Public License for more details. * * You should have received a copy of the GNU Library General Public * License along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ /* What should hopefully be a fast mail parser */ #include <sys/types.h> #include <sys/stat.h> #include <fcntl.h> #include <unistd.h> #include <string.h> #include <stdio.h> #include <errno.h> #include <unicode.h> #include <regex.h> #include <ctype.h> #include <glib.h> #include "camel-mime-parser.h" #include "camel-mime-utils.h" #include "camel-mime-filter.h" #include "camel-stream.h" #include "camel-seekable-stream.h" #define r(x) #define h(x) #define c(x) #define d(x) /*#define PURIFY*/ #define MEMPOOL #define STRUCT_ALIGN 4 #ifdef PURIFY int inend_id = -1, inbuffer_id = -1; #endif #if 0 extern int strdup_count; extern int malloc_count; extern int free_count; #define g_strdup(x) (strdup_count++, g_strdup(x)) #define g_malloc(x) (malloc_count++, g_malloc(x)) #define g_free(x) (free_count++, g_free(x)) #endif #ifdef MEMPOOL typedef struct _MemPoolNode { struct _MemPoolNode *next; int free; char data[1]; } MemPoolNode; typedef struct _MemPoolThresholdNode { struct _MemPoolThresholdNode *next; char data[1]; } MemPoolThresholdNode; typedef struct _MemPool { int blocksize; int threshold; struct _MemPoolNode *blocks; struct _MemPoolThresholdNode *threshold_blocks; } MemPool; MemPool *mempool_new(int blocksize, int threshold); void *mempool_alloc(MemPool *pool, int size); void mempool_flush(MemPool *pool, int freeall); void mempool_free(MemPool *pool); MemPool *mempool_new(int blocksize, int threshold) { MemPool *pool; pool = g_malloc(sizeof(*pool)); if (threshold >= blocksize) threshold = blocksize * 2 / 3; pool->blocksize = blocksize; pool->threshold = threshold; pool->blocks = NULL; pool->threshold_blocks = NULL; return pool; } void *mempool_alloc(MemPool *pool, int size) { size = (size + STRUCT_ALIGN) & (~(STRUCT_ALIGN-1)); if (size>=pool->threshold) { MemPoolThresholdNode *n; n = g_malloc(sizeof(*n) - sizeof(char) + size); n->next = pool->threshold_blocks; pool->threshold_blocks = n; return &n->data[0]; } else { MemPoolNode *n; n = pool->blocks; while (n) { if (n->free >= size) { n->free -= size; return &n->data[n->free]; } n = n->next; } n = g_malloc(sizeof(*n) - sizeof(char) + pool->blocksize); n->next = pool->blocks; pool->blocks = n; n->free = pool->blocksize - size; return &n->data[n->free]; } } void mempool_flush(MemPool *pool, int freeall) { MemPoolThresholdNode *tn, *tw; MemPoolNode *pw, *pn; tw = pool->threshold_blocks; while (tw) { tn = tw->next; g_free(tw); tw = tn; } pool->threshold_blocks = NULL; if (freeall) { pw = pool->blocks; while (pw) { pn = pw->next; g_free(pw); pw = pn; } pool->blocks = NULL; } else { pw = pool->blocks; while (pw) { pw->free = pool->blocksize; pw = pw->next; } } } void mempool_free(MemPool *pool) { if (pool) { mempool_flush(pool, 1); g_free(pool); } } #endif #define SCAN_BUF 4096 /* size of read buffer */ #define SCAN_HEAD 128 /* headroom guaranteed to be before each read buffer */ /* a little hacky, but i couldn't be bothered renaming everything */ #define _header_scan_state _CamelMimeParserPrivate #define _PRIVATE(o) (((CamelMimeParser *)(o))->priv) struct _header_scan_state { /* global state */ enum _header_state state; /* for building headers during scanning */ char *outbuf; char *outptr; char *outend; int fd; /* input for a fd input */ CamelStream *stream; /* or for a stream */ /* for scanning input buffers */ char *realbuf; /* the real buffer, SCAN_HEAD*2 + SCAN_BUF bytes */ char *inbuf; /* points to a subset of the allocated memory, the underflow */ char *inptr; /* (upto SCAN_HEAD) is for use by filters so they dont copy all data */ char *inend; int atleast; int seek; /* current offset to start of buffer */ int unstep; /* how many states to 'unstep' (repeat the current state) */ int midline; /* are we mid-line interrupted? */ int scan_from; /* do we care about From lines? */ int start_of_from; /* where from started */ int start_of_headers; /* where headers started from the last scan */ int header_start; /* start of last header, or -1 */ struct _header_scan_stack *top_part; /* top of message header */ int top_start; /* offset of start */ struct _header_scan_stack *pending; /* if we're pending part info, from the wrong part end */ /* filters to apply to all content before output */ int filterid; /* id of next filter */ struct _header_scan_filter *filters; /* per message/part info */ struct _header_scan_stack *parts; }; struct _header_scan_stack { struct _header_scan_stack *parent; enum _header_state savestate; /* state at invocation of this part */ #ifdef MEMPOOL MemPool *pool; /* memory pool to keep track of headers/etc at this level */ #endif struct _header_raw *headers; /* headers for this part */ struct _header_content_type *content_type; char *boundary; /* for multipart/ * boundaries, including leading -- and trailing -- for the final part */ int boundarylen; /* length of boundary, including leading -- */ }; struct _header_scan_filter { struct _header_scan_filter *next; int id; CamelMimeFilter *filter; }; static void folder_scan_step(struct _header_scan_state *s, char **databuffer, int *datalength); static void folder_scan_drop_step(struct _header_scan_state *s); static int folder_scan_init_with_fd(struct _header_scan_state *s, int fd); static int folder_scan_init_with_stream(struct _header_scan_state *s, CamelStream *stream); static struct _header_scan_state *folder_scan_init(void); static void folder_scan_close(struct _header_scan_state *s); static struct _header_scan_stack *folder_scan_content(struct _header_scan_state *s, int *lastone, char **data, int *length); static struct _header_scan_stack *folder_scan_header(struct _header_scan_state *s, int *lastone); static int folder_scan_skip_line(struct _header_scan_state *s); static off_t folder_seek(struct _header_scan_state *s, off_t offset, int whence); static off_t folder_tell(struct _header_scan_state *s); #ifdef MEMPOOL static void header_append_mempool(struct _header_scan_state *s, struct _header_scan_stack *h, char *header, int offset); #endif static void camel_mime_parser_class_init (CamelMimeParserClass *klass); static void camel_mime_parser_init (CamelMimeParser *obj); static char *states[] = { "HSCAN_INITIAL", "HSCAN_FROM", /* got 'From' line */ "HSCAN_HEADER", /* toplevel header */ "HSCAN_BODY", /* scanning body of message */ "HSCAN_MULTIPART", /* got multipart header */ "HSCAN_MESSAGE", /* rfc822 message */ "HSCAN_PART", /* part of a multipart */ "<invalid>", "HSCAN_EOF", /* end of file */ "HSCAN_FROM_END", "HSCAN_HEAER_END", "HSCAN_BODY_END", "HSCAN_MULTIPART_END", "HSCAN_MESSAGE_END", }; static CamelObjectClass *camel_mime_parser_parent; enum SIGNALS { LAST_SIGNAL }; static guint signals[LAST_SIGNAL] = { 0 }; guint camel_mime_parser_get_type (void) { static guint type = 0; if (!type) { GtkTypeInfo type_info = { "CamelMimeParser", sizeof (CamelMimeParser), sizeof (CamelMimeParserClass), (GtkClassInitFunc) camel_mime_parser_class_init, (GtkObjectInitFunc) camel_mime_parser_init, (GtkArgSetFunc) NULL, (GtkArgGetFunc) NULL }; type = gtk_type_unique (camel_object_get_type (), &type_info); } return type; } static void finalise(GtkObject *o) { struct _header_scan_state *s = _PRIVATE(o); #ifdef PURIFY purify_watch_remove_all(); #endif folder_scan_close(s); ((GtkObjectClass *)camel_mime_parser_parent)->finalize (o); } static void camel_mime_parser_class_init (CamelMimeParserClass *klass) { GtkObjectClass *object_class = (GtkObjectClass *) klass; camel_mime_parser_parent = gtk_type_class (camel_object_get_type ()); object_class->finalize = finalise; gtk_object_class_add_signals (object_class, signals, LAST_SIGNAL); } static void camel_mime_parser_init (CamelMimeParser *obj) { struct _header_scan_state *s; s = folder_scan_init(); _PRIVATE(obj) = s; } /** * camel_mime_parser_new: * * Create a new CamelMimeParser object. * * Return value: A new CamelMimeParser widget. **/ CamelMimeParser * camel_mime_parser_new (void) { CamelMimeParser *new = CAMEL_MIME_PARSER ( gtk_type_new (camel_mime_parser_get_type ())); return new; } /** * camel_mime_parser_filter_add: * @m: * @mf: * * Add a filter that will be applied to any body content before it is passed * to the caller. Filters may be pipelined to perform multi-pass operations * on the content, and are applied in the order they were added. * * Note that filters are only applied to the body content of messages, and once * a filter has been set, all content returned by a filter_step() with a state * of HSCAN_BODY will have passed through the filter. * * Return value: An id that may be passed to filter_remove() to remove * the filter, or -1 if the operation failed. **/ int camel_mime_parser_filter_add(CamelMimeParser *m, CamelMimeFilter *mf) { struct _header_scan_state *s = _PRIVATE(m); struct _header_scan_filter *f, *new; new = g_malloc(sizeof(*new)); new->filter = mf; new->id = s->filterid++; if (s->filterid == -1) s->filterid++; new->next = 0; gtk_object_ref((GtkObject *)mf); /* yes, this is correct, since 'next' is the first element of the struct */ f = (struct _header_scan_filter *)&s->filters; while (f->next) f = f->next; f->next = new; return new->id; } /** * camel_mime_parser_filter_remove: * @m: * @id: * * Remove a processing filter from the pipeline. There is no * restriction on the order the filters can be removed. **/ void camel_mime_parser_filter_remove(CamelMimeParser *m, int id) { struct _header_scan_state *s = _PRIVATE(m); struct _header_scan_filter *f, *old; f = (struct _header_scan_filter *)&s->filters; while (f && f->next) { old = f->next; if (old->id == id) { gtk_object_unref((GtkObject *)old->filter); f->next = old->next; g_free(old); /* there should only be a single matching id, but scan the whole lot anyway */ } f = f->next; } } /** * camel_mime_parser_header: * @m: * @name: Name of header. * @offset: Pointer that can receive the offset of the header in * the stream from the start of parsing. * * Lookup a header by name. * * Return value: The header value, or NULL if the header is not * defined. **/ const char * camel_mime_parser_header(CamelMimeParser *m, const char *name, int *offset) { struct _header_scan_state *s = _PRIVATE(m); if (s->parts && s->parts->headers) { return header_raw_find(&s->parts->headers, name, offset); } return NULL; } /** * camel_mime_parser_headers_raw: * @m: * * Get the list of the raw headers which are defined for the * current state of the parser. These headers are valid * until the next call to parser_step(), or parser_drop_step(). * * Return value: The raw headers, or NULL if there are no headers * defined for the current part or state. These are READ ONLY. **/ struct _header_raw * camel_mime_parser_headers_raw(CamelMimeParser *m) { struct _header_scan_state *s = _PRIVATE(m); if (s->parts) return s->parts->headers; return NULL; } /** * camel_mime_parser_init_with_fd: * @m: * @fd: A valid file descriptor. * * Initialise the scanner with an fd. The scanner's offsets * will be relative to the current file position of the file * descriptor. As a result, seekable descritors should * be seeked using the parser seek functions. * * An initial buffer will be read from the file descriptor * immediately, although no parsing will occur. * * Return value: Returns -1 on error. **/ int camel_mime_parser_init_with_fd(CamelMimeParser *m, int fd) { struct _header_scan_state *s = _PRIVATE(m); return folder_scan_init_with_fd(s, fd); } /** * camel_mime_parser_init_with_stream: * @m: * @stream: * * Initialise the scanner with a source stream. The scanner's * offsets will be relative to the current file position of * the stream. As a result, seekable streams should only * be seeked using the parser seek function. * * An initial buffer will be read from the stream * immediately, although no parsing will occur. * * Return value: -1 on error. **/ int camel_mime_parser_init_with_stream(CamelMimeParser *m, CamelStream *stream) { struct _header_scan_state *s = _PRIVATE(m); return folder_scan_init_with_stream(s, stream); } /** * camel_mime_parser_scan_from: * @m: * @scan_from: #TRUE if the scanner should scan From lines. * * Tell the scanner if it should scan "^From " lines or not. * * If the scanner is scanning from lines, two additional * states HSCAN_FROM and HSCAN_FROM_END will be returned * to the caller during parsing. **/ void camel_mime_parser_scan_from(CamelMimeParser *m, int scan_from) { struct _header_scan_state *s = _PRIVATE(m); s->scan_from = scan_from; } /** * camel_mime_parser_content_type: * @m: * * Get the content type defined in the current part. * * Return value: A content_type structure, or NULL if there * is no content-type defined for this part of state of the * parser. **/ struct _header_content_type * camel_mime_parser_content_type(CamelMimeParser *m) { struct _header_scan_state *s = _PRIVATE(m); /* FIXME: should this search up until its found the 'right' content-type? can it? */ if (s->parts) return s->parts->content_type; return NULL; } /** * camel_mime_parser_unstep: * @m: * * Cause the last step operation to repeat itself. If this is * called repeated times, then the same step will be repeated * that many times. * * Note that it is not possible to scan back using this function, * only to have a way of peeking the next state. **/ void camel_mime_parser_unstep(CamelMimeParser *m) { struct _header_scan_state *s = _PRIVATE(m); s->unstep++; } /** * camel_mime_parser_drop_step: * @m: * * Drop the last step call. This should only be used * in conjunction with seeking of the stream as the * stream may be in an undefined state relative to the * state of the parser. * * Use this call with care. **/ void camel_mime_parser_drop_step(CamelMimeParser *m) { struct _header_scan_state *s = _PRIVATE(m); s->unstep = 0; folder_scan_drop_step(s); } /** * camel_mime_parser_step: * @m: * @databuffer: Pointer to accept a pointer to the data * associated with this step (if any). May be #NULL, * in which case datalength is also ingored. * @datalength: Pointer to accept a pointer to the data * length associated with this step (if any). * * Parse the next part of the MIME message. If _unstep() * has been called, then continue to return the same state * for that many calls. * * If the step is HSCAN_BODY then the databuffer and datalength * pointers will be setup to point to the internal data buffer * of the scanner and may be processed as required. Any * filters will have already been applied to this data. * * Refer to the state diagram elsewhere for a full listing of * the states an application is gauranteed to get from the * scanner. * * Return value: The current new state of the parser * is returned. **/ enum _header_state camel_mime_parser_step(CamelMimeParser *m, char **databuffer, int *datalength) { struct _header_scan_state *s = _PRIVATE(m); d(printf("OLD STATE: '%s' :\n", states[s->state])); if (s->unstep <= 0) { char *dummy; int dummylength; if (databuffer == NULL) { databuffer = &dummy; datalength = &dummylength; } folder_scan_step(s, databuffer, datalength); } else s->unstep--; d(printf("NEW STATE: '%s' :\n", states[s->state])); return s->state; } /** * camel_mime_parser_tell: * @m: * * Return the current scanning offset. The meaning of this * value will depend on the current state of the parser. * * An incomplete listing of the states: * * HSCAN_INITIAL, The start of the current message. * HSCAN_HEADER, HSCAN_MESSAGE, HSCAN_MULTIPART, the character * position immediately after the end of the header. * HSCAN_BODY, Position within the message of the start * of the current data block. * HSCAN_*_END, The position of the character starting * the next section of the scan (the last position + 1 of * the respective current state). * * Return value: See above. **/ off_t camel_mime_parser_tell(CamelMimeParser *m) { struct _header_scan_state *s = _PRIVATE(m); return folder_tell(s); } /** * camel_mime_parser_tell_start_headers: * @m: * * Find out the position within the file of where the * headers started, this is cached by the parser * at the time. * * Return value: The header start position, or -1 if * no headers were scanned in the current state. **/ off_t camel_mime_parser_tell_start_headers(CamelMimeParser *m) { struct _header_scan_state *s = _PRIVATE(m); return s->start_of_headers; } /** * camel_mime_parser_tell_start_from: * @m: * * If the parser is scanning From lines, then this returns * the position of the start of the From line. * * Return value: The start of the from line, or -1 if there * was no From line, or From lines are not being scanned. **/ off_t camel_mime_parser_tell_start_from(CamelMimeParser *m) { struct _header_scan_state *s = _PRIVATE(m); return s->start_of_from; } /** * camel_mime_parser_seek: * @m: * @off: Number of bytes to offset the seek by. * @whence: SEEK_SET, SEEK_CUR, SEEK_END * * Reset the source position to a known value. * * Note that if the source stream/descriptor was not * positioned at 0 to begin with, and an absolute seek * is specified (whence != SEEK_CUR), then the seek * position may not match the desired seek position. * * Return value: The new seek offset, or -1 on * an error (for example, trying to seek on a non-seekable * stream or file descriptor). **/ off_t camel_mime_parser_seek(CamelMimeParser *m, off_t off, int whence) { struct _header_scan_state *s = _PRIVATE(m); return folder_seek(s, off, whence); } /** * camel_mime_parser_state: * @m: * * Get the current parser state. * * Return value: The current parser state. **/ enum _header_state camel_mime_parser_state(CamelMimeParser *m) { struct _header_scan_state *s = _PRIVATE(m); return s->state; } /** * camel_mime_parser_stream: * @m: * * Get the stream, if any, the parser has been initialised * with. May be used to setup sub-streams, but should not * be read from directly (without saving and restoring * the seek position in between). * * Return value: The stream from _init_with_stream(), or NULL * if the parser is reading from a file descriptor or is * uninitialised. **/ CamelStream *camel_mime_parser_stream(CamelMimeParser *m) { struct _header_scan_state *s = _PRIVATE(m); return s->stream; } /** * camel_mime_parser_fd: * @m: * * Return the file descriptor, if any, the parser has been * initialised with. * * Should not be read from unless the parser it to terminate, * or the seek offset can be reset before the next parse * step. * * Return value: The file descriptor or -1 if the parser * is reading from a stream or has not been initialised. **/ int camel_mime_parser_fd(CamelMimeParser *m) { struct _header_scan_state *s = _PRIVATE(m); return s->fd; } /* ********************************************************************** */ /* Implementation */ /* ********************************************************************** */ /* read the next bit of data, ensure there is enough room 'atleast' bytes */ static int folder_read(struct _header_scan_state *s) { int len; int inoffset; if (s->inptr<s->inend-s->atleast) return s->inend-s->inptr; #ifdef PURIFY purify_watch_remove(inend_id); purify_watch_remove(inbuffer_id); #endif /* check for any remaning bytes (under the atleast limit( */ inoffset = s->inend - s->inptr; if (inoffset>0) { memcpy(s->inbuf, s->inptr, inoffset); } if (s->stream) { len = camel_stream_read(s->stream, s->inbuf+inoffset, SCAN_BUF-inoffset); } else { len = read(s->fd, s->inbuf+inoffset, SCAN_BUF-inoffset); } r(printf("read %d bytes, offset = %d\n", len, inoffset)); if (len>=0) { /* add on the last read block */ s->seek += s->inptr - s->inbuf; s->inptr = s->inbuf; s->inend = s->inbuf+len+inoffset; r(printf("content = %d '%.*s'\n",s->inend - s->inptr, s->inend - s->inptr, s->inptr)); } g_assert(s->inptr<=s->inend); #ifdef PURIFY inend_id = purify_watch(&s->inend); inbuffer_id = purify_watch_n(s->inend+1, SCAN_HEAD-1, "rw"); #endif r(printf("content = %d '%.*s'\n", s->inend - s->inptr, s->inend - s->inptr, s->inptr)); /* set a sentinal, for the inner loops to check against */ s->inend[0] = '\n'; return s->inend-s->inptr; } /* return the current absolute position of the data pointer */ static off_t folder_tell(struct _header_scan_state *s) { return s->seek + (s->inptr - s->inbuf); } /* need some way to prime the parser state, so this actually works for other than top-level messages */ static off_t folder_seek(struct _header_scan_state *s, off_t offset, int whence) { off_t newoffset; int len; if (s->stream) { if (CAMEL_IS_SEEKABLE_STREAM(s->stream)) { /* NOTE: assumes whence seekable stream == whence libc, which is probably the case (or bloody well should've been) */ newoffset = camel_seekable_stream_seek((CamelSeekableStream *)s->stream, offset, whence); } else { newoffset = -1; errno = EINVAL; } } else { newoffset = lseek(s->fd, offset, whence); } #ifdef PURIFY purify_watch_remove(inend_id); purify_watch_remove(inbuffer_id); #endif if (newoffset != -1) { s->seek = newoffset; s->inptr = s->inbuf; s->inend = s->inbuf; if (s->stream) len = camel_stream_read(s->stream, s->inbuf, SCAN_BUF); else len = read(s->fd, s->inbuf, SCAN_BUF); if (len>=0) { s->inend = s->inbuf+len; s->inend[0] = '\n'; } else newoffset = -1; } #ifdef PURIFY inend_id = purify_watch(&s->inend); inbuffer_id = purify_watch_n(s->inend+1, SCAN_HEAD-1, "rw"); #endif return newoffset; } static void folder_push_part(struct _header_scan_state *s, struct _header_scan_stack *h) { h->parent = s->parts; s->parts = h; } static void folder_pull_part(struct _header_scan_state *s) { struct _header_scan_stack *h; h = s->parts; if (h) { s->parts = h->parent; g_free(h->boundary); #ifdef MEMPOOL mempool_free(h->pool); #else header_raw_clear(&h->headers); #endif header_content_type_unref(h->content_type); g_free(h); } else { g_warning("Header stack underflow!\n"); } } static int folder_scan_skip_line(struct _header_scan_state *s) { int atleast = s->atleast; register char *inptr, *inend, c; int len; s->atleast = 1; while ( (len = folder_read(s)) > 0 && len > s->atleast) { /* ensure we have at least enough room here */ inptr = s->inptr; inend = s->inend-1; c = -1; while (inptr<inend && (c = *inptr++)!='\n') ; s->inptr = inptr; if (c=='\n') { s->atleast = atleast; return 0; } } s->atleast = atleast; return -1; /* not found */ } /* TODO: Is there any way to make this run faster? It gets called a lot ... */ static struct _header_scan_stack * folder_boundary_check(struct _header_scan_state *s, const char *boundary, int *lastone) { struct _header_scan_stack *part; int len = s->atleast-2; /* make sure we dont access past the buffer */ h(printf("checking boundary marker upto %d bytes\n", len)); part = s->parts; while (part) { h(printf(" boundary: %s\n", part->boundary)); h(printf(" against: '%.*s'\n", len, boundary)); if (part->boundary && part->boundarylen <= len && memcmp(boundary, part->boundary, part->boundarylen)==0) { h(printf("matched boundary: %s\n", part->boundary)); /* again, make sure we're in range */ if (part->boundarylen <= len+2) { h(printf("checking lastone\n")); *lastone = (boundary[part->boundarylen]=='-' && boundary[part->boundarylen+1]=='-'); } else { h(printf("not enough room to check last one?\n")); *lastone = FALSE; } /*printf("ok, we found it! : %s \n", (*lastone)?"Last one":"More to come?");*/ return part; } part = part->parent; } return NULL; } #ifdef MEMPOOL static void header_append_mempool(struct _header_scan_state *s, struct _header_scan_stack *h, char *header, int offset) { struct _header_raw *l, *n; char *content; d(printf("Header: %s: %s\n", name, value)); content = strchr(header, ':'); if (content) { register int len; n = mempool_alloc(h->pool, sizeof(*n)); n->next = NULL; len = content-header; n->name = mempool_alloc(h->pool, len+1); memcpy(n->name, header, len); n->name[len] = 0; content++; len = s->outptr - content; n->value = mempool_alloc(h->pool, len+1); memcpy(n->value, content, len); n->value[len] = 0; n->offset = offset; l = (struct _header_raw *)&h->headers; while (l->next) { l = l->next; } l->next = n; } } #define header_raw_append_parse(a, b, c) (header_append_mempool(s, h, b, c)) #endif /* Copy the string start->inptr into the header buffer (s->outbuf), grow if necessary and track the start offset of the header */ /* Basically an optimised version of g_byte_array_append() */ #define header_append(s, start, inptr) \ { \ register int headerlen = inptr-start; \ \ if (headerlen >= (s->outend - s->outptr)) { \ register char *outnew; \ register int len = ((s->outend - s->outbuf)+headerlen)*2+1; \ outnew = g_realloc(s->outbuf, len); \ s->outptr = s->outptr - s->outbuf + outnew; \ s->outbuf = outnew; \ s->outend = outnew + len; \ } \ memcpy(s->outptr, start, headerlen); \ s->outptr += headerlen; \ if (s->header_start == -1) \ s->header_start = (start-s->inbuf) + s->seek; \ } static struct _header_scan_stack * folder_scan_header(struct _header_scan_state *s, int *lastone) { int atleast = s->atleast; char *start; int len; struct _header_scan_stack *part, *overpart = s->parts; struct _header_scan_stack *h; char *inend; register char *inptr; h(printf("scanning first bit\n")); h = g_malloc0(sizeof(*h)); #ifdef MEMPOOL h->pool = mempool_new(8192, 4096); #endif /* FIXME: this info should be cached ? */ part = s->parts; s->atleast = 5; while (part) { if (part->boundary) s->atleast = MAX(s->atleast, part->boundarylen+2); part = part->parent; } #if 0 s->atleast = MAX(s->atleast, 5); if (s->parts) s->atleast = MAX(s->atleast, s->parts->boundarylen+2); #endif *lastone = FALSE; retry: while ((len = folder_read(s))>0 && len >= s->atleast) { /* ensure we have at least enough room here */ inptr = s->inptr; inend = s->inend-s->atleast; while (inptr<=inend) { /*printf(" '%.20s'\n", inptr);*/ start = inptr; if (!s->midline && (part = folder_boundary_check(s, inptr, lastone))) { if ((s->outptr>s->outbuf) || (inptr-start)) goto header_truncated; /* may not actually be truncated */ goto normal_exit; } /* goto next line */ while ((*inptr++)!='\n') ; /* check against the real buffer end, not our 'atleast limited' end */ /* also make sure we have at least 1 char lookahead, so even if we found a \n at the end, well, make out we didn't, and re-scan it next pass */ if (inptr>=s->inend) { inptr--; s->midline = TRUE; } else { s->midline = FALSE; } g_assert(inptr<=s->inend); header_append(s, start, inptr); h(printf("outbuf[0] = %02x '%c' oubuf[1] = %02x '%c'\n", s->outbuf[0], isprint(s->outbuf[0])?s->outbuf[0]:'.', s->outbuf[1], isprint(s->outbuf[1])?s->outbuf[1]:'.')); if (!s->midline && !(inptr[0] == ' ' || inptr[0] == '\t')) { if (s->outbuf[0] == '\n' || (s->outbuf[0] == '\r' && s->outbuf[1]=='\n')) { goto header_done; } /* we always have at least _1_ char here ... */ if (s->outptr[-1] == '\n') s->outptr--; s->outptr[0] = 0; d(printf("header %.10s at %d\n", s->outbuf, s->header_start)); header_raw_append_parse(&h->headers, s->outbuf, s->header_start); if (inptr[0]=='\n' || (inptr[0] == '\r' && inptr[1]=='\n')) { inptr++; goto header_done; } s->outptr = s->outbuf; s->header_start = -1; } } s->inptr = inptr; } /* ok, we're at the end of the data, just make sure we're not missing out some small truncated header markers */ if (overpart) { overpart = overpart->parent; while (overpart) { if (overpart->boundary && (overpart->boundarylen+2) < s->atleast) { s->atleast = overpart->boundarylen+2; h(printf("Retrying next smaller part ...\n")); goto retry; } overpart = overpart->parent; } } if ((s->outptr > s->outbuf) || s->inend > s->inptr) { start = s->inptr; inptr = s->inend; goto header_truncated; } s->atleast = atleast; return h; header_truncated: header_append(s, start, inptr); if (s->outptr>s->outbuf && s->outptr[-1] == '\n') s->outptr--; s->outptr[0] = 0; if (s->outbuf[0] == '\n' || (s->outbuf[0] == '\r' && s->outbuf[1]=='\n')) { goto header_done; } header_raw_append_parse(&h->headers, s->outbuf, s->header_start); header_done: part = s->parts; s->outptr = s->outbuf; normal_exit: s->inptr = inptr; s->atleast = atleast; s->header_start = -1; return h; } static struct _header_scan_stack * folder_scan_content(struct _header_scan_state *s, int *lastone, char **data, int *length) { int atleast = s->atleast; register char *inptr; char *inend; char *start; int len; struct _header_scan_stack *part, *overpart = s->parts; int already_packed = FALSE; /*printf("scanning content\n");*/ /* FIXME: this info should be cached ? */ part = s->parts; s->atleast = 5; while (part) { if (part->boundary) { c(printf("boundary: %s\n", part->boundary)); s->atleast = MAX(s->atleast, part->boundarylen+2); } part = part->parent; } /* s->atleast = MAX(s->atleast, 5);*/ #if 0 if (s->parts) s->atleast = MAX(s->atleast, s->parts->boundarylen+2); #endif *lastone = FALSE; retry: c(printf("atleast = %d\n", s->atleast)); while ((len = folder_read(s))>0 && len >= s->atleast) { /* ensure we have at least enough room here */ inptr = s->inptr; inend = s->inend-s->atleast; start = inptr; c(printf("inptr = %p, inend = %p\n", inptr, inend)); while (inptr<=inend) { if (!s->midline && (part = folder_boundary_check(s, inptr, lastone))) { if ( (inptr-start) ) goto content; goto normal_exit; } /* goto the next line */ while ((*inptr++)!='\n') ; /* check against the real buffer end, not our 'atleast limited' end */ if (inptr> s->inend) { inptr--; s->midline = TRUE; } else { s->midline = FALSE; } g_assert(inptr<=s->inend); } /* *sigh* so much for the beautiful simplicity of the code so far - here we have the snot to deal with the nasty end-cases that come from the read-ahead buffers we use */ /* what this does, is if we are somewhere near the end of the buffer, force it to the front, and re-read, ensuring we bunch as much together as possible, for the final read, without copying too much of the time */ /* make sure we dont loop forever, but also make sure we try smaller boundaries, if there are any, so we dont miss any. */ /* this is not needed for the header scanner, since it copies its own data */ c(printf("start offset = %d atleast = %d\n", start-s->inbuf, s->atleast)); if (start > (s->inbuf + s->atleast)) { /* force a re-scan of this data */ s->inptr = start; if (already_packed) goto smaller_boundary; c(printf("near the end, try and bunch things up a bit first\n")); already_packed = TRUE; } else { c(printf("dumping what i've got ...\n")); /* what would be nice here, is if that we're at eof, we bunch the last little bit in the same content, but i dont think this is easy */ goto content_mid; } } c(printf("length read = %d\n", len)); smaller_boundary: /* ok, we're at the end of the data, just make sure we're not missing out some small truncated header markers */ if (overpart) { overpart = overpart->parent; while (overpart) { if (overpart->boundary && (overpart->boundarylen+2) < s->atleast) { s->atleast = overpart->boundarylen+2; c(printf("Retrying next smaller part ...\n")); goto retry; } overpart = overpart->parent; } } if (s->inend > s->inptr) { start = s->inptr; inptr = s->inend; goto content; } *length = 0; s->atleast = atleast; return NULL; content_mid: s->midline = TRUE; content: part = s->parts; normal_exit: s->atleast = atleast; s->inptr = inptr; *data = start; *length = inptr-start; /* printf("got %scontent: %.*s", s->midline?"partial ":"", inptr-start, start);*/ return part; } static void folder_scan_close(struct _header_scan_state *s) { g_free(s->realbuf); g_free(s->outbuf); while (s->parts) folder_pull_part(s); if (s->fd != -1) close(s->fd); if (s->stream) { gtk_object_unref((GtkObject *)s->stream); } g_free(s); } static struct _header_scan_state * folder_scan_init(void) { struct _header_scan_state *s; s = g_malloc(sizeof(*s)); s->fd = -1; s->stream = NULL; s->outbuf = g_malloc(1024); s->outptr = s->outbuf; s->outend = s->outbuf+1024; s->realbuf = g_malloc(SCAN_BUF + SCAN_HEAD*2); s->inbuf = s->realbuf + SCAN_HEAD; s->inptr = s->inbuf; s->inend = s->inbuf; s->atleast = 0; s->seek = 0; /* current character position in file of the last read block */ s->unstep = 0; s->header_start = -1; s->start_of_from = -1; s->start_of_headers = -1; s->midline = FALSE; s->scan_from = FALSE; s->filters = NULL; s->filterid = 1; s->parts = NULL; s->state = HSCAN_INITIAL; return s; } static int folder_scan_init_with_fd(struct _header_scan_state *s, int fd) { int len; len = read(fd, s->inbuf, SCAN_BUF); if (len>=0) { s->inend = s->inbuf+len; s->inptr = s->inbuf; s->inend[0] = '\n'; if (s->fd != -1) close(s->fd); s->fd = fd; if (s->stream) { gtk_object_unref((GtkObject *)s->stream); s->stream = NULL; } return 0; } else { return -1; } } static int folder_scan_init_with_stream(struct _header_scan_state *s, CamelStream *stream) { int len; len = camel_stream_read(stream, s->inbuf, SCAN_BUF); if (len >= 0) { s->inend = s->inbuf+len; s->inptr = s->inbuf; s->inend[0] = '\n'; if (s->stream) gtk_object_unref((GtkObject *)s->stream); s->stream = stream; gtk_object_ref((GtkObject *)stream); if (s->fd != -1) { close(s->fd); s->fd = -1; } return 0; } else { return -1; } } #define USE_FROM static void folder_scan_step(struct _header_scan_state *s, char **databuffer, int *datalength) { struct _header_scan_stack *h, *hb; const char *content; const char *bound; int type; int state; struct _header_content_type *ct = NULL; struct _header_scan_filter *f; size_t presize; /* printf("\nSCAN PASS: state = %d '%s'\n", s->state, states[s->state]);*/ tail_recurse: d({ printf("\nSCAN STACK:\n"); printf(" '%s' :\n", states[s->state]); hb = s->parts; while (hb) { printf(" '%s' : %s ", states[hb->savestate], hb->boundary); if (hb->content_type) { printf("(%s/%s)", hb->content_type->type, hb->content_type->subtype); } else { printf("(default)"); } printf("\n"); hb = hb->parent; } printf("\n"); }); switch (s->state) { case HSCAN_INITIAL: #ifdef USE_FROM if (s->scan_from) { /* FIXME: it would be nice not to have to allocate this every pass */ h = g_malloc0(sizeof(*h)); h->boundary = g_strdup("From "); h->boundarylen = strlen(h->boundary); folder_push_part(s, h); h = s->parts; do { hb = folder_scan_content(s, &state, databuffer, datalength); } while (hb==h && *datalength>0); if (*datalength==0 && hb==h) { d(printf("found 'From '\n")); s->start_of_from = folder_tell(s); folder_scan_skip_line(s); h->savestate = HSCAN_INITIAL; s->state = HSCAN_FROM; } else { folder_pull_part(s); s->state = HSCAN_EOF; } return; } else { s->start_of_from = -1; } #endif case HSCAN_FROM: s->start_of_headers = folder_tell(s); h = folder_scan_header(s, &state); #ifdef USE_FROM if (s->scan_from) h->savestate = HSCAN_FROM_END; else #endif h->savestate = HSCAN_EOF; /* FIXME: should this check for MIME-Version: 1.0 as well? */ type = HSCAN_HEADER; if ( (content = header_raw_find(&h->headers, "Content-Type", NULL)) && (ct = header_content_type_decode(content))) { if (!strcasecmp(ct->type, "multipart")) { bound = header_content_type_param(ct, "boundary"); if (bound) { d(printf("multipart, boundary = %s\n", bound)); h->boundarylen = strlen(bound)+2; h->boundary = g_malloc(h->boundarylen+3); sprintf(h->boundary, "--%s--", bound); type = HSCAN_MULTIPART; } else { header_content_type_unref(ct); ct = header_content_type_decode("text/plain"); /* We can't quite do this, as it will mess up all the offsets ... */ /* header_raw_replace(&h->headers, "Content-Type", "text/plain", offset);*/ g_warning("Multipart with no boundary, treating as text/plain"); } } else if (!strcasecmp(ct->type, "message")) { if (!strcasecmp(ct->subtype, "rfc822") /*|| !strcasecmp(ct->subtype, "partial")*/) { type = HSCAN_MESSAGE; } } } else { /* make the default type for multipart/digest be message/rfc822 */ if ((s->parts && header_content_type_is(s->parts->content_type, "multipart", "digest"))) { ct = header_content_type_decode("message/rfc822"); type = HSCAN_MESSAGE; d(printf("parent was multipart/digest, autoupgrading to message/rfc822?\n")); /* maybe we should do this too? header_raw_append_parse(&h->headers, "Content-Type: message/rfc822", -1);*/ } } h->content_type = ct; folder_push_part(s, h); s->state = type; return; case HSCAN_HEADER: s->state = HSCAN_BODY; case HSCAN_BODY: h = s->parts; *datalength = 0; presize = SCAN_HEAD; f = s->filters; do { hb = folder_scan_content(s, &state, databuffer, datalength); if (*datalength>0) { d(printf("Content raw: '%.*s'\n", *datalength, *databuffer)); while (f) { camel_mime_filter_filter(f->filter, *databuffer, *datalength, presize, databuffer, datalength, &presize); f = f->next; } return; } } while (hb==h && *datalength>0); /* check for any filter completion data */ while (f) { camel_mime_filter_filter(f->filter, *databuffer, *datalength, presize, databuffer, datalength, &presize); f = f->next; } if (*datalength > 0) return; s->state = HSCAN_BODY_END; break; case HSCAN_MULTIPART: h = s->parts; do { do { hb = folder_scan_content(s, &state, databuffer, datalength); if (*datalength>0) { /* FIXME: needs a state to return this shit??? */ d(printf("Multipart Content: '%.*s'\n", *datalength, *databuffer)); } } while (hb==h && *datalength>0); if (*datalength==0 && hb==h) { d(printf("got boundary: %s\n", hb->boundary)); folder_scan_skip_line(s); if (!state) { s->state = HSCAN_FROM; folder_scan_step(s, databuffer, datalength); s->parts->savestate = HSCAN_MULTIPART; /* set return state for the new head part */ return; } } else { break; } } while (1); s->state = HSCAN_MULTIPART_END; break; case HSCAN_MESSAGE: s->state = HSCAN_FROM; folder_scan_step(s, databuffer, datalength); s->parts->savestate = HSCAN_MESSAGE_END; break; case HSCAN_FROM_END: case HSCAN_BODY_END: case HSCAN_MULTIPART_END: case HSCAN_MESSAGE_END: s->state = s->parts->savestate; folder_pull_part(s); if (s->state & HSCAN_END) return; goto tail_recurse; case HSCAN_EOF: return; default: g_warning("Invalid state in camel-mime-parser: %d", s->state); break; } return; } /* drops the current state back one */ static void folder_scan_drop_step(struct _header_scan_state *s) { switch (s->state) { case HSCAN_INITIAL: case HSCAN_EOF: return; case HSCAN_FROM: s->state = HSCAN_INITIAL; folder_pull_part(s); return; case HSCAN_MESSAGE: case HSCAN_HEADER: case HSCAN_MULTIPART: case HSCAN_FROM_END: case HSCAN_BODY_END: case HSCAN_MULTIPART_END: case HSCAN_MESSAGE_END: s->state = s->parts->savestate; folder_pull_part(s); if (s->state & HSCAN_END) { s->state &= ~HSCAN_END; } return; default: /* FIXME: not sure if this is entirely right */ } } #ifdef STANDALONE int main(int argc, char **argv) { int fd; struct _header_scan_state *s; char *data; int len; int state; char *name = "/tmp/evmail/Inbox"; struct _header_scan_stack *h; int i; int attach = 0; if (argc==2) name = argv[1]; printf("opening: %s", name); for (i=1;i<argc;i++) { const char *encoding = NULL, *charset = NULL; char *attachname; name = argv[i]; printf("opening: %s", name); fd = open(name, O_RDONLY); if (fd==-1) { perror("Cannot open mailbox"); exit(1); } s = folder_scan_init(fd); s->scan_from = FALSE; #if 0 h = g_malloc0(sizeof(*h)); h->savestate = HSCAN_EOF; folder_push_part(s, h); #endif while (s->state != HSCAN_EOF) { folder_scan_step(s, &data, &len); printf("\n -- PARSER STEP RETURN -- %d '%s'\n\n", s->state, states[s->state]); switch (s->state) { case HSCAN_HEADER: if (s->parts->content_type && (charset = header_content_type_param(s->parts->content_type, "charset"))) { if (strcasecmp(charset, "us-ascii")) { folder_push_filter_charset(s, "UTF-8", charset); } else { charset = NULL; } } else { charset = NULL; } encoding = header_raw_find(&s->parts->headers, "Content-transfer-encoding"); printf("encoding = '%s'\n", encoding); if (encoding && !strncasecmp(encoding, " base64", 7)) { printf("adding base64 filter\n"); attachname = g_strdup_printf("attach.%d.%d", i, attach++); folder_push_filter_save(s, attachname); g_free(attachname); folder_push_filter_mime(s, 0); } if (encoding && !strncasecmp(encoding, " quoted-printable", 17)) { printf("adding quoted-printable filter\n"); attachname = g_strdup_printf("attach.%d.%d", i, attach++); folder_push_filter_save(s, attachname); g_free(attachname); folder_push_filter_mime(s, 1); } break; case HSCAN_BODY: break; case HSCAN_BODY_END: if (encoding && !strncasecmp(encoding, " base64", 7)) { printf("removing filters\n"); folder_filter_pull(s); folder_filter_pull(s); } if (encoding && !strncasecmp(encoding, " quoted-printable", 17)) { printf("removing filters\n"); folder_filter_pull(s); folder_filter_pull(s); } if (charset) { folder_filter_pull(s); charset = NULL; } encoding = NULL; break; default: break; } } folder_scan_close(s); close(fd); } return 0; } #endif /* STANDALONE */