/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ /* camel-mbox-parser.c : mbox folder parser */ /* * * Author : Bertrand Guiheneuf * * Copyright (C) 1999 Helix Code . * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 * USA */ #include #include "camel-mbox-parser.h" #include "camel-mbox-utils.h" #include "camel-log.h" #include "camel-exception.h" #include #include #include #include #include #include #define MBOX_PARSER_BUF_SIZE 10000 #define MBOX_PARSER_FROM_KW "from:" #define MBOX_PARSER_FROM_KW_SZ 5 #define MBOX_PARSER_DATE_KW "date:" #define MBOX_PARSER_DATE_KW_SZ 5 #define MBOX_PARSER_SUBJECT_KW "subject:" #define MBOX_PARSER_SUBJECT_KW_SZ 8 #define MBOX_PARSER_TO_KW "to:" #define MBOX_PARSER_TO_KW_SZ 3 #define MBOX_PARSER_X_EVOLUTION_KW "x-evolution:" #define MBOX_PARSER_X_EVOLUTION_KW_SZ 12 /* the maximum lentgh of all the previous keywords */ #define MBOX_PARSER_MAX_KW_SIZE 12 #define MBOX_PARSER_SUMMARY_SIZE 150 typedef struct { int fd; /* file descriptor of the mbox file */ glong real_position; /* real position in the file */ gchar *message_delimiter; /* message delimiter string */ guint message_delimiter_length; guint message_summary_size; /* how many characters from the begining of the mail to put into the message summary */ GArray *preparsed_messages; /* array of MessagePreParsingInfo */ CamelMboxParserMessageInfo current_message_info; /* used to store curent info */ gboolean is_pending_message; /* is there some message information pending ? */ /* buffer info */ gchar *buffer; /* temporary buffer */ guint left_chunk_size; /* size of the left chunk in the temp buffer */ guint last_position; /* last position that can be compared to a keyword */ guint current_position; /* current position in the temp buffer */ /* other */ GString *tmp_string; /* temporary string to fill the headers in */ } CamelMboxPreParser; /* clear a preparsing info structure */ static void clear_message_info (CamelMboxParserMessageInfo *preparsing_info) { CAMEL_LOG_FULL_DEBUG ("Entering CamelMboxParser::clear_message_info\n"); preparsing_info->message_position = 0; preparsing_info->size = 0; preparsing_info->from = NULL; preparsing_info->date = NULL; preparsing_info->subject = NULL; preparsing_info->priority = NULL; preparsing_info->references = NULL; preparsing_info->body_summary = NULL; preparsing_info->end_of_headers_offset = 0; preparsing_info->x_evolution_offset = 0; preparsing_info->status = 0; preparsing_info->uid = 0; CAMEL_LOG_FULL_DEBUG ("Leaving CamelMboxParser::clear_message_info\n"); } /** * new_parser: create a new parser object * @fd: file descriptor opened on the mbox file * @message_delimiter: the string that announce the start of a new message. * * Create a new parser object. This object is the place where are * stored all the information concerning the parsing process. * * Return value: The newly created parser object. **/ static CamelMboxPreParser * new_parser (int fd, const gchar *message_delimiter) { CamelMboxPreParser *parser; CAMEL_LOG_FULL_DEBUG ("Entering CamelMboxParser::ew_parser\n"); parser = g_new0 (CamelMboxPreParser, 1); parser->fd = fd; parser->buffer = g_new (gchar, MBOX_PARSER_BUF_SIZE); parser->current_position = 0; parser->message_delimiter = g_strdup (message_delimiter); parser->message_delimiter_length = strlen (message_delimiter); parser->real_position = 0; parser->preparsed_messages = g_array_new (FALSE, FALSE, sizeof (CamelMboxParserMessageInfo)); parser->message_summary_size = MBOX_PARSER_SUMMARY_SIZE; parser->left_chunk_size = MAX (parser->message_delimiter_length, MBOX_PARSER_MAX_KW_SIZE); parser->tmp_string = g_string_sized_new (1000); CAMEL_LOG_FULL_DEBUG ("Leaving CamelMboxParser::ew_parser\n"); return parser; } /** * parser_free: free the parser object * @parser: the parser objet to free. * * it is important to notice that all structures allocated * in new_parser () are freed ** EXCEPT ** the message * information array, i.e. the preparsed_messages * field. **/ static void parser_free (CamelMboxPreParser *parser) { CAMEL_LOG_FULL_DEBUG ("Entering CamelMboxParser::parser_free\n"); g_free (parser->buffer); g_free (parser->message_delimiter); g_string_free (parser->tmp_string, TRUE); g_free (parser); CAMEL_LOG_FULL_DEBUG ("Leaving CamelMboxParser::parser_free\n"); } /* ** handle exceptions here */ /** * initialize_buffer: read the first chunk of data in the buffer * @parser: parser object to fill * @first_position: position to start the read at * * read the first chunk of data from the mbox file. * **/ static void initialize_buffer (CamelMboxPreParser *parser, glong first_position) { gint seek_res; gint buf_nb_read; CAMEL_LOG_FULL_DEBUG ("Entering CamelMboxParser::intialize_buffer\n"); g_assert (parser); /* set the search start position */ seek_res = lseek (parser->fd, first_position, SEEK_SET); //if (seek_res == (off_t)-1) goto io_error; /* the first part of the buffer is filled with newlines, but the next time a chunk of buffer is read, it will be filled with the last bytes of the previous chunk. This allows simple g_strcasecmp to test for the presence of the keyword */ memset (parser->buffer, '\n', parser->left_chunk_size); do { buf_nb_read = read (parser->fd, parser->buffer + parser->left_chunk_size, MBOX_PARSER_BUF_SIZE - parser->left_chunk_size); } while ((buf_nb_read == -1) && (errno == EINTR)); /* ** check for an error here */ if (buf_nb_read < MBOX_PARSER_BUF_SIZE - parser->left_chunk_size) { /* fill the end of the buffer with 0\ */ memset (parser->buffer + buf_nb_read + parser->left_chunk_size, '\0', MIN (parser->left_chunk_size, MBOX_PARSER_BUF_SIZE - buf_nb_read - parser->left_chunk_size)); }; parser->last_position = MIN (buf_nb_read + parser->left_chunk_size + 1, MBOX_PARSER_BUF_SIZE - parser->left_chunk_size); parser->current_position = parser->left_chunk_size; CAMEL_LOG_FULL_DEBUG ("Leaving CamelMboxParser::intialize_buffer\n"); } /** * read_next_buffer_chunk: read the next chunk of data in the mbox file * @parser: parser object * * read the next chunk of data in the mbox file. * Routine copies the last part of the buffer at * the begining are concatenate the read data to * it. This allows strcmp of keywords in the buffer, * until the last postion. That means you can * do a strcmp (buffer, keyword) for any of the * keyword defined at the begining of this file. * **/ static void read_next_buffer_chunk (CamelMboxPreParser *parser) { gint buf_nb_read; g_assert (parser); CAMEL_LOG_FULL_DEBUG ("Leaving CamelMboxParser::intialize_buffer\n"); /* read the next chunk of data in the folder file : */ /* - first, copy the last bytes from the previous chunk at the begining of the new one. */ memcpy (parser->buffer, parser->buffer + MBOX_PARSER_BUF_SIZE - parser->left_chunk_size, parser->left_chunk_size); /* - then read the next chunk on disk */ do { buf_nb_read = read (parser->fd, parser->buffer + parser->left_chunk_size, MBOX_PARSER_BUF_SIZE - parser->left_chunk_size); } while ((buf_nb_read == -1) && (errno == EINTR)); /* ** check for an error here */ if (buf_nb_read < MBOX_PARSER_BUF_SIZE - parser->left_chunk_size) { /* fill the end of the buffer with 0\ */ memset (parser->buffer + buf_nb_read + parser->left_chunk_size, '\0', MIN (parser->left_chunk_size, MBOX_PARSER_BUF_SIZE - buf_nb_read - parser->left_chunk_size)); }; parser->last_position = MIN (buf_nb_read + parser->left_chunk_size + 1, MBOX_PARSER_BUF_SIZE - parser->left_chunk_size); parser->current_position = 0; CAMEL_LOG_FULL_DEBUG ("Leaving CamelMboxParser::intialize_buffer\n"); } /** * goto_next_char: go one postion forward in the buffer * @parser: parser object * * goto one position forward in the buffer. If necessary, * read the next chunk of data in the file. * **/ static void goto_next_char (CamelMboxPreParser *parser) { if (parser->current_position < parser->last_position - 1) parser->current_position++; else read_next_buffer_chunk (parser); parser->real_position++; } /** * advance_n_chars: go n positions forward in the buffer. * @parser: parser object * @n: number of characters to advance. * **/ static void advance_n_chars (CamelMboxPreParser *parser, guint n) { gint position_to_the_end; CAMEL_LOG_FULL_DEBUG ("Entering CamelMboxParser::advnce_n_chars\n"); position_to_the_end = parser->last_position - parser->current_position; if (n < position_to_the_end) parser->current_position += n; else { read_next_buffer_chunk (parser); parser->current_position = n - position_to_the_end; } parser->real_position += n; CAMEL_LOG_FULL_DEBUG ("Leaving CamelMboxParser::advance_n_chars\n"); } /* called when the buffer has detected the begining of a new message. This routine is supposed to simply store the previous message information and clean the temporary structure used to store the informations */ /** * new_message_detected: routine to call when a new message has been detected * @parser: parser object. * * this routine must be called when the keyword determining the * begining of a new message has been detected. It pushes the * information fetched for the last message into the message information * array. Also, it gets the parser to the end of the line. **/ static void new_message_detected (CamelMboxPreParser *parser) { gchar c; CAMEL_LOG_FULL_DEBUG ("Entering CamelMboxParser::new_message_detected\n"); /* if we were filling a message information save it in the message information array */ if (parser->is_pending_message) { parser->current_message_info.size = parser->real_position - parser->current_message_info.message_position; g_array_append_vals (parser->preparsed_messages, (gchar *)parser + G_STRUCT_OFFSET (CamelMboxPreParser, current_message_info), 1); } clear_message_info ( &(parser->current_message_info)); /* go to the end of the line */ do { c = parser->buffer[parser->current_position]; goto_next_char (parser); } while (c != '\n'); /* save message position in the message information structure */ (parser->current_message_info).message_position = parser->real_position; parser->is_pending_message = TRUE; CAMEL_LOG_FULL_DEBUG ("Leaving CamelMboxParser::new_message_detected\n"); } /** * read_header: read the header content contained after the current position. * @parser: the parser object. * @header_content: a pointer on a (char *) variable to feed with the obtained header string. * * This routine must be called when the parser has detected a header * and it wants the header content to be stored. The parser current position * must EXACTLY be located at the begining of the header content line. * For example, if the file contains the line : * from:Bertrand Guiheneuf * * When this routine is called, the parser must be located * on the "B" of "Bertrand". * * When this routine returns, the parser is located just * after the "\n" at the end of the header content. * **/ static void read_header (CamelMboxPreParser *parser, gchar **header_content) { gboolean space = FALSE; gboolean newline = FALSE; gboolean header_end = FALSE; gchar *buffer; gchar c; CAMEL_LOG_FULL_DEBUG ("Entering CamelMboxParser::read_header\n"); g_assert (parser); /* reset the header buffer string */ parser->tmp_string = g_string_truncate (parser->tmp_string, 0); buffer = parser->buffer; /* read the current character */ c = buffer[parser->current_position]; while (! ((c == '\0') || header_end )) { if (space) { if (c == ' ' && c == '\t') goto next_char; else space = FALSE; } if (newline) { if (c == ' ' && c == '\t') { space = TRUE; newline = FALSE; goto next_char; } else { header_end = TRUE; continue; } } if (c == '\n') { newline = TRUE; goto next_char; } /* feed the header content */ parser->tmp_string = g_string_append_c (parser->tmp_string, c); next_char: /* read next char in the buffer */ goto_next_char (parser); /* read the current character */ c = buffer[parser->current_position]; } /* copy the buffer in the preparsing information structure */ *header_content = g_strndup (parser->tmp_string->str, parser->tmp_string->len); CAMEL_LOG_FULL_DEBUG ("Leaving CamelMboxParser::read_header\n"); } /** * read_message_begining: read the first characters of a message body * @parser: parser object * @message_summary: a pointer on a (gchar *) variable where the obtained string will be stored. * * Read the first lines of a message. When calling this routine, the * parser must be located at the begining of the message body. * * Return value: if the parsing inside this routine last read a newline, then %TRUE is returned, otherwise %FALSE is returned **/ static gboolean read_message_begining (CamelMboxPreParser *parser, gchar **message_summary) { guint nb_read = 0; gchar *buffer; gboolean new_message = FALSE; guint nb_line = 0; g_assert (parser); CAMEL_LOG_FULL_DEBUG ("Entering CamelMboxParser::read_message_begining\n"); /* reset the header buffer string */ parser->tmp_string = g_string_truncate (parser->tmp_string, 0); buffer = parser->buffer; /* the message should not be filled character by character but there is no g_string_n_append function, so for the moment, this is a lazy implementation */ while (! (buffer[parser->current_position] != '\0') && (nb_line <2) && (nb_readmessage_summary_size) && (!new_message)) { /* test if we are not at the end of the message */ if (buffer[parser->current_position] == '\n') { nb_line++; goto_next_char (parser); if ((buffer[parser->current_position] == '\0') || (g_strncasecmp (parser->buffer + parser->current_position, parser->message_delimiter, parser->message_delimiter_length) == 0)) { new_message = TRUE; continue; } else { /* we're not at the end, so let's just add the cr to the summary */ parser->tmp_string = g_string_append_c (parser->tmp_string, '\n'); nb_read++; continue; } } parser->tmp_string = g_string_append_c (parser->tmp_string, buffer[parser->current_position]); nb_read++; goto_next_char (parser); } *message_summary = g_strndup (parser->tmp_string->str, parser->tmp_string->len); CAMEL_LOG_FULL_DEBUG ("Leaving CamelMboxParser::read_message_begining\n"); return new_message; } /** * camel_mbox_parse_file: read an mbox file and parse it. * @fd: file descriptor opened on the mbox file. * @message_delimiter: character string delimiting the beginig of a new message * @start_position: poition in the file where to start the parsing. * @get_message_summary: should the parser retrieve the begining of the messages * @status_callback: function to call peridically to indicate the progress of the parser * @status_interval: floating value between 0 and 1 indicate how often to call @status_callback. * @user_data: user data that will be passed to the callback function * * This routine parses an mbox file and retreives both the message starting positions and * some of the informations contained in the message. Those informations are mainly * some RFC822 headers values but also (optionally) the first characters of the mail * body. The @get_message_summary parameter allows to enable or disable this option. * * * Return value: An array of CamelMboxParserMessageInfo containing the informations on each message parsed in the file **/ GArray * camel_mbox_parse_file (int fd, const gchar *message_delimiter, glong start_position, guint32 *file_size, guint32 *next_uid, gboolean get_message_summary, camel_mbox_preparser_status_callback *status_callback, double status_interval, gpointer user_data) { CamelMboxPreParser *parser; gboolean is_parsing_a_message = FALSE; gchar c; struct stat stat_buf; gint fstat_result; glong total_file_size; int last_status = 0; int real_interval; gboolean newline; GArray *return_value; gchar *x_ev_header_content; guint32 next_available_uid = 1; g_assert (next_uid); CAMEL_LOG_FULL_DEBUG ("Entering CamelMboxParser::parse_file\n"); /* get file size */ fstat_result = fstat (fd, &stat_buf); if (fstat_result == -1) { g_warning ("Manage exception here \n"); } total_file_size = stat_buf.st_size; real_interval = status_interval * total_file_size; /* create the parser */ parser = new_parser (fd, message_delimiter); /* initialize the temporary char buffer */ initialize_buffer (parser, start_position); /* the first line is indeed at the begining of a new line ... */ newline = TRUE; while (parser->buffer[parser->current_position] != '\0') { /* read the current character */ if (!newline) { c = parser->buffer[parser->current_position]; newline = (c == '\n'); goto_next_char (parser); } if (newline) { /* check if we reached a status milestone */ if ( status_callback && ((parser->real_position - last_status) > real_interval)) { last_status += real_interval; status_callback ((double)last_status / (double)total_file_size, user_data); } /* is the next part a message delimiter ? */ if (strncmp (parser->buffer + parser->current_position, parser->message_delimiter, parser->message_delimiter_length) == 0) { is_parsing_a_message = TRUE; new_message_detected (parser); newline = TRUE; continue; } if (is_parsing_a_message) { /* we could find the headers in a clever way, like storing them in a list of pair [keyword, offset_in_CamelMboxParserMessageInfo] I am too busy for now. Contribution welcome */ /* is the next part a "from" header ? */ if (g_strncasecmp (parser->buffer + parser->current_position, MBOX_PARSER_FROM_KW, MBOX_PARSER_FROM_KW_SZ) == 0) { advance_n_chars (parser, MBOX_PARSER_FROM_KW_SZ); read_header (parser, (gchar **) ((gchar *)parser + G_STRUCT_OFFSET (CamelMboxPreParser, current_message_info) + G_STRUCT_OFFSET (CamelMboxParserMessageInfo, from))); newline = TRUE; continue; } /* is the next part a "Date" header ? */ if (g_strncasecmp (parser->buffer + parser->current_position, MBOX_PARSER_DATE_KW, MBOX_PARSER_DATE_KW_SZ) == 0) { advance_n_chars (parser, MBOX_PARSER_DATE_KW_SZ); read_header (parser, (gchar **) ((gchar *)parser + G_STRUCT_OFFSET (CamelMboxPreParser, current_message_info) + G_STRUCT_OFFSET (CamelMboxParserMessageInfo, date))); newline = TRUE; continue; } /* is the next part a "Subject" header ? */ if (g_strncasecmp (parser->buffer + parser->current_position, MBOX_PARSER_SUBJECT_KW, MBOX_PARSER_SUBJECT_KW_SZ) == 0) { advance_n_chars (parser, MBOX_PARSER_SUBJECT_KW_SZ); read_header (parser, (gchar **) ((gchar *)parser + G_STRUCT_OFFSET (CamelMboxPreParser, current_message_info) + G_STRUCT_OFFSET (CamelMboxParserMessageInfo, subject))); newline = TRUE; continue; } /* is the next part a "To" header ? */ if (g_strncasecmp (parser->buffer + parser->current_position, MBOX_PARSER_TO_KW, MBOX_PARSER_TO_KW_SZ) == 0) { advance_n_chars (parser, MBOX_PARSER_TO_KW_SZ); read_header (parser, (gchar **) ((gchar *)parser + G_STRUCT_OFFSET (CamelMboxPreParser, current_message_info) + G_STRUCT_OFFSET (CamelMboxParserMessageInfo, to))); newline = TRUE; continue; } /* is the next part a "X-evolution" header ? */ if (g_strncasecmp (parser->buffer + parser->current_position, MBOX_PARSER_X_EVOLUTION_KW, MBOX_PARSER_X_EVOLUTION_KW_SZ) == 0) { /* in the case of the evolution private field, we store the field position as well as its length because we will have to change them */ parser->current_message_info.x_evolution_offset = parser->real_position - parser->current_message_info.message_position; advance_n_chars (parser, MBOX_PARSER_X_EVOLUTION_KW_SZ); /* read the header */ read_header (parser, &x_ev_header_content); /* parse it and put the result in the uid and status fields */ camel_mbox_xev_parse_header_content (x_ev_header_content, (guint32 *) ((gchar *)parser + G_STRUCT_OFFSET (CamelMboxPreParser, current_message_info) + G_STRUCT_OFFSET (CamelMboxParserMessageInfo, uid)), (guchar *) ((gchar *)parser + G_STRUCT_OFFSET (CamelMboxPreParser, current_message_info) + G_STRUCT_OFFSET (CamelMboxParserMessageInfo, status))); g_free (x_ev_header_content); next_available_uid = MAX (next_available_uid, parser->current_message_info.uid); newline = TRUE; continue; } /* is it an empty line ? */ if (parser->buffer[parser->current_position] == '\n') { parser->current_message_info.end_of_headers_offset = parser->real_position - parser->current_message_info.message_position; goto_next_char (parser); if (get_message_summary) newline = read_message_begining (parser, (gchar **) ((gchar *)parser + G_STRUCT_OFFSET (CamelMboxPreParser, current_message_info) + G_STRUCT_OFFSET (CamelMboxParserMessageInfo, body_summary))); is_parsing_a_message = FALSE; continue; } } newline = FALSE; } } /* if there is a pending message information put it in the array */ if (parser->is_pending_message) { parser->current_message_info.size = parser->real_position - parser->current_message_info.message_position; g_array_append_vals (parser->preparsed_messages, (gchar *)parser + G_STRUCT_OFFSET (CamelMboxPreParser, current_message_info), 1); } return_value = parser->preparsed_messages; *file_size = parser->real_position; *next_uid = next_available_uid; /* free the parser */ parser_free (parser); CAMEL_LOG_FULL_DEBUG ("Entering CamelMboxParser::parse_file\n"); return return_value; } #ifdef MBOX_PARSER_TEST /* to build the test : gcc -O3 -I/opt/gnome/lib/glib/include `glib-config --cflags` -o test_parser -DMBOX_PARSER_TEST -I ../.. -I ../../.. -I /usr/lib/glib/include camel-mbox-parser.c `glib-config --libs` -lm */ #include static void status (double done, gpointer user_data) { printf ("%d %% done\n", (int)floor (done * 100)); } int main (int argc, char **argv) { int test_file_fd; int i; int file_size; int next_uid; GArray *message_positions; CamelMboxParserMessageInfo *message_info; gchar tmp_buffer[50]; tmp_buffer[49] = '\0'; if (argc<2) { printf("usage: %s mbox\n", argv[0]); return 1; } test_file_fd = open (argv[1], O_RDONLY); message_positions = camel_mbox_parse_file (test_file_fd, "From ", 0, &file_size, &next_uid, TRUE, status, 0.05, NULL); printf ("Found %d messages \n", message_positions->len); for (i=0; ilen; i++) { message_info = ((CamelMboxParserMessageInfo *)(message_positions->data)) + i; printf ("\n\n** Message %d : \n", i); printf ("Size : %d\n", message_info->size); printf ("From: %s\n", message_info->from); printf ("Date: %s\n", message_info->date); printf ("Subject: %s\n", message_info->subject); printf ("Summary: %s\n", message_info->body_summary) ; lseek (test_file_fd, message_info->message_position, SEEK_SET); read (test_file_fd, tmp_buffer, 49); printf ("File content at position %d : \n===\n%s\n===\n", message_info->message_position, tmp_buffer); } return 0; } #endif /* MBOX_PARSER_TEST */