/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
/* camel-mbox-parser.c : mbox folder parser */
/*
*
* Author : Bertrand Guiheneuf <bertrand@helixcode.com>
*
* Copyright (C) 1999 Helix Code .
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation; either version 2 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
* USA
*/
#include <config.h>
#include "camel-mbox-parser.h"
#include "camel-mbox-utils.h"
#include "camel-log.h"
#include "camel-exception.h"
#include <sys/types.h>
#include <unistd.h>
#include <errno.h>
#include <string.h>
#include <sys/stat.h>
#include <fcntl.h>
/* size, in bytes, of the temporary buffer the mbox file is read into */
#define MBOX_PARSER_BUF_SIZE 10000
/* header keywords searched for at the beginning of a line, each followed by
   its length; the parser compares them with g_strncasecmp, i.e. without
   regard to case */
#define MBOX_PARSER_FROM_KW "from:"
#define MBOX_PARSER_FROM_KW_SZ 5
#define MBOX_PARSER_DATE_KW "date:"
#define MBOX_PARSER_DATE_KW_SZ 5
#define MBOX_PARSER_SUBJECT_KW "subject:"
#define MBOX_PARSER_SUBJECT_KW_SZ 8
#define MBOX_PARSER_TO_KW "to:"
#define MBOX_PARSER_TO_KW_SZ 3
#define MBOX_PARSER_X_EVOLUTION_KW "x-evolution:"
#define MBOX_PARSER_X_EVOLUTION_KW_SZ 12
/* the maximum length of all the previous keywords; together with the message
   delimiter length it determines how many bytes are carried over from one
   buffer chunk to the next (see new_parser) */
#define MBOX_PARSER_MAX_KW_SIZE 12
/* initial value of message_summary_size: the maximum number of body
   characters copied into a message summary */
#define MBOX_PARSER_SUMMARY_SIZE 150
/* state of one pre-parsing run over an mbox file */
typedef struct {
int fd; /* file descriptor of the mbox file */
glong real_position; /* position in the file; initialised to 0 by new_parser and
                        incremented for every character consumed */
gchar *message_delimiter; /* message delimiter string (private copy) */
guint message_delimiter_length; /* strlen () of message_delimiter */
guint message_summary_size; /* how many characters from the beginning of the
                               mail to put into the message summary */
GArray *preparsed_messages; /* array of CamelMboxParserMessageInfo */
CamelMboxParserMessageInfo current_message_info; /* used to store current info */
gboolean is_pending_message; /* is there some message information pending ? */
/* buffer info */
gchar *buffer; /* temporary buffer of MBOX_PARSER_BUF_SIZE bytes */
guint left_chunk_size; /* size of the left chunk in the temp buffer, i.e. the
                          number of bytes copied from the end of one chunk
                          to the beginning of the next one */
guint last_position; /* last position that can be compared to a keyword */
guint current_position; /* current position in the temp buffer */
/* other */
GString *tmp_string; /* temporary string to fill the headers in */
} CamelMboxPreParser;
/**
 * clear_message_info: reset a message information structure
 * @preparsing_info: the structure to reset.
 *
 * Set every numeric field to 0 and every string field to NULL.
 * The strings are NOT freed: when this routine is called from
 * new_message_detected () the structure has just been copied by value
 * into the message information array, so the array entry still
 * references them.
 *
 * The "to" field has to be reset like the other header fields. Otherwise
 * a message without a "To:" header would keep the pointer of the previous
 * message, and two array entries would share (and later double free) the
 * same string.
 **/
static void
clear_message_info (CamelMboxParserMessageInfo *preparsing_info)
{
	CAMEL_LOG_FULL_DEBUG ("Entering CamelMboxParser::clear_message_info\n");

	preparsing_info->message_position = 0;
	preparsing_info->size = 0;
	preparsing_info->from = NULL;
	preparsing_info->to = NULL;
	preparsing_info->date = NULL;
	preparsing_info->subject = NULL;
	preparsing_info->priority = NULL;
	preparsing_info->references = NULL;
	preparsing_info->body_summary = NULL;
	preparsing_info->end_of_headers_offset = 0;
	preparsing_info->x_evolution_offset = 0;
	preparsing_info->status = 0;
	preparsing_info->uid = 0;

	CAMEL_LOG_FULL_DEBUG ("Leaving CamelMboxParser::clear_message_info\n");
}
/**
 * new_parser: create a new parser object
 * @fd: file descriptor opened on the mbox file
 * @message_delimiter: the string that announces the start of a new message.
 *
 * Create a new parser object. This object is the place where all the
 * information concerning the parsing process is stored. The delimiter
 * string is copied; the read buffer is allocated but not filled (see
 * initialize_buffer ()).
 *
 * Return value: The newly created parser object, to be released
 * with parser_free ().
 **/
static CamelMboxPreParser *
new_parser (int fd,
	    const gchar *message_delimiter)
{
	CamelMboxPreParser *parser;

	/* the trace messages used to read "ew_parser", which made them
	   impossible to match against the function name */
	CAMEL_LOG_FULL_DEBUG ("Entering CamelMboxParser::new_parser\n");

	parser = g_new0 (CamelMboxPreParser, 1);

	parser->fd = fd;
	parser->buffer = g_new (gchar, MBOX_PARSER_BUF_SIZE);
	parser->current_position = 0;
	parser->message_delimiter = g_strdup (message_delimiter);
	parser->message_delimiter_length = strlen (message_delimiter);
	parser->real_position = 0;
	parser->preparsed_messages = g_array_new (FALSE, FALSE, sizeof (CamelMboxParserMessageInfo));
	parser->message_summary_size = MBOX_PARSER_SUMMARY_SIZE;

	/* the carried-over chunk must be able to hold the longest string
	   the parser compares against: a keyword or the delimiter */
	parser->left_chunk_size = MAX (parser->message_delimiter_length, MBOX_PARSER_MAX_KW_SIZE);
	parser->tmp_string = g_string_sized_new (1000);

	CAMEL_LOG_FULL_DEBUG ("Leaving CamelMboxParser::new_parser\n");

	return parser;
}
/**
 * parser_free: release a parser object
 * @parser: the parser object to release.
 *
 * Releases the temporary string, the private copy of the message
 * delimiter, the read buffer and the parser structure itself.
 * The preparsed_messages array is deliberately ** NOT ** freed.
 **/
static void
parser_free (CamelMboxPreParser *parser)
{
	CAMEL_LOG_FULL_DEBUG ("Entering CamelMboxParser::parser_free\n");

	/* members first, in any order, then the container */
	g_string_free (parser->tmp_string, TRUE);
	g_free (parser->message_delimiter);
	g_free (parser->buffer);
	g_free (parser);

	CAMEL_LOG_FULL_DEBUG ("Leaving CamelMboxParser::parser_free\n");
}
/**
 * initialize_buffer: read the first chunk of data in the buffer
 * @parser: parser object to fill
 * @first_position: position to start the read at
 *
 * Seek to @first_position in the mbox file and read the first chunk
 * of data after the left chunk of the buffer.
 *
 * If the seek or the read fails, a warning is emitted and the buffer
 * is set up as if the end of the file had been reached, so that the
 * parser only ever looks at initialised data.
 **/
static void
initialize_buffer (CamelMboxPreParser *parser,
		   glong first_position)
{
	guint chunk_capacity;
	guint nb_valid;
	gint buf_nb_read;

	CAMEL_LOG_FULL_DEBUG ("Entering CamelMboxParser::initialize_buffer\n");
	g_assert (parser);

	/* room available for file data after the left chunk */
	chunk_capacity = MBOX_PARSER_BUF_SIZE - parser->left_chunk_size;

	/* the first part of the buffer is filled with newlines,
	   but the next time a chunk of buffer is read, it will
	   be filled with the last bytes of the previous chunk.
	   This allows simple g_strcasecmp to test for the presence of
	   the keyword */
	memset (parser->buffer, '\n', parser->left_chunk_size);

	/* set the search start position */
	if (lseek (parser->fd, first_position, SEEK_SET) == (off_t) -1) {
		g_warning ("CamelMboxParser: cannot seek in mbox file: %s", strerror (errno));
		buf_nb_read = 0;
	} else {
		do {
			buf_nb_read = read (parser->fd,
					    parser->buffer + parser->left_chunk_size,
					    chunk_capacity);
		} while ((buf_nb_read == -1) && (errno == EINTR));

		if (buf_nb_read < 0) {
			/* a -1 would otherwise be compared as a huge unsigned
			   value below and the end marker would not be written */
			g_warning ("CamelMboxParser: cannot read mbox file: %s", strerror (errno));
			buf_nb_read = 0;
		}
	}
	nb_valid = (guint) buf_nb_read;

	if (nb_valid < chunk_capacity) {
		/* fill the end of the buffer with \0 */
		memset (parser->buffer + nb_valid + parser->left_chunk_size, '\0',
			MIN (parser->left_chunk_size, chunk_capacity - nb_valid));
	}

	parser->last_position = MIN (nb_valid + parser->left_chunk_size + 1, chunk_capacity);
	parser->current_position = parser->left_chunk_size;

	CAMEL_LOG_FULL_DEBUG ("Leaving CamelMboxParser::initialize_buffer\n");
}
/**
 * read_next_buffer_chunk: read the next chunk of data in the mbox file
 * @parser: parser object
 *
 * Read the next chunk of data in the mbox file.
 * The routine copies the last part of the buffer to
 * the beginning and puts the newly read data right after
 * it. This allows keyword comparisons anywhere in the buffer
 * up to the last position: any of the keywords defined at
 * the beginning of this file can be compared at any position
 * without running past the buffer.
 *
 * A read error is reported with a warning and then handled
 * like the end of the file, so that the end marker is always
 * written.
 **/
static void
read_next_buffer_chunk (CamelMboxPreParser *parser)
{
	guint chunk_capacity;
	guint nb_valid;
	gint buf_nb_read;

	g_assert (parser);
	CAMEL_LOG_FULL_DEBUG ("Entering CamelMboxParser::read_next_buffer_chunk\n");

	/* room available for file data after the left chunk */
	chunk_capacity = MBOX_PARSER_BUF_SIZE - parser->left_chunk_size;

	/* read the next chunk of data in the folder file : */
	/* - first, copy the last bytes from the previous
	   chunk at the beginning of the new one. */
	memcpy (parser->buffer,
		parser->buffer + chunk_capacity,
		parser->left_chunk_size);

	/* - then read the next chunk on disk */
	do {
		buf_nb_read = read (parser->fd,
				    parser->buffer + parser->left_chunk_size,
				    chunk_capacity);
	} while ((buf_nb_read == -1) && (errno == EINTR));

	if (buf_nb_read < 0) {
		/* a -1 would otherwise be compared as a huge unsigned
		   value below and the end marker would not be written */
		g_warning ("CamelMboxParser: cannot read mbox file: %s", strerror (errno));
		buf_nb_read = 0;
	}
	nb_valid = (guint) buf_nb_read;

	if (nb_valid < chunk_capacity) {
		/* fill the end of the buffer with \0 */
		memset (parser->buffer + nb_valid + parser->left_chunk_size, '\0',
			MIN (parser->left_chunk_size, chunk_capacity - nb_valid));
	}

	parser->last_position = MIN (nb_valid + parser->left_chunk_size + 1, chunk_capacity);
	parser->current_position = 0;

	CAMEL_LOG_FULL_DEBUG ("Leaving CamelMboxParser::read_next_buffer_chunk\n");
}
/**
 * goto_next_char: step one character forward
 * @parser: parser object
 *
 * Move the parser one position forward. When the current position
 * is the last usable one of the buffer, the next chunk of the file
 * is loaded instead of incrementing the position. The file position
 * counter is incremented in both cases.
 **/
static void
goto_next_char (CamelMboxPreParser *parser)
{
	gboolean room_left = parser->current_position < parser->last_position - 1;

	if (!room_left)
		read_next_buffer_chunk (parser);
	else
		parser->current_position++;

	parser->real_position++;
}
/**
 * advance_n_chars: go n positions forward in the buffer.
 * @parser: parser object
 * @n: number of characters to advance.
 *
 * Move the parser @n characters forward. If this crosses the last
 * position of the buffer, the next chunk is read and the position is
 * set to the remainder. The file position counter is increased by @n.
 **/
static void
advance_n_chars (CamelMboxPreParser *parser, guint n)
{
	/* unsigned like its operands: the comparison with @n was already
	   carried out in unsigned arithmetic */
	guint position_to_the_end;

	CAMEL_LOG_FULL_DEBUG ("Entering CamelMboxParser::advance_n_chars\n");

	position_to_the_end = parser->last_position - parser->current_position;

	if (n < position_to_the_end)
		parser->current_position += n;
	else {
		read_next_buffer_chunk (parser);
		/* what is left to skip once the old chunk is exhausted */
		parser->current_position = n - position_to_the_end;
	}

	parser->real_position += n;

	CAMEL_LOG_FULL_DEBUG ("Leaving CamelMboxParser::advance_n_chars\n");
}
/**
 * new_message_detected: routine to call when a new message has been detected
 * @parser: parser object.
 *
 * This routine must be called when the keyword determining the
 * beginning of a new message has been detected. It pushes the
 * information fetched for the previous message (if any) into the
 * message information array, resets the temporary structure used to
 * collect that information, and moves the parser to the end of the
 * delimiter line.
 *
 * The parser ends up just after the "\n" of the delimiter line, or on
 * the end-of-data marker if the data ends before a newline is found.
 **/
static void
new_message_detected (CamelMboxPreParser *parser)
{
	gchar c;

	CAMEL_LOG_FULL_DEBUG ("Entering CamelMboxParser::new_message_detected\n");

	/* if we were filling a message information
	   save it in the message information array */
	if (parser->is_pending_message) {
		parser->current_message_info.size =
			parser->real_position - parser->current_message_info.message_position;
		g_array_append_vals (parser->preparsed_messages,
				     &parser->current_message_info, 1);
	}

	clear_message_info (&parser->current_message_info);

	/* go to the end of the line; never step over the '\0' end marker,
	   what lies behind it is not file data and may not contain any
	   newline at all */
	for (;;) {
		c = parser->buffer[parser->current_position];
		if (c == '\0')
			break;
		goto_next_char (parser);
		if (c == '\n')
			break;
	}

	/* save message position in the message information structure */
	parser->current_message_info.message_position = parser->real_position;

	parser->is_pending_message = TRUE;

	CAMEL_LOG_FULL_DEBUG ("Leaving CamelMboxParser::new_message_detected\n");
}
/**
 * read_header: read the header content contained after the current position.
 * @parser: the parser object.
 * @header_content: a pointer on a (char *) variable to feed with the obtained header string.
 *
 * This routine must be called when the parser has detected a header
 * and it wants the header content to be stored. The parser current position
 * must EXACTLY be located at the beginning of the header content.
 * For example, if the file contains the line :
 * from:Bertrand Guiheneuf <bertrand@helixcode.com>
 *
 * When this routine is called, the parser must be located
 * on the "B" of "Bertrand".
 *
 * A header may be folded over several lines: a line starting with a
 * space or a tab continues the previous one. The newline and the
 * leading white space of a continuation line are replaced by a single
 * space in the result.
 *
 * When this routine returns, the parser is located on the first
 * character of the line following the header, or on the end-of-data
 * marker. *@header_content is set to a newly allocated string; its
 * previous value is neither read nor freed.
 **/
static void
read_header (CamelMboxPreParser *parser, gchar **header_content)
{
	gboolean space = FALSE;    /* skipping the white space that starts a continuation line */
	gboolean newline = FALSE;  /* the previous character was a newline */
	gchar *buffer;
	gchar c;

	CAMEL_LOG_FULL_DEBUG ("Entering CamelMboxParser::read_header\n");
	g_assert (parser);

	/* reset the header buffer string */
	g_string_truncate (parser->tmp_string, 0);

	/* the buffer is refilled in place, its address never changes */
	buffer = parser->buffer;

	for (c = buffer[parser->current_position];
	     c != '\0';
	     goto_next_char (parser), c = buffer[parser->current_position]) {
		/* this test used to be (c == ' ' && c == '\t'), which can never
		   be true, so folded headers were cut at their first line */
		gboolean is_blank = (c == ' ') || (c == '\t');

		if (space) {
			if (is_blank)
				continue;
			space = FALSE;
		}

		if (newline) {
			if (!is_blank) {
				/* first character of the next header or of the
				   body: stop without consuming it */
				break;
			}
			/* continuation line: keep the words apart */
			g_string_append_c (parser->tmp_string, ' ');
			space = TRUE;
			newline = FALSE;
			continue;
		}

		if (c == '\n') {
			newline = TRUE;
			continue;
		}

		/* feed the header content */
		g_string_append_c (parser->tmp_string, c);
	}

	/* copy the buffer in the preparsing information structure */
	*header_content = g_strndup (parser->tmp_string->str, parser->tmp_string->len);

	CAMEL_LOG_FULL_DEBUG ("Leaving CamelMboxParser::read_header\n");
}
/**
 * read_message_begining: read the first characters of a message body
 * @parser: parser object
 * @message_summary: a pointer on a (gchar *) variable where the obtained string will be stored.
 *
 * Read the first lines of a message. When calling this routine, the
 * parser must be located at the beginning of the message body.
 * Reading stops when two newlines have been read, when
 * message_summary_size characters have been stored, or when the end
 * of the data or the beginning of the next message is reached.
 * *@message_summary is set to a newly allocated string.
 *
 * Return value: %TRUE if reading stopped just after a newline that is
 * followed by the end of the data or by the message delimiter,
 * otherwise %FALSE.
 **/
static gboolean
read_message_begining (CamelMboxPreParser *parser, gchar **message_summary)
{
	guint nb_read = 0;
	guint nb_line = 0;
	gboolean new_message = FALSE;
	gchar *buffer;

	g_assert (parser);
	CAMEL_LOG_FULL_DEBUG ("Entering CamelMboxParser::read_message_begining\n");

	/* reset the header buffer string */
	g_string_truncate (parser->tmp_string, 0);

	buffer = parser->buffer;

	/* the message should not be filled character by
	   character but there is no g_string_n_append
	   function, so for the moment, this is a lazy
	   implementation */
	/* the first test used to be negated, so the loop was entered only
	   on the end marker and the summary was always empty */
	while ((buffer[parser->current_position] != '\0') &&
	       (nb_line < 2) && (nb_read < parser->message_summary_size) &&
	       (!new_message)) {

		if (buffer[parser->current_position] != '\n') {
			g_string_append_c (parser->tmp_string,
					   buffer[parser->current_position]);
			nb_read++;
			goto_next_char (parser);
			continue;
		}

		/* end of a line: test if we are at the end of the message */
		nb_line++;
		goto_next_char (parser);

		/* the delimiter is compared exactly, like in the main parsing
		   loop; a case insensitive test would end the summary on any
		   body line that merely starts with the same letters */
		if ((buffer[parser->current_position] == '\0') ||
		    (strncmp (buffer + parser->current_position,
			      parser->message_delimiter,
			      parser->message_delimiter_length) == 0)) {
			new_message = TRUE;
		} else {
			/* we're not at the end, so let's just add the newline to the summary */
			g_string_append_c (parser->tmp_string, '\n');
			nb_read++;
		}
	}

	*message_summary = g_strndup (parser->tmp_string->str, parser->tmp_string->len);

	CAMEL_LOG_FULL_DEBUG ("Leaving CamelMboxParser::read_message_begining\n");

	return new_message;
}
/**
* camel_mbox_parse_file: read an mbox file and parse it.
* @fd: file descriptor opened on the mbox file.
* @message_delimiter: character string delimiting the beginig of a new message
* @start_position: poition in the file where to start the parsing.
* @get_message_summary: should the parser retrieve the begining of the messages
* @status_callback: function to call peridically to indicate the progress of the parser
* @status_interval: floating value between 0 and 1 indicate how often to call @status_callback.
* @user_data: user data that will be passed to the callback function
*
* This routine parses an mbox file and retreives both the message starting positions and
* some of the informations contained in the message. Those informations are mainly
* some RFC822 headers values but also (optionally) the first characters of the mail
* body. The @get_message_summary parameter allows to enable or disable this option.
*
*
* Return value: An array of CamelMboxParserMessageInfo containing the informations on each message parsed in the file
**/
GArray *
camel_mbox_parse_file (int fd,
const gchar *message_delimiter,
glong start_position,
guint32 *file_size,
guint32 *next_uid,
gboolean get_message_summary,
camel_mbox_preparser_status_callback *status_callback,
double status_interval,
gpointer user_data)
{
CamelMboxPreParser *parser;
gboolean is_parsing_a_message = FALSE;
gchar c;
struct stat stat_buf;
gint fstat_result;
glong total_file_size;
int last_status = 0;
int real_interval;
gboolean newline;
GArray *return_value;
gchar *x_ev_header_content;
guint32 next_available_uid = 1;
g_assert (next_uid);
CAMEL_LOG_FULL_DEBUG ("Entering CamelMboxParser::parse_file\n");
/* get file size */
fstat_result = fstat (fd, &stat_buf);
if (fstat_result == -1) {
g_warning ("Manage exception here \n");
}
total_file_size = stat_buf.st_size;
real_interval = status_interval * total_file_size;
/* create the parser */
parser = new_parser (fd, message_delimiter);
/* initialize the temporary char buffer */
initialize_buffer (parser, start_position);
/* the first line is indeed at the begining of a new line ... */
newline = TRUE;
while (parser->buffer[parser->current_position] != '\0') {
/* read the current character */
if (!newline) {
c = parser->buffer[parser->current_position];
newline = (c == '\n');
goto_next_char (parser);
}
if (newline) {
/* check if we reached a status milestone */
if ( status_callback && ((parser->real_position - last_status) > real_interval)) {
last_status += real_interval;
status_callback ((double)last_status / (double)total_file_size,
user_data);
}
/* is the next part a message delimiter ? */
if (strncmp (parser->buffer + parser->current_position,
parser->message_delimiter,
parser->message_delimiter_length) == 0) {
is_parsing_a_message = TRUE;
new_message_detected (parser);
newline = TRUE;
continue;
}
if (is_parsing_a_message) {
/* we could find the headers in a clever way, like
storing them in a list of pair
[keyword, offset_in_CamelMboxParserMessageInfo]
I am too busy for now. Contribution welcome */
/* is the next part a "from" header ? */
if (g_strncasecmp (parser->buffer + parser->current_position,
MBOX_PARSER_FROM_KW,
MBOX_PARSER_FROM_KW_SZ) == 0) {
advance_n_chars (parser, MBOX_PARSER_FROM_KW_SZ);
read_header (parser, (gchar **) ((gchar *)parser +
G_STRUCT_OFFSET (CamelMboxPreParser, current_message_info) +
G_STRUCT_OFFSET (CamelMboxParserMessageInfo, from)));
newline = TRUE;
continue;
}
/* is the next part a "Date" header ? */
if (g_strncasecmp (parser->buffer + parser->current_position,
MBOX_PARSER_DATE_KW,
MBOX_PARSER_DATE_KW_SZ) == 0) {
advance_n_chars (parser, MBOX_PARSER_DATE_KW_SZ);
read_header (parser, (gchar **) ((gchar *)parser +
G_STRUCT_OFFSET (CamelMboxPreParser, current_message_info) +
G_STRUCT_OFFSET (CamelMboxParserMessageInfo, date)));
newline = TRUE;
continue;
}
/* is the next part a "Subject" header ? */
if (g_strncasecmp (parser->buffer + parser->current_position,
MBOX_PARSER_SUBJECT_KW,
MBOX_PARSER_SUBJECT_KW_SZ) == 0) {
advance_n_chars (parser, MBOX_PARSER_SUBJECT_KW_SZ);
read_header (parser, (gchar **) ((gchar *)parser +
G_STRUCT_OFFSET (CamelMboxPreParser, current_message_info) +
G_STRUCT_OFFSET (CamelMboxParserMessageInfo, subject)));
newline = TRUE;
continue;
}
/* is the next part a "To" header ? */
if (g_strncasecmp (parser->buffer + parser->current_position,
MBOX_PARSER_TO_KW,
MBOX_PARSER_TO_KW_SZ) == 0) {
advance_n_chars (parser, MBOX_PARSER_TO_KW_SZ);
read_header (parser, (gchar **) ((gchar *)parser +
G_STRUCT_OFFSET (CamelMboxPreParser, current_message_info) +
G_STRUCT_OFFSET (CamelMboxParserMessageInfo, to)));
newline = TRUE;
continue;
}
/* is the next part a "X-evolution" header ? */
if (g_strncasecmp (parser->buffer + parser->current_position,
MBOX_PARSER_X_EVOLUTION_KW,
MBOX_PARSER_X_EVOLUTION_KW_SZ) == 0) {
/* in the case of the evolution private field, we store
the field position as well as its length because
we will have to change them */
parser->current_message_info.x_evolution_offset = parser->real_position
- parser->current_message_info.message_position;
advance_n_chars (parser, MBOX_PARSER_X_EVOLUTION_KW_SZ);
/* read the header */
read_header (parser, &x_ev_header_content);
/* parse it and put the result in the uid and status fields */
camel_mbox_xev_parse_header_content (x_ev_header_content,
(guint32 *) ((gchar *)parser +
G_STRUCT_OFFSET (CamelMboxPreParser, current_message_info) +
G_STRUCT_OFFSET (CamelMboxParserMessageInfo, uid)),
(guchar *) ((gchar *)parser +
G_STRUCT_OFFSET (CamelMboxPreParser, current_message_info) +
G_STRUCT_OFFSET (CamelMboxParserMessageInfo, status)));
g_free (x_ev_header_content);
next_available_uid = MAX (next_available_uid, parser->current_message_info.uid);
newline = TRUE;
continue;
}
/* is it an empty line ? */
if (parser->buffer[parser->current_position] == '\n') {
parser->current_message_info.end_of_headers_offset =
parser->real_position - parser->current_message_info.message_position;
goto_next_char (parser);
if (get_message_summary)
newline = read_message_begining (parser, (gchar **) ((gchar *)parser +
G_STRUCT_OFFSET (CamelMboxPreParser, current_message_info) +
G_STRUCT_OFFSET (CamelMboxParserMessageInfo, body_summary)));
is_parsing_a_message = FALSE;
continue;
}
}
newline = FALSE;
}
}
/* if there is a pending message information put it in the array */
if (parser->is_pending_message) {
parser->current_message_info.size =
parser->real_position - parser->current_message_info.message_position;
g_array_append_vals (parser->preparsed_messages, (gchar *)parser +
G_STRUCT_OFFSET (CamelMboxPreParser, current_message_info), 1);
}
return_value = parser->preparsed_messages;
*file_size = parser->real_position;
*next_uid = next_available_uid;
/* free the parser */
parser_free (parser);
CAMEL_LOG_FULL_DEBUG ("Entering CamelMboxParser::parse_file\n");
return return_value;
}
#ifdef MBOX_PARSER_TEST
/* to build the test :
gcc -O3 -I/opt/gnome/lib/glib/include `glib-config --cflags` -o test_parser -DMBOX_PARSER_TEST -I ../.. -I ../../.. -I /usr/lib/glib/include camel-mbox-parser.c `glib-config --libs` -lm
*/
#include <math.h>
/* progress callback of the test program: print @done, a fraction
   between 0 and 1, as a whole percentage. @user_data is not used. */
static void
status (double done, gpointer user_data)
{
	int percent = (int) floor (done * 100);

	printf ("%d %% done\n", percent);
}
int
main (int argc, char **argv)
{
int test_file_fd;
int i;
int file_size;
int next_uid;
GArray *message_positions;
CamelMboxParserMessageInfo *message_info;
gchar tmp_buffer[50];
tmp_buffer[49] = '\0';
if (argc<2) {
printf("usage: %s mbox\n", argv[0]);
return 1;
}
test_file_fd = open (argv[1], O_RDONLY);
message_positions = camel_mbox_parse_file (test_file_fd,
"From ",
0,
&file_size,
&next_uid,
TRUE,
status,
0.05,
NULL);
printf ("Found %d messages \n", message_positions->len);
for (i=0; i<message_positions->len; i++) {
message_info = ((CamelMboxParserMessageInfo *)(message_positions->data)) + i;
printf ("\n\n** Message %d : \n", i);
printf ("Size : %d\n", message_info->size);
printf ("From: %s\n", message_info->from);
printf ("Date: %s\n", message_info->date);
printf ("Subject: %s\n", message_info->subject);
printf ("Summary: %s\n", message_info->body_summary) ;
lseek (test_file_fd, message_info->message_position, SEEK_SET);
read (test_file_fd, tmp_buffer, 49);
printf ("File content at position %d : \n===\n%s\n===\n", message_info->message_position, tmp_buffer);
}
return 0;
}
#endif /* MBOX_PARSER_TEST */