/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8; fill-column: 160 -*- */
/* camel-mime-part-utils : Utility for mime parsing and so on
*
* Authors: Bertrand Guiheneuf <bertrand@helixcode.com>
* Michael Zucchi <notzed@ximian.com>
*
* Copyright 1999, 2000 Ximian, Inc. (www.ximian.com)
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation; either version 2 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
* USA
*/
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include "string-utils.h"
#include "camel-mime-part-utils.h"
#include "camel-mime-message.h"
#include "camel-multipart.h"
#include "camel-seekable-substream.h"
#include "camel-stream-fs.h"
#include "camel-stream-filter.h"
#include "camel-stream-mem.h"
#include "camel-mime-filter-basic.h"
#include "camel-mime-filter-charset.h"
#include "camel-mime-filter-crlf.h"
#include "camel-html-parser.h"
#include "camel-charset-map.h"
/* Debug trace hook.  As defined here the replacement text is only a comment, so
   d(x) expands to nothing and its argument is never compiled.  To get traces
   prefixed with file and line, replace the expansion with the commented-out one. */
#define d(x) /*(printf("%s(%d): ", __FILE__, __LINE__),(x))*/
/*
 * check_html_charset:
 * @mp: mime parser to peek into
 * @enctype: basic filter type used to decode the body before scanning it,
 *           or 0 if the raw bytes can be scanned directly
 *
 * Looks at up to the first 2048 bytes readable from @mp for a tag of the form
 *
 *   <META http-equiv="Content-Type" content="text/html; charset=ISO-8859-1">
 *
 * and extracts the charset parameter from its content attribute.  The parser
 * is seeked back to the offset it had on entry before returning, whether or
 * not a charset was found.
 *
 * Returns: the value of camel_charset_get_iconv_friendly_name() for the
 * declared charset, or NULL if no charset was found or nothing could be read.
 */
static const char *
check_html_charset (CamelMimeParser *mp, CamelMimeFilterBasicType enctype)
{
	const char *buf;
	off_t offset;
	int length;
	CamelHTMLParser *hp;
	const char *charset = NULL;
	camel_html_parser_t state;
	struct _header_content_type *ct;
	CamelMimeFilterBasic *fdec = NULL;

	/* if we can't find the charset within the first 2k, we ain't gonna find it */
	offset = camel_mime_parser_tell (mp);
	length = camel_mime_parser_read (mp, &buf, 2048);

	d(printf("Checking html for meta content-type: '%.*s'", length, buf));

	/* Nothing usable to scan.  A negative count (presumably a read error --
	   confirm against camel_mime_parser_read) is treated like an empty read
	   so it is never passed on as a buffer length below. */
	if (length <= 0) {
		camel_mime_parser_seek (mp, offset, SEEK_SET);
		return NULL;
	}

	/* if we need to first base64/qp decode, do this here, sigh */
	hp = camel_html_parser_new ();
	if (enctype != 0) {
		int dummy, len;
		char *buffer;

		/* fdec is only released after the scan loop; NOTE(review): the decoded
		   buffer presumably belongs to the filter -- confirm */
		fdec = camel_mime_filter_basic_new_type (enctype);
		camel_mime_filter_filter ((CamelMimeFilter *)fdec, (char *)buf, length, 0, &buffer, &len, &dummy);
		camel_html_parser_set_data (hp, buffer, len, TRUE);
	} else {
		camel_html_parser_set_data (hp, buf, length, TRUE);
	}

	/* step through the html until a charset turns up or the data runs out */
	do {
		const char *data;
		int len;
		const char *val;

		state = camel_html_parser_step (hp, &data, &len);

		switch (state) {
		case CAMEL_HTML_PARSER_ELEMENT:
			val = camel_html_parser_tag (hp);
			d(printf("Got tag: %s\n", val));
			/* only a <meta http-equiv="content-type" content="..."> whose
			   content parses as a content-type header is of interest */
			if (g_strcasecmp (val, "meta") == 0
			    && (val = camel_html_parser_attr (hp, "http-equiv"))
			    && g_strcasecmp (val, "content-type") == 0
			    && (val = camel_html_parser_attr (hp, "content"))
			    && (ct = header_content_type_decode (val))) {
				charset = header_content_type_param (ct, "charset");
				/* map to the iconv name before ct (which owns the param) is dropped */
				if (charset)
					charset = camel_charset_get_iconv_friendly_name (charset);
				header_content_type_unref (ct);
			}
			break;
		default:
			/* ignore everything else */
			break;
		}
	} while (charset == NULL && state != CAMEL_HTML_PARSER_EOF);

	camel_object_unref ((CamelObject *)hp);
	if (fdec)
		camel_object_unref ((CamelObject *)fdec);

	/* put the parser back where we found it */
	camel_mime_parser_seek (mp, offset, SEEK_SET);

	return charset;
}
/*
 * simple_data_wrapper_construct_from_parser:
 * @dw: data wrapper whose content stream is to be constructed
 * @mp: mime parser to read the body from
 *
 * Simple data wrapper: reads a body from @mp and constructs @dw from it.
 *
 * Filters are installed on the parser according to the headers:
 *  - a base64 or quoted-printable decoder, chosen from content-transfer-encoding;
 *  - for text/ types, a CRLF decode filter (only when a decoder was installed);
 *  - for text/ types, a charset -> UTF-8 filter unless the charset is us-ascii
 *    or utf-8.  For text/html without a charset parameter, the charset is
 *    taken from check_html_charset().
 *
 * The filtered body is accumulated in memory.  If the accumulated data exceeds
 * 20480 bytes and the parser's stream is seekable, the in-memory copy is
 * dropped, the parser filters are removed, and @dw is instead constructed from
 * a substream of the source covering the body, with the same filters (reset)
 * attached through a stream filter.
 */
static void
simple_data_wrapper_construct_from_parser (CamelDataWrapper *dw, CamelMimeParser *mp)
{
	CamelMimeFilter *fdec = NULL, *fcrlf = NULL, *fch = NULL;
	int len, decid = -1, crlfid = -1, chrid = -1;
	struct _header_content_type *ct;
	CamelSeekableStream *seekable_source = NULL;
	CamelStream *source;
	GByteArray *buffer;
	off_t start = 0, end;
	char *encoding, *buf;
	CamelMimeFilterBasicType enctype = 0;

	d(printf("constructing data-wrapper\n"));

	/* Ok, try and be smart.  If we're storing a small message (typical) convert it,
	   and store it in memory as we parse it ... if not, throw away the conversion
	   and scan till the end ... */

	/* if we can't seek, dont have a stream/etc, then we must cache it */
	source = camel_mime_parser_stream (mp);
	if (source) {
		/* hold our own reference for the duration; dropped at the end */
		camel_object_ref ((CamelObject *)source);
		if (CAMEL_IS_SEEKABLE_STREAM (source)) {
			seekable_source = CAMEL_SEEKABLE_STREAM (source);
		}
	}

	/* first, work out conversion, if any, required, we dont care about what we dont know about */
	encoding = header_content_encoding_decode (camel_mime_parser_header (mp, "content-transfer-encoding", NULL));
	if (encoding) {
		if (!g_strcasecmp (encoding, "base64")) {
			d(printf("Adding base64 decoder ...\n"));
			enctype = CAMEL_MIME_FILTER_BASIC_BASE64_DEC;
		} else if (!g_strcasecmp (encoding, "quoted-printable")) {
			d(printf("Adding quoted-printable decoder ...\n"));
			enctype = CAMEL_MIME_FILTER_BASIC_QP_DEC;
		}
		g_free (encoding);

		if (enctype != 0) {
			fdec = (CamelMimeFilter *)camel_mime_filter_basic_new_type (enctype);
			decid = camel_mime_parser_filter_add (mp, fdec);
		}
	}

	/* If we're doing text, we also need to do CRLF->LF and may have to convert it to UTF8 as well. */
	ct = camel_mime_parser_content_type (mp);
	if (header_content_type_is (ct, "text", "*")) {
		const char *charset = header_content_type_param (ct, "charset");

		/* CRLF decoding is only added on top of a transfer decoder */
		if (fdec) {
			d(printf("Adding CRLF conversion filter\n"));
			fcrlf = (CamelMimeFilter *)camel_mime_filter_crlf_new (CAMEL_MIME_FILTER_CRLF_DECODE,
									       CAMEL_MIME_FILTER_CRLF_MODE_CRLF_ONLY);
			crlfid = camel_mime_parser_filter_add (mp, fcrlf);
		}

		/* Possible Lame Mailer Alert... check the META tags for a charset */
		if (!charset && header_content_type_is (ct, "text", "html"))
			charset = check_html_charset (mp, enctype);

		/* if the charset is not us-ascii or utf-8, then we need to convert to utf-8 */
		if (charset && !(g_strcasecmp (charset, "us-ascii") == 0 || g_strcasecmp (charset, "utf-8") == 0)) {
			d(printf("Adding conversion filter from %s to UTF-8\n", charset));
			fch = (CamelMimeFilter *)camel_mime_filter_charset_new_convert (charset, "UTF-8");
			if (fch) {
				chrid = camel_mime_parser_filter_add (mp, (CamelMimeFilter *)fch);
			} else {
				g_warning ("Cannot convert '%s' to 'UTF-8', message display may be corrupt", charset);
			}
		}
	}

	buffer = g_byte_array_new ();

	/* remember where the body starts in the source, in case we end up
	   referencing it in place instead of caching it */
	if (seekable_source /* !cache */) {
		start = camel_mime_parser_tell (mp) + seekable_source->bound_start;
	}

	while (camel_mime_parser_step (mp, &buf, &len) != HSCAN_BODY_END) {
		d(printf("appending o/p data: %d: %.*s\n", len, len, buf));
		if (buffer) {
			if (buffer->len > 20480 && seekable_source) {
				/* is this a 'big' message?  Yes?  We dont want to convert it all then.
				   Drop every conversion filter from the parser (the CRLF one too, so
				   nothing keeps converting data we are about to ignore) and just scan
				   to the end of the body. */
				camel_mime_parser_filter_remove (mp, decid);
				camel_mime_parser_filter_remove (mp, crlfid);
				camel_mime_parser_filter_remove (mp, chrid);
				decid = -1;
				crlfid = -1;
				chrid = -1;
				g_byte_array_free (buffer, TRUE);
				buffer = NULL;
			} else {
				g_byte_array_append (buffer, buf, len);
			}
		}
	}

	if (buffer) {
		CamelStream *mem;

		d(printf("Small message part, kept in memory!\n"));
		mem = camel_stream_mem_new_with_byte_array (buffer);
		camel_data_wrapper_construct_from_stream (dw, mem);
		camel_object_unref ((CamelObject *)mem);
	} else {
		CamelStream *sub;
		CamelStreamFilter *filter;

		/* buffer is only ever dropped when seekable_source is set, so it is
		   safe to dereference here */
		d(printf("Big message part, left on disk ...\n"));
		end = camel_mime_parser_tell (mp) + seekable_source->bound_start;
		sub = camel_seekable_substream_new_with_seekable_stream_and_bounds (seekable_source, start, end);
		if (fdec || fch) {
			/* re-use the same filter objects on the stream, in the same
			   order, resetting whatever state they built up on the parser */
			filter = camel_stream_filter_new_with_stream (sub);
			if (fdec) {
				camel_mime_filter_reset (fdec);
				camel_stream_filter_add (filter, fdec);
			}
			if (fcrlf) {
				camel_mime_filter_reset (fcrlf);
				camel_stream_filter_add (filter, fcrlf);
			}
			if (fch) {
				camel_mime_filter_reset (fch);
				camel_stream_filter_add (filter, fch);
			}
			camel_data_wrapper_construct_from_stream (dw, (CamelStream *)filter);
			camel_object_unref ((CamelObject *)filter);
		} else {
			camel_data_wrapper_construct_from_stream (dw, sub);
		}
		camel_object_unref ((CamelObject *)sub);
	}

	/* ids that were never assigned, or were already removed, are -1 here */
	camel_mime_parser_filter_remove (mp, decid);
	camel_mime_parser_filter_remove (mp, crlfid);
	camel_mime_parser_filter_remove (mp, chrid);

	if (fdec)
		camel_object_unref ((CamelObject *)fdec);
	if (fcrlf)
		camel_object_unref ((CamelObject *)fcrlf);
	if (fch)
		camel_object_unref ((CamelObject *)fch);
	if (source)
		camel_object_unref ((CamelObject *)source);
}
/* This replaces the data wrapper repository ... and/or could be replaced by it? */
void
camel_mime_part_construct_content_from_parser (CamelMimePart *dw, CamelMimeParser *mp)
{
CamelDataWrapper *content = NULL;
char *buf;
int len;
switch (camel_mime_parser_state (mp)) {
case HSCAN_HEADER:
d(printf("Creating body part\n"));
content = camel_data_wrapper_new ();
simple_data_wrapper_construct_from_parser (content, mp);
break;
case HSCAN_MESSAGE:
d(printf("Creating message part\n"));
content = (CamelDataWrapper *) camel_mime_message_new ();
camel_mime_part_construct_from_parser ((CamelMimePart *)content, mp);
break;
case HSCAN_MULTIPART: {
CamelDataWrapper *bodypart;
#ifndef NO_WARNINGS
#warning This should use a camel-mime-multipart
#endif
d(printf("Creating multi-part\n"));
content = (CamelDataWrapper *)camel_multipart_new ();
/* FIXME: use the real boundary? */
camel_multipart_set_boundary ((CamelMultipart *)content, NULL);
while (camel_mime_parser_step (mp, &buf, &len) != HSCAN_MULTIPART_END) {
camel_mime_parser_unstep (mp);
bodypart = (CamelDataWrapper *)camel_mime_part_new ();
camel_mime_part_construct_from_parser ((CamelMimePart *)bodypart, mp);
camel_multipart_add_part ((CamelMultipart *)content, (CamelMimePart *)bodypart);
camel_object_unref ((CamelObject *)bodypart);
}
/* these are only return valid data in the MULTIPART_END state */
camel_multipart_set_preface ((CamelMultipart *)content, camel_mime_parser_preface (mp));
camel_multipart_set_postface ((CamelMultipart *)content, camel_mime_parser_postface (mp));
d(printf("Created multi-part\n"));
break; }
default:
g_warning("Invalid state encountered???: %d", camel_mime_parser_state (mp));
}
if (content) {
#ifndef NO_WARNINGS
#warning there just has got to be a better way ... to transfer the mime-type to the datawrapper
#endif
/* would you believe you have to set this BEFORE you set the content object??? oh my god !!!! */
camel_data_wrapper_set_mime_type_field (content,
camel_mime_part_get_content_type ((CamelMimePart *)dw));
camel_medium_set_content_object ((CamelMedium *)dw, content);
camel_object_unref ((CamelObject *)content);
}
}