From f6408daa103092f18789a719a4123224b259f71f Mon Sep 17 00:00:00 2001 From: Jeffrey Stedfast Date: Tue, 18 Dec 2001 01:28:27 +0000 Subject: New function to map ISO charsets to the Windows charsets. 2001-12-17 Jeffrey Stedfast * camel-charset-map.c (camel_charset_iso_to_windows): New function to map ISO charsets to the Windows charsets. * camel-mime-part-utils.c (broken_windows_charset): Detect Windows charsets. (simple_data_wrapper_construct_from_parser): Simplify a tad and also check for iso-8859-* charsets that are really Windows charsets. Fixes bug #12631. svn path=/trunk/; revision=15144 --- camel/camel-mime-part-utils.c | 75 ++++++++++++++++++++++++++++--------------- 1 file changed, 49 insertions(+), 26 deletions(-) (limited to 'camel/camel-mime-part-utils.c') diff --git a/camel/camel-mime-part-utils.c b/camel/camel-mime-part-utils.c index 65c99c6dc8..08787df2cd 100644 --- a/camel/camel-mime-part-utils.c +++ b/camel/camel-mime-part-utils.c @@ -155,6 +155,28 @@ convert_buffer (GByteArray *in, const char *to, const char *from) return out; } +/* We don't really use the charset argument except for debugging... */ +static gboolean +broken_windows_charset (GByteArray *buffer, const char *charset) +{ + register unsigned char *inptr; + unsigned char *inend; + + inptr = buffer->data; + inend = inptr + buffer->len; + + while (inptr < inend) { + register unsigned char c = *inptr++; + + if (c >= 128 && c <= 159) { + g_warning ("Encountered Windows charset parading as %s", charset); + return TRUE; + } + } + + return FALSE; +} + static gboolean is_7bit (GByteArray *buffer) { @@ -172,33 +194,24 @@ static void simple_data_wrapper_construct_from_parser (CamelDataWrapper *dw, CamelMimeParser *mp) { CamelMimeFilter *fdec = NULL, *fcrlf = NULL; + CamelMimeFilterBasicType enctype; int len, decid = -1, crlfid = -1; struct _header_content_type *ct; + const char *charset = NULL; GByteArray *buffer; char *encoding, *buf; - const char *charset = NULL; - CamelMimeFilterBasicType enctype = 0; CamelStream *mem; - - d(printf("constructing data-wrapper\n")); + + d(printf ("simple_data_wrapper_construct_from_parser()\n")); /* first, work out conversion, if any, required, we dont care about what we dont know about */ - encoding = header_content_encoding_decode(camel_mime_parser_header(mp, "content-transfer-encoding", NULL)); + encoding = header_content_encoding_decode (camel_mime_parser_header (mp, "Content-Transfer-Encoding", NULL)); if (encoding) { - if (!strcasecmp(encoding, "base64")) { - d(printf("Adding base64 decoder ...\n")); - enctype = CAMEL_MIME_FILTER_BASIC_BASE64_DEC; - } else if (!strcasecmp(encoding, "quoted-printable")) { - d(printf("Adding quoted-printable decoder ...\n")); - enctype = CAMEL_MIME_FILTER_BASIC_QP_DEC; - } else if (!strcasecmp (encoding, "x-uuencode")) { - d(printf("Adding uudecoder ...\n")); - enctype = CAMEL_MIME_FILTER_BASIC_UU_DEC; - } + enctype = camel_mime_part_encoding_from_string (encoding); g_free (encoding); - if (enctype != 0) { - fdec = (CamelMimeFilter *)camel_mime_filter_basic_new_type(enctype); + if (enctype != CAMEL_MIME_PART_ENCODING_DEFAULT) { + fdec = (CamelMimeFilter *) camel_mime_filter_basic_new_type (enctype); decid = camel_mime_parser_filter_add (mp, fdec); } } @@ -229,21 +242,32 @@ simple_data_wrapper_construct_from_parser (CamelDataWrapper *dw, CamelMimeParser charset = check_html_charset(buffer->data, buffer->len); /* if we need to do charset conversion, see if we can/it works/etc */ - if (charset && !(strcasecmp(charset, "us-ascii") == 0 - || strcasecmp(charset, "utf-8") == 0 - || strncasecmp(charset, "x-", 2) == 0)) { + if (charset && !(strcasecmp (charset, "us-ascii") == 0 + || strcasecmp (charset, "utf-8") == 0 + || strncasecmp (charset, "x-", 2) == 0)) { GByteArray *out; - out = convert_buffer(buffer, "UTF-8", charset); + /* You often see Microsoft Windows users announcing their texts + * as being in ISO-8859-1 even when in fact they contain funny + * characters from the Windows-CP1252 superset. + */ + if (!strncasecmp (charset, "iso-8859", 8)) { + /* check for Windows-specific chars... */ + if (broken_windows_charset (buffer, charset)) { + charset = camel_charset_iso_to_windows (charset); + charset = e_iconv_charset_name (charset); + } + } + + out = convert_buffer (buffer, "UTF-8", charset); if (out) { /* converted ok, use this data instead */ g_byte_array_free(buffer, TRUE); buffer = out; } else { - g_warning("Storing text as raw, unknown charset '%s' or invalid format", charset); /* else failed to convert, leave as raw? */ + g_warning("Storing text as raw, unknown charset '%s' or invalid format", charset); dw->rawtext = TRUE; - /* should we change the content-type header? */ } } else if (header_content_type_is (ct, "text", "*")) { if (charset == NULL) { @@ -258,10 +282,9 @@ simple_data_wrapper_construct_from_parser (CamelDataWrapper *dw, CamelMimeParser dw->rawtext = !g_utf8_validate (buffer->data, buffer->len, NULL); } } - - + d(printf("message part kept in memory!\n")); - + mem = camel_stream_mem_new_with_byte_array(buffer); camel_data_wrapper_construct_from_stream(dw, mem); camel_object_unref((CamelObject *)mem); -- cgit v1.2.3