From f6408daa103092f18789a719a4123224b259f71f Mon Sep 17 00:00:00 2001 From: Jeffrey Stedfast Date: Tue, 18 Dec 2001 01:28:27 +0000 Subject: New function to map ISO charsets to the Windows charsets. 2001-12-17 Jeffrey Stedfast * camel-charset-map.c (camel_charset_iso_to_windows): New function to map ISO charsets to the Windows charsets. * camel-mime-part-utils.c (broken_windows_charset): Detect Windows charsets. (simple_data_wrapper_construct_from_parser): Simplify a tad and also check for iso-8859-* charsets that are really Windows charsets. Fixes bug #12631. svn path=/trunk/; revision=15144 --- camel/ChangeLog | 26 ++++++++++++--- camel/camel-charset-map.c | 55 +++++++++++++++++++++++++++++++ camel/camel-charset-map.h | 2 ++ camel/camel-mime-part-utils.c | 75 ++++++++++++++++++++++++++++--------------- 4 files changed, 127 insertions(+), 31 deletions(-) diff --git a/camel/ChangeLog b/camel/ChangeLog index cb8a9eaf6a..b82266ff9e 100644 --- a/camel/ChangeLog +++ b/camel/ChangeLog @@ -1,3 +1,14 @@ +2001-12-17 Jeffrey Stedfast + + * camel-charset-map.c (camel_charset_iso_to_windows): New function + to map ISO charsets to the Windows charsets. + + * camel-mime-part-utils.c (broken_windows_charset): Detect Windows + charsets. + (simple_data_wrapper_construct_from_parser): Simplify a tad and + also check for iso-8859-* charsets that are really Windows + charsets. Fixes bug #12631. + 2001-12-17 Dan Winship * Makefile.am (INCLUDES): define CAMEL_PROVIDERDIR to be the @@ -7,11 +18,16 @@ * providers/imap/Makefile.am (camel_provider_LTLIBRARIES, camel_provider_DATA): renamed from provider_LTLIBRARIES, - provider_DATA. - * providers/local/Makefile.am: Likewise - * providers/nntp/Makefile.am: Likewise - * providers/pop3/Makefile.am: Likewise - * providers/sendmail/Makefile.am: Likewise + provider_DATA. + + * providers/local/Makefile.am: Likewise + + * providers/nntp/Makefile.am: Likewise + + * providers/pop3/Makefile.am: Likewise + + * providers/sendmail/Makefile.am: Likewise + * providers/smtp/Makefile.am: Likewise 2001-12-16 Jeffrey Stedfast diff --git a/camel/camel-charset-map.c b/camel/camel-charset-map.c index 17962d74be..2416dd2504 100644 --- a/camel/camel-charset-map.c +++ b/camel/camel-charset-map.c @@ -292,5 +292,60 @@ camel_charset_best (const char *in, int len) return camel_charset_best_name (&charset); } + +/** + * camel_charset_iso_to_windows: + * @isocharset: an ISO charset + * + * Returns the equivalent Windows charset. + **/ +const char * +camel_charset_iso_to_windows (const char *isocharset) +{ + /* According to http://czyborra.com/charsets/codepages.html, + * the charset mapping is as follows: + * + * iso-8859-1 maps to windows-cp1252 + * iso-8859-2 maps to windows-cp1250 + * iso-8859-3 maps to windows-cp???? + * iso-8859-4 maps to windows-cp???? + * iso-8859-5 maps to windows-cp1251 + * iso-8859-6 maps to windows-cp1256 + * iso-8859-7 maps to windows-cp1253 + * iso-8859-8 maps to windows-cp1255 + * iso-8859-9 maps to windows-cp1254 + * iso-8859-10 maps to windows-cp???? + * iso-8859-11 maps to windows-cp???? + * iso-8859-12 maps to windows-cp???? + * iso-8859-13 maps to windows-cp1257 + * + * Assumptions: + * - I'm going to assume that since iso-8859-4 and + * iso-8859-13 are Baltic that it also maps to + * windows-cp1257. + */ + + if (!strcasecmp (isocharset, "iso-8859-1")) + return "windows-cp1252"; + else if (!strcasecmp (isocharset, "iso-8859-2")) + return "windows-cp1250"; + else if (!strcasecmp (isocharset, "iso-8859-4")) + return "windows-cp1257"; + else if (!strcasecmp (isocharset, "iso-8859-5")) + return "windows-cp1251"; + else if (!strcasecmp (isocharset, "iso-8859-6")) + return "windows-cp1256"; + else if (!strcasecmp (isocharset, "iso-8859-7")) + return "windows-cp1253"; + else if (!strcasecmp (isocharset, "iso-8859-8")) + return "windows-cp1255"; + else if (!strcasecmp (isocharset, "iso-8859-9")) + return "windows-cp1254"; + else if (!strcasecmp (isocharset, "iso-8859-13")) + return "windows-cp1257"; + + return isocharset; +} + #endif /* !BUILD_MAP */ diff --git a/camel/camel-charset-map.h b/camel/camel-charset-map.h index 7c7022c0a1..0cae1916a6 100644 --- a/camel/camel-charset-map.h +++ b/camel/camel-charset-map.h @@ -37,4 +37,6 @@ const char *camel_charset_best_name(CamelCharset *); /* helper function */ const char *camel_charset_best(const char *in, int len); +const char *camel_charset_iso_to_windows (const char *isocharset); + #endif /* ! _CAMEL_CHARSET_MAP_H */ diff --git a/camel/camel-mime-part-utils.c b/camel/camel-mime-part-utils.c index 65c99c6dc8..08787df2cd 100644 --- a/camel/camel-mime-part-utils.c +++ b/camel/camel-mime-part-utils.c @@ -155,6 +155,28 @@ convert_buffer (GByteArray *in, const char *to, const char *from) return out; } +/* We don't really use the charset argument except for debugging... */ +static gboolean +broken_windows_charset (GByteArray *buffer, const char *charset) +{ + register unsigned char *inptr; + unsigned char *inend; + + inptr = buffer->data; + inend = inptr + buffer->len; + + while (inptr < inend) { + register unsigned char c = *inptr++; + + if (c >= 128 && c <= 159) { + g_warning ("Encountered Windows charset parading as %s", charset); + return TRUE; + } + } + + return FALSE; +} + static gboolean is_7bit (GByteArray *buffer) { @@ -172,33 +194,24 @@ static void simple_data_wrapper_construct_from_parser (CamelDataWrapper *dw, CamelMimeParser *mp) { CamelMimeFilter *fdec = NULL, *fcrlf = NULL; + CamelMimeFilterBasicType enctype; int len, decid = -1, crlfid = -1; struct _header_content_type *ct; + const char *charset = NULL; GByteArray *buffer; char *encoding, *buf; - const char *charset = NULL; - CamelMimeFilterBasicType enctype = 0; CamelStream *mem; - - d(printf("constructing data-wrapper\n")); + + d(printf ("simple_data_wrapper_construct_from_parser()\n")); /* first, work out conversion, if any, required, we dont care about what we dont know about */ - encoding = header_content_encoding_decode(camel_mime_parser_header(mp, "content-transfer-encoding", NULL)); + encoding = header_content_encoding_decode (camel_mime_parser_header (mp, "Content-Transfer-Encoding", NULL)); if (encoding) { - if (!strcasecmp(encoding, "base64")) { - d(printf("Adding base64 decoder ...\n")); - enctype = CAMEL_MIME_FILTER_BASIC_BASE64_DEC; - } else if (!strcasecmp(encoding, "quoted-printable")) { - d(printf("Adding quoted-printable decoder ...\n")); - enctype = CAMEL_MIME_FILTER_BASIC_QP_DEC; - } else if (!strcasecmp (encoding, "x-uuencode")) { - d(printf("Adding uudecoder ...\n")); - enctype = CAMEL_MIME_FILTER_BASIC_UU_DEC; - } + enctype = camel_mime_part_encoding_from_string (encoding); g_free (encoding); - if (enctype != 0) { - fdec = (CamelMimeFilter *)camel_mime_filter_basic_new_type(enctype); + if (enctype != CAMEL_MIME_PART_ENCODING_DEFAULT) { + fdec = (CamelMimeFilter *) camel_mime_filter_basic_new_type (enctype); decid = camel_mime_parser_filter_add (mp, fdec); } } @@ -229,21 +242,32 @@ simple_data_wrapper_construct_from_parser (CamelDataWrapper *dw, CamelMimeParser charset = check_html_charset(buffer->data, buffer->len); /* if we need to do charset conversion, see if we can/it works/etc */ - if (charset && !(strcasecmp(charset, "us-ascii") == 0 - || strcasecmp(charset, "utf-8") == 0 - || strncasecmp(charset, "x-", 2) == 0)) { + if (charset && !(strcasecmp (charset, "us-ascii") == 0 + || strcasecmp (charset, "utf-8") == 0 + || strncasecmp (charset, "x-", 2) == 0)) { GByteArray *out; - out = convert_buffer(buffer, "UTF-8", charset); + /* You often see Microsoft Windows users announcing their texts + * as being in ISO-8859-1 even when in fact they contain funny + * characters from the Windows-CP1252 superset. + */ + if (!strncasecmp (charset, "iso-8859", 8)) { + /* check for Windows-specific chars... */ + if (broken_windows_charset (buffer, charset)) { + charset = camel_charset_iso_to_windows (charset); + charset = e_iconv_charset_name (charset); + } + } + + out = convert_buffer (buffer, "UTF-8", charset); if (out) { /* converted ok, use this data instead */ g_byte_array_free(buffer, TRUE); buffer = out; } else { - g_warning("Storing text as raw, unknown charset '%s' or invalid format", charset); /* else failed to convert, leave as raw? */ + g_warning("Storing text as raw, unknown charset '%s' or invalid format", charset); dw->rawtext = TRUE; - /* should we change the content-type header? */ } } else if (header_content_type_is (ct, "text", "*")) { if (charset == NULL) { @@ -258,10 +282,9 @@ simple_data_wrapper_construct_from_parser (CamelDataWrapper *dw, CamelMimeParser dw->rawtext = !g_utf8_validate (buffer->data, buffer->len, NULL); } } - - + d(printf("message part kept in memory!\n")); - + mem = camel_stream_mem_new_with_byte_array(buffer); camel_data_wrapper_construct_from_stream(dw, mem); camel_object_unref((CamelObject *)mem); -- cgit v1.2.3