From f6408daa103092f18789a719a4123224b259f71f Mon Sep 17 00:00:00 2001
From: Jeffrey Stedfast <fejj@ximian.com>
Date: Tue, 18 Dec 2001 01:28:27 +0000
Subject: New function to map ISO charsets to the Windows charsets.

2001-12-17  Jeffrey Stedfast  <fejj@ximian.com>

	* camel-charset-map.c (camel_charset_iso_to_windows): New function
	to map ISO charsets to the Windows charsets.

	* camel-mime-part-utils.c (broken_windows_charset): Detect Windows
	charsets.
	(simple_data_wrapper_construct_from_parser): Simplify a tad and
	also check for iso-8859-* charsets that are really Windows
	charsets. Fixes bug #12631.

svn path=/trunk/; revision=15144
---
 camel/camel-mime-part-utils.c | 75 ++++++++++++++++++++++++++++---------------
 1 file changed, 49 insertions(+), 26 deletions(-)

(limited to 'camel/camel-mime-part-utils.c')

diff --git a/camel/camel-mime-part-utils.c b/camel/camel-mime-part-utils.c
index 65c99c6dc8..08787df2cd 100644
--- a/camel/camel-mime-part-utils.c
+++ b/camel/camel-mime-part-utils.c
@@ -155,6 +155,28 @@ convert_buffer (GByteArray *in, const char *to, const char *from)
 	return out;
 }
 
+/* Return TRUE (after logging a g_warning that names charset) if any byte of buffer lies in 128..159; FALSE otherwise, including for an empty buffer. charset is only used in that warning text. */
+static gboolean
+broken_windows_charset (GByteArray *buffer, const char *charset)
+{
+	register unsigned char *inptr;
+	unsigned char *inend;
+	
+	inptr = buffer->data;
+	inend = inptr + buffer->len;
+	
+	while (inptr < inend) {
+		register unsigned char c = *inptr++;
+		
+		if (c >= 128 && c <= 159) { /* 0x80-0x9F: the byte range the caller treats as Windows-specific characters */
+			g_warning ("Encountered Windows charset parading as %s", charset);
+			return TRUE;
+		}
+	}
+	
+	return FALSE;
+}
+
 static gboolean
 is_7bit (GByteArray *buffer)
 {
@@ -172,33 +194,24 @@ static void
 simple_data_wrapper_construct_from_parser (CamelDataWrapper *dw, CamelMimeParser *mp)
 {
 	CamelMimeFilter *fdec = NULL, *fcrlf = NULL;
+	CamelMimeFilterBasicType enctype;
 	int len, decid = -1, crlfid = -1;
 	struct _header_content_type *ct;
+	const char *charset = NULL;
 	GByteArray *buffer;
 	char *encoding, *buf;
-	const char *charset = NULL;
-	CamelMimeFilterBasicType enctype = 0;
 	CamelStream *mem;
-
-	d(printf("constructing data-wrapper\n"));
+	
+	d(printf ("simple_data_wrapper_construct_from_parser()\n"));
 	
 	/* first, work out conversion, if any, required, we dont care about what we dont know about */
-	encoding = header_content_encoding_decode(camel_mime_parser_header(mp, "content-transfer-encoding", NULL));
+	encoding = header_content_encoding_decode (camel_mime_parser_header (mp, "Content-Transfer-Encoding", NULL));
 	if (encoding) {
-		if (!strcasecmp(encoding, "base64")) {
-			d(printf("Adding base64 decoder ...\n"));
-			enctype = CAMEL_MIME_FILTER_BASIC_BASE64_DEC;
-		} else if (!strcasecmp(encoding, "quoted-printable")) {
-			d(printf("Adding quoted-printable decoder ...\n"));
-			enctype = CAMEL_MIME_FILTER_BASIC_QP_DEC;
-		} else if (!strcasecmp (encoding, "x-uuencode")) {
-			d(printf("Adding uudecoder ...\n"));
-			enctype = CAMEL_MIME_FILTER_BASIC_UU_DEC;
-		}
+		enctype = camel_mime_part_encoding_from_string (encoding);
 		g_free (encoding);
 		
-		if (enctype != 0) {
-			fdec = (CamelMimeFilter *)camel_mime_filter_basic_new_type(enctype);
+		if (enctype != CAMEL_MIME_PART_ENCODING_DEFAULT) {
+			fdec = (CamelMimeFilter *) camel_mime_filter_basic_new_type (enctype);
 			decid = camel_mime_parser_filter_add (mp, fdec);
 		}
 	}
@@ -229,21 +242,32 @@ simple_data_wrapper_construct_from_parser (CamelDataWrapper *dw, CamelMimeParser
 		charset = check_html_charset(buffer->data, buffer->len);
 	
 	/* if we need to do charset conversion, see if we can/it works/etc */
-	if (charset && !(strcasecmp(charset, "us-ascii") == 0
-			 || strcasecmp(charset, "utf-8") == 0
-			 || strncasecmp(charset, "x-", 2) == 0)) {
+	if (charset && !(strcasecmp (charset, "us-ascii") == 0
+			 || strcasecmp (charset, "utf-8") == 0
+			 || strncasecmp (charset, "x-", 2) == 0)) {
 		GByteArray *out;
 		
-		out = convert_buffer(buffer, "UTF-8", charset);
+		/* You often see Microsoft Windows users announcing their texts
+		 * as being in ISO-8859-1 even when in fact they contain funny
+		 * characters from the Windows-CP1252 superset.
+		 */
+		if (!strncasecmp (charset, "iso-8859", 8)) {
+			/* check for Windows-specific chars... */
+			if (broken_windows_charset (buffer, charset)) {
+				charset = camel_charset_iso_to_windows (charset);
+				charset = e_iconv_charset_name (charset);
+			}
+		}
+		
+		out = convert_buffer (buffer, "UTF-8", charset);
 		if (out) {
 			/* converted ok, use this data instead */
 			g_byte_array_free(buffer, TRUE);
 			buffer = out;
 		} else {
-			g_warning("Storing text as raw, unknown charset '%s' or invalid format", charset);
 			/* else failed to convert, leave as raw? */
+			g_warning("Storing text as raw, unknown charset '%s' or invalid format", charset);
 			dw->rawtext = TRUE;
-			/* should we change the content-type header? */
 		}
 	} else if (header_content_type_is (ct, "text", "*")) {
 		if (charset == NULL) {
@@ -258,10 +282,9 @@ simple_data_wrapper_construct_from_parser (CamelDataWrapper *dw, CamelMimeParser
 			dw->rawtext = !g_utf8_validate (buffer->data, buffer->len, NULL);
 		}
 	}
-			
-
+	
 	d(printf("message part kept in memory!\n"));
-		
+	
 	mem = camel_stream_mem_new_with_byte_array(buffer);
 	camel_data_wrapper_construct_from_stream(dw, mem);
 	camel_object_unref((CamelObject *)mem);
-- 
cgit v1.2.3