From 1a6fb9ab8807f8ad8c8f79ba1cc4fa7a015c08d6 Mon Sep 17 00:00:00 2001
From: Not Zed <NotZed@Ximian.com>
Date: Tue, 1 Jun 2004 10:07:13 +0000
Subject: ** A few fixes for better rfc compliance, and cleaner code.

2004-06-01  Not Zed  <NotZed@Ximian.com>

        ** A few fixes for better rfc compliance, and cleaner code.

        * camel-mime-utils.c (header_encode_param): a bunch of logic
        cleanups with new util functions.
        (header_decode_init): setup a new type ATTR_CHAR, for
        attribute-char.

        * tests/misc/test2.c (main): new test for rfc2184 stuff.

        * camel-mime-utils.c (header_convert): helper to convert between
        charsets.
        (rfc2184_decode): fix a bunch of logic problems and use the helper
        above to simplify code.
        (decode_param_token): removed, not needed.
        (header_decode_rfc2184_param): removed, not needed.
        (header_decode_param): removed, not needed.  ugh.
        (header_decode_param_list): completely rewritten, hence lack of
        need of above.

svn path=/trunk/; revision=26140
---
 camel/camel-mime-utils.c | 544 ++++++++++++++++++-----------------------------
 1 file changed, 203 insertions(+), 341 deletions(-)

(limited to 'camel/camel-mime-utils.c')

diff --git a/camel/camel-mime-utils.c b/camel/camel-mime-utils.c
index 35766e0b48..4e5ec39ee3 100644
--- a/camel/camel-mime-utils.c
+++ b/camel/camel-mime-utils.c
@@ -51,6 +51,7 @@
 #include "camel-mime-utils.h"
 #include "camel-charset-map.h"
 #include "camel-service.h"  /* for camel_gethostbyname() */
+#include "camel-utf8.h"
 
 #ifndef CLEAN_DATE
 #include "broken-date-parser.h"
@@ -96,6 +97,7 @@ static unsigned char camel_mime_base64_rank[256];
 					      encoded word in text specials: rfc 2047 5(1)*/
 #define CHARS_PSPECIAL "!*+-/" /* list of additional characters that can be left unencoded.
 				  encoded word in phrase specials: rfc 2047 5(3) */
+#define CHARS_ATTRCHAR "*\'% "	/* extra non-included attribute-chars */
 
 static void
 header_remove_bits(unsigned short bit, unsigned char *vals)
@@ -144,20 +146,22 @@ header_decode_init(void)
 
 	for (i=0;i<256;i++) {
 		camel_mime_special_table[i] = 0;
-		if (i<32)
+		if (i<32 || i==127)
 			camel_mime_special_table[i] |= CAMEL_MIME_IS_CTRL;
+		else if (i < 127)
+			camel_mime_special_table[i] |= CAMEL_MIME_IS_ATTRCHAR;
 		if ((i>=32 && i<=60) || (i>=62 && i<=126) || i==9)
 			camel_mime_special_table[i] |= (CAMEL_MIME_IS_QPSAFE|CAMEL_MIME_IS_ESAFE);
 		if ((i>='0' && i<='9') || (i>='a' && i<='z') || (i>='A' && i<= 'Z'))
 			camel_mime_special_table[i] |= CAMEL_MIME_IS_PSAFE;
 	}
-	camel_mime_special_table[127] |= CAMEL_MIME_IS_CTRL;
 	camel_mime_special_table[' '] |= CAMEL_MIME_IS_SPACE;
 	header_init_bits(CAMEL_MIME_IS_LWSP, 0, 0, CHARS_LWSP);
 	header_init_bits(CAMEL_MIME_IS_TSPECIAL, CAMEL_MIME_IS_CTRL, 0, CHARS_TSPECIAL);
 	header_init_bits(CAMEL_MIME_IS_SPECIAL, 0, 0, CHARS_SPECIAL);
 	header_init_bits(CAMEL_MIME_IS_DSPECIAL, 0, FALSE, CHARS_DSPECIAL);
 	header_remove_bits(CAMEL_MIME_IS_ESAFE, CHARS_ESPECIAL);
+	header_remove_bits(CAMEL_MIME_IS_ATTRCHAR, CHARS_TSPECIAL CHARS_ATTRCHAR);
 	header_init_bits(CAMEL_MIME_IS_PSAFE, 0, 0, CHARS_PSPECIAL);
 }
 
@@ -1826,6 +1830,33 @@ hex_decode (const char *in, size_t len)
 	return outbuf;
 }
 
+/* Tries to convert @in @from charset @to charset.  Any failure, we get no data out rather than partial conversion */
+static char *
+header_convert(const char *to, const char *from, const char *in, size_t inlen)
+{
+	iconv_t ic;
+	size_t outlen, ret;
+	char *outbuf, *outbase, *result = NULL;
+
+	ic = e_iconv_open(to, from);
+	if (ic == (iconv_t) -1)
+		return NULL;
+
+	outlen = inlen * 6 + 16;
+	outbuf = outbase = g_malloc(outlen);
+			
+	ret = e_iconv(ic, &in, &inlen, &outbuf, &outlen);
+	if (ret != (size_t) -1) {
+		e_iconv(ic, NULL, 0, &outbuf, &outlen);
+		*outbuf = '\0';
+		result = g_strdup(outbase);
+	}
+	e_iconv_close(ic);
+	g_free(outbase);
+
+	return result;
+}
+
 /* an rfc2184 encoded string looks something like:
  * us-ascii'en'This%20is%20even%20more%20
  */
@@ -1836,221 +1867,29 @@ rfc2184_decode (const char *in, size_t len)
 	const char *inptr = in;
 	const char *inend = in + len;
 	const char *charset;
-	char *decoded = NULL;
-	char *encoding;
+	char *decoded, *decword, *encoding;
 	
 	inptr = memchr (inptr, '\'', len);
 	if (!inptr)
 		return NULL;
-	
-	encoding = g_strndup (in, inptr - in);
+
+	encoding = g_alloca(inptr-in+1);
+	memcpy(encoding, in, inptr-in);
+	encoding[inptr-in] = 0;
 	charset = e_iconv_charset_name (encoding);
-	g_free (encoding);
 	
 	inptr = memchr (inptr + 1, '\'', inend - inptr - 1);
 	if (!inptr)
 		return NULL;
-	
 	inptr++;
-	if (inptr < inend) {
-		char *decword, *outbase, *outbuf;
-		const char *inbuf;
-		size_t inlen, outlen;
-		iconv_t ic;
-		
-		inbuf = decword = hex_decode (inptr, inend - inptr);
-		inlen = strlen (inbuf);
-		
-		ic = e_iconv_open ("UTF-8", charset);
-		if (ic != (iconv_t) -1) {
-			size_t ret;
-			
-			outlen = inlen * 6 + 16;
-			outbuf = outbase = g_malloc (outlen);
-			
-			ret = e_iconv (ic, &inbuf, &inlen, &outbuf, &outlen);
-			if (ret != (size_t) -1) {
-				e_iconv (ic, NULL, 0, &outbuf, &outlen);
-				*outbuf = '\0';
-				g_free (decoded);
-				decoded = outbase;
-			}
-			
-			e_iconv_close (ic);
-		} else {
-			decoded = decword;
-		}
-	}
-	
-	return decoded;
-}
-
-/* This function is basically the same as decode_token()
- * except that it will not accept *'s which have a special
- * meaning for rfc2184 params */
-static char *
-decode_param_token (const char **in)
-{
-	const char *inptr = *in;
-	const char *start;
-	
-	header_decode_lwsp (&inptr);
-	start = inptr;
-	while (camel_mime_is_ttoken (*inptr) && *inptr != '*')
-		inptr++;
-	if (inptr > start) {
-		*in = inptr;
-		return g_strndup (start, inptr - start);
-	} else {
+	if (inptr >= inend)
 		return NULL;
-	}
-}
 
-static gboolean
-header_decode_rfc2184_param (const char **in, char **paramp, gboolean *is_encoded, int *part)
-{
-	gboolean is_rfc2184 = FALSE;
-	const char *inptr = *in;
-	char *param;
-	
-	*is_encoded = FALSE;
-	*part = -1;
-	
-	param = decode_param_token (&inptr);
-	header_decode_lwsp (&inptr);
-	
-	if (*inptr == '*') {
-		is_rfc2184 = TRUE;
-		inptr++;
-		header_decode_lwsp (&inptr);
-		if (*inptr == '=') {
-			/* form := param*=value */
-			if (is_encoded)
-				*is_encoded = TRUE;
-		} else {
-			/* form := param*#=value or param*#*=value */
-			*part = camel_header_decode_int (&inptr);
-			header_decode_lwsp (&inptr);
-			if (*inptr == '*') {
-				/* form := param*#*=value */
-				if (is_encoded)
-					*is_encoded = TRUE;
-				inptr++;
-				header_decode_lwsp (&inptr);
-			}
-		}
-	}
-	
-	if (paramp)
-		*paramp = param;
-	
-	if (param)
-		*in = inptr;
-	
-	return is_rfc2184;
-}
+	decword = hex_decode (inptr, inend - inptr);
+	decoded = header_convert("UTF-8", charset, decword, strlen(decword));
+	g_free(decword);
 
-static int
-header_decode_param (const char **in, char **paramp, char **valuep, int *is_rfc2184_param, int *rfc2184_part)
-{
-	gboolean is_rfc2184_encoded = FALSE;
-	gboolean is_rfc2184 = FALSE;
-	const char *inptr = *in;
-	char *param = NULL;
-	char *value = NULL;
-	
-	*is_rfc2184_param = FALSE;
-	*rfc2184_part = -1;
-	
-	is_rfc2184 = header_decode_rfc2184_param (&inptr, &param, &is_rfc2184_encoded, rfc2184_part);
-	
-	if (*inptr == '=') {
-		inptr++;
-		value = header_decode_value (&inptr);
-		
-		if (value && is_rfc2184) {
-			/* We have ourselves an rfc2184 parameter */
-			
-			if (*rfc2184_part == -1) {
-				/* rfc2184 allows the value to be broken into
-				 * multiple parts - this isn't one of them so
-				 * it is safe to decode it.
-				 */
-				char *val;
-				
-				val = rfc2184_decode (value, strlen (value));
-				if (val) {
-					g_free (value);
-					value = val;
-				}
-			} else {
-				/* Since we are expecting to find the rest of
-				 * this paramter value later, let our caller know.
-				 */
-				*is_rfc2184_param = TRUE;
-			}
-		} else if (value && !strncmp (value, "=?", 2)) {
-			/* We have a broken param value that is rfc2047 encoded.
-			 * Since both Outlook and Netscape/Mozilla do this, we
-			 * should handle this case.
-			 */
-			char *val;
-			
-			if ((val = header_decode_text (value, strlen (value), NULL))) {
-				g_free (value);
-				value = val;
-			}
-		}
-	}
-	
-	if (value && !g_utf8_validate (value, -1, NULL)) {
-		/* The (broken) mailer sent us an unencoded 8bit value
-		 * attempt to save it by assuming it's in the user's
-		 * locale and converting to utf8 */
-		char *outbase, *outbuf, *p;
-		const char *charset, *inbuf;
-		size_t inlen, outlen;
-		iconv_t ic;
-		
-		inbuf = value;
-		inlen = strlen (inbuf);
-		
-		charset = e_iconv_locale_charset ();
-		ic = e_iconv_open ("UTF-8", charset ? charset : "ISO-8859-1");
-		if (ic != (iconv_t) -1) {
-			size_t ret;
-			
-			outlen = inlen * 6 + 16;
-			outbuf = outbase = g_malloc (outlen);
-			
-			ret = e_iconv (ic, &inbuf, &inlen, &outbuf, &outlen);
-			if (ret != (size_t) -1) {
-				e_iconv (ic, NULL, 0, &outbuf, &outlen);
-				*outbuf = '\0';
-			}
-			
-			e_iconv_close (ic);
-			
-			g_free (value);
-			value = outbase;
-		} else {
-			/* Okay, so now what? I guess we convert invalid chars to _'s? */
-			for (p = value; *p; p++)
-				if (!isascii ((unsigned) *p))
-					*p = '_';
-		}
-	}
-	
-	if (param && value) {
-		*paramp = param;
-		*valuep = value;
-		*in = inptr;
-		return 0;
-	} else {
-		g_free (param);
-		g_free (value);
-		return 1;
-	}
+	return decoded;
 }
 
 char *
@@ -2953,87 +2792,158 @@ camel_header_mime_decode(const char *in, int *maj, int *min)
 	d(printf("major = %d, minor = %d\n", major, minor));
 }
 
+struct _rfc2184_param {
+	struct _camel_header_param param;
+	int index;
+};
+
+static int
+rfc2184_param_cmp(const void *ap, const void *bp)
+{
+	const struct _rfc2184_param *a = *(void **)ap;
+	const struct _rfc2184_param *b = *(void **)bp;
+	int res;
+
+	res = strcmp(a->param.name, b->param.name);
+	if (res == 0) {
+		if (a->index > b->index)
+			res = 1;
+		else if (a->index < b->index)
+			res = -1;
+	}
+		
+	return res;
+}
+
+/* NB: Steals name and value */
+static struct _camel_header_param *
+header_append_param(struct _camel_header_param *last, char *name, char *value)
+{
+	struct _camel_header_param *node;
+
+	/* This handles -
+	    8 bit data in parameters, illegal, tries to convert using locale, or just safens it up.
+	    rfc2047 ecoded parameters, illegal, decodes them anyway.  Some Outlook & Mozilla do this?
+	*/
+	node = g_malloc(sizeof(*node));
+	last->next = node;
+	node->next = NULL;
+	node->name = name;
+	if (strncmp(value, "=?", 2) == 0
+	    && (node->value = header_decode_text(value, strlen(value), NULL))) {
+		g_free(value);
+	} else if (!g_utf8_validate(value, -1, NULL)) {
+		const char * charset = e_iconv_locale_charset();
+
+		if ((node->value = header_convert("UTF-8", charset?charset:"ISO-8859-1", value, strlen(value)))) {
+			g_free(value);
+		} else {
+			node->value = value;
+			for (;*value;value++)
+				if (!isascii((unsigned char)*value))
+					*value = '_';
+		}
+	} else
+		node->value = value;
+
+	return node;
+}
+
 static struct _camel_header_param *
 header_decode_param_list (const char **in)
 {
+	struct _camel_header_param *head = NULL, *last = (struct _camel_header_param *)&head;
+	GPtrArray *split = NULL;
 	const char *inptr = *in;
-	struct _camel_header_param *head = NULL, *tail = NULL;
-	gboolean last_was_rfc2184 = FALSE;
-	gboolean is_rfc2184 = FALSE;
-	
-	header_decode_lwsp (&inptr);
-	
+	struct _rfc2184_param *work;
+	char *tmp;
+
+	/* Dump parameters into the output list, in the order found.  RFC 2184 split parameters are kept in an array */
+	header_decode_lwsp(&inptr);
 	while (*inptr == ';') {
-		struct _camel_header_param *param;
-		char *name, *value;
-		int rfc2184_part;
-		
+		char *name;
+		char *value = NULL;
+
 		inptr++;
-		/* invalid format? */
-		if (header_decode_param (&inptr, &name, &value, &is_rfc2184, &rfc2184_part) != 0)
-			break;
-		
-		if (is_rfc2184 && tail && !strcasecmp (name, tail->name)) {
-			/* rfc2184 allows a parameter to be broken into multiple parts
-			 * and it looks like we've found one. Append this value to the
-			 * last value.
-			 */
-			/* FIXME: we should be ordering these based on rfc2184_part id */
-			GString *gvalue;
-			
-			gvalue = g_string_new (tail->value);
-			g_string_append (gvalue, value);
-			g_free (tail->value);
-			g_free (value);
-			g_free (name);
-			
-			tail->value = gvalue->str;
-			g_string_free (gvalue, FALSE);
-		} else {
-			if (last_was_rfc2184) {
-				/* We've finished gathering the values for the last param
-				 * so it is now safe to decode it.
-				 */
-				char *val;
-				
-				val = rfc2184_decode (tail->value, strlen (tail->value));
-				if (val) {
-					g_free (tail->value);
-					tail->value = val;
+		name = decode_token(&inptr);
+		header_decode_lwsp(&inptr);
+		if (*inptr == '=') {
+			inptr++;
+			value = header_decode_value(&inptr);
+		}
+
+		if (name && value) {
+			char *index = strchr(name, '*');
+
+			if (index) {
+				if (index[1] == 0) {
+					/* VAL*="foo", decode immediately and append */
+					*index = 0;
+					tmp = rfc2184_decode(value, strlen(value));
+					if (tmp) {
+						g_free(value);
+						value = tmp;
+					}
+					last = header_append_param(last, name, value);
+				} else {
+					/* VAL*1="foo", save for later */
+					*index++ = 0;
+					work = g_malloc(sizeof(*work));
+					work->param.name = name;
+					work->param.value = value;
+					work->index = atoi(index);
+					if (split == NULL)
+						split = g_ptr_array_new();
+					g_ptr_array_add(split, work);
 				}
+			} else {
+				last = header_append_param(last, name, value);
 			}
-			
-			param = g_malloc (sizeof (struct _camel_header_param));
-			param->name = name;
-			param->value = value;
-			param->next = NULL;
-			if (head == NULL)
-				head = param;
-			if (tail)
-				tail->next = param;
-			tail = param;
+		} else {
+			g_free(name);
+			g_free(value);
 		}
-		
-		last_was_rfc2184 = is_rfc2184;
-		
-		header_decode_lwsp (&inptr);
+
+		header_decode_lwsp(&inptr);
 	}
-	
-	if (last_was_rfc2184) {
-		/* We've finished gathering the values for the last param
-		 * so it is now safe to decode it.
-		 */
-		char *val;
-		
-		val = rfc2184_decode (tail->value, strlen (tail->value));
-		if (val) {
-			g_free (tail->value);
-			tail->value = val;
+
+	/* Rejoin any RFC 2184 split parameters in the proper order */
+	/* Parameters with the same index will be concatenated in undefined order */
+	if (split) {
+		GString *value = g_string_new("");
+		struct _rfc2184_param *first;
+		int i;
+
+		qsort(split->pdata, split->len, sizeof(split->pdata[0]), rfc2184_param_cmp);
+		first = split->pdata[0];
+		for (i=0;i<split->len;i++) {
+			work = split->pdata[i];
+			if (split->len-1 == i)
+				g_string_append(value, work->param.value);
+			if (split->len-1 == i || strcmp(work->param.name, first->param.name) != 0) {
+				tmp = rfc2184_decode(value->str, value->len);
+				if (tmp == NULL)
+					tmp = g_strdup(value->str);
+
+				last = header_append_param(last, g_strdup(first->param.name), tmp);
+				g_string_truncate(value, 0);
+				first = work;
+			}
+			if (split->len-1 != i)
+				g_string_append(value, work->param.value);
 		}
+		g_string_free(value, TRUE);
+		for (i=0;i<split->len;i++) {
+			work = split->pdata[i];
+			g_free(work->param.name);
+			g_free(work->param.value);
+			g_free(work);
+		}
+		g_ptr_array_free(split, TRUE);
 	}
-	
+
 	*in = inptr;
-	
+
 	return head;
 }
 
@@ -3046,23 +2956,19 @@ camel_header_param_list_decode(const char *in)
 	return header_decode_param_list(&in);
 }
 
-
 static char *
 header_encode_param (const unsigned char *in, gboolean *encoded)
 {
-	register const unsigned char *inptr = in;
+	const unsigned char *inptr = in;
 	unsigned char *outbuf = NULL;
-	const unsigned char *inend;
-	iconv_t cd = (iconv_t) -1;
 	const char *charset;
-	char *outstr;
 	int encoding;
 	GString *out;
-	
+	guint32 c;
+
 	*encoded = FALSE;
 	
 	g_return_val_if_fail (in != NULL, NULL);
-	g_return_val_if_fail (g_utf8_validate (in, -1, NULL), NULL);
 	
 	/* do a quick us-ascii check (the common case?) */
 	while (*inptr) {
@@ -3076,87 +2982,43 @@ header_encode_param (const unsigned char *in, gboolean *encoded)
 	
 	inptr = in;
 	encoding = 0;
-	while (inptr && *inptr) {
-		const char *newinptr;
-		gunichar c;
-		
-		newinptr = g_utf8_next_char (inptr);
-		c = g_utf8_get_char (inptr);
-		if (newinptr == NULL || !g_unichar_validate (c)) {
-			w(g_warning ("Invalid UTF-8 sequence encountered (pos %d, char '%c'): %s",
-				     (inptr-in), inptr[0], in));
-			inptr++;
-			continue;
-		}
-		
-		if (c > 127 && c < 256) {
+	while ( encoding !=2 && (c = camel_utf8_getc(&inptr)) ) {
+		if (c > 127 && c < 256)
 			encoding = MAX (encoding, 1);
-		} else if (c >= 256) {
+		else if (c >= 256)
 			encoding = MAX (encoding, 2);
-		}
-		
-		inptr = newinptr;
 	}
-	
+
 	if (encoding == 2)
-		charset = camel_charset_best (in, inptr - in);
+		charset = camel_charset_best(in, strlen(in));
 	else
 		charset = "iso-8859-1";
 	
-	if (strcasecmp (charset, "UTF-8") != 0)
-		cd = e_iconv_open (charset, "UTF-8");
-	
-	if (cd == (iconv_t) -1) {
+	if (g_ascii_strcasecmp(charset, "UTF-8") != 0
+	    && (outbuf = header_convert(charset, "UTF-8", in, strlen(in)))) {
+		inptr = outbuf;
+	} else {
 		charset = "UTF-8";
 		inptr = in;
-		inend = inptr + strlen (in);
-	} else {
-		size_t inleft, outleft;
-		const char *inbuf;
-		char *outptr;
-		
-		inleft = (inptr - in);
-		outleft = inleft * 6 + 20;
-		outptr = outbuf = g_malloc (outleft);
-		inbuf = in;
-		
-		if (e_iconv (cd, &inbuf, &inleft, &outptr, &outleft) == (size_t) -1) {
-			w(g_warning ("Conversion problem: conversion truncated: %s", strerror (errno)));
-		} else {
-			e_iconv (cd, NULL, 0, &outptr, &outleft);
-		}
-		
-		e_iconv_close (cd);
-		
-		inptr = outbuf;
-		inend = outptr;
 	}
 	
 	/* FIXME: set the 'language' as well, assuming we can get that info...? */
-	out = g_string_new ("");
-	g_string_append_printf (out, "%s''", charset);
-	
-	while (inptr < inend) {
-		unsigned char c = *inptr++;
-		
-		/* FIXME: make sure that '\'', '*', and ';' are also encoded */
-		
-		if (c > 127) {
-			g_string_append_printf (out, "%%%c%c", tohex[(c >> 4) & 0xf], tohex[c & 0xf]);
-		} else if (camel_mime_is_lwsp (c) || !(camel_mime_special_table[c] & CAMEL_MIME_IS_ESAFE)) {
-			g_string_append_printf (out, "%%%c%c", tohex[(c >> 4) & 0xf], tohex[c & 0xf]);
-		} else {
+	out = g_string_new (charset);
+	g_string_append(out, "''");
+
+	while ( (c = *inptr++) ) {
+		if (camel_mime_is_attrchar(c))
 			g_string_append_c (out, c);
-		}
+		else
+			g_string_append_printf (out, "%%%c%c", tohex[(c >> 4) & 0xf], tohex[c & 0xf]);
 	}
-	
 	g_free (outbuf);
 	
-	outstr = out->str;
+	outbuf = out->str;
 	g_string_free (out, FALSE);
 	*encoded = TRUE;
 	
-	return outstr;
+	return outbuf;
 }
 
 void
-- 
cgit v1.2.3