From 5ee8f0795e6f2f1fc5ed49435559f75022f44aa6 Mon Sep 17 00:00:00 2001 From: Robert Brady Date: Sun, 8 Aug 1999 22:38:40 +0000 Subject: more advanced RFC2047 encoder started. 1999-08-08 Robert Brady * camel/gmime-rfc2047.c: more advanced RFC2047 encoder started. svn path=/trunk/; revision=1098 --- ChangeLog | 4 ++ camel/gmime-rfc2047.c | 137 ++++++++++++++++++++++++++++++++++++++------------ 2 files changed, 110 insertions(+), 31 deletions(-) diff --git a/ChangeLog b/ChangeLog index 9d1221aef9..66794caa7a 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,7 @@ +1999-08-08 Robert Brady + + * camel/gmime-rfc2047.c: more advanced RFC2047 encoder started. + 1999-08-08 bertrand * camel/providers/MH/camel-mh-folder.c (_get_message): diff --git a/camel/gmime-rfc2047.c b/camel/gmime-rfc2047.c index ecb75d519d..6130526c94 100644 --- a/camel/gmime-rfc2047.c +++ b/camel/gmime-rfc2047.c @@ -166,7 +166,9 @@ rfc2047_decode_word (const gchar *data, const gchar *into_what) g_free(q); return g_strdup(buffer); } - unicode_iconv(i, &cook_2, &cook_len, &b, &b_len); + if (unicode_iconv(i, &cook_2, &cook_len, &b, &b_len)==-1) + /* FIXME : use approximation code if we can't convert it properly. */ + ; unicode_iconv_close(i); *b = 0; } @@ -174,7 +176,7 @@ rfc2047_decode_word (const gchar *data, const gchar *into_what) return g_strdup(buffer); } -static gchar * +static const gchar * find_end_of_encoded_word(const gchar *data) { /* We can't just search for ?=, because of the case : @@ -228,38 +230,111 @@ gmime_rfc2047_decode (const gchar *data, const gchar *into_what) return buffer; } -gchar -*gmime_rfc2047_encode (const gchar *string, const gchar *charset) -{ - gchar buffer[4096] /* FIXME : constant sized buffer */; - gchar *b = buffer; - const gchar *s = string; - int not_ascii = 0, not_latin1 = 0; - while (*s) { - if (*s <= 20 || *s >= 0x7f || *s == '=') { not_ascii = 1; } - s++; - } - - if (!not_ascii) { - b += sprintf (b, "%s", string); +#define isnt_ascii(a) ((a) <= 0x1f || (a) >= 0x7f) + +static int rfc2047_clean(const gchar *string) { + if (strstr(string, "?=")) return 1; + while (*string) { + if (!isnt_ascii((unsigned char)*string)) + return 0; + string++; } - - else { - b += sprintf (b, "=?%s?Q?", charset); - s = string; - while (*s) { - if (*s == ' ') b += sprintf (b, "_"); - else if (*s < 0x20 || *s >= 0x7f || *s == '=' || *s == '?' || *s == '_') { - b += sprintf (b, "=%2x", (unsigned char)*s); - } else { - b += sprintf (b, "%c", *s); - } - s++; + return 1; +} + +static gchar *encode_word (const gchar *string, const gchar *said_charset) { + if (rfc2047_clean(string)) + /* don't bother encoding it if it has no odd characters in it */ + return g_strdup(string); + { + char *temp = malloc(strlen(string) * 4 + 1), *t = temp; + t += sprintf(t, "=?%s?q?", said_charset); + while (*string) { + if (*string == ' ') + *(t++) = '_'; + else if (*string <= 0x1f || *string >= 0x7f || *string == '=' || *string == '?') + t += sprintf(t, "=%2x", (unsigned char)*string); + else + *(t++) = *string; + + string++; } - b += sprintf (b, "?="); + t += sprintf(t, "?="); + *t = 0; + return temp; } +} + +gchar * +gmime_rfc2047_encode (const gchar *string, const gchar *charset) +{ + int temp_len = strlen(string)*4 + 1; + char *temp = g_malloc(temp_len), *temp_2 = temp; + int string_length = strlen(string); + char *encoded = NULL; + + /* first, let us convert to UTF-8 */ + iconv_t i = unicode_iconv_open("UTF-8", charset); + unicode_iconv(i, &string, &string_length, &temp_2, &temp_len); + unicode_iconv_close(i); - *b = 0; + /* null terminate it */ + *temp_2 = 0; + + /* now encode it as if it were a single word */ + encoded = encode_word(temp, "UTF-8"); + + /* + + real algorithm : + + we need to + + split it into words + + identify portions that have NOT to be encoded (i.e. <> and the comment starter/ender ) + + identify the best character set for each word + + merge words which share a character set, allow jumping and merging with words which + would be ok to encode in non-US-ASCII. + + if we have to use 2 character sets, try and collapse them into one. + + (e.g. if one word contains letters in latin-1, and another letters in latin-2, use + latin-2 for the first word as well if possible). + + finally : + + if utf-8 will still be used, use it for everything. + + and then, at last, generate the encoded text, using base64/quoted-printable for + each word depending upon which is more efficient. + + TODO : + create a priority list of encodings + + i.e. + + US-ASCII, ISO-8859-1, ISO-8859-2, ISO-8859-3, KOI8, + + Should survey for most popular charsets : + what do people usually use for the following scripts? + + * Chinese/Japanese/Korean + * Greek + * Cyrillic + + (any other scripts commonly used in mail/news?) + + This algorithm is probably far from optimal, but should be + reasonably efficient for simple cases. (and almost free if + the text is just in US-ASCII : like 99% of the text that will + pass through it) + + */ + + g_free(temp); - return g_strdup (buffer); + return encoded; } -- cgit v1.2.3