diff options
-rw-r--r-- | camel/ChangeLog | 10 | ||||
-rw-r--r-- | camel/Makefile.am | 2 | ||||
-rw-r--r-- | camel/camel-utf8.c | 257 | ||||
-rw-r--r-- | camel/camel-utf8.h | 16 | ||||
-rw-r--r-- | camel/providers/imap/camel-imap-utils.c | 266 |
5 files changed, 300 insertions, 251 deletions
diff --git a/camel/ChangeLog b/camel/ChangeLog index f1c3f6cc72..a8277e579b 100644 --- a/camel/ChangeLog +++ b/camel/ChangeLog @@ -1,3 +1,13 @@ +2002-08-28 Not Zed <NotZed@Ximian.com> + + * providers/imap/camel-imap-utils.c (imap_mailbox_encode): Changed + to use camel_utf8_utf7 code. + (imap_mailbox_decode): As above, using camel_utf7_utf8. 'UTF-7' + isn't a widely supported iconv() codeset, and besides the new code + is simpler. + + * camel-utf8.[ch]: robust utilities for working with utf8 and utf7. + 2002-08-27 Jeffrey Stedfast <fejj@ximian.com> * camel-folder-thread.c (camel_folder_thread_messages_new): Now diff --git a/camel/Makefile.am b/camel/Makefile.am index 642641e12f..4bb0ddeba6 100644 --- a/camel/Makefile.am +++ b/camel/Makefile.am @@ -109,6 +109,7 @@ libcamel_la_SOURCES = \ camel-transport.c \ camel-uid-cache.c \ camel-url.c \ + camel-utf8.c \ camel-vee-folder.c \ camel-vee-store.c \ camel-vtrash-folder.c \ @@ -208,6 +209,7 @@ libcamelinclude_HEADERS = \ camel-types.h \ camel-uid-cache.h \ camel-url.h \ + camel-utf8.h \ camel-vee-folder.h \ camel-vee-store.h \ camel-vtrash-folder.h \ diff --git a/camel/camel-utf8.c b/camel/camel-utf8.c new file mode 100644 index 0000000000..5ed5a476d0 --- /dev/null +++ b/camel/camel-utf8.c @@ -0,0 +1,257 @@ + +#include <glib.h> +#include "camel-utf8.h" + +/** + * camel_utf8_putc: + * @ptr: address of the output cursor; it is moved past the octets written + * @c: the character to encode + * + * Output a 32 bit unicode character as utf8 octets. At most 4 octets will + * be written to @ptr. @ptr will be advanced to the next character position. 
+ **/ +void +camel_utf8_putc(unsigned char **ptr, guint32 c) +{ + register unsigned char *p = *ptr; +/* pick the shortest of the 1 to 4 octet forms from the magnitude of c */ + if (c <= 0x7f) + *p++ = c; + else if (c <= 0x7ff) { + *p++ = 0xc0 | c >> 6; + *p++ = 0x80 | (c & 0x3f); + } else if (c <= 0xffff) { + *p++ = 0xe0 | c >> 12; + *p++ = 0x80 | ((c >> 6) & 0x3f); + *p++ = 0x80 | (c & 0x3f); + } else { + /* see unicode standard 3.0, S 3.8, max 4 octets; every c above 0xffff is written in this form, there is no upper range check */ + *p++ = 0xf0 | c >> 18; + *p++ = 0x80 | ((c >> 12) & 0x3f); + *p++ = 0x80 | ((c >> 6) & 0x3f); + *p++ = 0x80 | (c & 0x3f); + } +/* hand the advanced cursor back to the caller */ + *ptr = p; +} + +/** + * camel_utf8_getc: + * @ptr: address of the read cursor; it is moved past every octet consumed + * + * Get a Unicode character from a utf8 stream. @ptr will be advanced + * to the next character position. Invalid utf8 characters will be + * silently skipped. @ptr should point to a NUL terminated array. + * + * Return value: The next Unicode character. @ptr will be advanced to + * the next character always. The terminating NUL is returned as 0 with @ptr left one octet past it. + **/ +guint32 +camel_utf8_getc(const unsigned char **ptr) +{ + register unsigned char *p = (unsigned char *)*ptr; + register unsigned char c, r; + register guint32 v, m; +/* again: fetch a fresh lead octet. loop: re-examine r, which may be the octet that just broke a sequence */ +again: + r = *p++; +loop: + if (r < 0x80) { + *ptr = p; + v = r; + } else if (r < 0xf8) { /* valid start char? 
(max 4 octets); NOTE(review): stray continuation octets 0x80 to 0xbf also pass this test - confirm intended */ + v = r; + m = 0x7f80; /* used to mask out the length bits: shifted left 5 per continuation octet so that ~m clears the length marker of the lead octet from v */ + do { + c = *p++; + if ((c & 0xc0) != 0x80) {/* not a continuation octet: drop the partial character and restart with c as the lead */ + r = c; + goto loop; + } + v = (v<<6) | (c & 0x3f); + r<<=1; + m<<=5; + } while (r & 0x40); + + *ptr = p; + + v &= ~m; + } else {/* 0xf8 and above cannot start a sequence: skip the octet */ + goto again; + } + + return v; +} +/* g_string_append_u: append the character c to out as utf8 octets. The 8 octet buffer holds the at most 4 octets camel_utf8_putc writes plus the NUL; for c == 0 the buffer starts with NUL, so presumably nothing is appended */ +void +g_string_append_u(GString *out, guint32 c) +{ + unsigned char buffer[8]; + unsigned char *p = buffer; + + camel_utf8_putc(&p, c); + *p = 0; + g_string_append(out, buffer); +} +/* the 64 output characters of the utf7 base64 runs, indexed by 6 bit value; the last two are plus and comma */ +static char *utf7_alphabet = + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,"; +/* inverse of utf7_alphabet: maps an octet to its 6 bit value, 0xff marks octets outside the alphabet */ +static unsigned char utf7_rank[256] = { + 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, + 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, + 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x3e,0x3f,0xff,0xff,0xff, + 0x34,0x35,0x36,0x37,0x38,0x39,0x3a,0x3b,0x3c,0x3d,0xff,0xff,0xff,0xff,0xff,0xff, + 0xff,0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e, + 0x0f,0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19,0xff,0xff,0xff,0xff,0xff, + 0xff,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f,0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28, + 0x29,0x2a,0x2b,0x2c,0x2d,0x2e,0x2f,0x30,0x31,0x32,0x33,0xff,0xff,0xff,0xff,0xff, + 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, + 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, + 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, + 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, + 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, + 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, + 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, + 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, +}; + +/** + * camel_utf7_utf8: + * @ptr: NUL terminated string in modified utf7 + * + * 
Convert a modified utf7 string to utf8. If the utf7 string + * contains 8 bit characters, they are treated as iso-8859-1. + * + * The IMAP rules [rfc2060] are used in the utf7 encoding. Each 16 bits decoded from a base64 run is output as one character; NOTE(review): surrogate pairs do not appear to be combined - confirm. + * + * Return value: The converted string, a copy made with g_strdup (presumably released by the caller with g_free). + **/ +char * +camel_utf7_utf8(const char *ptr) +{ + const unsigned char *p = (unsigned char *)ptr; + unsigned int c; + guint32 v=0, x; + GString *out; + int i=0; + int state = 0; + char *ret; +/* state: 0 = plain text, 1 = just saw the ampersand shift character, 2 = inside a base64 run; v collects 6 bit groups and i counts the bits pending in v */ + out = g_string_new(""); + do { + c = *p++; + switch(state) { + case 0: + if (c == '&') + state = 1; + else + g_string_append_u(out, c); + break; + case 1: + if (c == '-') { + g_string_append_c(out, '&'); + state = 0; + } else if (utf7_rank[c] != 0xff) { + v = utf7_rank[c]; + i = 6; + state = 2; + } else { + /* invalid: the octet after the ampersand is not in the alphabet; that octet is dropped and an ampersand followed by a minus is output */ + g_string_append(out, "&-"); + state = 0; + } + break; + case 2: + if (c == '-') { + state = 0; + } else if (utf7_rank[c] != 0xff) { + v = (v<<6) | utf7_rank[c]; + i+=6; + if (i >= 16) { + x = (v >> (i-16)) & 0xffff; + g_string_append_u(out, x); + i-=16; + } + } else {/* an octet outside the alphabet ends the run and is itself output as a character */ + g_string_append_u(out, c); + state = 0; + } + break; + } + } while (c); +/* do/while: the terminating NUL went through the switch once before the loop ended */ + ret = g_strdup(out->str); + g_string_free(out, TRUE); + + return ret; +} +/* utf7_closeb64: end a base64 run on out; the i leftover bits of v are zero padded to one final 6 bit character, then the closing minus is written */ +static void utf7_closeb64(GString *out, guint32 v, guint32 i) +{ + guint32 x; + + if (i>0) { + x = (v << (6-i)) & 0x3f; + g_string_append_c(out, utf7_alphabet[x]); + } + g_string_append_c(out, '-'); +} + +/** + * camel_utf8_utf7: + * @ptr: NUL terminated utf8 string + * + * Convert a utf8 string to a modified utf7 format. + * + * The IMAP rules [rfc2060] are used in the utf7 encoding. 
+ * + * Return value: The converted string, a copy made with g_strdup (presumably released by the caller with g_free). Characters 0x20 to 0x7e are copied through, with the ampersand written as ampersand plus minus; all others go into base64 runs as 16 bit units. NOTE(review): a character above 0xffff is queued as a single 16 bit step and does not look correctly encoded - confirm. + **/ +char * +camel_utf8_utf7(const char *ptr) +{ + const unsigned char *p = (unsigned char *)ptr; + unsigned int c; + guint32 x, v = 0; + int state = 0; + GString *out; + int i = 0; + char *ret; +/* state: 0 = copying printable ascii, 1 = inside a base64 run; v holds the queued bits and i counts those not yet written */ + out = g_string_new(""); + + while ( (c = camel_utf8_getc(&p)) ) { + if (c >= 0x20 && c <= 0x7e) { + if (state == 1) { + utf7_closeb64(out, v, i); + state = 0; + i = 0; + } + if (c == '&') + g_string_append(out, "&-"); + else + g_string_append_c(out, c); + } else { + if (state == 0) { + g_string_append_c(out, '&'); + state = 1; + } + v = (v << 16) | c;/* queue 16 bits, then write out every complete 6 bit group */ + i += 16; + while (i >= 6) { + x = (v >> (i-6)) & 0x3f; + g_string_append_c(out, utf7_alphabet[x]); + i -= 6; + } + } + } +/* input ended inside a base64 run: flush the leftover bits and close it */ + if (state == 1) + utf7_closeb64(out, v, i); + + ret = g_strdup(out->str); + g_string_free(out, TRUE); + + return ret; +} diff --git a/camel/camel-utf8.h b/camel/camel-utf8.h new file mode 100644 index 0000000000..7d6fac5410 --- /dev/null +++ b/camel/camel-utf8.h @@ -0,0 +1,16 @@ + +#ifndef _CAMEL_UTF8_H +#define _CAMEL_UTF8_H +/* write or read one character at a utf8 cursor; both move the cursor past the octets handled */ +void camel_utf8_putc(unsigned char **ptr, guint32 c); +guint32 camel_utf8_getc(const unsigned char **ptr); + +/* utility func for utf8 gstrings: appends the character c to out as utf8 octets */ +void g_string_append_u(GString *out, guint32 c); + +/* convert utf7 to/from utf8, actually this is modified IMAP utf7; both take a NUL terminated string and return a g_strdup copy */ +char *camel_utf7_utf8(const char *ptr); +char *camel_utf8_utf7(const char *ptr); + + +#endif /* ! 
_CAMEL_UTF8_H */ diff --git a/camel/providers/imap/camel-imap-utils.c b/camel/providers/imap/camel-imap-utils.c index 4f0f9f143b..15bf2540b3 100644 --- a/camel/providers/imap/camel-imap-utils.c +++ b/camel/providers/imap/camel-imap-utils.c @@ -30,6 +30,7 @@ #include "camel-imap-summary.h" #include "camel-imap-store.h" #include "camel-folder.h" +#include "camel-utf8.h" #define d(x) x @@ -1119,263 +1120,26 @@ imap_concat (CamelImapStore *imap_store, const char *prefix, const char *suffix) return g_strdup_printf ("%s%c%s", prefix, imap_store->dir_sep, suffix); } -#define UTF8_TO_UTF7_LEN(len) ((len * 3) + 8) -#define UTF7_TO_UTF8_LEN(len) (len) - -enum { - MODE_USASCII, - MODE_AMPERSAND, - MODE_MODUTF7 -}; - -#define is_usascii(c) (((c) >= 0x20 && (c) <= 0x25) || ((c) >= 0x27 && (c) <= 0x7e)) -#define encode_mode(c) (is_usascii (c) ? MODE_USASCII : (c) == '&' ? MODE_AMPERSAND : MODE_MODUTF7) - char * imap_mailbox_encode (const unsigned char *in, size_t inlen) { - const unsigned char *start, *inptr, *inend; - unsigned char *mailbox, *m, *mend; - size_t inleft, outleft, conv; - char *inbuf, *outbuf; - iconv_t cd; - int mode; - - cd = (iconv_t) -1; - m = mailbox = g_malloc (UTF8_TO_UTF7_LEN (inlen) + 1); - mend = mailbox + UTF8_TO_UTF7_LEN (inlen); - - start = inptr = in; - inend = in + inlen; - mode = MODE_USASCII; - - while (inptr < inend) { - int new_mode; - - new_mode = encode_mode (*inptr); - - if (new_mode != mode) { - switch (mode) { - case MODE_USASCII: - memcpy (m, start, inptr - start); - m += (inptr - start); - break; - case MODE_AMPERSAND: - while (start < inptr) { - *m++ = '&'; - *m++ = '-'; - start++; - } - break; - case MODE_MODUTF7: - inbuf = (char *) start; - inleft = inptr - start; - outbuf = (char *) m; - outleft = mend - m; - - if (cd == (iconv_t) -1) - cd = iconv_open ("UTF-7", "UTF-8"); - - conv = iconv (cd, &inbuf, &inleft, &outbuf, &outleft); - if (conv == (size_t) -1) { - g_warning ("error converting mailbox to UTF-7!"); - } - iconv (cd, NULL, 
NULL, &outbuf, &outleft); - - /* shift into modified UTF-7 mode (overwrite UTF-7's '+' shift)... */ - *m++ = '&'; - - while (m < (unsigned char *) outbuf) { - /* replace '/' with ',' */ - if (*m == '/') - *m = ','; - - m++; - } - - break; - } - - mode = new_mode; - start = inptr; - } - - inptr++; - } - - switch (mode) { - case MODE_USASCII: - memcpy (m, start, inptr - start); - m += (inptr - start); - break; - case MODE_AMPERSAND: - while (start < inptr) { - *m++ = '&'; - *m++ = '-'; - start++; - } - break; - case MODE_MODUTF7: - inbuf = (char *) start; - inleft = inptr - start; - outbuf = (char *) m; - outleft = mend - m; - - if (cd == (iconv_t) -1) - cd = iconv_open ("UTF-7", "UTF-8"); - - conv = iconv (cd, &inbuf, &inleft, &outbuf, &outleft); - if (conv == (size_t) -1) { - g_warning ("error converting mailbox to UTF-7!"); - } - iconv (cd, NULL, NULL, &outbuf, &outleft); - - /* shift into modified UTF-7 mode (overwrite UTF-7's '+' shift)... */ - *m++ = '&'; - - while (m < (unsigned char *) outbuf) { - /* replace '/' with ',' */ - if (*m == '/') - *m = ','; - - m++; - } - - break; - } - - *m = '\0'; - - if (cd != (iconv_t) -1) - iconv_close (cd); - - return mailbox; -} + char *buf; + buf = alloca(inlen+1); + memcpy(buf, in, inlen); + buf[inlen] = 0; +/* buf is a NUL terminated stack copy (alloca) of the inlen octets of in; the encoder returns its own separate copy. NOTE(review): the alloca size grows with inlen without a bound */ + return camel_utf8_utf7(buf); +} char * imap_mailbox_decode (const unsigned char *in, size_t inlen) { - const unsigned char *start, *inptr, *inend; - unsigned char *mailbox, *m, *mend; - unsigned char mode_switch; - iconv_t cd; - - cd = (iconv_t) -1; - m = mailbox = g_malloc (UTF7_TO_UTF8_LEN (inlen) + 1); - mend = mailbox + UTF7_TO_UTF8_LEN (inlen); - - start = inptr = in; - inend = in + inlen; - mode_switch = '&'; - - while (inptr < inend) { - if (*inptr == mode_switch) { - if (mode_switch == '&') { - /* mode switch from US-ASCII to UTF-7 */ - mode_switch = '-'; - memcpy (m, start, inptr - start); - m += (inptr - start); - start = inptr; - } else if (mode_switch == '-') { - /* mode switch from UTF-7 to US-ASCII or 
an ampersand (&) */ - mode_switch = '&'; - start++; - if (start == inptr) { - /* we had the sequence "&-" which becomes "&" when decoded */ - *m++ = '&'; - } else { - char *buffer, *inbuf, *outbuf; - size_t buflen, outleft, conv; - - buflen = (inptr - start) + 2; - inbuf = buffer = alloca (buflen); - *inbuf++ = '+'; - while (start < inptr) { - *inbuf++ = *start == ',' ? '/' : *start; - start++; - } - *inbuf = '-'; - - inbuf = buffer; - outbuf = (char *) m; - outleft = mend - m; - - if (cd == (iconv_t) -1) - cd = iconv_open ("UTF-8", "UTF-7"); - - conv = iconv (cd, &inbuf, &buflen, &outbuf, &outleft); - if (conv == (size_t) -1) { - g_warning ("error decoding mailbox: %.*s", inlen, in); - } - iconv (cd, NULL, NULL, NULL, NULL); - - m = (unsigned char *) outbuf; - } - - /* point to the char after the '-' */ - start = inptr + 1; - } - } - - inptr++; - } - - if (*inptr == mode_switch) { - if (mode_switch == '&') { - /* the remaining text is US-ASCII */ - memcpy (m, start, inptr - start); - m += (inptr - start); - start = inptr; - } else if (mode_switch == '-') { - /* We've got encoded UTF-7 or else an ampersand */ - start++; - if (start == inptr) { - /* we had the sequence "&-" which becomes "&" when decoded */ - *m++ = '&'; - } else { - char *buffer, *inbuf, *outbuf; - size_t buflen, outleft, conv; - - buflen = (inptr - start) + 2; - inbuf = buffer = alloca (buflen); - *inbuf++ = '+'; - while (start < inptr) { - *inbuf++ = *start == ',' ? '/' : *start; - start++; - } - *inbuf = '-'; - - inbuf = buffer; - outbuf = (char *) m; - outleft = mend - m; - - if (cd == (iconv_t) -1) - cd = iconv_open ("UTF-8", "UTF-7"); - - conv = iconv (cd, &inbuf, &buflen, &outbuf, &outleft); - if (conv == (size_t) -1) { - g_warning ("error decoding mailbox: %.*s", inlen, in); - } - iconv (cd, NULL, NULL, NULL, NULL); - - m = (unsigned char *) outbuf; - } - } - } else { - if (mode_switch == '-') { - /* illegal encoded mailbox... 
*/ - g_warning ("illegal mailbox name encountered: %.*s", inlen, in); - } - - memcpy (m, start, inptr - start); - m += (inptr - start); - } - - *m = '\0'; - - if (cd != (iconv_t) -1) - iconv_close (cd); - - return mailbox; + char *buf; +/* same scheme as imap_mailbox_encode: NUL terminate a stack copy (alloca) of in, then convert it. NOTE(review): the alloca size grows with inlen without a bound */ + buf = alloca(inlen+1); + memcpy(buf, in, inlen); + buf[inlen] = 0; + + return camel_utf7_utf8(buf); } |