/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8; -*- */ /* * Authors: * Michael Zucchi * Dan Winship * * Copyright 2000, 2001 Ximian, Inc. (www.ximian.com) * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 * USA */ /* NOTE(review): the header names that should follow each #include in this file are not present in this copy; restore them from the upstream source before building. */ #ifdef HAVE_CONFIG_H #include #endif #include /* if you want to build the charset map, compile this with something like: gcc -DBUILD_MAP camel-charset-map.c `glib-config --cflags` (plus any -I/-L/-l flags you need for iconv), then run it as ./a.out > camel-charset-map-private.h Note that the big-endian variant isn't tested... The tables generated work like this: An indirect array for each page of unicode character Each array element has an indirect pointer to one of the bytes of the generated bitmask. */ #ifdef BUILD_MAP #include #include /* Charsets probed by the generator below; main() assigns each entry its own bit. */ static struct { char *name; unsigned int bit; /* assigned bit */ } tables[] = { /* These are the 8bit character sets (other than iso-8859-1, * which is special-cased) which are supported by both other * mailers and the GNOME environment. Note that the order * they're listed in is the order they'll be tried in, so put * the more-popular ones first. 
*/ { "iso-8859-2", 0 }, /* Central/Eastern European */ { "iso-8859-4", 0 }, /* Baltic */ { "koi8-r", 0 }, /* Russian */ { "windows-1251", 0 }, /* Russian */ { "koi8-u", 0 }, /* Ukrainian */ { "iso-8859-5", 0 }, /* Least-popular Russian encoding */ { "iso-8859-7", 0 }, /* Greek */ { "iso-8859-9", 0 }, /* Turkish */ { "iso-8859-13", 0 }, /* Baltic again */ { "iso-8859-15", 0 }, /* New-and-improved iso-8859-1, but most * programs that support this support UTF8 */ { 0, 0 } }; /* One bitmask per 16-bit code point; a charset's bit is set on every code point its high-half bytes converted to. */ unsigned int encoding_map[256 * 256]; /* iconv name of UCS-4 in host byte order, so the converted output can be read back as guint32 values. */ #if G_BYTE_ORDER == G_BIG_ENDIAN #define UCS "UCS-4BE" #else #define UCS "UCS-4LE" #endif /* Generator entry point: converts bytes 0x80..0xff of every charset in tables[] to UCS-4 with iconv, records the results in encoding_map and prints C source for the lookup tables on stdout. NOTE(review): standard C expects main to return int; max and min are declared but not used in the visible code. */ void main(void) { int i, j; int max, min; int bit = 0x01; int k; int bytes; iconv_t cd; char in[128]; guint32 out[128]; char *inptr, *outptr; size_t inlen, outlen; /* number of bytes needed to hold one bit per charset; the -1 leaves out the { 0, 0 } terminator */ bytes = ((sizeof(tables)/sizeof(tables[0]))+7-1)/8; /* the input is every high-half byte value, 0x80..0xff */ for (i = 0; i < 128; i++) in[i] = i + 128; for (j = 0; tables[j].name; j++) { /* NOTE(review): the result of iconv_open is not compared against (iconv_t)-1 before use */ cd = iconv_open (UCS, tables[j].name); inptr = in; outptr = (char *)(out); inlen = sizeof (in); outlen = sizeof (out); /* on EILSEQ skip the offending input byte and carry on; any other error prints a message and exits */ while (iconv (cd, &inptr, &inlen, &outptr, &outlen) == -1) { if (errno == EILSEQ) { inptr++; inlen--; } else { printf ("%s\n", strerror (errno)); exit (1); } } iconv_close (cd); /* outlen is the space left in out, so 128 - outlen / 4 is the number of code points written; the bit is set both on index i itself and on the code point stored in out[i] */ for (i = 0; i < 128 - outlen / 4; i++) { encoding_map[i] |= bit; encoding_map[out[i]] |= bit; } tables[j].bit = bit; bit <<= 1; } printf("/* This file is automatically generated: DO NOT EDIT */\n\n"); for (i=0;i<256;i++) { /* first, do we need this block? 
*/ /* NOTE(review): from here to the end of main() the text appears truncated in this copy (whatever stood between a '<' and the following '>' is missing), so the table-printing loops are incomplete; restore them from the upstream source. */ for (k=0;k> (k*8)) & 0xff ); if (((j+1)&7) == 0 && j<255) printf("\n\t"); } printf("\n};\n\n"); } } } printf("struct {\n"); for (k=0;k>8].bits%d?camel_charmap[(x)>>8].bits%d[(x)&0xff]<<%d:0)", k, k, k*8); if (k #include #include #include #include #include #ifdef ENABLE_THREADS #include #endif #ifdef HAVE_ALLOCA_H #include #endif #define cd(x) /* 'cache debug' */ /* Lock taken around the iconv_charsets table and the iconv cache list; both macros expand to nothing when ENABLE_THREADS is not defined. */ #ifdef ENABLE_THREADS static pthread_mutex_t iconv_charsets_lock = PTHREAD_MUTEX_INITIALIZER; #define ICONV_CHARSETS_LOCK() pthread_mutex_lock (&iconv_charsets_lock) #define ICONV_CHARSETS_UNLOCK() pthread_mutex_unlock (&iconv_charsets_lock) #else #define ICONV_CHARSETS_LOCK() #define ICONV_CHARSETS_UNLOCK() #endif /* ENABLE_THREADS */ /* One opened iconv descriptor; linked into either the inuse or the free list of its cache entry. */ struct _iconv_cache_node { EDListNode ln; iconv_t ip; }; /* Cache entry for one conversion; conv is the key built by camel_charset_iconv_open() (the "to" name followed by the "from" name). */ struct _iconv_cache { EDListNode ln; char *conv; EDList inuse; /* opened ic's in use - if both these lists empty == failed to open conversion */ EDList free; /* opened ic's free */ }; /* Number of cache entries above which camel_charset_iconv_open() tries to flush entries that have nothing in use. */ #define CAMEL_ICONV_CACHE_SIZE (16) static EDList iconv_cache_list; /* Maps charset names to iconv-friendly names; keys and values are g_strdup'd copies freed at shutdown. */ static GHashTable *iconv_charsets = NULL; /* Lower-cased codeset taken from the locale name by camel_charset_map_init(), or NULL. */ static char *locale_charset = NULL; struct { char *charset; char *iconv_name; } known_iconv_charsets[] = { /* charset name, iconv-friendly charset name */ { "iso-8859-1", "iso-8859-1" }, { "iso8859-1", "iso-8859-1" }, /* the above mostly serves as an example for iso-style charsets, but we have code that will populate the iso-*'s if/when they show up in camel_charset_to_iconv() so I'm not going to bother putting them all in here... */ { "windows-cp1251", "cp1251" }, { "windows-1251", "cp1251" }, { "cp1251", "cp1251" }, /* the above mostly serves as an example for windows-style charsets, but we have code that will parse and convert them to their cp#### equivalents if/when they show up in camel_charset_to_iconv() so I'm not going to bother putting them all in here... 
*/ { "ks_c_5601-1987", "euc-kr" }, { NULL, NULL } }; /* Hash-table foreach callback used at shutdown: frees one key/value pair of iconv_charsets; data is unused. */ static void shutdown_foreach (gpointer key, gpointer value, gpointer data) { g_free (key); g_free (value); } /* Closes every iconv descriptor held by ic (both the inuse and the free list), then frees ic->conv and ic itself. It does not unlink ic from iconv_cache_list. */ static void flush_iconv_entry(struct _iconv_cache *ic) { struct _iconv_cache_node *node; cd(printf("Flushing iconv cache entry: %s\n", ic->conv)); while ( (node = (struct _iconv_cache_node *)e_dlist_remhead(&ic->inuse)) ) { iconv_close(node->ip); g_free(node); } while ( (node = (struct _iconv_cache_node *)e_dlist_remhead(&ic->free)) ) { iconv_close(node->ip); g_free(node); } g_free(ic->conv); g_free(ic); } /* Exit handler registered by camel_charset_map_init(): frees the charset table and its strings, the locale charset string and every iconv cache entry. */ static void camel_charset_map_shutdown (void) { struct _iconv_cache *ic, *in; g_hash_table_foreach (iconv_charsets, shutdown_foreach, NULL); g_hash_table_destroy (iconv_charsets); g_free (locale_charset); /* the walk stops at the node whose next pointer is NULL - presumably the list's tail sentinel; confirm against the EDList implementation */ ic = (struct _iconv_cache *)iconv_cache_list.head; in = (struct _iconv_cache *)ic->ln.next; while (in) { flush_iconv_entry(ic); ic = in; in = (struct _iconv_cache *)in->ln.next; } } /* One-time module setup: fills iconv_charsets from known_iconv_charsets, initialises the iconv cache list, derives locale_charset from the current locale name and registers camel_charset_map_shutdown() to run at exit. Calls made after the table exists return immediately. */ void camel_charset_map_init (void) { char *locale; int i; if (iconv_charsets) return; /* case-insensitive keys */ iconv_charsets = g_hash_table_new (g_strcase_hash, g_strcase_equal); for (i = 0; known_iconv_charsets[i].charset != NULL; i++) { g_hash_table_insert (iconv_charsets, g_strdup (known_iconv_charsets[i].charset), g_strdup (known_iconv_charsets[i].iconv_name)); } e_dlist_init(&iconv_cache_list); /* NOTE(review): this queries LC_ALL although the codeset belongs to LC_CTYPE; confirm that the LC_ALL string always has the single-locale form parsed below */ locale = setlocale (LC_ALL, NULL); if (!locale || !strcmp (locale, "C") || !strcmp (locale, "POSIX")) { /* The locale "C" or "POSIX" is a portable locale; its * LC_CTYPE part corresponds to the 7-bit ASCII character * set. */ locale_charset = NULL; } else { /* A locale name is typically of the form language[_terri- * tory][.codeset][@modifier], where language is an ISO 639 * language code, territory is an ISO 3166 country code, and * codeset is a character set or encoding identifier like * ISO-8859-1 or UTF-8. */ char *p; int len; p = strchr (locale, '@'); len = p ? 
(p - locale) : strlen (locale); /* len now excludes any "@modifier" suffix. NOTE(review): the codeset that follows the '.' is len - (p - locale) - 1 bytes long, so the length passed to g_strndup below looks two too large and would copy the start of a modifier - confirm and fix. */ if ((p = strchr (locale, '.'))) { locale_charset = g_strndup (p + 1, len - (p - locale) + 1); g_strdown (locale_charset); } } g_atexit (camel_charset_map_shutdown); } /* Resets c to its starting state: all mask bits set, level 0. */ void camel_charset_init (CamelCharset *c) { c->mask = ~0; c->level = 0; } /* Scans the len bytes at in, decoding them with g_utf8_get_char, and folds the result into c: c->mask is ANDed with charset_mask() of every character up to 0xffff (and cleared by any character above that), and c->level is raised to 1 when a character in 128..255 is seen and to 2 for anything from 256 up. May be called repeatedly on successive pieces of text. */ void camel_charset_step (CamelCharset *c, const char *in, int len) { register unsigned int mask; register int level; const char *inptr = in, *inend = in+len; mask = c->mask; level = c->level; /* check what charset a given string will fit in */ while (inptr < inend) { /* NOTE(review): this local c shadows the CamelCharset parameter c for the duration of the loop body */ gunichar c; const char *newinptr; newinptr = g_utf8_next_char(inptr); c = g_utf8_get_char(inptr); /* on an invalid character advance by a single byte and try again */ if (newinptr == NULL || !g_unichar_validate (c)) { inptr++; continue; } inptr = newinptr; if (c<=0xffff) { mask &= charset_mask(c); if (c>=128 && c<256) level = MAX(level, 1); else if (c>=256) level = MAX(level, 2); } else { mask = 0; level = MAX(level, 2); } } c->mask = mask; c->level = level; } /* gets the best charset from the mask of chars in it */ /* NOTE(review): text appears to be missing between "i=0;i" and "level == 1" below, so the rest of camel_charset_best_mask() and the opening of the function that follows it are not visible in this copy; restore them from the upstream source. The visible remainder returns "ISO-8859-1" for level 1, the result of camel_charset_best_mask() for level 2 and NULL otherwise. */ static const char * camel_charset_best_mask(unsigned int mask) { int i; for (i=0;ilevel == 1) return "ISO-8859-1"; else if (charset->level == 2) return camel_charset_best_mask (charset->mask); else return NULL; } /* finds the minimum charset for the len bytes at in; NULL means US-ASCII */ const char * camel_charset_best (const char *in, int len) { CamelCharset charset; camel_charset_init (&charset); camel_charset_step (&charset, in, len); return camel_charset_best_name (&charset); } /* Returns the charset name stored by camel_charset_map_init(), which may be NULL; the string is freed by this module at shutdown, so callers must not free it. */ const char * camel_charset_locale_name (void) { return locale_charset; } /* Maps a charset name to the name to hand to iconv. Returns NULL when name is NULL. A name not yet in iconv_charsets is normalised, inserted into the table and returned; the returned string belongs to the table and must not be freed by the caller. NOTE(review): iconv_charsets is used without checking that camel_charset_map_init() has been called. */ const char * camel_charset_to_iconv (const char *name) { const char *charset; if (name == NULL) return NULL; ICONV_CHARSETS_LOCK (); charset = g_hash_table_lookup (iconv_charsets, name); if (!charset) { /* Attempt to friendlyify the charset */ char *new_charset, *p; int len; if (!g_strncasecmp (name, "iso", 3) && name[3] != '-' && name[3] != '_') { /* Hack to convert charsets like ISO8859-1 to iconv-friendly ISO-8859-1 */ len = strlen (name); 
new_charset = g_malloc (len + 2); /* len + 2: the original characters, the inserted '-' and the terminator */ memcpy (new_charset, name, 3); new_charset[3] = '-'; memcpy (new_charset + 4, name + 3, len - 3); new_charset[len + 1] = '\0'; } else if (!g_strncasecmp (name, "windows-", 8)) { /* Convert charsets like windows-1251 and windows-cp1251 to iconv-friendly cp1251 */ new_charset = (char *) name + 8; if (!g_strncasecmp (new_charset, "cp", 2)) new_charset += 2; /* only rewrite when everything after the prefix is digits; otherwise keep the name unchanged. NOTE(review): the ctype functions expect an unsigned char value (or EOF); casting a negative char to unsigned int does not give one - consider (unsigned char). */ for (p = new_charset; *p && isdigit ((unsigned) *p); p++); if (*p == '\0') new_charset = g_strdup_printf ("cp%s", new_charset); else new_charset = g_strdup (name); } else { /* *shrug* - add it to the hash table just the way it is? */ new_charset = g_strdup (name); } g_hash_table_insert (iconv_charsets, g_strdup (name), new_charset); charset = new_charset; } ICONV_CHARSETS_UNLOCK (); return charset; } /* Returns an iconv descriptor that converts from charset ofrom to charset oto, handing back an idle cached descriptor for the same pair when one exists and opening a new one otherwise. Returns (iconv_t)-1 when the conversion cannot be opened; that failure is remembered by the cache entry. Descriptors should be given back with camel_charset_iconv_close(). */ iconv_t camel_charset_iconv_open(const char *oto, const char *ofrom) { const char *to, *from; char *tofrom; struct _iconv_cache *ic, *icnew = NULL; struct _iconv_cache_node *node; iconv_t ip; to = camel_charset_to_iconv(oto); from = camel_charset_to_iconv(ofrom); /* the cache key is the two iconv names concatenated. NOTE(review): camel_charset_to_iconv() returns NULL for a NULL argument, in which case strlen below is handed NULL - confirm that callers never pass NULL. */ tofrom = alloca(strlen(to) +strlen(from) + 1); sprintf(tofrom, "%s%s", to, from); ICONV_CHARSETS_LOCK(); /* look for an existing entry; if nothing matches, the loop ends on the node whose next pointer is NULL */ ic = (struct _iconv_cache *)iconv_cache_list.head; while (ic->ln.next) { if (!strcasecmp(ic->conv, tofrom)) break; ic = (struct _iconv_cache *)ic->ln.next; } if (ic->ln.next == NULL) { int extra = e_dlist_length(&iconv_cache_list) - CAMEL_ICONV_CACHE_SIZE; struct _iconv_cache *old = (struct _iconv_cache *)iconv_cache_list.head, *next = (struct _iconv_cache *)old->ln.next; /* flush any 'old' entries out, if we can */ while (extra>0 && next) { if (e_dlist_empty(&old->inuse)) { e_dlist_remove(&old->ln); flush_iconv_entry(old); extra--; } old = next; next = (struct _iconv_cache *)old->ln.next; } /* not cached yet: create an empty entry for this pair */ icnew = ic = g_malloc(sizeof(*ic)); e_dlist_init(&ic->inuse); e_dlist_init(&ic->free); ic->conv = g_strdup(tofrom); } else { /* found: unlink it so that it can be re-added at the tail below */ e_dlist_remove(&ic->ln); } node = (struct _iconv_cache_node *)e_dlist_remhead(&ic->free); if (node) { 
cd(printf("Returning cached success of: %s to %s\n", from, to)); e_dlist_addhead(&ic->inuse, &node->ln); ip = node->ip; } else { /* a pre-existing entry with nothing free and nothing in use records an earlier failed open */ if (e_dlist_empty(&ic->inuse) && icnew == NULL) { cd(printf("returning cached failure of conversion: %s to %s\n", from, to)); ip = (iconv_t)-1; } else { ip = iconv_open(to, from); if (ip != (iconv_t)-1) { cd(printf("Creating cached opening of: %s to %s = %p\n", from, to, ip)); node = g_malloc(sizeof(*node)); node->ip = ip; e_dlist_addhead(&ic->inuse, &node->ln); } } } /* entries are (re)added at the tail on every use, while the flush loop above starts from the head */ e_dlist_addtail(&iconv_cache_list, &ic->ln); ICONV_CHARSETS_UNLOCK(); return ip; } /* Gives a descriptor obtained from camel_charset_iconv_open() back to the cache by moving its node from the owning entry's inuse list to its free list; the descriptor itself stays open. (iconv_t)-1 is ignored, and a descriptor that is not found in any entry only produces a warning. */ void camel_charset_iconv_close(iconv_t ip) { struct _iconv_cache *ic; struct _iconv_cache_node *node; if (ip == (iconv_t)-1) return; ICONV_CHARSETS_LOCK(); /* scan the cache entries starting from the tail end of the list */ ic = (struct _iconv_cache *)iconv_cache_list.tailpred; while (ic->ln.prev) { cd(printf("closing iconv %p, checking against name '%s'\n", ip, ic->conv)); node = (struct _iconv_cache_node *)ic->inuse.head; while (node->ln.next) { cd(printf("closing iconv %p, checking against node '%p'\n", ip, node->ip)); if (node->ip == ip) { e_dlist_remove(&node->ln); e_dlist_addhead(&ic->free, &node->ln); ICONV_CHARSETS_UNLOCK(); return; } node = (struct _iconv_cache_node *)node->ln.next; } ic = (struct _iconv_cache *)ic->ln.prev; } ICONV_CHARSETS_UNLOCK(); g_warning("Trying to close iconv i dont know about: %p", ip); } #endif /* !BUILD_MAP */