#include <stdio.h> /* if you want to build the charset map, add the root directory of libunicode to the include path and define BUILD_MAP, then run it as ./a.out > camel-charset-map-private.h The tables genereated work like this: An indirect array for each page of unicode character Each array element has an indirect pointer to one of the bytes of the generated bitmask. */ #ifdef BUILD_MAP #include "iso/iso8859-2.h" #include "iso/iso8859-3.h" #include "iso/iso8859-4.h" #include "iso/iso8859-5.h" #include "iso/iso8859-6.h" #include "iso/iso8859-7.h" #include "iso/iso8859-8.h" #include "iso/iso8859-9.h" #include "iso/iso8859-10.h" #include "iso/iso8859-13.h" #include "iso/iso8859-14.h" #include "iso/iso8859-15.h" #include "iso/windows-1250.h" #include "iso/windows-1252.h" #include "iso/windows-1257.h" #include "iso/koi8-r.h" #include "iso/koi8-u.h" #include "iso/tis620.2533-1.h" #include "iso/armscii-8.h" #include "iso/georgian-academy.h" #include "iso/georgian-ps.h" #include "msft/cp932.h" #include "jis/shiftjis.h" static struct { unsigned short *table; char *name; int type; /* type of table */ unsigned int bit; /* assigned bit */ } tables[] = { { iso8859_2_table, "iso-8859-2", 0, 0} , { iso8859_3_table, "iso-8859-3", 0, 0} , { iso8859_4_table, "iso-8859-4", 0, 0}, { iso8859_5_table, "iso-8859-5", 0, 0}, /* apparently -6 has special digits? */ { iso8859_6_table, "iso-8859-6", 0, 0}, { iso8859_7_table, "iso-8859-7", 0, 0}, { iso8859_8_table, "iso-8859-8", 0, 0}, { iso8859_9_table, "iso-8859-9", 0, 0}, { iso8859_10_table, "iso-8859-10", 0, 0}, { iso8859_13_table, "iso-8859-13", 0, 0}, { iso8859_14_table, "iso-8859-14", 0, 0}, { iso8859_15_table, "iso-8859-15", 0, 0}, { windows_1250_table, "windows-1250", 0, 0}, { windows_1252_table, "windows-1252", 0, 0}, { windows_1257_table, "windows-1257", 0, 0}, { koi8_r_table, "koi8-r", 0, 0}, { koi8_u_table, "koi8-u", 0, 0}, { tis_620_table, "tis620.2533-1", 0, 0}, { armscii_8_table, "armscii-8", 0, 0}, { georgian_academy_table, "georgian-academy", 0, 0}, { georgian_ps_table, "georgian-ps", 0, 0}, { cp932_table, "CP932", 1, 0}, { sjis_table, "Shift-JIS", 1, 0}, { 0, 0} }; unsigned int encoding_map[256 * 256]; static void add_bigmap(unsigned short **table, int bit) { int i; int j; for (i=0;i<256;i++) { unsigned short *tab = table[i]; if (tab) { for (j=0;j<256;j++) { if (tab[j]) encoding_map[tab[j]] |= bit; } } } } void main(void) { int i, j; unsigned short *tab; int max, min; int bit = 0x01; int k; int bytes; #if 0 /* iso-latin-1 (not needed-detected in code) */ for (i=0;i<256;i++) { encoding_map[i] |= bit; } bit <<= 1; #endif /* dont count the terminator */ bytes = ((sizeof(tables)/sizeof(tables[0]))+7-1)/8; /* the other latin charsets */ for (j=0;tables[j].table;j++) { switch (tables[j].type) { case 0: /* table from 128-256 */ tab = tables[j].table; for (i=0;i<128;i++) { /* 0-127 is the common */ encoding_map[i] |= bit; encoding_map[tab[i]] |= bit; } break; case 1: /* sparse table */ add_bigmap(tables[j].table, bit); break; } tables[j].bit = bit; bit <<= 1; } printf("/* This file is automatically generated: DO NOT EDIT */\n\n"); for (i=0;i<256;i++) { /* first, do we need this block? */ for (k=0;k<bytes;k++) { for (j=0;j<256;j++) { if ((encoding_map[i*256 + j] & (0xff << (k*8))) != 0) break; } if (j < 256) { /* yes, dump it */ printf("static unsigned char m%02x%x[256] = {\n\t", i, k); for (j=0;j<256;j++) { printf("0x%02x, ", (encoding_map[i*256+j] >> (k*8)) & 0xff ); if (((j+1)&7) == 0 && j<255) printf("\n\t"); } printf("\n};\n\n"); } } } printf("struct {\n"); for (k=0;k<bytes;k++) { printf("\tunsigned char *bits%d;\n", k); } printf("} camel_charmap[256] = {\n\t"); for (i=0;i<256;i++) { /* first, do we need this block? */ printf("{ "); for (k=0;k<bytes;k++) { for (j=0;j<256;j++) { if ((encoding_map[i*256 + j] & (0xff << (k*8))) != 0) break; } if (j < 256) { printf("m%02x%x, ", i, k); } else { printf("0, "); } } printf("}, "); if (((i+1)&7) == 0 && i<255) printf("\n\t"); } printf("\n};\n\n"); printf("struct {\n\tconst char *name;\n\tunsigned int bit;\n} camel_charinfo[] = {\n"); for (j=0;tables[j].table;j++) { printf("\t{ \"%s\", 0x%04x },\n", tables[j].name, tables[j].bit); } printf("};\n\n"); printf("#define charset_mask(x) \\\n"); for (k=0;k<bytes;k++) { if (k!=0) printf("\t| "); else printf("\t"); printf("(camel_charmap[(x)>>8].bits%d?camel_charmap[(x)>>8].bits%d[(x)&0xff]<<%d:0)", k, k, k*8); if (k<bytes-1) printf("\t\\\n"); } printf("\n\n"); } #else #include "camel-charset-map.h" #include "camel-charset-map-private.h" #include <unicode.h> #include <glib.h> void camel_charset_init(CamelCharset *c) { c->mask = ~0; c->level = 0; } void camel_charset_step(CamelCharset *c, const char *in, int len) { register unsigned int mask; register int level; const char *inptr = in, *inend = in+len; mask = c->mask; level = c->level; /* check what charset a given string will fit in */ while (inptr < inend) { unicode_char_t c; const char *newinptr; newinptr = unicode_get_utf8(inptr, &c); if (newinptr == NULL) { inptr++; continue; } inptr = newinptr; if (c<=0xffff) { mask |= charset_mask(c); if (c>=128 && c<256) level = MAX(level, 1); else if (c>=256) level = MAX(level, 2); } else { mask = 0; level = MAX(level, 2); } } c->mask = mask; c->level = level; } /* gets the best charset from the mask of chars in it */ static const char * camel_charset_best_mask(unsigned int mask) { int i; for (i=0;i<sizeof(camel_charinfo)/sizeof(camel_charinfo[0]);i++) { if (camel_charinfo[i].bit & mask) return camel_charinfo[i].name; } return "UTF-8"; } const char *camel_charset_best_name(CamelCharset *charset) { if (charset->level == 1) return "ISO-8859-1"; else if (charset->level == 2) return camel_charset_best_mask(charset->mask); else return NULL; } /* finds the minimum charset for this string NULL means US-ASCII */ const char * camel_charset_best(const char *in, int len) { CamelCharset charset; camel_charset_init(&charset); camel_charset_step(&charset, in, len); return camel_charset_best_name(&charset); } #endif /* !BUILD_MAP */