aboutsummaryrefslogtreecommitdiffstats
path: root/camel/camel-charset-map.c
diff options
context:
space:
mode:
Diffstat (limited to 'camel/camel-charset-map.c')
-rw-r--r--camel/camel-charset-map.c257
1 files changed, 257 insertions, 0 deletions
diff --git a/camel/camel-charset-map.c b/camel/camel-charset-map.c
new file mode 100644
index 0000000000..b6ad0a5f37
--- /dev/null
+++ b/camel/camel-charset-map.c
@@ -0,0 +1,257 @@
+
+#include <stdio.h>
+
+/*
+ if you want to build the charset map, add the root directory of
+ libunicode to the include path and define BUILD_MAP,
+ then run it as
+ ./a.out > camel-charset-map-private.h
+
+ The generated tables work like this:
+
+ An indirect array for each page of unicode character
+ Each array element has an indirect pointer to one of the bytes of
+ the generated bitmask.
+*/
+
+#ifdef BUILD_MAP
+#include "iso/iso8859-2.h"
+#include "iso/iso8859-3.h"
+#include "iso/iso8859-4.h"
+#include "iso/iso8859-5.h"
+#include "iso/iso8859-6.h"
+#include "iso/iso8859-7.h"
+#include "iso/iso8859-8.h"
+#include "iso/iso8859-9.h"
+#include "iso/iso8859-10.h"
+#include "iso/iso8859-14.h"
+#include "iso/iso8859-15.h"
+#include "iso/koi8-r.h"
+#include "iso/koi8-u.h"
+#include "msft/cp932.h"
+#include "jis/shiftjis.h"
+
+/*
+ * Source tables folded into the charset map, closed by an all-zero
+ * terminator entry (loops over this array stop at the first NULL table).
+ *
+ * NOTE(review): type-1 entries are handed to add_bigmap(), which takes an
+ * unsigned short **; confirm the declared types of cp932_table/sjis_table
+ * in the libunicode headers against the `table' field below.
+ */
+static struct {
+	unsigned short *table;	/* mapping table taken from the included headers */
+	const char *name;	/* charset name printed into camel_charinfo */
+	int type;		/* 0: flat table read as 128 entries, 1: sparse table walked by add_bigmap() */
+	unsigned int bit;	/* mask bit for this charset, filled in by main() */
+} tables[] = {
+	{ iso8859_2_table, "iso-8859-2", 0, 0 },
+	{ iso8859_3_table, "iso-8859-3", 0, 0 },
+	{ iso8859_4_table, "iso-8859-4", 0, 0 },
+	{ iso8859_5_table, "iso-8859-5", 0, 0 },
+/* apparently -6 has special digits? */
+	{ iso8859_6_table, "iso-8859-6", 0, 0 },
+	{ iso8859_7_table, "iso-8859-7", 0, 0 },
+	{ iso8859_8_table, "iso-8859-8", 0, 0 },
+	{ iso8859_9_table, "iso-8859-9", 0, 0 },
+	{ iso8859_10_table, "iso-8859-10", 0, 0 },
+	{ iso8859_14_table, "iso-8859-14", 0, 0 },
+	{ iso8859_15_table, "iso-8859-15", 0, 0 },
+	{ koi8_r_table, "koi8-r", 0, 0 },
+	{ koi8_u_table, "koi8-u", 0, 0 },
+	{ cp932_table, "CP932", 1, 0 },
+	{ sjis_table, "Shift-JIS", 1, 0 },
+	{ NULL, NULL, 0, 0 }	/* terminator */
+};
+
+/*
+ * One bitmask per 16-bit code point (256 pages of 256 entries).  A set bit
+ * means the table that was assigned that bit contains the code point.
+ */
+unsigned int encoding_map[256 * 256];
+
+/*
+ * Fold a sparse two-level table into encoding_map.
+ *
+ * `table' holds 256 row pointers; a NULL row is skipped.  Every non-zero
+ * value found in a present row is used as an index into encoding_map and
+ * gets `bit' OR-ed into its mask.
+ */
+static void
+add_bigmap(unsigned short **table, int bit)
+{
+	int row;
+
+	for (row = 0; row < 256; row++) {
+		unsigned short *cells = table[row];
+		int col;
+
+		if (!cells)
+			continue;
+
+		for (col = 0; col < 256; col++) {
+			unsigned short code = cells[col];
+
+			if (!code)
+				continue;
+			encoding_map[code] |= bit;
+		}
+	}
+}
+
+/*
+ * Return non-zero when at least one code point of the 256-entry page
+ * `page' has a bit set inside byte number `byteno' of its charset mask.
+ */
+static int
+page_has_bits(int page, int byteno)
+{
+	/* unsigned constant so the shift stays defined for the top byte */
+	unsigned int select = 0xffu << (byteno * 8);
+	int j;
+
+	for (j = 0; j < 256; j++) {
+		if ((encoding_map[page * 256 + j] & select) != 0)
+			return 1;
+	}
+	return 0;
+}
+
+/*
+ * Generator entry point: builds encoding_map from tables[] and prints the
+ * resulting C tables plus the charset_mask() macro to stdout.
+ *
+ * Returns 0.
+ */
+int
+main(void)
+{
+	int i, j, k;
+	unsigned short *tab;
+	int bit = 0x01;
+	int bytes;
+
+	/*
+	 * iso-latin-1 is deliberately given no bit here; per the original
+	 * note it is not needed because it is detected in code.
+	 */
+
+	/* one mask bit per table, rounded up to whole bytes; the
+	   terminator entry of tables[] is not counted */
+	bytes = ((sizeof(tables)/sizeof(tables[0])) - 1 + 7) / 8;
+
+	/* assign each table its bit and mark the code points it contains */
+	for (j = 0; tables[j].table; j++) {
+		switch (tables[j].type) {
+		case 0: /* flat table, read as 128 entries (original note: "table from 128-256") */
+			tab = tables[j].table;
+			for (i = 0; i < 128; i++) {
+				/* 0-127 is the common part */
+				encoding_map[i] |= bit;
+				encoding_map[tab[i]] |= bit;
+			}
+			break;
+		case 1: /* sparse table */
+			/* the field is declared unsigned short *, add_bigmap wants the row-pointer view */
+			add_bigmap((unsigned short **) tables[j].table, bit);
+			break;
+		default:
+			break;
+		}
+		tables[j].bit = bit;
+		bit <<= 1;
+	}
+
+	printf("/* This file is automatically generated: DO NOT EDIT */\n\n");
+
+	/* emit one byte array per (page, mask byte) pair that has any bit set */
+	for (i = 0; i < 256; i++) {
+		for (k = 0; k < bytes; k++) {
+			if (!page_has_bits(i, k))
+				continue;
+
+			printf("static unsigned char m%02x%x[256] = {\n\t", i, k);
+			for (j = 0; j < 256; j++) {
+				printf("0x%02x, ", (encoding_map[i*256+j] >> (k*8)) & 0xff );
+				if (((j+1)&7) == 0 && j < 255)
+					printf("\n\t");
+			}
+			printf("\n};\n\n");
+		}
+	}
+
+	/* emit the per-page index: a pointer to each emitted array, 0 where none was emitted */
+	printf("struct {\n");
+	for (k = 0; k < bytes; k++) {
+		printf("\tunsigned char *bits%d;\n", k);
+	}
+	printf("} camel_charmap[256] = {\n\t");
+	for (i = 0; i < 256; i++) {
+		printf("{ ");
+		for (k = 0; k < bytes; k++) {
+			if (page_has_bits(i, k)) {
+				printf("m%02x%x, ", i, k);
+			} else {
+				printf("0, ");
+			}
+		}
+		printf("}, ");
+		if (((i+1)&7) == 0 && i < 255)
+			printf("\n\t");
+	}
+	printf("\n};\n\n");
+
+	/* emit the name/bit pair of every table */
+	printf("struct {\n\tconst char *name;\n\tunsigned int bit;\n} camel_charinfo[] = {\n");
+	for (j = 0; tables[j].table; j++) {
+		printf("\t{ \"%s\", 0x%04x },\n", tables[j].name, tables[j].bit);
+	}
+	printf("};\n\n");
+
+	/* emit the lookup macro that reassembles the mask from its bytes */
+	printf("#define charset_mask(x) \\\n");
+	for (k = 0; k < bytes; k++) {
+		if (k != 0)
+			printf("\t| ");
+		else
+			printf("\t");
+		printf("(camel_charmap[(x)>>8].bits%d?camel_charmap[(x)>>8].bits%d[(x)&0xff]<<%d:0)", k, k, k*8);
+		if (k < bytes-1)
+			printf("\t\\\n");
+	}
+	printf("\n\n");
+
+	return 0;
+}
+
+#else
+
+#include "camel-charset-map.h"
+#include "camel-charset-map-private.h"
+#include <unicode.h>
+#include <glib.h>
+
+/*
+ * Return the charset bitmask of code point `c' as computed by the
+ * charset_mask() macro.  Code points above 0xffff are not looked up and
+ * yield 0.
+ */
+unsigned int
+camel_charset_mask(unsigned int c)
+{
+	if (c <= 0xffff)
+		return charset_mask(c);
+
+	return 0;
+}
+
+/*
+ * Pick a charset name for a charset bitmask: the name of the first
+ * camel_charinfo entry whose bit is present in `mask', or "UTF-8" when no
+ * entry matches.
+ */
+const char *
+camel_charset_best_mask(unsigned int mask)
+{
+	const int count = (int) (sizeof(camel_charinfo)/sizeof(camel_charinfo[0]));
+	int idx = 0;
+
+	while (idx < count) {
+		if ((camel_charinfo[idx].bit & mask) != 0)
+			return camel_charinfo[idx].name;
+		idx++;
+	}
+
+	return "UTF-8";
+}
+
+/*
+ * Find the minimum charset for the `len' bytes at `in', which are decoded
+ * with unicode_get_utf8().
+ *
+ * Returns NULL when every decoded character is below 128 (meaning
+ * US-ASCII), "ISO-8859-1" when the highest character is below 256, and
+ * otherwise whatever camel_charset_best_mask() picks for the set of
+ * charsets that contain every character of the string.
+ */
+const char *
+camel_charset_best(const char *in, int len)
+{
+	/* start with every charset possible and narrow it per character */
+	unsigned int mask = ~0u;
+	int level = 0;
+	const char *inptr = in, *inend = in + len;
+
+	/* check what charset a given string will fit in */
+	while (inptr < inend) {
+		unicode_char_t c;
+		const char *newinptr;
+
+		newinptr = unicode_get_utf8(inptr, &c);
+		if (newinptr == NULL) {
+			/* nothing decoded at this position: step one byte and retry */
+			inptr++;
+			continue;
+		}
+		inptr = newinptr;
+
+		if (c <= 0xffff) {
+			/* keep only the charsets that also contain this character
+			   (the mask must be intersected, not accumulated) */
+			mask &= camel_charset_mask(c);
+
+			if (c >= 128 && c < 256)
+				level = MAX(level, 1);
+			else if (c >= 256)
+				level = MAX(level, 2);
+		} else {
+			/* outside the mapped range: no table charset can hold it */
+			mask = 0;
+			level = MAX(level, 2);
+		}
+	}
+
+	if (level == 0)
+		return NULL;
+	if (level == 1)
+		return "ISO-8859-1";
+
+	return camel_charset_best_mask(mask);
+}
+
+
+#endif /* !BUILD_MAP */
+