aboutsummaryrefslogblamecommitdiffstats
path: root/camel/camel-charset-map.c
blob: f33c8082dd4a456398adbe8f1f1c9fd3a7da8137 (plain) (tree)

























                                                                    
                           

                           


                             

                       



                                 









                                                   
                                                







                                                 
                                                 

                                                 


                                                    

                                        



                                                            























                                                                    
               












































































































                                                                                                                 
                                 
                   
                   

                 
                                        
 

                     

 

                                                            
 

                                   

                                                


                         

                                                           
                           
                                     
                                                   

                                                                  


                                 
 

                                 
                                                










                                                      



















                                                                          
                                    

                                                              

                            











                                                                   

 


                                
                                      





























                                                                             


                       

#include <stdio.h>

/*
  if you want to build the charset map, add the root directory of
  libunicode to the include path and define BUILD_MAP,
  then run it as 
    ./a.out > camel-charset-map-private.h

  The generated tables work like this:

   An indirect array for each page of unicode character
   Each array element has an indirect pointer to one of the bytes of
   the generated bitmask.
*/

#ifdef BUILD_MAP
#include "iso/iso8859-2.h"
#include "iso/iso8859-3.h"
#include "iso/iso8859-4.h"
#include "iso/iso8859-5.h"
#include "iso/iso8859-6.h"
#include "iso/iso8859-7.h"
#include "iso/iso8859-8.h"
#include "iso/iso8859-9.h"
#include "iso/iso8859-10.h"
#include "iso/iso8859-13.h"
#include "iso/iso8859-14.h"
#include "iso/iso8859-15.h"
#include "iso/windows-1250.h"
#include "iso/windows-1252.h"
#include "iso/windows-1257.h"
#include "iso/koi8-r.h"
#include "iso/koi8-u.h"
#include "iso/tis620.2533-1.h"
#include "iso/armscii-8.h"
#include "iso/georgian-academy.h"
#include "iso/georgian-ps.h"
#include "msft/cp932.h"
#include "jis/shiftjis.h"

/*
 * Source tables fed to the map generator, one entry per charset.
 *
 * The order matters: main() assigns mask bits in array order, starting
 * from bit 0, and stores the assigned bit back into each entry.
 * The list is terminated by an entry whose table pointer is NULL.
 *
 * NOTE(review): type-1 entries are handed to add_bigmap(), which treats
 * the pointer as an array of 256 page pointers (unsigned short **);
 * confirm the declarations in the included headers match that use.
 */
static struct {
    unsigned short *table;  /* conversion table from the included headers */
    const char *name;       /* charset name emitted into camel_charinfo[] */
    int type;               /* type of table: 0 = flat table for the upper
                               128 code points, 1 = sparse paged table */
    unsigned int bit;       /* assigned bit (filled in by main()) */
} tables[] = {
    { iso8859_2_table, "iso-8859-2", 0, 0 },
    { iso8859_3_table, "iso-8859-3", 0, 0 },
    { iso8859_4_table, "iso-8859-4", 0, 0 },
    { iso8859_5_table, "iso-8859-5", 0, 0 },
/* apparently -6 has special digits? */
    { iso8859_6_table, "iso-8859-6", 0, 0 },
    { iso8859_7_table, "iso-8859-7", 0, 0 },
    { iso8859_8_table, "iso-8859-8", 0, 0 },
    { iso8859_9_table, "iso-8859-9", 0, 0 },
    { iso8859_10_table, "iso-8859-10", 0, 0 },
    { iso8859_13_table, "iso-8859-13", 0, 0 },
    { iso8859_14_table, "iso-8859-14", 0, 0 },
    { iso8859_15_table, "iso-8859-15", 0, 0 },
    { windows_1250_table, "windows-1250", 0, 0 },
    { windows_1252_table, "windows-1252", 0, 0 },
    { windows_1257_table, "windows-1257", 0, 0 },
    { koi8_r_table, "koi8-r", 0, 0 },
    { koi8_u_table, "koi8-u", 0, 0 },
    { tis_620_table, "tis620.2533-1", 0, 0 },
    { armscii_8_table, "armscii-8", 0, 0 },
    { georgian_academy_table, "georgian-academy", 0, 0 },
    { georgian_ps_table, "georgian-ps", 0, 0 },
    { cp932_table, "CP932", 1, 0 },
    { sjis_table, "Shift-JIS", 1, 0 },
    /* terminator: loops stop at the first NULL table pointer */
    { NULL, NULL, 0, 0 }
};

/*
 * One charset bitmask per 16-bit code point: bit N is set when the
 * charset that was assigned bit N can represent that code point.
 */
unsigned int encoding_map[256 * 256];

/*
 * Merge a sparse (paged) conversion table into encoding_map.
 *
 * table: array of 256 page pointers; a NULL page is skipped.  Each
 *        page holds 256 code points, where 0 means "no mapping".
 * bit:   mask bit to OR into encoding_map for every mapped code point.
 */
static void
add_bigmap(unsigned short **table, int bit)
{
    int page;

    for (page = 0; page < 256; page++) {
        const unsigned short *entries = table[page];
        int slot;

        if (entries == NULL)
            continue;

        for (slot = 0; slot < 256; slot++) {
            unsigned short code = entries[slot];

            if (code != 0)
                encoding_map[code] |= bit;
        }
    }
}

/*
 * Returns non-zero when at least one code point in the 256-entry page
 * `page` of encoding_map has a bit set within byte `byte` (0 = least
 * significant) of its charset mask.
 */
static int
page_needs_byte(int page, int byte)
{
    const unsigned int byte_mask = 0xffu << (byte * 8);
    int j;

    for (j = 0; j < 256; j++) {
        if ((encoding_map[page * 256 + j] & byte_mask) != 0)
            return 1;
    }
    return 0;
}

/*
 * Generator entry point (only built with BUILD_MAP defined).
 *
 * Fills encoding_map from every entry in tables[], assigning one mask
 * bit per charset, then writes to stdout:
 *   - one 256-byte array per (page, mask byte) pair that is not all zero,
 *   - camel_charmap[256], pointing at those arrays (0 where omitted),
 *   - camel_charinfo[], the charset names with their assigned bits,
 *   - the charset_mask(x) lookup macro.
 *
 * Returns 0.
 */
int main(void)
{
    int i, j, k;
    unsigned short *tab;
    unsigned int bit = 0x01;
    int bytes;

#if 0
    /* iso-latin-1 (not needed-detected in code) */
    for (i=0;i<256;i++) {
        encoding_map[i] |= bit;
    }
    bit <<= 1;
#endif

    /* number of mask bytes needed: one bit per charset, and
       don't count the terminator entry */
    bytes = (int) ((sizeof(tables)/sizeof(tables[0]) - 1 + 7) / 8);

    /* the other latin charsets */
    for (j = 0; tables[j].table; j++) {
        switch (tables[j].type) {
        case 0:     /* table from 128-256 */
            tab = tables[j].table;
            for (i = 0; i < 128; i++) {
                /* 0-127 is the common */
                encoding_map[i] |= bit;
                encoding_map[tab[i]] |= bit;
            }
            break;
        case 1:     /* sparse table */
            /* the entry stores the page array behind a flat pointer
               type, so convert it back explicitly */
            add_bigmap((unsigned short **) tables[j].table, bit);
            break;
        default:
            break;
        }
        tables[j].bit = bit;
        bit <<= 1;
    }

    printf("/* This file is automatically generated: DO NOT EDIT */\n\n");

    /* dump only the (page, byte) blocks that contain at least one set bit */
    for (i = 0; i < 256; i++) {
        for (k = 0; k < bytes; k++) {
            if (!page_needs_byte(i, k))
                continue;

            printf("static unsigned char m%02x%x[256] = {\n\t",
                   (unsigned int) i, (unsigned int) k);
            for (j = 0; j < 256; j++) {
                printf("0x%02x, ", (encoding_map[i*256 + j] >> (k*8)) & 0xff);
                if (((j+1)&7) == 0 && j < 255)
                    printf("\n\t");
            }
            printf("\n};\n\n");
        }
    }

    /* per-page index: a pointer to each dumped block, 0 for omitted ones */
    printf("struct {\n");
    for (k = 0; k < bytes; k++) {
        printf("\tunsigned char *bits%d;\n", k);
    }
    printf("} camel_charmap[256] = {\n\t");
    for (i = 0; i < 256; i++) {
        printf("{ ");
        for (k = 0; k < bytes; k++) {
            if (page_needs_byte(i, k))
                printf("m%02x%x, ", (unsigned int) i, (unsigned int) k);
            else
                printf("0, ");
        }
        printf("}, ");
        if (((i+1)&7) == 0 && i < 255)
            printf("\n\t");
    }
    printf("\n};\n\n");

    printf("struct {\n\tconst char *name;\n\tunsigned int bit;\n} camel_charinfo[] = {\n");
    for (j = 0; tables[j].table; j++) {
        printf("\t{ \"%s\", 0x%04x },\n", tables[j].name, tables[j].bit);
    }
    printf("};\n\n");

    /* charset_mask(x) reassembles the mask from its per-byte tables */
    printf("#define charset_mask(x) \\\n");
    for (k = 0; k < bytes; k++) {
        if (k != 0)
            printf("\t| ");
        else
            printf("\t");
        printf("(camel_charmap[(x)>>8].bits%d?camel_charmap[(x)>>8].bits%d[(x)&0xff]<<%d:0)", k, k, k*8);
        if (k < bytes-1)
            printf("\t\\\n");
    }
    printf("\n\n");

    return 0;
}

#else

#include "camel-charset-map.h"
#include "camel-charset-map-private.h"
#include <gal/unicode/gunicode.h>
#include <locale.h>
#include <string.h>
#include <glib.h>

/*
 * Reset a CamelCharset accumulator before feeding it text: the level
 * starts at 0 and the mask at ~0, so that camel_charset_step() can
 * narrow the mask by AND-ing and raise the level as it scans.
 */
void camel_charset_init(CamelCharset *c)
{
    c->level = 0;
    c->mask = ~0;
}

/*
 * Feed a chunk of UTF-8 text into the charset accumulator.
 *
 * c:   accumulator previously set up with camel_charset_init(); its
 *      mask and level are read on entry and written back on return.
 * in:  start of the text.
 * len: number of bytes of `in` to examine.
 *
 * For every character up to 0xffff the mask is AND-ed with
 * charset_mask() of that character; a character above 0xffff clears
 * the mask.  The level is raised to 1 for characters in 128..255 and
 * to 2 for characters >= 256.  Bytes that do not start a valid
 * character are skipped one at a time and affect neither value.
 */
void
camel_charset_step(CamelCharset *c, const char *in, int len)
{
    unsigned int mask;
    int level;
    const char *inptr = in, *inend = in+len;

    mask = c->mask;
    level = c->level;

    /* check what charset a given string will fit in */
    while (inptr < inend) {
        gunichar ch;
        const char *next;

        next = g_utf8_next_char(inptr);
        /* A lead byte that claims more bytes than remain in the buffer
         * is a truncated sequence: skip it *before* decoding so the
         * decoder never reads past in+len. */
        if (next == NULL || next > inend) {
            inptr++;
            continue;
        }

        ch = g_utf8_get_char(inptr);
        if (!g_unichar_validate (ch)) {
            inptr++;
            continue;
        }

        inptr = next;
        if (ch <= 0xffff) {
            mask &= charset_mask(ch);

            if (ch >= 128 && ch < 256)
                level = MAX(level, 1);
            else if (ch >= 256)
                level = MAX(level, 2);
        } else {
            /* outside the range covered by the generated tables */
            mask = 0;
            level = MAX(level, 2);
        }
    }

    c->mask = mask;
    c->level = level;
}

/*
 * Map a charset bitmask to a charset name: returns the name of the
 * first camel_charinfo[] entry whose bit is present in `mask`, or
 * "UTF-8" when no entry matches.
 */
static const char *
camel_charset_best_mask(unsigned int mask)
{
    const size_t n_charsets = sizeof(camel_charinfo) / sizeof(camel_charinfo[0]);
    size_t idx = 0;

    while (idx < n_charsets) {
        if ((camel_charinfo[idx].bit & mask) != 0)
            return camel_charinfo[idx].name;
        idx++;
    }

    return "UTF-8";
}

/*
 * Name the charset selected by an accumulator:
 *   level 1 -> "ISO-8859-1"
 *   level 2 -> whatever camel_charset_best_mask() picks for the mask
 *   any other level -> NULL
 */
const char *camel_charset_best_name(CamelCharset *charset)
{
    switch (charset->level) {
    case 1:
        return "ISO-8859-1";
    case 2:
        return camel_charset_best_mask(charset->mask);
    default:
        return NULL;
    }
}

/*
 * One-shot helper: runs init, step and best_name over the `len` bytes
 * at `in` and returns the resulting charset name.
 * A NULL result means US-ASCII.
 */
const char *
camel_charset_best(const char *in, int len)
{
    CamelCharset state;

    camel_charset_init(&state);
    camel_charset_step(&state, in, len);

    return camel_charset_best_name(&state);
}

/*
 * Extract the codeset part of the current locale name.
 *
 * Returns a lower-cased copy of the codeset allocated with
 * g_strndup(), or NULL when the locale is unset, is "C" or "POSIX",
 * or carries no codeset.
 */
char *
camel_charset_locale_name (void)
{
    const char *locale, *codeset, *end;
    char *charset;

    locale = setlocale (LC_ALL, NULL);

    if (!locale || !strcmp (locale, "C") || !strcmp (locale, "POSIX")) {
        /* The locale "C"  or  "POSIX"  is  a  portable  locale;  its
         * LC_CTYPE  part  corresponds  to  the 7-bit ASCII character
         * set.
         */
        return NULL;
    }

    /* A locale name is typically of  the  form  language[_terri-
     * tory][.codeset][@modifier],  where  language is an ISO 639
     * language code, territory is an ISO 3166 country code,  and
     * codeset  is  a  character  set or encoding identifier like
     * ISO-8859-1 or UTF-8.
     */

    /* the codeset ends at the modifier, or at the end of the string */
    end = strchr (locale, '@');
    if (!end)
        end = locale + strlen (locale);

    /* only a '.' in front of the modifier introduces a codeset */
    codeset = strchr (locale, '.');
    if (!codeset || codeset >= end)
        return NULL;
    codeset++;

    /* "xx_YY." or "xx_YY.@mod": nothing between '.' and the end */
    if (codeset == end)
        return NULL;

    /* copy exactly the bytes between '.' and the modifier */
    charset = g_strndup (codeset, end - codeset);
    g_strdown (charset);

    return charset;
}

#endif /* !BUILD_MAP */