aboutsummaryrefslogblamecommitdiffstats
path: root/camel/camel-charset-map.c
blob: c0076585533976c9abea67d75798c6a5c7097c53 (plain) (tree)
1
2
3
4
5
6
7
8






                                                                            
                                                     



















                                                                      



                  


                                                                         

                                         

                                                  







                                                                    

                  

               
                   

                                                  


















                                                                          



                                     




                                
 
               

                 



                       




                             



                                                           















                                                                            
                         
                 






                                                        



















































                                                                                                     
                                      





















                                                                                                                 
                             
                                 
                   
                   
                 














                                                                           
                                   






























                                                                        
                                




                             
                     










                                                                                                


























                                                                                   

                                              
 

                                    
 

                     

 
    
                                                             
 

                                   

                                                


                         

                                                           
                           
                                     
                                                   

                                                                  


                                 
 

                                 
                                                










                                                      
















                                                                          
            
                                               

                                
                                    
                                     
                                                               

                            




                                                                   
                                            


                             


                                                  

 
            

                                
                              
 
 



















                                                                                          


                                                                                    

                 
                                                                                   



                                      

                                                                                              


                       

                       
/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8; -*- */

/* 
 * Authors:
 *   Michael Zucchi <notzed@ximian.com>
 *   Dan Winship <danw@ximian.com>
 *
 * Copyright 2000, 2001 Ximian, Inc. (www.ximian.com)
 *
 * This program is free software; you can redistribute it and/or 
 * modify it under the terms of the GNU General Public License as 
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
 * USA
 */

#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include <stdio.h>

/*
  If you want to build the charset map, compile this with something like:
    gcc -DBUILD_MAP camel-charset-map.c `glib-config --cflags`
  (plus any -I/-L/-l flags you need for iconv), then run it as
    ./a.out > camel-charset-map-private.h

  Note that the big-endian variant isn't tested...

  The generated tables work like this:

   There is an indirect array for each page of unicode characters.
   Each array element has an indirect pointer to one of the bytes of
   the generated bitmask.
*/

#ifdef BUILD_MAP
#include <errno.h>
#include <iconv.h>
#include <stdlib.h>
#include <string.h>

#include <glib.h>

/* Charsets the map generator probes.  main() walks this array until it
 * reaches the entry whose name is 0, converts the high half (bytes
 * 128-255) of each charset to UCS-4 and stores the bit it handed out to
 * that charset in the `bit' member.
 */
static struct {
    char *name;         /* charset name as passed to iconv_open() */
    unsigned int bit;   /* assigned bit */
} tables[] = {
    /* These are the 8bit character sets (other than iso-8859-1,
     * which is special-cased) which are supported by both other
     * mailers and the GNOME environment. Note that the order
     * they're listed in is the order they'll be tried in, so put
     * the more-popular ones first.
     */
    { "iso-8859-2", 0 },    /* Central/Eastern European */
    { "iso-8859-4", 0 },    /* Baltic */
    { "koi8-r", 0 },    /* Russian */
    { "windows-1251", 0 },  /* Russian */
    { "koi8-u", 0 },    /* Ukrainian */
    { "iso-8859-5", 0 },    /* Least-popular Russian encoding */
    { "iso-8859-7", 0 },    /* Greek */
    { "iso-8859-9", 0 },    /* Turkish */
    { "iso-8859-13", 0 },   /* Baltic again */
    { "iso-8859-15", 0 },   /* New-and-improved iso-8859-1, but most
                 * programs that support this support UTF8
                 */
    { 0, 0 }    /* terminator: name == 0 ends the list */
};

/* One bitmask per 16-bit code point, indexed as page * 256 + offset.
 * main() ORs in a charset's assigned bit for every code point that the
 * charset produced when converted to UCS-4.
 */
unsigned int encoding_map[256 * 256];

/* iconv name of the UCS-4 flavour selected by the host byte order; the
 * converted output is read back as an array of guint32.
 */
#if G_BYTE_ORDER == G_BIG_ENDIAN
#define UCS "UCS-4BE"
#else
#define UCS "UCS-4LE"
#endif

/*
 * Returns non-zero when at least one character of unicode page `page'
 * has a bit set in byte number `byte' of its encoding_map entry, i.e.
 * when a 256-entry table has to be emitted for that page/byte pair.
 */
static int
page_has_bits (int page, int byte)
{
    int j;

    for (j = 0; j < 256; j++) {
        if ((encoding_map[page * 256 + j] & (0xffU << (byte * 8))) != 0)
            return 1;
    }

    return 0;
}

/*
 * Charset map generator (only compiled with -DBUILD_MAP).
 *
 * For every charset in tables[] the bytes 128..255 are converted to
 * UCS-4 with iconv and the charset's bit is recorded in encoding_map
 * for each resulting code point.  The map is then written to stdout as
 * C source: one "mPPB" byte table per used page/byte pair, the
 * camel_charmap page index, the camel_charinfo name/bit list and the
 * charset_mask() lookup macro.
 *
 * Diagnostics go to stderr so that they cannot end up inside the
 * generated header when stdout is redirected.
 *
 * Returns 0 on success; exits with status 1 if iconv fails.
 */
int main(void)
{
    int i, j, k;
    int bytes;
    int converted;
    unsigned int bit = 0x01;
    iconv_t cd;
    char in[128];
    guint32 out[128];
    char *inptr, *outptr;
    size_t inlen, outlen;

    /* number of mask bytes needed; the -1 drops the terminator entry */
    bytes = ((sizeof(tables)/sizeof(tables[0]))+7-1)/8;

    /* the input is always the upper half of an 8bit charset */
    for (i = 0; i < 128; i++)
        in[i] = i + 128;

    for (j = 0; tables[j].name; j++) {
        cd = iconv_open (UCS, tables[j].name);
        if (cd == (iconv_t) -1) {
            fprintf (stderr, "iconv_open (\"%s\", \"%s\"): %s\n",
                 UCS, tables[j].name, strerror (errno));
            exit (1);
        }

        inptr = in;
        outptr = (char *)(out);
        inlen = sizeof (in);
        outlen = sizeof (out);
        while (iconv (cd, &inptr, &inlen, &outptr, &outlen) == (size_t) -1) {
            if (errno == EILSEQ) {
                /* byte is not defined in this charset: skip it */
                inptr++;
                inlen--;
            } else {
                fprintf (stderr, "iconv (%s): %s\n",
                     tables[j].name, strerror (errno));
                exit (1);
            }
        }
        iconv_close (cd);

        /* Every charset listed is an ASCII superset, so the whole
         * 7bit range is always representable, no matter how many of
         * the high bytes had to be skipped above.
         */
        for (i = 0; i < 128; i++)
            encoding_map[i] |= bit;

        /* each converted character occupies 4 bytes of `out' */
        converted = 128 - (int) (outlen / 4);
        for (i = 0; i < converted; i++) {
            /* the map only covers 16-bit code points */
            if (out[i] < sizeof (encoding_map) / sizeof (encoding_map[0]))
                encoding_map[out[i]] |= bit;
        }

        tables[j].bit = bit;
        bit <<= 1;
    }

    printf("/* This file is automatically generated: DO NOT EDIT */\n\n");

    /* one byte table for every page/byte pair that is in use */
    for (i=0;i<256;i++) {
        for (k=0;k<bytes;k++) {
            if (!page_has_bits (i, k))
                continue;

            printf("static unsigned char m%02x%x[256] = {\n\t", i, k);
            for (j=0;j<256;j++) {
                printf("0x%02x, ", (encoding_map[i*256+j] >> (k*8)) & 0xff );
                if (((j+1)&7) == 0 && j<255)
                    printf("\n\t");
            }
            printf("\n};\n\n");
        }
    }

    /* page index: pointer to the byte table, or 0 for unused pages */
    printf("struct {\n");
    for (k=0;k<bytes;k++) {
        printf("\tunsigned char *bits%d;\n", k);
    }
    printf("} camel_charmap[256] = {\n\t");
    for (i=0;i<256;i++) {
        printf("{ ");
        for (k=0;k<bytes;k++) {
            if (page_has_bits (i, k)) {
                printf("m%02x%x, ", i, k);
            } else {
                printf("0, ");
            }
        }
        printf("}, ");
        if (((i+1)&7) == 0 && i<255)
            printf("\n\t");
    }
    printf("\n};\n\n");

    /* charset names with their assigned bits, in priority order */
    printf("struct {\n\tconst char *name;\n\tunsigned int bit;\n} camel_charinfo[] = {\n");
    for (j=0;tables[j].name;j++) {
        printf("\t{ \"%s\", 0x%04x },\n", tables[j].name, tables[j].bit);
    }
    printf("};\n\n");

    /* lookup macro that reassembles the mask from its byte tables */
    printf("#define charset_mask(x) \\\n");
    for (k=0;k<bytes;k++) {
        if (k!=0)
            printf("\t| ");
        else
            printf("\t");
        printf("(camel_charmap[(x)>>8].bits%d?camel_charmap[(x)>>8].bits%d[(x)&0xff]<<%d:0)", k, k, k*8);
        if (k<bytes-1)
            printf("\t\\\n");
    }
    printf("\n\n");

    return 0;
}

#else

#include "camel-charset-map.h"
#include "camel-charset-map-private.h"
#include "hash-table-utils.h"
#include <gal/unicode/gunicode.h>
#include <locale.h>
#include <string.h>
#include <glib.h>
#ifdef ENABLE_THREADS
#include <pthread.h>
#endif


/* ICONV_CHARSETS_LOCK()/ICONV_CHARSETS_UNLOCK() bracket the lookup and
 * insert done on iconv_charsets in
 * camel_charset_get_iconv_friendly_name().  Without ENABLE_THREADS they
 * expand to nothing.
 */
#ifdef ENABLE_THREADS
static pthread_mutex_t iconv_charsets_lock = PTHREAD_MUTEX_INITIALIZER;
#define ICONV_CHARSETS_LOCK() pthread_mutex_lock (&iconv_charsets_lock)
#define ICONV_CHARSETS_UNLOCK() pthread_mutex_unlock (&iconv_charsets_lock)
#else
#define ICONV_CHARSETS_LOCK()
#define ICONV_CHARSETS_UNLOCK()
#endif /* ENABLE_THREADS */

/* charset name -> iconv-friendly name; keys and values are g_strdup'd
 * strings, created by camel_charset_map_init() */
static GHashTable *iconv_charsets = NULL;
/* lower-cased codeset part of the locale name, or NULL; set by
 * camel_charset_map_init() */
static char *locale_charset = NULL;

/* Seed entries copied into the iconv_charsets hash table by
 * camel_charset_map_init(); the list ends at the entry whose charset
 * is NULL.
 */
struct {
    char *charset;      /* name as it may be encountered (hash key) */
    char *iconv_name;   /* name to hand to iconv instead (hash value) */
} known_iconv_charsets[] = {
    /* charset name, iconv-friendly charset name */
    { "iso-8859-1",     "iso-8859-1" },
    { "iso8859-1",      "iso-8859-1" },
    /* the above mostly serves as an example for iso-style charsets,
       but we have code that will populate the iso-*'s if/when they
       show up in camel_charset_map_get_iconv_friendly_name() so I'm
       not going to bother putting them all in here... */
    { "windows-cp1251", "cp1251"     },
    { "windows-1251",   "cp1251"     },
    { "cp1251",         "cp1251"     },
    { NULL,             NULL         }
};


/* GHFunc used while tearing down the iconv_charsets table: releases
 * the key string and the value string of one entry.  `data' is unused.
 */
static void
shutdown_foreach (gpointer key, gpointer value, gpointer data)
{
    gpointer entry_name = key;
    gpointer entry_iconv_name = value;

    g_free (entry_name);
    g_free (entry_iconv_name);
}

/* atexit handler registered by camel_charset_map_init(): frees every
 * key/value string in iconv_charsets, the table itself and the cached
 * locale charset.
 *
 * The pointers are reset to NULL afterwards so that nothing is left
 * dangling and a second invocation is a harmless no-op.
 */
static void
camel_charset_map_shutdown (void)
{
    if (iconv_charsets) {
        g_hash_table_foreach (iconv_charsets, shutdown_foreach, NULL);
        g_hash_table_destroy (iconv_charsets);
        iconv_charsets = NULL;
    }

    g_free (locale_charset);
    locale_charset = NULL;
}

/**
 * camel_charset_map_init:
 *
 * Creates the iconv_charsets table, seeds it with the entries of
 * known_iconv_charsets, works out the charset of the current locale
 * and registers camel_charset_map_shutdown() to run at exit.
 *
 * Calling it again once the table exists does nothing.
 **/
void
camel_charset_map_init (void)
{
    char *locale;
    int i;

    if (iconv_charsets)
        return;

    iconv_charsets = g_hash_table_new (g_strcase_hash, g_strcase_equal);
    for (i = 0; known_iconv_charsets[i].charset != NULL; i++) {
        g_hash_table_insert (iconv_charsets, g_strdup (known_iconv_charsets[i].charset),
                     g_strdup (known_iconv_charsets[i].iconv_name));
    }

    locale = setlocale (LC_ALL, NULL);

    if (!locale || !strcmp (locale, "C") || !strcmp (locale, "POSIX")) {
        /* The locale "C"  or  "POSIX"  is  a  portable  locale;  its
         * LC_CTYPE  part  corresponds  to  the 7-bit ASCII character
         * set.
         */

        locale_charset = NULL;
    } else {
        /* A locale name is typically of  the  form  language[_terri-
         * tory][.codeset][@modifier],  where  language is an ISO 639
         * language code, territory is an ISO 3166 country code,  and
         * codeset  is  a  character  set or encoding identifier like
         * ISO-8859-1 or UTF-8.
         */
        const char *codeset, *end;

        codeset = strchr (locale, '.');
        if (codeset) {
            /* the codeset runs from just after the '.' up to, but
             * not including, the '@' of an optional modifier */
            codeset++;
            end = strchr (codeset, '@');
            if (!end)
                end = codeset + strlen (codeset);

            locale_charset = g_strndup (codeset, end - codeset);
            g_strdown (locale_charset);
        }
    }

    g_atexit (camel_charset_map_shutdown);
}

/**
 * camel_charset_init:
 * @c: charset detection state to reset
 *
 * Puts @c into its starting state: level 0 and a mask with every bit
 * set, ready to be narrowed by camel_charset_step().
 **/
void
camel_charset_init (CamelCharset *c)
{
    c->level = 0;
    c->mask = ~0;
}

/**
 * camel_charset_step:
 * @c: charset detection state, updated in place
 * @in: UTF-8 text to examine
 * @len: number of bytes of @in to examine
 *
 * Folds the characters of @in into @c.  The mask is ANDed with
 * charset_mask() of every character up to U+FFFF and cleared by any
 * character above that.  The level is raised to 1 by characters in
 * the range 128..255 and to 2 by anything from 256 upwards.
 *
 * Bytes that do not start a valid character are skipped one at a
 * time.  This includes a final sequence that would extend beyond
 * @in + @len, which is never decoded, so no byte outside the given
 * range is read.
 **/
void
camel_charset_step (CamelCharset *c, const char *in, int len)
{
    register unsigned int mask;
    register int level;
    const char *inptr = in, *inend = in+len;

    mask = c->mask;
    level = c->level;

    /* check what charset a given string will fit in */
    while (inptr < inend) {
        const char *next;
        gunichar uc;

        next = g_utf8_next_char(inptr);
        if (next == NULL || next > inend) {
            /* truncated sequence: do not decode past the buffer */
            inptr++;
            continue;
        }

        uc = g_utf8_get_char(inptr);
        if (!g_unichar_validate (uc)) {
            inptr++;
            continue;
        }

        inptr = next;
        if (uc<=0xffff) {
            mask &= charset_mask(uc);

            if (uc>=128 && uc<256)
                level = MAX(level, 1);
            else if (uc>=256)
                level = MAX(level, 2);
        } else {
            /* outside the range covered by the charset map */
            mask = 0;
            level = MAX(level, 2);
        }
    }

    c->mask = mask;
    c->level = level;
}

/* Picks a charset name for a character mask: the first camel_charinfo
 * entry that has one of its bits set in `mask' wins; when no entry
 * matches the answer is "UTF-8".
 */
static const char *
camel_charset_best_mask(unsigned int mask)
{
    const int count = sizeof(camel_charinfo)/sizeof(camel_charinfo[0]);
    int idx = 0;

    while (idx < count) {
        if ((camel_charinfo[idx].bit & mask) != 0)
            return camel_charinfo[idx].name;
        idx++;
    }

    return "UTF-8";
}

/**
 * camel_charset_best_name:
 * @charset: charset detection state
 *
 * Returns "ISO-8859-1" for level 1, the result of
 * camel_charset_best_mask() on the accumulated mask for level 2, and
 * NULL for any other level.
 **/
const char *
camel_charset_best_name (CamelCharset *charset)
{
    switch (charset->level) {
    case 1:
        return "ISO-8859-1";
    case 2:
        return camel_charset_best_mask (charset->mask);
    default:
        return NULL;
    }
}

/* One-shot helper: runs a fresh CamelCharset over the `len' bytes at
 * `in' and returns the name of the minimum charset for that text.
 * A NULL result means US-ASCII.
 */
const char *
camel_charset_best (const char *in, int len)
{
    CamelCharset state;

    camel_charset_init (&state);
    camel_charset_step (&state, in, len);

    return camel_charset_best_name (&state);
}

/**
 * camel_charset_locale_name:
 *
 * Returns the locale charset string stored by camel_charset_map_init(),
 * or NULL when none was stored.  The string belongs to this module and
 * must not be freed by the caller.
 **/
const char *
camel_charset_locale_name (void)
{
    const char *name = locale_charset;

    return name;
}

/**
 * camel_charset_get_iconv_friendly_name:
 * @name: charset name as encountered
 *
 * Looks @name up in the iconv_charsets table.  On a miss a friendly
 * name is derived and cached: names starting with "iso" that are not
 * followed by '-' or '_' get a '-' inserted after the "iso" prefix
 * (e.g. ISO8859-1 becomes ISO-8859-1); every other name is stored
 * unchanged.
 *
 * Returns the iconv-friendly name, owned by the table (do not free),
 * or NULL if @name is NULL.
 **/
const char *
camel_charset_get_iconv_friendly_name (const char *name)
{
    const char *charset;

    g_return_val_if_fail (name != NULL, NULL);

    ICONV_CHARSETS_LOCK ();
    charset = g_hash_table_lookup (iconv_charsets, name);
    if (!charset) {
        /* Attempt to friendlyify the charset */
        char *new_charset;
        int len;

        /* Hack to convert charsets like ISO8859-1 to iconv-friendly ISO-8859-1 */
        if (!g_strncasecmp (name, "iso", 3) && name[3] != '-' && name[3] != '_') {
            len = strlen (name);
            /* one extra byte for the '-', one for the terminator */
            new_charset = g_malloc (len + 2);
            memcpy (new_charset, name, 3);
            new_charset[3] = '-';
            memcpy (new_charset + 4, name + 3, len - 3);
            new_charset[len + 1] = '\0';
        } else {
            /* *shrug* - add it to the hash table just the way it is? */
            new_charset = g_strdup (name);
        }

        /* the table takes ownership of both strings */
        g_hash_table_insert (iconv_charsets, g_strdup (name), new_charset);
        charset = new_charset;
    }
    ICONV_CHARSETS_UNLOCK ();

    return charset;
}

#endif /* !BUILD_MAP */