/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8; -*- */
/*
* Authors:
* Michael Zucchi <notzed@ximian.com>
* Dan Winship <danw@ximian.com>
*
* Copyright 2000, 2001 Ximian, Inc. (www.ximian.com)
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation; either version 2 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
* USA
*/
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
#include <stdio.h>
/*
if you want to build the charset map, compile this with something like:
gcc -DBUILD_MAP camel-charset-map.c `glib-config --cflags`
(plus any -I/-L/-l flags you need for iconv), then run it as
./a.out > camel-charset-map-private.h
Note that the big-endian variant isn't tested...
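The generated tables work like this:
camel_charmap has one entry for each 256-character page of Unicode.
Each entry holds one pointer per byte of the charset bitmask; a pointer
is either 0 (no character on that page has a bit set in that mask byte)
or points to a 256-element array giving that mask byte for every
character on the page.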
*/
#ifdef BUILD_MAP
#include <errno.h>
#include <stdlib.h>
#include <string.h>
#include <iconv.h>
#include <glib.h>
/* Charsets the generator probes.  `name` is the iconv name of the
 * charset; `bit` starts at 0 and is filled in by main() with the mask
 * bit assigned to that charset.  The list ends with a NULL name.
 */
static struct {
	const char *name;	/* points at a string literal, never written through */
	unsigned int bit;	/* assigned bit */
} tables[] = {
	/* These are the 8bit character sets (other than iso-8859-1,
	 * which is special-cased) which are supported by both other
	 * mailers and the GNOME environment. Note that the order
	 * they're listed in is the order they'll be tried in, so put
	 * the more-popular ones first.
	 */
	{ "iso-8859-2", 0 },	/* Central/Eastern European */
	{ "iso-8859-4", 0 },	/* Baltic */
	{ "koi8-r", 0 },	/* Russian */
	{ "windows-1251", 0 },	/* Russian */
	{ "koi8-u", 0 },	/* Ukrainian */
	{ "iso-8859-5", 0 },	/* Least-popular Russian encoding */
	{ "iso-8859-7", 0 },	/* Greek */
	{ "iso-8859-9", 0 },	/* Turkish */
	{ "iso-8859-13", 0 },	/* Baltic again */
	{ "iso-8859-15", 0 },	/* New-and-improved iso-8859-1, but most
				 * programs that support this support UTF8
				 */
	{ NULL, 0 }		/* terminator */
};
/* One element per 16-bit code point.  main() ORs into an element the
 * mask bit of each charset from tables[] it associates with that code
 * point, then dumps the array page by page.
 */
unsigned int encoding_map[256 * 256];
/* iconv target encoding used by main(): UCS-4 in the byte order of the
 * build host, so the converted output can be read back directly as an
 * array of guint32.
 */
#if G_BYTE_ORDER == G_BIG_ENDIAN
#define UCS "UCS-4BE"
#else
#define UCS "UCS-4LE"
#endif
/* Returns non-zero if any character on Unicode page `page` has a bit
 * set in byte number `byte` of its charset mask.
 */
static int
page_has_bits (int page, int byte)
{
	int j;

	for (j = 0; j < 256; j++) {
		if ((encoding_map[page * 256 + j] & (0xffU << (byte * 8))) != 0)
			return 1;
	}

	return 0;
}

/* Charset-map generator (only compiled with -DBUILD_MAP).
 *
 * For every charset in tables[] the bytes 128..255 are converted to
 * UCS-4 with iconv, and the charset's bit is recorded in encoding_map[]
 * for each resulting code point.  The collected map is then written to
 * stdout as C source: the per-page byte arrays, the camel_charmap and
 * camel_charinfo tables, and the charset_mask() macro.
 *
 * Returns 0 on success; exits with EXIT_FAILURE (message on stderr, so
 * it does not end up in the redirected header) if iconv cannot open or
 * run a conversion.
 */
int main(void)
{
	unsigned int bit = 0x01;
	int i, j, k;
	int bytes;
	int converted;
	iconv_t cd;
	char in[128];
	guint32 out[128];
	char *inptr, *outptr;
	size_t inlen, outlen;

	/* dont count the terminator */
	bytes = ((sizeof(tables)/sizeof(tables[0]))+7-1)/8;

	/* the input is simply every byte value in the high half */
	for (i = 0; i < 128; i++)
		in[i] = (char) (i + 128);

	for (j = 0; tables[j].name; j++) {
		cd = iconv_open (UCS, tables[j].name);
		if (cd == (iconv_t) -1) {
			fprintf (stderr, "%s: %s\n", tables[j].name, strerror (errno));
			exit (EXIT_FAILURE);
		}

		inptr = in;
		outptr = (char *)(out);
		inlen = sizeof (in);
		outlen = sizeof (out);
		while (iconv (cd, &inptr, &inlen, &outptr, &outlen) == (size_t) -1) {
			if (errno == EILSEQ) {
				/* byte not assigned in this charset: skip it */
				inptr++;
				inlen--;
			} else {
				fprintf (stderr, "%s: %s\n", tables[j].name, strerror (errno));
				exit (EXIT_FAILURE);
			}
		}
		iconv_close (cd);

		/* The low half (0..127) is never fed through iconv; the
		 * charsets listed are assumed to be ASCII-compatible, so
		 * mark all of it.  Tying this to the number of converted
		 * characters would drop the last few ASCII code points
		 * whenever the charset has unassigned high bytes.
		 */
		for (i = 0; i < 128; i++)
			encoding_map[i] |= bit;

		/* only the part of out[] that iconv actually filled is valid */
		converted = (int) ((sizeof (out) - outlen) / sizeof (out[0]));
		for (i = 0; i < converted; i++) {
			/* encoding_map only covers 16-bit code points */
			if (out[i] < 256 * 256)
				encoding_map[out[i]] |= bit;
		}

		tables[j].bit = bit;
		bit <<= 1;
	}

	printf("/* This file is automatically generated: DO NOT EDIT */\n\n");

	/* one byte array per (page, mask byte) pair that has any bit set */
	for (i=0;i<256;i++) {
		for (k=0;k<bytes;k++) {
			if (!page_has_bits (i, k))
				continue;

			printf("static unsigned char m%02x%x[256] = {\n\t", i, k);
			for (j=0;j<256;j++) {
				printf("0x%02x, ", (encoding_map[i*256+j] >> (k*8)) & 0xff );
				if (((j+1)&7) == 0 && j<255)
					printf("\n\t");
			}
			printf("\n};\n\n");
		}
	}

	/* the page index: a pointer to each array above, or 0 if absent */
	printf("struct {\n");
	for (k=0;k<bytes;k++) {
		printf("\tunsigned char *bits%d;\n", k);
	}
	printf("} camel_charmap[256] = {\n\t");
	for (i=0;i<256;i++) {
		printf("{ ");
		for (k=0;k<bytes;k++) {
			if (page_has_bits (i, k)) {
				printf("m%02x%x, ", i, k);
			} else {
				printf("0, ");
			}
		}
		printf("}, ");
		if (((i+1)&7) == 0 && i<255)
			printf("\n\t");
	}
	printf("\n};\n\n");

	/* charset name -> assigned bit */
	printf("struct {\n\tconst char *name;\n\tunsigned int bit;\n} camel_charinfo[] = {\n");
	for (j=0;tables[j].name;j++) {
		printf("\t{ \"%s\", 0x%04x },\n", tables[j].name, tables[j].bit);
	}
	printf("};\n\n");

	/* lookup macro that reassembles the mask from its bytes */
	printf("#define charset_mask(x) \\\n");
	for (k=0;k<bytes;k++) {
		if (k!=0)
			printf("\t| ");
		else
			printf("\t");
		printf("(camel_charmap[(x)>>8].bits%d?camel_charmap[(x)>>8].bits%d[(x)&0xff]<<%d:0)", k, k, k*8);
		if (k<bytes-1)
			printf("\t\\\n");
	}
	printf("\n\n");

	return 0;
}
#else
#include "camel-charset-map.h"
#include "camel-charset-map-private.h"
#include "hash-table-utils.h"
#include <gal/unicode/gunicode.h>
#include <locale.h>
#include <string.h>
#include <glib.h>
#ifdef ENABLE_THREADS
#include <pthread.h>
#endif
#ifdef ENABLE_THREADS
/* Mutex taken around the iconv_charsets lookup/insert in
 * camel_charset_get_iconv_friendly_name().
 */
static pthread_mutex_t iconv_charsets_lock = PTHREAD_MUTEX_INITIALIZER;
#define ICONV_CHARSETS_LOCK() pthread_mutex_lock (&iconv_charsets_lock)
#define ICONV_CHARSETS_UNLOCK() pthread_mutex_unlock (&iconv_charsets_lock)
#else
/* Built without thread support: the lock macros expand to nothing. */
#define ICONV_CHARSETS_LOCK()
#define ICONV_CHARSETS_UNLOCK()
#endif /* ENABLE_THREADS */
/* Maps a charset name to the name handed to iconv.  Created by
 * camel_charset_map_init() using g_strcase_hash/g_strcase_equal; both
 * keys and values are g_strdup'ed/g_malloc'ed strings owned by the table.
 */
static GHashTable *iconv_charsets = NULL;
/* Seed entries copied into the iconv_charsets hash table by
 * camel_charset_map_init().  Both members point at string literals, so
 * they are const-qualified.  The list ends with a NULL charset.
 */
struct {
	const char *charset;	/* name as it may appear in a message */
	const char *iconv_name;	/* name to pass to iconv */
} known_iconv_charsets[] = {
	/* charset name, iconv-friendly charset name */
	{ "iso-8859-1", "iso-8859-1" },
	{ "iso8859-1", "iso-8859-1" },
	/* the above mostly serves as an example for iso-style charsets,
	   but we have code that will populate the iso-*'s if/when they
	   show up in camel_charset_get_iconv_friendly_name() so I'm
	   not going to bother putting them all in here... */
	{ "windows-cp1251", "cp1251" },
	{ "windows-1251", "cp1251" },
	{ "cp1251", "cp1251" },
	{ NULL, NULL }
};
/* g_hash_table_foreach() callback used at shutdown: releases the
 * heap-allocated key and value strings of one table entry.  `data` is
 * not used.
 */
static void
shutdown_foreach (gpointer key, gpointer value, gpointer data)
{
	/* the two strings are separate allocations, order is irrelevant */
	g_free (value);
	g_free (key);
}
/* atexit handler registered by camel_charset_map_init(): frees every
 * key/value string in iconv_charsets and then the table itself.
 */
static void
camel_charset_map_shutdown (void)
{
	if (iconv_charsets == NULL)
		return;

	g_hash_table_foreach (iconv_charsets, shutdown_foreach, NULL);
	g_hash_table_destroy (iconv_charsets);

	/* don't leave a dangling pointer behind for any code that still
	 * runs after this handler */
	iconv_charsets = NULL;
}
/* Creates the iconv_charsets table, seeds it with copies of the
 * known_iconv_charsets entries and registers the shutdown handler with
 * g_atexit().  Does nothing if the table already exists.
 */
void
camel_charset_map_init (void)
{
	int idx = 0;

	if (iconv_charsets != NULL)
		return;

	iconv_charsets = g_hash_table_new (g_strcase_hash, g_strcase_equal);

	/* the table owns its strings, so store copies of the literals */
	while (known_iconv_charsets[idx].charset != NULL) {
		g_hash_table_insert (iconv_charsets,
				     g_strdup (known_iconv_charsets[idx].charset),
				     g_strdup (known_iconv_charsets[idx].iconv_name));
		idx++;
	}

	g_atexit (camel_charset_map_shutdown);
}
/* Resets a CamelCharset accumulator: level 0 and a mask with every bit
 * set, ready to be narrowed by camel_charset_step().
 */
void
camel_charset_init (CamelCharset *c)
{
	c->level = 0;
	c->mask = ~0;
}
/* Feeds `len` bytes of UTF-8 text at `in` into the accumulator `c`.
 *
 * For every valid character up to U+FFFF the accumulated mask is ANDed
 * with charset_mask() of that character; a character above U+FFFF
 * clears the mask.  The level is raised to 1 when a character in
 * 128..255 is seen and to 2 for anything from 256 upwards.  Bytes that
 * do not start a valid character are skipped one at a time.
 *
 * NOTE(review): g_utf8_next_char/g_utf8_get_char come from gal's
 * gunicode.h, which is not visible here; they are assumed to behave
 * like their GLib namesakes (next_char advances by the length implied
 * by the lead byte, get_char decodes that many bytes).
 */
void
camel_charset_step (CamelCharset *c, const char *in, int len)
{
	register unsigned int mask;
	register int level;
	const char *inptr = in, *inend = in+len;

	mask = c->mask;
	level = c->level;

	/* check what charset a given string will fit in */
	while (inptr < inend) {
		const char *newinptr;
		gunichar ch;

		newinptr = g_utf8_next_char(inptr);

		/* A sequence that would run past the end of the buffer is
		 * truncated: treat its lead byte as invalid instead of
		 * letting the decoder read beyond in+len.
		 */
		if (newinptr == NULL || newinptr > inend) {
			inptr++;
			continue;
		}

		ch = g_utf8_get_char(inptr);
		if (!g_unichar_validate (ch)) {
			inptr++;
			continue;
		}

		inptr = newinptr;
		if (ch<=0xffff) {
			mask &= charset_mask(ch);

			if (ch>=128 && ch<256)
				level = MAX(level, 1);
			else if (ch>=256)
				level = MAX(level, 2);
		} else {
			/* the generated tables only cover 16-bit code points */
			mask = 0;
			level = MAX(level, 2);
		}
	}

	c->mask = mask;
	c->level = level;
}
/* Picks a charset for an accumulated mask: the name of the first
 * camel_charinfo entry whose bit is present in `mask`, or "UTF-8" when
 * no entry matches.
 */
static const char *
camel_charset_best_mask(unsigned int mask)
{
	const int count = sizeof(camel_charinfo)/sizeof(camel_charinfo[0]);
	int idx = 0;

	while (idx < count) {
		if ((camel_charinfo[idx].bit & mask) != 0)
			return camel_charinfo[idx].name;
		idx++;
	}

	return "UTF-8";
}
/* Maps the accumulator's level to a charset name: level 1 gives
 * "ISO-8859-1", level 2 gives whatever camel_charset_best_mask()
 * selects for the accumulated mask, any other level gives NULL.
 */
const char *
camel_charset_best_name(CamelCharset *charset)
{
	switch (charset->level) {
	case 1:
		return "ISO-8859-1";
	case 2:
		return camel_charset_best_mask(charset->mask);
	default:
		return NULL;
	}
}
/* One-shot helper: runs a fresh accumulator over the `len` bytes at
 * `in` and returns the resulting charset name.  A NULL result means
 * US-ASCII is sufficient.
 */
const char *
camel_charset_best(const char *in, int len)
{
	CamelCharset scan;

	camel_charset_init(&scan);
	camel_charset_step(&scan, in, len);

	return camel_charset_best_name(&scan);
}
/* Returns the codeset part of the current locale name, lower-cased, as
 * a newly allocated string the caller must g_free().
 *
 * Returns NULL when the locale is unset, "C" or "POSIX", or when its
 * name carries no codeset.
 */
char *
camel_charset_locale_name (void)
{
	const char *locale, *codeset, *end;
	char *charset;

	locale = setlocale (LC_ALL, NULL);

	if (!locale || !strcmp (locale, "C") || !strcmp (locale, "POSIX")) {
		/* The locale "C"  or  "POSIX"  is  a  portable  locale;  its
		 * LC_CTYPE  part  corresponds  to  the 7-bit ASCII character
		 * set.
		 */
		return NULL;
	}

	/* A locale name is typically of  the  form  language[_terri-
	 * tory][.codeset][@modifier],  where  language is an ISO 639
	 * language code, territory is an ISO 3166 country code,  and
	 * codeset  is  a  character  set or encoding identifier like
	 * ISO-8859-1 or UTF-8.
	 */

	/* the codeset ends at the modifier, or at the end of the name */
	end = strchr (locale, '@');
	if (!end)
		end = locale + strlen (locale);

	/* a '.' that only shows up inside the modifier is not a codeset */
	codeset = strchr (locale, '.');
	if (!codeset || codeset > end)
		return NULL;

	codeset++;	/* step over the '.' */

	/* copy exactly the codeset, without any "@modifier" tail */
	charset = g_strndup (codeset, end - codeset);
	g_strdown (charset);

	return charset;
}
/* Returns the name to pass to iconv for charset `name`.
 *
 * The name is looked up in iconv_charsets; on a miss a mapping is
 * created and cached: names starting with "iso" that are not followed
 * by '-' or '_' get a '-' inserted after "iso" (ISO8859-1 becomes
 * ISO-8859-1), anything else maps to itself.
 *
 * The returned string is owned by the table and must not be freed.
 * Returns NULL if `name` is NULL.  The table access is wrapped in
 * ICONV_CHARSETS_LOCK/UNLOCK.
 */
const char *
camel_charset_get_iconv_friendly_name (const char *name)
{
	const char *charset;

	/* nothing to look up, and the hash function would dereference it */
	if (name == NULL)
		return NULL;

	ICONV_CHARSETS_LOCK ();
	charset = g_hash_table_lookup (iconv_charsets, name);
	if (!charset) {
		/* Attempt to friendlyify the charset */
		char *new_charset;

		/* Hack to convert charsets like ISO8859-1 to iconv-friendly ISO-8859-1 */
		if (!g_strncasecmp (name, "iso", 3) && name[3] != '-' && name[3] != '_') {
			size_t len = strlen (name);

			/* room for the extra '-' and the terminator */
			new_charset = g_malloc (len + 2);
			memcpy (new_charset, name, 3);
			new_charset[3] = '-';
			memcpy (new_charset + 4, name + 3, len - 3);
			new_charset[len + 1] = '\0';
		} else {
			/* *shrug* - add it to the hash table just the way it is? */
			new_charset = g_strdup (name);
		}

		/* the table takes ownership of both the key copy and the value */
		g_hash_table_insert (iconv_charsets, g_strdup (name), new_charset);
		charset = new_charset;
	}
	ICONV_CHARSETS_UNLOCK ();

	return charset;
}
#endif /* !BUILD_MAP */