path: root/camel/camel-utf8.c



/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
/*
 *  Authors: Michael Zucchi <notzed@ximian.com>
 *
 *  Copyright 2003 Ximian, Inc. (www.ximian.com)
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Street #330, Boston, MA 02111-1307, USA.
 *
 */


#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include <sys/types.h>
#include <string.h>

#include <glib.h>
#include "camel-utf8.h"

#include <netinet/in.h>

/**
 * camel_utf8_putc:
 * @ptr: in/out cursor into the output buffer; updated to point just
 * past the last octet written.
 * @c: the character value to encode.
 *
 * Write @c at *@ptr as UTF-8 octets: 1 octet for values up to 0x7f,
 * 2 up to 0x7ff, 3 up to 0xffff and 4 for anything larger.  No bounds
 * checking is performed, so the caller must supply room for 4 octets.
 * The value is not validated: anything above 0x1fffff does not fit the
 * 4 octet form and is emitted as-is through the same bit arithmetic.
 **/
void
camel_utf8_putc(unsigned char **ptr, guint32 c)
{
    unsigned char *out = *ptr;

    if (c < 0x80) {
        /* plain 7 bit value, stored unchanged */
        out[0] = c;
        out += 1;
    } else if (c < 0x800) {
        out[0] = 0xc0 | (c >> 6);
        out[1] = 0x80 | (c & 0x3f);
        out += 2;
    } else if (c < 0x10000) {
        out[0] = 0xe0 | (c >> 12);
        out[1] = 0x80 | ((c >> 6) & 0x3f);
        out[2] = 0x80 | (c & 0x3f);
        out += 3;
    } else {
        /* see unicode standard 3.0, S 3.8, max 4 octets */
        out[0] = 0xf0 | (c >> 18);
        out[1] = 0x80 | ((c >> 12) & 0x3f);
        out[2] = 0x80 | ((c >> 6) & 0x3f);
        out[3] = 0x80 | (c & 0x3f);
        out += 4;
    }

    *ptr = out;
}

/**
 * camel_utf8_getc:
 * @ptr: in/out cursor into a NUL terminated array of utf8 octets;
 * updated to point just past the octets consumed.
 *
 * Get a Unicode character from a utf8 stream.  @ptr will be advanced
 * to the next character position.  Invalid utf8 octets are silently
 * skipped: this covers octets of 0xf8 and above, continuation octets
 * (0x80-0xbf) that appear where a lead octet is expected, and
 * sequences cut short by an octet which is not a continuation (the
 * cutting octet is then decoded as the start of the next character).
 *
 * When the terminating NUL is reached 0 is returned and @ptr is left
 * pointing just past the NUL, so callers must stop at the first 0
 * returned.
 *
 * Return value: The next Unicode character, or 0 at the end of the
 * string.
 **/
guint32
camel_utf8_getc(const unsigned char **ptr)
{
    const unsigned char *p = *ptr;
    unsigned char c, r;
    guint32 v, m;

again:
    r = *p++;
loop:
    if (r < 0x80) {
        *ptr = p;
        v = r;
    } else if (r >= 0xc0 && r < 0xf8) { /* valid start char? (max 4 octets) */
        /*
         * The lower bound rejects stray continuation octets.  Without it
         * an input such as "\x80\x80" was decoded as the character 0,
         * which callers take for the end of the string.
         */
        v = r;
        m = 0x7f80; /* used to mask out the length bits */
        do {
            c = *p++;
            if ((c & 0xc0) != 0x80) {
                /* truncated sequence: restart decoding from this octet */
                r = c;
                goto loop;
            }
            v = (v<<6) | (c & 0x3f);
            /* bit 6 of r says whether another continuation octet follows */
            r<<=1;
            /* keep the mask aligned with the lead octet's marker bits in v */
            m<<=5;
        } while (r & 0x40);

        *ptr = p;

        v &= ~m;
    } else {
        /* not a valid lead octet, skip it */
        goto again;
    }

    return v;
}

/**
 * camel_utf8_getc_limit:
 * @ptr: in/out cursor into an array of utf8 octets; updated to point
 * just past the octets consumed when a character is returned.
 * @end: one past the last readable octet; must not be NULL.
 *
 * Get the next utf8 char at @ptr, and return it, advancing @ptr to
 * the next character.  If @end is reached before a full utf8
 * character can be read, then the invalid Unicode char 0xffff is
 * returned as a sentinel (Unicode 3.1, section 2.7), and @ptr is not
 * advanced.
 *
 * Invalid octets are silently skipped: octets of 0xf8 and above,
 * continuation octets (0x80-0xbf) found where a lead octet is
 * expected, and sequences cut short by an octet which is not a
 * continuation (that octet is then decoded as the start of the next
 * character).
 *
 * Return value: The next utf8 char, or 0xffff.
 **/
guint32
camel_utf8_getc_limit(const unsigned char **ptr, const unsigned char *end)
{
    const unsigned char *p = *ptr;
    unsigned char c, r;
    guint32 v, m;

    while (p < end) {
        r = *p++;
loop:
        if (r < 0x80) {
            *ptr = p;
            return r;
        } else if (r >= 0xc0 && r < 0xf8) { /* valid start char? (max 4 octets) */
            /*
             * The lower bound rejects stray continuation octets.  Without
             * it an input such as "\x80\x80" was decoded as character 0.
             */
            v = r;
            m = 0x7f80; /* used to mask out the length bits */
            do {
                /* incomplete character: report it without moving *ptr */
                if (p >= end)
                    return 0xffff;

                c = *p++;
                if ((c & 0xc0) != 0x80) {
                    /* truncated sequence: restart decoding from this octet */
                    r = c;
                    goto loop;
                }
                v = (v<<6) | (c & 0x3f);
                /* bit 6 of r says whether another continuation octet follows */
                r<<=1;
                /* keep the mask aligned with the lead octet's marker bits in v */
                m<<=5;
            } while (r & 0x40);

            *ptr = p;

            return v & ~m;
        }

        /* not a valid lead octet: skip it and try the next one */
    }

    return 0xffff;
}

/**
 * g_string_append_u:
 * @out: string to append to.
 * @c: character value to append.
 *
 * Append @c to @out encoded as utf8 octets, using camel_utf8_putc().
 * The octets are handed to g_string_append() as a NUL terminated
 * string, so a @c of 0 appends nothing.
 **/
void
g_string_append_u(GString *out, guint32 c)
{
    /* camel_utf8_putc() writes at most 4 octets, plus our terminator */
    unsigned char buffer[8];
    unsigned char *p = buffer;

    camel_utf8_putc(&p, c);
    *p = 0;
    /* g_string_append() takes a (const gchar *), not an unsigned pointer */
    g_string_append(out, (const char *)buffer);
}

/* The 64 digits used inside a '&...-' run, indexed by 6 bit value.  Note
   that the last digit is ',' here; utf7_rank[] is the inverse mapping. */
static const char utf7_alphabet[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,";

/* Inverse of utf7_alphabet[]: maps an octet to its 6 bit value, or to
   0xff when the octet is not one of the 64 digits.  Read-only. */
static const unsigned char utf7_rank[256] = {
    0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
    0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
    0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x3e,0x3f,0xff,0xff,0xff,
    0x34,0x35,0x36,0x37,0x38,0x39,0x3a,0x3b,0x3c,0x3d,0xff,0xff,0xff,0xff,0xff,0xff,
    0xff,0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,
    0x0f,0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19,0xff,0xff,0xff,0xff,0xff,
    0xff,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f,0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,
    0x29,0x2a,0x2b,0x2c,0x2d,0x2e,0x2f,0x30,0x31,0x32,0x33,0xff,0xff,0xff,0xff,0xff,
    0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
    0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
    0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
    0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
    0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
    0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
    0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
    0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
};

/**
 * camel_utf7_utf8:
 * @ptr: 
 * 
 * Convert a modified utf7 string to utf8.  If the utf7 string
 * contains 8 bit characters, they are treated as iso-8859-1.
 * 
 * The IMAP rules [rfc2060] are used in the utf7 encoding.
 *
 * Return value: The converted string.
 **/
char *
camel_utf7_utf8(const char *ptr)
{
    const unsigned char *p = (unsigned char *)ptr;
    unsigned int c;
    guint32 v=0, x;
    GString *out;
    int i=0;
    int state = 0;
    char *ret;

    out = g_string_new("");
    do {
        c = *p++;
        switch(state) {
        case 0:
            if (c == '&')
                state = 1;
            else
                g_string_append_u(out, c);
            break;
        case 1:
            if (c == '-') {
                g_string_append_c(out, '&');
                state = 0;
            } else if (utf7_rank[c] != 0xff) {
                v = utf7_rank[c];
                i = 6;
                state = 2;
            } else {
                /* invalid */
                g_string_append(out, "&-");
                state = 0;
            }
            break;
        case 2:
            if (c == '-') {
                state = 0;
            } else if (utf7_rank[c] != 0xff) {
                v = (v<<6) | utf7_rank[c];
                i+=6;
                if (i >= 16) {
                    x = (v >> (i-16)) & 0xffff;
                    g_string_append_u(out, x);
                    i-=16;
                }
            } else {
                g_string_append_u(out, c);
                state = 0;
            }
            break;
        }
    } while (c);

    ret = g_strdup(out->str);
    g_string_free(out, TRUE);

    return ret;
}

/*
 * Terminate a base64 run in @out.  @i is the number of bits of @v that
 * have not been written yet; callers pass fewer than 6.  Any such bits
 * are emitted left aligned in one last zero padded digit, then the '-'
 * closing the run is written.
 */
static void utf7_closeb64(GString *out, guint32 v, guint32 i)
{
    if (i != 0) {
        guint32 tail = (v << (6 - i)) & 0x3f;

        g_string_append_c(out, utf7_alphabet[tail]);
    }

    g_string_append_c(out, '-');
}

/**
 * camel_utf8_utf7:
 * @ptr: 
 * 
 * Convert a utf8 string to a modified utf7 format.
 *
 * The IMAP rules [rfc2060] are used in the utf7 encoding.
 * 
 * Return value: 
 **/
char *
camel_utf8_utf7(const char *ptr)
{
    const unsigned char *p = (unsigned char *)ptr;
    unsigned int c;
    guint32 x, v = 0;
    int state = 0;
    GString *out;
    int i = 0;
    char *ret;

    out = g_string_new("");

    while ( (c = camel_utf8_getc(&p)) ) {
        if (c >= 0x20 && c <= 0x7e) {
            if (state == 1) {
                utf7_closeb64(out, v, i);
                state = 0;
                i = 0;
            }
            if (c == '&')
                g_string_append(out, "&-");
            else
                g_string_append_c(out, c);
        } else {
            if (state == 0) {
                g_string_append_c(out, '&');
                state = 1;
            }
            v = (v << 16) | c;
            i += 16;
            while (i >= 6) {
                x = (v >> (i-6)) & 0x3f;
                g_string_append_c(out, utf7_alphabet[x]);
                i -= 6;
            }
        }
    }

    if (state == 1)
        utf7_closeb64(out, v, i);

    ret = g_strdup(out->str);
    g_string_free(out, TRUE);

    return ret;
}

/**
 * camel_utf8_ucs2:
 * @ptr: 
 * 
 * Convert a utf8 string into a ucs2 one.  The ucs string will be in
 * network byte order, and terminated with a 16 bit NULL.
 * 
 * Return value: 
 **/
char *
camel_utf8_ucs2(const char *ptr)
{
    GByteArray *work = g_byte_array_new();
    guint32 c;
    char *out;

    /* what if c is > 0xffff ? */

    while ( (c = camel_utf8_getc((const unsigned char **)&ptr)) ) {
        guint16 s = htons(c);

        g_byte_array_append(work, (char *)&s, 2);
    }

    g_byte_array_append(work, "\000\000", 2);
    out = g_malloc(work->len);
    memcpy(out, work->data, work->len);
    g_byte_array_free(work, TRUE);

    return out;
}

/**
 * camel_ucs2_utf8:
 * @ptr: 
 * 
 * Convert a ucs2 string into a utf8 one.  The ucs2 string is treated
 * as network byte ordered, and terminated with a 16 bit NUL.
 * 
 * Return value: 
 **/
char *camel_ucs2_utf8(const char *ptr)
{
    guint16 *ucs = (guint16 *)ptr;
    guint32 c;
    GString *work = g_string_new("");
    char *out;

    while ( (c = *ucs++) )
        g_string_append_u(work, ntohs(c));

    out = g_strdup(work->str);
    g_string_free(work, TRUE);

    return out;
}