path: root/camel/gmime-rfc2047.c



/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
/* gmime-rfc2047.c: implemention of RFC2047 */

/*
 * Copyright (C) 1999 Bertrand Guiheneuf <Bertrand.Guiheneuf@inria.fr> .
 * 
 * This program is free software; you can redistribute it and/or 
 * modify it under the terms of the GNU General Public License as 
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
 * USA
 *
 */

/* 
 * Authors:  Robert Brady <rwb197@ecs.soton.ac.uk>
 */

#include <stdio.h>
#include <ctype.h>
#include <unicode.h>
#include <string.h>

#include "gmime-rfc2047.h"

#define NOT_RANKED -1

/* This should be changed ASAP to use the base64 code Miguel comitted */

const char *base64_alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

static unsigned char base64_rank[256];
static int base64_rank_table_built;
static void build_base64_rank_table (void);

static int 
hexval (gchar c) {
    if (isdigit (c)) return c-'0';
    c = tolower (c);
    return c - 'a' + 10;
}

static gchar *
decode_quoted(const gchar *text, const gchar *end) {
    gchar *to = malloc(end - text + 1), *to_2 = to;
        if (!to) return NULL;
    while (*text && text < end) {
        if (*text == '=') {
            gchar a = hexval (text[1]);
            gchar b = hexval (text[2]);
            int c = (a << 4) + b;
            *to = c;
            to++;
            text+=3;
        } else if (*text == '_') {
            *to = ' ';
            to++;
            text++;
        } else {
            *to = *text;
            to++;
            text++;
        }
    }
    return to_2;
}

static gchar *
decode_base64(const gchar *data, const gchar *end) {
    unsigned short pattern = 0;
    int bits = 0;
    int delimiter = '=';
    gchar x;
    gchar *buffer = g_malloc((end - data) * 3);
    gchar *t = buffer;
    int Q = 0;

    if (!buffer) return NULL;

    while (*data != delimiter) {
        x = base64_rank[(unsigned char)(*data++)];
        if (x == NOT_RANKED)
            continue;
        pattern <<= 6;
        pattern |= x;
        bits += 6;
        if (bits >= 8) {
            x = (pattern >> (bits - 8)) & 0xff;
            *t++ = x;
            Q++;
            bits -= 8;
        }
    }
    *t = 0;
    return buffer;
}

static void
build_base64_rank_table (void)
{
    int i;
    
    if (!base64_rank_table_built) {
        for (i = 0; i < 256; i++)
            base64_rank[i] = NOT_RANKED;
        for (i = 0; i < 64; i++)
            base64_rank[(int) base64_alphabet[i]] = i;
        base64_rank_table_built = 1;
    }
}


gchar*
rfc2047_decode_word (const gchar *data, const gchar *into_what) 
{
    const char *charset = strstr(data, "=?"), *encoding, *text, *end;

    char *buffer, *b, *cooked_data;
    
    buffer = g_malloc(strlen(data) * 2);
    b = buffer;

    if (!charset) return strdup(data);
    charset+=2;

    encoding = strchr(charset, '?');
    if (!encoding) return strdup(data);
    encoding++;

    text = strchr(encoding, '?');
    if (!text) return strdup(data);
    text++;

    end = strstr(text, "?=");
    if (!end) return strdup(data);

    b[0] = 0;

    if (toupper(*encoding)=='Q')
        cooked_data = decode_quoted(text, end);
    else if (toupper(*encoding)=='B')
        cooked_data = decode_base64(text, end);
    else
        return g_strdup(data);

    {
        char *c = strchr(charset, '?');
        char *q = g_malloc(c - charset + 1);
        char *cook_2 = cooked_data;
        int cook_len = strlen(cook_2);
        int b_len = 4096;
        iconv_t i;
        strncpy(q, charset, c - charset);
        q[c - charset] = 0;
        i = unicode_iconv_open(into_what, q);
        if (!i) {
            g_free(q);
            return g_strdup(buffer);
        }
        if (unicode_iconv(i, &cook_2, &cook_len, &b, &b_len)==-1)
            /* FIXME : use approximation code if we can't convert it properly. */
            ;
        unicode_iconv_close(i);
        *b = 0;
    }

    return g_strdup(buffer);
}

static const gchar *
find_end_of_encoded_word(const gchar *data) {
    /* We can't just search for ?=,
           because of the case :
           "=?charset?q?=ff?=" :( */
    if (!data) return NULL;
    data = strstr(data, "=?");
    if (!data) return NULL;
    data = strchr(data+2, '?');
    if (!data) return NULL;
    data = strchr(data+1, '?');
    if (!data) return NULL;
    data = strstr(data+1, "?=");
    if (!data) return NULL;
    return data + 2;
}

gchar *
gmime_rfc2047_decode (const gchar *data, const gchar *into_what) 
{
    char *buffer = malloc(strlen(data) * 4), *b = buffer;

    int was_encoded_word = 0;

    build_base64_rank_table ();

    while (data && *data) {
        char *word_start = strstr(data, "=?"), *decoded;
        if (!word_start) {
            strcpy(b, data);
            b[strlen(data)] = 0;
            return buffer;
        }
        if (word_start != data) {

            if (strspn(data, " \t\n\r") != (word_start - data)) {
                strncpy(b, data, word_start - data);
                b += word_start - data;
                *b = 0;
            }
        }
        decoded = rfc2047_decode_word(word_start, into_what);
        strcpy(b, decoded);
        b += strlen(decoded);
        *b = 0;
        g_free(decoded);

        data = find_end_of_encoded_word(data);
    }

    *b = 0;
    return buffer;
}

#define isnt_ascii(a) ((a) <= 0x1f || (a) >= 0x7f)

static int rfc2047_clean(const gchar *string) {
    if (strstr(string, "?=")) return 1;
    while (*string) {
        if (!isnt_ascii((unsigned char)*string))
            return 0;
        string++;
    }
    return 1;
}

static gchar *encode_word (const gchar *string, const gchar *said_charset) {
    if (rfc2047_clean(string)) 
        /* don't bother encoding it if it has no odd characters in it */
        return g_strdup(string);
    {
        char *temp = malloc(strlen(string) * 4 + 1), *t = temp;
        t += sprintf(t, "=?%s?q?", said_charset);
        while (*string) {
            if (*string == ' ')
                *(t++) = '_';
            else if (*string <= 0x1f || *string >= 0x7f || *string == '=' || *string == '?') 
                t += sprintf(t, "=%2x", (unsigned char)*string);
            else
                *(t++) = *string;
                  
            string++;
        }
        t += sprintf(t, "?=");
        *t = 0;
            return temp;
    }
}

gchar *
gmime_rfc2047_encode (const gchar *string, const gchar *charset) 
{
    int temp_len = strlen(string)*4 + 1;
    char *temp = g_malloc(temp_len), *temp_2 = temp;
    int string_length = strlen(string);
    char *encoded = NULL;

    /* first, let us convert to UTF-8 */
    iconv_t i = unicode_iconv_open("UTF-8", charset);
    unicode_iconv(i, &string, &string_length, &temp_2, &temp_len);
    unicode_iconv_close(i);
    
    /* null terminate it */
    *temp_2 = 0;

    /* now encode it as if it were a single word */
    encoded = encode_word(temp, "UTF-8");
    
    /*
      
      real algorithm :
      
      we need to 
      
      split it into words
      
      identify portions that have NOT to be encoded (i.e. <> and the comment starter/ender )
      
      identify the best character set for each word
      
      merge words which share a character set, allow jumping and merging with words which 
      would be ok to encode in non-US-ASCII.
      
      if we have to use 2 character sets, try and collapse them into one.
      
      (e.g. if one word contains letters in latin-1, and another letters in latin-2, use
      latin-2 for the first word as well if possible).
      
      finally :
      
      if utf-8 will still be used, use it for everything.
      
      and then, at last, generate the encoded text, using base64/quoted-printable for
      each word depending upon which is more efficient.
      
      TODO :
      create a priority list of encodings
      
      i.e.

            US-ASCII, ISO-8859-1, ISO-8859-2, ISO-8859-3, KOI8, 

          Should survey for most popular charsets :
          what do people usually use for the following scripts?

      * Chinese/Japanese/Korean
          * Greek
          * Cyrillic

          (any other scripts commonly used in mail/news?)

      This algorithm is probably far from optimal, but should be
      reasonably efficient for simple cases. (and almost free if
      the text is just in US-ASCII : like 99% of the text that will
      pass through it)
      
    */
    
        g_free(temp);
    
    return encoded;
}