/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ /* gmime-rfc2047.c: implemention of RFC2047 */ /* * Copyright (C) 1999 Bertrand Guiheneuf . * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 * USA * */ /* * Authors: Robert Brady */ #include #include #include #include #include "gmime-rfc2047.h" #define NOT_RANKED -1 /* This should be changed ASAP to use the base64 code Miguel comitted */ const char *base64_alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; static unsigned char base64_rank[256]; static int base64_rank_table_built; static void build_base64_rank_table (void); static int hexval (gchar c) { if (isdigit (c)) return c-'0'; c = tolower (c); return c - 'a' + 10; } static gchar * decode_quoted (const gchar *text, const gchar *end) { gchar *to = malloc(end - text + 1), *to_2 = to; if (!to) return NULL; while (*text && text < end) { if (*text == '=') { gchar a = hexval (text[1]); gchar b = hexval (text[2]); int c = (a << 4) + b; *to = c; to++; text+=3; } else if (*text == '_') { *to = ' '; to++; text++; } else { *to = *text; to++; text++; } } return to_2; } static gchar * decode_base64 (const gchar *data, const gchar *end) { unsigned short pattern = 0; int bits = 0; int delimiter = '='; gchar x; gchar *buffer = g_malloc((end - data) * 3); gchar *t = buffer; int Q = 0; if (!buffer) return NULL; while (*data != delimiter) { x = base64_rank[(unsigned char)(*data++)]; if (x == NOT_RANKED) continue; pattern <<= 6; pattern |= x; bits += 6; if (bits >= 8) { x = (pattern >> (bits - 8)) & 0xff; *t++ = x; Q++; bits -= 8; } } *t = 0; return buffer; } static void build_base64_rank_table (void) { int i; if (!base64_rank_table_built) { for (i = 0; i < 256; i++) base64_rank[i] = NOT_RANKED; for (i = 0; i < 64; i++) base64_rank[(int) base64_alphabet[i]] = i; base64_rank_table_built = 1; } } static gchar * rfc2047_decode_word (const gchar *data, const gchar *into_what) { const char *charset = strstr (data, "=?"), *encoding, *text, *end; char *buffer, *b, *cooked_data; buffer = g_malloc (strlen(data) * 2); b = buffer; if (!charset) return strdup (data); charset+=2; encoding = strchr (charset, '?'); if (!encoding) return strdup (data); encoding++; text = strchr(encoding, '?'); if (!text) return strdup (data); text++; end = strstr(text, "?="); if (!end) return strdup (data); b[0] = 0; if (toupper(*encoding)=='Q') cooked_data = decode_quoted (text, end); else if (toupper (*encoding)=='B') cooked_data = decode_base64 (text, end); else return g_strdup(data); { char *c = strchr (charset, '?'); char *q = g_malloc (c - charset + 1); char *cook_2 = cooked_data; int cook_len = strlen (cook_2); int b_len = 4096; unicode_iconv_t i; strncpy (q, charset, c - charset); q[c - charset] = 0; i = unicode_iconv_open (into_what, q); if (!i) { g_free (q); return g_strdup (buffer); } if (unicode_iconv (i, &cook_2, &cook_len, &b, &b_len)==-1) /* FIXME : use approximation code if we can't convert it properly. */ ; unicode_iconv_close (i); *b = 0; } return g_strdup (buffer); } static const gchar * find_end_of_encoded_word (const gchar *data) { /* We can't just search for ?=, because of the case : "=?charset?q?=ff?=" :( */ if (!data) return NULL; data = strstr (data, "=?"); if (!data) return NULL; data = strchr(data+2, '?'); if (!data) return NULL; data = strchr (data+1, '?'); if (!data) return NULL; data = strstr (data+1, "?="); if (!data) return NULL; return data + 2; } gchar * gmime_rfc2047_decode (const gchar *data, const gchar *into_what) { char *buffer = malloc (strlen(data) * 4), *b = buffer; int was_encoded_word = 0; build_base64_rank_table (); while (data && *data) { char *word_start = strstr (data, "=?"), *decoded; if (!word_start) { strcpy (b, data); b[strlen (data)] = 0; return buffer; } if (word_start != data) { if (strspn (data, " \t\n\r") != (word_start - data)) { strncpy (b, data, word_start - data); b += word_start - data; *b = 0; } } decoded = rfc2047_decode_word (word_start, into_what); strcpy (b, decoded); b += strlen (decoded); *b = 0; g_free (decoded); data = find_end_of_encoded_word (data); } *b = 0; return buffer; } #define isnt_ascii(a) ((a) <= 0x1f || (a) >= 0x7f) static int rfc2047_clean (const gchar *string, const gchar *max) { // if (strstr (string, "?=")) return 1; while (string < max) { if (isnt_ascii ((unsigned char)*string)) return 0; string++; } return 1; } static gchar * encode_word (const gchar *string, int length, const gchar *said_charset) { const gchar *max = string + length; if (rfc2047_clean(string, max)) { /* don't bother encoding it if it has no odd characters in it */ return g_strndup (string, length); } { char *temp = malloc (length * 4 + 1), *t = temp; t += sprintf (t, "=?%s?q?", said_charset); while (string < max) { if (*string == ' ') *(t++) = '_'; else if ((*string <= 0x1f) || (*string >= 0x7f) || (*string == '=') || (*string == '?')) t += sprintf (t, "=%2x", (unsigned char)*string); else *(t++) = *string; string++; } t += sprintf (t, "?="); *t = 0; return temp; } } static int words_in(char *a) { int words = 1; while (*a) { if (*(a++)==' ') words++; } return words; } struct word_data { const char *word; int word_length; const char *to_encode_in; char *encoded; enum { wt_None, wt_Address, } type; }; static int string_can_fit_in(const char *a, int length, const char *charset) { while (length--) { if (*a < 0x1f || *a >= 0x7f) return 0; a++; } return 1; } static void show_entry(struct word_data *a) { a->type = wt_None; if (string_can_fit_in(a->word, a->word_length, "US-ASCII")) a->to_encode_in = "US-ASCII"; if (a->word[0]=='<' && a->word[a->word_length-1]=='>') { a->type = wt_Address; } } static void break_into_words(const char *string, struct word_data *a, int words) { int i; for (i=0;i0;i--) { if (a[i].to_encode_in == a[i-1].to_encode_in) { a[i-1].word_length += 1 + a[i].word_length; a[i].word = 0; a[i].word_length = 0; } } } static void show_words(struct word_data *words, int count) { int i; for (i=0;i and the comment starter/ender ) identify the best character set for each word merge words which share a character set, allow jumping and merging with words which would be ok to encode in non-US-ASCII. if we have to use 2 character sets, try and collapse them into one. (e.g. if one word contains letters in latin-1, and another letters in latin-2, use latin-2 for the first word as well if possible). finally : if utf-8 will still be used, use it for everything. and then, at last, generate the encoded text, using base64/quoted-printable for each word depending upon which is more efficient. TODO : create a priority list of encodings i.e. US-ASCII, ISO-8859-1, ISO-8859-2, ISO-8859-3, KOI8, Should survey for most popular charsets : what do people usually use for the following scripts? * Chinese/Japanese/Korean * Greek * Cyrillic (any other scripts commonly used in mail/news?) This algorithm is probably far from optimal, but should be reasonably efficient for simple cases. (and almost free if the text is just in US-ASCII : like 99% of the text that will pass through it) current status : Algorithm now partially implemented. */ g_free(words); g_free(temp); return encoded; }