From 9b60cad3dc13970bb0b4562cf4f9b38f3edc46db Mon Sep 17 00:00:00 2001
From: Not Zed <NotZed@Ximian.com>
Date: Mon, 13 Jan 2003 05:46:35 +0000
Subject: Read the characters as utf8, rather than as 8 bit bytes. Remove the PRESERVE_8BIT flag.

2003-01-13  Not Zed  <NotZed@Ximian.com>

        * camel-mime-filter-tohtml.c (writeln): Read the characters as
        utf8, rather than as 8 bit bytes.  Remove the PRESERVE_8BIT as it
        has no meaning.  Also change the default logic slightly so that 8
        bit or greater characters are properly converted to entities.

        * camel-utf8.c (camel_utf8_getc_limit): new function, gets a utf8
        char, bounded by an end pointer.

svn path=/trunk/; revision=19421
---
 camel/ChangeLog                  | 10 +++++++
 camel/camel-mime-filter-tohtml.c | 38 +++++++++++++++------------
 camel/camel-utf8.c               | 56 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 87 insertions(+), 17 deletions(-)

(limited to 'camel')

diff --git a/camel/ChangeLog b/camel/ChangeLog
index f9c2ce1c3f..916e4b6f70 100644
--- a/camel/ChangeLog
+++ b/camel/ChangeLog
@@ -1,3 +1,13 @@
+2003-01-13  Not Zed  <NotZed@Ximian.com>
+
+	* camel-mime-filter-tohtml.c (writeln): Read the characters as
+	utf8, rather than as 8 bit bytes.  Remove the PRESERVE_8BIT as it
+	has no meaning.  Also change the default logic slightly so that 8
+	bit or greater characters are properly converted to entities.
+	
+	* camel-utf8.c (camel_utf8_getc_limit): new function, gets a utf8
+	char, bounded by an end pointer.
+
 2003-01-07  Dan Winship  <danw@ximian.com>
 
 	* camel-provider.h (CamelProvider): add a "translation_domain"
diff --git a/camel/camel-mime-filter-tohtml.c b/camel/camel-mime-filter-tohtml.c
index 4f9d972625..370d9c6c4e 100644
--- a/camel/camel-mime-filter-tohtml.c
+++ b/camel/camel-mime-filter-tohtml.c
@@ -28,6 +28,7 @@
 #include <stdio.h>
 #include <string.h>
 
+#include "camel-utf8.h"
 #include "camel-url-scanner.h"
 #include "camel-mime-filter-tohtml.h"
 
@@ -147,14 +148,18 @@ static char *
 writeln (CamelMimeFilter *filter, const char *in, const char *inend, char *outptr, char **outend)
 {
 	CamelMimeFilterToHTML *html = (CamelMimeFilterToHTML *) filter;
-	register const char *inptr = in;
-	
+	const char *inptr = in;
+
 	while (inptr < inend) {
-		unsigned char u;
-		
-		outptr = check_size (filter, outptr, outend, 9);
-		
-		switch ((u = (unsigned char) *inptr++)) {
+		guint32 u;
+
+		outptr = check_size (filter, outptr, outend, 16);
+
+		u = camel_utf8_getc_limit(&inptr, inend);
+		switch (u) {
+		case 0xffff:
+			g_warning("Truncated utf8 buffer");
+			return outptr;
 		case '<':
 			outptr = g_stpcpy (outptr, "&lt;");
 			html->column++;
@@ -182,22 +187,21 @@ writeln (CamelMimeFilter *filter, const char *in, const char *inend, char *outpt
 			}
 			/* otherwise, FALL THROUGH */
 		case ' ':
-			if (html->flags & CAMEL_MIME_FILTER_TOHTML_CONVERT_SPACES) {
-				if (inptr == (in + 1) || *inptr == ' ' || *inptr == '\t') {
-					outptr = g_stpcpy (outptr, "&nbsp;");
-					html->column++;
-					break;
-				}
+			if (html->flags & CAMEL_MIME_FILTER_TOHTML_CONVERT_SPACES
+			    && ((inptr == (in + 1) || *inptr == ' ' || *inptr == '\t'))) {
+				outptr = g_stpcpy (outptr, "&nbsp;");
+				html->column++;
+				break;
 			}
 			/* otherwise, FALL THROUGH */
 		default:
-			if (!(u >= 0x20 && u < 0x80) && !(html->flags & CAMEL_MIME_FILTER_TOHTML_PRESERVE_8BIT)) {
+			if (u >= 20 && u <0x80)
+				*outptr++ = u;
+			else {
 				if (html->flags & CAMEL_MIME_FILTER_TOHTML_ESCAPE_8BIT)
 					*outptr++ = '?';
 				else
-					outptr += g_snprintf (outptr, 9, "&#%d;", (int) u);
-			} else {
-				*outptr++ = (char) u;
+					outptr += sprintf(outptr, "&#%u;", u);
 			}
 			html->column++;
 			break;
diff --git a/camel/camel-utf8.c b/camel/camel-utf8.c
index 5ed5a476d0..3c7af65b4d 100644
--- a/camel/camel-utf8.c
+++ b/camel/camel-utf8.c
@@ -83,6 +83,62 @@ loop:
 	return v;
 }
 
+/**
+ * camel_utf8_getc_limit:
+ * @ptr: in/out read position; moved past the decoded char on success.
+ * @end: one past the last readable byte; must not be NULL.
+ *
+ * Get the next utf8 char at @ptr, and return it, advancing @ptr to
+ * the next character.  If @end is reached before a full utf8
+ * character can be read, then the invalid Unicode char 0xffff is
+ * returned as a sentinel (Unicode 3.1, section 2.7), and @ptr is not
+ * advanced.  Bytes >= 0xf8 are skipped without being decoded.
+ *
+ * Return value: The next utf8 char, or 0xffff.
+ **/
+guint32
+camel_utf8_getc_limit(const unsigned char **ptr, const unsigned char *end)
+{
+	register unsigned char *p = (unsigned char *)*ptr; /* local cursor; *ptr is only written on success */
+	register unsigned char c, r;
+	register guint32 v = 0xffff, m;
+
+again:
+	while (p < end) {
+		r = *p++; /* candidate lead byte */
+loop:
+		if (r < 0x80) { /* single-byte (7 bit) character */
+			*ptr = p;
+			return r;
+		} else if (r < 0xf8) { /* valid start char? (max 4 octets); NOTE(review): 0x80-0xbf also pass this test - confirm intended */
+			v = r;
+			m = 0x7f80;	/* used to mask out the length bits */
+			do {
+				if (p >= end)
+					return 0xffff; /* truncated sequence: *ptr left untouched */
+
+				c = *p++;
+				if ((c & 0xc0) != 0x80) { /* not a continuation byte: restart with c as the new lead byte */
+					r = c;
+					goto loop;
+				}
+				v = (v<<6) | (c & 0x3f); /* append the 6 payload bits */
+				r<<=1; /* move the next length bit of the lead byte into 0x40 */
+				m<<=5; /* keep the mask aligned with the length bits as v grows */
+			} while (r & 0x40); /* bit set: another continuation byte is expected */
+		
+			*ptr = p;
+			
+			v &= ~m; /* strip the lead byte's length bits from the result */
+			return v;
+		} else {
+			goto again; /* 0xf8..0xff: skip this byte and try the next one */
+		}
+	}
+
+	return 0xffff; /* ran out of input before a character started */
+}
+
 void
 g_string_append_u(GString *out, guint32 c)
 {
-- 
cgit v1.2.3