summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: piaip <piaip@63ad8ddf-47c3-0310-b6dd-a9e9d9715204> 2011-04-16 19:35:31 +0800
committer: piaip <piaip@63ad8ddf-47c3-0310-b6dd-a9e9d9715204> 2011-04-16 19:35:31 +0800
commit: 963c6db0a46905ef07f7b15fc525c755699b5bc5 (patch)
tree: af26bf07048394fe98e748ca6f329de5c38388d9
parent: e66e0ef2801a2660d4b64e0e33552bfe5c02de82 (diff)
download: pttbbs-963c6db0a46905ef07f7b15fc525c755699b5bc5.tar
pttbbs-963c6db0a46905ef07f7b15fc525c755699b5bc5.tar.gz
pttbbs-963c6db0a46905ef07f7b15fc525c755699b5bc5.tar.bz2
pttbbs-963c6db0a46905ef07f7b15fc525c755699b5bc5.tar.lz
pttbbs-963c6db0a46905ef07f7b15fc525c755699b5bc5.tar.xz
pttbbs-963c6db0a46905ef07f7b15fc525c755699b5bc5.tar.zst
pttbbs-963c6db0a46905ef07f7b15fc525c755699b5bc5.zip
add utf8 and big5 charset processing
git-svn-id: http://opensvn.csie.org/pttbbs/trunk@5330 63ad8ddf-47c3-0310-b6dd-a9e9d9715204
-rw-r--r-- pttbbs/common/sys/Makefile 8
-rwxr-xr-x pttbbs/common/sys/big5_gen.py 33
-rw-r--r-- pttbbs/common/sys/uao250.zip bin 0 -> 314736 bytes
-rw-r--r-- pttbbs/common/sys/utf8.c 108
4 files changed, 148 insertions, 1 deletions
diff --git a/pttbbs/common/sys/Makefile b/pttbbs/common/sys/Makefile
index bc07bbdd..dd4ca747 100644
--- a/pttbbs/common/sys/Makefile
+++ b/pttbbs/common/sys/Makefile
@@ -4,11 +4,17 @@ SRCROOT= ../..
.include "$(SRCROOT)/pttbbs.mk"
SRCS:= daemon.c file.c lock.c log.c net.c sort.c string.c time.c \
- crypt.c record.c vector.c telnet.c vbuf.c vtkbd.c
+ crypt.c record.c vector.c telnet.c vbuf.c vtkbd.c utf8.c big5.c
+
LIB:= cmsys
all: .depend
# Regenerate big5.c from the mapping files packed in uao250.zip.
# - "unzip -o" overwrites leftover uao*.txt files instead of stopping to
#   prompt, so the rule also works in non-interactive builds.
# - The generator writes to a temporary file that is renamed only on
#   success; a failed run therefore never leaves a truncated big5.c that
#   make would consider up to date.
big5.c: uao250.zip big5_gen.py
	unzip -o uao250.zip
	./big5_gen.py > big5.c.tmp
	mv big5.c.tmp big5.c
	rm -f uao*.txt
+
install:
.include <bsd.lib.mk>
diff --git a/pttbbs/common/sys/big5_gen.py b/pttbbs/common/sys/big5_gen.py
new file mode 100755
index 00000000..f34ef697
--- /dev/null
+++ b/pttbbs/common/sys/big5_gen.py
@@ -0,0 +1,33 @@
#!/usr/bin/env python
"""Generate the C source for the Big5 <-> UCS-2 lookup tables.

Reads ``uao250-b2u.txt`` and ``uao250-u2b.txt`` from the current directory
and writes two 0x10000-entry ``uint16_t`` arrays (``b2u_table`` and
``u2b_table``) to standard output.  Code points without a mapping translate
to themselves.  The script runs unchanged on Python 2 and Python 3.
"""

import sys

TABLE_SIZE = 0x10000        # one slot for every 16-bit code
ENTRIES_PER_LINE = 10       # values emitted per line of generated C


def load_pairs(path):
    """Return the list of (first, second) integer pairs found in ``path``.

    Only lines whose first field starts with ``0x`` are data lines; anything
    else (comments, blank lines) is skipped.  Fields are separated by
    arbitrary whitespace and fields after the second are ignored.
    """
    pairs = []
    with open(path, 'r') as handle:
        for line in handle:
            fields = line.split()
            if len(fields) < 2 or not fields[0].startswith('0x'):
                continue
            # Base 0 lets int() honour the 0x prefix.
            pairs.append((int(fields[0], 0), int(fields[1], 0)))
    return pairs


def render_table(name, mapping, size=TABLE_SIZE):
    """Return the C definition ``const uint16_t name[size] = {...};``.

    ``mapping`` is a dict from index to value; indices missing from it are
    emitted as their own value (identity mapping).
    """
    rows = []
    for start in range(0, size, ENTRIES_PER_LINE):
        stop = min(start + ENTRIES_PER_LINE, size)
        rows.append(' '.join('0x%04x,' % mapping.get(i, i)
                             for i in range(start, stop)))
    return 'const uint16_t %s[0x%x] = {\n%s\n};\n' % (name, size,
                                                      '\n'.join(rows))


def main():
    """Load both mapping files and print the generated C source."""
    # Both files are read as "<big5> <unicode>" pairs; the reverse table is
    # keyed by the second column.
    b2u = dict(load_pairs('uao250-b2u.txt'))
    u2b = dict((u, b) for (b, u) in load_pairs('uao250-u2b.txt'))

    emit = sys.stdout.write
    emit('#include <stdint.h>\n'
         'extern const uint16_t b2u_table[];\n'
         'extern const uint16_t u2b_table[];\n\n')
    emit(render_table('b2u_table', b2u) + '\n')
    emit(render_table('u2b_table', u2b) + '\n')


if __name__ == '__main__':
    main()
diff --git a/pttbbs/common/sys/uao250.zip b/pttbbs/common/sys/uao250.zip
new file mode 100644
index 00000000..0529fc8a
--- /dev/null
+++ b/pttbbs/common/sys/uao250.zip
Binary files differ
diff --git a/pttbbs/common/sys/utf8.c b/pttbbs/common/sys/utf8.c
new file mode 100644
index 00000000..795f8c8f
--- /dev/null
+++ b/pttbbs/common/sys/utf8.c
@@ -0,0 +1,108 @@
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+#include <stdint.h>
+
/*
 * Encode one 16-bit UCS-2 value as UTF-8.
 *
 * ucs2: value to encode; it is encoded as-is, with no special handling
 *       of the surrogate range.
 * utf8: destination buffer.  The caller must supply room for up to three
 *       bytes; no terminating NUL is written.
 *
 * Returns the number of bytes stored: 1, 2 or 3.
 */
int ucs2utf(uint16_t ucs2, uint8_t *utf8) {
    if (ucs2 < 0x80) {
        // 1 byte:  0xxxxxxx
        utf8[0] = (uint8_t)ucs2;
        return 1;
    }

    if (ucs2 < 0x800) {
        // 2 bytes: 00000yyy yyxxxxxx -> 110yyyyy 10xxxxxx
        utf8[0] = (uint8_t)(0xC0 | (ucs2 >> 6));
        utf8[1] = (uint8_t)(0x80 | (ucs2 & 0x3F));
        return 2;
    }

    // 3 bytes: zzzzyyyy yyxxxxxx -> 1110zzzz 10yyyyyy 10xxxxxx
    utf8[0] = (uint8_t)(0xE0 | (ucs2 >> 12));
    utf8[1] = (uint8_t)(0x80 | ((ucs2 >> 6) & 0x3F));
    utf8[2] = (uint8_t)(0x80 | (ucs2 & 0x3F));
    return 3;
}
+
/*
 * Decode the first UTF-8 sequence at utf8 into a 16-bit UCS-2 value.
 *
 * utf8: input bytes.  The buffer must be NUL-terminated (or otherwise
 *       have a readable byte after every non-NUL byte): continuation
 *       bytes are inspected one at a time and decoding stops at the first
 *       byte that is not of the form 10xxxxxx, so the terminator is never
 *       read past.
 * pucs: receives the decoded value, or '?' when the input cannot be
 *       represented: a stray continuation byte, a truncated or malformed
 *       sequence, or a lead byte of a 4-byte (or longer) sequence, which
 *       does not fit in 16 bits.
 *
 * Returns the number of input bytes consumed: 1, 2 or 3.  Undecodable
 * input always consumes exactly one byte so the caller can resynchronise.
 */
int utf2ucs(uint8_t *utf8, uint16_t *pucs) {
    const uint8_t lead = utf8[0];

    // (1) 0xxxxxxx
    if (lead < 0x80) {
        *pucs = lead;
        return 1;
    }

    if ((lead & 0xE0) == 0xC0) {
        // (2) 110yyyyy 10xxxxxx
        if ((utf8[1] & 0xC0) == 0x80) {
            *pucs = (uint16_t)(((lead & 0x1F) << 6) | (utf8[1] & 0x3F));
            return 2;
        }
    } else if ((lead & 0xF0) == 0xE0) {
        // (3) 1110zzzz 10yyyyyy 10xxxxxx
        // && short-circuits: utf8[2] is only read once utf8[1] is known to
        // be a continuation byte, i.e. not the terminating NUL.
        if ((utf8[1] & 0xC0) == 0x80 && (utf8[2] & 0xC0) == 0x80) {
            *pucs = (uint16_t)(((lead & 0x0F) << 12) |
                               ((utf8[1] & 0x3F) << 6) |
                               (utf8[2] & 0x3F));
            return 3;
        }
    }

    // unknown or malformed character
    *pucs = '?';
    return 1;
}
+
+#ifdef _TEST_MAIN_
+
/*
 * Format c as eight '0'/'1' characters, most significant bit first.
 *
 * The result lives in a static buffer that is overwritten by the next
 * call, so copy it if it has to survive another print_bits().
 */
const char *print_bits(uint8_t c) {
    static char bits[9] = {0};  // 8 digits + NUL; the last byte stays 0
    char *out = bits;
    unsigned mask;

    for (mask = 0x80; mask != 0; mask >>= 1)
        *out++ = (c & mask) ? '1' : '0';
    return bits;
}
+
/*
 * Print the first len bytes of bytes to stdout as two-digit upper-case
 * hex values, each followed by a space.  Prints nothing when len <= 0.
 */
void print_bytes(uint8_t *bytes, int len) {
    int i;

    for (i = 0; i < len; i++)
        printf("%02X ", bytes[i]);
}
+
/*
 * Round-trip check over three sample characters: one each of the 1-, 2-
 * and 3-byte UTF-8 forms.  For every sample, utf2ucs() must return the
 * expected code point and consume the whole sequence, and ucs2utf() must
 * reproduce the exact bytes and length.  Results are printed to stdout.
 */
void wikipedia_test(void) {
    static const uint16_t ucs[] = {
        0x24, 0xA2, 0x20AC,
    };
    // Each row is zero-padded to 4 bytes, so it is both a NUL-terminated
    // string and directly comparable with the zeroed scratch buffer.
    uint8_t utf[][4] = {
        {0x24, 0},
        {0xC2, 0xA2, 0},
        {0xE2, 0x82, 0xAC, 0},
    };
    const int tests = (int)(sizeof(ucs) / sizeof(ucs[0]));
    int i;

    for (i = 0; i < tests; i++) {
        const int want_len = (int)strlen((const char *)utf[i]);
        uint8_t t[4] = {0};
        uint16_t got = 0;
        int len;

        len = utf2ucs(utf[i], &got);
        if (got != ucs[i] || len != want_len) {
            printf("wikipedia_test utf2ucs: failed in %04X (got %04X, len %d)\n",
                   ucs[i], got, len);
        } else {
            printf("wikipedia_test utf2ucs: passed %04X\n", ucs[i]);
        }

        len = ucs2utf(ucs[i], t);
        // memcmp over the full padded row: the buffers hold raw bytes, not
        // char strings, and this also catches stray bytes beyond len.
        if (len != want_len || memcmp(t, utf[i], sizeof(t)) != 0) {
            printf("wikipedia_test ucs2utf: failed in %04X (got %d)\n", ucs[i], len);
            print_bytes(utf[i], (int)sizeof(utf[i]));
            printf("\n");
            print_bytes(t, len);
            printf("\n");
        } else {
            printf("wikipedia_test ucs2utf: passed %04X\n", ucs[i]);
        }
    }
}
+
/* Entry point of the self-test build: runs the test and exits with 0. */
int main(int argc, char *argv[]) {
    // Command-line arguments are not used.
    (void)argc;
    (void)argv;

    wikipedia_test();
    return 0;
}
+#endif