diff options
author | piaip <piaip@63ad8ddf-47c3-0310-b6dd-a9e9d9715204> | 2011-04-16 19:35:31 +0800 |
---|---|---|
committer | piaip <piaip@63ad8ddf-47c3-0310-b6dd-a9e9d9715204> | 2011-04-16 19:35:31 +0800 |
commit | 963c6db0a46905ef07f7b15fc525c755699b5bc5 (patch) | |
tree | af26bf07048394fe98e748ca6f329de5c38388d9 | |
parent | e66e0ef2801a2660d4b64e0e33552bfe5c02de82 (diff) | |
download | pttbbs-963c6db0a46905ef07f7b15fc525c755699b5bc5.tar pttbbs-963c6db0a46905ef07f7b15fc525c755699b5bc5.tar.gz pttbbs-963c6db0a46905ef07f7b15fc525c755699b5bc5.tar.bz2 pttbbs-963c6db0a46905ef07f7b15fc525c755699b5bc5.tar.lz pttbbs-963c6db0a46905ef07f7b15fc525c755699b5bc5.tar.xz pttbbs-963c6db0a46905ef07f7b15fc525c755699b5bc5.tar.zst pttbbs-963c6db0a46905ef07f7b15fc525c755699b5bc5.zip |
add utf8 and big5 charset processing
git-svn-id: http://opensvn.csie.org/pttbbs/trunk@5330 63ad8ddf-47c3-0310-b6dd-a9e9d9715204
-rw-r--r-- | pttbbs/common/sys/Makefile | 8 | ||||
-rwxr-xr-x | pttbbs/common/sys/big5_gen.py | 33 | ||||
-rw-r--r-- | pttbbs/common/sys/uao250.zip | bin | 0 -> 314736 bytes | |||
-rw-r--r-- | pttbbs/common/sys/utf8.c | 108 |
4 files changed, 148 insertions, 1 deletions
diff --git a/pttbbs/common/sys/Makefile b/pttbbs/common/sys/Makefile index bc07bbdd..dd4ca747 100644 --- a/pttbbs/common/sys/Makefile +++ b/pttbbs/common/sys/Makefile @@ -4,11 +4,17 @@ SRCROOT= ../.. .include "$(SRCROOT)/pttbbs.mk" SRCS:= daemon.c file.c lock.c log.c net.c sort.c string.c time.c \ - crypt.c record.c vector.c telnet.c vbuf.c vtkbd.c + crypt.c record.c vector.c telnet.c vbuf.c vtkbd.c utf8.c big5.c + LIB:= cmsys all: .depend +big5.c: uao250.zip big5_gen.py + unzip uao250.zip + ./big5_gen.py > big5.c + rm -f uao*.txt + install: .include <bsd.lib.mk> diff --git a/pttbbs/common/sys/big5_gen.py b/pttbbs/common/sys/big5_gen.py new file mode 100755 index 00000000..f34ef697 --- /dev/null +++ b/pttbbs/common/sys/big5_gen.py @@ -0,0 +1,33 @@ +#!/usr/bin/env python + +# b2u +b2u = open('uao250-b2u.txt', 'r').readlines() +b2u = [line.strip().split(' ') + for line in b2u + if line.strip().startswith('0x')] +b2u = dict((int(b, 0), int(u, 0)) for (b, u) in b2u) + +print """#include <stdint.h> +extern const uint16_t const b2u_table[]; +extern const uint16_t const u2b_table[]; +""" +print "const uint16_t const b2u_table[0x10000] = {" +for i in range(0x10000): + print '0x%04x,' % (i if i not in b2u else b2u[i]), + if i % 10 == 9: + print '' +print "};\n" + +# u2b +u2b = open('uao250-u2b.txt', 'r').readlines() +u2b = [line.strip().split(' ') + for line in u2b + if line.strip().startswith('0x')] +u2b = dict((int(u, 0), int(b, 0)) for (b, u) in u2b) + +print "const uint16_t const u2b_table[0x10000] = {" +for i in range(0x10000): + print '0x%04x,' % (i if i not in u2b else u2b[i]), + if i % 10 == 9: + print '' +print "};\n" diff --git a/pttbbs/common/sys/uao250.zip b/pttbbs/common/sys/uao250.zip Binary files differnew file mode 100644 index 00000000..0529fc8a --- /dev/null +++ b/pttbbs/common/sys/uao250.zip diff --git a/pttbbs/common/sys/utf8.c b/pttbbs/common/sys/utf8.c new file mode 100644 index 00000000..795f8c8f --- /dev/null +++ b/pttbbs/common/sys/utf8.c @@ -0,0 
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <stdint.h>

/*
 * Returns non-zero when `b` has the form 10xxxxxx, i.e. it is a UTF-8
 * continuation byte.  A NUL terminator never matches, so checking bytes
 * left to right stops before reading past the end of a C string.
 */
static int utf8_is_continuation(uint8_t b) {
    return (b & 0xC0) == 0x80;
}

/*
 * Encode one UCS-2 code unit as UTF-8.
 *
 * ucs2: the 16-bit value to encode.  No special treatment is given to the
 *       surrogate range; every value is encoded by the bit pattern alone.
 * utf8: output buffer; the caller must provide room for at least 3 bytes.
 *       No terminator is written.
 *
 * Returns the number of bytes stored in `utf8` (1, 2 or 3).
 */
int ucs2utf(uint16_t ucs2, uint8_t *utf8) {
    // (1) 00000000 0xxxxxxx -> 0xxxxxxx
    if (ucs2 < 0x80) {
        utf8[0] = (uint8_t)ucs2;
        return 1;
    }

    // (2) 00000yyy yyxxxxxx -> 110yyyyy 10xxxxxx
    if (ucs2 < 0x800) {
        utf8[0] = (uint8_t)(0xC0 | (ucs2 >> 6));
        utf8[1] = (uint8_t)(0x80 | (ucs2 & 0x3F));
        return 2;
    }

    // (3) zzzzyyyy yyxxxxxx -> 1110zzzz 10yyyyyy 10xxxxxx
    utf8[0] = (uint8_t)(0xE0 | (ucs2 >> 12));
    utf8[1] = (uint8_t)(0x80 | ((ucs2 >> 6) & 0x3F));
    utf8[2] = (uint8_t)(0x80 | (ucs2 & 0x3F));
    return 3;
}

/*
 * Decode one UTF-8 sequence into a UCS-2 code unit.
 *
 * utf8: pointer to the first byte of the sequence.  The buffer must be
 *       NUL-terminated (or otherwise hold a complete sequence), because the
 *       decoder looks at up to two bytes after the lead byte.
 * pucs: receives the decoded value, or '?' when the input cannot be decoded.
 *
 * Returns the number of input bytes consumed: 1, 2 or 3 for a decoded
 * sequence, and 1 for anything that cannot be decoded, so that the caller
 * always makes progress and resynchronizes on the next byte.
 *
 * Input that cannot be decoded:
 *   - a stray continuation byte (10xxxxxx) in lead position;
 *   - a lead byte of a 4-byte or longer sequence (1111xxxx), whose code
 *     point does not fit in 16 bits;
 *   - a lead byte not followed by the required continuation bytes, which
 *     includes a sequence truncated by the terminating NUL.
 */
int utf2ucs(uint8_t *utf8, uint16_t *pucs) {
    uint16_t c = *utf8++;

    // (1) 0xxxxxxx
    if ((c & 0x80) == 0) {
        *pucs = c;
        return 1;
    }

    // (2) 110yyyyy 10xxxxxx
    if ((c & 0xE0) == 0xC0 && utf8_is_continuation(utf8[0])) {
        *pucs = (uint16_t)(((c & 0x1F) << 6) | (utf8[0] & 0x3F));
        return 2;
    }

    // (3) 1110zzzz 10yyyyyy 10xxxxxx
    // utf8[1] is only examined once utf8[0] is known to be a (non-NUL)
    // continuation byte, so a truncated string is never over-read.
    if ((c & 0xF0) == 0xE0 &&
        utf8_is_continuation(utf8[0]) &&
        utf8_is_continuation(utf8[1])) {
        *pucs = (uint16_t)(((c & 0x0F) << 12) |
                           ((utf8[0] & 0x3F) << 6) |
                           (utf8[1] & 0x3F));
        return 3;
    }

    // unknown, unsupported or malformed sequence
    *pucs = '?';
    return 1;
}

#ifdef _TEST_MAIN_

/*
 * Format `c` as eight '0'/'1' characters, most significant bit first.
 * The result lives in a static buffer that is overwritten by the next call.
 */
const char * print_bits(uint8_t c) {
    static char bits[9] = {0};
    int i;
    for (i = 0; i < 8; i++)
        bits[i] = ((c >> (7-i)) & 0x01) ? '1' : '0';
    return bits;
}

/* Print `len` bytes starting at `bytes` as space-separated uppercase hex. */
void print_bytes(uint8_t *bytes, int len) {
    while (len-- > 0)
        printf("%02X ", *bytes++);
}

/*
 * Round-trip the three sample characters ('$', cent sign, euro sign; 1, 2
 * and 3 bytes long) through utf2ucs() and ucs2utf() and report the results
 * on stdout.
 */
void wikipedia_test(void) {
    uint16_t ucs[] = {
        0x24, 0xA2, 0x20AC, 0
    };
    // Each row is zero-padded to 4 bytes by the initializer.
    uint8_t utf[][4] = {
        {0x24, 0},
        {0xC2, 0xA2, 0},
        {0xE2, 0x82, 0xAC, 0},
    };
    const int tests = (int)(sizeof(utf) / sizeof(utf[0]));
    uint8_t t[4];
    int len, i;

    for (i = 0; i < tests; i++) {
        uint16_t got;
        int used = utf2ucs(utf[i], &got);
        // The i-th sample is exactly i+1 bytes long.
        if (got != ucs[i] || used != i + 1) {
            printf("wikipedia_test utf2ucs: failed in %04X (got %04X)\n",
                   (unsigned)ucs[i], (unsigned)got);
        } else {
            printf("wikipedia_test utf2ucs: passed %04X\n", (unsigned)ucs[i]);
        }
        memset(t, 0, sizeof(t));
        len = ucs2utf(ucs[i], t);
        // Both buffers are 4 zero-padded bytes, so compare them whole;
        // memcmp also avoids passing uint8_t* where char* is expected.
        if (len != i + 1 || memcmp(t, utf[i], sizeof(t)) != 0) {
            printf("wikipedia_test ucs2utf: failed in %04X (got %d)\n",
                   (unsigned)ucs[i], len);
            print_bytes(utf[i], 4);
            printf("\n");
            print_bytes(t, len);
            printf("\n");
        } else {
            printf("wikipedia_test ucs2utf: passed %04X\n", (unsigned)ucs[i]);
        }
    }
}

int main(int argc, char *argv[]) {
    (void)argc;
    (void)argv;
    wikipedia_test();
    return 0;
}
#endif