diff options
author | piaip <piaip@63ad8ddf-47c3-0310-b6dd-a9e9d9715204> | 2014-08-08 01:49:06 +0800 |
---|---|---|
committer | piaip <piaip@63ad8ddf-47c3-0310-b6dd-a9e9d9715204> | 2014-08-08 01:49:06 +0800 |
commit | 589c0a1b1657e0df72c349bab5d64852f963d66e (patch) | |
tree | 26986942e61738cd9c1f60e84baaefa6c74bca8b | |
parent | baa910a873b24bd5835a59bc41da2470b0af6a36 (diff) | |
download | pttbbs-589c0a1b1657e0df72c349bab5d64852f963d66e.tar pttbbs-589c0a1b1657e0df72c349bab5d64852f963d66e.tar.gz pttbbs-589c0a1b1657e0df72c349bab5d64852f963d66e.tar.bz2 pttbbs-589c0a1b1657e0df72c349bab5d64852f963d66e.tar.lz pttbbs-589c0a1b1657e0df72c349bab5d64852f963d66e.tar.xz pttbbs-589c0a1b1657e0df72c349bab5d64852f963d66e.tar.zst pttbbs-589c0a1b1657e0df72c349bab5d64852f963d66e.zip |
BBS Legacy Post File Parser
git-svn-id: http://opensvn.csie.org/pttbbs/trunk@6042 63ad8ddf-47c3-0310-b6dd-a9e9d9715204
-rwxr-xr-x | pttbbs/util/pyutil/pttpost.py | 108 |
1 files changed, 108 insertions, 0 deletions
#!/usr/bin/env python
# encoding=latin1

# pttbbs/util/pyutil/pttpost.py (new file, mode 100755)
# BBS Legacy Post File Parser.

# This file is using latin1 with some Big5 literals, to prevent parsing files
# with non-standard Big5 data (ex, Big5-UAO or escape sequence between DBCS).
# NOTE(review): the non-ASCII literals below are shown as decoded characters;
# presumably the on-disk file stores them as raw Big5 bytes -- confirm before
# re-saving this file in another encoding.

import re
import sys

import big5

# Prefixes of the first header line for the two known header layouts.
STR_AUTHOR1 = "作者:"
STR_AUTHOR2 = "發信人:"


def ANSI_COLOR(*codes):
    """Builds an ANSI escape sequence of the form ESC "[" codes "m".

    Args:
        *codes: attribute codes; each is converted with str() and the results
            are joined with ';'. With no codes the result is ESC "[m".

    Returns:
        The escape sequence as a string.
    """
    return "\x1b[" + ';'.join(map(str, codes)) + 'm'


ANSI_RESET = ANSI_COLOR()

# Comments format: CommentsPrefix ANSI_COLOR(33) [AUTHOR]
#                  ANSI_RESET ANSI_COLOR(33) ":" [CONTENT]
#                  ANSI_RESET [trailings]
CommentsPrefixes = (
    ANSI_COLOR(1, 37) + r'推 ',
    ANSI_COLOR(1, 31) + r'噓 ',
    # Also known as <OLDRECOMMEND>, shared by P1 and P2.
    ANSI_COLOR(1, 31) + r'→ ')

# Groups: (1) prefix, (2) author, (3) content, (4) trailing text.
CommentsFormatRe = (
    '(' + '|'.join(map(re.escape, CommentsPrefixes)) + ')' +
    re.escape(ANSI_COLOR(33)) + '([^\x1b]*)' + re.escape(ANSI_RESET) +
    re.escape(ANSI_COLOR(33) + ':') + '([^\x1b]*)' +
    re.escape(ANSI_RESET) + '(.*)')


# format: "※ " ANSI_COLOR(1;32) "%s" ANSI_COLOR(0;32) ":轉錄至" %s
def IsCrossPostLog(buf):
    """Tells whether a line is a cross-post log entry.

    Args:
        buf: one line of the post file.

    Returns:
        True when buf starts with the cross-post prefix and the cross-post
        marker occurs later in the line, False otherwise.
    """
    # str.find() returns -1 when the marker is missing, so a plain line that
    # merely starts with the prefix yields False instead of raising.
    return (buf.startswith("※ " + ANSI_COLOR(1, 32)) and
            buf.find(ANSI_COLOR(0, 32) + ':轉錄至') > 0)


def ParseComment(buf):
    """Parses a buffer for known comment formats.

    Args:
        buf: one line of the post file.

    Returns:
        (kind, author, content, trailing), each passed through big5.decode.
        kind is the 1-based position of the matched prefix in
        CommentsPrefixes, as a string. content has trailing spaces removed
        and trailing has trailing newlines removed. When buf holds no
        comment, (None, None, None, None) is returned.
    """
    invalid = (None, None, None, None)
    match = re.search(CommentsFormatRe, buf)
    if match is None:
        return invalid
    (kind, author, content, trailing) = match.groups()
    # Always hand back a tuple so callers can index the result the same way
    # as the "invalid" value above.
    return tuple(map(big5.decode, (str(CommentsPrefixes.index(kind) + 1),
                                   author, content.rstrip(' '),
                                   trailing.rstrip('\n'))))


def ParsePost(filename):
    """Splits a legacy post file into two parts.

    Args:
        filename: path of the post file to read.

    Returns:
        (body, comments): body is the main content without header, and
        comments is a list to hold parsed comments. Comments are collected
        from the end of the file upwards, so the list is in reverse file
        order. Cross-post log lines found among the trailing comments are
        dropped.
    """
    comments = []
    with open(filename) as f:
        contents = f.readlines()

    # Now, try to skip header.
    if not contents:
        return ('', comments)
    author = contents[0]
    if author.startswith(STR_AUTHOR1):
        max_lines = 4
    elif author.startswith(STR_AUTHOR2):
        max_lines = 5
    else:
        # Unknown first line: treat the file as having no header.
        max_lines = 0

    # Skip until empty line is seen or max lines reached.
    while contents and max_lines > 0:
        max_lines -= 1
        if contents.pop(0) == '\n':
            break

    # Remove trailing comments.
    while contents:
        if IsCrossPostLog(contents[-1]):
            contents.pop()
            continue
        result = ParseComment(contents[-1])
        if result[0] is None:
            # First line (from the bottom) that is not a comment: the rest
            # is the body.
            break
        comments.append(result)
        contents.pop()

    # here's the content.
    return (''.join(contents), comments)


def main(argv):
    """Parses the file named by argv[0] (default 'sample') and prints it.

    Args:
        argv: command line arguments without the program name.
    """
    if not argv:
        filename = 'sample'
    else:
        filename = argv[0]
    # Parenthesised single argument: same output under the Python 2 print
    # statement, and valid syntax on Python 3.
    print(ParsePost(filename))


if __name__ == '__main__':
    main(sys.argv[1:])