BBS Legacy Post File Parser

git-svn-id: http://opensvn.csie.org/pttbbs/trunk@6042 63ad8ddf-47c3-0310-b6dd-a9e9d9715204
author: piaip <piaip@63ad8ddf-47c3-0310-b6dd-a9e9d9715204> 2014-08-08 01:49:06 +0800
committer: piaip <piaip@63ad8ddf-47c3-0310-b6dd-a9e9d9715204> 2014-08-08 01:49:06 +0800
commit: 589c0a1b1657e0df72c349bab5d64852f963d66e (patch)
tree: 26986942e61738cd9c1f60e84baaefa6c74bca8b
parent: baa910a873b24bd5835a59bc41da2470b0af6a36 (diff)
download: pttbbs-589c0a1b1657e0df72c349bab5d64852f963d66e.tar
pttbbs-589c0a1b1657e0df72c349bab5d64852f963d66e.tar.gz
pttbbs-589c0a1b1657e0df72c349bab5d64852f963d66e.tar.bz2
pttbbs-589c0a1b1657e0df72c349bab5d64852f963d66e.tar.lz
pttbbs-589c0a1b1657e0df72c349bab5d64852f963d66e.tar.xz
pttbbs-589c0a1b1657e0df72c349bab5d64852f963d66e.tar.zst
pttbbs-589c0a1b1657e0df72c349bab5d64852f963d66e.zip
1 files changed, 108 insertions, 0 deletions
diff --git a/pttbbs/util/pyutil/pttpost.py b/pttbbs/util/pyutil/pttpost.py
new file mode 100755
index 00000000..b0064183
--- /dev/null
+++ b/pttbbs/util/pyutil/pttpost.py
@@ -0,0 +1,108 @@
+#!/usr/bin/env python
+# encoding=latin1
+
+# This file is latin1-encoded with some Big5 literals, to avoid problems parsing
+# files with non-standard Big5 data (e.g. Big5-UAO or escape sequences between DBCS).
+
+import re
+import sys
+
+import big5
+
+STR_AUTHOR1 = "作者:"  # first-line prefix; ParsePost then skips up to 4 header lines
+STR_AUTHOR2 = "發信人:"  # first-line prefix; ParsePost then skips up to 5 header lines
+
+def ANSI_COLOR(*codes):  # Builds ESC "[" <codes joined by ";"> "m".
+    return "\x1b[%sm" % ';'.join([str(code) for code in codes])
+
+ANSI_RESET = ANSI_COLOR()
+
+# Comment line layout: CommentsPrefix ANSI_COLOR(33) [AUTHOR]
+#                      ANSI_RESET ANSI_COLOR(33) ":" [CONTENT]
+#                      ANSI_RESET [trailings]
+CommentsPrefixes = (
+    ANSI_COLOR(1,37) + r'推 ',
+    ANSI_COLOR(1,31) + r'噓 ',
+    # Also known as <OLDRECOMMEND>, shared by P1 and P2.
+    ANSI_COLOR(1,31) + r'→ ')
+
+CommentsFormatRe = ''.join((
+    '(', '|'.join(re.escape(prefix) for prefix in CommentsPrefixes), ')',
+    re.escape(ANSI_COLOR(33)), '([^\x1b]*)', re.escape(ANSI_RESET),
+    re.escape(ANSI_COLOR(33) + ':'), '([^\x1b]*)',
+    re.escape(ANSI_RESET), '(.*)'))
+
+def IsCrossPostLog(buf):
+    """Tells if buf is '※ ' ANSI_COLOR(1,32) %s ANSI_COLOR(0,32) ':轉錄至' %s."""
+    return (buf.startswith("※ " + ANSI_COLOR(1,32)) and
+            (ANSI_COLOR(0,32) + ':轉錄至') in buf)
+
+def ParseComment(buf):
+    """Extracts the first known-format comment found in buf.
+
+    Returns:
+        [kind, author, content, trailing], each passed through big5.decode,
+        where kind is the 1-based index of the matched prefix in
+        CommentsPrefixes as a string; (None, None, None, None) if no match.
+    """
+    found = re.search(CommentsFormatRe, buf)
+    if found is None:
+        return (None, None, None, None)
+    prefix, author, content, trailing = found.groups()
+    fields = (str(CommentsPrefixes.index(prefix) + 1), author,
+              content.rstrip(' '), trailing.rstrip('\n'))
+    return [big5.decode(field) for field in fields]
+
+def ParsePost(filename):
+    '''Splits a legacy post file into its body and trailing comments.
+
+    Returns:
+        (body, comments): body is the text left after dropping the header
+                          and the trailing comment / cross-post log lines;
+                          comments lists the parsed comments, last line first.
+    '''
+    comments = []
+    with open(filename) as post:
+        lines = post.readlines()
+    if not lines:
+        return ('', comments)
+
+    # The first line decides how many header lines may be dropped at most.
+    if lines[0].startswith(STR_AUTHOR1):
+        header_limit = 4
+    elif lines[0].startswith(STR_AUTHOR2):
+        header_limit = 5
+    else:
+        header_limit = 0
+
+    # Drop header lines up to and including the first empty line, or until
+    # the limit is reached, whichever comes first.
+    skipped = 0
+    while skipped < header_limit and skipped < len(lines):
+        skipped += 1
+        if lines[skipped - 1] == '\n':
+            break
+    del lines[:skipped]
+
+    # Peel cross-post logs and comments off the end of the post; stop at
+    # the first line (from the bottom) that is neither.
+    while lines:
+        if not IsCrossPostLog(lines[-1]):
+            parsed = ParseComment(lines[-1])
+            if parsed[0] is None:
+                break
+            comments.append(parsed)
+        lines.pop()
+
+    # Whatever remains is the body.
+    return (''.join(lines), comments)
+
+def main(argv):
+    """Prints the ParsePost result for argv[0], or for 'sample' if absent."""
+    target = 'sample'
+    if len(argv) != 0:
+        target = argv[0]
+    print ParsePost(target)
+
+if __name__ == '__main__':
+    main(sys.argv[1:])
author	piaip <piaip@63ad8ddf-47c3-0310-b6dd-a9e9d9715204>	2014-08-08 01:49:06 +0800
committer	piaip <piaip@63ad8ddf-47c3-0310-b6dd-a9e9d9715204>	2014-08-08 01:49:06 +0800
commit	589c0a1b1657e0df72c349bab5d64852f963d66e (patch)
tree	26986942e61738cd9c1f60e84baaefa6c74bca8b
parent	baa910a873b24bd5835a59bc41da2470b0af6a36 (diff)
download	pttbbs-589c0a1b1657e0df72c349bab5d64852f963d66e.tar pttbbs-589c0a1b1657e0df72c349bab5d64852f963d66e.tar.gz pttbbs-589c0a1b1657e0df72c349bab5d64852f963d66e.tar.bz2 pttbbs-589c0a1b1657e0df72c349bab5d64852f963d66e.tar.lz pttbbs-589c0a1b1657e0df72c349bab5d64852f963d66e.tar.xz pttbbs-589c0a1b1657e0df72c349bab5d64852f963d66e.tar.zst pttbbs-589c0a1b1657e0df72c349bab5d64852f963d66e.zip