summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorpiaip <piaip@63ad8ddf-47c3-0310-b6dd-a9e9d9715204>2014-08-08 01:49:06 +0800
committerpiaip <piaip@63ad8ddf-47c3-0310-b6dd-a9e9d9715204>2014-08-08 01:49:06 +0800
commit589c0a1b1657e0df72c349bab5d64852f963d66e (patch)
tree26986942e61738cd9c1f60e84baaefa6c74bca8b
parentbaa910a873b24bd5835a59bc41da2470b0af6a36 (diff)
downloadpttbbs-589c0a1b1657e0df72c349bab5d64852f963d66e.tar
pttbbs-589c0a1b1657e0df72c349bab5d64852f963d66e.tar.gz
pttbbs-589c0a1b1657e0df72c349bab5d64852f963d66e.tar.bz2
pttbbs-589c0a1b1657e0df72c349bab5d64852f963d66e.tar.lz
pttbbs-589c0a1b1657e0df72c349bab5d64852f963d66e.tar.xz
pttbbs-589c0a1b1657e0df72c349bab5d64852f963d66e.tar.zst
pttbbs-589c0a1b1657e0df72c349bab5d64852f963d66e.zip
BBS Legacy Post File Parser
git-svn-id: http://opensvn.csie.org/pttbbs/trunk@6042 63ad8ddf-47c3-0310-b6dd-a9e9d9715204
-rwxr-xr-xpttbbs/util/pyutil/pttpost.py108
1 files changed, 108 insertions, 0 deletions
diff --git a/pttbbs/util/pyutil/pttpost.py b/pttbbs/util/pyutil/pttpost.py
new file mode 100755
index 00000000..b0064183
--- /dev/null
+++ b/pttbbs/util/pyutil/pttpost.py
@@ -0,0 +1,108 @@
+#!/usr/bin/env python
+# encoding=latin1
+
+# This file is using latin1 with some Big5 literals, to prevent parsing files
+# with non-standard Big5 data (ex, Big5-UAO or escape sequence between DBCS).
+
+import re
+import sys
+
+import big5
+
+STR_AUTHOR1 = "作者:"
+STR_AUTHOR2 = "發信人:"
+
+def ANSI_COLOR(*codes):
+ return "\x1b[" + ';'.join(map(str, codes)) + 'm'
+
+ANSI_RESET = ANSI_COLOR()
+
+# Comments format: CommentsPrefix ANSI_COLOR(33) [AUTHOR]
+# ANSI_RESET ANSI_COLOR(33) ":" [CONTENT]
+# ANSI_RESET [trailings]
+CommentsPrefixes = (
+ ANSI_COLOR(1,37) + r'推 ',
+ ANSI_COLOR(1,31) + r'噓 ',
+ # Also known as <OLDRECOMMEND>, shared by P1 and P2.
+ ANSI_COLOR(1,31) + r'→ ')
+
+CommentsFormatRe = (
+ '(' + '|'.join(map(re.escape, CommentsPrefixes)) + ')' +
+ re.escape(ANSI_COLOR(33)) + '([^\x1b]*)' + re.escape(ANSI_RESET) +
+ re.escape(ANSI_COLOR(33) + ':') + '([^\x1b]*)' +
+ re.escape(ANSI_RESET) + '(.*)')
+
+# format: "※ " ANSI_COLOR(1;32) "%s" ANSI_COLOR(0;32) ":轉錄至" %s
+def IsCrossPostLog(buf):
+ return (buf.startswith("※ " + ANSI_COLOR(1,32)) and
+ buf.index(ANSI_COLOR(0,32) + ':轉錄至' > 0))
+
+def ParseComment(buf):
+ """Parses a buffer for known comment formats.
+
+ Returns:
+ (kind, author, content, trailing)
+ """
+ invalid = (None, None, None, None)
+ match = re.findall(CommentsFormatRe, buf)
+ if len(match) < 1:
+ return invalid
+ match = match[0]
+ (kind, author, content, trailing) = match
+ return map(big5.decode, (str(CommentsPrefixes.index(kind) + 1),
+ author, content.rstrip(' '),
+ trailing.rstrip('\n')))
+
+def ParsePost(filename):
+ '''Returns a legacy post into two parts.
+
+ Returns:
+ (body, comments): body is the main content without header, and
+ comments is a list to hold parsed comments.
+ '''
+ contents = []
+ comments = []
+ lineno = 0
+ with open(filename) as f:
+ contents = f.readlines()
+
+ # Now, try to skip header.
+ if len(contents) < 1:
+ return ('', comments)
+ author = contents[0]
+ if author.startswith(STR_AUTHOR1):
+ max_lines = 4
+ elif author.startswith(STR_AUTHOR2):
+ max_lines = 5
+ else:
+ max_lines = 0
+
+ # Skip until empty line is seen or max lines reached.
+ while len(contents) > 0 and max_lines > 0:
+ max_lines -= 1
+ if contents.pop(0) == '\n':
+ break
+
+ # Remove trailing comments.
+ while len(contents) > 0:
+ if IsCrossPostLog(contents[-1]):
+ contents.pop(-1)
+ continue
+ result = ParseComment(contents[-1])
+ if result[0] is None:
+ break
+ comments.append(result)
+ contents.pop(-1)
+
+ # here's the content.
+ return (''.join(contents), comments)
+
+def main(argv):
+ if len(argv) == 0:
+ filename = 'sample'
+ else:
+ filename = argv[0]
+ print ParsePost(filename)
+
+if __name__ == '__main__':
+ main(sys.argv[1:])