本項目取材自《Python基礎教程(第三版)》人民郵電出版社
目標:
本項目給純文本文件添加格式,使文檔轉換成其他類型的文檔(以HTML為例)
思路:
- 從原文件提取有用信息:
  - 文檔結構---作為給目標文檔添加HTML標簽的依據
  - 文檔內容---作為目標文檔的內容
- 制定原結構與HTML對應的規則:
  - 一種是直接添加標簽
  - 一種是用新標簽替換舊標記
- 編寫實際執行添加、置換操作的處理程序
- 編寫主邏輯程序,創建實際的規則對象,並應用到原文檔上,控制輸入輸出
具體實現:
#util.py
#這個模塊的功能是將原文檔按空行切分成文本塊,作為規則匹配程序的輸入
def lines(file):
    """Yield every line of *file*, followed by one extra newline.

    The extra newline acts as an end marker: it is blank once stripped, so a
    consumer that splits on blank lines always sees one after the last line.
    """
    yield from file  # each item is whatever iterating over *file* produces
    yield '\n'
def blocks(file):
    """Yield one text block per run of consecutive non-blank lines.

    Each block is the run's lines joined together with surrounding whitespace
    stripped. Input is read through lines(), which appends a final blank
    line, so the last block is emitted even if the file does not end with
    an empty line.
    """
    pending = []
    for line in lines(file):
        if line.strip():
            pending.append(line)
            continue
        # Blank line: flush the block collected so far, if there is one.
        if pending:
            yield ''.join(pending).strip()
            pending = []
#handlers.py
#這個模塊的作用是對已經匹配好規則的文本塊進行標簽加工:添加開始、結束標簽,或者將某類標記替換成HTML標簽(如強調、URL、郵箱地址)
class Handler:
    """Dispatch markup events to optional, conventionally named methods.

    start(name) and end(name) call start_<name>() / end_<name>() when such a
    method exists and silently do nothing otherwise.
    sub(name) builds a replacement function for re.sub() that delegates to
    sub_<name>(match).
    """

    def callback(self, prefix, name, *args):
        """Call the method named prefix + name with *args, if it exists.

        Returns the method's result, or None when no callable attribute of
        that name is defined on this object.
        """
        method = getattr(self, prefix + name, None)
        if callable(method):
            return method(*args)
        return None

    def start(self, name):
        """Invoke start_<name>() if it is defined."""
        self.callback('start_', name)

    def end(self, name):
        """Invoke end_<name>() if it is defined."""
        self.callback('end_', name)

    def sub(self, name):
        """Return a function usable as the replacement argument of re.sub().

        The returned function takes a match object and returns the result of
        sub_<name>(match). If that method is missing (or returns None), the
        matched text is returned unchanged.
        """
        def substitution(match):
            result = self.callback('sub_', name, match)
            if result is None:
                # Fall back to the original text. The previous code evaluated
                # match.group(0) without assigning it, so None was returned
                # and the matched text was lost from the output.
                result = match.group(0)
            return result
        return substitution
class HTMLRenderer(Handler):
    """Concrete handler that renders events as HTML on standard output.

    The start_*/end_* methods print the opening and closing tags; they are
    reached through the start() and end() methods of the superclass.
    feed() is called between a start and an end event to print the text
    content itself.
    """

    def start_document(self):
        print('<html><head><title>...</title></head><body>')

    def end_document(self):
        print('</body></html>')

    def start_paragraph(self):
        print('<p>')

    def end_paragraph(self):
        print('</p>')

    def start_heading(self):
        print('<h2>')

    def end_heading(self):
        print('</h2>')

    def start_list(self):
        print('<ul>')

    def end_list(self):
        print('</ul>')

    def start_listitem(self):
        print('<li>')

    def end_listitem(self):
        print('</li>')

    def start_title(self):
        print('<h1>')

    def end_title(self):
        print('</h1>')

    # The sub_* methods below are ultimately called by re.sub(): the function
    # returned by sub('emphasis') etc. is passed as the replacement argument,
    # re.sub() hands it each match object found in the block, and the string
    # returned here is what replaces the matched text.
    def sub_emphasis(self, match):
        emphasized = match.group(1)
        # Same markup as the replacement template r'<em>\1</em>'.
        return '<em>{}</em>'.format(emphasized)

    def sub_url(self, match):
        target = match.group(1)
        return '<a href="{}">{}</a>'.format(target, target)

    def sub_mail(self, match):
        address = match.group(1)
        return '<a href="mailto:{}">{}</a>'.format(address, address)

    def feed(self, data):
        """Print the text content of the current element."""
        print(data)
#rules.py
#這個模塊制定了一系列規則,這些規則會匹配各類文檔塊,並調用相應的標簽處理程序
class Rule:
    """Base class for all rules; provides the action() most rules share.

    action() reads self.type, so subclasses are expected to define a `type`
    attribute naming the element they produce.
    """

    def action(self, block, handler):
        """Send *block* to *handler* wrapped in start/end events of this type.

        Returns True, which tells the parser that the block has been handled
        and no further rules need to be tried.
        """
        kind = self.type
        handler.start(kind)
        handler.feed(block)
        handler.end(kind)
        return True
class HeadingRule(Rule):
    """Rule for headings.

    A heading is a single line of at most 70 characters that does not end
    with a colon.
    """

    type = 'heading'

    def condition(self, block):
        """Return True if *block* looks like a heading."""
        if '\n' in block:
            return False
        if len(block) > 70:
            return False
        return block[-1] != ':'
class TitleRule(HeadingRule):
    """Rule for the document title.

    The title is the first block of the document, provided that block also
    qualifies as a heading.
    """

    type = 'title'
    first = True  # True until condition() has been called once

    def condition(self, block):
        """Return True only for the first block tested, if it is a heading.

        The `first` flag is cleared on the first call whatever the outcome,
        so later blocks can never match.
        """
        if self.first:
            self.first = False
            return HeadingRule.condition(self, block)
        return False
class ListItemRule(Rule):
    """Rule for list items.

    A list item is a block that begins with a hyphen. The hyphen is removed
    when the item is formatted.
    """

    type = 'listitem'

    def condition(self, block):
        """Return True if the first character of *block* is a hyphen."""
        leading = block[0]
        return leading == '-'

    def action(self, block, handler):
        """Emit the block, minus its leading hyphen, as a list item.

        Returns True so that no further rules are tried for this block.
        """
        item_text = block[1:].strip()
        handler.start(self.type)
        handler.feed(item_text)
        handler.end(self.type)
        return True
class ListRule(ListItemRule):
    """Rule that opens and closes the list surrounding list items.

    A list starts at a list item that follows a non-item block and ends
    after the last consecutive list item.
    """

    type = 'list'
    inside = False  # True while an opened list has not been closed yet

    def condition(self, block):
        """Always match, so that action() gets to inspect every block."""
        return True

    def action(self, block, handler):
        """Open or close the list when the block type changes.

        Always returns False so that the remaining rules still process the
        block itself.
        """
        # NOTE(review): nothing here closes a list that is still open when the
        # input ends; end('list') is only sent once a later non-item block
        # arrives.
        is_item = ListItemRule.condition(self, block)
        if is_item and not self.inside:
            handler.start(self.type)
            self.inside = True
        elif self.inside and not is_item:
            handler.end(self.type)
            self.inside = False
        return False
class ParagraphRule(Rule):
    """Fallback rule: a paragraph is any block no other rule has handled."""

    type = 'paragraph'

    def condition(self, block):
        """Match every block unconditionally."""
        return True
#markup.py
#負責整合調用各模塊
import sys, re
from handlers import *
from util import *
from rules import *
class Parser:
    """Read a text file, apply filters and rules, and drive the handler."""

    def __init__(self, handler):
        self.handler = handler
        self.rules = []    # tried in insertion order for every block
        self.filters = []  # applied in insertion order to every block

    def addRule(self, rule):
        """Append *rule* to the list of rules."""
        self.rules.append(rule)

    def addFilter(self, pattern, name):
        """Register a filter that rewrites matches of *pattern* in a block.

        Each match is replaced using the function the handler returns from
        sub(name).
        """
        def apply(block, handler):
            replacement = handler.sub(name)
            return re.sub(pattern, replacement, block)
        self.filters.append(apply)

    def parse(self, file):
        """Process *file* block by block between document start/end events.

        Every block first passes through all filters. The rules are then
        tried in order: a rule whose condition() holds has its action() run,
        and a true return value from action() stops rule processing for
        that block.
        """
        self.handler.start('document')
        for block in blocks(file):
            for apply_filter in self.filters:
                block = apply_filter(block, self.handler)
            for rule in self.rules:
                if not rule.condition(block):
                    continue
                if rule.action(block, self.handler):
                    break
        self.handler.end('document')
class BasicTextParser(Parser):
    """Parser subclass that registers its rules and filters on construction.

    The order in which rules are added matters: for each block they are
    tried in this order, and the next rule is consulted only when the
    current one does not match or its action() returns a false value.
    """

    def __init__(self, handler):
        Parser.__init__(self, handler)
        rule_classes = (
            ListRule,
            ListItemRule,
            TitleRule,
            HeadingRule,
            ParagraphRule,
        )
        for rule_class in rule_classes:
            self.addRule(rule_class())
        inline_filters = (
            (r'\*(.+?)\*', 'emphasis'),
            (r'(http://[\.a-zA-Z/]+)', 'url'),
            (r'([\.a-zA-Z]+@[\.a-zA-Z]+[a-zA-Z]+)', 'mail'),
        )
        for pattern, name in inline_filters:
            self.addFilter(pattern, name)
# Script entry point: parse the text read from standard input and let the
# HTML renderer print the resulting markup.
handler = HTMLRenderer()
parser = BasicTextParser(handler)
parser.parse(sys.stdin)
這樣就完成了,可以用下面這段文本做個實驗,看看結果如何。(注意:各文本塊之間必須以空行分隔,blocks()才能正確切分。)
Welcome to World Wide Spam, Inc.

These are the corporate web pages of *World Wide Spam*, Inc. We hope
you find your stay enjoyable, and that you will sample many of our
products.

A short history of the company

World Wide Spam was started in the summer of 2000. The business
concept was to ride the dot-com wave and to make money both through
bulk email and by selling canned meat online.

After receiving several complaints from customers who weren't
satisfied by their bulk email, World Wide Spam altered their profile,
and focused 100% on canned goods. Today, they rank as the world's
13,892nd online supplier of SPAM.

Destinations

From this page you may visit several of our interesting web pages:

- What is SPAM? (http://wwspam.fu/whatisspam)

- How do they make it? (http://wwspam.fu/howtomakeit)

- Why should I eat it? (http://wwspam.fu/whyeatit)

How to get in touch with us

You can get in touch with us in *many* ways: By phone (555-1234), by
email (wwspam@wwspam.fu) or by visiting our customer feedback page
(http://wwspam.fu/feedback).