需求是有一堆這樣的 Word 文檔，要轉換成試題，供 Web 界面使用。
#!/usr/bin/env python3
import docx
import re
import json
# Input document and output file used when this file is run as a script.
SOURCE_DOCX = "./2018《廉潔自律准則》知識競賽試題及答案.docx"
OUTPUT_JSON = 'data.json'

# A question token starts with a digit ("12.題幹...").
_LEADING_DIGIT = re.compile(r'\d')

# Answer letters that directly follow an opening bracket, e.g. "(AB" or "（C".
# NOTE(review): the previous pattern '(?<=(\(|())...' had unbalanced
# parentheses and could not compile; the second alternative was presumably a
# full-width bracket that got normalised to ASCII. Both bracket forms are
# accepted here -- confirm against the real documents.
_ANSWER_SLOT = re.compile(r'(?<=[(（])\s*[A-D]*')
# Same slot including trailing whitespace; used to blank the answer out of
# the question text that is shown to the user.
_ANSWER_SLOT_PADDED = re.compile(r'(?<=[(（])\s*[A-D]*\s*')

# First character of an option token -> key under which its text is stored.
_OPTION_KEYS = {'A': 'a', 'B': 'b', 'C': 'c', 'D': 'd'}


def parse_questions(paragraphs):
    """Turn paragraph texts into a list of question dictionaries.

    Every paragraph is split on whitespace and each resulting token is
    inspected on its own. Only tokens containing a "." are used; the part
    before the first dot is the label and the rest is the text.

    * A token starting with a digit opens a new question with keys
      ``no`` (label), ``q`` (text with the bracketed answer replaced by a
      single space) and, when an opening bracket is present, ``k`` (the
      answer letters found right after the first opening bracket; may be
      empty).
    * A token starting with ``A``/``B``/``C``/``D`` stores its text under
      ``a``/``b``/``c``/``d`` of the most recently opened question.

    Args:
        paragraphs: iterable of paragraph text strings.

    Returns:
        A list of dicts, one per question, in document order.
    """
    questions = []
    current = None
    for text in paragraphs:
        # NOTE: whitespace splitting means an answer written as "( A )" is
        # broken into separate tokens and its letter is not captured.
        for token in text.split():
            # Split on the FIRST dot only so that dots inside the text
            # (e.g. "1.5") are preserved instead of being dropped.
            label, dot, body = token.partition('.')
            if not dot:
                continue
            if _LEADING_DIGIT.match(token):
                current = {'no': label}
                answer = _ANSWER_SLOT.search(body)
                if answer is not None:
                    current['k'] = answer.group(0).lstrip()
                current['q'] = _ANSWER_SLOT_PADDED.sub(' ', body)
                questions.append(current)
            elif token[0] in _OPTION_KEYS and current is not None:
                # Options seen before any question have nothing to attach
                # to and are skipped rather than crashing on None.
                current[_OPTION_KEYS[token[0]]] = body
    return questions


def main():
    """Read the quiz document and write the parsed questions as JSON."""
    document = docx.Document(SOURCE_DOCX)
    questions = parse_questions(para.text for para in document.paragraphs)
    # ensure_ascii=False writes raw CJK characters, so the file encoding is
    # fixed to UTF-8 instead of depending on the platform default.
    with open(OUTPUT_JSON, 'w', encoding='utf-8') as outfile:
        json.dump(questions, outfile, ensure_ascii=False)


if __name__ == "__main__":
    main()
轉換過程並不完美，因為 Word 文檔的格式並不標準：大約 90% 的試題沒有問題，其餘部分仍有問題。