# -*- coding: utf-8 -*-
from HTMLParser import HTMLParser
from re import sub
from sys import stderr
from traceback import print_exc
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfdevice import PDFDevice
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LTTextBoxHorizontal,LAParams
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
import re
class pythonNToTxt(HTMLParser):
    """HTML parser that reduces a document to its plain text.

    Text nodes are stripped, have every run of spaces/tabs/newlines collapsed
    to a single space, and are collected in document order.  ``<p>`` and
    ``<br>`` tags are rendered as line breaks; every other tag is dropped.

    Usage: ``feed()`` the markup, ``close()`` the parser, then call
    ``text()`` to obtain the result.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        # Output fragments in document order; text() joins them.
        self.__chunks = []

    def handle_data(self, data):
        """Store a text node; whitespace-only nodes are ignored."""
        stripped = data.strip()
        if not stripped:
            return
        collapsed = sub('[ \t\r\n]+', ' ', stripped)
        # The trailing space keeps adjacent text nodes from running together.
        self.__chunks.append(collapsed + ' ')

    def handle_starttag(self, tag, attrs):
        """Emit a blank line for ``<p>`` and a single newline for ``<br>``."""
        line_breaks = {'p': '\n\n', 'br': '\n'}
        if tag in line_breaks:
            self.__chunks.append(line_breaks[tag])

    def handle_startendtag(self, tag, attrs):
        """Emit a blank line for a self-closing ``<br/>``."""
        # NOTE(review): ``<br/>`` yields two newlines here while ``<br>``
        # yields one in handle_starttag -- confirm this asymmetry is intended.
        if tag != 'br':
            return
        self.__chunks.append('\n\n')

    def text(self):
        """Return all collected fragments joined and stripped at both ends."""
        joined = ''.join(self.__chunks)
        return joined.strip()
def dehtml(text):
    """Convert HTML markup to plain text, falling back to the input on error.

    Args:
        text: HTML markup to convert.

    Returns:
        The plain text produced by ``pythonNToTxt``, or ``text`` unchanged
        when parsing raises an exception (the traceback goes to stderr).
    """
    try:
        parser = pythonNToTxt()
        parser.feed(text)
        parser.close()
        return parser.text()
    except Exception:
        # Best effort: report the failure and hand the input back untouched.
        # Catching ``Exception`` instead of using a bare ``except`` lets
        # KeyboardInterrupt and SystemExit propagate to the caller.
        print_exc(file=stderr)
        return text
def html_to_txt(fileobject, saveName):
    """Convert an HTML file to plain text and append it to a text file.

    Args:
        fileobject: Path of the HTML file to read (a path, despite the name).
        saveName: Path of the output file; the converted text plus a trailing
            newline is appended to it.
    """
    # Read inside ``with`` so the input handle is closed even if read() fails.
    with open(fileobject, 'rb') as source:
        neirong = source.read()
    # Echo the raw markup to stdout.  The parenthesised single-argument form
    # prints the same thing under Python 2's print statement and Python 3's
    # print function.
    print(neirong)
    results = dehtml(neirong)
    with open(saveName, 'a') as f:
        f.write(results + "\n")
def pdf_to_txt(filename, Save_name):
    """Extract the horizontal text boxes of a PDF and append them to a file.

    Args:
        filename: Path of the PDF file to read.
        Save_name: Path of the output file; the text of every
            ``LTTextBoxHorizontal`` is appended to it, UTF-8 encoded and
            followed by a newline.

    Raises:
        PDFTextExtractionNotAllowed: If ``document.is_extractable`` is false.
    """
    # The PDF handle is kept open for the whole loop because the parser reads
    # from it while pages are processed; ``with`` guarantees it is closed.
    with open(filename, 'rb') as fileobject:
        parser = PDFParser(fileobject)
        document = PDFDocument(parser)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layout = device.get_result()
            chunks = [
                box.get_text().encode('utf-8') + "\n"
                for box in layout
                if isinstance(box, LTTextBoxHorizontal)
            ]
            # Open the output once per page instead of once per text box, and
            # only when there is text, so a text-less PDF creates no file.
            if chunks:
                with open(Save_name, 'a') as f:
                    f.writelines(chunks)
# Branch on the file extension.
filename = 'test3.html'
# The leading ``.*`` is greedy, so the group captures from the LAST dot to the
# end of the name (e.g. '.html').
suffix_matches = re.findall(r'.*(\..*)', str(filename))
# A name without any dot has no extension; use '' rather than raising
# IndexError on ``[0]``.
fenzhihouzhui = suffix_matches[0] if suffix_matches else ''
# Compare case-insensitively so mixed-case extensions such as '.Pdf' match too.
if fenzhihouzhui.lower() == '.pdf':
    pdf_to_txt(filename, '3.txt')
elif fenzhihouzhui.lower() == '.html':
    html_to_txt(filename, 'wenben1.txt')
正則說明(以 r'.*(\..*)' 為例):
第一個 .*
代表任意字符出現 0 次或者多次(貪婪匹配,會盡量多匹配,所以後面的 \. 對應的是文件名中最後一個點)
括號用來分組
正則匹配到的值(findall 的返回結果)就來自於這個小括號里面
\. 代表對 . 這個符號進行轉義
也就是匹配字面上的 . 的意思
要不然正常情況 . 代表任意字符
最后一個 .* 就是 0 個或者多個任意字符
本文代碼由多個網站上的代碼組合而成,並得到好友的支持,僅供借鑒。
參考來源:
https://blog.csdn.net/quicktest/article/details/7852336
https://www.cnblogs.com/wj-1314/p/9429816.html