Python實現PDF 轉txt 和html轉txt


# -*- coding: utf-8 -*-
from HTMLParser import HTMLParser
from re import sub
from sys import stderr
from traceback import print_exc
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfdevice import PDFDevice
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LTTextBoxHorizontal,LAParams
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
import re


class pythonNToTxt(HTMLParser):
    """Collect the text content of an HTML document as plain text.

    Usage: create an instance, pass markup to ``feed()``, call ``close()``,
    then read the result with ``text()``.

    Conversion rules implemented by the handlers below:
      * text nodes are stripped, internal runs of spaces/tabs/newlines are
        collapsed to one space, and a trailing space is appended;
      * ``<p>`` inserts a blank line (two newlines);
      * ``<br>`` inserts a single newline;
      * self-closing forms (``<br/>``, ``<p/>``) behave like the open tag;
      * every other tag is ignored (its text content is still collected).
    """

    def __init__(self):
        HTMLParser.__init__(self)
        # Output fragments in document order; joined by text().
        self.__text = []

    def handle_data(self, data):
        """Store a text node with its whitespace normalised."""
        text = data.strip()
        if text:
            text = sub('[ \t\r\n]+', ' ', text)
            self.__text.append(text + ' ')

    def handle_starttag(self, tag, attrs):
        """Emit the line breaks implied by ``<p>`` and ``<br>``."""
        if tag == 'p':
            self.__text.append('\n\n')
        elif tag == 'br':
            self.__text.append('\n')

    def handle_startendtag(self, tag, attrs):
        """Handle a self-closing tag exactly like its opening form.

        Previously ``<br/>`` produced two newlines while ``<br>`` produced
        one, and ``<p/>`` was dropped; delegating keeps both spellings
        consistent.
        """
        self.handle_starttag(tag, attrs)

    def text(self):
        """Return everything collected so far as one stripped string."""
        return ''.join(self.__text).strip()


def dehtml(text):
    """Return *text* with its HTML markup converted to plain text.

    The markup is run through ``pythonNToTxt``.  This is a best-effort
    conversion: if parsing raises, the traceback is printed to stderr and
    the input is returned unchanged.

    Args:
        text: HTML markup to convert.

    Returns:
        The extracted plain text, or *text* itself when parsing failed.
    """
    try:
        parser = pythonNToTxt()
        parser.feed(text)
        parser.close()
        return parser.text()
    except Exception:
        # Deliberately broad (best effort), but no longer a bare ``except``
        # so KeyboardInterrupt / SystemExit still propagate.
        print_exc(file=stderr)
        return text

def html_to_txt(fileobject, saveName):
    """Convert an HTML file to plain text and append it to *saveName*.

    Args:
        fileobject: Path of the HTML file to read.  Despite the name this is
            a path passed to ``open()``, not an already-open file object.
        saveName: Path of the output text file.  It is opened in append
            mode, so repeated calls accumulate output.
    """
    # Read the whole document up front; the ``with`` block guarantees the
    # input handle is closed (the original left it open).
    with open(fileobject, 'rb') as source:
        neirong = source.read()

    # Echo the raw input to stdout, as the original did.  Parenthesised so
    # the statement parses on both Python 2 and Python 3.
    print(neirong)

    with open(saveName, 'a') as f:
        results = dehtml(neirong)
        f.write(results + "\n")


def pdf_to_txt(filename, Save_name):
    """Extract the horizontal text boxes of a PDF and append them to a file.

    Each ``LTTextBoxHorizontal`` found on each page is written as UTF-8
    encoded text followed by a newline.

    Args:
        filename: Path of the PDF file to read.
        Save_name: Path of the output text file, opened in append mode.

    Raises:
        PDFTextExtractionNotAllowed: If the document reports that it is not
            extractable.
    """
    # Keep the PDF open for the whole conversion (the parser is built on
    # this handle) and close it afterwards; the original never closed it.
    with open(filename, 'rb') as fileobject:
        parser = PDFParser(fileobject)
        document = PDFDocument(parser)

        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Open the output once instead of re-opening it for every text box.
        with open(Save_name, 'a') as f:
            for page in PDFPage.create_pages(document):
                interpreter.process_page(page)
                layout = device.get_result()
                for x in layout:
                    if isinstance(x, LTTextBoxHorizontal):
                        results = x.get_text().encode('utf-8')
                        f.write(results + "\n")

# Branch on the file extension to choose the converter.
filename = 'test3.html'

# Extension = text from the last '.' to the end, lower-cased so that
# '.PDF', '.Pdf' and '.pdf' are all recognised.  A name without any dot
# yields '' instead of raising IndexError as the regex lookup did.
fenzhihouzhui = filename[filename.rfind('.'):].lower() if '.' in filename else ''
if fenzhihouzhui == '.pdf':
    pdf_to_txt(filename, '3.txt')
elif fenzhihouzhui == '.html':
    html_to_txt(filename, 'wenben1.txt')


正則說明:

第一個.*

代表任意字符出現0次或者多次

括號用來分組

正則匹配到的值就來自於這個圓括號（分組）里面

\.代表對.這個符號進行轉義

就是.的意思

要不然正常情況.代表任意字符

最后一個.*就是0個或者多個任意字符

 

 

此文章摘自多個網站的組合代碼,以及好友支持,僅供借鑒

摘自:

https://blog.csdn.net/quicktest/article/details/7852336

https://www.cnblogs.com/wj-1314/p/9429816.html


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM