使用pdfminer提取PDF中文內容

本文轉載自查看原文 2020-04-15 13:43 764 Python

由於PyPDF2提取中文時會出現亂碼、無法識別，所以改用pdfminer。

pdfminer : https://github.com/euske/pdfminer

from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage,PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager,PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.layout import LAParams, LTTextBox
from pdfminer.converter import PDFPageAggregator
import re

# Extract a TPP order number from the text of '1p.pdf' using pdfminer.
#
# Every text box on every page is searched with the pattern below; the first
# match found (in page / layout order) is printed. If the PDF forbids text
# extraction, PDFTextExtractionNotAllowed is raised; if no text box contains
# a match, the script exits with an explanatory message.

# Compiled once, outside the page loop.
# NOTE(review): 'P*' means "zero or more P", so this also matches e.g.
# 'TP12345678' or 'TPPP12345678'. If only a literal 'TPP' prefix is intended,
# the pattern should probably be r'(TPP\d{8})' -- confirm against real data.
order_no_pattern = re.compile(r'(TPP*\d{8})')

# Every match, in the order encountered (duplicates included).
TXT = []

# The parser is bound to this file object, so the file stays open for the
# whole extraction and is closed automatically afterwards, even on error.
with open('1p.pdf', 'rb') as fp:
    # Create a PDF parser object associated with the file object.
    parser = PDFParser(fp)
    # Create a PDF document object that stores the document structure.
    # For an encrypted PDF, supply the password as the second argument:
    # PDFDocument(parser, password).
    document = PDFDocument(parser)
    # Check if the document allows text extraction. If not, abort.
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed
    # Create a PDF resource manager object that stores shared resources.
    rsrcmgr = PDFResourceManager()
    # Set parameters for layout analysis.
    laparams = LAParams()
    # Create a PDF page aggregator; it is the device the interpreter renders
    # into and the object the per-page layout is read back from.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    # Create a PDF interpreter object.
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # Process each page contained in the document.
    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
        # Receive the LTPage object for the page.
        layout = device.get_result()
        for x in layout:
            # Only text boxes are searched; other layout items are skipped.
            if isinstance(x, LTTextBox):
                # strip() drops leading/trailing whitespace and blank lines.
                searchObj = order_no_pattern.search(x.get_text().strip())
                if searchObj:
                    TXT.append(searchObj.group())

# Fail with a clear message instead of an IndexError when nothing matched.
if not TXT:
    raise SystemExit("No TPP order number found in '1p.pdf'")
# Print the first match; indexing a set here would pick an arbitrary element
# whenever more than one distinct number is present.
print(TXT[0])

免責聲明！

本站轉載的文章僅供個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯繫本站郵箱yoyou2525@163.com刪除。

猜您在找 從pdf中提取內容的方法使用itextpdf提取pdf內容 PDF電子發票內容提取 PDF文本內容批量提取到Excel python操作PDF------提取PDF文字內容使用C#閱讀pdf內容，並對其進行提取 pdf文件之itextpdf插入html內容以及中文解決方案 pdf.js 預覽文件中文內容丟失 php抓取圖片進行內容提取解析，文字性pdf進行內容文字提取解析利用python第三方庫提取PDF文件的表格內容