pdfminer批量處理PDF文件


from pdfminer.pdfparser import PDFParser, PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, PDFTextExtractionNotAllowed
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LTTextBoxHorizontal, LAParams, LTTextLineHorizontal, LTFigure, LTRect, LTLine, LTCurve
import os


class PdfForString(object):
    """Batch-extract the text of every PDF file found in one folder.

    One resource manager, one layout aggregator and one page interpreter
    are created up front and reused for all documents.

    Args:
        pdf_dir: Folder that holds the PDF files. When omitted, the folder
            in ``DEFAULT_PDF_DIR`` is used.

    Raises:
        OSError: If ``pdf_dir`` cannot be listed.
    """

    # Folder used when the caller does not supply one.
    DEFAULT_PDF_DIR = r'E:\StockExchange\PDF'

    def __init__(self, pdf_dir=None):
        # The same folder is used for listing the files and for building
        # their paths, so the two can never point at different places.
        self.pdf_dir = self.DEFAULT_PDF_DIR if pdf_dir is None else pdf_dir
        # Names of all entries in the PDF folder.
        self.pdf_list = os.listdir(self.pdf_dir)
        # Stores shared document resources.
        self.src = PDFResourceManager()
        # Device object that aggregates each processed page into a layout.
        self.device = PDFPageAggregator(self.src, laparams=LAParams())
        # Interpreter object that feeds pages to the device.
        self.inter = PDFPageInterpreter(self.src, self.device)

    def for_string(self):
        """Yield the full path of each PDF file named in ``self.pdf_list``.

        Entries whose name does not end in ``.pdf`` (case-insensitive) are
        skipped so that stray files or sub-folders are not handed to the
        PDF parser.
        """
        for name in self.pdf_list:
            if name.lower().endswith('.pdf'):
                yield os.path.join(self.pdf_dir, name)

    def pdf_analysis(self):
        """Yield, for each PDF file, the iterable of its pages.

        The underlying file stays open only while the consumer is working
        on the yielded pages; it is closed as soon as the generator is
        advanced to the next document (or is closed/garbage-collected).
        Consume the pages of one document before requesting the next.
        """
        for path in self.for_string():
            with open(path, 'rb') as pd_file:
                parser = PDFParser(pd_file)  # parser bound to the open file

                # Document object, linked to the parser in both directions.
                document = PDFDocument()
                parser.set_document(document)
                document.set_parser(parser)
                # NOTE(review): pdfminer's documented setup also calls
                # document.initialize() here -- confirm whether encrypted
                # inputs need to be supported before adding it.
                yield document.get_pages()

    def get_string(self):
        """Print the text of every horizontal text box in every PDF page."""
        for pages in self.pdf_analysis():
            for page in pages:
                self.inter.process_page(page)
                layout = self.device.get_result()
                for element in layout:
                    if isinstance(element, LTTextBoxHorizontal):
                        print(str(element.get_text()))

# Script entry: build the extractor, then print the text it finds.
extractor = PdfForString()
extractor.get_string()

  


免責聲明!

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯繫本站郵箱 yoyou2525@163.com 刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM