使用PyPDF2結合pdfminer拆分PDF,並提取關鍵字重命名拆分出來的文件


需求:銀行匯款回單PDF幾十頁,每一頁包含兩個回單。需把每一張回單拆分出來,並且以回單上交易附言處TPPXXXXXXXX格式的流水號重命名拆出來的文件。

思路:

1.使用PyPDF2把每一頁一分為二,輸出PDF到一個目錄A。

2.循環目錄A,使用pdfminer提取TPPXXXXXXXX格式的流水號,重命名PDF文件。

3.使用pyinstaller -F 打包成一個exe文件。(注意:要在C盤打包)在CMD中,於 C:\Users\<用戶名>\PDF 目錄下執行:pyinstaller -F C:\Users\<用戶名>\PDF\pdftools.py

# -*- coding: UTF-8 -*-  
from PyPDF2 import PdfFileReader, PdfFileWriter

from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage,PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager,PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.layout import LAParams, LTTextBox
from pdfminer.converter import PDFPageAggregator

import re
import os
import os.path

# Split every page of a PDF into a top half and a bottom half.
def split_pdf(infile, out_path):
    """Cut each page of a PDF in half horizontally and save every half.

    For page number ``n`` (1-based) two single-page PDF files are written
    into ``out_path``: ``n_top.pdf`` (upper half of the page) and
    ``n_bottom.pdf`` (lower half). The halves are produced by narrowing
    the page's media box; the page content itself is not rewritten.

    :param infile: path of the PDF file to split
    :param out_path: directory that receives the half-page PDF files;
        it is created when it does not exist yet
    :return: None
    """
    os.makedirs(out_path, exist_ok=True)

    with open(infile, 'rb') as pdf_stream:
        # This reader is only used for the page count and the untouched
        # page dimensions; its page objects are never modified.
        source = PdfFileReader(pdf_stream)
        for index in range(source.getNumPages()):
            media_box = source.getPage(index).mediaBox
            width = float(media_box.getWidth())
            height = float(media_box.getHeight())
            middle = height / 2

            # (file-name suffix, lower y, upper y) of each half.
            halves = (('top', middle, height), ('bottom', 0, middle))
            for suffix, lower_y, upper_y in halves:
                # A fresh reader is created for every half, as in the
                # original code, whose author noted that reusing one
                # reader raised errors. It also guarantees the media box
                # being narrowed here is an unmodified copy.
                reader = PdfFileReader(pdf_stream)
                writer = PdfFileWriter()
                half_page = reader.getPage(index)
                # Lower-left and upper-right corners fully define the box.
                half_page.mediaBox.lowerLeft = (0, lower_y)
                half_page.mediaBox.upperRight = (width, upper_y)
                writer.addPage(half_page)

                out_file_name = os.path.join(
                    out_path, '%d_%s.pdf' % (index + 1, suffix))
                with open(out_file_name, 'wb') as out_stream:
                    writer.write(out_stream)

# Collect the serial numbers found in the text of one PDF file.
def _find_serial_numbers(pdf_path, pattern):
    """Return every match of ``pattern`` found in the text boxes of a PDF.

    :param pdf_path: path of the PDF file to scan
    :param pattern: compiled regular expression to search for
    :return: list of matched strings, in the order they were encountered
    :raises PDFTextExtractionNotAllowed: if the document forbids text
        extraction
    """
    found = []
    # The file is closed on every exit path, so the caller can rename it
    # afterwards even if parsing raised.
    with open(pdf_path, 'rb') as fp:
        document = PDFDocument(PDFParser(fp))
        # Check if the document allows text extraction. If not, abort.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.create_pages(document):
                interpreter.process_page(page)
                # get_result() returns the layout of the page just processed.
                for element in device.get_result():
                    if not isinstance(element, LTTextBox):
                        continue
                    match = pattern.search(element.get_text().strip())
                    if match:
                        print(match.group())
                        found.append(match.group())
        finally:
            device.close()
    return found


# Rename each PDF after the serial number found in its text.
def extractPDF(out_Path):
    """Rename every file under ``out_Path`` to ``<serial number>.pdf``.

    Each file found by walking ``out_Path`` is parsed with pdfminer and its
    text boxes are searched for a serial number. When at least one match
    is found, the file is renamed, inside its own directory, to the first
    match plus ``.pdf``. Files without a match keep their name.

    :param out_Path: directory to walk
    :return: None
    :raises PDFTextExtractionNotAllowed: if a document forbids text
        extraction
    """
    # NOTE(review): the requirement describes the format as "TPP" followed
    # by 8 characters, but ``P*`` means "zero or more P", so this pattern
    # also accepts "TP" + 8 digits. Left unchanged; confirm intent.
    serial_pattern = re.compile(r'(TPP*\d{8})')

    for parent, dirnames, filenames in os.walk(out_Path):
        for filename in filenames:
            # Join with the directory currently being walked so that files
            # in subdirectories resolve correctly, and so that out_Path
            # does not need a trailing separator.
            pdf_path = os.path.join(parent, filename)
            serials = _find_serial_numbers(pdf_path, serial_pattern)
            if serials:
                # Use the first match so the chosen name is the same on
                # every run, even when several distinct numbers are found.
                new_path = os.path.join(parent, serials[0] + ".pdf")
                os.rename(pdf_path, new_path)


if __name__ == '__main__':
    # Input PDF to cut up, and the folder that receives the pieces.
    in_File, out_Path = './PDFfile.pdf', './Single/'

    # Step 1: write the half-page PDFs into the output folder.
    split_pdf(in_File, out_Path)
    # Step 2: walk that folder and rename each PDF by its serial number.
    extractPDF(out_Path)
    

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯繫本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM