需求:銀行匯款回單PDF共幾十頁,每一頁包含兩張回單。需把每一張回單拆分成單獨的PDF文件,並且以回單上「交易附言」處的流水號(格式為TPPXXXXXXXX,即TPP加8位數字)重命名拆出來的文件。
思路:
1.使用PyPDF2把每一頁一分為二,輸出PDF到一個目錄A。
2.循環目錄A,使用pdfminer提取TPPXXXXXXXX格式的流水號,重命名PDF文件。
3.使用pyinstaller -F 打包成一個exe文件。(注意:要在C盤打包)CMD: C:\Users\<用戶名>\PDF>pyinstaller -F C:\Users\<用戶名>\PDF\pdftools.py
# -*- coding: UTF-8 -*-
# Split a PDF whose pages each hold two bank receipts (top half and bottom
# half) into one PDF per receipt, then rename every receipt PDF after the
# TPP serial number found in its text.
from PyPDF2 import PdfFileReader, PdfFileWriter
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage,PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager,PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.layout import LAParams, LTTextBox
from pdfminer.converter import PDFPageAggregator
import re
import os
import os.path

# Serial number printed on each receipt: the literal "TPP" followed by
# exactly eight digits. Compiled once because it is applied to every text box.
_SERIAL_RE = re.compile(r'TPP\d{8}')


def _write_cropped_page(stream, page_index, width, bottom, top, out_file_name):
    """Write one page of the PDF in *stream*, cropped vertically, to a file.

    :param stream: open binary file object of the source PDF
    :param page_index: zero-based index of the page to crop
    :param width: page width, used as the right edge of the crop box
    :param bottom: y coordinate of the lower edge of the crop box
    :param top: y coordinate of the upper edge of the crop box
    :param out_file_name: path of the single-page PDF to create
    :return: None
    """
    # A fresh reader is created for every crop. The original code did the
    # same, with a note that reusing the reader raises an error.
    reader = PdfFileReader(stream)
    page = reader.getPage(page_index)
    page.mediaBox.lowerLeft = (0, bottom)
    page.mediaBox.lowerRight = (width, bottom)
    page.mediaBox.upperLeft = (0, top)
    page.mediaBox.upperRight = (width, top)

    writer = PdfFileWriter()
    writer.addPage(page)
    with open(out_file_name, 'wb') as outfile:
        writer.write(outfile)


# Split the PDF
def split_pdf(infile, out_path):
    """Split every page of a PDF into a top-half and a bottom-half PDF.

    For page number N (1-based) the files ``N_top.pdf`` and ``N_bottom.pdf``
    are written into *out_path*.

    :param infile: path of the PDF file to split
    :param out_path: directory that receives the half-page PDF files;
        created if it does not exist
    :return: None
    """
    if not os.path.exists(out_path):
        os.makedirs(out_path)

    with open(infile, 'rb') as stream:
        # This reader is only used for the page count and page sizes; the
        # pages it hands out are never modified.
        sizing_reader = PdfFileReader(stream)
        for index in range(sizing_reader.getNumPages()):
            page = sizing_reader.getPage(index)
            width = float(page.mediaBox.getWidth())
            height = float(page.mediaBox.getHeight())
            middle = height / 2
            page_label = str(index + 1)

            # os.path.join keeps the output inside out_path whether or not
            # the caller passed a trailing separator.
            _write_cropped_page(
                stream, index, width, middle, height,
                os.path.join(out_path, page_label + '_top.pdf'))
            _write_cropped_page(
                stream, index, width, 0, middle,
                os.path.join(out_path, page_label + '_bottom.pdf'))


def _find_serials(pdf_path):
    """Return every TPP serial number found in a PDF, in reading order.

    Each match is also printed, one per line.

    :param pdf_path: path of the PDF file to scan
    :return: list of matched serial strings (possibly empty)
    :raises PDFTextExtractionNotAllowed: if the document forbids extraction
    """
    serials = []
    # The file must be closed again before the caller renames it, so the
    # whole scan happens inside this with-block.
    with open(pdf_path, 'rb') as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.create_pages(document):
                interpreter.process_page(page)
                # get_result() yields the layout objects of the page just
                # processed; only text boxes can carry the serial number.
                for element in device.get_result():
                    if not isinstance(element, LTTextBox):
                        continue
                    match = _SERIAL_RE.search(element.get_text().strip())
                    if match:
                        print(match.group())
                        serials.append(match.group())
        finally:
            device.close()
    return serials


# Rename the PDFs
def extractPDF(out_Path):
    """Rename every PDF under *out_Path* after the serial number it contains.

    The directory tree is walked with os.walk. Each ``.pdf`` file is scanned
    for a ``TPP`` + 8 digit serial; when one is found the file is renamed to
    ``<serial>.pdf`` in the same directory. Files without a serial keep
    their name. If the target name already exists the file is left alone
    and a notice is printed, so an existing receipt is never overwritten.

    :param out_Path: directory containing the single-receipt PDF files
    :return: None
    """
    # os.walk yields: 1. the directory being visited, 2. its sub-directory
    # names (no path), 3. its file names.
    for parent, dirnames, filenames in os.walk(out_Path):
        for filename in filenames:
            if not filename.lower().endswith('.pdf'):
                continue

            # Build the path from the directory actually being visited so
            # files in sub-directories are found as well.
            old_path = os.path.join(parent, filename)
            serials = _find_serials(old_path)
            if not serials:
                continue

            # Use the first serial in reading order; taking an element of a
            # set would make the choice vary between runs.
            new_path = os.path.join(parent, serials[0] + '.pdf')
            if os.path.exists(new_path):
                if os.path.abspath(new_path) != os.path.abspath(old_path):
                    print('skip %s: %s already exists' % (old_path, new_path))
                continue
            os.rename(old_path, new_path)


if __name__ == '__main__':
    in_File = './PDFfile.pdf'
    out_Path = './Single/'  # output directory, created by split_pdf
    split_pdf(in_File, out_Path)
    extractPDF(out_Path)  # directory to walk and rename