# 代碼 (Code)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
pip install pdfminer3k
pip install openpyxl
pip install pandas
pip install numpy
"""
import datetime
import os
import re
import pandas as pd
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LTTextBoxHorizontal, LAParams
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfinterp import PDFTextExtractionNotAllowed
from pdfminer.pdfparser import PDFParser, PDFDocument
# Run date stamped on every exported row, formatted like "2021年3月19日"
# (no zero padding). The clock is read exactly once so that year, month and
# day always come from the same instant, even across a midnight rollover.
current_time = "{0.year}年{0.month}月{0.day}日".format(datetime.datetime.now())
def from_pdf_to_txt(read_file, page_start=0, page_end=0):
    """
    Extract the text of a page range from a PDF as one compact string.

    Only horizontal text boxes are read. Newlines, spaces and colons are
    removed from the extracted text, so a label and its value end up directly
    adjacent in the result.

    :param read_file: str. Path of the PDF file to read (suffix ".pdf").
    :param page_start: int. Zero-based index of the first page to process.
    :param page_end: int. Index one past the last page to process. 0 means
        "through the last page"; values beyond the page count are clamped.
    :return: str. The cleaned text of the selected pages, concatenated.
    :raises PDFTextExtractionNotAllowed: if the document reports that it is
        not extractable.
    """
    # Characters dropped from every text box: newline, ASCII colon,
    # full-width colon and space. The original code repeated the ASCII-colon
    # replace twice; the second one was evidently meant for the full-width
    # variant, which is stripped here as well.
    strip_table = str.maketrans('', '', '\n:： ')

    # The context manager closes the file even when parsing raises.
    with open(read_file, 'rb') as origin_pdf_file:
        # Parser and document reference each other, so each is set on the
        # other before the document is initialised.
        parser = PDFParser(origin_pdf_file)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        # No password is supplied.
        doc.initialize()

        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Resource manager + layout aggregator + interpreter form the
        # pipeline that turns one page into a layout object.
        srcmgr = PDFResourceManager()
        device = PDFPageAggregator(srcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(srcmgr, device)

        pages = list(doc.get_pages())
        if page_end == 0 or page_end > len(pages):
            page_end = len(pages)

        chunks = []
        for i in range(page_start, page_end):
            interpreter.process_page(pages[i])
            # The layout holds the objects parsed from the page just
            # processed; only horizontal text boxes contribute text.
            layout = device.get_result()
            for x in layout:
                if isinstance(x, LTTextBoxHorizontal):
                    chunks.append(x.get_text().translate(strip_table))

    return ''.join(chunks)
#
def re_text(bt, text):
    """
    Return the first substring of ``text`` that matches the pattern ``bt``.

    :param bt: str. Regular expression to search for.
    :param text: str or None. Text to search in.
    :return: str or None. The whole matched substring, or None when nothing
        matches or when ``text`` itself is None.
    """
    # Callers chain searches, so the previous result (possibly None) may be
    # passed in as the text; treat that as "no match" instead of crashing.
    if text is None:
        return None
    m1 = re.search(bt, text)
    if m1 is None:
        return None
    return m1.group(0)
def get_pdf(dirpath):
    """
    Recursively collect the paths of all PDF files under ``dirpath``.

    The ".pdf" suffix is matched case-insensitively, so "a.PDF" is found too.
    Directories and files are visited in sorted order, which makes the result
    independent of the order in which the filesystem lists entries.

    :param dirpath: str. Root directory to search.
    :return: list of str. One path per PDF file, each built by joining the
        containing directory with the file name.
    """
    pdf_file = []
    for root, subdirs, filenames in os.walk(dirpath):
        # Sorting in place steers the order in which os.walk descends.
        subdirs.sort()
        for filename in sorted(filenames):
            if filename.lower().endswith('.pdf'):
                pdf_file.append(os.path.join(root, filename))
    return pdf_file
def _parse_invoice_fields(pdftext):
    """Build one spreadsheet row (dict) from the cleaned text of an invoice.

    A field whose pattern is not found in ``pdftext`` is stored as None
    instead of aborting the whole export.
    """
    # NOTE(review): the label literals below are Traditional Chinese; confirm
    # they match the labels in the text actually extracted from the invoices.
    cont = {}
    cont['平台'] = "京東"

    # Order number; when it is absent, fall back to the number that follows
    # "號碼" in a "<digits>號碼<digits>" sequence and store it as the remark
    # number instead.
    rt1 = re_text(r'(?<=訂單號)\d+', pdftext)
    cont['訂單號'] = rt1
    cont['備注號碼'] = None
    if not rt1:
        rt1 = re_text(r'\d+號碼\d+', pdftext)
        if rt1:
            cont['備注號碼'] = rt1.split("號碼")[1]

    # Invoice title: the first run of CJK characters that directly follows
    # digits. Anything without "公司" in it is recorded as "個人".
    rt2 = re_text(r'\d+[\u4e00-\u9fa5]+', pdftext)
    if rt2 is not None:
        rt2 = re_text(r'[\u4e00-\u9fa5]+', rt2)
    if rt2 is not None and "公司" not in rt2:
        rt2 = "個人"
    cont['發票抬頭'] = rt2

    # Invoice amount: the (optionally negative, optionally decimal) number
    # directly before "小寫" once parentheses are removed. Both ASCII and
    # full-width parentheses are removed, since the text may use either.
    amount_text = pdftext.translate(str.maketrans('', '', '()（）'))
    rt3 = re_text(r'-?\d+(\.\d+)?小寫', amount_text)
    if rt3 is not None:
        rt3 = rt3.replace("小寫", "")
    cont['發票金額'] = rt3

    # Invoice code and invoice number: digits right after their labels.
    cont['發票代碼'] = re_text(r'(?<=發票代碼)\d+', pdftext)
    cont['發票號碼'] = re_text(r'(?<=發票號碼)\d+', pdftext)

    # Issue date: the digit run after the label is sliced as YYYY / MM / rest.
    rt6 = re_text(r'(?<=開票日期)\d+', pdftext)
    if rt6 is not None:
        rt6 = rt6[0:4] + "年" + rt6[4:6] + "月" + rt6[6:] + "日"
    cont['開票日期'] = rt6

    cont['創建時間'] = current_time
    cont['最后修改時間'] = current_time
    return cont


def get_text(dirpath, xlfilename):
    """
    Parse every PDF invoice under ``dirpath`` and export the fields to Excel.

    Only the first page of each PDF is read. One row is written per PDF, into
    a sheet named "sheet1"; cells with no value are filled with a single
    space. When no PDF is found, a sheet containing only the header row is
    written.

    :param dirpath: str. Directory searched recursively for PDF files.
    :param xlfilename: str. Path of the Excel workbook to write.
    :return: None
    """
    results = []
    for filename in get_pdf(dirpath):
        pdftext = from_pdf_to_txt(filename, 0, 1)
        results.append(_parse_invoice_fields(pdftext))

    # Column order of the exported sheet. Passing it to the constructor also
    # guarantees the columns exist when ``results`` is empty.
    order = ["平台", "訂單號", "備注號碼", "發票抬頭", "發票金額", "發票代碼", "發票號碼", "開票日期", "創建時間", "最后修改時間"]
    pf = pd.DataFrame(results, columns=order)
    # Replace empty cells.
    pf = pf.fillna(' ')

    # The context manager saves and closes the workbook on exit; the explicit
    # ``save()`` call and the ``encoding`` argument no longer exist in
    # current pandas releases.
    with pd.ExcelWriter(xlfilename) as writer:
        pf.to_excel(writer, index=False, sheet_name="sheet1")
if __name__ == '__main__':
    import sys

    # Usage: script.py [invoice_dir] [output_xlsx]
    # Both arguments are optional; the values below are used as defaults.
    cli_args = sys.argv[1:]
    dirpath = cli_args[0] if len(cli_args) >= 1 else 'C:\\Users\\william\\Desktop\\20210319\\發票'
    xlfilename = cli_args[1] if len(cli_args) >= 2 else '發票.xlsx'
    get_text(dirpath, xlfilename)
# 三種發票樣式 (Three invoice layouts.)
# 紅線畫的是需要提取出來的數據 (The data to be extracted is marked with red lines.)