分享一個電子發票信息提取工具(Python)

本文轉載自查看原文 2019-04-25 13:56 4182 python/ source

電子發票太多，想統計一下總額卻異常困難，網上的工具又不好用，於是花了 2 個小時自己實現了一份。測試過中石油、京東開具的電子發票，效果還行；部分發票的名稱提取會失敗，但不影響金額統計。有需要的小伙伴自己拿去改吧。


import cmd
import sys
import json
import pdfplumber
import os
from pprint import pprint


class FapiaoShell(cmd.Cmd):
    """Interactive shell that extracts fields from e-invoice (fapiao) PDFs.

    The ``load`` command walks a directory, parses the first page of every
    PDF found there with pdfplumber, and writes the extracted fields together
    with a grand total to ``結果.json``.

    NOTE(review): every label matched below is written in Traditional
    Chinese; confirm that this is the script actually used by the PDFs being
    processed, otherwise none of the fields will match.
    """

    intro = '歡迎使用發票提取工具，輸入?(help)獲取幫助消息和命令列表，CTRL+C退出程序。\n'
    prompt = '\n輸入命令: '
    doc_header = "詳細文檔 (輸入 help <命令>):"
    misc_header = "友情提示:"
    undoc_header = "沒有幫助文檔:"
    nohelp = "*** 沒有命令(%s)的幫助信息 "

    # Label that precedes the buyer / seller name inside the invoice table.
    _NAME_LABEL = '名        稱:'

    def __init__(self):
        """Create the shell with the default ``cmd.Cmd`` settings."""
        super().__init__()

    def do_load(self, arg):
        """ 加載發票 例如：load D:\\ """
        # The docstring above is printed by ``help load``, so it is user-facing
        # text and stays in the UI language; only the invalid ``\ `` escape was
        # replaced by an explicit backslash (same runtime string).
        if not os.path.isdir(arg):
            print('參數必須是目錄!')
            return

        # Resolve the scan root *before* changing directory, so a relative
        # argument keeps pointing at the same place afterwards.
        scan_root = os.path.abspath(arg)
        work_dir = os.path.dirname(arg)
        # ``dirname`` is '' for a bare relative name such as ``invoices``;
        # ``os.chdir('')`` would raise, and the current directory is already
        # the right place for the output file in that case.
        if work_dir:
            os.chdir(work_dir)

        pdfs = []
        for root, _, files in os.walk(scan_root):
            for fn in files:
                if os.path.splitext(fn)[1].lower() != '.pdf':
                    continue
                fpth = os.path.relpath(os.path.join(root, fn))
                print(f'發現pdf文件: {fpth}')
                pdfs.append(fpth)

        pdf_ctxs = self._parse_pdfs(pdfs)
        total = {
            '內容': pdf_ctxs,
            '發票數': len(pdf_ctxs),
            '總計': self._sum_totals(pdf_ctxs),
        }

        print('\n保存到 結果.json...')

        with open("結果.json", 'w', encoding='utf-8') as json_file:
            json.dump(total,
                      json_file,
                      ensure_ascii=False,
                      sort_keys=True,
                      indent=4,
                      separators=(', ', ': '))

        print('完成!')

    def _sum_totals(self, pdf_ctxs):
        """Add up the ``總計`` field of every parsed invoice.

        Args:
            pdf_ctxs: list of ``(path, info)`` pairs as built by
                ``_parse_pdfs``.

        Returns:
            The sum rounded to two decimals (``0`` when nothing was summed).
            Entries without a total, or with a total that is not a number,
            are reported on stdout and skipped instead of aborting the run.
        """
        total = 0
        for fpth, info in pdf_ctxs:
            raw = info.get('總計')
            if raw is None:
                print(f'跳過(沒有總計): {fpth}')
                continue
            try:
                # Amounts may carry thousands separators, e.g. "1,000.00".
                total += float(str(raw).replace(',', ''))
            except ValueError:
                print(f'跳過(總計無法識別): {fpth}')
        # Rounding hides binary floating-point noise such as 0.1 + 0.2.
        return round(total, 2)

    def _parse_pdfs(self, pdfs):
        """Parse the first page of each PDF.

        Args:
            pdfs: iterable of PDF file paths.

        Returns:
            A list with exactly one ``(path, info)`` pair per input path.
            ``info`` is an empty dict when the file has no pages or its first
            page does not contain the invoice title.
        """
        result = []
        for fpth in pdfs:
            info = {}
            with pdfplumber.open(fpth) as pdf:
                if pdf.pages:
                    page = pdf.pages[0]
                    # extract_text() can yield None for a page without text.
                    text = page.extract_text() or ''
                    if '增值稅電子普通發票' in text:
                        info.update(
                            self._extrace_from_words(page.extract_words()))
                        tables = page.extract_tables()
                        if tables:
                            info.update(self._extrace_from_table(tables[0]))
            result.append((fpth, info))
        return result

    def _extrace_from_words(self, words):
        """Extract the header fields from the page's words.

        Args:
            words: iterable of dicts providing ``'top'``, ``'bottom'`` and
                ``'text'`` (the items returned by ``page.extract_words()``).

        Returns:
            Dict holding whichever of 標題, 發票代碼, 發票號碼, 開票日期,
            機器編號, 校驗碼, 收款人 and 開票人 could be found.
        """
        info = {}

        # Bucket the words by the integer vertical midpoint of their box.
        lines = {}
        for word in words:
            pos = (int(word['top']) + int(word['bottom'])) // 2
            lines.setdefault(pos, []).append(word['text'])

        # Merge buckets lying within 10 units of the bucket that opened the
        # current pack; such words are treated as one visual line.
        lines_pack = []
        last_pos = None
        for pos in sorted(lines):
            if lines_pack and pos - last_pos <= 10:
                lines_pack[-1].extend(lines[pos])
                continue
            lines_pack.append(list(lines[pos]))
            last_pos = pos

        for pack in lines_pack:
            for idx, line in enumerate(pack):
                if '電子普通發票' in line:
                    info['標題'] = line
                elif '發票代碼:' in line:
                    info['發票代碼'] = line.split(':')[1]
                elif '發票號碼:' in line:
                    info['發票號碼'] = line.split(':')[1]
                elif '開票日期:' in line:
                    year = line.split(':')[1]
                    month = next((w for w in pack if w.isdigit()), None)
                    # The label itself contains '日', so it must not be
                    # mistaken for the day token.
                    day = next((w[:2] for w in pack
                                if '日' in w and '開票日期' not in w), None)
                    if month is not None and day is not None:
                        info['開票日期'] = f'{year}-{month}-{day}'
                elif '機器編號:' in line:
                    number = next((w for w in pack
                                   if w.isdigit() and len(w) > 10), None)
                    if number is not None:
                        info['機器編號'] = number
                elif '碼:' in line:
                    # Check code: the group after the colon plus up to three
                    # following words (fewer when the line is cut short).
                    groups = [line.split(':')[1]] + pack[idx + 1:idx + 4]
                    info['校驗碼'] = ' '.join(groups)
                elif '收款人:' in line:
                    info['收款人'] = line.split(':')[1]
                elif '開票人:' in line:
                    info['開票人'] = line.split(':')[1]

        return info

    @staticmethod
    def _cell_lines(row):
        """Yield the text lines of every non-empty cell in a table row."""
        for cell in row:
            if cell:
                yield cell.splitlines()

    def _extrace_from_table(self, table):
        """Extract buyer, item, total and seller fields from the main table.

        Args:
            table: list of rows, each a list of cell strings (or ``None``),
                as returned by ``page.extract_tables()``.

        Returns:
            Dict of the fields found. An empty dict is returned when the
            table does not have exactly four rows, so the caller can always
            pass the result to ``dict.update``.
        """
        info = {}
        if not table or len(table) != 4:
            return info

        # Row 0: buyer
        for lines in self._cell_lines(table[0]):
            for line in lines:
                if self._NAME_LABEL in line:
                    info['購買方名稱'] = line.split(':')[1]
                elif len(line) == 18 and line.isalnum():
                    info['購買方稅號'] = line
                elif len(line) == 27:
                    info.setdefault('密碼', []).append(line)

        # Row 1: item details; each column is recognised by its header text
        for lines in self._cell_lines(table[1]):
            for line in lines:
                if '貨物或應稅勞務、服務名稱' in line:
                    # Everything between the header and the last line.
                    info['商品'] = lines[1:-1]
                    break

                if '金  額' in line:
                    # Last line of the column with its first character
                    # dropped (presumably the currency sign -- verify).
                    info['總金額'] = lines[-1][1:]
                    break

                if '稅  額' in line:
                    info['總稅額'] = lines[-1][1:]
                    break

        # Row 2: grand total
        for lines in self._cell_lines(table[2]):
            for line in lines:
                if '¥' in line:
                    # Take what follows the currency sign, wherever the sign
                    # sits in the line, rather than assuming it comes first.
                    info['總計'] = line.rsplit('¥', 1)[1].strip()

        # Row 3: seller
        for lines in self._cell_lines(table[3]):
            for line in lines:
                if self._NAME_LABEL in line:
                    info['銷售方名稱'] = line.split(':')[1]
                elif len(line) == 18 and line.isalnum():
                    info['銷售方稅號'] = line

        return info


if __name__ == '__main__':
    # Script entry point: run the interactive shell until the user presses
    # CTRL+C, then print a goodbye message instead of a traceback.
    try:
        shell = FapiaoShell()
        shell.cmdloop()
    except KeyboardInterrupt:
        print('\n\n再見！')

免責聲明！

本站轉載的文章僅供個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯繫本站郵箱 yoyou2525@163.com 刪除。

猜您在找 Python網絡爬蟲與信息提取（二）—— BeautifulSoup python網絡爬蟲與信息提取——1.requests庫入門 Python自然語言處理---信息提取 python運用 - log信息提取（知識：遍歷 | os ）（中國大學mooc）Python網絡爬蟲與信息提取 Python自然語言處理學習筆記之信息提取步驟&分塊（chunking）第3次作業-MOOC學習筆記：Python網絡爬蟲與信息提取 java自定義注解及其信息提取 Python通過解壓ofd文件獲取發票信息使用javascript實現身份證校驗與信息提取