pdfminer實現pdf布局分析 python （Implementing PDF layout analysis with pdfminer in Python）

本文轉載自查看原文 2019-12-12 15:19 341

使用pdfminer實現pdf文件的布局分析 python

參考資料：

https://stackoverflow.com/questions/22898145/how-to-extract-text-and-text-coordinates-from-a-pdf-file?noredirect=1

import cv2
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator
import pdfminer
import numpy as np
import matplotlib.pyplot as plt
from pdf2image import convert_from_path

# Path of the PDF document that is analysed and rendered.
image_path = 'literature.pdf'

# Layout categories that get outlined; entries pair up by position with
# the entries of `color` below.
layout_type = ['LTTextBox', 'LTFigure', 'LTImage', 'LTCurve', 'LTRect']
# Outline colours, one per category:
# text -> red, figure -> green, image -> blue, curve -> yellow, rect -> purple.
color = [
    (255, 0, 0),
    (0, 255, 0),
    (0, 0, 255),
    (255, 255, 0),
    (160, 32, 240),
]

# Lookup table: category name -> outline colour.
draw_color = {name: rgb for name, rgb in zip(layout_type, color)}


def parse_obj(lt_objs):
    """Group the bounding boxes of layout objects by layout category.

    Args:
        lt_objs: Iterable of pdfminer layout objects, each exposing a
            ``bbox`` attribute.

    Returns:
        A dict with one key per entry of ``layout_type``. Each value is the
        list of ``bbox`` values of the objects that fall into that category,
        in the order they were encountered.

    Raises:
        RuntimeError: If an object matches none of the known categories.
    """
    # The first matching entry wins, so subclasses must precede their base
    # class: LTRect derives from LTCurve and would otherwise always be filed
    # under 'LTCurve'. LTTextBox is the common base of the horizontal and
    # vertical text boxes, so both writing directions are accepted.
    categories = (
        ('LTTextBox', pdfminer.layout.LTTextBox),
        ('LTFigure', pdfminer.layout.LTFigure),
        ('LTImage', pdfminer.layout.LTImage),
        ('LTRect', pdfminer.layout.LTRect),
        ('LTCurve', pdfminer.layout.LTCurve),
    )

    boxs = {x: [] for x in layout_type}
    for obj in lt_objs:
        for name, cls in categories:
            if isinstance(obj, cls):
                boxs[name].append(obj.bbox)
                break
        else:
            # A bare `raise` here has no active exception to re-raise and only
            # produces an opaque RuntimeError; keep the exception type but
            # say which object was not recognised.
            raise RuntimeError(
                'unsupported layout object: {}'.format(type(obj).__name__))
    return boxs


# Step 1: run pdfminer's layout analysis and record, for every page, the
# rounded page bounding box together with the categorised object boxes.
#
# The file is opened in a `with` block so the handle is always released.
# The parser is bound to this handle, so the page loop stays inside the block.
with open(image_path, 'rb') as fp:
    # Create a PDF parser object associated with the file object.
    parser = PDFParser(fp)
    # Create a PDF document object that stores the document structure.
    # Supply the password for initialization.
    password = '123'
    document = PDFDocument(parser, password)
    # Check if the document allows text extraction. If not, abort.
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed
    # Create a PDF resource manager object that stores shared resources.
    rsrcmgr = PDFResourceManager()

    # Set parameters for analysis.
    laparams = LAParams()
    # Create a PDF page aggregator object.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    page_boxs = []
    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
        # Receive the LTPage object for the page.
        layout = device.get_result()
        # Iterate the page object itself rather than its private `_objs`
        # attribute; parse_obj only needs an iterable of layout objects.
        boxs = parse_obj(layout)
        page_sized = tuple(round(i) for i in layout.bbox)
        page_boxs.append((page_sized, boxs))

# Step 2: render every page to an image and draw the collected boxes on it.
image = convert_from_path(image_path)

assert len(image) == len(page_boxs), "The number of boxes doesn't match the number of pictures"
for image_pil, (page_sized, boxs) in zip(image, page_boxs):
    print(boxs)
    # Elements 2 and 3 of the rounded page bbox are used as the page width
    # and height; the height is needed below to flip the y axis.
    page_width, page_boxs_height = page_sized[2:4]

    # Convert the rendered page to a numpy array and scale it to the page
    # size once, so that box coordinates map directly onto pixels. The target
    # size is the same for every category, so the resize sits outside the
    # category loop.
    image_numpy = cv2.resize(np.array(image_pil), (page_width, page_boxs_height),
                             interpolation=cv2.INTER_AREA)

    # Draw every box of every category in that category's colour.
    for key, values in boxs.items():
        for value in values:
            # The y-coordinates are given as the distance from the bottom of the page.
            real_box = (value[0], page_boxs_height - value[3], value[2], page_boxs_height - value[1])
            real_box_integer = tuple(round(jj) for jj in real_box)
            cv2.rectangle(image_numpy, real_box_integer[:2], real_box_integer[2:], draw_color[key], 2)

    plt.figure()
    plt.imshow(image_numpy)
    plt.show()

結果如下：

此代碼只涉及到PDF文件的布局分析，沒有涉及到PDF轉成可編輯文檔。供大家參考，有問題希望大家多多指正。

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 Python使用PDFMiner解析PDF Python對pdf中的關鍵字過濾（pdfminer3k或pdfminer使用） python3使用pdfminer3k解析pdf文件 python3用pdfminer3k在線讀取pdf文件 Python：解析PDF文本及表格——pdfminer、tabula、pdfplumber 的用法及對比 Python：解析PDF文本及表格——pdfminer、tabula、pdfplumber 的用法及對比 pdfminer批量處理PDF文件【Python 庫】解析PDF文本及表格——pdfminer、tabula、pdfplumber 的用法及對比使用python的pdfminer庫提取pdf中的圖像之填坑記 pdfminer API介紹：pdf網頁爬蟲