pdfminer實現pdf布局分析 python (pdfminer realize layout analysis with PDF python)


使用pdfminer實現pdf文件的布局分析 python

參考資料:

https://github.com/euske/pdfminer

https://stackoverflow.com/questions/22898145/how-to-extract-text-and-text-coordinates-from-a-pdf-file?noredirect=1

import cv2
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator
import pdfminer
import numpy as np
import matplotlib.pyplot as plt
from pdf2image import convert_from_path

image_path = 'literature.pdf'

layout_type = ['LTTextBox', 'LTFigure', 'LTImage', 'LTCurve', 'LTRect']
# Text:紅色, Figure:綠色, Image:藍色, Curve:黃色, Rect:紫色
color = [(255, 0, 0), (0, 255, 0), (0, 0, 255), (255, 255, 0), (160, 32, 240)]

draw_color = dict(zip(layout_type, color))


def parse_obj(lt_objs):

    boxs = {x: [] for x in layout_type}
    # loop over the object list
    for obj in lt_objs:

        if isinstance(obj, pdfminer.layout.LTTextBoxHorizontal):
            boxs['LTTextBox'].append(obj.bbox)
        elif isinstance(obj, pdfminer.layout.LTFigure):
            boxs['LTFigure'].append(obj.bbox)
        elif isinstance(obj, pdfminer.layout.LTImage):
            boxs['LTImage'].append(obj.bbox)
        elif isinstance(obj, pdfminer.layout.LTCurve):
            boxs['LTCurve'].append(obj.bbox)
        elif isinstance(obj, pdfminer.layout.LTRect):
            boxs['LTRect'].append(obj.bbox)
        else:
            raise
    return boxs


# Open a PDF file.
fp = open(image_path, 'rb')
# Create a PDF parser object associated with the file object.
parser = PDFParser(fp)
# Create a PDF document object that stores the document structure.
# Supply the password for initialization.
password = '123'
document = PDFDocument(parser, password)
# Check if the document allows text extraction. If not, abort.
if not document.is_extractable:
    raise PDFTextExtractionNotAllowed
# Create a PDF resource manager object that stores shared resources.
rsrcmgr = PDFResourceManager()

# Set parameters for analysis.
laparams = LAParams()
# Create a PDF page aggregator object.
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)

page_boxs = []
for page in PDFPage.create_pages(document):
    interpreter.process_page(page)
    # receive the LTPage object for the page.
    layout = device.get_result()
    # extract text from this object
    boxs = parse_obj(layout._objs)
    page_sized = tuple([round(i) for i in layout.bbox])
    page_boxs.append((page_sized, boxs))
    pass

image = convert_from_path(image_path)

assert len(image) == len(page_boxs), "The number of boxes doesn't match the number of pictures"
for i in range(len(image)):
    # 得到這一頁圖片
    image_pil = image[i]
    # 把這一頁的圖片格式轉成numpy類型
    image_numpy = np.array(image_pil)
    # 得到這一頁圖片德國高度,為了之后得到實際的box
    page_boxs_height = page_boxs[i][0][3]
    print(page_boxs[i][1])

    # 遍歷這一頁的框
    for key, values in page_boxs[i][1].items():
        # 把實際的圖片大小resize到頁面的大小
        image_numpy = cv2.resize(image_numpy, page_boxs[i][0][2:4], interpolation=cv2.INTER_AREA)
        for value in values:
            # The y-coordinates are given as the distance from the bottom of the page.
            real_box = (value[0], page_boxs_height-value[3], value[2], page_boxs_height-value[1])
            real_box_integer = tuple([round(jj) for jj in real_box])
            # 畫圖
            cv2.rectangle(image_numpy, real_box_integer[:2], real_box_integer[2:], draw_color[key], 2)
    plt.figure(), plt.imshow(image_numpy)
    plt.show()

結果如下:

此代碼只涉及到PDF文件的布局分析,沒有涉及到PDF轉成可編輯文檔。供大家參考,有問題希望大家多多指正


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM