python3將docx轉換成pdf,html文件,pdf轉doc文件


直接上代碼

# -*- encoding:utf-8 -*-
"""
    author:lgh
    簡單的doc轉pdf,html,pdf轉doc腳本
    依賴庫pdfminer3k,pip install pdfminer3k即可
"""

from win32com.client import Dispatch, constants

from pdfminer.pdfparser import PDFParser, PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.layout import LAParams, LTTextBoxHorizontal
from pdfminer.converter import PDFPageAggregator
from pdfminer.pdfinterp import PDFTextExtractionNotAllowed

def doc2pdf(input, output):
    """Export a Word document to PDF through Word's COM automation API.

    Args:
        input: Path of the Word document to open (opened read-only).
        output: Path of the PDF file to write.

    Returns:
        True if the document was opened and exported, False if either step
        raised (the exception is printed, not re-raised).
    """
    import os

    # Literal values of the Word enumerations used below.  Looking them up
    # via ``win32com.client.constants`` only works when a makepy-generated
    # type library cache exists; with a late-bound ``Dispatch`` object the
    # lookup raises AttributeError -- and in the ``finally`` clause that
    # would prevent Word from being shut down.
    wd_export_format_pdf = 17               # WdExportFormat.wdExportFormatPDF
    wd_export_document_with_markup = 7      # WdExportItem.wdExportDocumentWithMarkup
    wd_export_create_heading_bookmarks = 1  # WdExportCreateBookmarks.wdExportCreateHeadingBookmarks
    wd_do_not_save_changes = 0              # WdSaveOptions.wdDoNotSaveChanges

    w = Dispatch('Word.Application')
    try:
        # Word resolves relative paths against its own working directory,
        # not the Python process's, so hand it absolute paths.
        doc = w.Documents.Open(os.path.abspath(input), ReadOnly=1)
        doc.ExportAsFixedFormat(
            os.path.abspath(output),
            wd_export_format_pdf,
            Item=wd_export_document_with_markup,
            CreateBookmarks=wd_export_create_heading_bookmarks,
        )
        return True
    except Exception as e:
        print(e)
        return False
    finally:
        # Always shut the Word instance down, discarding any changes.
        w.Quit(wd_do_not_save_changes)

def doc2html(input, output):
    """Save a Word document as HTML through Word's COM automation API.

    Args:
        input: Path of the Word document to open (opened read-only).
        output: Path of the HTML file to write.

    Returns:
        True if the document was opened and saved, False if either step
        raised (the exception is printed, not re-raised).
    """
    import os

    # Literal enumeration values are used instead of
    # ``win32com.client.constants`` because that lookup raises
    # AttributeError when no makepy-generated cache exists -- and in the
    # ``finally`` clause that would prevent Word from being shut down.
    wd_format_html = 8          # WdSaveFormat.wdFormatHTML
    wd_do_not_save_changes = 0  # WdSaveOptions.wdDoNotSaveChanges

    w = Dispatch('Word.Application')
    try:
        # Word resolves relative paths against its own working directory,
        # not the Python process's, so hand it absolute paths.
        doc = w.Documents.Open(os.path.abspath(input), ReadOnly=1)
        doc.SaveAs(os.path.abspath(output), wd_format_html)
        return True
    except Exception as e:
        print(e)
        return False
    finally:
        # Always shut the Word instance down, discarding any changes.
        w.Quit(wd_do_not_save_changes)

def pdf2doc(input, output):
    """Extract the horizontal text boxes of a PDF into a text file.

    The output is UTF-8 plain text (one extracted text box followed by a
    newline each), whatever extension ``output`` carries; it is not a
    binary Word document.

    Args:
        input: Path of the PDF file to read.
        output: Path of the text file to write.  An existing file is
            overwritten.

    Returns:
        True if extraction completed, False if any step raised -- including
        a PDF that does not allow text extraction (the exception is printed,
        not re-raised).
    """
    try:
        with open(input, 'rb') as f:
            parser = PDFParser(f)
            doc = PDFDocument()
            # Parser and document must be linked in both directions.
            parser.set_document(doc)
            doc.set_parser(parser)
            # No password is supplied.
            doc.initialize()
            if not doc.is_extractable:
                raise PDFTextExtractionNotAllowed

            rsrcmgr = PDFResourceManager()
            device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # Open the output once, in write mode: reopening it in append
            # mode for every text box was slow and made a second run append
            # to the result of the previous one.
            with open(output, 'w', encoding='utf-8') as out:
                for page in doc.get_pages():
                    interpreter.process_page(page)
                    for element in device.get_result():
                        if isinstance(element, LTTextBoxHorizontal):
                            out.write(element.get_text() + '\n')
        return True
    except Exception as e:
        print(e)
        return False


def main():
    """Run the sample PDF-to-text conversion and print whether it succeeded."""
    # The Word-based converters accept the same (input, output) pair:
    #   doc2pdf(src_path, dst_path) / doc2html(src_path, dst_path)
    src_path = r'F:\save_data\流暢的Python.pdf'
    dst_path = r'F:\save_data\test.doc'
    succeeded = pdf2doc(src_path, dst_path)
    print('轉換成功' if succeeded else '轉換失敗')


if __name__ == '__main__':
    main()

以上的doc2pdf和doc2html其實是通過COM來調用Office API(pdf2doc則是用pdfminer提取PDF中的純文本),其他支持COM的語言貌似也可以

當然你也可以用上面的代碼將word文件轉換成任意格式文件(只要office 2007支持,比如將word文件轉換成PDF文件,把doc2html中傳給SaveAs的文件格式值8改成17即可),下面是office 2007支持的全部文件格式對應表:

wdFormatDocument                    =  0
wdFormatDocument97                  =  0
wdFormatDocumentDefault             = 16
wdFormatDOSText                     =  4
wdFormatDOSTextLineBreaks           =  5
wdFormatEncodedText                 =  7
wdFormatFilteredHTML                = 10
wdFormatFlatXML                     = 19
wdFormatFlatXMLMacroEnabled         = 20
wdFormatFlatXMLTemplate             = 21
wdFormatFlatXMLTemplateMacroEnabled = 22
wdFormatHTML                        =  8
wdFormatPDF                         = 17
wdFormatRTF                         =  6
wdFormatTemplate                    =  1
wdFormatTemplate97                  =  1
wdFormatText                        =  2
wdFormatTextLineBreaks              =  3
wdFormatUnicodeText                 =  7
wdFormatWebArchive                  =  9
wdFormatXML                         = 11
wdFormatXMLDocument                 = 12
wdFormatXMLDocumentMacroEnabled     = 13
wdFormatXMLTemplate                 = 14
wdFormatXMLTemplateMacroEnabled     = 15
wdFormatXPS                         = 18

照着字面意思應該能對應到相應的文件格式,如果你是office 2003可能支持不了這麼多格式。word文件轉html有兩種格式可選wdFormatHTML、wdFormatFilteredHTML(對應數字8、10),區別是如果是wdFormatHTML格式的話,word文件裡面的公式等ole對象將會存儲成wmf格式,而選用wdFormatFilteredHTML的話公式圖片將存儲為gif格式,而且目測可以看出用wdFormatFilteredHTML生成的HTML明顯比wdFormatHTML要乾淨許多。

 

參考自https://blog.csdn.net/binger819623/article/details/6770932


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯繫本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM