直接上代碼
# -*- encoding:utf-8 -*-
"""
author: lgh

Simple helpers for doc -> pdf, doc -> html and pdf -> doc conversion.

The Word conversions drive Microsoft Word through COM (pywin32); the PDF
text extraction depends on pdfminer3k (``pip install pdfminer3k``).
"""
import os

from win32com.client import Dispatch, constants
from pdfminer.pdfparser import PDFParser, PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.layout import LAParams, LTTextBoxHorizontal
from pdfminer.converter import PDFPageAggregator
from pdfminer.pdfinterp import PDFTextExtractionNotAllowed

# Word enumeration values, spelled out numerically on purpose:
# ``win32com.client.constants`` is only populated once a makepy/gencache
# wrapper for Word has been generated, so looking names up on it after a
# plain ``Dispatch`` can raise AttributeError.
_WD_EXPORT_FORMAT_PDF = 17               # WdExportFormat.wdExportFormatPDF
_WD_EXPORT_DOCUMENT_WITH_MARKUP = 7      # WdExportItem.wdExportDocumentWithMarkup
_WD_EXPORT_CREATE_HEADING_BOOKMARKS = 1  # WdExportCreateBookmarks.wdExportCreateHeadingBookmarks
_WD_FORMAT_HTML = 8                      # WdSaveFormat.wdFormatHTML
_WD_DO_NOT_SAVE_CHANGES = 0              # WdSaveOptions.wdDoNotSaveChanges


def doc2pdf(input, output):
    """Export a Word document to PDF through the Word COM API.

    Args:
        input: Path of the Word document to open (opened read-only).
        output: Path of the PDF file to write.

    Returns:
        True if the export succeeded, False if any exception was raised
        while opening or exporting (the exception is printed).
    """
    w = Dispatch('Word.Application')
    try:
        # Paths are made absolute because they are resolved by the Word
        # process, not relative to this script's working directory.
        doc = w.Documents.Open(os.path.abspath(input), ReadOnly=1)
        # Export with markup and create PDF bookmarks from headings.
        doc.ExportAsFixedFormat(
            os.path.abspath(output),
            _WD_EXPORT_FORMAT_PDF,
            Item=_WD_EXPORT_DOCUMENT_WITH_MARKUP,
            CreateBookmarks=_WD_EXPORT_CREATE_HEADING_BOOKMARKS,
        )
        return True
    except Exception as e:
        print(e)
        return False
    finally:
        # Always shut Word down, discarding any changes to open documents.
        w.Quit(_WD_DO_NOT_SAVE_CHANGES)


def doc2html(input, output):
    """Save a Word document as HTML through the Word COM API.

    Args:
        input: Path of the Word document to open (opened read-only).
        output: Path of the HTML file to write.

    Returns:
        True if the save succeeded, False if any exception was raised
        while opening or saving (the exception is printed).
    """
    w = Dispatch('Word.Application')
    try:
        # Paths are made absolute because they are resolved by the Word
        # process, not relative to this script's working directory.
        doc = w.Documents.Open(os.path.abspath(input), ReadOnly=1)
        # 8 == wdFormatHTML; other WdSaveFormat values select other formats.
        doc.SaveAs(os.path.abspath(output), _WD_FORMAT_HTML)
        return True
    except Exception as e:
        print(e)
        return False
    finally:
        # Always shut Word down, discarding any changes to open documents.
        w.Quit(_WD_DO_NOT_SAVE_CHANGES)


def pdf2doc(input, output):
    """Extract the text of a PDF into a UTF-8 text file.

    Only horizontal text boxes are extracted; each one is written followed
    by a newline. The output is plain text regardless of the extension
    given in ``output``. An existing output file is overwritten.

    Args:
        input: Path of the PDF file to read.
        output: Path of the text file to write.

    Returns:
        True on success, False if any exception was raised (including
        PDFTextExtractionNotAllowed when the PDF forbids extraction); the
        exception is printed.
    """
    try:
        with open(input, 'rb') as f:
            parser = PDFParser(f)
            doc = PDFDocument()
            # The parser and the document must be linked in both directions.
            parser.set_document(doc)
            doc.set_parser(parser)
            # Initialise the document; no password is supplied.
            doc.initialize()
            if not doc.is_extractable:
                raise PDFTextExtractionNotAllowed

            rsrcmgr = PDFResourceManager()
            device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
            interpreter = PDFPageInterpreter(rsrcmgr, device)

            # Open the output once, and only after the PDF is known to be
            # extractable, instead of reopening it for every text box.
            with open(output, 'w', encoding='utf-8') as out:
                for page in doc.get_pages():
                    interpreter.process_page(page)
                    for x in device.get_result():
                        if isinstance(x, LTTextBoxHorizontal):
                            out.write(x.get_text() + '\n')
        return True
    except Exception as e:
        print(e)
        return False


def main():
    """Run a sample PDF-to-text conversion and report the result."""
    # doc2pdf(src, dst) or doc2html(src, dst) can be called here instead.
    src = r'F:\save_data\流暢的Python.pdf'
    dst = r'F:\save_data\test.doc'
    rc = pdf2doc(src, dst)
    if rc:
        print('轉換成功')
    else:
        print('轉換失敗')


if __name__ == '__main__':
    main()
以上的doc2pdf和doc2html其實是通過com來調用office API(pdf2doc則是用pdfminer解析PDF),其他語言貌似也可以通過com來調用
當然你也可以用上面的代碼將word文件轉換成任意格式文件(只要office 2007支持,比如將word文件轉換成PDF文件,把doc2html中SaveAs的格式參數8改成17即可),下面是office 2007支持的全部文件格式對應表:
wdFormatDocument = 0
wdFormatDocument97 = 0
wdFormatDocumentDefault = 16
wdFormatDOSText = 4
wdFormatDOSTextLineBreaks = 5
wdFormatEncodedText = 7
wdFormatFilteredHTML = 10
wdFormatFlatXML = 19
wdFormatFlatXMLMacroEnabled = 20
wdFormatFlatXMLTemplate = 21
wdFormatFlatXMLTemplateMacroEnabled = 22
wdFormatHTML = 8
wdFormatPDF = 17
wdFormatRTF = 6
wdFormatTemplate = 1
wdFormatTemplate97 = 1
wdFormatText = 2
wdFormatTextLineBreaks = 3
wdFormatUnicodeText = 7
wdFormatWebArchive = 9
wdFormatXML = 11
wdFormatXMLDocument = 12
wdFormatXMLDocumentMacroEnabled = 13
wdFormatXMLTemplate = 14
wdFormatXMLTemplateMacroEnabled = 15
wdFormatXPS = 18

照著字面意思應該能對應到相應的文件格式,如果你是office 2003可能支持不了這麼多格式。word文件轉html有兩種格式可選wdFormatHTML、wdFormatFilteredHTML(對應數字8、10),區別是如果是wdFormatHTML格式的話,word文件裡面的公式等ole對象將會存儲成wmf格式,而選用wdFormatFilteredHTML的話公式圖片將存儲為gif格式,而且目測可以看出用wdFormatFilteredHTML生成的HTML明顯比wdFormatHTML要乾淨許多。
參考自https://blog.csdn.net/binger819623/article/details/6770932
