"""Extract two text fields from every PDF in the current directory.

For each ``*.pdf`` file in the working directory, the horizontal text boxes
are read with pdfminer and one tab-separated line built from two of those
boxes is appended to a text file.
"""

import os
import pdb
import sys

from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.layout import *
from pdfminer.converter import PDFPageAggregator
from pdfminer.utils import PDFDocEncoding


# File that receives one appended line per processed PDF.
OUTPUT_FILE = '學生信息表.txt'

# Positions (within the list returned by get_msgs) of the two text boxes that
# are written out, and the number of leading characters dropped from each.
# NOTE(review): presumably the dropped characters are a fixed-width field
# label in the PDF template -- confirm against a sample document.
FIRST_FIELD_INDEX = 5
FIRST_FIELD_SKIP = 5
SECOND_FIELD_INDEX = 4
SECOND_FIELD_SKIP = 4


def decode_text(s):
    """Decode a PDFDocEncoding string to Unicode.

    Adds Python 3 compatibility to pdfminer's version.

    Args:
        s: A byte string or text string. A byte string starting with the
            UTF-16 big-endian byte order mark (``b'\\xfe\\xff'``) is decoded
            as UTF-16BE; anything else is mapped character by character
            through the ``PDFDocEncoding`` table.

    Returns:
        The decoded text.
    """
    if isinstance(s, bytes) and s.startswith(b'\xfe\xff'):
        # Drop the two-byte BOM; undecodable sequences are ignored.
        return s[2:].decode('utf-16be', 'ignore')
    # Iterating bytes yields ints on Python 3 and one-character strings on
    # Python 2, so normalise every item to its ordinal first.
    ords = (ord(c) if isinstance(c, str) else c for c in s)
    return ''.join(PDFDocEncoding[o] for o in ords)


def get_msgs(inputFile):
    """Return the stripped text of every horizontal text box in a PDF.

    Args:
        inputFile: Path of the PDF file to read.

    Returns:
        A list of strings, one per ``LTTextBoxHorizontal`` found, in page
        order and then in the order the layout yields them.

    Raises:
        PDFTextExtractionNotAllowed: If the document reports that it is not
            extractable.
    """
    msgs = []
    # The context manager closes the file even when parsing raises.
    with open(inputFile, 'rb') as fp:
        # Parser over the raw file, and the document object that stores the
        # document structure.
        parser = PDFParser(fp)
        document = PDFDocument(parser)

        # Check whether the document allows text extraction.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Resource manager that stores shared resources.
        rsrcmgr = PDFResourceManager()
        # Layout-analysis parameters (library defaults).
        laparams = LAParams()
        # Aggregating device: collects each processed page as an LTPage.
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Pages must be processed while the file is still open.
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            # Layout (LTPage) of the page that was just processed.
            layout = device.get_result()
            msgs.extend(
                element.get_text().strip()
                for element in layout
                if isinstance(element, LTTextBoxHorizontal)
            )
    return msgs


def main():
    """Append one tab-separated line per PDF in the current directory."""
    needed = max(FIRST_FIELD_INDEX, SECOND_FIELD_INDEX) + 1
    for name in os.listdir('.'):
        if os.path.splitext(name)[-1] != '.pdf':
            continue
        msgs = get_msgs(name)
        if len(msgs) < needed:
            # Too few text boxes to hold both fields: skip this file instead
            # of aborting the whole batch with an IndexError.
            sys.stderr.write(
                'skipping %s: expected at least %d text boxes, found %d\n'
                % (name, needed, len(msgs))
            )
            continue
        line = (
            msgs[FIRST_FIELD_INDEX][FIRST_FIELD_SKIP:]
            + '\t'
            + msgs[SECOND_FIELD_INDEX][SECOND_FIELD_SKIP:]
        )
        # Append after every file so earlier results survive a later failure.
        # NOTE(review): the file is opened with the platform default encoding,
        # as before -- confirm that encoding can represent the extracted text.
        with open(OUTPUT_FILE, 'a') as f:
            f.write(line + '\n')


if __name__ == "__main__":
    main()