Python 讀取 txt
# coding=utf-8
"""Fetch a plain-text resource over HTTP and print it.

The example downloads Wikipedia's ``robots.txt`` (its crawler rules) and
writes the response body to standard output.
"""
import requests

# Crawler rules published by the site.
url = "https://en.wikipedia.org/robots.txt"

# Read the network resource.  A timeout is passed explicitly because
# requests waits indefinitely by default, which would hang the script on a
# stalled connection.
res = requests.get(url, timeout=10).text

# Print the downloaded text.
print(res)
Python 讀取 PDF
# coding=utf-8
"""Extract and print the text of every page in a PDF using pdfminer3k.

Install the dependency with ``pip install pdfminer3k``.  The sample PDF
ships with the pdfminer3k source archive
(https://pypi.org/project/pdfminer3k/1.0.1/#files), in the folder
``pdfminer3k-master/samples/nonfree``.
"""
from pdfminer.pdfparser import PDFParser, PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator
import requests

# Open the document inside a context manager so the file handle is closed
# even if parsing fails.  All processing stays inside the block because the
# parser reads from the open file while pages are being interpreted.
with open("naacl06-shinyama.pdf", "rb") as fp:
    # Create a parser bound to the file.
    parser = PDFParser(fp)
    # Object that represents the PDF document.
    doc = PDFDocument()
    # Link the parser and the document object to each other.
    parser.set_document(doc)
    doc.set_parser(parser)
    # Initialise the document; the argument is the password (empty here).
    doc.initialize("")

    # Resource manager used by the interpreter and the device.
    resource = PDFResourceManager()
    # Layout-analysis parameters (library defaults).
    laparam = LAParams()
    # Aggregator device that collects the layout of each processed page.
    device = PDFPageAggregator(resource, laparams=laparam)
    # Page interpreter that feeds page content to the device.
    interpreter = PDFPageInterpreter(resource, device)

    # Walk through the pages of the document.
    for page in doc.get_pages():
        # Let the interpreter process the page ...
        interpreter.process_page(page)
        # ... then fetch the aggregated layout for it.
        layout = device.get_result()
        for out in layout:
            # Only layout objects exposing get_text() carry text.
            if hasattr(out, "get_text"):
                print(out.get_text())