最近在實習,老板一下子發給了我120份研報,然而很多都是沒用的。聰明的大腦一定要想辦法讓電腦幫助自己完成簡單的工作!
下面是Python篩選含有“丙烯”關鍵字的程序,由於文件的保密性只能貼出代碼。
注意:
pip install pdfminer3k而不是pdfminer
導入的時候名字是pdfminer,原因我猜是Python版本的問題
# -*- coding: utf-8 -*-
"""Copy every PDF report whose extracted text contains a keyword.

Created on Fri May 10 16:54:16 2019

@author: didi.lv
"""

import os
import shutil
from io import StringIO

# Directory that is listed and whose PDFs are opened and searched.
SCAN_DIR = r'C:\\Users\didi.lv\Desktop\filenames'
# NOTE(review): matching files are copied out of COPY_FROM_DIR, which is a
# different folder from SCAN_DIR. Kept exactly as in the original script;
# confirm whether both are meant to be the same folder.
COPY_FROM_DIR = r'C:\\Users\didi.lv\Desktop\filenames1'
COPY_TO_DIR = r'C:\\Users\didi.lv\Desktop\filenames2'
# Text that must appear in a PDF for it to be copied.
KEYWORD = '丙烯'
# NOTE(review): the original loop started at the 49th PDF (slice [48:]);
# presumably left over from resuming an interrupted run. Set to 0 to scan
# every file.
SKIP_COUNT = 48


def readPdf(pdf_file):
    """Return the text extracted from an open PDF file.

    Args:
        pdf_file: file object opened in binary mode ('rb') on a PDF.

    Returns:
        str: everything the pdfminer ``TextConverter`` wrote while the
        document was processed.
    """
    # Install with ``pip install pdfminer3k`` (not ``pdfminer``); the package
    # is still imported under the name ``pdfminer``. Imported here, in the
    # only function that needs it, so the rest of the module stays importable
    # when the package is missing.
    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams
    from pdfminer.pdfinterp import PDFResourceManager, process_pdf

    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    try:
        device = TextConverter(rsrcmgr=rsrcmgr, outfp=retstr,
                               laparams=LAParams())
        try:
            process_pdf(rsrcmgr=rsrcmgr, device=device, fp=pdf_file)
        finally:
            # Close the converter even when parsing fails part-way.
            device.close()
        return retstr.getvalue()
    finally:
        retstr.close()


def file_name(file_dir):
    """Return the names of the files located directly inside ``file_dir``.

    Args:
        file_dir: path of the directory to list.

    Returns:
        list: bare file names (no directory part) in the order ``os.walk``
        reports them. Empty when ``os.walk`` yields nothing, e.g. because
        the directory does not exist.
    """
    # The first tuple produced by os.walk describes file_dir itself. The
    # original returned the loop variable after the loop, i.e. the listing of
    # whichever sub-directory happened to be visited last, and failed with an
    # unbound local when the walk was empty.
    _root, _dirs, files = next(os.walk(file_dir), (file_dir, [], []))
    return files


def main():
    """Scan SCAN_DIR and copy each PDF that mentions KEYWORD to COPY_TO_DIR."""
    # Filter on the suffix directly instead of parsing str(list).
    pdf_names = [name for name in file_name(SCAN_DIR)
                 if name.lower().endswith('.pdf')]

    # The progress counter starts at 1 regardless of how many files are skipped.
    for i, name_check in enumerate(pdf_names[SKIP_COUNT:], start=1):
        print('--------------------------------------------------------')
        print(i)

        # The with-block closes the handle even if text extraction fails.
        with open(os.path.join(SCAN_DIR, name_check), 'rb') as pdf_file:
            content = readPdf(pdf_file)

        if KEYWORD in content:
            # copyfile does not create missing directories.
            os.makedirs(COPY_TO_DIR, exist_ok=True)
            file_origin = os.path.join(COPY_FROM_DIR, name_check)
            file_target = os.path.join(COPY_TO_DIR, name_check)
            shutil.copyfile(file_origin, file_target)
            print('copy No. %d file' % i)


if __name__ == '__main__':
    main()
原文:https://blog.csdn.net/Eric2016_Lv/article/details/90082280