最近在實習,老闆一下子發給了我120份研報,然而很多都是沒用的。聰明的大腦一定要想辦法讓電腦幫助自己完成簡單的工作!
下面是用Python篩選含有“丙烯”關鍵字的研報的程序,由於文件的保密性只能貼出代碼。
注意:
pip install pdfminer3k而不是pdfminer
導入的時候名字是pdfminer,原因我猜是python版本的問題
# -*- coding: utf-8 -*-
"""
Created on Fri May 10 16:54:16 2019
@author: didi.lv
"""
import os
from io import StringIO
import shutil
# 注意:一定要pip install pdfminer3k 而不是pdfminer
from pdfminer.pdfinterp import PDFResourceManager, process_pdf
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
# 讀取pdf的函數,返回內容
def readPdf(pdf_file):
    """Extract the text of a PDF and return it as a single string.

    Args:
        pdf_file: An open binary file object for the PDF (the caller opens
            it with mode ``'rb'`` and remains responsible for closing it).

    Returns:
        str: Everything the ``TextConverter`` wrote to the in-memory buffer
        while the document was processed.

    Raises:
        Whatever ``process_pdf`` raises for an unreadable document; the
        converter and the buffer are still released in that case.
    """
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    # The buffer is closed automatically, even if parsing fails midway.
    with StringIO() as retstr:
        device = TextConverter(rsrcmgr=rsrcmgr, outfp=retstr, laparams=laparams)
        try:
            process_pdf(rsrcmgr=rsrcmgr, device=device, fp=pdf_file)
        finally:
            # Always release the converter, not only on the success path.
            device.close()
        # Read the text before the ``with`` block closes the buffer.
        return retstr.getvalue()
def file_name(file_dir):
    """Return the names of the files located directly in ``file_dir``.

    Args:
        file_dir: Path of the directory to list.

    Returns:
        list: File names (without directory part) found at the top level of
        ``file_dir``. Files inside sub-directories are not included. An
        empty list is returned when ``file_dir`` does not exist or cannot
        be read.
    """
    # os.walk yields the top directory first; only that first entry is
    # needed. The default covers the case where the walk yields nothing.
    _root, _dirs, files = next(os.walk(file_dir), (file_dir, [], []))
    return files
if __name__ == '__main__':
    # Directory that is scanned for candidate PDF reports.
    file_dir = r'C:\\Users\didi.lv\Desktop\filenames'
    # NOTE(review): matching reports are copied out of `filenames1`, not out
    # of the scanned `filenames` directory. This mirrors the original
    # script; confirm that both directories hold the same files.
    origin_dir = r'C:\\Users\didi.lv\Desktop\filenames1'
    target_dir = r'C:\\Users\didi.lv\Desktop\filenames2'
    # Reports whose extracted text contains this keyword are kept.
    keyword = '丙烯'
    # NOTE(review): the original script skipped the first 48 files
    # (presumably resuming an interrupted run -- confirm). Set to 0 to
    # process every report.
    start_index = 48

    # Work on the real list of names instead of parsing str(list), which
    # broke on any non-PDF entry or unusual file name.
    pdf_names = [
        name for name in file_name(file_dir)
        if name.lower().endswith('.pdf')
    ]

    for i, name_check in enumerate(pdf_names[start_index:], start=1):
        print('--------------------------------------------------------')
        print(i)
        name_check_open = os.path.join(file_dir, name_check)
        # The context manager closes the PDF handle after extraction.
        with open(name_check_open, 'rb') as pdf_file:
            content = readPdf(pdf_file)
        if keyword in content:
            file_origin = os.path.join(origin_dir, name_check)
            file_target = os.path.join(target_dir, name_check)
            shutil.copyfile(file_origin, file_target)
            print('copy No. %d file' % i)
原文:https://blog.csdn.net/Eric2016_Lv/article/details/90082280
