"""Collect .doc resumes, convert them to .docx with Word, and extract e-mail addresses.

Pipeline (see ``main``):

1. ``count_files``  - back up every ``.doc`` found under a directory tree.
2. ``docxTodoc``    - convert the Word documents to ``.docx`` through Word COM.
3. ``count_mail``   - scan the ``.docx`` files and write the addresses to a file.
"""
import os
import re
import shutil
import time

# python-docx (pip install python-docx) is only needed by count_mail, and
# pywin32 only by docxTodoc (Windows + Word).  The imports are guarded so the
# module can still be imported, and count_files used, when either is missing;
# the function that needs the package raises ImportError when called.
try:
    import docx
except ImportError:
    docx = None

try:
    from win32com import client as wc
except ImportError:
    wc = None


# File extensions (lower case) that docxTodoc hands to Word.
_WORD_EXTENSIONS = (".doc", ".docx")

# Prefix of the lock files Word creates next to an open document.
_WORD_LOCK_PREFIX = "~$"

# E-mail pattern.  Whitespace is tolerated inside the domain part and is
# stripped from each match afterwards.
# NOTE(review): because whitespace is allowed and the quantifier is greedy, a
# match can run on into following text that contains ".xx" - confirm whether
# tolerating whitespace is still wanted.
_MAIL_PATTERN = re.compile(
    r'''([a-zA-Z0-9._%+-]+@[a-zA-Z0-9\t\s.-]+(\.[a-zA-Z0-9\t\s]{2,4}))''',
    re.VERBOSE,
)


# Copy every .doc file from a directory tree into one flat backup directory.
def count_files(file_dir):
    """Back up all ``.doc`` files under *file_dir* and return how many were found.

    The files are copied into a sibling directory named ``<file_dir>back``
    (created on demand).  The backup is flat, so files with the same name in
    different sub-directories overwrite each other there; the returned count
    still includes each of them.

    Args:
        file_dir: Root directory that is walked recursively.

    Returns:
        Number of ``.doc`` files encountered (extension compared
        case-insensitively).
    """
    count = 0
    # Normalise first so a trailing separator cannot place the backup
    # directory inside the tree that is being walked.
    dst_dir = os.path.normpath(file_dir) + "back"
    for parent, _dirs, files in os.walk(file_dir):
        for name in files:
            # splitext, not split('.'): a file literally named "doc" has no
            # extension and must not be counted.
            if os.path.splitext(name)[1].lower() != ".doc":
                continue
            count += 1
            src_path = os.path.join(parent, name)
            print(src_path)
            os.makedirs(dst_dir, exist_ok=True)
            shutil.copy(src_path, dst_dir)
    return count


# Extract the e-mail addresses from every .docx resume (uses python-docx).
def count_mail(file_dir, dst_file):
    """Write every e-mail address found in the ``.docx`` files to *dst_file*.

    Only paragraph text is scanned.  Each address is written followed by
    ``;`` with no line breaks; *dst_file* is overwritten.

    Args:
        file_dir: Directory tree containing the ``.docx`` files.
        dst_file: Path of the text file that receives the addresses.

    Raises:
        ImportError: python-docx is not installed.
    """
    if docx is None:
        raise ImportError("python-docx is required: pip install python-docx")

    mail_list = []
    for parent, _dirs, files in os.walk(file_dir):
        for name in files:
            # python-docx can only open .docx; skip everything else as well
            # as Word's "~$" lock files, which are not real documents.
            if name.startswith(_WORD_LOCK_PREFIX) or not name.lower().endswith(".docx"):
                continue
            document = docx.Document(os.path.join(parent, name))
            for para in document.paragraphs:
                for groups in _MAIL_PATTERN.findall(para.text):
                    # groups[0] is the whole address.  The pattern admits any
                    # whitespace, so remove all of it, not only spaces.
                    mail_list.append("".join(groups[0].split()) + ";")

    with open(dst_file, 'w', encoding="utf-8") as out:
        out.writelines(mail_list)
    print("=====================郵件信息寫入成功===================")


# python-docx only handles .docx, so .doc files are converted through Word
# (win32com) first.
def docxTodoc(old_doc, new_doc):
    """Convert the Word documents under *old_doc* to ``.docx`` files in *new_doc*.

    Every ``.doc``/``.docx`` file in the tree is opened in Word and saved as
    ``<new_doc>/<original name without extension>.docx``.  The output is
    flat, so equal file names from different sub-directories overwrite each
    other.  *new_doc* must already exist.

    Args:
        old_doc: Directory tree holding the source documents.
        new_doc: Existing directory that receives the converted files.

    Raises:
        ImportError: pywin32 is not installed.
    """
    if wc is None:
        raise ImportError("pywin32 (win32com) is required: pip install pywin32")

    # Absolute paths are passed to Word - presumably it would otherwise
    # resolve relative paths against its own working directory rather than
    # this script's; TODO confirm.
    target_root = os.path.abspath(new_doc)
    word = wc.Dispatch('Word.Application')
    try:
        for parent, _dirs, files in os.walk(old_doc):
            for name in files:
                # splitext keeps dots inside the name ("a.b.doc" -> "a.b").
                stem, ext = os.path.splitext(name)
                if name.startswith(_WORD_LOCK_PREFIX) or ext.lower() not in _WORD_EXTENSIONS:
                    continue
                source_path = os.path.abspath(os.path.join(parent, name))
                new_filepath = os.path.join(target_root, stem + ".docx")
                doc = word.Documents.Open(source_path)  # source document
                try:
                    print(new_filepath)
                    # 12 is presumably wdFormatXMLDocument (.docx) - TODO
                    # confirm against the WdSaveFormat enumeration.
                    doc.SaveAs(new_filepath, 12, False, "", True, "", False, False, False, False)
                finally:
                    # Close even when saving fails so Word releases the file.
                    doc.Close()
                print(time.time())
    finally:
        # Always quit, otherwise a WINWORD process is left running.
        word.Quit()


def main():
    """Run the pipeline: back up, convert to .docx, then extract the addresses."""
    old_doc = r"C:\Users\icestick\Desktop\51job_導出簡歷_20180917"  # source directory with the .doc files
    new_doc = r"C:\Users\icestick\Desktop\new_doc"  # target directory for the .docx files
    mail_extract = r"C:\Users\icestick\Desktop\test.txt"  # file receiving the extracted addresses

    print(count_files(old_doc))

    if not os.path.exists(new_doc):
        os.mkdir(new_doc)
        print("=====================目錄創建成功======================")

    # Convert first; the addresses are read from the converted files.
    docxTodoc(old_doc, new_doc)
    print("=====================docx格式轉換成功===================")
    count_mail(new_doc, mail_extract)


if __name__ == '__main__':
    main()