Python3 中使用 docx 模塊處理 doc 或 docx 格式文件


 1 import os,shutil,docx,re,time
 2 from win32com import client as wc
 3 #從所有級聯目錄讀取文件到指定目錄內
 4 def count_files(file_dir):
 5     count=0
 6     for p,d,f in os.walk(file_dir):
 7         for c in f:
 8             if c.split('.')[-1]=="doc":
 9                 count +=1
10                 src_dir = os.path.join(p, c)
11                 print(src_dir)
12                 dst_dir = file_dir + "back"
13                 if not os.path.exists(dst_dir):
14                     os.makedirs(dst_dir)
15                 shutil.copy(src_dir, dst_dir)
16     return count
# Extract the e-mail addresses from every .docx resume under a directory.
# Relies on the python-docx package (pip install python-docx).
def count_mail(file_dir, dst_file):
    """Collect e-mail addresses from ``.docx`` files and write them to *dst_file*.

    The tree rooted at *file_dir* is walked recursively.  Each ``.docx`` file
    is opened with ``docx.Document`` and the text of every paragraph is
    searched for e-mail addresses.  Only ``doc.paragraphs`` is scanned.

    Args:
        file_dir: Root directory containing the ``.docx`` files.
        dst_file: Path of the text file to (over)write.  Every address found
            is written followed by ``;`` with no line breaks; duplicates are
            kept.

    Returns:
        None.  A confirmation banner is printed after the file is written.
    """
    # Compiled once, outside the loops.  The character classes deliberately
    # allow whitespace so that addresses typed with stray blanks still match.
    pattern = re.compile(r'''([a-zA-Z0-9._%+-]+@[a-zA-Z0-9\t\s.-]+(\.[a-zA-Z0-9\t\s]{2,4}))''', re.VERBOSE)
    mail_list = []
    for parent, _dirs, names in os.walk(file_dir):
        for name in names:
            # Only real .docx files can be parsed; "~$..." names are the lock
            # files Word leaves next to an open document.
            if name.startswith("~$") or os.path.splitext(name)[1].lower() != ".docx":
                continue
            document = docx.Document(os.path.join(parent, name))
            for para in document.paragraphs:
                for groups in pattern.findall(para.text):
                    # groups[0] is the whole address.  The pattern accepts any
                    # whitespace, so strip all of it rather than spaces only.
                    mail_list.append("".join(groups[0].split()) + ";")
    with open(dst_file, 'w') as out:
        out.writelines(mail_list)
    print("=====================郵件信息寫入成功===================")
# python-docx can only read .docx files, so legacy .doc files are first
# converted to .docx by driving Microsoft Word through win32com.
def docxTodoc(old_doc, new_doc):
    """Re-save every file under *old_doc* as a ``.docx`` file inside *new_doc*.

    A Word application object is obtained via ``wc.Dispatch``; the tree rooted
    at *old_doc* is walked recursively and each file is opened in Word and
    saved as ``<new_doc>/<base name>.docx``.  The target path and the current
    ``time.time()`` value are printed for every converted file.

    Args:
        old_doc: Root directory holding the documents to convert.
        new_doc: Existing directory that receives the converted files (flat;
            equal base names from different sub-directories overwrite).

    Returns:
        None.

    Note:
        The document is always closed and Word is always quit, even when a
        conversion raises, so a failed run does not leave a Word process
        behind.  The exception itself still propagates to the caller.
    """
    word = wc.Dispatch('Word.Application')
    try:
        for parent, _dirs, names in os.walk(old_doc):
            for name in names:
                # "~$..." names are Word's lock files, not documents.
                if name.startswith("~$"):
                    continue
                # Absolute paths: Word is a separate process and is not
                # guaranteed to share this script's working directory.
                src_path = os.path.abspath(os.path.join(parent, name))
                # splitext keeps dots inside the base name
                # ("john.smith.doc" -> "john.smith.docx").
                new_filepath = os.path.abspath(
                    os.path.join(new_doc, os.path.splitext(name)[0] + ".docx")
                )
                doc = word.Documents.Open(src_path)  # source document
                try:
                    print(new_filepath)
                    # 12 is presumably wdFormatXMLDocument (.docx) in Word's
                    # WdSaveFormat enumeration -- confirm against the Word
                    # object-model reference.
                    doc.SaveAs(new_filepath, 12, False, "", True, "", False, False, False, False)  # converted document
                finally:
                    doc.Close()
                print(time.time())
    finally:
        word.Quit()
42 
43 
44 
if __name__ == '__main__':
    # Script entry point: back up the exported .doc resumes, convert them to
    # .docx, then extract the e-mail addresses from the converted files.
    old_doc = r"C:\Users\icestick\Desktop\51job_導出簡歷_20180917"  # source tree holding the .doc files to convert
    new_doc = r"C:\Users\icestick\Desktop\new_doc"                  # target directory for the converted .docx files
    mail_extract = r"C:\Users\icestick\Desktop\test.txt"            # file that receives the extracted addresses

    # Back up the original .doc files and report how many were found.
    print(count_files(old_doc))

    # The conversion and extraction steps are the same whether or not the
    # target directory already existed; only its creation is conditional.
    # (An earlier extra count_mail() call before the conversion was dropped:
    # it wrote the same output file that the call below overwrites.)
    if not os.path.exists(new_doc):
        os.mkdir(new_doc)
        print("=====================目錄創建成功======================")
    docxTodoc(old_doc, new_doc)
    print("=====================docx格式轉換成功===================")
    count_mail(new_doc, mail_extract)

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯繫本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM