python如何提取word內的圖片

本文轉載自查看原文 2022-03-24 18:01 1220

解壓.docx文件實現提取圖片

前言

.docx 文件其實就是一個 ZIP 壓縮文件，當我們將一個 .docx 文件直接解壓後，可以看到如下目錄

其中我們要找的圖片就在word/media目錄內，如圖

所以，要提取 word 內的圖片，就需要將 .docx 文件解壓，再從 word/media 目錄內取得圖片，然後將解壓後的文件刪除

代碼實現

 
                   import  
                   os 
                  
                   import  
                   shutil 
                  
                   import  
                   zipfile 
                  
                   def get_pictures(word_path, result_path):
                       """Extract every embedded picture from a .docx file.

                       A .docx file is a ZIP archive whose embedded images are stored under
                       ``word/media/``. Each image is read straight out of the archive and
                       written to ``result_path`` as ``<document name>_<image name>``.

                       :param word_path: path to the .docx file
                       :param result_path: output directory; it is created on demand, so it
                           does not have to exist beforehand
                       :return: the string ``'no pictures found'`` when the document holds no
                           embedded images, otherwise a generator yielding the path of each
                           picture written by this call
                       """
                       # os.path.basename understands the platform's separators (both '\\'
                       # and '/' on Windows), unlike splitting on one hard-coded character.
                       word_name = os.path.splitext(os.path.basename(word_path))[0]
                       media_prefix = 'word/media/'  # ZIP member names always use '/'

                       saved = []
                       # A .docx can be opened as a ZIP directly. Reading members in place
                       # avoids copying the document and avoids a scratch directory beside
                       # the source file (removing such a directory could delete user data).
                       with zipfile.ZipFile(word_path, 'r') as archive:
                           members = [
                               name for name in archive.namelist()
                               if name.startswith(media_prefix) and not name.endswith('/')
                           ]
                           if not members:
                               return 'no pictures found'

                           os.makedirs(result_path, exist_ok=True)
                           for member in members:
                               # Keep only the last path component so a member can never be
                               # written outside result_path.
                               picture = member.rsplit('/', 1)[-1]
                               target = os.path.join(result_path, f'{word_name}_{picture}')
                               with archive.open(member) as src, open(target, 'wb') as dst:
                                   shutil.copyfileobj(src, dst)
                               saved.append(target)

                       # Only the files written by this call are reported, not whatever else
                       # already happened to be in result_path.
                       return (path for path in saved)

word_path 可以使用各種寫法的路徑（原始字串、正斜線或跳脫反斜線），例如：

 
                   # The same Windows path written three ways; p and p2 are the same string.
                   p = r"C:\Users\Desktop\test\小說.docx"  # raw string: backslashes are literal
                   p1 = "C:/Users/Desktop/test/小說.docx"  # forward slashes
                   p2 = "C:\\Users\\Desktop\\test\\小說.docx"  # escaped backslashes

利用第三方庫 python-docx（安裝：pip install python-docx，使用時 import docx）實現圖片提取（推薦）

 
                   import  
                   docx 
                  
                   import  
                   os, re 
                  
                   def get_pictures(word_path, result_path):
                       """Save every image embedded in a Word document, using python-docx.

                       Each embedded image related to the main document part is written to
                       ``result_path`` as ``<document name>_<image name>``.

                       :param word_path: path to the .docx file
                       :param result_path: output directory; created once the first image is
                           found, so it does not have to exist beforehand
                       :return: None
                       """
                       doc = docx.Document(word_path)
                       # os.path.basename understands the platform's separators (both '\\'
                       # and '/' on Windows), unlike splitting on one hard-coded character.
                       word_name = os.path.splitext(os.path.basename(word_path))[0]

                       # `rels` is the public accessor for the part's relationships.
                       for rel in doc.part.rels.values():
                           # Linked (external) images have no embedded part to read from.
                           if rel.is_external or "image" not in rel.target_ref:
                               continue
                           os.makedirs(result_path, exist_ok=True)
                           # target_ref is a '/'-separated reference such as
                           # "media/image1.png"; keep only its last component.
                           img_name = rel.target_ref.rsplit("/", 1)[-1]
                           target = os.path.join(result_path, f'{word_name}_{img_name}')
                           with open(target, "wb") as f:
                               f.write(rel.target_part.blob)

免責聲明！

本站轉載的文章僅供個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯繫本站郵箱 yoyou2525@163.com 刪除。

猜您在找 python如何提取word內的圖片 python如何提取word內的圖片 Python從圖片提取文字 Java 提取Word中的文本和圖片 C# 提取Word文檔中的圖片 python 圖片提取文字 Python提取圖片的ROI python提取視頻中的圖片 c# 提取word文件中的圖片問題圖片中的公式轉為word格式，從圖片提取word公式、latex公式