解壓.docx文件實現提取圖片
前言
.docx文件其實就是一個壓縮文件,當我們將一個.docx文件直接解壓後,可以看到如下目錄
其中我們要找的圖片就在word/media目錄內,如圖
所以,要提取word內的圖片,就需要先將.docx文件解壓,再從media目錄內取得圖片,然後將解壓後的文件刪除
代碼實現
import os
import shutil
import zipfile


def get_pictures(word_path, result_path):
    """Extract every picture embedded in a .docx file.

    A .docx file is a zip archive whose embedded pictures are stored under
    ``word/media/``.  The archive is read in place: the document is not
    copied and nothing is unpacked next to it, so no temporary directory
    has to be created (or removed) and an unrelated directory that happens
    to share the document's name can never be deleted.

    :param word_path: path of the .docx file; forward slashes and
        backslashes are both treated as separators on every platform
    :param result_path: output directory, created on demand
    :return: the string ``'no pictures found'`` when the document holds no
        pictures, otherwise a generator yielding the path of every picture
        written by this call
    """
    media_prefix = 'word/media/'

    # Normalise the separators before taking the last component so the
    # document name is derived identically for Windows-style and
    # POSIX-style paths, whatever OS the code runs on.
    base_name = word_path.replace('\\', '/').rsplit('/', 1)[-1]
    new_name = os.path.splitext(base_name)[0]

    saved = []
    with zipfile.ZipFile(word_path, 'r') as archive:
        # Keep file entries only; explicit directory entries end with '/'.
        members = [
            name for name in archive.namelist()
            if name.startswith(media_prefix) and not name.endswith('/')
        ]
        if not members:
            return 'no pictures found'

        os.makedirs(result_path, exist_ok=True)
        for member in members:
            # Picture files are named after the source document.
            picture = member.rsplit('/', 1)[-1]
            target = os.path.join(result_path, f'{new_name}_{picture}')
            with archive.open(member) as src, open(target, 'wb') as dst:
                shutil.copyfileobj(src, dst)
            saved.append(target)

    return (path for path in saved)
word_path可以支持所有類型路徑,如
# The same Windows path spelled three ways: raw string, forward slashes,
# and escaped backslashes.
p = r"C:\Users\Desktop\test\小說.docx"
p1 = "C:/Users/Desktop/test/小說.docx"
p2 = "C:\\Users\\Desktop\\test\\小說.docx"
利用三方庫docx實現圖片提取(推薦)
import docx
import os
import re  # kept: other parts of the file may still rely on it


def get_pictures(word_path, result_path):
    """Extract the pictures of a .docx file using the ``docx`` package.

    Every relationship of the main document part whose target reference
    contains ``"image"`` is written to ``result_path`` as
    ``<document name>_<image file name>``.

    :param word_path: path of the .docx file; forward slashes and
        backslashes are both treated as separators on every platform
    :param result_path: output directory, created on demand when the first
        picture is found
    :return: None
    """
    doc = docx.Document(word_path)

    # Normalise the separators before taking the last component so the
    # document name is derived identically for Windows-style and
    # POSIX-style paths, whatever OS the code runs on.
    base_name = word_path.replace('\\', '/').rsplit('/', 1)[-1]
    new_name = os.path.splitext(base_name)[0]

    for rel in doc.part.rels.values():
        # External relationships (e.g. a hyperlink whose URL contains
        # "image") have no target part to read a blob from.
        if rel.is_external or "image" not in rel.target_ref:
            continue

        os.makedirs(result_path, exist_ok=True)
        # Keep only the file name, whatever the depth of the reference.
        img_name = rel.target_ref.rsplit('/', 1)[-1]
        target = os.path.join(result_path, f'{new_name}_{img_name}')
        with open(target, "wb") as f:
            f.write(rel.target_part.blob)