注意:此方法只適用於 docx 文檔,不能直接用於 doc 文檔。對於 doc 文檔,可以先在 Word 中將其另存為 HTML(網頁)文件,然後進入該網頁文件所在的文件夾,你會發現有一個名稱為“文件名.files”的文件夾(如“圖片.files”);進入其中,前面 Word 文檔中的圖片就一一被保存在裡面了。
在解壓的時候遇到了幾個問題:1. 不能解壓;2. 不能刪除文件夾;3. 不能移動文件。這些問題都用 try 語句捕獲異常解決了。
doc 文檔也不是完全不行:可以先使用 VBA 把 doc 文檔自動另存為 docx 格式的文檔,VBA 代碼如下:
Sub doc2docx() 'Convert the selected .doc files to .docx files
    ' Shows a multi-select file picker and, for every chosen file whose
    ' extension is ".doc", opens it in Word and saves a copy next to it
    ' with the extension ".docx". The original .doc file is left in place.
    Dim myDialog As FileDialog
    Dim oFile As Object
    Dim oFilePath As Variant
    Dim newPath As String

    Set myDialog = Application.FileDialog(msoFileDialogFilePicker)
    With myDialog
        .Filters.Clear 'Remove every existing entry from the file filter list
        .Filters.Add "所有 WORD2007 文件", "*.doc", 1 'Offer only *.doc files
        .AllowMultiSelect = True 'Allow several files to be picked at once
        If .Show = -1 Then 'The user confirmed the dialog
            For Each oFilePath In .SelectedItems 'Loop over every selected file
                ' Only touch real ".doc" files. Checking the extension keeps an
                ' already converted ".docx" selection from becoming ".docxx".
                If LCase(Right(oFilePath, 4)) = ".doc" Then
                    ' Swap the extension only. Replace(path, "doc", "docx")
                    ' would also rewrite "doc" inside folder or file names.
                    newPath = Left(oFilePath, Len(oFilePath) - 4) & ".docx"
                    Set oFile = Documents.Open(oFilePath)
                    ' wdFormatDocumentDefault has the value 16 used originally.
                    oFile.SaveAs FileName:=newPath, FileFormat:=wdFormatDocumentDefault
                    oFile.Close
                End If
            Next
        End If
    End With
End Sub
然後再使用下面的 Python 代碼提取 docx 格式文檔裡面的圖片:
#revised by Stephen Shen 2020-3-8 19:51:46
#to extract pics from *.docx files
import zipfile
import os,shutil
import datetime
# import send2trash
# from tempfile import TemporaryDirectory
def renameFile(dstpath):
    """Return *dstpath* with a minute-resolution timestamp before the extension.

    Only the path string is computed; nothing on disk is renamed or created.
    The original note on this helper reads "same file name but different
    size", so it is presumably meant for destination-name clashes -- confirm
    against the callers.

    Args:
        dstpath: path whose base name should be made distinct.

    Returns:
        The same directory and extension, with ``-YYYYMMDDHHMM`` appended to
        the file stem.
    """
    fdirname, fbasename = os.path.split(dstpath)
    fname, fext = os.path.splitext(fbasename)
    # Zero-padded fields keep the stamp unambiguous: concatenating the raw
    # integers made e.g. 1 Nov and 11 Jan both render as "111".
    strtime = datetime.datetime.now().strftime('%Y%m%d%H%M')
    return os.path.join(fdirname, fname + '-' + strtime + fext)
def extractPics(doc_inpath,outDir,zipDir):
    """Copy the images embedded in one .docx file into a folder of their own.

    The document is opened as a zip archive, the members stored under
    ``word/media/`` are unpacked into *zipDir*, and each unpacked file is
    moved into a sub-folder of *outDir* named after the document. Failures
    are reported with ``print`` and never raised, so one bad document does
    not stop a batch run.

    Args:
        doc_inpath: path of the candidate file. Files whose extension is not
            ``.docx`` (compared case-insensitively) are ignored silently.
        outDir: parent folder for the per-document image folders.
        zipDir: scratch folder used while unpacking. WARNING: every file and
            folder inside it is deleted once the document has been handled.

    Returns:
        None.
    """
    fname, fext = os.path.splitext(os.path.basename(doc_inpath))
    if fext.lower() != '.docx':
        return
    print(doc_inpath+' is extracting...')

    out_dir = _prepareOutDir(outDir, fname.strip())
    if out_dir is None:
        return

    try:
        try:
            with zipfile.ZipFile(doc_inpath, 'r') as archive:
                media = [member for member in archive.namelist()
                         if member.startswith('word/media/')]
                # Unpack into zipDir explicitly so the result does not depend
                # on the caller's current working directory.
                archive.extractall(zipDir, members=media)
        except Exception as err:
            # Deliberately broad: a damaged archive can surface as BadZipFile,
            # OSError, EOFError or a decompression error, and none of them
            # should abort the remaining documents.
            print(doc_inpath+' cannot be extracted: '+str(err))
        else:
            picsDir = os.path.join(zipDir, 'word', 'media')
            if os.path.isdir(picsDir):
                for pic in os.listdir(picsDir):
                    try:
                        shutil.move(os.path.join(picsDir, pic),
                                    os.path.join(out_dir, pic))
                    except OSError:
                        print(doc_inpath+' is skipped')
    finally:
        # Always empty the scratch folder so leftovers from this document
        # cannot be picked up when the next one is processed.
        _clearScratchDir(zipDir)


def _prepareOutDir(outDir, name):
    """Create and return a fresh folder for *name* under *outDir*, or None."""
    target = os.path.join(outDir, name)
    if os.path.exists(target):
        # Never mix images into an existing folder: derive a new,
        # timestamped name and make sure that one is unused as well.
        stamp = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
        stamped = target + '-' + stamp
        target = stamped
        attempt = 1
        while os.path.exists(target):
            target = stamped + '-' + str(attempt)
            attempt += 1
    if len(os.path.basename(target)) > 200:
        print('dirname is too long :'+target)
        return None
    try:
        os.makedirs(target)
    except OSError as err:
        print('cannot create '+target+': '+str(err))
        return None
    return target


def _clearScratchDir(zipDir):
    """Delete every entry inside *zipDir*, keeping the folder itself."""
    if not os.path.isdir(zipDir):
        return
    for entry in os.listdir(zipDir):
        entry_path = os.path.join(zipDir, entry)
        if os.path.isdir(entry_path) and not os.path.islink(entry_path):
            shutil.rmtree(entry_path, True)
        else:
            try:
                os.remove(entry_path)
            except OSError as err:
                print('cannot remove '+entry_path+': '+str(err))
def splitPath(doc_inpath):
    """Split a path into its directory, file stem and extension.

    Args:
        doc_inpath: path string to take apart.

    Returns:
        A ``(dirPath, fname, fext)`` tuple: the directory part, the base name
        without its extension, and the extension including the leading dot
        (an empty string when the name has no extension).
    """
    dirPath = os.path.dirname(doc_inpath)
    fname, fext = os.path.splitext(os.path.basename(doc_inpath))
    return dirPath, fname, fext
# Batch driver: walk docDir and hand every file found to extractPics.
# outDir receives one image folder per document; zipDir is the scratch
# folder passed to extractPics for unpacking.
outDir=r'k:/fileExtracted/imagesFromDocs'
zipDir=r'k:/fileExtracted/zip'
# docDir=r'K:\imagsDocx'
docDir=r'K:/docs'
# exist_ok avoids the check-then-create race of a separate os.path.exists test.
os.makedirs(outDir, exist_ok=True)
os.makedirs(zipDir, exist_ok=True)
# The working directory is switched once, up front, in case archive
# extraction resolves paths relative to it. docDir is an absolute path, so
# changing directory before the walk does not affect what os.walk visits.
os.chdir(zipDir)
for folder,subFolderName,files in os.walk(docDir):
    for file in files:
        doc_inpath=os.path.join(folder,file)
        extractPics(doc_inpath,outDir,zipDir)
print('done')
