使用python讀取多重文件夾下的word(doc、docx)文件,並處理存儲到excel(xls、xlsx)文件


#-*- coding:gbk -*-
import os
import docx
from win32com import client as wc
import xlwt
import xlsxwriter

# 獲取filepath文件夾下的所有的文件
def getfilelist(filepath):
    """Recursively collect the paths of all files below ``filepath``.

    Args:
        filepath: Directory to scan.

    Returns:
        A flat list of file paths, each built by joining ``filepath`` with
        the names found on the way down. Directories themselves are not
        included. The order follows ``os.listdir`` and is therefore
        unspecified.

    Raises:
        OSError: If ``filepath`` or one of its sub-directories cannot be
            listed.
    """
    files = []
    for name in os.listdir(filepath):
        # Let os.path.join insert the platform separator; a hard-coded "\\"
        # only yields valid paths on Windows and breaks the isdir() check
        # (and with it the recursion) everywhere else.
        child = os.path.join(filepath, name)
        if os.path.isdir(child):
            files.extend(getfilelist(child))
        else:
            files.append(child)
    return files

# 獲取word文件文本
def getDocx(fileName):
    """Return the text extracted from the Word document at ``fileName``.

    The file is opened with ``docx.opendocx`` and the value produced by
    ``docx.getdocumenttext`` is returned unchanged.
    NOTE(review): the export loop in this file iterates over the result and
    concatenates its items, so it is presumably a sequence of paragraph
    strings - confirm against the installed ``docx`` package.
    """
    return docx.getdocumenttext(docx.opendocx(fileName))

# 將doc轉換為docx
def doc2Docx(fileName):
    """Convert a ``.doc`` file to ``.docx`` through Word and delete the original.

    The converted document is saved as ``fileName + "x"``. The original file
    is removed only after the document has been saved and closed and Word has
    been shut down, so a failed conversion never loses the source file.

    Args:
        fileName: Path of the ``.doc`` file to convert.
            NOTE(review): Word may resolve relative paths against its own
            working directory - passing an absolute path is the safe choice.

    Raises:
        Any error raised by the Word COM calls is propagated after Word has
        been closed; ``OSError`` if the original file cannot be removed.
    """
    word = wc.Dispatch("Word.Application")
    try:
        doc = word.Documents.Open(fileName)
        try:
            # The second argument is the FileFormat; 12 is
            # wdFormatXMLDocument in Word's WdSaveFormat enumeration (.docx).
            doc.SaveAs(fileName + "x", 12, False, "", True, "", False, False, False, False)
        finally:
            # Always release the document, even when saving failed.
            doc.Close()
    finally:
        # Always shut Word down so no background process is left running.
        word.Quit()
    # Reached only when the conversion succeeded.
    os.remove(fileName)

# Root folder that is scanned recursively for Word documents.
filepath = "C:\\xxx\\xx\\xx\\xx\\數據集"
filelist = getfilelist(filepath)
## If the files in the folder are all .doc, they first have to be converted
## to .docx with doc2Docx:
##for i in range(len(filelist)):
##    doc2Docx(filelist[i])

# Keep only the .docx documents. The extension is compared with its leading
# dot and case-insensitively, so "REPORT.DOCX" is accepted and a name that
# merely ends in the letters "docx" is not.
# NOTE: the name `list` shadows the builtin; it is kept because the export
# loop further down in this file reads it.
list = [path for path in filelist if path.lower().endswith(".docx")]
     
# 使用xlwt寫入到excel,當存在大文本的時候會出現錯誤:Exception: String longer than 32767 characters
##for i in range(len(list)):
##    fileName = list[i]
##    doc = get_docx(fileName)
##    filePaths = fileName.split("\\")
##    string = ""
##    for j in range(len(doc)):
##        string += doc[j] + "\n"
##    if (len(string) > 10000):
##        string = string[:10000]
##    filePaths.append(string)
##    for j in range(20, -1, -1):
##        if j < len(filePaths):
##            worksheet.write(i, j, label = filePaths[j])
##workbook.save('Excel_Workbook.xls')

# 使用xlsxwriter處理超過32767個字符的word文本
# Write one worksheet row per document: the components of the file path go
# into the leading columns and the document text into the column after them.
# NOTE(review): Excel itself limits a cell to 32,767 characters - verify how
# the installed xlsxwriter version treats longer strings.
workbook = xlsxwriter.Workbook(u'數據.xlsx')
worksheet = workbook.add_worksheet(u"數據")
# `list` is the module-level list of .docx paths built earlier in this file
# (it shadows the builtin of the same name).
for row, fileName in enumerate(list):
    # The reader defined in this file is getDocx; calling get_docx raised a
    # NameError because no such function exists.
    doc = getDocx(fileName)
    # One cell per path component. "/" is normalised first so that paths
    # using either separator are split the same way.
    filePaths = fileName.replace("/", "\\").split("\\")
    # Every item of the document text is followed by a newline.
    filePaths.append("".join(paragraph + "\n" for paragraph in doc))
    # Write every cell; a fixed cap of 21 columns would silently drop the
    # document text for paths with more than 20 components.
    for col, value in enumerate(filePaths):
        worksheet.write(row, col, value)
workbook.close()

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯繫本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM