用python實現一個文檔小工具(支持文檔關鍵字篩選)


功能:根據關鍵詞批量從doc、docx、pdf文件中篩選出包含所輸入關鍵詞的文件

那麼開始上代碼,不是專業 Python 程序猿,代碼寫得不好勿噴,哈哈

from PyQt5.QtWidgets import *
from PyQt5.QtGui import *
from PyQt5.QtCore import *
import sys, os
import docx
from docx import Document
import os
import shutil
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams
from pdfminer.pdfparser import PDFParser, PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice

class Window(QDialog):
    """Dialog that copies Word/PDF files containing a keyword to an output folder.

    The user picks a source folder, an output folder and a keyword. Every
    ``.doc``/``.docx``/``.pdf`` file found under the source folder whose text
    contains the keyword is copied into the output folder.
    """

    def __init__(self, parent=None):
        """Create the dialog, build its widgets and set title and size."""
        super(Window, self).__init__(parent)
        self.path = ''
        self.initUI()
        self.setWindowTitle("文件小助手")
        self.resize(240, 200)

    def initUI(self):
        """Build the form: two folder pickers, the keyword field and the start button."""
        grid = QGridLayout()

        # Row 0: source folder.
        grid.addWidget(QLabel("源路徑:"), 0, 0)
        self.pathLineEdit = QLineEdit()
        self.pathLineEdit.setFixedWidth(200)
        self.pathLineEdit.setText(self.path)
        grid.addWidget(self.pathLineEdit, 0, 1)
        button = QPushButton("選擇文件夾")
        grid.addWidget(button, 0, 3)
        button.clicked.connect(self.msg)

        # Row 1: output folder.
        grid.addWidget(QLabel("輸出路徑:"), 1, 0)
        self.pathLineEdit1 = QLineEdit()
        self.pathLineEdit1.setFixedWidth(200)
        self.pathLineEdit1.setText(self.path)
        grid.addWidget(self.pathLineEdit1, 1, 1)
        button = QPushButton("選擇文件夾")
        grid.addWidget(button, 1, 3)
        button.clicked.connect(self.msg1)

        # Row 2: keyword. Geometry is managed by the grid layout, so no
        # explicit move()/resize() is needed on the widget.
        grid.addWidget(QLabel("關鍵字:"), 2, 0)
        self.textbox = QLineEdit(self)
        grid.addWidget(self.textbox, 2, 1)

        # Row 3: start button. The field values are read when the button is
        # clicked, not here, so the widgets themselves are handed over.
        self.button1 = QPushButton('點我開始干活兒', self)
        grid.addWidget(self.button1, 3, 1)
        self.setLayout(grid)
        self.button1.clicked.connect(
            lambda: self.working(self.pathLineEdit, self.pathLineEdit1, self.textbox))

    def msg(self):
        """Let the user pick the source folder and show it in the source field."""
        self._chooseFolder(self.pathLineEdit)

    def msg1(self):
        """Let the user pick the output folder and show it in the output field."""
        self._chooseFolder(self.pathLineEdit1)

    def _chooseFolder(self, lineEdit):
        """Open a folder chooser starting at ``./`` and store the result in *lineEdit*."""
        folder = QFileDialog.getExistingDirectory(self, "選取文件夾", "./")
        print(folder)
        # An empty string means the dialog was cancelled: keep the current value
        # instead of wiping what the user had already entered.
        if folder:
            lineEdit.setText(folder)

    def readDoc(self, root, path, target, key):
        """Copy the Word file at *path* into *target* if its text contains *key*.

        Paragraph text is searched first, then the text of every table cell.
        The file is copied at most once. Files that python-docx cannot open
        are skipped silently.

        :param root: directory containing the file (unused; kept so existing
            callers keep working).
        :param path: full path of the Word file to inspect.
        :param target: directory that receives matching files.
        :param key: keyword to look for.
        """
        # The file is opened under its own name. Renaming a .doc to .docx does
        # not convert the format, and the source folder must not be modified.
        try:
            document = Document(path)
        except Exception:
            # Best effort: an unreadable document must not abort the batch.
            return

        if any(key in paragraph.text for paragraph in document.paragraphs):
            self.copyFile(target, path)
            return

        for table in document.tables:
            for row in table.rows:
                for cell in row.cells:
                    if key in cell.text:
                        self.copyFile(target, path)
                        # Stop at the first hit so the file is copied only once.
                        return

    def readPdf(self, root, path, target, key):
        """Copy the PDF at *path* into *target* if any text element contains *key*.

        The file handle is closed before copying, and the file is copied at
        most once. A PDF that cannot be parsed is reported on stdout and
        skipped, so one broken file does not abort the batch.

        :param root: directory containing the file (unused; kept so existing
            callers keep working).
        :param path: full path of the PDF file to inspect.
        :param target: directory that receives matching files.
        :param key: keyword to look for.
        """
        try:
            with open(path, "rb") as fp:
                matched = self._pdfContains(fp, key)
        except Exception as err:
            print('skipping unreadable pdf:', path, err)
            return

        if matched:
            self.copyFile(target, path)

    def _pdfContains(self, fp, key):
        """Return True if *key* occurs in a text element of the PDF open at *fp*."""
        # Wire the parser and the document object together.
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)

        # The document is opened with an empty password.
        doc.initialize("")

        resource = PDFResourceManager()
        device = PDFPageAggregator(resource, laparams=LAParams())
        interpreter = PDFPageInterpreter(resource, device)

        for page in doc.get_pages():
            interpreter.process_page(page)
            # The aggregator holds the layout of the page just processed.
            for item in device.get_result():
                # Only layout items exposing get_text() carry text.
                if hasattr(item, "get_text") and key in item.get_text():
                    return True
        return False

    def copyFile(self, path, oldname):
        """Copy the file *oldname* into the directory *path*, keeping its base name.

        The directory (including missing parents) is created when needed.

        :param path: destination directory.
        :param oldname: full path of the file to copy.
        """
        os.makedirs(path, exist_ok=True)
        destination = os.path.join(path, os.path.basename(oldname))
        # When the output folder is the folder the file already lives in,
        # copying it onto itself would raise shutil.SameFileError.
        if os.path.abspath(destination) == os.path.abspath(oldname):
            return
        shutil.copyfile(oldname, destination)

    def _notify(self, text):
        """Show *text* in a modal message box with a single OK button."""
        QMessageBox.information(self, "Message", text,
                                QMessageBox.Ok, QMessageBox.Ok)

    @pyqtSlot()
    def working(self, pathLineEdit1, pathLineEdit2, textbox):
        """Validate the form, scan the source folder and report the outcome.

        :param pathLineEdit1: line edit holding the source folder.
        :param pathLineEdit2: line edit holding the output folder.
        :param textbox: line edit holding the keyword.
        """
        sourcedir = pathLineEdit1.text()
        targetdir = pathLineEdit2.text()
        key = textbox.text()

        # Report the first empty field and stop.
        required = (
            (sourcedir, '源路徑不能為空'),
            (targetdir, '輸出路徑不能為空'),
            (key, '關鍵字不能為空'),
        )
        for value, warning in required:
            if not value.strip():
                self._notify(warning)
                return

        found = False
        for root, dirs, files in os.walk(sourcedir):
            for file in files:
                fullpath = os.path.join(root, file)
                # Decide by the real extension, case-insensitively, rather
                # than by a substring of the last characters of the name.
                extension = os.path.splitext(file)[1].lower()
                if extension in ('.doc', '.docx'):
                    self.readDoc(root, fullpath, targetdir, key)
                    found = True
                elif extension == '.pdf':
                    self.readPdf(root, fullpath, targetdir, key)
                    found = True

        self._notify('處理好了' if found else '源路徑中沒有word和pdf文件')

if __name__ == '__main__':
    # Script entry point: create the Qt application object, then show the
    # dialog modally.
    application = QApplication(sys.argv)
    window = Window()
    # exec_() blocks until the dialog is closed; its result code is not used.
    window.exec_()

 

工具演示效果圖如下

 

工具下載鏈接:  https://pan.baidu.com/s/1w7CQUAowSgR_d6V2h5OlwA  密碼:kyuy


文末小福利免費視頻資源網站:www.sousuohou.com


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯繫本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM