主要使用 python-docx 與 pandas。
因為 python-docx 對表格的解析不夠友好且效率低,故需先將表格內容轉換成 pandas 的 DataFrame 再進行比對。
代碼如下
# coding:utf-8
"""Highlight keywords in a .docx file and write the result back.

Paragraph runs that contain ``word`` are split so that only the keyword is
recoloured; table rows whose first cell equals ``table_text`` get every other
cell recoloured.  Tables are converted to a pandas DataFrame first because
that is easier to index than the python-docx table object.
"""
import os
import re

import docx
import pandas as pd
from docx import Document
from docx.document import Document as dc
from docx.oxml.ns import qn  # qualified names, needed for the east-Asian font
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P
from docx.shared import Pt  # font sizes
from docx.shared import RGBColor  # font colours
from docx.table import _Cell, Table
from docx.text.paragraph import Paragraph

FILE_PATH = r"D:\xxxx\xxxx\xxxx\xxxx.docx"

obj = docx.Document(FILE_PATH)

# Keyword to highlight inside paragraphs.
word = '段落關鍵字'
# Keyword compared against the first column of every table row.
table_text = '表格關鍵字'

# Font applied to every run this script rewrites.
FONT_NAME = u'楷體'
# Colour used to mark matched text.
HIGHLIGHT_COLOR = RGBColor(255, 0, 0)


def iter_block_items(parent):
    """Yield the paragraphs and tables of *parent* in document order.

    :param parent: a python-docx document or a table cell
    :raises ValueError: if *parent* is neither of those
    """
    if isinstance(parent, dc):
        parent_elm = parent.element.body
    elif isinstance(parent, _Cell):
        parent_elm = parent._tc
    else:
        raise ValueError("[TypeError] Document in insuitable type.")

    for child in parent_elm.iterchildren():
        if isinstance(child, CT_P):
            yield Paragraph(child, parent)
        elif isinstance(child, CT_Tbl):
            yield Table(child, parent)


def table2list(table):
    """Return the cell texts of *table* as a list of rows (lists of str)."""
    return [[cell.text for cell in row.cells] for row in table.rows]


def set_run(run, font_size, bold, color, name):
    """Apply font attributes to a run.

    :param run: the run to format
    :param font_size: font size
    :param bold: whether the run is bold
    :param color: font colour
    :param name: font name
    :return: None
    """
    run.font.size = font_size
    run.bold = bold
    run.font.color.rgb = color
    run.font.name = name
    # Chinese fonts only take effect when the east-Asian font is set as well.
    s = run._element
    s.rPr.rFonts.set(qn('w:eastAsia'), name)


def _insert_run_after(paragraph, anchor, text, font_size, bold, color):
    """Create a formatted run and place it right behind *anchor*.

    ``Paragraph.add_run`` always appends at the end of the paragraph, so the
    new run element is moved behind *anchor* to keep the text in order.

    :param paragraph: paragraph that owns *anchor*
    :param anchor: run XML element the new run must follow
    :param text: text of the new run
    :return: the XML element of the new run (the next anchor)
    """
    new_run = paragraph.add_run(text)
    set_run(new_run, font_size, bold, color, FONT_NAME)
    anchor.addnext(new_run._element)
    return new_run._element


def _highlight_paragraph(paragraph, keyword, *, verbose=False):
    """Recolour every occurrence of *keyword* inside the runs of *paragraph*.

    Matching is done per run, so a keyword that is split across two runs is
    not detected.

    :param paragraph: paragraph to process (modified in place)
    :param keyword: text to highlight
    :param verbose: print the text and style name of each matching run
    """
    # ``paragraph.runs`` builds a fresh list, so runs added below are not
    # visited again by this loop.
    for run in paragraph.runs:
        if keyword not in run.text:
            continue
        if verbose:
            print(run.text)
            print(run.style.name)

        font_size = run.font.size
        bold = run.bold
        color = run.font.color.rgb

        # e.g. 'a<kw>b' -> ['a', 'b']; the keyword goes between the pieces.
        pieces = run.text.split(keyword)
        # The original run is emptied and its text rebuilt from new runs.
        run.text = ''
        anchor = run._element
        for piece in pieces[:-1]:
            anchor = _insert_run_after(
                paragraph, anchor, piece, font_size, bold, color)
            anchor = _insert_run_after(
                paragraph, anchor, keyword, font_size, bold, HIGHLIGHT_COLOR)
        _insert_run_after(
            paragraph, anchor, pieces[-1], font_size, bold, color)


def _highlight_table_rows(table, first_col_text):
    """Recolour the rows of *table* whose first cell equals *first_col_text*.

    The header row (row 0) and the first column are never recoloured.  Only
    the first paragraph of each cell is processed, and the text of every
    rewritten run is stripped of surrounding whitespace.

    :param table: table to process (modified in place)
    :param first_col_text: value the first cell of a row must equal
    """
    frame = pd.DataFrame(table2list(table))
    row_count, col_count = frame.shape
    for row_idx in range(1, row_count):
        if frame.iloc[row_idx, 0] != first_col_text:
            continue
        for col_idx in range(1, col_count):
            paragraph = table.cell(row_idx, col_idx).paragraphs[0]
            for run in paragraph.runs:
                font_size = run.font.size
                bold = run.bold
                text = run.text.strip()
                run.text = ''
                _insert_run_after(
                    paragraph, run._element, text,
                    font_size, bold, HIGHLIGHT_COLOR)


def paragraphs_utils(obj):
    """Highlight ``word`` in all body paragraphs of *obj* and save a copy."""
    for p in obj.paragraphs:
        _highlight_paragraph(p, word)
    obj.save('標注后的文檔.docx')


def table_utils(obj):
    """Highlight rows matching ``table_text`` in all tables of *obj* and save."""
    for t in obj.tables:
        _highlight_table_rows(t, table_text)
    obj.save('標注后的表格.docx')


# Walk paragraphs and tables in document order and mark both in one pass.
for block in iter_block_items(obj):
    if isinstance(block, Paragraph):
        _highlight_paragraph(block, word, verbose=True)
    else:
        _highlight_table_rows(block, table_text)
obj.save('段落與表格標注后的文檔.docx')
匹配關鍵字回寫docx替換顏色