在以下方法中用到的三方庫是:python-docx
from docx import Document
獲取指定段落的文本
def get_paragraph_text(path, n):
    """
    Return the text of a single paragraph of a Word document.

    :param path: path of the Word file to open
    :param n: index of the wanted paragraph, counted from 0
    :return: the text of paragraph ``n``
    :raises IndexError: when the document has ``n`` paragraphs or fewer
    """
    paragraphs = Document(path).paragraphs
    total = len(paragraphs)
    if total > n:
        return paragraphs[n].text
    # Falling through means the requested index is past the last paragraph.
    raise IndexError('paragraph index (%s) out of range, in total %s' % (n, total))
獲取全部段落的文本
def get_paragraphs_text(path):
    """
    Return the text of every paragraph of a Word document.

    :param path: path of the Word file to open
    :return: list with one string per paragraph, in document order, e.g.
        ['Test', 'hello world', ...]
    """
    return [item.text for item in Document(path).paragraphs]
獲取所有表格的文本
def get_all_tables_text(path):
    """
    Return the cell text of every table of a Word document.

    The rows of all tables are collected into one flat list, in document
    order; the result does not record which table a row came from.

    :param path: path of the Word file to open
    :return: list of rows, each row being a list of cell strings, e.g.
        [['age', 'rank'], ['23', '00'], ...]
    """
    collected_rows = []
    for tbl in Document(path).tables:
        # One inner list of cell texts per row of the current table.
        collected_rows.extend(
            [cell.text for cell in tbl_row.cells] for tbl_row in tbl.rows
        )
    return collected_rows
獲取指定表格的文本
def get_table_text(path, n=0):
    """
    Return the cell text of the n-th table of a Word document.

    :param path: path of the Word file to open
    :param n: index of the wanted table, counted from 0
    :return: list of rows, each row being a list of cell strings, e.g.
        [['age', 'rank'], ['23', '00'], ...]
    :raises IndexError: when the document has ``n`` tables or fewer
    """
    tables = Document(path).tables
    table_count = len(tables)
    if table_count > n:
        return [
            [cell.text for cell in tbl_row.cells]
            for tbl_row in tables[n].rows
        ]
    # Falling through means the requested index is past the last table.
    raise IndexError('table index (%s) out of range, in total %s' % (n, table_count))
獲取指定表格內指定單元格文本
def get_cell_text(path, n=0, row=0, col=0):
    """
    Return the text of one cell of one table of a Word document.

    :param path: path of the Word file to open
    :param n: index of the table, counted from 0
    :param row: index of the row inside that table, counted from 0
    :param col: index of the column inside that table, counted from 0
    :return: the text of the selected cell, as a str
    :raises IndexError: when the table index, or the row/column index, is
        past the end of what the document contains
    """
    tables = Document(path).tables
    table_count = len(tables)
    if not table_count > n:
        # Message wording matches get_table_text ("in total"); the original
        # text here was misspelled as "in toatl".
        raise IndexError('table index (%s) out of range, in total %s' % (n, table_count))

    table = tables[n]
    # Both bounds are checked against the table's own row/column counts
    # before the cell is looked up.
    if len(table.rows) > row and len(table.columns) > col:
        return table.rows[row].cells[col].text
    raise IndexError('cell index out of range, %s;%s' % (row, col))
