Python Word 文档信息提取工具:从标题、段落到词性分析
这段代码使用 Python 对 Word 文档进行处理,提取其中的信息并存储到 Excel 表格中。具体包括以下几个步骤:
- 创建一个新的 Excel 工作簿,用于存储提取出的信息。
- 提取文本中的标题,并将其写入 Excel 表格中。
- 提取文本中所有段落的第一句话,并将其写入 Excel 表格中。
- 提取文本中所有加粗的文字,并将其写入 Excel 表格中。
- 识别汉语词性,并将其写入 Excel 表格中。
- 根据指定关键词,提取包含关键词的句子,并将其写入 Excel 表格中。
- 调整 Excel 表格中列的宽度。
- 保存 Excel 表格。
- 关闭 Word。
其中用到了 openpyxl、python-docx(导入名为 docx)、jieba 和 pywin32(导入名为 win32com)等 Python 库。
import os
import openpyxl
import docx
import jieba.posseg as pseg
import win32com.client as win32
import re
# Keywords searched for when the caller does not supply its own list.
_DEFAULT_KEYWORDS = ('随着', '加快', '加强')

# Punctuation (and line breaks) stripped from a paragraph before segmentation.
_PUNCTUATION_RE = re.compile('[。\n!?,、“”‘’;:()《》【】~@#¥%…&—]')

# One sentence: a run of non-terminator characters plus its closing terminators.
_SENTENCE_RE = re.compile('[^。!?!?\n]+[。!?!?]*')


def _bold_text(paragraph):
    """Return the text of the paragraph's bold runs joined by single spaces."""
    parts = (run.text.replace('\n', '') for run in paragraph.runs if run.bold)
    return ' '.join(parts).strip()


def _keyword_hits(text, keywords):
    """Yield ``(keyword, sentence)`` for each sentence of *text* containing a keyword."""
    for sentence in _SENTENCE_RE.findall(text):
        sentence = sentence.strip()
        if not sentence:
            continue
        for keyword in keywords:
            if keyword in sentence:
                yield keyword, sentence


def _fit_column_widths(sheet):
    """Set every column of *sheet* to the length of its longest value plus 2."""
    for col in sheet.columns:
        longest = max(
            (len(str(cell.value)) for cell in col if cell.value is not None),
            default=0,
        )
        sheet.column_dimensions[col[0].column_letter].width = longest + 2


def extract_info_from_word_files(folder_path, keywords=None):
    """Extract text features from every ``.docx`` in a folder into ``result.xlsx``.

    The workbook is saved as ``result.xlsx`` inside *folder_path* and holds
    five sheets:

    * ``Title`` - file name and the text of the first paragraph.
    * ``First Sentence`` - for each non-blank paragraph, its 1-based index
      and the text before the first ``。``.
    * ``Bold`` - for each paragraph that has bold runs, their combined text.
    * ``Pseg`` - jieba tokens whose part-of-speech flag contains ``x``,
      ``n``, ``v`` or ``a`` (a substring test, so compound flags such as
      ``vn`` match as well).
    * ``Keyword Sentence`` - each sentence that contains one of *keywords*.

    Every data row carries the file name in its first column so rows stay
    attributable after sorting or filtering.

    Args:
        folder_path: Directory that is scanned (non-recursively) for Word
            files and that receives ``result.xlsx``.
        keywords: Iterable of substrings to look for in sentences. Defaults
            to ``('随着', '加快', '加强')``.

    Legacy ``.doc`` files are skipped with a warning because python-docx can
    only read ``.docx``; Word's ``~$`` lock files are ignored.
    """
    import warnings

    if keywords is None:
        keywords = _DEFAULT_KEYWORDS

    wb = openpyxl.Workbook()

    ws_title = wb.active
    ws_title.title = 'Title'
    ws_title.append(['文件名', '标题'])

    ws_first_sentence = wb.create_sheet('First Sentence')
    ws_first_sentence.append(['文件名', '段落号', '第一句话'])

    ws_bold = wb.create_sheet('Bold')
    ws_bold.append(['文件名', '加粗汉语'])

    ws_pseg = wb.create_sheet('Pseg')
    ws_pseg.append(['文件名', '段落号', '汉语词组', '词性'])

    ws_keyword_sentence = wb.create_sheet('Keyword Sentence')
    ws_keyword_sentence.append(['文件名', '关键词', '句子'])

    # No Word COM instance is started: python-docx reads the files directly,
    # so there is no Word process to launch or to leave running.

    # Sorted so the row order in the workbook is reproducible between runs.
    for filename in sorted(os.listdir(folder_path)):
        if filename.startswith('~$'):
            # Lock file Word creates next to an open document; not a document.
            continue
        lowered = filename.lower()
        if lowered.endswith('.doc'):
            warnings.warn(
                'Skipping legacy .doc file (python-docx reads only .docx): '
                + filename
            )
            continue
        if not lowered.endswith('.docx'):
            continue

        doc = docx.Document(os.path.join(folder_path, filename))
        paragraphs = doc.paragraphs

        # Title: text of the first paragraph; blank for an empty document.
        title = paragraphs[0].text if paragraphs else ''
        ws_title.append([filename, title])

        for number, para in enumerate(paragraphs, start=1):
            text = para.text

            if text.strip():
                # First sentence: everything before the first full stop.
                ws_first_sentence.append([filename, number, text.split('。')[0]])

                # Part-of-speech tagging on the punctuation-free paragraph.
                cleaned = _PUNCTUATION_RE.sub('', text).strip()
                for token, flag in pseg.cut(cleaned):
                    if any(tag in flag for tag in 'xnva'):
                        ws_pseg.append([filename, number, token, flag])

            bold = _bold_text(para)
            if bold:
                ws_bold.append([filename, bold])

            # Match on whole sentences of the paragraph text rather than on
            # individual runs: runs are formatting spans and can cut a
            # sentence, or even a keyword, in two.
            for keyword, sentence in _keyword_hits(text, keywords):
                ws_keyword_sentence.append([filename, keyword, sentence])

    for sheet in wb.worksheets:
        _fit_column_widths(sheet)

    wb.save(os.path.join(folder_path, 'result.xlsx'))
# def extract_bold_sentences(paragraphs, word):
# result = []
# for paragraph in paragraphs:
# if len(paragraph.text.strip()) == 0:
# continue
# for r in paragraph.runs:
# if r.font.bold == True:
# bold_sentence = r.text.strip()
# result.append(bold_sentence)
# return result
if __name__ == '__main__':
    import sys

    # Folder to scan: the first command-line argument when one is given,
    # otherwise the default location this script has always used.
    folder_path = sys.argv[1] if len(sys.argv) > 1 else 'C:/Users/法克伯伊阿齐/Desktop/word'
    extract_info_from_word_files(folder_path)
原文地址: https://www.cveoy.top/t/topic/mLw9 著作权归作者所有。请勿转载和采集!