这段代码使用 Python 对 Word 文档进行处理,提取其中的信息并存储到 Excel 表格中。具体包括以下几个步骤:

  1. 创建一个新的 Excel 工作簿,用于存储提取出的信息。
  2. 提取文本中的标题,并将其写入 Excel 表格中。
  3. 提取文本中所有段落的第一句话,并将其写入 Excel 表格中。
  4. 提取文本中所有加粗汉语,并将其写入 Excel 表格中。
  5. 识别汉语词性,并将其写入 Excel 表格中。
  6. 根据指定关键词,提取包含关键词的句子,并将其写入 Excel 表格中。
  7. 调整 Excel 表格中列的宽度。
  8. 保存 Excel 表格。
  9. 关闭 Word。

其中用到了 openpyxl、python-docx(导入名为 docx)、jieba 和 pywin32(导入名为 win32com)等第三方 Python 库,以及标准库 os 和 re。

import os
import openpyxl
import docx
import jieba.posseg as pseg
import win32com.client as win32
import re

# Keywords searched for when the caller does not supply its own.
_DEFAULT_KEYWORDS = ('随着', '加快', '加强')

# Punctuation (and line breaks) removed from a paragraph before tagging.
# NOTE(review): ! ? , ; : ( ) ~ appear here in their ASCII forms only --
# confirm whether the full-width variants should be stripped as well.
_PUNCTUATION_RE = re.compile(r'[。\n!?,、“”‘’;:()《》【】~@#¥%…&—]')

# A token is kept when its part-of-speech tag contains any of these letters.
_KEPT_FLAG_LETTERS = 'xnva'


def _is_docx(filename):
    """Return True for a .docx file name that is not a Word lock file."""
    # python-docx only reads the .docx package format, so legacy .doc files
    # are excluded; "~$name.docx" files are the lock files Word creates next
    # to an open document and are not readable packages either.
    return filename.lower().endswith('.docx') and not filename.startswith('~$')


def _fit_column_widths(wb):
    """Set every column's width to its longest cell text plus two."""
    for sheet in wb.worksheets:
        for column_cells in sheet.columns:
            longest = max(
                (len(str(cell.value))
                 for cell in column_cells
                 if cell.value is not None),
                default=0,
            )
            letter = column_cells[0].column_letter
            sheet.column_dimensions[letter].width = longest + 2


def extract_info_from_word_files(folder_path, keywords=None):
    """Extract text features from the .docx files in a folder into a workbook.

    Every ``.docx`` file directly inside ``folder_path`` is read, in sorted
    file-name order, and the results are saved as ``result.xlsx`` in that
    same folder. The workbook has five sheets, each with a header row, and
    every data row starts with the source file name:

    * ``Title``: the text of the document's first paragraph (empty when the
      document has no paragraphs).
    * ``First Sentence``: for each non-blank paragraph, its 1-based paragraph
      number and the text before the first ``。``.
    * ``Bold``: for each paragraph, the text of its bold runs joined with
      single spaces (line breaks inside a run removed).
    * ``Pseg``: for each non-blank paragraph, every jieba token whose
      part-of-speech tag contains ``x``, ``n``, ``v`` or ``a``, together
      with the paragraph number and the tag.
    * ``Keyword Sentence``: one row per (keyword, sentence) pair where the
      keyword occurs in the sentence; sentences are split on ``。``.

    Args:
        folder_path: Directory containing the Word documents; also the
            directory that receives ``result.xlsx``.
        keywords: Optional iterable of keyword strings to search for.
            Defaults to ``随着``, ``加快`` and ``加强``.

    Returns:
        None. The result is the workbook written to disk.
    """
    # Materialise once so a one-shot iterable can be reused per sentence.
    keywords = tuple(_DEFAULT_KEYWORDS if keywords is None else keywords)

    wb = openpyxl.Workbook()

    ws_title = wb.active
    ws_title.title = 'Title'
    ws_title.append(['文件名', '标题'])

    ws_first_sentence = wb.create_sheet('First Sentence')
    ws_first_sentence.append(['文件名', '段落号', '第一句话'])

    ws_bold = wb.create_sheet('Bold')
    ws_bold.append(['文件名', '加粗汉语'])

    ws_pseg = wb.create_sheet('Pseg')
    ws_pseg.append(['文件名', '段落号', '汉语词组', '词性'])

    ws_keyword_sentence = wb.create_sheet('Keyword Sentence')
    ws_keyword_sentence.append(['文件名', '关键词', '句子'])

    # Sorted so the row order does not depend on directory listing order.
    for filename in sorted(os.listdir(folder_path)):
        if not _is_docx(filename):
            continue

        doc = docx.Document(os.path.join(folder_path, filename))
        paragraphs = doc.paragraphs

        # The first paragraph is treated as the title.
        title = paragraphs[0].text if paragraphs else ''
        ws_title.append([filename, title])

        for number, para in enumerate(paragraphs, start=1):
            text = para.text

            # Bold text: run.bold is None when inherited, which counts as
            # "not bold" here.
            bold_text = ' '.join(
                run.text.replace('\n', '') for run in para.runs if run.bold
            ).strip()
            if bold_text:
                ws_bold.append([filename, bold_text])

            if not text.strip():
                continue

            # First sentence of the paragraph.
            ws_first_sentence.append([filename, number, text.split('。')[0]])

            # Part-of-speech tagging on the punctuation-free paragraph.
            cleaned = _PUNCTUATION_RE.sub('', text).strip()
            for token, flag in pseg.cut(cleaned):
                if any(letter in flag for letter in _KEPT_FLAG_LETTERS):
                    ws_pseg.append([filename, number, token, flag])

            # Sentences containing a keyword. Matching is done on whole
            # sentences rather than runs, because a run is only a span of
            # uniform formatting and may cut a sentence or keyword in two.
            for sentence in text.split('。'):
                sentence = sentence.strip()
                if not sentence:
                    continue
                for keyword in keywords:
                    if keyword in sentence:
                        ws_keyword_sentence.append(
                            [filename, keyword, sentence]
                        )

    _fit_column_widths(wb)

    wb.save(os.path.join(folder_path, 'result.xlsx'))

# def extract_bold_sentences(paragraphs, word):
#     result = []
#     for paragraph in paragraphs:
#         if len(paragraph.text.strip()) == 0:
#             continue
#         for r in paragraph.runs:
#             if r.font.bold == True:
#                 bold_sentence = r.text.strip()
#                 result.append(bold_sentence)
#     return result

if __name__ == '__main__':
    import sys

    # Take the folder from the first command-line argument when one is
    # given; otherwise fall back to the original hard-coded desktop folder.
    if len(sys.argv) > 1:
        folder_path = sys.argv[1]
    else:
        folder_path = 'C:/Users/法克伯伊阿齐/Desktop/word'
    extract_info_from_word_files(folder_path)
Python Word 文档信息提取工具:从标题、段落到词性分析

原文地址: https://www.cveoy.top/t/topic/mLw9 著作权归作者所有。请勿转载和采集!

免费AI点我,无需注册和登录