导入必要的库

import math
import re
from collections import Counter

定义一个函数,用于对文本进行预处理

def preprocess_text(text):
    """Normalize raw text into lowercase, space-separated word tokens.

    Punctuation and digit runs are deleted outright (not replaced by a
    space, so "open-source" becomes "opensource"), every run of whitespace
    is collapsed to a single space, and the result is lowercased. A single
    leading or trailing space may remain if the input started or ended
    with whitespace.

    Args:
        text: The raw input string.

    Returns:
        The normalized string.
    """
    # Drop everything that is neither a word character nor whitespace.
    text = re.sub(r'[^\w\s]', '', text)
    # Digits count as word characters, so they need their own pass.
    text = re.sub(r'\d+', '', text)
    # Collapse whitespace runs, including the gaps left by the removals above.
    text = re.sub(r'\s+', ' ', text)
    return text.lower()

定义一个函数,用于计算TF-IDF权重

def compute_tf_idf(text, docs=None):
    """Compute a TF-IDF weight for every distinct word in ``text``.

    Term frequency is the word's count in ``text`` divided by the total
    number of words in ``text``. Inverse document frequency is
    ``log(N / df)``, where ``N`` is the number of documents and ``df`` is
    the number of documents that contain the word as a whole token.

    Args:
        text: The text whose words are weighted. It is normalized with
            ``preprocess_text`` before tokenizing.
        docs: Iterable of document strings used for the IDF statistics.
            When omitted, the module-level ``documents`` list is used.

    Returns:
        A dict mapping each distinct normalized word to its TF-IDF weight.
        Empty if ``text`` contains no words. Words that occur in no
        document get a weight of 0.0.
    """
    if docs is None:
        # Fall back to the module-level corpus so one-argument calls still work.
        docs = documents

    words = preprocess_text(text).split()
    if not words:
        return {}

    total_words = len(words)
    word_counts = Counter(words)

    # Tokenize each document once, with the same normalization as ``text``.
    # Membership is then an exact whole-word match; a substring test against
    # the raw document is case-sensitive and never finds lowercased words
    # such as "python" in "Python is ...".
    doc_vocabularies = [set(preprocess_text(doc).split()) for doc in docs]
    num_docs = len(doc_vocabularies)

    weights = {}
    for word, count in word_counts.items():
        doc_freq = sum(1 for vocab in doc_vocabularies if word in vocab)
        # A word found in no document has no usable IDF; score it 0.0 rather
        # than dividing by zero.
        idf = math.log(num_docs / doc_freq) if doc_freq else 0.0
        weights[word] = (count / total_words) * idf
    return weights

定义一个函数,用于生成文本摘要

def generate_summary(text, num_sentences=3, weights=None):
    """Build an extractive summary from the highest-scoring sentences.

    The raw text is split into sentences first, while its punctuation is
    still present. Each sentence is then scored as the sum of the TF-IDF
    weights of its normalized words. The top ``num_sentences`` sentences
    are returned in their original order, joined by single spaces.

    Args:
        text: The raw text to summarize.
        num_sentences: Maximum number of sentences to keep. Values of zero
            or less produce an empty summary.
        weights: Mapping from normalized word to TF-IDF weight. When
            omitted, the module-level ``tf_idf`` mapping is used.

    Returns:
        The selected sentences, unmodified, joined into one string.
    """
    if weights is None:
        # Fall back to the module-level weights so existing calls still work.
        weights = tf_idf
    if num_sentences <= 0:
        return ''

    # Split on whitespace that follows '.', '?' or '!', but not after
    # patterns such as "e.g." or "Mr." that the two negative lookbehinds
    # exclude.
    boundary = r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=[.?!])\s+'
    sentences = [s for s in re.split(boundary, text.strip()) if s]

    # Score by position rather than by sentence text, so repeated sentences
    # keep separate scores.
    scores = []
    for sentence in sentences:
        words = preprocess_text(sentence).split()
        scores.append(sum(weights.get(word, 0.0) for word in words))

    # The sort is stable, so equally scored sentences keep document order.
    ranked = sorted(range(len(sentences)), key=lambda i: scores[i], reverse=True)
    # Put the chosen sentences back into their original order.
    chosen = sorted(ranked[:num_sentences])
    return ' '.join(sentences[i] for i in chosen)

定义一个包含多篇文本的列表

# Sample corpus: one string per document.
documents = [
    "Python is a popular programming language. It was created in 1991 by Guido van Rossum.",
    "Python is used for web development, data analysis, artificial intelligence, and more.",
    "Python is easy to learn and has a simple syntax, making it a popular choice for beginners.",
    "Python is open-source software, which means it is free to use and distribute.",
    "Python has a large and active community, which provides support and contributes to its development.",
]

计算TF-IDF权重

# Compute TF-IDF weights for all documents joined into one text. The result is
# bound to the module-level name ``tf_idf``, which generate_summary reads.
tf_idf = compute_tf_idf(' '.join(documents))

生成文本摘要

# Summarize the joined corpus and print the result.
summary = generate_summary(' '.join(documents))
print(summary)

随着大数据时代的到来,人们面对越来越多的信息,往往无法获取自己所关心的信息,也无法关注一篇文章的所有内容,而只需要关注文章的核心要义;文本自动摘要技术可以在一定程度上缓解这个问题。方法:采用基于传统机器学习的抽取式文本摘要生成方法;结果:针对任意一条网络文本数据,可获取对应的摘要信息。请用 Python 编写代码生成一个例子。

原文地址: https://www.cveoy.top/t/topic/heRh 著作权归作者所有。请勿转载和采集!

免费AI点我,无需注册和登录