Below is a Python implementation of keyword extraction based on the TextRank, TF-IDF, LSI, and LDA models. It extracts 10 keywords, computes the similarity between the sentence sets extracted by the different models, and explains the similarity metric:

```python
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from string import punctuation
from collections import defaultdict
from heapq import nlargest
from gensim.models import TfidfModel, LsiModel, LdaModel
from gensim.corpora import Dictionary
from gensim.matutils import corpus2dense
from sklearn.metrics.pairwise import cosine_similarity

# Download required NLTK resources
nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words('english')) | set(punctuation)

# Load the article
article = ('The economic downturn caused by the COVID-19 pandemic has led to an '
           'increase in unemployment rates across the world. In the United States, '
           'the unemployment rate has reached its highest level since the Great '
           'Depression. Many industries, such as travel and hospitality, have been '
           'hit particularly hard by the pandemic. However, some industries, such as '
           'e-commerce and online entertainment, have seen an increase in demand. '
           'It remains to be seen how the pandemic will impact the economy in the long run.')

# Split into sentences
sentences = sent_tokenize(article)

def preprocess(sentences):
    """Tokenize, lowercase, and remove stop words from each sentence."""
    words = [word_tokenize(s.lower()) for s in sentences]
    return [[w for w in sent if w not in stop_words] for sent in words]

# TextRank-style scoring (a frequency-based approximation of TextRank)
def text_rank(sentences, n):
    words = preprocess(sentences)
    # Build a word-frequency table
    freq = defaultdict(int)
    for sent in words:
        for w in sent:
            freq[w] += 1
    # Score each sentence by its average word frequency
    scores = defaultdict(float)
    for i, sent in enumerate(words):
        for w in sent:
            scores[i] += freq[w]
        scores[i] /= max(len(sent), 1)
    # Return the top-n sentences in document order
    top_n = nlargest(n, scores, key=scores.get)
    return [sentences[i] for i in sorted(top_n)]

# TF-IDF scoring
def tf_idf(sentences, n):
    words = preprocess(sentences)
    dictionary = Dictionary(words)
    corpus = [dictionary.doc2bow(sent) for sent in words]
    # Score each sentence by the sum of its TF-IDF weights
    tfidf_model = TfidfModel(corpus)
    tfidf = tfidf_model[corpus]
    scores = defaultdict(float)
    for i, sent in enumerate(tfidf):
        for _, score in sent:
            scores[i] += score
    top_n = nlargest(n, scores, key=scores.get)
    return [sentences[i] for i in sorted(top_n)]

# LSI scoring (fix: the TF-IDF model must be built inside this function,
# since the original referenced an undefined tfidf_model/tfidf)
def lsi(sentences, n):
    words = preprocess(sentences)
    dictionary = Dictionary(words)
    corpus = [dictionary.doc2bow(sent) for sent in words]
    tfidf_model = TfidfModel(corpus)
    tfidf = tfidf_model[corpus]
    lsi_model = LsiModel(tfidf, id2word=dictionary, num_topics=n)
    lsi_corpus = lsi_model[tfidf]
    # Sum absolute topic weights, since LSI coordinates can be negative
    scores = defaultdict(float)
    for i, sent in enumerate(lsi_corpus):
        for _, score in sent:
            scores[i] += abs(score)
    top_n = nlargest(n, scores, key=scores.get)
    return [sentences[i] for i in sorted(top_n)]

# LDA scoring
def lda(sentences, n):
    words = preprocess(sentences)
    dictionary = Dictionary(words)
    corpus = [dictionary.doc2bow(sent) for sent in words]
    lda_model = LdaModel(corpus, id2word=dictionary, num_topics=n)
    lda_corpus = lda_model[corpus]
    # Score each sentence by the sum of its topic probabilities
    scores = defaultdict(float)
    for i, sent in enumerate(lda_corpus):
        for _, score in sent:
            scores[i] += score
    top_n = nlargest(n, scores, key=scores.get)
    return [sentences[i] for i in sorted(top_n)]

# Extract the top-n keywords by frequency
def get_keywords(sentences, n):
    words = preprocess(sentences)
    freq = defaultdict(int)
    for sent in words:
        for w in sent:
            freq[w] += 1
    return nlargest(n, freq, key=freq.get)

# Extract keywords and the key sentences found by each model
keywords = get_keywords(sentences, 10)
text_rank_sentences = text_rank(sentences, 3)
tf_idf_sentences = tf_idf(sentences, 3)
lsi_sentences = lsi(sentences, 3)
lda_sentences = lda(sentences, 3)

# Compute pairwise cosine similarity between the models' sentence sets.
# Fix: each result is a list of sentences, so join it into a single string
# before tokenizing, and convert the bag-of-words representations to dense
# vectors of equal length before calling cosine_similarity.
results = [text_rank_sentences, tf_idf_sentences, lsi_sentences, lda_sentences]
similarity = {}
for i, sents1 in enumerate(results):
    for j, sents2 in enumerate(results):
        if i < j:
            words1 = [w for w in word_tokenize(' '.join(sents1).lower())
                      if w not in stop_words]
            words2 = [w for w in word_tokenize(' '.join(sents2).lower())
                      if w not in stop_words]
            # Build a shared dictionary over both token lists
            dictionary = Dictionary([words1, words2])
            bow = [dictionary.doc2bow(words1), dictionary.doc2bow(words2)]
            dense = corpus2dense(bow, num_terms=len(dictionary)).T
            similarity[(i, j)] = cosine_similarity([dense[0]], [dense[1]])[0][0]

# Print results
print('Keywords:', keywords)
print('TextRank Sentences:', text_rank_sentences)
print('TF-IDF Sentences:', tf_idf_sentences)
print('LSI Sentences:', lsi_sentences)
print('LDA Sentences:', lda_sentences)
print('Similarity:', similarity)
```

The similarity metric is cosine similarity. Specifically, for two vectors $\mathbf{a}$ and $\mathbf{b}$, their cosine similarity is:

$$\cos(\mathbf{a}, \mathbf{b}) = \frac{\mathbf{a} \cdot \mathbf{b}}{\|\mathbf{a}\| \, \|\mathbf{b}\|}$$

where $\mathbf{a} \cdot \mathbf{b}$ is the dot product of $\mathbf{a}$ and $\mathbf{b}$, and $\|\mathbf{a}\|$ and $\|\mathbf{b}\|$ are their norms. In this code, each sentence set is represented as a bag-of-words vector (the count of each word), and the cosine similarity between these vectors is computed. Concretely, for two sentences $s_1$ and $s_2$:

$$\cos(s_1, s_2) = \frac{\sum_{w \in s_1} tf(w, s_1) \cdot tf(w, s_2)}{\sqrt{\sum_{w \in s_1} tf(w, s_1)^2} \cdot \sqrt{\sum_{w \in s_2} tf(w, s_2)^2}}$$

where $tf(w, s_1)$ is the number of times word $w$ occurs in sentence $s_1$, and $tf(w, s_2)$ is its count in $s_2$.

Keyword Extraction and Similarity Computation with TextRank, TF-IDF, LSI, and LDA Models

Original source: https://www.cveoy.top/t/topic/nJYH. Copyright belongs to the author. Please do not reproduce or scrape.
