Below is a Python implementation of keyword extraction based on the TextRank, TF-IDF, LSI, and LDA models. It extracts 10 keywords, computes the similarity between the key sentences selected by the different models, and explains the similarity measure:

import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from string import punctuation
from collections import defaultdict
from heapq import nlargest
from gensim.models import TfidfModel
from gensim.corpora import Dictionary
from gensim.models import LsiModel, LdaModel
from sklearn.metrics.pairwise import cosine_similarity

# Download required NLTK data and build the stop-word set
nltk.download('punkt')
nltk.download('stopwords')
stopwords = set(stopwords.words('english') + list(punctuation))

# Load the article
article = """The economic downturn caused by the COVID-19 pandemic has led to an increase in unemployment rates across the world. In the United States, the unemployment rate has reached its highest level since the Great Depression. Many industries, such as travel and hospitality, have been hit particularly hard by the pandemic. However, some industries, such as e-commerce and online entertainment, have seen an increase in demand. It remains to be seen how the pandemic will impact the economy in the long run."""

# Split the article into sentences
sentences = sent_tokenize(article)

# TextRank (simplified): scores each sentence by its average word frequency
# rather than running the full graph-based PageRank iteration
def text_rank(sentences, n):
    # Tokenize
    words = [word_tokenize(sentence.lower()) for sentence in sentences]
    # Remove stop words
    words = [[word for word in sentence if word not in stopwords] for sentence in words]
    # Build the word-frequency table
    freq = defaultdict(int)
    for sentence in words:
        for word in sentence:
            freq[word] += 1
    # Score each sentence by its average word frequency
    scores = defaultdict(float)
    for i, sentence in enumerate(words):
        for word in sentence:
            scores[i] += freq[word]
        if sentence:  # guard against sentences that were all stop words
            scores[i] /= len(sentence)
    # Keep the top n sentences, returned in document order
    top_n = nlargest(n, scores, key=scores.get)
    return [sentences[i] for i in sorted(top_n)]

# TF-IDF algorithm
def tf_idf(sentences, n):
    # Tokenize
    words = [word_tokenize(sentence.lower()) for sentence in sentences]
    # Remove stop words
    words = [[word for word in sentence if word not in stopwords] for sentence in words]
    # Build the dictionary
    dictionary = Dictionary(words)
    # Build the bag-of-words corpus
    corpus = [dictionary.doc2bow(sentence) for sentence in words]
    # Compute TF-IDF weights
    tfidf_model = TfidfModel(corpus)
    tfidf = tfidf_model[corpus]
    # Score each sentence by the sum of its TF-IDF weights
    scores = defaultdict(float)
    for i, sentence in enumerate(tfidf):
        for index, score in sentence:
            scores[i] += score
    # Keep the top n sentences, returned in document order
    top_n = nlargest(n, scores, key=scores.get)
    return [sentences[i] for i in sorted(top_n)]

# LSI algorithm
def lsi(sentences, n):
    # Tokenize
    words = [word_tokenize(sentence.lower()) for sentence in sentences]
    # Remove stop words
    words = [[word for word in sentence if word not in stopwords] for sentence in words]
    # Build the dictionary
    dictionary = Dictionary(words)
    # Build the bag-of-words corpus
    corpus = [dictionary.doc2bow(sentence) for sentence in words]
    # Weight the corpus with TF-IDF, then train the LSI model on it
    tfidf_model = TfidfModel(corpus)
    tfidf = tfidf_model[corpus]
    lsi_model = LsiModel(tfidf, id2word=dictionary, num_topics=n)
    lsi = lsi_model[tfidf]
    # Score each sentence by the magnitude of its topic loadings
    scores = defaultdict(float)
    for i, sentence in enumerate(lsi):
        for index, score in sentence:
            scores[i] += abs(score)  # LSI loadings can be negative
    # Keep the top n sentences, returned in document order
    top_n = nlargest(n, scores, key=scores.get)
    return [sentences[i] for i in sorted(top_n)]

# LDA algorithm
def lda(sentences, n):
    # Tokenize
    words = [word_tokenize(sentence.lower()) for sentence in sentences]
    # Remove stop words
    words = [[word for word in sentence if word not in stopwords] for sentence in words]
    # Build the dictionary
    dictionary = Dictionary(words)
    # Build the bag-of-words corpus
    corpus = [dictionary.doc2bow(sentence) for sentence in words]
    # Train the LDA model
    lda_model = LdaModel(corpus, id2word=dictionary, num_topics=n)
    lda = lda_model[corpus]
    # Score each sentence by the sum of its topic probabilities
    scores = defaultdict(float)
    for i, sentence in enumerate(lda):
        for index, score in sentence:
            scores[i] += score
    # Keep the top n sentences, returned in document order
    top_n = nlargest(n, scores, key=scores.get)
    return [sentences[i] for i in sorted(top_n)]

# Extract keywords by raw word frequency
def get_keywords(sentences, n):
    # Tokenize
    words = [word_tokenize(sentence.lower()) for sentence in sentences]
    # Remove stop words
    words = [[word for word in sentence if word not in stopwords] for sentence in words]
    # Build the word-frequency table
    freq = defaultdict(int)
    for sentence in words:
        for word in sentence:
            freq[word] += 1
    # Keep the n most frequent words as keywords
    top_n = nlargest(n, freq, key=freq.get)
    return top_n

# Extract the keywords
keywords = get_keywords(sentences, 10)

# Key sentences extracted by the (simplified) TextRank algorithm
text_rank_sentences = text_rank(sentences, 3)

# Key sentences extracted by the TF-IDF algorithm
tf_idf_sentences = tf_idf(sentences, 3)

# Key sentences extracted by the LSI algorithm
lsi_sentences = lsi(sentences, 3)

# Key sentences extracted by the LDA algorithm
lda_sentences = lda(sentences, 3)

# Compute pairwise similarity between the sentence sets extracted by each model.
# Each model's output is a list of sentences, so the list is joined into a
# single document before tokenizing.
similarity = {}
model_outputs = [text_rank_sentences, tf_idf_sentences, lsi_sentences, lda_sentences]
for i, sentences1 in enumerate(model_outputs):
    for j, sentences2 in enumerate(model_outputs):
        if i != j and (j, i) not in similarity:
            # Tokenize
            words1 = word_tokenize(' '.join(sentences1).lower())
            words2 = word_tokenize(' '.join(sentences2).lower())
            # Remove stop words
            words1 = [word for word in words1 if word not in stopwords]
            words2 = [word for word in words2 if word not in stopwords]
            # Build a shared dictionary
            dictionary = Dictionary([words1, words2])
            # Convert the sparse bag-of-words counts to dense vectors,
            # since cosine_similarity expects fixed-length vectors
            bow1 = dictionary.doc2bow(words1)
            bow2 = dictionary.doc2bow(words2)
            vec1 = [0.0] * len(dictionary)
            vec2 = [0.0] * len(dictionary)
            for index, count in bow1:
                vec1[index] = count
            for index, count in bow2:
                vec2[index] = count
            # Cosine similarity between the two count vectors
            similarity[(i, j)] = cosine_similarity([vec1], [vec2])[0][0]

# Print the results
print('Keywords:', keywords)
print('TextRank Sentences:', text_rank_sentences)
print('TF-IDF Sentences:', tf_idf_sentences)
print('LSI Sentences:', lsi_sentences)
print('LDA Sentences:', lda_sentences)
print('Similarity:', similarity)

The similarity measure is cosine similarity. Specifically, for two vectors $\mathbf{a}$ and $\mathbf{b}$, their cosine similarity is:

$$\cos(\mathbf{a}, \mathbf{b}) = \frac{\mathbf{a} \cdot \mathbf{b}}{|\mathbf{a}| |\mathbf{b}|}$$

where $\mathbf{a} \cdot \mathbf{b}$ is the dot product of $\mathbf{a}$ and $\mathbf{b}$, and $|\mathbf{a}|$ and $|\mathbf{b}|$ are their norms. In this code, each model's output is represented as a bag-of-words vector (the count of each word), and the cosine similarity is computed between those vectors. Concretely, for two sentence sets $s_1$ and $s_2$:

$$\cos(s_1, s_2) = \frac{\sum_{w \in s_1 \cup s_2} tf(w, s_1) \cdot tf(w, s_2)}{\sqrt{\sum_{w \in s_1} tf(w, s_1)^2} \cdot \sqrt{\sum_{w \in s_2} tf(w, s_2)^2}}$$

where $tf(w, s_1)$ is the number of times word $w$ occurs in $s_1$, and $tf(w, s_2)$ the number of times it occurs in $s_2$.
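The formula above can be checked with a minimal sketch that computes cosine similarity directly from two term-count vectors, without any library; the vectors here are made-up counts for illustration:

```python
import math

def cosine(a, b):
    """Cosine similarity between two equal-length count vectors."""
    dot = sum(x * y for x, y in zip(a, b))       # numerator: dot product
    norm_a = math.sqrt(sum(x * x for x in a))    # |a|
    norm_b = math.sqrt(sum(y * y for y in b))    # |b|
    return dot / (norm_a * norm_b)

# Hypothetical counts over a shared 4-word vocabulary
a = [1, 2, 0, 1]
b = [1, 1, 1, 0]
print(round(cosine(a, b), 4))  # → 0.7071
```

Identical vectors give 1.0, and vectors with no words in common give 0.0, which is what the pairwise loop in the script reports for each model pair.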


Original source: https://www.cveoy.top/t/topic/dGAE. Copyright belongs to the author. Do not repost or scrape!
