使用 Gensim LDA 模型分析分词后的 Excel 数据集:主题数与困惑度曲线
import pandas as pd
import numpy as np
import gensim
from gensim import corpora, models
import matplotlib.pyplot as plt
# Load the dataset. Each cell of the 'content' column is split on whitespace
# below, so it is presumably already tokenized text -- confirm against the data.
df = pd.read_excel('dp.xlsx')
# Build the tokenized documents, the vocabulary, and the bag-of-words corpus.
# Missing cells are dropped first: pandas reads empty cells as NaN, and
# str(NaN).split() would otherwise inject a bogus 'nan' token into the
# dictionary and every affected document.
texts = [str(doc).split() for doc in df['content'].dropna()]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
# Train one LDA model per candidate topic count (1..max_topics) and plot the
# topic count against the resulting perplexity.
max_topics = 16
topic_counts = range(1, max_topics + 1)
perplexity_scores = []
for num_topics in topic_counts:
    lda_model = gensim.models.ldamodel.LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=num_topics,
        random_state=42,  # fixed seed for reproducible runs
        update_every=1,
        passes=10,
        alpha='auto',
        per_word_topics=True,
    )
    # log_perplexity() returns the per-word likelihood bound, not the
    # perplexity itself; gensim defines perplexity as 2 ** (-bound).
    per_word_bound = lda_model.log_perplexity(corpus)
    perplexity_scores.append(np.exp2(-per_word_bound))
# NOTE: the scores are computed on the same corpus the models were trained
# on, not on held-out documents.
plt.plot(topic_counts, perplexity_scores)
plt.xlabel('Number of Topics')
plt.ylabel('Perplexity Score')
plt.show()
注意:在运行上述代码绘制主题数与困惑度曲线之前,需要先安装 gensim 库,以及代码中用到的 pandas、openpyxl(pandas 读取 .xlsx 文件时需要)和 matplotlib。可以使用以下命令进行安装:
pip install gensim pandas openpyxl matplotlib
原文地址: https://www.cveoy.top/t/topic/m0XR 著作权归作者所有。请勿转载和采集!