# LDA topic-count selection with gensim: plots perplexity and u_mass coherence against the number of topics.
import gensim
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from gensim.corpora import Dictionary
from gensim.models import CoherenceModel, LdaModel
# Load the corpus; each spreadsheet row is treated as one document.
data = pd.read_excel('data_perplexity.xlsx')

# Tokenise row by row. DataFrame.apply() without axis=1 passes whole columns
# to the callback, so str(x).split() would tokenise the printed repr of each
# column (index labels, "Name:", "dtype:", ...) instead of the cell text.
# Missing cells are skipped so that "nan" never becomes a token.
# NOTE(review): assumes every column holds text belonging to the document;
# restrict this to the text column if the sheet also has metadata columns.
texts = [
    " ".join(str(cell) for cell in row if pd.notna(cell)).split()
    for row in data.itertuples(index=False, name=None)
]
# Build the vocabulary (token -> integer id) from the tokenised documents.
dictionary = Dictionary(texts)

# Convert every document to its bag-of-words form, i.e. a list of
# (token_id, token_count) pairs, which is the input format LdaModel expects.
corpus = [dictionary.doc2bow(text) for text in texts]
# Largest number of topics to evaluate; candidate models use 1..max_topics topics.
max_topics = 16
# Train one LDA model per candidate topic count and record its perplexity.
# NOTE: the score is computed on the same corpus the model was trained on,
# so it is a training-set perplexity, not a held-out estimate.
perplexity = []
for num_topics in range(1, max_topics + 1):
    lda_model = LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=num_topics,
        random_state=0,  # fixed seed so reruns reproduce the same curve
    )
    # log_perplexity() returns the per-word likelihood bound (a negative
    # number); 2 ** (-bound) converts it into the positive perplexity value.
    perplexity.append(np.exp2(-lda_model.log_perplexity(corpus)))
# Plot perplexity against the number of topics.
plt.plot(range(1, max_topics + 1), perplexity)
plt.xlabel("Number of Topics")
plt.ylabel("Perplexity")
plt.title("Number of Topics vs Perplexity")
plt.show()
# Train one LDA model per candidate topic count and record its u_mass
# topic coherence.
coherence = []
for num_topics in range(1, max_topics + 1):
    lda_model = LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=num_topics,
        random_state=0,  # fixed seed so reruns reproduce the same curve
    )
    # u_mass coherence is computed from the bag-of-words corpus itself, so
    # only the corpus and dictionary are passed (no tokenised texts).
    coherence_model_lda = CoherenceModel(
        model=lda_model,
        corpus=corpus,
        dictionary=dictionary,
        coherence='u_mass',
    )
    coherence.append(coherence_model_lda.get_coherence())
# Plot the coherence score against the number of topics.
plt.plot(range(1, max_topics + 1), coherence)
plt.xlabel("Number of Topics")
plt.ylabel("Coherence Score")
plt.title("Number of Topics vs Coherence Score")
plt.show()
# Original source: https://www.cveoy.top/t/topic/bxpP -- copyright belongs to the author; please do not repost or scrape.