LDA主题模型选择:困惑度和一致性分析
import pandas as pd import numpy as np import matplotlib.pyplot as plt import gensim from gensim.corpora import Dictionary from gensim.models import LdaModel, CoherenceModel
读取数据
# Load the spreadsheet holding the raw documents.
data = pd.read_excel('data_perplexity.xlsx')

# Build one whitespace-split token list per spreadsheet row.
# NOTE(review): assumes each row is one document and that every non-empty
# cell in that row belongs to it -- confirm against the spreadsheet layout.
# Cells are tokenised one by one because str() of a whole Series is its
# printed repr (index labels, dtype footer, "..." truncation), not the text.
texts = [
    [token for cell in row if pd.notna(cell) for token in str(cell).split()]
    for row in data.itertuples(index=False, name=None)
]
# Rows that produced no tokens carry nothing for the topic model to learn from.
texts = [tokens for tokens in texts if tokens]
构建词典
# Build the gensim Dictionary from the tokenised documents.
dictionary = Dictionary(texts)

# Convert every document to its bag-of-words form using that dictionary.
corpus = list(map(dictionary.doc2bow, texts))
定义最大主题数
# Largest topic count to evaluate; the loops below try every value from 1 up to
# and including this number.
max_topics = 16
训练LDA模型并计算困惑度
# Train one LDA model per candidate topic count and record its perplexity on
# the same corpus it was trained on.
perplexity = []
for num_topics in range(1, max_topics + 1):
    lda_model = LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=num_topics,
        random_state=42,  # fixed seed so reruns reproduce the same curve
    )
    # log_perplexity() returns a per-word likelihood bound; the perplexity
    # estimate is 2 ** (-bound), so lower values indicate a better fit.
    perplexity.append(np.exp2(-lda_model.log_perplexity(corpus)))
绘制主题数与困惑度曲线
# Plot perplexity against the number of topics (1 .. max_topics).
topic_counts = range(1, max_topics + 1)
plt.plot(topic_counts, perplexity)

# Label the axes and the figure, then display it.
plt.xlabel('Number of Topics')
plt.ylabel('Perplexity')
plt.title('Number of Topics vs Perplexity')
plt.show()
计算一致性
# Train one LDA model per candidate topic count and record its c_v coherence.
# c_v is computed from the tokenised documents, so their presence is checked
# once up front rather than on every iteration.
if texts is None:
    raise ValueError('’texts’ should be provided for c_v coherence.')

coherence = []
for num_topics in range(1, max_topics + 1):
    lda_model = LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=num_topics,
        random_state=42,  # fixed seed so reruns reproduce the same curve
    )
    coherence_model_lda = CoherenceModel(
        model=lda_model,
        corpus=corpus,
        dictionary=dictionary,
        coherence='c_v',
        texts=texts,
    )
    coherence_lda = coherence_model_lda.get_coherence()
    coherence.append(coherence_lda)
绘制主题数与一致性曲线
# Plot the coherence score against the number of topics (1 .. max_topics).
topic_counts = range(1, max_topics + 1)
plt.plot(topic_counts, coherence)

# Label the axes and the figure, then display it.
plt.xlabel('Number of Topics')
plt.ylabel('Coherence Score')
plt.title('Number of Topics vs Coherence Score')
plt.show()
原文地址: https://www.cveoy.top/t/topic/m1w0 著作权归作者所有。请勿转载和采集!