import gensim
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from gensim.corpora import Dictionary
from gensim.models import LdaModel, CoherenceModel

# Load the data.
data = pd.read_excel('data_perplexity.xlsx')

# Build one token list per spreadsheet row.  DataFrame.apply() without
# axis=1 hands whole columns to the callback, so the previous code tokenised
# the printed representation of each column rather than the documents.
# NOTE(review): assumes every row is one document whose cells hold
# whitespace-separated tokens -- confirm against the workbook layout.
texts = [
    ' '.join(str(cell) for cell in row if pd.notna(cell)).split()
    for row in data.itertuples(index=False)
]

# Build the dictionary from the tokenised documents, then convert every
# document to its bag-of-words representation.
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# Largest number of topics to evaluate (candidates run from 1 to this value).
max_topics = 16

# Train one LDA model per candidate topic count and record its perplexity
# on the training corpus.
perplexity = []
for num_topics in range(1, max_topics + 1):
    # A fixed seed keeps the curve reproducible between runs.
    lda_model = LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=num_topics,
        random_state=42,
    )
    # log_perplexity() returns the per-word likelihood bound;
    # the perplexity itself is 2 ** (-bound).
    perplexity.append(np.exp2(-lda_model.log_perplexity(corpus)))

# Plot perplexity against the number of topics.
plt.plot(range(1, max_topics + 1), perplexity)
plt.xlabel('Number of Topics')
plt.ylabel('Perplexity')
plt.title('Number of Topics vs Perplexity')
plt.show()

# Train one LDA model per candidate topic count and record its c_v coherence.
# The c_v measure is computed from the tokenised documents, so check once up
# front that they are available.  (The previous in-loop check compared the
# result list to the string 'c_v' and therefore could never trigger.)
if texts is None:
    raise ValueError('’texts’ should be provided for c_v coherence.')

# NOTE(review): c_v coherence may start worker processes; on platforms that
# use the "spawn" start method this script may need an
# `if __name__ == '__main__':` guard -- confirm.
coherence = []
for num_topics in range(1, max_topics + 1):
    # A fixed seed keeps the curve reproducible between runs.
    lda_model = LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=num_topics,
        random_state=42,
    )
    coherence_model_lda = CoherenceModel(
        model=lda_model,
        corpus=corpus,
        dictionary=dictionary,
        coherence='c_v',
        texts=texts,
    )
    coherence_lda = coherence_model_lda.get_coherence()
    coherence.append(coherence_lda)

# Plot the coherence score against the number of topics.
plt.plot(range(1, max_topics + 1), coherence)
plt.xlabel('Number of Topics')
plt.ylabel('Coherence Score')
plt.title('Number of Topics vs Coherence Score')
plt.show()

# LDA主题模型选择：困惑度和一致性分析 (LDA topic model selection: perplexity and coherence analysis)