import gensim
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from gensim.corpora import Dictionary
from gensim.models import CoherenceModel, LdaModel

# 读取数据 (Load the data)

# Load the spreadsheet; every row is treated as one document.
data = pd.read_excel('data_perplexity.xlsx')

# Tokenise row by row.  DataFrame.apply defaults to axis=0, so the earlier
# `data.apply(lambda x: str(x).split())` handed whole columns to the lambda
# and split their printed repr (index labels, "Name:"/"dtype:" footer and
# "..." truncation) instead of the cell text.
# NOTE(review): all cells of a row are joined because the column that holds
# the text is not identifiable from this script -- narrow to that column if
# the sheet also carries ids or other metadata.
texts = [
    # Missing cells are skipped so they do not contribute a "nan" token.
    ' '.join(str(cell) for cell in row if pd.notna(cell)).split()
    for row in data.itertuples(index=False, name=None)
]

# 构建词典 (Build the dictionary)

# Build the token vocabulary from the tokenised documents.
dictionary = Dictionary(texts)

# Run every document through the vocabulary's doc2bow conversion.
corpus = list(map(dictionary.doc2bow, texts))

# 定义最大主题数 (Set the maximum number of topics)

# Largest topic count to evaluate; the training loops in this script iterate
# over range(1, max_topics+1), so this value itself is included.
max_topics = 16

# 训练LDA模型并计算困惑度 (Train LDA models and compute perplexity)

# Train one LDA model per candidate topic count (1..max_topics inclusive) and
# record its perplexity on the corpus.
perplexity = []
for num_topics in range(1, max_topics + 1):
    lda_model = LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=num_topics,
        # Fixed seed: LDA training starts from a random initialisation, so
        # without it the curve differs on every run and cannot be compared.
        random_state=42,
    )
    # Per gensim's documentation log_perplexity() returns a per-word
    # likelihood bound and perplexity is 2 ** (-bound); the negation is what
    # turns the (negative) bound into a positive perplexity value.
    perplexity.append(np.exp2(-lda_model.log_perplexity(corpus)))

# 绘制主题数与困惑度曲线 (Plot perplexity against the number of topics)

# Draw the perplexity curve on the current axes, label it, and display it.
axes = plt.gca()
axes.plot(range(1, max_topics + 1), perplexity)
axes.set(
    xlabel="Number of Topics",
    ylabel="Perplexity",
    title="Number of Topics vs Perplexity",
)
plt.show()

# 计算一致性 (Compute coherence)

# Train one LDA model per candidate topic count (1..max_topics inclusive) and
# record its 'u_mass' coherence score.
coherence = []
for num_topics in range(1, max_topics + 1):
    lda_model = LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=num_topics,
        # Fixed seed: LDA training starts from a random initialisation, so
        # without it the scores differ on every run and cannot be compared.
        random_state=42,
    )
    coherence_model_lda = CoherenceModel(
        model=lda_model,
        corpus=corpus,
        dictionary=dictionary,
        coherence='u_mass',
    )
    coherence_lda = coherence_model_lda.get_coherence()
    coherence.append(coherence_lda)

# 绘制主题数与一致性曲线 (Plot coherence against the number of topics)

# Draw the coherence curve on the current axes, label it, and display it.
axes = plt.gca()
axes.plot(range(1, max_topics + 1), coherence)
axes.set(
    xlabel="Number of Topics",
    ylabel="Coherence Score",
    title="Number of Topics vs Coherence Score",
)
plt.show()

import pandas as pd import numpy as np import matplotlibpyplot as plt import gensim from gensimcorpora import Dictionary from gensimmodels import LdaModel CoherenceModel # 读取数据 data = pdread_exceldat

import pandas as pd import numpy as np import matplotlibpyplot as plt import gensim from gensimcorpora import Dictionary from gensimmodels import LdaModel CoherenceModel # 读取数据 data = pdread_exceldat