import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import gensim
from gensim.corpora import Dictionary
from gensim.models import LdaModel, CoherenceModel

# Read the data

# Load the spreadsheet of documents.
data = pd.read_excel('data_perplexity.xlsx')

# Build one whitespace-tokenized word list per spreadsheet row.
# DataFrame.apply works column-wise by default, so calling str(x).split()
# through it would tokenize the printed repr of each whole column (index
# labels, dtype footer, "..." truncation) instead of the documents.
# NOTE(review): assumes every cell in a row belongs to that row's document;
# if the sheet carries id/metadata columns, select the text column explicitly.
texts = [
    " ".join(str(cell) for cell in row if pd.notna(cell)).split()
    for row in data.itertuples(index=False, name=None)
]

# Build the dictionary

# Token <-> integer-id mapping built from the tokenized documents.
dictionary = Dictionary(documents=texts)
# Bag-of-words form of every document, in the same order as `texts`.
corpus = list(map(dictionary.doc2bow, texts))

# Define the maximum number of topics

# Largest topic count to evaluate: the sweeps in this script iterate over
# range(1, max_topics + 1), i.e. every value from 1 up to and including this.
max_topics = 16

# Train LDA models and compute perplexity

# One perplexity value per candidate topic count (1 .. max_topics), measured
# on the same corpus the model was trained on.
perplexity = []
for num_topics in range(1, max_topics + 1):
    # A fixed seed keeps the resulting curve reproducible from run to run.
    lda_model = LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=num_topics,
        random_state=42,
    )
    # log_perplexity returns a per-word likelihood bound; 2 ** (-bound)
    # converts it into a positive perplexity value.
    perplexity.append(np.exp2(-lda_model.log_perplexity(corpus)))

# Plot perplexity against the number of topics

# Line chart: x = candidate topic counts, y = perplexity recorded for each.
topic_counts = range(1, max_topics + 1)
plt.plot(topic_counts, perplexity)
plt.title("Number of Topics vs Perplexity")
plt.ylabel("Perplexity")
plt.xlabel("Number of Topics")
plt.show()

# Compute coherence

# Coherence measure used for every model in the sweep.
coherence_measure = 'c_v'

# c_v is computed from the tokenized documents, so fail early if they are
# missing. The check compares the measure name (not the results list, which
# can never equal a string) and runs once, before any model is trained.
if coherence_measure == 'c_v' and texts is None:
    raise ValueError("'texts' should be provided for c_v coherence.")

# One coherence score per candidate topic count (1 .. max_topics).
# NOTE(review): CoherenceModel may start worker processes; on platforms that
# spawn new interpreters this script may need an `if __name__ == "__main__":`
# guard -- confirm on the target platform.
coherence = []
for num_topics in range(1, max_topics + 1):
    # A fixed seed keeps the resulting curve reproducible from run to run.
    lda_model = LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=num_topics,
        random_state=42,
    )
    coherence_model_lda = CoherenceModel(
        model=lda_model,
        corpus=corpus,
        dictionary=dictionary,
        coherence=coherence_measure,
        texts=texts,
    )
    coherence.append(coherence_model_lda.get_coherence())

# Plot coherence score against the number of topics

# Line chart: x = candidate topic counts, y = coherence score for each.
topic_counts = range(1, max_topics + 1)
plt.plot(topic_counts, coherence)
plt.title("Number of Topics vs Coherence Score")
plt.ylabel("Coherence Score")
plt.xlabel("Number of Topics")
plt.show()

# NOTE: paste residue. A truncated, whitespace-collapsed duplicate of the top
# of this script appeared here twice; it is kept only as a comment so that the
# file still parses:
#   import pandas as pd import numpy as np import matplotlibpyplot as plt import gensim from gensimcorpora import Dictionary from gensimmodels import LdaModel CoherenceModel # Read the data data = pdread_