# LDA topic-number selection with gensim: plots perplexity and c_v coherence against the number of topics.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import gensim
from gensim.corpora import Dictionary
from gensim.models import LdaModel, CoherenceModel
# Load the documents from the spreadsheet.
data = pd.read_excel('data_perplexity.xlsx')
# Build one token list per row (one row = one document).  DataFrame.apply with
# its default axis passes a whole column to the callable, so the previous
# `data.apply(lambda x: str(x).split())` tokenised the printed repr of each
# column rather than the documents themselves.
# NOTE(review): every non-empty cell of a row is treated as part of the
# document and tokens are split on whitespace only -- confirm this matches the
# sheet layout (text that is not already word-segmented needs a real tokenizer).
texts = [
    " ".join(str(cell) for cell in row if pd.notna(cell)).split()
    for row in data.itertuples(index=False, name=None)
]
# Build the token dictionary and the bag-of-words corpus.
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
# Largest number of topics to evaluate (candidates run from 1 to this value).
max_topics = 16
# Train one LDA model per candidate topic count and record its perplexity.
# NOTE(review): no random_state is passed to LdaModel, so the values are
# expected to differ from run to run -- confirm whether that is acceptable.
perplexity = []
for num_topics in range(1, max_topics + 1):
    lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics)
    # log_perplexity() yields a per-word likelihood bound; 2 ** (-bound)
    # converts it into the (positive) perplexity value that gets plotted.
    perplexity.append(np.exp2(-lda_model.log_perplexity(corpus)))
# Plot perplexity against the number of topics.
plt.plot(range(1, max_topics + 1), perplexity)
plt.xlabel("Number of Topics")
plt.ylabel("Perplexity")
plt.title("Number of Topics vs Perplexity")
plt.show()
# Compute the coherence score of one LDA model per candidate topic count.
coherence_measure = 'c_v'
# Validate once, before any training.  The earlier in-loop guard compared the
# `coherence` result list with the string 'c_v', which is never equal, so its
# ValueError could not be raised.
if coherence_measure == 'c_v' and texts is None:
    raise ValueError("'texts' should be provided for c_v coherence.")

coherence = []
for num_topics in range(1, max_topics + 1):
    lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics)
    coherence_model_lda = CoherenceModel(
        model=lda_model,
        corpus=corpus,
        dictionary=dictionary,
        coherence=coherence_measure,
        texts=texts,
    )
    coherence_lda = coherence_model_lda.get_coherence()
    coherence.append(coherence_lda)
# Plot the coherence score against the number of topics.
plt.plot(range(1, max_topics + 1), coherence)
plt.xlabel("Number of Topics")
plt.ylabel("Coherence Score")
plt.title("Number of Topics vs Coherence Score")
plt.show()
# Source: https://www.cveoy.top/t/topic/bxyU -- copyright belongs to the author; do not repost or scrape.