# LDA主题模型困惑度与一致性曲线绘制 - Python代码示例
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import gensim
from gensim.corpora import Dictionary
from gensim.models import LdaModel, CoherenceModel
# Load the documents: the 'texts' column holds one whitespace-delimited
# document per row.
data = pd.read_excel('data_perplexity.xlsx')
# Blank spreadsheet cells are read as NaN (a float), which has no .split();
# drop them and coerce the remaining cells to str so every row tokenizes.
texts = data['texts'].dropna().astype(str).str.split()
# Build the token -> id dictionary and the bag-of-words corpus from the
# tokenized documents.
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
# Largest number of topics to evaluate; candidates are 1..max_topics inclusive.
max_topics = 16
topic_counts = range(1, max_topics + 1)

# Train one LDA model per candidate topic count and score it on both metrics.
# Each model is trained once and reused for both scores so that the two curves
# describe the same models; the fixed random_state makes reruns reproducible.
perplexity = []
coherence = []
for num_topics in topic_counts:
    lda_model = LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=num_topics,
        random_state=0,
    )
    # log_perplexity() returns the per-word likelihood bound, not the
    # perplexity itself; gensim defines perplexity as 2 ** (-bound).
    perplexity.append(np.exp2(-lda_model.log_perplexity(corpus)))

    coherence_model_lda = CoherenceModel(
        model=lda_model,
        corpus=corpus,
        dictionary=dictionary,
        coherence='u_mass',
    )
    coherence_lda = coherence_model_lda.get_coherence()
    coherence.append(coherence_lda)

# Plot perplexity against the number of topics.
plt.plot(topic_counts, perplexity)
plt.xlabel('Number of Topics')
plt.ylabel('Perplexity')
plt.title('Number of Topics vs Perplexity')
plt.show()

# Plot u_mass coherence against the number of topics.
plt.plot(topic_counts, coherence)
plt.xlabel('Number of Topics')
plt.ylabel('Coherence Score')
plt.title('Number of Topics vs Coherence Score')
plt.show()
# 注：该代码仅为示例，实际情况中需根据数据集和模型进行适当调整。
# 原文地址: https://www.cveoy.top/t/topic/m1ij 著作权归作者所有。请勿转载和采集!