# LDA主题模型困惑度与一致性曲线绘制 - Python代码示例 - 常规

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import gensim
from gensim.corpora import Dictionary
from gensim.models import LdaModel, CoherenceModel

# Load the documents. Each cell of the 'texts' column is expected to hold one
# document as a whitespace-separated string of tokens.
data = pd.read_excel('data_perplexity.xlsx')

# Empty spreadsheet cells are read as NaN (a float), which has no .split();
# drop them and coerce the remaining cells to str so that a single blank or
# numeric row cannot abort the whole run with an AttributeError.
texts = data['texts'].dropna().astype(str).apply(str.split)

# Build the token <-> id mapping, then the bag-of-words corpus
# (one list of (token_id, count) pairs per document).
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# Largest number of topics to evaluate; models are trained for 1..max_topics.
max_topics = 16

# Train one LDA model per topic count and record its perplexity on the
# training corpus.
perplexity = []
for num_topics in range(1, max_topics + 1):
    # Fixed seed so that re-running the script reproduces the same curve.
    lda_model = LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=num_topics,
        random_state=42,
    )
    # log_perplexity() returns the per-word likelihood bound (typically a
    # negative number, higher is better), not the perplexity itself. gensim
    # defines perplexity as 2 ** (-bound), so convert before storing; lower
    # values of the stored quantity are better.
    per_word_bound = lda_model.log_perplexity(corpus)
    perplexity.append(2 ** (-per_word_bound))

# Line plot of the collected `perplexity` values against the number of topics.
topic_counts = range(1, max_topics + 1)
plt.plot(topic_counts, perplexity)
plt.title('Number of Topics vs Perplexity')
plt.xlabel('Number of Topics')
plt.ylabel('Perplexity')
plt.show()

# Train one LDA model per topic count and record its UMass coherence.
# Note: a fresh model is trained here for every topic count; the perplexity
# loop above trains its own, separate models.
coherence = []
for num_topics in range(1, max_topics + 1):
    # Fixed seed so that re-running the script reproduces the same curve.
    lda_model = LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=num_topics,
        random_state=42,
    )
    # 'u_mass' coherence is estimated from the bag-of-words corpus passed via
    # corpus=, so the tokenised texts do not have to be supplied.
    coherence_model_lda = CoherenceModel(
        model=lda_model,
        corpus=corpus,
        dictionary=dictionary,
        coherence='u_mass',
    )
    coherence_lda = coherence_model_lda.get_coherence()
    coherence.append(coherence_lda)

# Line plot of the collected `coherence` scores against the number of topics.
topic_counts = range(1, max_topics + 1)
plt.plot(topic_counts, coherence)
plt.title('Number of Topics vs Coherence Score')
plt.xlabel('Number of Topics')
plt.ylabel('Coherence Score')
plt.show()

# 注：该代码仅为示例，实际情况中需根据数据集和模型进行适当调整。