Explain this code
Import the required libraries
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV
# scikit-learn has no coherence metric (the original
# `from sklearn.metrics import coherence_score` does not exist);
# gensim's CoherenceModel is used for topic coherence instead
from gensim.corpora import Dictionary
from gensim.models import CoherenceModel
Read in the dataset
data = pd.read_excel('data_perplexity.xlsx')
corpus = data['content'].tolist()
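The code assumes an Excel file with a content column of raw text. If that file is not at hand, a toy stand-in (hypothetical documents, for illustration only) lets the rest of the pipeline run:

data = pd.DataFrame({'content': [           # hypothetical stand-in data
    'topic models find latent themes in text',
    'lda assigns each document a mixture of topics',
    'perplexity and coherence help pick the topic count',
]})
corpus = data['content'].tolist()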
Build the bag-of-words model
vectorizer = CountVectorizer(token_pattern=r'\w+', max_df=0.95, min_df=2)
X = vectorizer.fit_transform(corpus)
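A quick sanity check, not part of the original code, shows what the vectorizer produced (get_feature_names_out requires scikit-learn >= 1.0):

print(X.shape)                                  # (n_documents, n_vocabulary_terms)
print(vectorizer.get_feature_names_out()[:10])  # first few vocabulary terms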
Set the parameter range
params = {'n_components': list(range(2, 17))}
Use GridSearchCV to select the optimal parameters
# GridSearchCV ranks candidates with LDA's built-in score method
# (approximate log-likelihood), so no explicit scorer is needed
lda = LatentDirichletAllocation()
grid = GridSearchCV(lda, params, cv=5, verbose=1, n_jobs=-1)
grid.fit(X)
Get the best model
best_lda = grid.best_estimator_
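To see what the search chose, the best parameter setting and the top words per topic can be printed. This inspection step is an addition, not part of the original code:

print(grid.best_params_)  # e.g. {'n_components': ...}
feature_names = vectorizer.get_feature_names_out()
for k, weights in enumerate(best_lda.components_):
    top = [feature_names[i] for i in weights.argsort()[:-11:-1]]  # 10 highest-weight terms
    print(f'Topic {k}: {" ".join(top)}')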
Plot the number of topics against perplexity
n_topics = list(range(2, 17))
perplexity_score = []
for n in n_topics:
    lda = LatentDirichletAllocation(n_components=n, max_iter=50, learning_method='online')
    lda.fit(X)
    perplexity_score.append(lda.perplexity(X))
plt.plot(n_topics, perplexity_score)
plt.xlabel('Number of topics')
plt.ylabel('Perplexity score')
plt.show()
Plot the number of topics against coherence
# The original called coherence_score(lda, X, vectorizer), which does not exist
# in scikit-learn (and the name collided with the list being appended to).
# gensim's CoherenceModel is swapped in, scoring each topic's top 10 words (c_v).
analyzer = vectorizer.build_analyzer()  # reuse the vectorizer's tokenisation
texts = [analyzer(doc) for doc in corpus]
dictionary = Dictionary(texts)
feature_names = vectorizer.get_feature_names_out()
coherence_scores = []
for n in n_topics:
    lda = LatentDirichletAllocation(n_components=n, max_iter=50, learning_method='online')
    lda.fit(X)
    topics = [[feature_names[i] for i in comp.argsort()[:-11:-1]] for comp in lda.components_]
    cm = CoherenceModel(topics=topics, texts=texts, dictionary=dictionary, coherence='c_v')
    coherence_scores.append(cm.get_coherence())
plt.plot(n_topics, coherence_scores)
plt.xlabel('Number of topics')
plt.ylabel('Coherence score')
plt.show()
Code explanation
1. Import the required libraries.
2. Read in the dataset and convert the content column to a list.
3. Build the bag-of-words model, dropping words that appear in more than 95% of documents or in fewer than 2 documents.
4. Set the range of parameters to tune.
5. Use GridSearchCV to select the best LDA model; cv=5 means five-fold cross-validation and n_jobs=-1 uses all CPU cores in parallel (see the sketch after this list for how candidates are scored).
6. Get the best LDA model.
7. Plot the number of topics against perplexity; perplexity_score holds the perplexity values, max_iter is the number of iterations, and learning_method is the learning method.
8. Plot the number of topics against coherence; coherence_scores holds the coherence values, computed with gensim's CoherenceModel since scikit-learn provides no coherence function.
9. Display the plots.
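For step 5, a short sketch of how the cross-validated scores can be read back. GridSearchCV relies on LDA's score method (approximate log-likelihood), so higher is better:

print(grid.best_score_)  # mean cross-validated score of the best candidate
for n, s in zip(grid.cv_results_['param_n_components'],
                grid.cv_results_['mean_test_score']):
    print(n, s)          # mean score for each candidate topic count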