使用Gensim库对txt文件进行LDA主题模型分析
以下是基于Python的Gensim库实现的LDA主题模型代码:
- 加载文本数据并构建语料库
import os
from gensim import corpora
# Load text data
def load_data(path):
    """Read every regular file directly under *path* as UTF-8 text.

    Args:
        path: Directory that contains the documents.

    Returns:
        A list with one string per file, ordered by file name.

    Raises:
        OSError: If *path* cannot be listed or a file cannot be opened.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    data = []
    # os.listdir() yields entries in arbitrary order; sorting keeps the
    # document order (and therefore corpus indices) reproducible.
    for file_name in sorted(os.listdir(path)):
        file_path = os.path.join(path, file_name)
        # Sub-directories and other non-regular entries cannot be read as
        # text; open() would raise on them, so skip them.
        if not os.path.isfile(file_path):
            continue
        with open(file_path, 'r', encoding='utf-8') as f:
            data.append(f.read())
    return data
# Build the corpus
def build_corpus(data):
    """Convert raw documents into a bag-of-words corpus plus its dictionary.

    Each document is lower-cased and split on whitespace. Text that does not
    separate words with whitespace therefore needs to be segmented before it
    is passed in.

    Args:
        data: Iterable of document strings.

    Returns:
        A ``(corpus, dictionary)`` tuple: ``corpus`` is a list holding the
        ``dictionary.doc2bow`` result for every document, and ``dictionary``
        is the ``corpora.Dictionary`` built from the token lists.
    """
    tokenized_docs = [document.lower().split() for document in data]
    dictionary = corpora.Dictionary(tokenized_docs)
    corpus = list(map(dictionary.doc2bow, tokenized_docs))
    return corpus, dictionary
- 训练LDA主题模型
from gensim.models.ldamodel import LdaModel
# Train the LDA topic model
def train_lda_model(corpus, dictionary, num_topics=10, passes=10):
    """Fit a gensim ``LdaModel`` on *corpus*.

    Args:
        corpus: Bag-of-words documents, as produced by ``build_corpus``.
        dictionary: Id-to-word mapping handed to the model as ``id2word``.
        num_topics: Number of topics to extract (default 10).
        passes: Number of training passes over the corpus (default 10).

    Returns:
        The trained ``LdaModel`` instance.
    """
    return LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=num_topics,
        passes=passes,
    )
- 可视化主题分布
import pyLDAvis.gensim
import warnings
warnings.filterwarnings('ignore')
# Visualize the topic distribution
def visualize_topics(lda_model, corpus, dictionary):
    """Prepare the pyLDAvis view of a trained model and return it.

    Args:
        lda_model: Trained LDA model.
        corpus: Bag-of-words corpus the model was trained on.
        dictionary: Dictionary that maps token ids to words.

    Returns:
        The object produced by ``pyLDAvis.display``. It is returned rather
        than discarded because a value created inside a function is only
        rendered by a notebook when it reaches the end of the cell (or is
        displayed explicitly by the caller).
    """
    # NOTE(review): recent pyLDAvis releases appear to expose this helper as
    # ``pyLDAvis.gensim_models`` instead of ``pyLDAvis.gensim`` - confirm
    # against the installed version and adjust the file-level import if so.
    vis_data = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
    return pyLDAvis.display(vis_data)
- 主程序
def main(data_dir='./data'):
    """Run the pipeline: load documents, build the corpus, train, visualize.

    Args:
        data_dir: Directory holding the input documents (default ``'./data'``).

    Returns:
        Whatever ``visualize_topics`` returns.
    """
    data = load_data(data_dir)
    corpus, dictionary = build_corpus(data)
    lda_model = train_lda_model(corpus, dictionary)
    return visualize_topics(lda_model, corpus, dictionary)


# Keep the module importable without side effects; only run when executed
# as a script.
if __name__ == '__main__':
    main()
以上是基于Gensim库的LDA主题模型代码,可以通过调整 num_topics(主题数)、passes(训练遍数)等模型参数来优化主题分布效果。注意:代码按空白字符切分词语,中文等不以空格分隔词语的文本需要先进行分词处理。
原文地址: https://www.cveoy.top/t/topic/n5Sk 著作权归作者所有。请勿转载和采集!