Python文本主题分类：用jieba分词和LDA模型分析汽车评论数据

本文将介绍如何使用Python对中文文本进行主题分类，以汽车评论数据为例，使用jieba库进行分词，LDA模型进行主题建模，并通过可视化工具展示结果。

1. 数据读取和预处理

import pandas as pd
import re
import jieba

# Load the raw reviews. The sheet is read without a header row, so the
# single column is given an explicit name afterwards.
data = pd.read_excel('汽车.xlsx', header=None)
data.columns = ['content']
data.head()  # notebook preview of the first rows

# Chinese word segmentation helpers
from functools import lru_cache


@lru_cache(maxsize=None)
def _load_stop_words(path='cn_stopwords.txt'):
    """Read the stop-word file once and return its entries as a frozenset.

    The file holds one stop word per line; add any further words that should
    be hidden from the results to that file. The result is cached so the file
    is not re-read for every document that gets segmented.
    """
    with open(path, encoding='utf-8') as f:
        return frozenset(line.rstrip('\n') for line in f)


def chinese_word_cut(mytext):
    """Segment one document into space-separated Chinese words.

    Args:
        mytext: The raw text of one document. Non-string values (for example
            the NaN that pandas produces for an empty cell) are treated as
            empty text.

    Returns:
        A single string with the kept words joined by spaces. Words that are
        in the stop-word list or shorter than two characters are dropped.
        Returns '' when the input contains no Chinese characters.

    Raises:
        FileNotFoundError: If 'cn_stopwords.txt' is missing and the input
            contains Chinese text.
    """
    if not isinstance(mytext, str):
        return ''

    # Pre-processing: discard everything except runs of Chinese characters.
    chinese_runs = re.findall('[一-龥]+', mytext)
    if not chinese_runs:
        return ''
    new_data = ' '.join(chinese_runs)

    stop_words = _load_stop_words()

    # NOTE(review): cut_all=True selects jieba's "full" mode, which can emit
    # overlapping words for the same characters. Confirm this is intended;
    # cut_all=False gives the precise (non-overlapping) segmentation.
    segments = jieba.cut(new_data, cut_all=True)
    return ' '.join(
        word for word in segments
        if len(word) > 1 and word not in stop_words
    )

# Add a column holding the segmented (space-separated) form of every review.
data['content_cutted'] = data['content'].apply(chinese_word_cut)
data.head()  # notebook preview of the first rows

2. 特征提取和LDA模型训练

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Build the document-term count matrix used as LDA input.
n_features = 1000  # keep at most 1000 vocabulary terms
vectorizer_options = dict(
    strip_accents='unicode',
    stop_words='english',
    max_features=n_features,
    max_df=0.5,   # float: drop terms present in more than 50% of documents
    min_df=10,    # int: drop terms present in fewer than 10 documents
)
tf_vectorizer = CountVectorizer(**vectorizer_options)
tf = tf_vectorizer.fit_transform(data.content_cutted)

# Train the LDA topic model on the count matrix.
n_topics = 5  # number of topics to extract
lda = LatentDirichletAllocation(
    n_components=n_topics,
    max_iter=50,
    learning_method='batch',
    # NOTE(review): learning_offset is documented as an online-learning
    # parameter; it presumably has no effect with learning_method='batch'.
    learning_offset=50,
    doc_topic_prior=0.1,
    topic_word_prior=0.01,
    random_state=666,  # fixed seed for reproducible topics
)
lda.fit(tf)

3. 主题词提取和结果可视化

# Print and collect the top words of every topic
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted words of each topic and return them.

    Args:
        model: Fitted topic model exposing ``components_``; each row is
            iterated as one topic's weights and must support ``argsort()``.
        feature_names: Sequence mapping a column index to its word.
        n_top_words: How many words to report per topic.

    Returns:
        A list with one string per topic: that topic's top words, strongest
        first, joined by single spaces.
    """
    top_words_per_topic = []
    for topic_no, weights in enumerate(model.components_):
        # argsort is ascending, so reverse it and keep the leading entries.
        ranked_ids = weights.argsort()[::-1][:n_top_words]
        words = ' '.join(feature_names[term_id] for term_id in ranked_ids)
        print(f'Topic #{topic_no}:')
        print(words)
        top_words_per_topic.append(words)
    return top_words_per_topic

n_top_words = 25  # number of words to list per topic
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2. Prefer get_feature_names_out() and only fall back to the
# old method on releases that do not have it yet.
if hasattr(tf_vectorizer, 'get_feature_names_out'):
    tf_feature_names = list(tf_vectorizer.get_feature_names_out())
else:
    tf_feature_names = tf_vectorizer.get_feature_names()
topic_word = print_top_words(lda, tf_feature_names, n_top_words)

# Attach the dominant topic of every document to the data set
import numpy as np

# One row of topic weights per document.
topics = lda.transform(tf)
# Hard-assign each document to its highest-weighted topic; on ties argmax
# picks the first (lowest) topic index.
topic = np.argmax(topics, axis=1).tolist()
data['topic'] = topic
data.to_excel('data_topic.xlsx', index=False)  # save the result as an Excel file

# Plot the perplexity curve to help choose the number of topics
import matplotlib.pyplot as plt

plexs = []   # perplexity of each candidate model (lower is better)
scores = []  # approximate log-likelihood of each candidate model
n_max_topics = 16  # exclusive upper bound: candidates are 1 .. 15 topics
x = list(range(1, n_max_topics))
for n_candidate in x:
    # Use a separate name so the final model stored in `lda` is not replaced
    # by the last candidate of this sweep.
    # NOTE(review): the candidates use scikit-learn's default priors, unlike
    # the final model, which sets doc_topic_prior/topic_word_prior explicitly.
    lda_candidate = LatentDirichletAllocation(n_components=n_candidate,
                                              max_iter=50,
                                              learning_method='batch',
                                              learning_offset=50,
                                              random_state=666)
    lda_candidate.fit(tf)
    plexs.append(lda_candidate.perplexity(tf))
    scores.append(lda_candidate.score(tf))

# plexs[i] belongs to x[i] topics; plot every point so the x axis matches the
# topic count that actually produced each perplexity value.
plt.plot(x, plexs)
plt.xlabel('number of topics')
plt.ylabel('perplexity')
plt.show()

# Interactive topic visualisation with pyLDAvis (renders inside a notebook)
import gensim
import pyLDAvis
try:
    # Newer pyLDAvis releases renamed the gensim helper module.
    import pyLDAvis.gensim_models as gensim_vis
except ImportError:
    import pyLDAvis.gensim as gensim_vis

# gensim cannot consume the scipy document-term matrix directly, so wrap it as
# a bag-of-words corpus. documents_columns=False because `tf` holds one
# document per row.
corpus_gensim = gensim.matutils.Sparse2Corpus(tf, documents_columns=False)
id2word = {int(term_id): term
           for term, term_id in tf_vectorizer.vocabulary_.items()}
# prepare() expects a gensim Dictionary, not the scikit-learn vectorizer.
dictionary = gensim.corpora.Dictionary.from_corpus(corpus_gensim,
                                                   id2word=id2word)

# NOTE: this trains a separate gensim LDA model on the same term counts; it is
# not a conversion of the scikit-learn model fitted earlier, so its topics can
# differ from the ones printed above.
lda_gensim = gensim.models.ldamodel.LdaModel(
    corpus=corpus_gensim,
    id2word=id2word,
    num_topics=n_topics
)

# Build and display the visualisation.
vis_data = gensim_vis.prepare(lda_gensim, corpus_gensim, dictionary)
pyLDAvis.display(vis_data)

总结

本文介绍了如何使用Python对中文文本进行主题分类，并以汽车评论数据为例进行了实战演示。通过jieba库进行分词、LDA模型进行主题建模以及可视化工具展示结果，可以帮助我们更好地理解文本数据背后的主题信息。需要注意的是，LDA模型的参数设置和主题个数的选择都会影响最终的结果，需要根据实际情况进行调整。