import jieba
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import csv

Read the text file; each line is one text record

with open('es.csv', 'r', encoding='gbk') as f:
    texts = f.readlines()
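If the file is not actually GBK-encoded, this read raises UnicodeDecodeError. A minimal fallback, assuming losing the occasional undecodable byte is acceptable, is:

with open('es.csv', 'r', encoding='gbk', errors='ignore') as f:
    texts = f.readlines()  # undecodable bytes are silently dropped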

Segment each text with jieba, remove stopwords, and store the results as a list of strings

text_list = []
# split() handles both space- and newline-separated stopword files
with open('stopwords.txt', 'r', encoding='utf-8') as f:
    stopwords = set(f.read().split())
for text in texts:
    words = jieba.cut(text.strip())
    words = [word for word in words if word not in stopwords]
    text_list.append(' '.join(words))
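Note that jieba.cut returns a generator of tokens, not a list, which is why the results are joined into strings above. A quick sanity check (the sample sentence and its segmentation are illustrative only):

print(list(jieba.cut('我爱自然语言处理')))  # e.g. ['我', '爱', '自然语言', '处理']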

Save the segmentation results

with open('F.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    for text in text_list:
        writer.writerow([text])

Apply TF-IDF weighting to the text data

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(text_list)
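To see what the weighting produced, a short sketch that prints one document's strongest terms (get_feature_names_out() assumes scikit-learn >= 1.0; older versions use get_feature_names()):

terms = vectorizer.get_feature_names_out()
row = X[0].toarray().ravel()           # dense TF-IDF weights of document 0
for idx in np.argsort(row)[::-1][:10]:  # ten highest-weighted terms
    if row[idx] > 0:
        print(terms[idx], round(float(row[idx]), 4))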

Save the TF-IDF feature matrix

with open('T.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    for row in X.toarray():
        writer.writerow(row)
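Optionally, writing the vocabulary as a header row makes the columns self-describing (the file name T_with_header.csv is illustrative; the row can be very wide for large vocabularies):

with open('T_with_header.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(vectorizer.get_feature_names_out())  # one column per term
    for row in X.toarray():
        writer.writerow(row)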

Cluster with KMeans

kmeans = KMeans(n_clusters=5, random_state=0)
kmeans.fit(X)
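The choice of n_clusters=5 is arbitrary here. One hedged way to pick k is the silhouette score (higher is better); this loop is a sketch and can be slow on large corpora:

from sklearn.metrics import silhouette_score

for k in range(2, 10):
    km = KMeans(n_clusters=k, random_state=0).fit(X)
    print(k, silhouette_score(X, km.labels_))  # compare scores across k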

Print each text's cluster label and the feature matrix

labels = kmeans.labels_
for i, text in enumerate(texts):
    print('Cluster ' + str(labels[i]) + ': ' + text.strip())
print('Feature matrix:', X.toarray())
print(X)  # sparse-matrix view of the same features
np.savetxt('jz.txt', X.toarray(), fmt='%.8f')
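Since each KMeans centroid lives in TF-IDF space, its largest coordinates name a cluster's dominant terms. A sketch for labeling clusters this way (terms comes from vectorizer.get_feature_names_out(), as above):

terms = vectorizer.get_feature_names_out()
order = kmeans.cluster_centers_.argsort()[:, ::-1]  # terms sorted by weight, per cluster
for c in range(kmeans.n_clusters):
    print('Cluster', c, ':', ' '.join(terms[i] for i in order[c, :8]))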

Save the clustering results

with open('result.txt', 'w', encoding='utf-8') as f:
    for i, text in enumerate(texts):
        f.write('Cluster ' + str(labels[i]) + ': ' + text.strip() + '\n')

To visualize the clustering results, add:

import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

Reduce the TF-IDF feature matrix to two dimensions

pca = PCA(n_components=2)
X_pca = pca.fit_transform(X.toarray())
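It is worth checking how much variance the 2-D projection retains; a low total means the scatter plot only loosely reflects the TF-IDF geometry:

print(pca.explained_variance_ratio_)  # variance captured by each of the two components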

Visualize the clustering results

plt.scatter(X_pca[:, 0], X_pca[:, 1], c=labels)
plt.title('Cluster Result')
plt.show()
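Optionally, projecting the cluster centers through the same PCA and marking them makes each cluster's center visible (a sketch, reusing pca and kmeans from above):

centers_pca = pca.transform(kmeans.cluster_centers_)  # centers mapped into the 2-D plot space
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=labels)
plt.scatter(centers_pca[:, 0], centers_pca[:, 1], marker='x', c='red')
plt.title('Cluster Result with Centers')
plt.show()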

Python: Clustering Text Data with jieba Segmentation, TF-IDF, and KMeans

Original source: https://www.cveoy.top/t/topic/noTx. Copyright belongs to the author. Do not repost or scrape!
