# 使用 Python 和 KMeans 聚类进行文本聚类 (Text clustering with Python + KMeans)
import csv

import jieba
import numpy as np
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
# Load the raw corpus: one document per line of ps.csv (GBK-encoded).
with open('ps.csv', 'r', encoding='gbk') as f:
    texts = list(f)
# Tokenize each document with jieba and drop stopwords; the result is a
# list of space-joined token strings, one per document.
# FIX: the stopword file is now opened with a context manager (the original
# leaked the handle), and split(' ') -> split(): stopword lists are normally
# newline-delimited, which a single-space split would not separate at all.
with open('stopwords.txt', 'r', encoding='utf-8') as sw_file:
    stopwords = set(sw_file.read().split())

text_list = []
for text in texts:
    words = jieba.cut(text.strip())
    text_list.append(' '.join(w for w in words if w not in stopwords))
# Persist the segmented corpus to F.csv, one document per row.
with open('F.csv', 'w', newline='', encoding='utf-8') as out:
    csv.writer(out).writerows([doc] for doc in text_list)
# Turn the segmented documents into a TF-IDF weighted sparse term matrix.
# X has one row per document, one column per vocabulary term.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(text_list)
# Persist the dense TF-IDF matrix to T.csv, one document vector per row.
# NOTE(review): the source labeled this the "stopword-removal result", but
# it actually writes the TF-IDF feature matrix (F.csv holds the filtered text).
with open('T.csv', 'w', newline='', encoding='utf-8') as out:
    csv.writer(out).writerows(X.toarray())
# Partition the documents into 5 clusters; fixed seed for reproducible runs.
# KMeans.fit returns the fitted estimator itself, so the chained call binds
# the same object the original two-statement form did.
kmeans = KMeans(n_clusters=5, random_state=0).fit(X)
# Report each document's cluster assignment, dump the feature matrix, and
# save the clustering result to result.txt.
labels = kmeans.labels_
for i, text in enumerate(texts):
    # Console output in the form "第<cluster>类:<text>".
    print('第' + str(labels[i]) + '类' + ':' + text.strip())
print('特征矩阵: ', X.toarray())
print(X)
np.savetxt('jz.txt', X.toarray())  # plain-text dump of the dense TF-IDF matrix

# Save the clustering result as "第<cluster>类:<text>", one document per line.
# FIX: the original terminated each record with a space, collapsing every
# result onto a single line; a newline gives the stated one-per-line format.
with open('result.txt', 'w', encoding='utf-8') as f:
    for i, text in enumerate(texts):
        f.write('第' + str(labels[i]) + '类' + ':' + text.strip() + '\n')
# 原文地址: https://www.cveoy.top/t/topic/noQ5 著作权归作者所有。请勿转载和采集!