# Imports were collapsed onto one line by the scrape, which is a SyntaxError.
# One import per line, grouped third-party after stdlib (PEP 8).
import csv

import jieba
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

读取文本文件,每行为一个文本数据

# Load the corpus: one document per line, GBK-encoded source file.
# readlines() keeps the trailing newline on each entry; downstream code strips it.
with open('ps.csv', 'r', encoding='gbk') as src:
    texts = src.readlines()

对每个文本进行分词,保存为字符串列表

# Segment each document with jieba and drop stopwords, producing one
# space-joined token string per document in text_list.
text_list = []
# Fixes: the stopword file was opened without a context manager (leaked file
# handle), and split(' ') breaks on newline-separated stopword files — the
# whole file became a single "stopword" and real entries kept their '\n'.
# split() handles space- and newline-separated formats alike.
with open('stopwords.txt', 'r', encoding='utf-8') as sw:
    stopwords = set(sw.read().split())
for text in texts:
    words = jieba.cut(text.strip())
    filtered = [word for word in words if word not in stopwords]
    text_list.append(' '.join(filtered))

保存分词结果

# Persist the segmentation result: one document (its space-joined tokens)
# per CSV row in a single column.
with open('F.csv', 'w', newline='', encoding='utf-8') as out:
    csv.writer(out).writerows([doc] for doc in text_list)

对文本数据进行tf-idf处理

# Vectorize the segmented documents into a sparse TF-IDF matrix X
# (rows = documents, columns = vocabulary terms).
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(text_list)

保存TF-IDF特征矩阵(每行对应一个文本的特征向量)

# Dump the dense TF-IDF matrix to CSV, one document's feature vector per row.
# NOTE: toarray() densifies the sparse matrix — fine for small corpora.
with open('T.csv', 'w', newline='', encoding='utf-8') as out:
    csv.writer(out).writerows(X.toarray())

使用KMeans聚类

# Cluster the TF-IDF vectors into 5 groups; fixed random_state keeps
# the assignment reproducible across runs.
kmeans = KMeans(n_clusters=5, random_state=0).fit(X)

输出每个文本所属的簇标签和特征矩阵

# Print each document with its assigned cluster label.
# (A commented-out duplicate of this section was removed.)
labels = kmeans.labels_
for i, text in enumerate(texts):
    print('第' + str(labels[i]) + '类' + ':' + text.strip())

# Show and persist the dense TF-IDF feature matrix.
print('特征矩阵: ', X.toarray())
print(X)
np.savetxt('jz.txt', X.toarray())

# Save the clustering result, one '第i类:文本' entry per line.
# NOTE(review): the scraped original ended the write with ' ', which puts the
# whole result on one line; '\n' matches the stated per-line format — the
# original '\n' was presumably lost in extraction. Confirm desired separator.
with open('result.txt', 'w', encoding='utf-8') as f:
    for i, text in enumerate(texts):
        f.write('第' + str(labels[i]) + '类' + ':' + text.strip() + '\n')

使用 Python 和 KMeans 聚类进行文本聚类

原文地址: https://www.cveoy.top/t/topic/noQ5 著作权归作者所有。请勿转载和采集!

免费AI点我,无需注册和登录