停用词表采用哈工大等停用词表。给定数据集 corpus,文件地址为 E:/Learning/大三下/自然语言处理/corpus。该数据集包含 2 类数据,其目录结构为:2 个类别目录(neg、pos),每个目录下面有若干个文本文件,其结构如下所示:neg/1.txt、2.txt……;pos/1.txt、2.txt……。采用相关的评价分析方法(如精确率、召回率和 F1 值)进行分析。用 Python 编写。
以下是Python代码实现:
import os
import jieba
import codecs
# Load the stopword list (e.g. the HIT stopword list), one word per line.
def load_stopwords(filepath):
    """Read a UTF-8 stopword file and return the stopwords as a list.

    Args:
        filepath: path to a text file with one stopword per line.

    Returns:
        List of stripped, non-empty stopwords. Blank lines are skipped so
        an empty string never ends up in the stopword list (the original
        appended every stripped line, including empty ones).
    """
    with codecs.open(filepath, 'r', encoding='utf-8') as f:
        return [word for word in (line.strip() for line in f) if word]
# Tokenize a sentence with jieba and drop stopwords.
def seg_and_remove_stopwords(sentence, stopwords):
    """Segment *sentence* with jieba and filter out stopwords.

    Args:
        sentence: raw text to segment.
        stopwords: iterable of stopwords. Converted to a set once so each
            membership test is O(1); the original tested `word not in list`,
            which is O(len(stopwords)) per token.

    Returns:
        The surviving tokens joined by single spaces.
    """
    stopword_set = set(stopwords)
    tokens = jieba.cut(sentence)
    return ' '.join(word for word in tokens if word not in stopword_set)
# Compute precision, recall and F1 for the positive class (label == 1).
def evaluate(actual, predicted):
    """Return ``(precision, recall, f1)`` for binary 0/1 label sequences.

    Bug fixed: the original counted every ``actual[i] == predicted[i]``
    match as a true positive, so true negatives (0, 0) inflated all three
    metrics. Correct tallies:
        tp: actual == 1 and predicted == 1
        fp: actual == 0 and predicted == 1
        fn: actual == 1 and predicted == 0

    Zero denominators yield 0.0 instead of raising ZeroDivisionError.
    """
    tp = fp = fn = 0
    for a, p in zip(actual, predicted):
        if a == 1 and p == 1:
            tp += 1
        elif a == 0 and p == 1:
            fp += 1
        elif a == 1 and p == 0:
            fn += 1
    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    f1_score = (2 * precision * recall / (precision + recall)
                if precision + recall else 0.0)
    return precision, recall, f1_score
import random

# --- Load the stopword list -------------------------------------------------
stopwords = load_stopwords('stopwords.txt')

# --- Read the corpus: one sub-directory per class (neg -> 0, pos -> 1) ------
corpus_dir = 'E:/Learning/大三下/自然语言处理/corpus'
corpus = []
labels = []
for category in os.listdir(corpus_dir):
    if category.startswith('.'):
        continue  # skip hidden entries such as .DS_Store
    category_dir = os.path.join(corpus_dir, category)
    for filename in os.listdir(category_dir):
        if filename.startswith('.'):
            continue
        filepath = os.path.join(category_dir, filename)
        with codecs.open(filepath, 'r', encoding='utf-8') as f:
            content = f.read().strip()
        corpus.append(seg_and_remove_stopwords(content, stopwords))
        labels.append(1 if category == 'pos' else 0)

# --- Shuffle before splitting -----------------------------------------------
# Bug fixed: the files are read class by class (all neg, then all pos), so
# without shuffling the last 20% used as the test set contains essentially
# one class, making precision/recall meaningless. A fixed seed keeps the
# split reproducible.
random.seed(42)
paired = list(zip(corpus, labels))
random.shuffle(paired)
if paired:
    corpus, labels = (list(seq) for seq in zip(*paired))

# --- 80/20 train/test split --------------------------------------------------
train_size = int(len(corpus) * 0.8)
train_corpus, test_corpus = corpus[:train_size], corpus[train_size:]
train_labels, test_labels = labels[:train_size], labels[train_size:]

# --- Train a bag-of-words naive-Bayes classifier -----------------------------
# (unused TfidfVectorizer import removed)
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

clf = Pipeline([
    ('vect', CountVectorizer()),
    ('clf', MultinomialNB(alpha=0.01)),
])
clf.fit(train_corpus, train_labels)

# --- Evaluate on the held-out test set ---------------------------------------
predicted = clf.predict(test_corpus)
precision, recall, f1_score = evaluate(test_labels, predicted)
print('precision:', precision)
print('recall:', recall)
print('F1 score:', f1_score)
上述代码中,load_stopwords函数用于加载停用词表,seg_and_remove_stopwords函数用于对文本进行分词并去除停用词,evaluate函数用于计算文本的准确率、召回率和F1值。在主程序中,首先加载停用词表,然后读取数据集并进行分词去停用词处理。接着,将数据集划分为训练集和测试集,使用sklearn库中的Pipeline和MultinomialNB类训练朴素贝叶斯分类器模型。最后,在测试集上进行预测并计算评价指标
原文地址: https://www.cveoy.top/t/topic/fmAj 著作权归作者所有。请勿转载和采集!