Here is the Python implementation:

import os
import jieba
import codecs

# Load the stopword list
def load_stopwords(filepath):
    stopwords = []
    with codecs.open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            stopwords.append(line.strip())
    return stopwords

# Tokenize a sentence with jieba and remove stopwords
def seg_and_remove_stopwords(sentence, stopwords):
    seg_list = jieba.cut(sentence)
    seg_list = [word for word in seg_list if word not in stopwords]
    return ' '.join(seg_list)

# Compute precision, recall and F1 score for the positive class
def evaluate(actual, predicted):
    tp, fp, fn = 0, 0, 0
    for i in range(len(actual)):
        if actual[i] == 1 and predicted[i] == 1:
            tp += 1   # true positive
        elif actual[i] == 1 and predicted[i] == 0:
            fn += 1   # false negative
        elif actual[i] == 0 and predicted[i] == 1:
            fp += 1   # false positive
    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    f1_score = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    return precision, recall, f1_score

# Load the stopword list
stopwords = load_stopwords('stopwords.txt')

# Read the dataset
corpus_dir = 'E:/Learning/大三下/自然语言处理/corpus'
corpus = []
labels = []
for category in os.listdir(corpus_dir):
    if category.startswith('.'):
        continue
    category_dir = os.path.join(corpus_dir, category)
    for filename in os.listdir(category_dir):
        if filename.startswith('.'):
            continue
        filepath = os.path.join(category_dir, filename)
        with codecs.open(filepath, 'r', encoding='utf-8') as f:
            content = f.read().strip()
            corpus.append(seg_and_remove_stopwords(content, stopwords))
            labels.append(1 if category == 'pos' else 0)

# Split into training and test sets (80/20). The files were read one category
# at a time, so a stratified, shuffled split is used instead of a plain slice,
# which would otherwise put one class almost entirely in the test set.
from sklearn.model_selection import train_test_split
train_corpus, test_corpus, train_labels, test_labels = train_test_split(
    corpus, labels, test_size=0.2, random_state=42, stratify=labels)

# Train the model
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline

clf = Pipeline([
    ('vect', CountVectorizer()),
    ('clf', MultinomialNB(alpha=0.01)),
])
clf.fit(train_corpus, train_labels)

# Predict on the test set
predicted = clf.predict(test_corpus)

# Compute the evaluation metrics
precision, recall, f1_score = evaluate(test_labels, predicted)
print('precision:', precision)
print('recall:', recall)
print('F1 score:', f1_score)
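As an optional sanity check (not part of the original script), the hand-rolled evaluate function can be compared against sklearn's built-in metrics on the same predictions:

from sklearn.metrics import precision_recall_fscore_support

# Cross-check the hand-rolled metrics with sklearn's implementation
p, r, f1, _ = precision_recall_fscore_support(
    test_labels, predicted, average='binary', pos_label=1)
print('sklearn precision/recall/F1:', p, r, f1)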

In the code above, load_stopwords loads the stopword list, seg_and_remove_stopwords tokenizes a piece of text with jieba and removes stopwords, and evaluate computes precision, recall and F1 score. The main script first loads the stopword list, then reads the dataset and applies tokenization and stopword removal. The data is then split into training and test sets, and a naive Bayes classifier is trained using sklearn's Pipeline and MultinomialNB classes. Finally, predictions are made on the test set and the evaluation metrics are computed.
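As a possible variation, not required by the task, the CountVectorizer in the pipeline could be swapped for the TfidfVectorizer that is already imported, so that tokens are weighted by TF-IDF instead of raw counts. A minimal sketch, assuming the variables from the script above are in scope:

# Hypothetical TF-IDF variant of the same pipeline
tfidf_clf = Pipeline([
    ('vect', TfidfVectorizer()),
    ('clf', MultinomialNB(alpha=0.01)),
])
tfidf_clf.fit(train_corpus, train_labels)
print(evaluate(test_labels, tfidf_clf.predict(test_corpus)))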

Task restated: the stopword list is the HIT (Harbin Institute of Technology) stopword list, among others. The given dataset is located at E:/Learning/大三下/自然语言处理/corpus and contains two categories of data; its directory structure has two subdirectories, each holding a number of text files:

corpus/
    neg/  1.txt, 2.txt, ...
    pos/  1.txt, 2.txt, ...

The results are analyzed with the usual evaluation metrics, namely precision, recall and F1 score, and everything is written in Python.
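Once trained, the same pipeline can also classify a new, unseen review. The sample sentence below is made up for illustration and the snippet assumes clf and stopwords from the script above are in scope:

# Classify a new review with the trained pipeline (illustrative example)
new_review = '这部电影情节紧凑,非常好看'  # made-up sample text
processed = seg_and_remove_stopwords(new_review, stopwords)
print('pos' if clf.predict([processed])[0] == 1 else 'neg')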

