# Fake-news detection: model training and evaluation
#!/usr/bin/env python
# coding: utf-8
import pathlib
# In[1]:
import re
import nltk
import joblib
from sklearn.metrics import accuracy_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*'
# and later releases dropped the bare 'seaborn' name, so try the new name
# first and fall back to the old one on older installations.
try:
    plt.style.use('seaborn-v0_8')
except OSError:
    plt.style.use('seaborn')
# Presumably chosen so Chinese text in figures renders; verify the font is
# installed on the target machine.
plt.rcParams['font.sans-serif'] = ['Microsoft JhengHei']
# Draw the minus sign as an ASCII hyphen instead of the Unicode minus glyph.
plt.rcParams['axes.unicode_minus'] = False

# Download the NLTK stop-word corpus read by stopwords.words('english').
nltk.download('stopwords')
# In[2]:
import pandas as pd
# Every pipeline stage reads and writes its artefacts under ./cache.
pathlib.Path('cache').mkdir(exist_ok=True)


def load_data(filename='news.csv'):
    """Read a CSV file, cache the resulting DataFrame, and return it.

    Args:
        filename: Path of the CSV file handed to ``pandas.read_csv``.

    Returns:
        The loaded DataFrame. The same object is also written to
        ``cache/df.pkl`` with joblib.
    """
    frame = pd.read_csv(filename)
    joblib.dump(frame, 'cache/df.pkl')
    return frame
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
# Keep the NLTK English stop words. The list used to be overwritten with an
# empty one right after being fetched, which silently disabled the stop-word
# filtering in get_stemmed_text and in the word cloud.
# A frozenset makes the per-token membership test O(1).
stop_words = frozenset(stopwords.words('english'))
def get_stemmed_text(corpus):
    """Reduce *corpus* to lowercase, stemmed, letters-only tokens.

    Every character outside ``a-z``/``A-Z`` is replaced by a space, the
    result is lowercased and split on whitespace. Tokens present in the
    module-level ``stop_words`` are dropped; the rest are passed through
    ``stemmer.stem``.

    Args:
        corpus: The text to normalise.

    Returns:
        The surviving stems joined by single spaces.
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', corpus)
    stems = []
    for token in letters_only.lower().split():
        if token in stop_words:
            continue
        stems.append(stemmer.stem(token))
    return ' '.join(stems)
def data_process():
    """Merge title into text, stem the result, and re-cache the DataFrame.

    Loads the DataFrame from ``cache/df.pkl``, replaces its ``text`` column
    with ``text + ' ' + title`` run through ``get_stemmed_text``, and writes
    the frame back to ``cache/df.pkl``.

    Yields:
        Progress messages (str), and the processed DataFrame itself once
        stemming has finished (before it is written back to the cache).
    """
    df = joblib.load('cache/df.pkl')
    yield '开始数据处理'
    # A missing cell is NaN (a float); concatenating it yields NaN, and
    # get_stemmed_text would then fail inside re.sub. Treat missing text or
    # title as an empty string instead.
    df['text'] = df['text'].fillna('') + ' ' + df['title'].fillna('')
    yield '数据合并完成'
    yield '开始去除停用词, 耗时较长...'
    df['text'] = df['text'].apply(get_stemmed_text)
    yield df
    joblib.dump(df, 'cache/df.pkl')
    yield '数据处理完成'
def data_visualization():
    """Render a label-distribution bar chart and a word cloud of the text.

    Loads the DataFrame from ``cache/df.pkl`` and writes
    ``cache/label.png`` and ``cache/wordcloud.png``.

    Yields:
        The absolute path of each image as soon as it has been written,
        followed by a final status message (str).
    """
    df = joblib.load('cache/df.pkl')

    # Draw on a dedicated figure and close it afterwards so the bars do not
    # linger on pyplot's current figure and bleed into later plots.
    fig, ax = plt.subplots()
    df['label'].value_counts().plot(kind='bar', ax=ax)
    fig.savefig('cache/label.png')
    plt.close(fig)
    yield pathlib.Path('cache/label.png').absolute()

    # Word cloud
    from wordcloud import WordCloud
    # str(df['text']) would be the truncated Series repr (a handful of rows
    # plus index numbers and "Name/Length/dtype" metadata), so join the
    # actual documents instead.
    corpus = ' '.join(df['text'].astype(str))
    wordcloud = WordCloud(background_color='white', stopwords=stop_words,
                          max_words=100, max_font_size=50,
                          random_state=42).generate(corpus)
    wordcloud.to_file('cache/wordcloud.png')
    yield pathlib.Path('cache/wordcloud.png').absolute()
    yield '数据可视化完成'
# Dataset split
from sklearn.model_selection import train_test_split
def train(size=0.2):
    """Train a bag-of-words logistic-regression classifier on the cached data.

    Loads the DataFrame from ``cache/df.pkl``, splits its ``text`` and
    ``label`` columns into train and test partitions, fits a
    ``CountVectorizer`` on the training text and a ``LogisticRegression`` on
    the resulting counts, then stores under ``cache/``: the model
    (``model.pkl``), the fitted vectorizer (``count_vectorizer.pkl``), the
    vectorised test set (``count_test.pkl``) and the test labels
    (``y_test.pkl``).

    Args:
        size: Forwarded to ``train_test_split`` as ``test_size``.

    Yields:
        Progress messages (str).
    """
    df = joblib.load('cache/df.pkl')
    yield '开始划分数据集'
    x_train, x_test, y_train, y_test = train_test_split(
        df['text'], df['label'], test_size=size, random_state=42)

    # In[38]:
    # Bag-of-words features
    from sklearn.feature_extraction.text import CountVectorizer
    yield '开始构建词袋模型'
    count_vectorizer = CountVectorizer(stop_words='english')
    count_train = count_vectorizer.fit_transform(x_train)
    count_test = count_vectorizer.transform(x_test)
    # NOTE: TF-IDF matrices used to be fitted here as well, but nothing ever
    # read them, so that dead (and costly) computation was dropped.

    # Fit the classifier
    from sklearn.linear_model import LogisticRegression
    yield '开始训练模型'
    model = LogisticRegression(max_iter=1000)
    model.fit(count_train, y_train)
    yield '模型训练完成'

    joblib.dump(model, 'cache/model.pkl')
    # The model only understands vectors produced by this fitted vectorizer,
    # so persist it too; otherwise the model cannot be applied to new text.
    joblib.dump(count_vectorizer, 'cache/count_vectorizer.pkl')
    joblib.dump(count_test, 'cache/count_test.pkl')
    joblib.dump(y_test, 'cache/y_test.pkl')
    yield '模型保存完成'
    yield '训练完成'
def predict():
    """Run the cached model on the cached test matrix and store the output.

    Loads ``cache/model.pkl`` and ``cache/count_test.pkl``, calls
    ``model.predict`` and writes the predictions to ``cache/pred.pkl``.

    Yields:
        Progress messages (str).
    """
    model = joblib.load('cache/model.pkl')
    count_test = joblib.load('cache/count_test.pkl')
    yield '开始预测'
    pred = model.predict(count_test)
    # Save before announcing completion: a consumer that stops iterating once
    # it sees the final message would otherwise never trigger the dump.
    joblib.dump(pred, 'cache/pred.pkl')
    yield '预测完成'
# In[46]:
def evaluate():
    """Report accuracy and save a confusion-matrix heatmap.

    Loads the test labels (``cache/y_test.pkl``) and the predictions
    (``cache/pred.pkl``), prints the accuracy, and writes the heatmap to
    ``cache/confusion_matrix.png``.

    Yields:
        The accuracy message (str), then the absolute path of the saved
        heatmap image.
    """
    y_test = joblib.load('cache/y_test.pkl')
    pred = joblib.load('cache/pred.pkl')
    accuracy = accuracy_score(y_test, pred)
    print('Accuracy: {}'.format(accuracy))
    yield '准确率: {}'.format(accuracy)

    # Confusion matrix
    # NOTE(review): assumes labels are encoded as 0 and 1, with 0 meaning
    # fake and 1 meaning real -- confirm against the dataset.
    value = confusion_matrix(y_test, pred, labels=[0, 1])
    # Use a dedicated figure: drawing on pyplot's implicit current axes would
    # paint the heatmap on top of whatever was plotted earlier in the process.
    fig, ax = plt.subplots()
    sns.heatmap(value, annot=True, fmt='d',
                xticklabels=['Fake', 'Real'], yticklabels=['Fake', 'Real'],
                ax=ax)
    fig.savefig('cache/confusion_matrix.png')
    plt.close(fig)
    yield pathlib.Path('cache/confusion_matrix.png').absolute()

# In[ ]:

# In[51]:
# Predict whether a single test article is real or fake
# print('Predicted: {}'.format(model.predict(count_test[1])))
# Original source: https://www.cveoy.top/t/topic/oDKz -- copyright belongs to the author; do not repost or scrape.