#!/usr/bin/env python

# coding: utf-8

import pathlib

# In[1]:

import re

import nltk

import joblib
from sklearn.metrics import accuracy_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Matplotlib 3.6 renamed the bundled seaborn style sheets to "seaborn-v0_8*"
# and later releases dropped the old names, so pick whichever name this
# installation actually provides instead of hard-coding 'seaborn'.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
# Use a font with CJK glyphs so the Chinese labels render, and keep the ASCII
# minus sign because such fonts often lack the Unicode minus glyph.
plt.rcParams['font.sans-serif'] = ['Microsoft JhengHei']
plt.rcParams['axes.unicode_minus'] = False

# Fetch the NLTK stop-word corpus that stopwords.words('english') reads.
nltk.download('stopwords')

# In[2]:

import pandas as pd

# Every pipeline stage exchanges its results through files in ./cache,
# so make sure the directory exists as soon as the module is imported.
pathlib.Path('cache').mkdir(exist_ok=True)


def load_data(filename='news.csv'):
    """Read the news CSV and cache it for the later pipeline stages.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        The DataFrame parsed from *filename*; the same frame is also
        pickled to ``cache/df.pkl``.
    """
    frame = pd.read_csv(filename)
    joblib.dump(frame, 'cache/df.pkl')
    return frame

from nltk.corpus import stopwords

from nltk.stem.porter import PorterStemmer

# Porter stemmer instance shared by every get_stemmed_text() call.
stemmer = PorterStemmer()

# English stop-word list read from the NLTK 'stopwords' corpus.
stop_words = stopwords.words('english')

# NOTE(review): this rebinding immediately discards the NLTK list loaded
# above, so get_stemmed_text() filters nothing and the WordCloud in
# data_visualization() receives an empty stop-word list. It may be a
# deliberate switch or a leftover that was meant to stay commented out --
# TODO confirm which behaviour is intended.
stop_words = []

def get_stemmed_text(corpus):
    """Normalise a piece of text into a space-separated string of stems.

    Every character that is not an ASCII letter is replaced by a space,
    the result is lower-cased and split on whitespace. Words contained in
    the module-level ``stop_words`` are dropped; the remaining words are
    stemmed with the module-level ``stemmer``.

    Args:
        corpus: The raw text to normalise.

    Returns:
        The stems joined by single spaces.
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', corpus)
    stems = []
    for token in letters_only.lower().split():
        if token in stop_words:
            continue
        stems.append(stemmer.stem(token))
    return ' '.join(stems)

def data_process():
    """Merge title into text, stem the result and refresh the cached frame.

    Loads the frame from ``cache/df.pkl``, appends each row's ``title`` to
    its ``text``, runs get_stemmed_text() over the merged column and writes
    the frame back to ``cache/df.pkl``.

    Yields:
        Progress messages (str) and, once processing has finished, the
        processed DataFrame itself.
    """
    df = joblib.load('cache/df.pkl')
    yield '开始数据处理'
    # A missing text or title would turn the whole concatenation into NaN,
    # and get_stemmed_text() would then fail with a TypeError inside
    # re.sub(); treat missing fields as empty strings instead.
    text = df['text'].fillna('').astype(str)
    title = df['title'].fillna('').astype(str)
    df['text'] = text + ' ' + title
    yield '数据合并完成'
    yield '开始去除停用词, 耗时较长...'
    df['text'] = df['text'].apply(get_stemmed_text)
    # Persist before handing the frame to the consumer, so the cached copy
    # does not depend on what the consumer does with the yielded object or
    # on whether it keeps iterating.
    joblib.dump(df, 'cache/df.pkl')
    yield df
    yield '数据处理完成'

def data_visualization():
    """Render the label distribution and a word cloud of the cached text.

    Reads the frame from ``cache/df.pkl`` and writes ``cache/label.png``
    (bar chart of ``label`` value counts) and ``cache/wordcloud.png``.

    Yields:
        The absolute path of each image once it has been written, followed
        by a final progress message (str).
    """
    df = joblib.load('cache/df.pkl')

    # Draw on a dedicated figure and close it afterwards; otherwise the bar
    # chart lands on (and stays in) pyplot's global current figure and bleeds
    # into whatever is plotted next in the same process.
    fig, ax = plt.subplots()
    df['label'].value_counts().plot(kind='bar', ax=ax)
    fig.savefig('cache/label.png')
    plt.close(fig)
    yield pathlib.Path('cache/label.png').absolute()

    # Word cloud
    from wordcloud import WordCloud

    # str(df['text']) is only the truncated Series repr (index numbers, "...",
    # "Name: text, dtype: object"), not the articles; join the real column
    # values so the cloud reflects the whole corpus.
    corpus = ' '.join(df['text'].astype(str))
    wordcloud = WordCloud(background_color='white', stopwords=stop_words, max_words=100, max_font_size=50,
                          random_state=42).generate(corpus)

    wordcloud.to_file('cache/wordcloud.png')
    yield pathlib.Path('cache/wordcloud.png').absolute()
    yield '数据可视化完成'

# Dataset split

from sklearn.model_selection import train_test_split

def train(size=0.2):
    """Train a bag-of-words logistic-regression classifier on the cached data.

    Loads ``cache/df.pkl``, splits ``text``/``label`` into train and test
    parts, fits a CountVectorizer and a LogisticRegression on the training
    part, and writes ``cache/model.pkl``, ``cache/count_test.pkl`` and
    ``cache/y_test.pkl`` for predict() and evaluate().

    Args:
        size: Value passed to ``train_test_split`` as ``test_size``.

    Yields:
        Progress messages (str).
    """
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.linear_model import LogisticRegression

    df = joblib.load('cache/df.pkl')
    yield '开始划分数据集'
    x_train, x_test, y_train, y_test = train_test_split(
        df['text'], df['label'], test_size=size, random_state=42)

    # Bag-of-words features.
    # The TF-IDF features that used to be computed next to these were never
    # passed to the model or saved, so that second vectorisation pass (and
    # its max_df pruning, which can raise on small corpora) has been dropped.
    yield '开始构建词袋模型'
    count_vectorizer = CountVectorizer(stop_words='english')
    count_train = count_vectorizer.fit_transform(x_train)
    count_test = count_vectorizer.transform(x_test)

    # Train the model
    yield '开始训练模型'
    model = LogisticRegression(max_iter=1000)
    model.fit(count_train, y_train)
    yield '模型训练完成'
    joblib.dump(model, 'cache/model.pkl')
    joblib.dump(count_test, 'cache/count_test.pkl')
    joblib.dump(y_test, 'cache/y_test.pkl')
    yield '模型保存完成'
    yield '训练完成'

def predict():
    """Run the cached model on the cached test features and store the result.

    Reads ``cache/model.pkl`` and ``cache/count_test.pkl`` and writes the
    predictions to ``cache/pred.pkl``.

    Yields:
        Progress messages (str).
    """
    model = joblib.load('cache/model.pkl')
    count_test = joblib.load('cache/count_test.pkl')

    yield '开始预测'
    pred = model.predict(count_test)
    # Save before announcing completion: code placed after the last yield
    # only runs if the consumer resumes the generator once more, so the
    # predictions could previously be reported as done but never written.
    joblib.dump(pred, 'cache/pred.pkl')
    yield '预测完成'

# In[46]:

def evaluate():
    """Report accuracy and save a confusion-matrix heatmap for the predictions.

    Reads ``cache/y_test.pkl`` and ``cache/pred.pkl``, prints the accuracy
    and writes the heatmap to ``cache/confusion_matrix.png``.

    Yields:
        The accuracy message (str), then the absolute path of the saved
        heatmap image.
    """
    y_test = joblib.load('cache/y_test.pkl')
    pred = joblib.load('cache/pred.pkl')

    accuracy = accuracy_score(y_test, pred)
    print('Accuracy: {}'.format(accuracy))
    yield '准确率: {}'.format(accuracy)

    # Confusion matrix
    # NOTE(review): labels=[0, 1] together with the tick labels assumes the
    # label column is encoded as 0 = fake, 1 = real -- TODO confirm against
    # the dataset.
    value = confusion_matrix(y_test, pred, labels=[0, 1])
    # Draw on a dedicated figure: without an explicit axes the heatmap goes
    # onto pyplot's current axes and overlays any plot still open from an
    # earlier stage. Close the figure once it has been saved.
    fig, ax = plt.subplots()
    sns.heatmap(value, annot=True, fmt='d', xticklabels=['Fake', 'Real'], yticklabels=['Fake', 'Real'], ax=ax)
    fig.savefig('cache/confusion_matrix.png')
    plt.close(fig)
    yield pathlib.Path('cache/confusion_matrix.png').absolute()


# Disabled notebook cell: predict whether a single test article is real or fake.
# print('Predicted: {}'.format(model.predict(count_test[1])))

# Fake-news detection: model training and evaluation