Clickbait Detection: Combining Datasets, Training a BOW Classifier, and Hyperparameter Tuning
import pandas as pd import numpy as np from sklearn.model_selection import train_test_split from sklearn.feature_extraction.text import CountVectorizer from sklearn.naive_bayes import MultinomialNB from sklearn.metrics import precision_score, recall_score, f1_score from sklearn.model_selection import ParameterGrid from sklearn.pipeline import Pipeline import random
# Read the two datasets.
# The source files are raw headline lists — one headline per line, no header
# row and no label column (downstream code indexes 'text' and 'label', which
# the default read_csv call could never produce from these files). Read each
# file as a single 'text' column; tab separator keeps comma-containing
# headlines intact (assumes headlines contain no tabs — TODO confirm).
positive_examples = pd.read_csv(
    'https://raw.githubusercontent.com/pfrcks/clickbait-detection/master/clickbait',
    header=None, names=['text'], sep='\t',
)
negative_examples = pd.read_csv(
    'https://raw.githubusercontent.com/pfrcks/clickbait-detection/master/not-clickbait',
    header=None, names=['text'], sep='\t',
)

# Attach binary labels: 1 = clickbait, 0 = not clickbait.
positive_examples['label'] = 1
negative_examples['label'] = 0

# Merge the two labelled datasets into a single frame with a clean index.
combined_dataset = pd.concat([positive_examples, negative_examples], ignore_index=True)
# Shuffle the dataset.
# BUG FIX: random.seed(42) does not seed pandas' sampler (pandas uses
# numpy's RNG), so the original shuffle was not reproducible. Pass
# random_state directly to DataFrame.sample instead.
combined_dataset = combined_dataset.sample(frac=1, random_state=42).reset_index(drop=True)
# Split into train, validation, and test sets: 80/20 train/test first,
# then carve 10% of the remaining training portion off as validation.
features = combined_dataset['text']
targets = combined_dataset['label']
train_data, test_data, train_labels, test_labels = train_test_split(
    features, targets, test_size=0.2, random_state=42
)
train_data, val_data, train_labels, val_labels = train_test_split(
    train_data, train_labels, test_size=0.1, random_state=42
)
# Calculate target rates (fraction of positive/clickbait examples) per split.
def _positive_rate(labels):
    """Return the fraction of positive (label == 1) examples in *labels*."""
    return sum(labels) / len(labels)

target_rate_train = _positive_rate(train_labels)
target_rate_val = _positive_rate(val_labels)
target_rate_test = _positive_rate(test_labels)

print('Target rate in train dataset:', target_rate_train)
print('Target rate in validation dataset:', target_rate_val)
print('Target rate in test dataset:', target_rate_test)
# Bag-of-words Naive Bayes model: unigram+bigram token counts feeding a
# multinomial Naive Bayes classifier, wired as a Pipeline so both stages'
# hyperparameters can be tuned through a single set_params interface.
pipeline = Pipeline(
    steps=[
        ('vectorizer', CountVectorizer(ngram_range=(1, 2))),
        ('classifier', MultinomialNB()),
    ]
)
# Fit the BOW pipeline on the training split.
pipeline.fit(train_data, train_labels)

# Generate predictions for the training and validation splits.
train_predictions, val_predictions = (
    pipeline.predict(split) for split in (train_data, val_data)
)
# Compute precision, recall, and F1-score for both splits and report them.
train_precision, train_recall, train_f1_score = (
    metric(train_labels, train_predictions)
    for metric in (precision_score, recall_score, f1_score)
)
val_precision, val_recall, val_f1_score = (
    metric(val_labels, val_predictions)
    for metric in (precision_score, recall_score, f1_score)
)

print('Training set - Precision:', train_precision)
print('Training set - Recall:', train_recall)
print('Training set - F1-score:', train_f1_score)

print('Validation set - Precision:', val_precision)
print('Validation set - Recall:', val_recall)
print('Validation set - F1-score:', val_f1_score)
# Hyperparameter tuning: exhaustive search over vectorizer and classifier
# settings, selecting the combination with the best validation F1-score.
param_grid = {
    'vectorizer__max_df': [0.5, 0.75, 1.0],
    'classifier__alpha': [0.1, 0.5, 1.0],
    'vectorizer__ngram_range': [(1, 1), (1, 2)],
}

best_precision = 0.0
best_recall = 0.0
best_f1_score = 0.0
best_params = {}

for params in ParameterGrid(param_grid):
    pipeline.set_params(**params)
    pipeline.fit(train_data, train_labels)
    val_predictions = pipeline.predict(val_data)
    precision = precision_score(val_labels, val_predictions)
    recall = recall_score(val_labels, val_predictions)
    f1 = f1_score(val_labels, val_predictions)
    # Track the best combination by validation F1.
    if f1 > best_f1_score:
        best_precision = precision
        best_recall = recall
        best_f1_score = f1
        best_params = params

# BUG FIX: after the loop, `pipeline` is configured and fitted with the
# *last* grid combination, not the best one, so any later use of it would
# silently mismatch the reported scores. Re-configure and refit with the
# winning parameters.
pipeline.set_params(**best_params)
pipeline.fit(train_data, train_labels)

print('Best parameters:', best_params)
print('Precision on validation set:', best_precision)
print('Recall on validation set:', best_recall)
print('F1-score on validation set:', best_f1_score)
Original source: https://www.cveoy.top/t/topic/8pN — copyright belongs to the author. Do not reproduce or scrape!