Implementing Credit Card Fraud Detection by Hand in Python: AdaBoost, Random Forest, and Ensemble Learning
This tutorial walks you through implementing AdaBoost, random forest, and other ensemble learning algorithms by hand in Python and applying them to credit card fraud detection. We build the ensemble machinery itself from scratch, including data loading, train/test splitting, sample weighting, and vote aggregation, relying on scikit-learn only for the base learners and a few utilities such as clone and accuracy_score.
1. Downloading the Dataset
First, download the anonymized credit card transaction dataset, in which each transaction is labeled as fraudulent or genuine. It is available on Kaggle: https://www.kaggle.com/code/pierra/credit-card-dataset-svm-classification/input. The file contains a Time column, 28 PCA-transformed features V1 through V28, an Amount column, and a binary Class label (1 = fraud).
2. Data Preprocessing
2.1 Loading the Data
We use Python's built-in csv module to read the dataset into a feature matrix X and a label vector y.
import csv
import random

# Read the dataset into a feature matrix X and a label vector y
def load_dataset(filename):
    X = []
    y = []
    with open(filename, 'r') as csvfile:
        reader = csv.reader(csvfile)
        next(reader)  # skip the header row
        for row in reader:
            X.append([float(x) for x in row[:-1]])
            y.append(int(row[-1]))
    return X, y

# Load the dataset
X, y = load_dataset('creditcard.csv')
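Credit card fraud data is extremely imbalanced: in this Kaggle dataset, well under 1% of transactions are fraudulent, which matters for everything that follows. A quick sanity check, assuming the 0/1 labels loaded above:

# Count how many transactions fall into each class
n_fraud = sum(y)
print('Total transactions:', len(y))
print('Fraudulent:', n_fraud, '({:.3%})'.format(n_fraud / len(y)))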
2.2 Splitting the Data
We randomly split the dataset into a training set and a test set to evaluate how well the models generalize, using 80% of the data for training and the remaining 20% for testing.
# Split the data into training and test sets
def split_dataset(X, y, test_size=0.2):
    n = len(X)
    test_indices = random.sample(range(n), int(n * test_size))
    train_indices = list(set(range(n)) - set(test_indices))
    X_train = [X[i] for i in train_indices]
    y_train = [y[i] for i in train_indices]
    X_test = [X[i] for i in test_indices]
    y_test = [y[i] for i in test_indices]
    return X_train, y_train, X_test, y_test

# Split the dataset
X_train, y_train, X_test, y_test = split_dataset(X, y, test_size=0.2)
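With so few positive examples, a plain random split can leave the test set with almost no fraud cases. A minimal stratified variant is sketched below as an optional drop-in replacement for split_dataset; the per-class index partitioning is my addition, not part of the original tutorial:

# Stratified split: sample the test fraction separately within each class
def stratified_split(X, y, test_size=0.2):
    test_indices = []
    for label in set(y):
        label_indices = [i for i in range(len(y)) if y[i] == label]
        test_indices += random.sample(label_indices, int(len(label_indices) * test_size))
    test_set = set(test_indices)
    train_indices = [i for i in range(len(X)) if i not in test_set]
    X_train = [X[i] for i in train_indices]
    y_train = [y[i] for i in train_indices]
    X_test = [X[i] for i in test_indices]
    y_test = [y[i] for i in test_indices]
    return X_train, y_train, X_test, y_test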
3. Model Training and Evaluation
3.1 The AdaBoost Algorithm
AdaBoost is an ensemble learning algorithm that combines many weak learners into a single strong learner. In each boosting round, AdaBoost reweights the training samples based on the previous round's results: misclassified samples get higher weights and correctly classified samples get lower weights. The weak learners' outputs are then combined by weighted voting.
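Concretely, with labels encoded as $y_j \in \{-1, +1\}$, round $t$ measures the weighted error $\epsilon_t$ of the weak learner $h_t$, assigns it a voting weight $\alpha_t$, and reweights the samples:

$$\epsilon_t = \sum_{j:\, h_t(x_j) \ne y_j} w_{t,j}, \qquad \alpha_t = \frac{1}{2}\ln\frac{1-\epsilon_t}{\epsilon_t}, \qquad w_{t+1,j} = \frac{w_{t,j}\, e^{-\alpha_t y_j h_t(x_j)}}{Z_t},$$

where $Z_t$ normalizes the new weights to sum to 1. The final classifier is $H(x) = \operatorname{sign}\big(\sum_t \alpha_t h_t(x)\big)$. These standard formulas are mirrored line for line in the implementation below.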
import math
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# AdaBoost with decision stumps as weak learners
class AdaBoost:
    def __init__(self, n_estimators=50):
        self.n_estimators = n_estimators
        self.estimators = []
        self.weights = []

    def fit(self, X, y):
        n = len(X)
        # the exponential weight update assumes labels in {-1, +1},
        # so convert the dataset's 0/1 labels internally
        y_signed = [1 if label == 1 else -1 for label in y]
        weights = [1.0 / n] * n
        for i in range(self.n_estimators):
            estimator = DecisionTreeClassifier(max_depth=1)
            estimator.fit(X, y, sample_weight=weights)
            y_pred = estimator.predict(X)
            pred_signed = [1 if p == 1 else -1 for p in y_pred]
            error = sum(weights[j] for j in range(n) if y_pred[j] != y[j])
            error = min(max(error, 1e-10), 1 - 1e-10)  # guard against log(0) and division by zero
            alpha = 0.5 * math.log((1 - error) / error)
            self.estimators.append(estimator)
            self.weights.append(alpha)
            weights = [weights[j] * math.exp(-alpha * y_signed[j] * pred_signed[j]) for j in range(n)]
            weights_sum = sum(weights)
            weights = [w / weights_sum for w in weights]

    def predict(self, X):
        n = len(X)
        score = [0.0] * n
        for i in range(self.n_estimators):
            y_pred_i = self.estimators[i].predict(X)
            for j in range(n):
                # convert each stump's 0/1 prediction to -1/+1 before weighting
                score[j] += self.weights[i] * (1 if y_pred_i[j] == 1 else -1)
        # map the sign of the weighted vote back to 0/1 labels
        return [1 if s >= 0 else 0 for s in score]
# Train the AdaBoost model on the training set
ada = AdaBoost()
ada.fit(X_train, y_train)

# Evaluate the AdaBoost model on the test set
y_pred = ada.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print('AdaBoost accuracy:', accuracy)
3.2 The Random Forest Algorithm
Random forest is an ensemble learning algorithm that combines many decision trees into a strong learner. Each tree is trained on a random subset of the features, which reduces the correlation between the trees. The classic algorithm also gives each tree a bootstrap sample of the rows; the minimal version below randomizes only the features, and a row-bootstrapping variant is sketched after it.
# Random forest: decision trees trained on random feature subsets
class RandomForest:
    def __init__(self, n_estimators=50, max_depth=None, max_features=0.5):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.max_features = max_features  # fraction of features given to each tree
        self.estimators = []
        self.feature_indices = []  # remember which features each tree saw

    def fit(self, X, y):
        n_features = len(X[0])
        k = max(1, int(self.max_features * n_features))
        for i in range(self.n_estimators):
            estimator = DecisionTreeClassifier(max_depth=self.max_depth)
            # draw a fresh feature subset for this tree and store it, so that
            # prediction can project test rows onto the same features
            indices = random.sample(range(n_features), k)
            self.feature_indices.append(indices)
            X_i = [[x[j] for j in indices] for x in X]
            estimator.fit(X_i, y)
            self.estimators.append(estimator)

    def predict(self, X):
        votes = [0.0] * len(X)
        for estimator, indices in zip(self.estimators, self.feature_indices):
            X_i = [[x[j] for j in indices] for x in X]
            y_pred_i = estimator.predict(X_i)
            votes = [votes[j] + y_pred_i[j] for j in range(len(X))]
        # majority vote over the trees' 0/1 predictions
        return [1 if v >= 0.5 * self.n_estimators else 0 for v in votes]
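As noted above, classic random forests also train each tree on a bootstrap sample of the rows, not just a feature subset. A minimal variant of fit with row bootstrapping; the use of random.choices for sampling with replacement is my addition, not part of the original code:

    def fit(self, X, y):
        n, n_features = len(X), len(X[0])
        k = max(1, int(self.max_features * n_features))
        for i in range(self.n_estimators):
            estimator = DecisionTreeClassifier(max_depth=self.max_depth)
            indices = random.sample(range(n_features), k)
            self.feature_indices.append(indices)
            # bootstrap: sample n rows with replacement
            rows = random.choices(range(n), k=n)
            X_i = [[X[r][j] for j in indices] for r in rows]
            y_i = [y[r] for r in rows]
            estimator.fit(X_i, y_i)
            self.estimators.append(estimator)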
# Train the random forest model on the training set
rf = RandomForest()
rf.fit(X_train, y_train)

# Evaluate the random forest model on the test set
y_pred = rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print('Random Forest accuracy:', accuracy)
3.3 Ensemble Learning Algorithms
Ensemble learning builds a strong learner by combining several weaker ones; common strategies include bagging, boosting, and stacking. Here we implement bagging and boosting by hand (the Boosting class below is essentially the AdaBoost class generalized to any clone-able base estimator) and also combine heterogeneous models with scikit-learn's VotingClassifier.
from sklearn.base import clone

# Bagging: train each estimator on a bootstrap resample of the training set
class Bagging:
    def __init__(self, base_estimator=DecisionTreeClassifier(), n_estimators=50):
        self.base_estimator = base_estimator
        self.n_estimators = n_estimators
        self.estimators = []

    def fit(self, X, y):
        n = len(X)
        for i in range(self.n_estimators):
            # bootstrap: sample n rows with replacement
            # (sampling without replacement would just permute the data)
            indices = random.choices(range(n), k=n)
            X_i = [X[j] for j in indices]
            y_i = [y[j] for j in indices]
            estimator = clone(self.base_estimator)
            estimator.fit(X_i, y_i)
            self.estimators.append(estimator)

    def predict(self, X):
        votes = [0.0] * len(X)
        for estimator in self.estimators:
            y_pred_i = estimator.predict(X)
            votes = [votes[j] + y_pred_i[j] for j in range(len(X))]
        # majority vote over the estimators' 0/1 predictions
        return [1 if v >= 0.5 * self.n_estimators else 0 for v in votes]
# Boosting: AdaBoost generalized to an arbitrary clone-able base estimator
class Boosting:
    def __init__(self, base_estimator=DecisionTreeClassifier(max_depth=1), n_estimators=50):
        self.base_estimator = base_estimator
        self.n_estimators = n_estimators
        self.estimators = []
        self.weights = []

    def fit(self, X, y):
        n = len(X)
        # the exponential weight update assumes labels in {-1, +1}
        y_signed = [1 if label == 1 else -1 for label in y]
        weights = [1.0 / n] * n
        for i in range(self.n_estimators):
            estimator = clone(self.base_estimator)
            estimator.fit(X, y, sample_weight=weights)
            y_pred = estimator.predict(X)
            pred_signed = [1 if p == 1 else -1 for p in y_pred]
            error = sum(weights[j] for j in range(n) if y_pred[j] != y[j])
            error = min(max(error, 1e-10), 1 - 1e-10)  # guard against log(0) and division by zero
            alpha = 0.5 * math.log((1 - error) / error)
            self.estimators.append(estimator)
            self.weights.append(alpha)
            weights = [weights[j] * math.exp(-alpha * y_signed[j] * pred_signed[j]) for j in range(n)]
            weights_sum = sum(weights)
            weights = [w / weights_sum for w in weights]

    def predict(self, X):
        n = len(X)
        score = [0.0] * n
        for i in range(self.n_estimators):
            y_pred_i = self.estimators[i].predict(X)
            for j in range(n):
                # convert 0/1 predictions to -1/+1 before weighting
                score[j] += self.weights[i] * (1 if y_pred_i[j] == 1 else -1)
        return [1 if s >= 0 else 0 for s in score]
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier

# Build the heterogeneous model pool
# (VotingClassifier expects a list of (name, estimator) pairs, and soft
# voting needs predict_proba, hence probability=True on the SVMs)
models = [
    ('tree', DecisionTreeClassifier()),
    ('svc_linear', SVC(kernel='linear', probability=True)),
    ('svc_rbf', SVC(kernel='rbf', probability=True)),
    ('knn1', KNeighborsClassifier(1)),
    ('knn3', KNeighborsClassifier(3)),
]
bagging = Bagging(base_estimator=DecisionTreeClassifier())
boosting = Boosting(base_estimator=DecisionTreeClassifier(max_depth=1))  # stumps keep the base learner weak

# Hard voting: each model casts one vote, the majority wins
ensemble = VotingClassifier(models, voting='hard')
ensemble.fit(X_train, y_train)
y_pred = ensemble.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print('Ensemble (majority voting) accuracy:', accuracy)

# Soft voting: average the models' predicted class probabilities
ensemble = VotingClassifier(models, voting='soft')
ensemble.fit(X_train, y_train)
y_pred = ensemble.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print('Ensemble (soft voting) accuracy:', accuracy)
# Bagging
bagging.fit(X_train, y_train)
y_pred = bagging.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print('Bagging accuracy:', accuracy)

# Boosting
boosting.fit(X_train, y_train)
y_pred = boosting.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print('Boosting accuracy:', accuracy)
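One caveat on evaluation: with fraud making up a tiny fraction of the data, a model that predicts "genuine" for everything already scores above 99% accuracy, so accuracy alone says little here. A minimal hand-rolled precision/recall/F1 helper, my addition, assuming 0/1 labels with 1 = fraud:

# Precision, recall, and F1 for the positive (fraud) class
def fraud_metrics(y_true, y_pred):
    tp = sum(1 for t, p in zip(y_true, y_pred) if t == 1 and p == 1)
    fp = sum(1 for t, p in zip(y_true, y_pred) if t == 0 and p == 1)
    fn = sum(1 for t, p in zip(y_true, y_pred) if t == 1 and p == 0)
    precision = tp / (tp + fp) if tp + fp > 0 else 0.0
    recall = tp / (tp + fn) if tp + fn > 0 else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0.0
    return precision, recall, f1

precision, recall, f1 = fraud_metrics(y_test, y_pred)
print('Boosting precision: {:.3f}, recall: {:.3f}, F1: {:.3f}'.format(precision, recall, f1))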
Summary
This tutorial showed how to implement AdaBoost, random forest, bagging, and boosting by hand in Python, along with loading, preprocessing, splitting, and evaluating the dataset. You can tune the model parameters and the ensemble strategy as needed to improve performance.