Python实现随机森林分类及可视化

本示例展示如何基于 scikit-learn 的决策树（DecisionTreeClassifier）手动实现随机森林分类算法，并利用 Matplotlib 库将分类结果可视化。

1. 数据准备与模型训练

import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier

# Read the preprocessed dataset from the Excel workbook.
data = pd.read_excel('处理后的数据.xlsx')

# Feature matrix: the nine feature columns, in their original order.
feature_columns = [
    '特征1', '特征2', '特征3',
    '特征4', '特征5', '特征6',
    '特征7', '特征8', '特征9',
]
X = data[feature_columns]

# Target vector: the class label column.
y = data['类别']

class rfc:
    """Minimal random-forest classifier built on scikit-learn decision trees.

    Each tree is fitted on a bootstrap resample of the training data (expressed
    as integer per-sample weights) and considers a random subset of features at
    every split. Predictions average the class probabilities of all trees.
    """

    def __init__(self, n_estimators=100, random_state=0):
        """Store the forest configuration.

        Args:
            n_estimators: Number of decision trees to grow.
            random_state: Seed for the generator that drives the bootstrap
                draws and the per-tree seeds.
        """
        self.n_estimators = n_estimators
        self.random_state = random_state

    def fit(self, X, y):
        """Grow the forest on the training data.

        Args:
            X: Training features; must expose ``shape[0]`` (the sample count)
                and be accepted by ``DecisionTreeClassifier.fit``.
            y: Class labels, one per row of ``X``.

        Returns:
            The fitted estimator itself, so calls can be chained.
        """
        self.y_classes = np.unique(y)
        n_samples = X.shape[0]
        rs = np.random.RandomState(self.random_state)
        max_seed = np.iinfo(np.int32).max

        trees = []
        for _ in range(self.n_estimators):
            # 'sqrt' is what the legacy 'auto' option meant for classifiers;
            # 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3.
            dt = DecisionTreeClassifier(
                random_state=rs.randint(max_seed),
                max_features='sqrt',
            )
            # Bootstrap: draw n_samples indices with replacement and turn the
            # draw counts into sample weights (0 means "left out of this tree").
            weights = np.bincount(
                rs.randint(0, n_samples, n_samples), minlength=n_samples
            )
            dt.fit(X, y, sample_weight=weights)
            trees.append(dt)
        self.trees = trees
        return self

    def predict(self, X):
        """Predict a class label for every row of ``X``.

        Args:
            X: Samples to classify; must expose ``shape[0]`` and have the same
                number of features as the training data.

        Returns:
            Array of labels drawn from the classes seen in ``fit``.
        """
        probas = np.zeros((X.shape[0], len(self.y_classes)))
        # Iterate over the trees actually stored rather than n_estimators, so
        # the average stays correct even if n_estimators was changed after fit.
        for dt in self.trees:
            probas += dt.predict_proba(X)
        probas /= len(self.trees)
        # Map the most probable column index back to its class label.
        return self.y_classes.take(np.argmax(probas, axis=1), axis=0)
# Build the forest with its default configuration (100 trees, seed 0),
# then train it on the full feature matrix and label vector.
rf = rfc(n_estimators=100, random_state=0)
rf.fit(X, y)

2. 分类结果可视化

import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap

# Enable the interactive notebook backend when running under IPython/Jupyter.
# The bare "%matplotlib notebook" magic is not valid Python outside IPython, so
# it is invoked through the shell API and skipped in a plain interpreter.
try:
    get_ipython().run_line_magic('matplotlib', 'notebook')
except NameError:
    pass

plt.rcParams['font.sans-serif'] = ['PingFang HK']  # pick a locally installed font that supports Chinese
fig, ax = plt.subplots()
ax.set_facecolor('#f8f9fa')

# np.asarray is idempotent, so re-running this cell is safe
# (X.to_numpy() would fail once X is already an ndarray).
X = np.asarray(X)
y_values = np.asarray(y)

# The first two features span the plotting plane.
x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
xx, yy = np.meshgrid(np.arange(x_min, x_max, .05), np.arange(y_min, y_max, .05))

# The forest was fitted on every column of X, so it cannot be queried with a
# two-column grid. Build a full-width grid instead: vary the first two features
# over the mesh and hold every remaining feature at its column mean.
grid = np.tile(X.mean(axis=0), (xx.size, 1))
grid[:, 0] = xx.ravel()
grid[:, 1] = yy.ravel()
Z = rf.predict(grid).reshape(xx.shape)

# Two-colour map for the predicted class regions.
clist = ['#ffadad', '#8ecae6']
newcmp = LinearSegmentedColormap.from_list('point_color', clist)
ax.pcolormesh(xx, yy, Z, cmap=newcmp)
ax.set_xlim(xx.min(), xx.max())
ax.set_ylim(yy.min(), yy.max())

# Overlay the samples, split by class label (-1 and 1).
neg_mask = y_values == -1
pos_mask = y_values == 1
x1, y1 = X[neg_mask, 0], X[neg_mask, 1]
x2, y2 = X[pos_mask, 0], X[pos_mask, 1]
p1 = ax.scatter(x1, y1, c='#e63946', marker='o', s=20)
p2 = ax.scatter(x2, y2, c='#457b9d', marker='x', s=20)

ax.set_title('随机森林分类', color='#264653')
ax.set_xlabel('X1', color='#264653')
ax.set_ylabel('X2', color='#264653')
ax.tick_params(labelcolor='#264653')
ax.legend([p1, p2], ['-1', '1'], loc='upper left')
plt.show()

代码说明：

首先，我们加载数据，定义特征 (X) 和目标变量 (y)。
然后，我们创建并训练随机森林分类器。
接下来，我们使用训练好的模型对特征空间进行预测，并将预测结果可视化。
最后，我们将不同类别的样本点绘制在图表上，以便更直观地观察分类结果和决策边界。

注意：

在可视化过程中，我们仅使用了前两个特征作为坐标轴。由于模型是用全部特征训练的，绘制二维决策边界时需要为其余特征指定固定取值（例如各自的均值），或者改为仅用这两个特征训练模型；您也可以根据需要选择其他特征进行可视化。
您需要根据实际情况修改代码中的文件路径、特征列名和目标变量列名。

希望本示例能够帮助您理解如何使用Python实现随机森林分类并进行可视化。