# encoding=utf8
import numpy as np


class AdaBoost:
    """Binary AdaBoost classifier built from one-sided decision stumps.

    Each weak learner is a stump with the fixed rule "+1 if x > v else -1"
    on a single feature; boosting reweights samples after every round.

    Args:
        n_estimators (int): number of boosting rounds (weak classifiers).
        learning_rate (float): shrinkage coefficient for weak-learner
            weights (kept for interface compatibility; not applied here).
    """

    def __init__(self, n_estimators=50, learning_rate=1.0):
        # BUG FIX: original defined `init` instead of `__init__`,
        # so the constructor never ran.
        self.clf_num = n_estimators
        self.learning_rate = learning_rate

    def init_args(self, datasets, labels):
        """Cache training data and reset boosting state.

        Args:
            datasets (ndarray): feature matrix of shape (M, N).
            labels (ndarray): labels in {-1, +1}, length M.
        """
        self.X = datasets
        self.Y = labels
        self.M, self.N = datasets.shape   # M samples, N features
        # fitted weak classifiers: tuples (feature_idx, threshold, direction, alpha)
        self.clf_sets = []
        # BUG FIX: original `[1.0/self.M]self.M` was missing `*`;
        # initialize uniform sample weights.
        self.weights = [1.0 / self.M] * self.M
        # per-round alpha coefficients of G(x)
        self.alpha = []

    # ******** Begin ********* #
    def _G(self, features, labels, weights):
        """Find the best stump threshold for ONE feature column.

        BUG FIX: `fit` passes a 1-D column, but the original indexed
        `features[:, i]` (2-D) here, which would raise IndexError.

        Args:
            features (ndarray): 1-D feature column, length M.
            labels (ndarray): sample labels, length M.
            weights (list|ndarray): current sample weights, length M.
        Returns:
            (best_v, best_direct, error): threshold, per-sample
            predictions at that threshold (ndarray or None if the
            feature is constant), and the weighted error.
        """
        m = len(features)
        error = 100000.0      # initialize with a large sentinel value
        best_v = 0.0
        best_direct = None    # BUG FIX: avoid UnboundLocalError on constant features
        # candidate thresholds: midpoints of consecutive sorted unique values
        unique_values = np.unique(features)
        for j in range(len(unique_values) - 1):
            v = (unique_values[j] + unique_values[j + 1]) / 2
            # classify each sample with the fixed rule "+1 if x > v else -1"
            direct = np.array([1 if features[k] > v else -1 for k in range(m)])
            # weighted misclassification error
            e = sum(weights[k] for k in range(m) if direct[k] != labels[k])
            if e < error:
                error = e
                best_v = v
                best_direct = direct
        return best_v, best_direct, error

    def _alpha(self, error):
        """Weak-learner weight: 0.5 * ln((1 - error) / error).

        ROBUSTNESS FIX: clamp error away from 0 and 1 so a perfect
        (or perfectly wrong) stump does not raise ZeroDivisionError.
        """
        eps = 1e-10
        error = min(max(error, eps), 1.0 - eps)
        return 0.5 * np.log((1 - error) / error)

    def _Z(self, weights, a, clf):
        """Normalization factor Z = sum_i w_i * exp(-a * y_i * G(x_i)).

        Args:
            clf: per-sample weak-learner outputs (length M), NOT the error.
        """
        return sum(weights[i] * np.exp(-1 * a * self.Y[i] * clf[i])
                   for i in range(self.M))

    def _w(self, a, clf, Z):
        """Update sample weights in place: w_i <- w_i * exp(-a*y_i*G(x_i)) / Z."""
        for i in range(self.M):
            self.weights[i] = self.weights[i] * np.exp(-1 * a * self.Y[i] * clf[i]) / Z

    def G(self, x, v, direct):
        """Evaluate one stump on a scalar feature value.

        Returns `direct` when x > v, else `-direct` (direct is +1 or -1).
        """
        if x > v:
            return direct
        else:
            return -1 * direct

    def fit(self, X, y):
        """Train the boosted ensemble.

        Args:
            X (ndarray): training data, shape (M, N).
            y (ndarray): training labels in {-1, +1}, length M.
        """
        self.init_args(X, y)
        for epoch in range(self.clf_num):
            # pick the stump (feature, threshold) with the smallest weighted error
            best_clf_error, best_v, best_direct, best_feature = 100000.0, None, None, None
            for j in range(self.N):
                v, direct, error = self._G(self.X[:, j], self.Y, self.weights)
                if error < best_clf_error:
                    best_clf_error = error
                    best_v = v
                    best_direct = direct
                    best_feature = j
            # alpha coefficient of this weak classifier
            a = self._alpha(best_clf_error)
            self.alpha.append(a)
            # BUG FIX: store the SCALAR direction (+1, since _G's rule is
            # fixed as "+1 if x > v"), not the per-sample prediction array —
            # predict() feeds this value to G() as a scalar sign.
            self.clf_sets.append((best_feature, best_v, 1, a))
            # BUG FIX: Z must be computed from the weak learner's per-sample
            # outputs, not from the scalar error value.
            Z = self._Z(self.weights, a, best_direct)
            # BUG FIX: weight update also needs the per-sample outputs,
            # not the classifier tuple.
            self._w(a, best_direct, Z)

    def predict(self, data):
        """Predict the label of a single sample.

        Args:
            data (ndarray): one sample's feature vector.
        Returns:
            int: +1 for the positive class, -1 for the negative class.
        """
        result = 0.0
        for i in range(len(self.clf_sets)):
            feature, v, direct, a = self.clf_sets[i]
            # stump vote: +direct above the threshold, -direct below
            tmp = self.G(data[feature], v, direct)
            result += a * tmp
        # BUG FIX: original line was truncated (`else -`) — SyntaxError.
        return 1 if result > 0 else -1
# Original source: https://www.cveoy.top/t/topic/fg92 — copyright belongs to the author; do not reproduce without permission.