# Python data preprocessing, model selection and training: pandas, scikit-learn and cross-validation
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, Normalizer, MaxAbsScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score, make_scorer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
import pickle
class Model:
    """Train, persist, reload and evaluate a scikit-learn classifier over CSV data.

    Workflow: load CSV -> preprocess (feature screening, NaN fill) -> scale ->
    train/test split -> fit -> pickle model + scaler -> reload -> predict.
    """

    def __init__(self):
        # Fitted estimator and fitted scaler; populated by train() or load_model().
        self.model = None
        self.scaler = None

    # Normalization / standardization
    def data_scale(self, data):
        """Fit a scaler on `data` and return the transformed array.

        MinMaxScaler is used; Normalizer / MaxAbsScaler are alternatives.
        The FITTED scaler is kept on self.scaler so inference can reuse the
        training-time parameters instead of re-fitting.

        :param data: feature matrix (DataFrame or 2-D array)
        :return: scaled numpy array
        """
        scale = MinMaxScaler()
        # scale = Normalizer()
        # scale = MaxAbsScaler()
        data = scale.fit_transform(data)
        self.scaler = scale
        return data

    # Dimensionality reduction
    def dimension_reduce(self, data):
        """Screen features by their correlation with the 'label' column.

        Plan: 1) keep dimensions strongly correlated with the label,
        2) reduce dimensionality. Currently this only INSPECTS correlations;
        no columns are dropped, so `data` is returned unchanged.

        :param data: full DataFrame (label column optional)
        :return: DataFrame, unchanged
        """
        # BUG FIX: inference data (evaluate()) may not carry a 'label' column;
        # the unguarded .loc[:, 'label'] raised KeyError in that case.
        if 'label' in data.columns:
            corr = data.corr().loc[:, 'label']
            # print(corr)
            corr_bigger_001 = corr[(corr > 0.02) | (corr < -0.02)]
            # print(corr_bigger_001)
        return data

    # Missing-value handling
    def handle_na(self, data):
        """Fill NaNs with each column's mean.

        Alternatives considered: ffill, bfill, mean fill, model-predicted fill.
        dropna() was rejected: every row has at least one NaN
        (6296 rows before dropna, 0 rows after).

        :param data: DataFrame possibly containing NaNs
        :return: new DataFrame with NaNs replaced by column means
        """
        # print(pd.isnull(data).sum())
        # NOTE: an all-NaN column stays NaN (its mean is NaN).
        data = data.apply(lambda x: x.fillna(x.mean()))
        # print(pd.isnull(data).sum())
        return data

    def preprocessing(self, data):
        """Full preprocessing pipeline: feature screening then NaN filling."""
        # 2.0 Inspect class distribution (exploratory):
        # print(data.loc[:, 'label'].value_counts())
        # plt.bar(range(0, 6), data.loc[:, 'label'].value_counts())
        # plt.show()
        # 2.1 Column selection, plan A: keep highly correlated columns
        data = self.dimension_reduce(data)
        # 2.2 Column selection, plan B: PCA (not implemented)
        # 2.3 NaN plan A: drop rows -- not viable, drops everything
        # 2.4 NaN plan B: fill with column means
        data = self.handle_na(data)
        return data

    def select_model(self):
        """Return the estimator to train (decision tree by default)."""
        # Decision tree
        model = DecisionTreeClassifier(max_depth=5)
        # model = RandomForestClassifier(n_estimators=20, max_depth=5, criterion='entropy')
        # base_estimator = DecisionTreeClassifier(max_depth=5)
        # model = AdaBoostClassifier(n_estimators=100, base_estimator=base_estimator)
        # Naive Bayes:
        # model = MultinomialNB()
        return model

    def train(self):
        """Load training data, fit the model, and persist model + scaler."""
        # 1. Load data
        data = pd.read_csv('preprocess_train.csv')
        # 2. Explore & preprocess: NaNs, distribution, correlations
        data = self.preprocessing(data)
        # 3. Split features (all but last column) from the label (last column)
        X = data.iloc[:, 0:-1]
        Y = data.iloc[:, -1]
        X = self.data_scale(X)
        # 4. Train/test split
        X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=42)
        # 5. Choose model
        model = self.select_model()
        # 6. Fit
        model.fit(X_train, y_train)
        self.model = model
        # 7. Persist model and scaler (FIX: `with` closes the files; the
        # original `open(...)` handles were never closed).
        with open('model.pkl', 'wb') as f:
            pickle.dump(self.model, f)
        with open('scaler.pkl', 'wb') as f:
            pickle.dump(self.scaler, f)

    def load_model(self):
        """Restore the pickled model and scaler from disk."""
        # NOTE: pickle.load is only safe on trusted, self-produced files.
        with open('model.pkl', 'rb') as f:
            self.model = pickle.load(f)
        with open('scaler.pkl', 'rb') as f:
            self.scaler = pickle.load(f)

    def evaluate(self):
        """Predict labels for the test CSV with the fitted model and scaler.

        :return: array of predicted labels
        """
        X = pd.read_csv('preprocess_test.csv')
        X = self.preprocessing(X)
        # BUG FIX: reuse the scaler fitted on the TRAINING data. The original
        # called data_scale(), which re-fit MinMaxScaler on the test set and
        # produced a scaling inconsistent with what the model was trained on.
        if self.scaler is not None:
            X = self.scaler.transform(X)
        else:
            X = self.data_scale(X)
        y_predict = self.model.predict(X)
        return y_predict
if __name__ == '__main__':
    # End-to-end smoke run: train + persist, reload from disk, then predict
    # on the held-out file and show the result.
    clf = Model()
    clf.train()
    clf.load_model()
    predictions = clf.evaluate()
    print(predictions)
# Source: http://www.cveoy.top/t/topic/pFsi — copyright belongs to the author; do not repost or scrape.