使用逻辑回归构建信用评分卡的全流程Python脚本,包含:导入包、导入数据、EDA、数据分箱、特征筛选、WOE转化、逻辑回归建模、评分卡转换,以及OOT样本验证和PSI计算,并详细解释每一个步骤。
- 导入包
在python中,我们需要先导入一些必要的包。这里我们需要用到的包有numpy、pandas、matplotlib、seaborn、sklearn、statsmodels等。
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from scipy import stats
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import auc, confusion_matrix, roc_curve
from sklearn.model_selection import train_test_split
- 导入数据
接下来,我们需要将数据导入到python环境中。
# Load the modelling sample from a CSV file given by a relative path.
# Later steps read the 'default', 'credit_score', 'income' and 'age' columns.
data = pd.read_csv('credit_data.csv')
- EDA
我们需要进行探索性数据分析(EDA),以便更好地了解数据的特征和分布情况。
# Exploratory data analysis. Results are printed explicitly because a bare
# expression such as `data.head()` is silently discarded when the file runs
# as a script (only notebooks echo the last expression of a cell).

# Preview the first five rows.
print(data.head())
# Column dtypes and non-null counts; info() prints its own report.
data.info()
# Summary statistics.
print(data.describe())
# Number of missing values per column.
print(data.isnull().sum())
# Distribution plot for each feature used by the scorecard.
for column in ('credit_score', 'income', 'age'):
    sns.displot(data[column])
# Render the figures when running outside an interactive session.
plt.show()
- 数据分箱
我们需要将连续变量分成多个分箱,以获得更好的模型效果。
# Monotonic binning helper
def mono_bin(Y, X, n=20):
    """Bin a numeric feature into quantile buckets that are monotonic in bad rate.

    Starting from ``n`` quantile buckets, the bucket count is reduced by one
    until the Spearman rank correlation between the per-bucket mean of ``X``
    and the per-bucket mean of ``Y`` reaches +/-1, cannot be computed, or
    fewer than three buckets remain.

    Args:
        Y: Named binary target Series aligned with ``X`` (1 = bad, 0 = good).
        X: Named numeric feature Series.
        n: Initial number of quantile buckets to try.

    Returns:
        DataFrame indexed by bucket interval with the columns
        ``min_<X.name>``, ``max_<X.name>``, ``<Y.name>`` (bad count),
        ``total``, ``bad_rate``, ``woe`` and ``IV``. ``IV`` holds the
        feature's total information value repeated on every row.

    Raises:
        ValueError: If the binned data does not contain both classes, in
            which case WOE is undefined.
    """
    eps = 0.0001  # smoothing so that empty good/bad cells do not produce log(0)

    while True:
        # duplicates='drop' keeps qcut from raising when heavy ties make
        # several quantile edges coincide.
        d1 = pd.DataFrame({'X': X, 'Y': Y, 'Bucket': pd.qcut(X, n, duplicates='drop')})
        d2 = d1.groupby('Bucket', observed=True)
        bucket_means = d2[['X', 'Y']].mean()
        if len(bucket_means) < 3:
            # One or two buckets cannot be made any more monotonic.
            break
        r, _ = stats.spearmanr(bucket_means['X'], bucket_means['Y'])
        # Stop on a perfect rank correlation (with a tolerance for float
        # round-off) or when r is NaN because a side is constant.
        if not np.abs(r) < 1 - 1e-9:
            break
        n = n - 1

    # Build from a dict so the column really carries the values; passing a
    # Series together with a different `columns=` label yields an all-NaN column.
    d3 = pd.DataFrame({'min_' + X.name: d2['X'].min()})
    d3['max_' + X.name] = d2['X'].max()
    d3[Y.name] = d2['Y'].sum()
    d3['total'] = d2['Y'].count()
    d3['bad_rate'] = d2['Y'].mean()

    bad = d3[Y.name]
    good = d3['total'] - bad
    total_bad = bad.sum()
    total_good = good.sum()
    if total_bad <= 0 or total_good <= 0:
        raise ValueError('mono_bin requires both good and bad observations in Y')

    # WOE compares each bucket's share of all bads with its share of all goods.
    bad_dist = bad / total_bad
    good_dist = good / total_good
    d3['woe'] = np.log((bad_dist + eps) / (good_dist + eps))
    # IV is the sum over buckets of (bad share - good share) * WOE.
    d3['IV'] = ((bad_dist - good_dist) * d3['woe']).sum()
    return d3
# Bin each continuous feature against the default flag, in a fixed order.
dfx1, dfx2, dfx3 = (
    mono_bin(data['default'], data[feature])
    for feature in ('credit_score', 'income', 'age')
)
- 特征筛选
我们需要选择一些特征用于建模。
# Stack the per-feature bin tables into one long table keyed by 'feature'.
# The tables describe different variables, so they share no join key; each
# one only knows its own min_/max_ columns, which are renamed to common names.
bin_tables = {'credit_score': dfx1, 'income': dfx2, 'age': dfx3}
dfx = pd.concat(
    [
        table.reset_index()
        .rename(columns={'min_' + feature: 'bin_min', 'max_' + feature: 'bin_max'})
        .assign(feature=feature)
        for feature, table in bin_tables.items()
    ],
    ignore_index=True,
)
# Keep the features whose information value is at least 0.1. IV is constant
# within a feature, so this filter keeps or drops all of a feature's bins.
dfx = dfx[dfx['IV'] >= 0.1]
# Names of the surviving features, in their original order.
selected_cols = dfx['feature'].drop_duplicates().tolist()
- woe转化
我们需要将特征进行WOE转化,以便更好地拟合逻辑回归模型。
# WOE encoding helper
def woe_trans(data, var):
    """Replace every value of ``data[var]`` with the WOE of the bin it falls in.

    The bins and their WOE values are fitted by ``mono_bin`` on the frame
    passed in, using its ``'default'`` column as the target.

    Args:
        data: DataFrame containing ``var`` and a ``'default'`` column.
        var: Name of the numeric column to encode.

    Returns:
        Float Series named ``var`` on ``data.index``; values that fall in no
        bin (including missing values) are NaN.
    """
    # NOTE(review): because the bins are re-fitted on `data`, calling this on
    # a validation sample gives that sample its own bins and WOE values.
    bin_table = mono_bin(data['default'], data[var])
    # mono_bin's result is indexed by the qcut intervals; the table has no
    # column called `var`, so the intervals are the only usable bin edges.
    intervals = pd.IntervalIndex(list(bin_table.index))
    codes = pd.cut(data[var], bins=intervals).cat.codes.to_numpy()
    woe_values = bin_table['woe'].to_numpy()
    # Code -1 marks "no bin"; it would index the last bin, so mask it out.
    encoded = np.where(codes >= 0, woe_values[codes], np.nan)
    return pd.Series(encoded, index=data.index, name=var)
# Build the WOE-encoded design matrix with one column per selected feature,
# then carry the target column across unchanged.
df_woe = pd.DataFrame(
    {col: woe_trans(data, col) for col in selected_cols},
    index=data.index,
)
df_woe['default'] = data['default']
- 逻辑回归建模
我们需要拟合逻辑回归模型。
# Hold out 30% of the rows as a test set; the fixed seed keeps the split
# reproducible between runs.
features = df_woe[selected_cols]
target = df_woe['default']
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.3, random_state=42
)
# Fit the logistic regression on the WOE-encoded training rows.
clf = LogisticRegression(random_state=42)
clf.fit(X_train, y_train)
- 评分卡转换
我们需要将逻辑回归模型转换成评分卡。
# Convert WOE values into scorecard points
def get_score(coe, woe, factor):
    """Return the rounded points ``coe * w * factor`` for every ``w`` in ``woe``.

    Args:
        coe: Model coefficient of the feature.
        woe: Iterable of WOE values, one per bin.
        factor: Scaling factor applied to every product.

    Returns:
        List with one value per element of ``woe``, each rounded to zero
        decimal places, in the same order.
    """
    return [round(coe * value * factor, 0) for value in woe]
# Build the scorecard: one row per (feature, bin) with its WOE, the feature's
# model coefficient and the points awarded for that bin.
feature_cols = selected_cols
coe = clf.coef_[0]
intercept = clf.intercept_[0]
# factor * ln(2) == 20, so doubling the odds moves the score by 20 points.
factor = 20 / np.log(2)
# Intercept expressed in points, so that base_score plus the (unrounded)
# bin points of a row equals factor * the model's log-odds for that row.
base_score = round(factor * intercept, 0)
card_rows = []
for feature, coefficient in zip(feature_cols, coe):
    # WOE values exist per bin, so they are taken from the feature's own bin
    # table rather than from a frame indexed by feature name.
    bin_table = mono_bin(data['default'], data[feature])
    rows = pd.DataFrame({
        'feature': feature,
        'bin': bin_table.index.astype(str),
        'woe': bin_table['woe'].to_numpy(),
        'coefficient': coefficient,
    })
    # get_score expects one scalar coefficient and that feature's WOE values.
    rows['score'] = get_score(coefficient, rows['woe'], factor)
    card_rows.append(rows)
score_card = pd.concat(card_rows, ignore_index=True)
- oot和psi计算
我们需要在OOT(out-of-time,跨时间)样本上验证模型效果,并计算PSI(群体稳定性指数)来评估预测分布的稳定性。
# Out-of-time (OOT) validation
df_oot = pd.read_csv('credit_data_oot.csv')
df_oot_woe = pd.DataFrame(index=df_oot.index)
for col in selected_cols:
    # Encode the OOT rows with the bins and WOE values of the development
    # sample. Re-fitting them on the OOT data would use the OOT labels
    # (target leakage) and would not match the encoding the model was fit on.
    dev_bins = mono_bin(data['default'], data[col])
    right_edges = pd.IntervalIndex(list(dev_bins.index)).right.to_numpy()
    # Open the outer bins so OOT values outside the development range still
    # land in the first or last bin.
    edges = np.concatenate(([-np.inf], right_edges[:-1], [np.inf]))
    codes = pd.cut(df_oot[col], bins=edges).cat.codes.to_numpy()
    woe_values = dev_bins['woe'].to_numpy()
    # Code -1 marks a missing value; mask it instead of indexing the last bin.
    df_oot_woe[col] = np.where(codes >= 0, woe_values[codes], np.nan)
df_oot_woe['default'] = df_oot['default']
X_oot = df_oot_woe[selected_cols]
y_oot = df_oot_woe['default']
y_pred_oot = clf.predict(X_oot)
# ROC curve and AUC of the predicted default probability on the OOT sample.
fpr_oot, tpr_oot, thresholds_oot = roc_curve(y_oot, clf.predict_proba(X_oot)[:, 1])
roc_auc_oot = auc(fpr_oot, tpr_oot)
# Population stability index (PSI) between the train, test and OOT samples
def _prediction_frame(X, y, bucket_edges):
    """Collect prediction, probability, probability bucket, score and target for one sample."""
    frame = pd.DataFrame(index=X.index)
    frame['pred'] = clf.predict(X)
    frame['prob'] = clf.predict_proba(X)[:, 1]
    frame['bucket'] = pd.cut(frame['prob'], bins=bucket_edges)
    # Row score = factor * model log-odds (intercept plus the sum of
    # coefficient * WOE over the features), rounded to whole points.
    frame['score'] = np.round(factor * clf.decision_function(X), 0)
    frame['target'] = y
    return frame


def _psi_terms(expected_counts, actual_counts, eps=0.0001):
    """Return the per-bucket PSI contributions for two aligned bucket-count Series."""
    expected = (expected_counts / expected_counts.sum()).clip(lower=eps)
    actual = (actual_counts / actual_counts.sum()).clip(lower=eps)
    return (expected - actual) * np.log(expected / actual)


# Bucket edges are the training-probability deciles and are reused for every
# sample: PSI compares how the SAME buckets are populated. Cutting each sample
# into its own deciles gives equal-sized, non-aligned buckets by construction.
_, prob_edges = pd.qcut(
    clf.predict_proba(X_train)[:, 1], 10, retbins=True, duplicates='drop'
)
# Open the outer buckets so probabilities outside the training range still count.
prob_edges[0], prob_edges[-1] = -np.inf, np.inf

df_train_bin = _prediction_frame(X_train, y_train, prob_edges)
df_test_bin = _prediction_frame(X_test, y_test, prob_edges)
df_oot_bin = _prediction_frame(X_oot, y_oot, prob_edges)

# Bucket counts share one categorical index, so they align row by row.
train = df_train_bin['bucket'].value_counts().sort_index()
test = df_test_bin['bucket'].value_counts().sort_index()
oot = df_oot_bin['bucket'].value_counts().sort_index()

# PSI is defined on bucket proportions, not raw counts.
psi_train_test = _psi_terms(train, test)
psi_train_oot = _psi_terms(train, oot)
psi_test_oot = _psi_terms(test, oot)
print('PSI_train_test:', psi_train_test.sum())
print('PSI_train_oot:', psi_train_oot.sum())
print('PSI_test_oot:', psi_test_oot.sum())
以上就是构建信用评分卡的全流程Python脚本,包括导入包、导入数据、EDA、数据分箱、特征筛选、WOE转化、逻辑回归建模、评分卡转换,以及OOT样本验证和PSI计算,并对每一个步骤进行了说明。
原文地址: https://www.cveoy.top/t/topic/ddJI 著作权归作者所有。请勿转载和采集!