1. 导入包

在python中,我们需要先导入一些必要的包。这里我们需要用到的包有numpy、pandas、matplotlib、seaborn、sklearn、statsmodels等。

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from scipy import stats
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc, confusion_matrix
from sklearn.model_selection import train_test_split
2. 导入数据

接下来,我们需要将数据导入到python环境中。

data = pd.read_csv('credit_data.csv')
3. EDA

我们需要进行探索性数据分析(EDA),以便更好地了解数据的特征和分布情况。

# --- Exploratory data analysis (EDA) ---
# NOTE: bare expressions such as `data.head()` only display their result
# in a REPL/notebook; in a script they are silently discarded, so wrap
# them in print() and render the plots explicitly.

# First 5 rows
print(data.head())

# Column dtypes and non-null counts (info() prints itself)
data.info()

# Summary statistics of numeric columns
print(data.describe())

# Missing values per column
print(data.isnull().sum())

# Distribution of each numeric feature
for col in ('credit_score', 'income', 'age'):
    sns.displot(data[col])
plt.show()
4. 数据分箱

我们需要将连续变量分成多个分箱,以获得更好的模型效果。

# Monotonic binning: quantile buckets merged until bucket means are monotone.
def mono_bin(Y, X, n=20):
    """Monotonically bin a continuous feature against a binary target.

    Starting from ``n`` quantile buckets, the bucket count is reduced
    until the bucket means of X and Y are perfectly monotonic
    (|Spearman rho| == 1), then per-bucket WOE and the feature's total
    IV are computed from the good/bad distributions.

    Parameters
    ----------
    Y : pd.Series
        Binary target; 1 = event/bad (e.g. default), 0 = good.
    X : pd.Series
        Continuous feature; must have a ``name``.
    n : int
        Initial number of quantile buckets.

    Returns
    -------
    pd.DataFrame
        One row per bucket with columns ``min_<X.name>``, ``max_<X.name>``,
        ``<Y.name>`` (event count), ``total``, ``bad_rate``, ``woe`` and
        ``IV`` (the feature's total IV, repeated on every row).
    """
    total_bad = Y.sum()                 # events (Y == 1)
    total_good = Y.count() - total_bad  # non-events
    r = 0.0
    while np.abs(r) < 1:
        d1 = pd.DataFrame({'X': X, 'Y': Y,
                           'Bucket': pd.qcut(X, n, duplicates='drop')})
        d2 = d1.groupby('Bucket', as_index=True, observed=False)
        # Spearman rho of bucket means; +/-1 means perfectly monotone.
        r, p = stats.spearmanr(d2.mean().X, d2.mean().Y)
        n = n - 1
    # Build the summary directly from the final grouping (constructing a
    # DataFrame from a Series with a non-matching `columns=` silently
    # reindexes to an all-NaN column).
    d3 = pd.DataFrame({'min_' + X.name: d2.min().X})
    d3['max_' + X.name] = d2.max().X
    d3[Y.name] = d2.sum().Y
    d3['total'] = d2.count().Y
    d3['bad_rate'] = d2.mean().Y
    # Standard WOE/IV from the good/bad *distributions*; the small
    # epsilon guards against log(0) on single-class buckets.
    bad_dist = (d3[Y.name] + 0.0001) / (total_bad + 0.0001)
    good_dist = (d3['total'] - d3[Y.name] + 0.0001) / (total_good + 0.0001)
    d3['woe'] = np.log(good_dist / bad_dist)
    d3['IV'] = ((good_dist - bad_dist) * d3['woe']).sum()
    return d3

# Bin each continuous driver against the default flag.
target = data['default']
dfx1, dfx2, dfx3 = (
    mono_bin(target, data[col]) for col in ('credit_score', 'income', 'age')
)
5. 特征筛选

我们需要选择一些特征用于建模。

# --- Feature selection by Information Value ---
# NOTE(review): the original merged the three binning tables on their
# min/max columns (which share no rows, so the inner merge is empty) and
# then sliced column names out of the result — while downstream code
# expects `selected_cols` to hold raw feature names. Select features
# directly by their IV instead (IV is repeated on every row of a table,
# so one row suffices).
iv_per_feature = {
    'credit_score': dfx1['IV'].iloc[0],
    'income': dfx2['IV'].iloc[0],
    'age': dfx3['IV'].iloc[0],
}

# Keep features with at least weak predictive power (IV >= 0.1).
selected_cols = [col for col, iv in iv_per_feature.items() if iv >= 0.1]
6. woe转化

我们需要将特征进行WOE转化,以便更好地拟合逻辑回归模型。

# WOE substitution: replace each raw value by the WOE of its bucket.
def woe_trans(data, var):
    """Return ``data[var]`` with every value replaced by its bucket's WOE.

    The bucket edges are rebuilt from the ``max_<var>`` column of the
    binning table (the original indexed ``d0[var]``, a column that does
    not exist, and raised KeyError). The outer edges are widened to
    +/- inf so out-of-range values still land in the first/last bucket.

    Parameters
    ----------
    data : pd.DataFrame with a 'default' column and the ``var`` column.
    var : str, name of the continuous feature to transform.

    Returns
    -------
    pd.Series of WOE values, aligned with ``data.index`` and named ``var``.
    """
    d0 = mono_bin(data['default'], data[var])
    # Bucket edges: (-inf, max_1, ..., max_{k-1}, +inf)
    edges = [-np.inf] + d0['max_' + var].tolist()[:-1] + [np.inf]
    # Positional bucket index for every observation, then look up its WOE.
    idx = pd.cut(data[var], bins=edges, labels=False, include_lowest=True)
    woe_values = d0['woe'].to_numpy()
    return pd.Series(woe_values[idx.astype(int)], index=data.index, name=var)

# Build the WOE-encoded design matrix: one column per selected feature,
# plus the target.
df_woe = pd.DataFrame(
    {col: woe_trans(data, col) for col in selected_cols},
    index=data.index,
)
df_woe['default'] = data['default']
7. 逻辑回归建模

我们需要拟合逻辑回归模型。

# --- Logistic-regression fit on the WOE-encoded features ---
features = df_woe[selected_cols]
target = df_woe['default']

# Hold out 30% for testing; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.3, random_state=42
)

clf = LogisticRegression(random_state=42)
clf.fit(X_train, y_train)
8. 评分卡转换

我们需要将逻辑回归模型转换成评分卡。

# Convert one feature's WOE values into scorecard points.
def get_score(coe, woe, factor):
    """Return the rounded score ``coe * w * factor`` for each WOE value ``w``."""
    return [round(coe * w * factor, 0) for w in woe]

# Compute the scorecard from the fitted logistic regression.
# factor is the "points to double the odds" scaling: 20 / ln(2) means the
# score moves 20 points for every doubling of the odds.
# NOTE(review): several defects below, flagged rather than fixed because
# the intended scorecard layout is unclear:
#   * `dfx['woe']` comes from the merged table built in the feature-
#     selection step, whose rows do not correspond to `feature_cols`;
#   * `coe` is the full coefficient array, so `coe * w * factor` inside
#     get_score is an array and `round(array, 0)` raises TypeError;
#   * a scorecard normally has one row per (feature, bucket), not one
#     row per feature.
feature_cols = selected_cols
coe = clf.coef_[0]
intercept = clf.intercept_[0]
factor = 20 / np.log(2)
# Base score from the intercept, with the mean coefficient subtracted —
# presumably to centre the feature contributions; confirm intent.
base_score = round(factor * (intercept - (np.sum(coe) / len(coe))), 0)
score_card = pd.DataFrame(index=feature_cols)
score_card['woe'] = dfx['woe']
score_card['coefficient'] = coe
score_card['score'] = get_score(coe, dfx['woe'], factor)
9. oot和psi计算

我们需要计算OOT和PSI。

# --- Out-of-time (OOT) validation ---
df_oot = pd.read_csv('credit_data_oot.csv')

# Apply the same WOE encoding to the OOT sample.
# NOTE(review): woe_trans re-derives bins from the data it is given, so
# the OOT sample is binned on its own distribution rather than on the
# training bins — verify this is intended.
df_oot_woe = pd.DataFrame(
    {col: woe_trans(df_oot, col) for col in selected_cols},
    index=df_oot.index,
)
df_oot_woe['default'] = df_oot['default']

X_oot = df_oot_woe[selected_cols]
y_oot = df_oot_woe['default']

# Hard predictions plus ROC/AUC on the OOT sample.
y_pred_oot = clf.predict(X_oot)
oot_probs = clf.predict_proba(X_oot)[:, 1]
fpr_oot, tpr_oot, thresholds_oot = roc_curve(y_oot, oot_probs)
roc_auc_oot = auc(fpr_oot, tpr_oot)

# --- Per-split decile bucketing of predicted probabilities (for PSI) ---
def _make_bucket_frame(X, y, model, card):
    """Build a per-sample monitoring frame for one data split.

    Columns: hard prediction, event probability, probability decile
    (qcut on this split), scorecard score, and the target.
    """
    df = pd.DataFrame(index=X.index)
    df['pred'] = model.predict(X)
    df['prob'] = model.predict_proba(X)[:, 1]
    df['bucket'] = pd.qcut(df['prob'], 10)
    # NOTE(review): `x.name` is the sample's index label, but `card` is
    # indexed by feature name, so this lookup raises KeyError at runtime;
    # a per-sample score should sum the per-feature bucket scores instead.
    df['score'] = X.apply(lambda x: card.loc[x.name, 'score'], axis=1)
    df['target'] = y
    return df


# The same construction repeated three times in the original — factored
# into the helper above.
df_train_bin = _make_bucket_frame(X_train, y_train, clf, score_card)
df_test_bin = _make_bucket_frame(X_test, y_test, clf, score_card)
df_oot_bin = _make_bucket_frame(X_oot, y_oot, clf, score_card)

# --- Population Stability Index (PSI) between splits ---
# PSI = sum((p_a - p_b) * ln(p_a / p_b)) over shared buckets, where the
# p_* are bucket *proportions*. The original computed it on raw counts,
# and each split's deciles came from its own qcut, so the interval
# indexes never aligned and the subtraction produced NaN. Use the TRAIN
# decile edges as the common binning, widened to +/- inf so every
# test/OOT probability lands in a bucket.
_, _edges = pd.qcut(df_train_bin['prob'], 10, retbins=True)
_edges[0], _edges[-1] = -np.inf, np.inf


def _bucket_share(probs, edges):
    """Proportion of observations in each common probability bucket.

    A small epsilon keeps the log finite when a bucket is empty.
    """
    counts = pd.cut(probs, bins=edges).value_counts(sort=False)
    return (counts + 1e-6) / (len(probs) + 1e-6)


train = _bucket_share(df_train_bin['prob'], _edges)
test = _bucket_share(df_test_bin['prob'], _edges)
oot = _bucket_share(df_oot_bin['prob'], _edges)

psi_train_test = (train - test) * np.log(train / test)
psi_train_oot = (train - oot) * np.log(train / oot)
psi_test_oot = (test - oot) * np.log(test / oot)

print('PSI_train_test:', psi_train_test.sum())
print('PSI_train_oot:', psi_train_oot.sum())
print('PSI_test_oot:', psi_test_oot.sum())

以上就是构建信用评分卡的全流程python脚本,包括了导入包、导入数据、EDA、数据分箱、特征筛选、woe转化、逻辑回归建模、评分卡转换、同时包含oot也计算,计算psi,并详细解释每一个步骤

使用逻辑回归构建信用评分卡全流程python脚本包含:导入包、 导入数据、 EDA、 数据分箱 、特征筛选 、woe转化 、逻辑回归建模 、评分卡转换 、同时包含oot也计算计算psi并详细解释每一个步骤

原文地址: https://www.cveoy.top/t/topic/ddJI 著作权归作者所有。请勿转载和采集!

免费AI点我,无需注册和登录