Trong post trước chúng ta đã so sánh và khảo sát vai trò của tiêu chuẩn AUC như là một điều kiện cần của việc lựa chọn mô hình phù hợp gắn liền với mục tiêu của các tổ chức hoạt động vì lợi nhuận như Ngân Hàng. Trong post này chúng ta sẽ tìm hiểu tác động của việc sử dụng các tham số mà tối ưu AUC bằng Bayesian Optimization lên mức lợi nhuận tối đa (maximum profit) giữa Random Forest không được tinh chỉnh và Random Forest được tinh chỉnh.
Random Forest với tham số tối ưu tìm được bằng Bayesian Optimization có AUC cao hơn AUC của Random Forest mặc định chỉ 2.618% nhưng maximum profit tương ứng thì chênh lệch nhau đến 474.0189% (Figure 2):
Dưới đây là mã Python dùng để tạo ra các kết quả quan trọng này:
# ===============================
#  Prepare data for training
# ===============================

# Silence FutureWarning messages raised by the libraries used below:
import warnings
warnings.simplefilter(action="ignore", category=FutureWarning)

# Read the German Credit data set from its local CSV file:
import pandas as pd
df_bank = pd.read_csv("C:/Users/ADMIN/Desktop/DataMining/dmba/GermanCredit.csv")

# Flip the RESPONSE coding (1 -> 0 and 0 -> 1) so that afterwards
# 1 = default and 0 = nondefault:
df_bank["RESPONSE"] = df_bank["RESPONSE"].map({1: 0, 0: 1})

# Remove the OBS# column:
my_df_binary = df_bank.drop(columns=["OBS#"])

# Separate the input features (X) from the target output (Y):
X = my_df_binary.drop(columns="RESPONSE")
Y = my_df_binary["RESPONSE"]
# Hold out 20% of the rows as a test set, using a fixed seed:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    random_state=29,
    test_size=0.2,
)

# Fit a Random Forest with the library's default settings, then take column 1
# of predict_proba on the test set as the probability of default:
from sklearn.ensemble import RandomForestClassifier
ran = RandomForestClassifier(random_state=29).fit(X_train, y_train)
pd_ran = ran.predict_proba(X_test)[:, 1]
# ==========================================================================
#  Search optimal parameters for Random Forest using Bayesian Optimization
# ==========================================================================
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold

# Cross-validation scheme: 4 stratified folds, repeated 3 times, fixed seed.
cv = RepeatedStratifiedKFold(n_repeats=3, n_splits=4, random_state=29)


def objective_function(params):
    """Return the loss minimised by the hyper-parameter search.

    ``params`` is a dict of keyword arguments forwarded to
    RandomForestClassifier. The forest is scored by cross-validated ROC AUC
    on ``X_train`` / ``y_train`` using the module-level ``cv``. The mean AUC
    is negated, so a higher AUC gives a lower loss.
    """
    forest = RandomForestClassifier(random_state=29, n_jobs=-1, **params)
    fold_aucs = cross_val_score(
        forest, X_train, y_train, scoring="roc_auc", cv=cv, n_jobs=-1
    )
    return -1 * fold_aucs.mean()
# Search space: every hyper-parameter is drawn with hp.quniform(low, high, q)
# and wrapped in scope.int so the sampled value is cast to an integer.
from hyperopt.pyll import scope
from hyperopt import hp
_rf_bounds = (
    ('max_depth', 1, 50, 1),
    ('n_estimators', 50, 1000, 100),
    ('min_samples_split', 2, 30, 1),
    ('min_samples_leaf', 2, 30, 1),
)
param_hyperopt_rf = {
    label: scope.int(hp.quniform(label, low, high, step))
    for label, low, high, step in _rf_bounds
}
# Run the hyperopt TPE search for 50 evaluations of the objective:
from hyperopt import fmin, tpe, Trials
import numpy as np
tpe = tpe.suggest  # from here on ``tpe`` names the suggest function, not the module
tpe_trials = Trials()  # records every evaluated trial for later inspection
# NOTE(review): recent hyperopt releases appear to expect a numpy Generator
# (np.random.default_rng(29)) for ``rstate`` -- confirm against the installed
# hyperopt version before changing it.
rf_bayesian_TPE = fmin(
    fn=objective_function,
    space=param_hyperopt_rf,
    algo=tpe,
    trials=tpe_trials,
    max_evals=50,
    rstate=np.random.RandomState(29),
)
# Show AUC by iteration:
# Each trial's loss is the negated mean AUC, so flip the sign back and keep
# the running maximum, i.e. the best AUC found up to each iteration.
hyperopt_scores = [-1 * trial['result']['loss'] for trial in tpe_trials.trials]
hyperopt_scores = np.maximum.accumulate(hyperopt_scores)
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
plt.figure(figsize=(8, 6))
# The curve carries a label so the legend call below has an entry to show
# (a legend without any labelled artist only triggers a warning).
plt.plot(hyperopt_scores, label="Best AUC so far")
plt.xlabel("Iteration")
plt.ylabel("AUC")
plt.title("Figure 1: AUC by Iteration (TPE Bayesian Optimization)", fontsize=15)
plt.yticks(fontsize=12)
plt.xticks(fontsize=12)
plt.legend(fontsize=8)
plt.show()
# Extract optimal values and parameter names (kept for inspection):
best_param_tpe = list(rf_bayesian_TPE.values())
param_names = list(rf_bayesian_TPE.keys())

# Reset Random Forest with optimal parameters.
# Each value is looked up by its parameter name: relying on the position of a
# value inside the result dict would silently assign it to the wrong
# parameter whenever the key order differs from the assumed one.
# A missing name raises KeyError instead of being skipped.
_tuned_names = ('max_depth', 'min_samples_leaf', 'min_samples_split', 'n_estimators')
param_hyperopt_rf.update(
    {name: int(rf_bayesian_TPE[name]) for name in _tuned_names}
)

# Retrain Random Forest with optimal parameters:
bestRF = RandomForestClassifier(**param_hyperopt_rf, random_state=29, n_jobs=-1)
bestRF.fit(X_train, y_train)

# Recalculate probability of default (column 1 of predict_proba):
pd_best = bestRF.predict_proba(X_test)[:, 1]
# ================================================================
#  Compare profit between default and tuned Random Forest
# ================================================================


def profit_by_cutoff(cutoff, pred_prob, rate=0.10, features=None, labels=None):
    """Return the profit obtained when loans scored below ``cutoff`` are approved.

    A loan is rejected when its predicted probability of default is
    ``>= cutoff`` and approved otherwise. Every approved loan whose label is
    0 earns ``rate * AMOUNT``; every approved loan whose label is 1 loses its
    whole ``AMOUNT``. Rejected loans contribute nothing.

    Parameters
    ----------
    cutoff : float
        Probability threshold at or above which a loan is rejected.
    pred_prob : array-like
        Predicted probability of default, one value per row, in the same row
        order as ``features`` / ``labels``.
    rate : float, default 0.10
        Interest rate earned on approved loans that do not default.
    features : DataFrame, optional
        Table with an ``"AMOUNT"`` column. Defaults to the module-level
        ``X_test``.
    labels : Series, optional
        Labels (1 = default, 0 = nondefault). Defaults to the module-level
        ``y_test``.

    Returns
    -------
    float
        Interest earned minus principal lost.
    """
    if features is None:
        features = X_test
    if labels is None:
        labels = y_test

    # Work on a plain array so lists are accepted and the comparison with the
    # labels is positional.
    rejected = np.asarray(pred_prob) >= cutoff
    approved = ~rejected

    amounts = features["AMOUNT"]
    repaid_amounts = amounts[(labels == 0) & approved]
    defaulted_amounts = amounts[(labels == 1) & approved]
    return np.sum(rate * repaid_amounts) - np.sum(defaulted_amounts)
def profit(cutoff):
    """Return a one-row DataFrame with both forests' profit at ``cutoff``.

    Columns, in order: ``Profit_None`` (profit computed from ``pd_ran``, the
    default forest), ``Profit_Turned`` (profit computed from ``pd_best``, the
    tuned forest) and ``Cutoff``.
    """
    scores_by_column = {"Profit_None": pd_ran, "Profit_Turned": pd_best}
    row = {
        column: [profit_by_cutoff(cutoff=cutoff, pred_prob=scores)]
        for column, scores in scores_by_column.items()
    }
    row["Cutoff"] = [cutoff]
    return pd.DataFrame(row)
# Profit for the two models by a range of cutoff:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call, so the one-row frames are collected and concatenated once.
# As with the former append loop, the index is not reset (every row keeps
# index 0).
df_profit = pd.concat([profit(j) for j in np.arange(0.01, 0.3, 0.005)])
# Draw one profit-versus-cutoff line per model:
for column, tag in (("Profit_None", "None"), ("Profit_Turned", "Turned")):
    plt.plot("Cutoff", column, data=df_profit, label=tag, lw=2)
plt.title("Figure 2: Profit by Default and Turned Random Forest", fontsize=13)
plt.xlabel("Cutoff")
plt.ylabel("Profit")
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.legend(fontsize=8)
plt.show()
# Test-set AUC of the default forest, then of the tuned forest
# (the accompanying text reports a 2.618% increase):
from sklearn.metrics import roc_auc_score
for scores in (pd_ran, pd_best):
    print(roc_auc_score(y_test, scores))

# Ratio between the highest profit reached by the tuned model and the highest
# profit reached by the default model over the scanned cutoffs:
max_pro_turned = df_profit['Profit_Turned'].max()
max_pro_default = df_profit['Profit_None'].max()
gap = max_pro_turned / max_pro_default
print(gap)
# ================================
#  Expansion: Remove noises
# ================================
from sklearn.feature_selection import RFE
from sklearn.pipeline import Pipeline


def average_roc(n_features):
    """Return the mean cross-validated ROC AUC when RFE keeps ``n_features``.

    A pipeline of recursive feature elimination (driven by a Random Forest)
    followed by a Random Forest classifier is scored with the module-level
    ``cv`` splitter.

    The score is computed on the training split (``X_train`` / ``y_train``).
    Choosing the number of features by cross-validating on the test split
    would leak the held-out data into model selection and make the test-set
    metrics reported afterwards optimistic.
    """
    selector = RFE(
        estimator=RandomForestClassifier(random_state=29),
        n_features_to_select=n_features,
    )
    classifier = RandomForestClassifier(random_state=29)
    pipeline = Pipeline(steps=[('s', selector), ('m', classifier)])
    fold_aucs = cross_val_score(
        pipeline, X_train, y_train, scoring='roc_auc', cv=cv, n_jobs=-1
    )
    return np.mean(fold_aucs)
# Mean AUC for every candidate number of retained features (5 to 39):
my_range = np.arange(5, 40, 1)
avg_auc = [average_roc(k) for k in my_range]
df_fet = pd.DataFrame({"n_features": my_range, "auc": avg_auc})

# Row(s) where the mean AUC reaches its maximum:
best_auc = df_fet['auc'].max()
df_max = df_fet[df_fet['auc'] == best_auc]

# AUC curve with the maximum highlighted in red:
plt.plot("n_features", "auc", data=df_fet)
plt.scatter('n_features', 'auc', data=df_max, s=80, label=None, color='r')
plt.show()
# Fit the feature-selection pipeline (RFE keeping 14 features, then a Random
# Forest) on the training set and score the test set:
rfe = RFE(
    n_features_to_select=14,
    estimator=RandomForestClassifier(random_state=29),
)
model = RandomForestClassifier(random_state=29)
pipeline = Pipeline(steps=[('s', rfe), ('m', model)]).fit(X_train, y_train)
pd_fet = pipeline.predict_proba(X_test)[:, 1]
print(roc_auc_score(y_test, pd_fet))
# Profit of the feature-selected model over the same cutoff grid:
cutoff_range = np.arange(0.01, 0.3, 0.005)
profit_fet = [profit_by_cutoff(cutoff=c, pred_prob=pd_fet) for c in cutoff_range]
df3 = pd.DataFrame({"Cutoff": cutoff_range, "Profit_Fet": profit_fet})

# One profit curve per model (default, tuned, feature-selected):
for frame, column, tag in (
    (df_profit, "Profit_None", "None"),
    (df_profit, "Profit_Turned", "Turned"),
    (df3, "Profit_Fet", "Fet"),
):
    plt.plot("Cutoff", column, data=frame, label=tag, lw=2)
plt.title("Figure 3: Profit", fontsize=13)
plt.xlabel("Cutoff")
plt.ylabel("Profit")
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.legend(fontsize=8)
plt.show()
# ================================================
#  Prove
# ================================================
from functools import lru_cache
from catboost import CatBoostClassifier


@lru_cache(maxsize=None)
def _seeded_test_probabilities(seed):
    """Fit CatBoost and Random Forest with ``seed`` and score the test set.

    Returns a ``(catboost_scores, forest_scores)`` pair holding column 1 of
    each model's ``predict_proba`` on ``X_test``. The fitted models do not
    depend on any cutoff, so the result is cached per seed and every seed is
    trained once instead of once per cutoff. If ``X_train`` / ``X_test``
    change, call ``_seeded_test_probabilities.cache_clear()``.
    """
    cat = CatBoostClassifier(random_state=seed, verbose=False)
    cat.fit(X_train, y_train)
    forest = RandomForestClassifier(random_state=seed)
    forest.fit(X_train, y_train)
    return cat.predict_proba(X_test)[:, 1], forest.predict_proba(X_test)[:, 1]


def average_pro(cutoff):
    """Return the seed-averaged profit of CatBoost and Random Forest at ``cutoff``.

    Both classifiers are trained with random seeds 1 to 10. The profit of
    each fitted model at ``cutoff`` is computed with ``profit_by_cutoff`` and
    averaged over the seeds.

    Returns a one-row DataFrame with columns ``AvgProCat``, ``AvgProRan`` and
    ``Cutoff``.
    """
    n_times = 10
    pro_cat = []
    pro_ran = []
    for seed in range(1, n_times + 1):
        cat_scores, forest_scores = _seeded_test_probabilities(seed)
        pro_cat.append(profit_by_cutoff(cutoff=cutoff, pred_prob=cat_scores))
        pro_ran.append(profit_by_cutoff(cutoff=cutoff, pred_prob=forest_scores))
    return pd.DataFrame({"AvgProCat": [np.mean(pro_cat)],
                         "AvgProRan": [np.mean(pro_ran)],
                         "Cutoff": [cutoff]})
# Avg profit by range of cutoff:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call, so the one-row frames are collected and concatenated once.
# As with the former append loop, the index is not reset (every row keeps
# index 0).
df_avgPro = pd.concat([average_pro(cutoff=i) for i in cutoff_range])
# Average profit per cutoff, one line per classifier:
for column, tag in (("AvgProCat", "CatBoost"), ("AvgProRan", "RandomForest")):
    plt.plot("Cutoff", column, data=df_avgPro, label=tag, lw=2)
plt.title("Figure 4: Profit", fontsize=13)
plt.xlabel("Cutoff")
plt.ylabel("Profit")
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.legend(fontsize=8)
plt.show()