---
title: 'Effect of AUC on maximum profit (Python)'
author: 'Author: Nguyen Chi Dung'
subtitle: "Python Machine Learning Series"
output:
  html_document: 
    code_download: true
    # code_folding: hide
    highlight: zenburn
    # number_sections: yes
    theme: "flatly"
    toc: TRUE
    toc_float: TRUE
---

```{r setup,include=FALSE}
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE, cache = TRUE, eval = FALSE)

```


# Motivations

In the [previous post](https://rpubs.com/chidungkt/664943) we compared models and examined the role of the AUC criterion as a necessary condition for choosing a model aligned with the objectives of profit-driven institutions such as banks. In this post we investigate how hyperparameters that maximize AUC, found with Bayesian Optimization, affect maximum profit by comparing an untuned Random Forest with a tuned one.

# Findings

The Random Forest with optimal hyperparameters found by Bayesian Optimization has an AUC only 2.618% higher than that of the default Random Forest, yet the corresponding maximum profits differ by as much as 474.0189% (Figure 2):
![](C:/Users/Admin/Documents/profit.jpg)
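
For reference, here is a minimal sketch of how the two headline figures can be reproduced once the models below have been fitted. It assumes the objects `y_test`, `pd_ran`, `pd_best`, and `df_profit` created in the code that follows, and it reads the 2.618% figure as the relative AUC gain and the 474.0189% figure as the ratio of maximum profits expressed as a percentage.

```{python, python.reticulate = FALSE}
# Sketch only: relies on y_test, pd_ran, pd_best and df_profit defined in the code below.
import numpy as np
from sklearn.metrics import roc_auc_score

auc_default = roc_auc_score(y_test, pd_ran)   # untuned Random Forest
auc_tuned = roc_auc_score(y_test, pd_best)    # Bayesian-optimized Random Forest
auc_gain_pct = (auc_tuned / auc_default - 1) * 100        # about 2.618% in this run

max_profit_ratio = np.max(df_profit["Profit_Tuned"]) / np.max(df_profit["Profit_None"])
print(auc_gain_pct, max_profit_ratio * 100)               # about 474.0189% in this run
```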


# Python Codes

Below is the Python code for these key results:


```{python, python.reticulate = FALSE}
# ===============================
#  Prepare data for training
# ==============================
# Turn off warnings:
import warnings

warnings.simplefilter(action="ignore", category=FutureWarning)

# Load data:
import pandas as pd

df_bank = pd.read_csv("C:/Users/ADMIN/Desktop/DataMining/dmba/GermanCredit.csv")

# Relabel for RESPONSE (1 = default, 0 = nondefault):
df_bank["RESPONSE"] = df_bank["RESPONSE"].map({1: 0, 0: 1})
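# (In the raw file RESPONSE = 1 marks a good loan; the mapping above flips it so that
#  1 marks the event of interest here: default.)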

# Drop OBS# feature:
my_df_binary = df_bank.drop(["OBS#"], axis=1)

# Define input features and target output:
Y = my_df_binary["RESPONSE"]
X = my_df_binary.drop("RESPONSE", axis=1)

# Prepare data:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=29)
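# Hold out 20% of the data with a fixed seed, so every model below is evaluated,
# and its profit computed, on the same test set.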

# Train Random Forest and calculate probability of default:
from sklearn.ensemble import RandomForestClassifier

ran = RandomForestClassifier(random_state=29)
ran.fit(X_train, y_train)
pd_ran = ran.predict_proba(X_test)[:, 1]

# ==========================================================================
# Search optimal parameters for Random Forest using Bayesian Optimization
# ==========================================================================

# Define objective function:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold

cv = RepeatedStratifiedKFold(n_splits=4, n_repeats=3, random_state=29)
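# 4-fold stratified CV repeated 3 times: every candidate parameter set is scored on
# 12 resampled train/validation splits, smoothing out split-to-split variation in AUC.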

def objective_function(params):
    clf = RandomForestClassifier(**params, n_jobs=-1, random_state=29)
    score = cross_val_score(clf, X_train, y_train, cv=cv, scoring="roc_auc", n_jobs=-1)
    loss_value = -1 * score.mean()
    return loss_value

# Define space of parameters:

from hyperopt.pyll import scope
from hyperopt import hp

param_hyperopt_rf = {
    'max_depth': scope.int(hp.quniform('max_depth', 1, 50, 1)),
    'n_estimators': scope.int(hp.quniform('n_estimators', 50, 1000, 100)),
    'min_samples_split': scope.int(hp.quniform('min_samples_split', 2, 30, 1)),
    'min_samples_leaf': scope.int(hp.quniform('min_samples_leaf', 2, 30, 1))
}
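# hp.quniform samples from a discretized uniform distribution; scope.int casts each
# draw to an integer so sklearn receives valid integer hyperparameters.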

# Search optimal parameters for Random Forest by Bayesian Optimization:
from hyperopt import fmin, tpe, Trials
import numpy as np

tpe_algorithm = tpe.suggest  # avoid shadowing the imported tpe module
tpe_trials = Trials()

rf_bayesian_TPE = fmin(fn=objective_function,
                       space=param_hyperopt_rf,
                       algo=tpe_algorithm,
                       max_evals=50,
                       trials=tpe_trials,
                       rstate=np.random.RandomState(29))  # newer hyperopt versions expect np.random.default_rng(29)

# Show best AUC by iteration:
hyperopt_scores = [-1 * trial['result']['loss'] for trial in tpe_trials.trials]
hyperopt_scores = np.maximum.accumulate(hyperopt_scores)
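# np.maximum.accumulate keeps the running best, so Figure 1 shows the best AUC found
# up to each iteration (a non-decreasing curve).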

import matplotlib.pyplot as plt

plt.style.use('fivethirtyeight')

plt.figure(figsize=(8, 6))
plt.plot(hyperopt_scores, label="Best AUC so far")
plt.xlabel("Iteration")
plt.ylabel("AUC")
plt.title("Figure 1: AUC by Iteration (TPE Bayesian Optimization)", fontsize=15)
plt.yticks(fontsize=12)
plt.xticks(fontsize=12)
plt.legend(fontsize=8)
plt.show()

# fmin returns the best raw values in a dict keyed by parameter name,
# so read them by name rather than by position:
best_params = {name: int(value) for name, value in rf_bayesian_TPE.items()}

# Retrain Random Forest with the optimal parameters:
bestRF = RandomForestClassifier(**best_params, random_state=29, n_jobs=-1)
bestRF.fit(X_train, y_train)

# Recalculate probability of default:
pd_best = bestRF.predict_proba(X_test)[:, 1]

# ================================================================
#  Compare profit between default and tuned Random Forest
# ================================================================

# Profit at a given cutoff, assuming a 10% interest rate on approved loans:
def profit_by_cutoff(cutoff, pred_prob):
    rate = 0.10
    # Applicants with predicted PD below the cutoff (pred_bg == 0) are approved:
    pred_bg = (pred_prob >= cutoff).astype(int)
    gg = X_test[(y_test == 0) & (pred_bg == 0)]  # approved and repaid
    bg = X_test[(y_test == 1) & (pred_bg == 0)]  # approved but defaulted
    # Earn 10% interest on good loans, lose the full amount on defaulted loans:
    profit = np.sum(rate * gg["AMOUNT"]) - np.sum(bg["AMOUNT"])
    return profit

def profit(cutoff):
    pro_none = profit_by_cutoff(cutoff=cutoff, pred_prob=pd_ran)
    pro_tuned = profit_by_cutoff(cutoff=cutoff, pred_prob=pd_best)
    df_pro = pd.DataFrame({"Profit_None": [pro_none],
                           "Profit_Tuned": [pro_tuned],
                           "Cutoff": [cutoff]})
    return df_pro


# Profit for the two models by a range of cutoff:
df_profit = pd.concat([profit(j) for j in np.arange(0.01, 0.3, 0.005)], ignore_index=True)

# Compare profit by line graph:

plt.plot("Cutoff", "Profit_None", data=df_profit, label="Default", lw=2)
plt.plot("Cutoff", "Profit_Tuned", data=df_profit, label="Tuned", lw=2)
plt.title("Figure 2: Profit by Default and Tuned Random Forest", fontsize=13)
plt.xlabel("Cutoff")
plt.ylabel("Profit")
plt.yticks(fontsize=12)
plt.xticks(fontsize=12)
plt.legend(fontsize=8)
plt.show()

# AUC of the default vs the tuned Random Forest (about 2.618% higher for the tuned model):
from sklearn.metrics import roc_auc_score

print(roc_auc_score(y_test, pd_ran))
print(roc_auc_score(y_test, pd_best))

# Ratio of maximum profits for the two models:
max_pro_tuned = np.max(df_profit['Profit_Tuned'])
max_pro_default = np.max(df_profit['Profit_None'])
gap = max_pro_tuned / max_pro_default
print(gap)

```

# Remove Noisy Features with RFE


```{python, python.reticulate = FALSE}

# ================================
#  Expansion: Remove noisy features
# ================================

# Average cross-validated AUC for a given number of selected features:
from sklearn.feature_selection import RFE
from sklearn.pipeline import Pipeline
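# RFE repeatedly drops the least important features (ranked by the Random Forest's
# feature importances) until n_features remain; wrapping it in a Pipeline re-runs
# the selection inside every CV fold.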

def average_roc(n_features):
    rfe = RFE(estimator=RandomForestClassifier(random_state=29), n_features_to_select=n_features)
    model = RandomForestClassifier(random_state=29)
    pipeline = Pipeline(steps=[('s', rfe), ('m', model)])
    n_scores = cross_val_score(pipeline, X_test, y_test, scoring='roc_auc', cv=cv, n_jobs=-1)
    return np.mean(n_scores)

my_range = np.arange(5, 40, 1)
avg_auc = []

for j in my_range:
    avg_auc.append(average_roc(j))

df_fet = pd.DataFrame({"n_features": my_range, "auc": avg_auc})
df_max = df_fet[df_fet['auc'] == np.max(df_fet['auc'])]

plt.plot("n_features", "auc", data=df_fet)
plt.scatter('n_features', 'auc', data=df_max, s=80, label=None, color='r')
plt.xlabel("Number of selected features")
plt.ylabel("AUC")
plt.show()

# Refit with 14 features, the count that gave the best cross-validated AUC above:
rfe = RFE(estimator=RandomForestClassifier(random_state=29), n_features_to_select=14)
model = RandomForestClassifier(random_state=29)
pipeline = Pipeline(steps=[('s', rfe), ('m', model)])
pipeline.fit(X_train, y_train)
pd_fet = pipeline.predict_proba(X_test)[:, 1]

print(roc_auc_score(y_test, pd_fet))

cutoff_range = np.arange(0.01, 0.3, 0.005)
profit_fet = []

for j in cutoff_range:
    profit3 = profit_by_cutoff(cutoff=j, pred_prob=pd_fet)
    profit_fet.append(profit3)

df3 = pd.DataFrame({"Cutoff": cutoff_range, "Profit_Fet": profit_fet})

plt.plot("Cutoff", "Profit_None", data=df_profit, label="Default", lw=2)
plt.plot("Cutoff", "Profit_Tuned", data=df_profit, label="Tuned", lw=2)
plt.plot("Cutoff", "Profit_Fet", data=df3, label="RFE (14 features)", lw=2)
plt.title("Figure 3: Profit by Cutoff for the Three Random Forest Variants", fontsize=13)
plt.xlabel("Cutoff")
plt.ylabel("Profit")
plt.yticks(fontsize=12)
plt.xticks(fontsize=12)
plt.legend(fontsize=8)
plt.show()

```


# Profit Comparison

![](C:/Users/Admin/Documents/profit2.jpg)


```{python, python.reticulate = FALSE}

# ================================================
#   Verification: CatBoost vs Random Forest
# ================================================
# Average profit at a given cutoff, computed over 10 random seeds,
# for CatBoostClassifier and RandomForestClassifier:

from catboost import CatBoostClassifier

def average_pro(cutoff):
    n_times = 10
    randomSeeds = np.arange(1, n_times + 1, 1)
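    # Refit each model with 10 different random seeds and average the resulting
    # profits, so the comparison is not driven by a single lucky initialization.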

    pro_cat = []
    pro_ran = []

    for j in randomSeeds:
        # For CatBoostClassifier:
        cat = CatBoostClassifier(random_state=j, verbose=False)
        cat.fit(X_train, y_train)
        pd_cat = cat.predict_proba(X_test)[:, 1]
        profit_cat = profit_by_cutoff(cutoff=cutoff, pred_prob=pd_cat)
        pro_cat.append(profit_cat)
        # For RandomForestClassifier:
        ran = RandomForestClassifier(random_state=j)
        ran.fit(X_train, y_train)
        pd_ran = ran.predict_proba(X_test)[:, 1]
        profit_ran = profit_by_cutoff(cutoff=cutoff, pred_prob=pd_ran)
        pro_ran.append(profit_ran)

    df_result = pd.DataFrame({"AvgProCat": [np.mean(pro_cat)],
                              "AvgProRan": [np.mean(pro_ran)],
                              "Cutoff": [cutoff]})

    return df_result

# Average profit over the range of cutoffs:
df_avgPro = pd.concat([average_pro(cutoff=i) for i in cutoff_range], ignore_index=True)

# Result:

plt.plot("Cutoff", "AvgProCat", data=df_avgPro, label="CatBoost", lw=2)
plt.plot("Cutoff", "AvgProRan", data=df_avgPro, label="RandomForest", lw=2)
plt.title("Figure 4: Average Profit by Cutoff (CatBoost vs Random Forest)", fontsize=13)
plt.xlabel("Cutoff")
plt.ylabel("Profit")
plt.yticks(fontsize=12)
plt.xticks(fontsize=12)
plt.legend(fontsize=8)
plt.show()

```






















