Motivations

Tinh chỉnh tham số để tìm tham số tối ưu cho mô hình là công việc tốn thời gian và nặng nhọc. Bayesian Optimization là một cách tiếp cận hiệu quả để tinh chỉnh tham số cho các mô hình Machine Learning.

Data used and results

Trước hết, huấn luyện và đánh giá một loạt Machine Learning Classifiers. Kết quả cho thấy Recall trung bình của XGBoost (n_splits=3, n_repeats=4) là 0.7176:

# Silence FutureWarning messages so they do not clutter the output.
import warnings

warnings.simplefilter(action="ignore", category=FutureWarning)

# Read hmeq.csv and turn its categorical columns into dummy columns.
import pandas as pd

df = pd.get_dummies(
    pd.read_csv("http://www.creditriskanalytics.net/uploads/1/9/5/1/19511601/hmeq.csv")
)

# Fill in missing values with MissForest. References:
#   https://academic.oup.com/bioinformatics/article/28/1/112/219101
#   https://academic.oup.com/aje/article/179/6/764/107562
#   https://github.com/epsilon-machine/missingpy
# NOTE(review): the frame handed to the imputer still contains the target
# column "BAD" - confirm that using it during imputation is intended.
from missingpy import MissForest

imputer = MissForest()

# Wrap the imputer output in a DataFrame that reuses the column labels of df.
df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

# Separate the target column "BAD" from the features.
Y = df_imputed["BAD"]
X = df_imputed.drop(columns=["BAD"])

# Rescale every feature to the 0-1 range.
# NOTE(review): the scaler is fitted on all rows, i.e. before any
# cross-validation or train/test split - confirm this is acceptable.
from sklearn.preprocessing import MinMaxScaler

scaler_01 = MinMaxScaler()
X_scaler = scaler_01.fit_transform(X)

# Candidate classifiers from scikit-learn:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import BaggingClassifier
from sklearn.neural_network import MLPClassifier

# LightGBM, CatBoost and XGBoost:
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

# Instantiate every estimator with its default settings:
ran = RandomForestClassifier()
gbm = LGBMClassifier()
log = LogisticRegression()
gbc = GradientBoostingClassifier()
xgb = XGBClassifier()
ext = ExtraTreesClassifier()
ada = AdaBoostClassifier()
gnb = GaussianNB()
bag = BaggingClassifier()
nnn = MLPClassifier()
cat = CatBoostClassifier()

# List of classifiers:
models = [ran, gbm, log, gbc, xgb, ext, ada, gnb, bag, nnn, cat]

# Repeated stratified K-fold: 3 folds x 4 repeats = 12 scores per metric.
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score  # still used by the tuning code further down
from sklearn.model_selection import cross_validate
cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=4, random_state=29)

recall_mean = []
recall_sd = []
auc_mean = []
auc_sd = []

import numpy as np

for mod in models:
    # A single cross_validate call scores both metrics on the same fitted
    # models, so every estimator is trained once per fold instead of twice
    # (the previous code ran cross_val_score separately for each metric).
    scores = cross_validate(mod, X_scaler, Y,
                            scoring=["recall", "roc_auc"],
                            cv=cv, n_jobs=-1, verbose=0)
    recall = scores["test_recall"]
    auc = scores["test_roc_auc"]
    # Recall metric:
    recall_mean.append(recall.mean())
    recall_sd.append(np.std(recall))
    # AUC metric:
    auc_mean.append(auc.mean())
    auc_sd.append(np.std(auc))

# One row per classifier, labelled with its class name:
df_results = pd.DataFrame({"Classifier": [type(m).__name__ for m in models],
                           "Recall_mean": recall_mean,
                           "Recall_sd": recall_sd,
                           "AUC_mean": auc_mean,
                           "AUC_sd": auc_sd})

# Report results:
print(df_results)

Chúng ta có thể tinh chỉnh tham số theo Bayesian Optimization để Recall tăng lên 0.7400 (mức tăng tương đối khoảng 3% - khá là khiêm tốn) như sau:

# Building blocks for the Bayesian search (scikit-optimize):
from skopt.space import Integer
from skopt.space import Real
from skopt.space import Categorical
from skopt.utils import use_named_args
from skopt import gp_minimize

# Space of parameters. The order of the dimensions matters: result_opt.x
# lists the best values in exactly this order.
search_space = [
    Categorical(["binary:logistic"], name="objective"),
    Integer(2, 20, name="max_depth"),
    Integer(2, 20, name="min_child_weight"),
    Integer(10, 1000, name="n_estimators"),
    # XGBoost documents the valid range of the step-size shrinkage as
    # [0, 1]; the former upper bound of 100 let the optimizer spend
    # evaluations on out-of-range values.
    Real(1e-3, 1.0, "log-uniform", name="learning_rate"),
    # NOTE(review): XGBoost documents "eta" as an alias of "learning_rate".
    # The dimension is kept so that the positions in result_opt.x stay the
    # same for code that reads them by index; consider dropping it.
    Real(1e-2, 1.0, "log-uniform", name="eta"),
    Real(0.05, 0.8, name="gamma"),
    Real(0.1, 0.9, name="subsample"),
    Real(0.5, 1, name="colsample_bytree"),
]

# Define the function used to evaluate a given configuration:
@use_named_args(search_space)
def evaluate_model(**params):
    """Objective for gp_minimize: negated mean cross-validated recall.

    The keyword arguments are the hyper-parameters named by the dimensions
    of ``search_space``. They are applied to a fresh XGBClassifier, which is
    scored on ``X_scaler`` / ``Y`` with the module-level ``cv`` splitter.

    Returns:
        The mean recall over all folds multiplied by -1, so that minimizing
        this function maximizes recall.
    """
    candidate = XGBClassifier().set_params(**params)
    fold_recalls = cross_val_score(candidate, X_scaler, Y,
                                   scoring="recall", cv=cv, n_jobs=-1)
    return -fold_recalls.mean()

# Run the Gaussian-process based minimization of the objective:
result_opt = gp_minimize(evaluate_model, search_space, random_state=29, verbose=True)

# The objective is the negated recall, so flip the sign back before reporting:
best_recall = -result_opt.fun
print("Best Recall: %.3f" % best_recall)
print("Best Parameters: %s" % result_opt.x)

Huấn luyện lại XGBoost với tham số tối ưu tìm được. Recall trên test data là 0.7580:

# Rebuild XGBoost from the best configuration found by the search.
# result_opt.x is ordered like search_space, hence the positional indexing.
best_param = result_opt.x
best_xgb = XGBClassifier(objective=str(best_param[0]),
                         max_depth=int(best_param[1]),
                         min_child_weight=int(best_param[2]),
                         n_estimators=int(best_param[3]),
                         learning_rate=float(best_param[4]),
                         eta=float(best_param[5]),
                         gamma=float(best_param[6]),
                         subsample=float(best_param[7]),
                         colsample_bytree=float(best_param[8]),
                         random_state=29,
                         n_jobs=-1)

# Hold out 30% of the rows for testing.
# NOTE(review): these rows were also part of the data used during the
# hyper-parameter search (it cross-validated on all of X_scaler / Y), so the
# recall below is not an independent estimate - confirm this is acceptable.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_scaler, Y, test_size=0.3, random_state=1)

# Train best XGBClassifier and predict the hold-out rows:
best_xgb.fit(X_train, y_train)
pred = best_xgb.predict(X_test)

# Recall on the hold-out set. The value is printed explicitly because a
# bare expression statement displays nothing when the code runs as a script.
from sklearn.metrics import recall_score
print("Test Recall: %.4f" % recall_score(y_test, pred))

# Confusion matrix on the hold-out set:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, pred)
print(cm)
---
title: 'Bayesian Optimization for searching optimal Recall for XGBoost Classifier (Python)'
author: 'Author: Nguyen Chi Dung'
subtitle: "Python Machine Learning Series"
output:
  html_document: 
    code_download: true
    # code_folding: hide
    highlight: zenburn
    # number_sections: yes
    theme: "flatly"
    toc: TRUE
    toc_float: TRUE
---

```{r setup,include=FALSE}
# Global knitr chunk options: show the code, hide warnings and messages,
# cache chunk results, and do not evaluate any chunk (eval = FALSE).
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE, cache = TRUE, eval = FALSE)

```


# Motivations

Tinh chỉnh tham số để tìm tham số tối ưu cho mô hình là công việc tốn thời gian và nặng nhọc. Bayesian Optimization là một cách tiếp cận hiệu quả để tinh chỉnh tham số cho các mô hình Machine Learning. 


# Data used and results

Trước hết, huấn luyện và đánh giá một loạt Machine Learning Classifiers. Kết quả cho thấy Recall trung bình của XGBoost (n_splits=3, n_repeats=4) là 0.7176: 


```{python, eval=FALSE}
# Silence FutureWarning messages so they do not clutter the output.
import warnings

warnings.simplefilter(action="ignore", category=FutureWarning)

# Read hmeq.csv and turn its categorical columns into dummy columns.
import pandas as pd

df = pd.get_dummies(
    pd.read_csv("http://www.creditriskanalytics.net/uploads/1/9/5/1/19511601/hmeq.csv")
)

# Fill in missing values with MissForest. References:
#   https://academic.oup.com/bioinformatics/article/28/1/112/219101
#   https://academic.oup.com/aje/article/179/6/764/107562
#   https://github.com/epsilon-machine/missingpy
# NOTE(review): the frame handed to the imputer still contains the target
# column "BAD" - confirm that using it during imputation is intended.
from missingpy import MissForest

imputer = MissForest()

# Wrap the imputer output in a DataFrame that reuses the column labels of df.
df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

# Separate the target column "BAD" from the features.
Y = df_imputed["BAD"]
X = df_imputed.drop(columns=["BAD"])

# Rescale every feature to the 0-1 range.
# NOTE(review): the scaler is fitted on all rows, i.e. before any
# cross-validation or train/test split - confirm this is acceptable.
from sklearn.preprocessing import MinMaxScaler

scaler_01 = MinMaxScaler()
X_scaler = scaler_01.fit_transform(X)

# Candidate classifiers from scikit-learn:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import BaggingClassifier
from sklearn.neural_network import MLPClassifier

# LightGBM, CatBoost and XGBoost:
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

# Instantiate every estimator with its default settings:
ran = RandomForestClassifier()
gbm = LGBMClassifier()
log = LogisticRegression()
gbc = GradientBoostingClassifier()
xgb = XGBClassifier()
ext = ExtraTreesClassifier()
ada = AdaBoostClassifier()
gnb = GaussianNB()
bag = BaggingClassifier()
nnn = MLPClassifier()
cat = CatBoostClassifier()

# List of classifiers:
models = [ran, gbm, log, gbc, xgb, ext, ada, gnb, bag, nnn, cat]

# Repeated stratified K-fold: 3 folds x 4 repeats = 12 scores per metric.
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score  # still used by the tuning code further down
from sklearn.model_selection import cross_validate
cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=4, random_state=29)

recall_mean = []
recall_sd = []
auc_mean = []
auc_sd = []

import numpy as np

for mod in models:
    # A single cross_validate call scores both metrics on the same fitted
    # models, so every estimator is trained once per fold instead of twice
    # (the previous code ran cross_val_score separately for each metric).
    scores = cross_validate(mod, X_scaler, Y,
                            scoring=["recall", "roc_auc"],
                            cv=cv, n_jobs=-1, verbose=0)
    recall = scores["test_recall"]
    auc = scores["test_roc_auc"]
    # Recall metric:
    recall_mean.append(recall.mean())
    recall_sd.append(np.std(recall))
    # AUC metric:
    auc_mean.append(auc.mean())
    auc_sd.append(np.std(auc))

# One row per classifier, labelled with its class name:
df_results = pd.DataFrame({"Classifier": [type(m).__name__ for m in models],
                           "Recall_mean": recall_mean,
                           "Recall_sd": recall_sd,
                           "AUC_mean": auc_mean,
                           "AUC_sd": auc_sd})

# Report results:
print(df_results)
```


Chúng ta có thể tinh chỉnh tham số theo Bayesian Optimization để Recall tăng lên 0.7400 (mức tăng tương đối khoảng 3% - khá là khiêm tốn) như sau: 

```{python}

# Building blocks for the Bayesian search (scikit-optimize):
from skopt.space import Integer
from skopt.space import Real
from skopt.space import Categorical
from skopt.utils import use_named_args
from skopt import gp_minimize

# Space of parameters. The order of the dimensions matters: result_opt.x
# lists the best values in exactly this order.
search_space = [
    Categorical(["binary:logistic"], name="objective"),
    Integer(2, 20, name="max_depth"),
    Integer(2, 20, name="min_child_weight"),
    Integer(10, 1000, name="n_estimators"),
    # XGBoost documents the valid range of the step-size shrinkage as
    # [0, 1]; the former upper bound of 100 let the optimizer spend
    # evaluations on out-of-range values.
    Real(1e-3, 1.0, "log-uniform", name="learning_rate"),
    # NOTE(review): XGBoost documents "eta" as an alias of "learning_rate".
    # The dimension is kept so that the positions in result_opt.x stay the
    # same for code that reads them by index; consider dropping it.
    Real(1e-2, 1.0, "log-uniform", name="eta"),
    Real(0.05, 0.8, name="gamma"),
    Real(0.1, 0.9, name="subsample"),
    Real(0.5, 1, name="colsample_bytree"),
]

# Define the function used to evaluate a given configuration:
@use_named_args(search_space)
def evaluate_model(**params):
    """Objective for gp_minimize: negated mean cross-validated recall.

    The keyword arguments are the hyper-parameters named by the dimensions
    of ``search_space``. They are applied to a fresh XGBClassifier, which is
    scored on ``X_scaler`` / ``Y`` with the module-level ``cv`` splitter.

    Returns:
        The mean recall over all folds multiplied by -1, so that minimizing
        this function maximizes recall.
    """
    candidate = XGBClassifier().set_params(**params)
    fold_recalls = cross_val_score(candidate, X_scaler, Y,
                                   scoring="recall", cv=cv, n_jobs=-1)
    return -fold_recalls.mean()

# Run the Gaussian-process based minimization of the objective:
result_opt = gp_minimize(evaluate_model, search_space, random_state=29, verbose=True)

# The objective is the negated recall, so flip the sign back before reporting:
best_recall = -result_opt.fun
print("Best Recall: %.3f" % best_recall)
print("Best Parameters: %s" % result_opt.x)
```

Huấn luyện lại XGBoost với tham số tối ưu tìm được. Recall trên test data là 0.7580: 


```{python}
# Rebuild XGBoost from the best configuration found by the search.
# result_opt.x is ordered like search_space, hence the positional indexing.
best_param = result_opt.x
best_xgb = XGBClassifier(objective=str(best_param[0]),
                         max_depth=int(best_param[1]),
                         min_child_weight=int(best_param[2]),
                         n_estimators=int(best_param[3]),
                         learning_rate=float(best_param[4]),
                         eta=float(best_param[5]),
                         gamma=float(best_param[6]),
                         subsample=float(best_param[7]),
                         colsample_bytree=float(best_param[8]),
                         random_state=29,
                         n_jobs=-1)

# Hold out 30% of the rows for testing.
# NOTE(review): these rows were also part of the data used during the
# hyper-parameter search (it cross-validated on all of X_scaler / Y), so the
# recall below is not an independent estimate - confirm this is acceptable.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_scaler, Y, test_size=0.3, random_state=1)

# Train best XGBClassifier and predict the hold-out rows:
best_xgb.fit(X_train, y_train)
pred = best_xgb.predict(X_test)

# Recall on the hold-out set. The value is printed explicitly because a
# bare expression statement displays nothing when the code runs as a script.
from sklearn.metrics import recall_score
print("Test Recall: %.4f" % recall_score(y_test, pred))

# Confusion matrix on the hold-out set:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, pred)
print(cm)
```



