Data used and results
Trước hết, chúng ta huấn luyện (train) và đánh giá một loạt Machine Learning Classifiers. Kết quả cho thấy XGBoost có Recall trung bình (RepeatedStratifiedKFold với n_splits=3, n_repeats=4) là 0.7176:
# Silence FutureWarning messages (the filter is installed before the heavy
# libraries below are imported, so import-time warnings are hidden too):
import warnings
warnings.simplefilter(action="ignore", category=FutureWarning)
# Load the hmeq.csv data set over HTTP (requires network access):
import pandas as pd
df = pd.read_csv("http://www.creditriskanalytics.net/uploads/1/9/5/1/19511601/hmeq.csv")
# One-hot encode the non-numeric columns:
df = pd.get_dummies(df)
# Impute missing values with MissForest
# (https://academic.oup.com/bioinformatics/article/28/1/112/219101,
# https://academic.oup.com/aje/article/179/6/764/107562,
# https://github.com/epsilon-machine/missingpy):
from missingpy import MissForest
# Seed the imputer with 29, the same seed the CV splitter and the optimizer
# use further down, so that repeated runs yield the same imputed values.
imputer = MissForest(random_state=29)
# The imputer output carries no column labels, so re-attach the original ones
# while wrapping it in a data frame:
df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)
# Separate the features from the target column "BAD":
X = df_imputed.drop(labels=["BAD"], axis=1)
Y = df_imputed["BAD"]
# Rescale every feature to the [0, 1] range:
from sklearn.preprocessing import MinMaxScaler
scaler_01 = MinMaxScaler()
X_scaler = scaler_01.fit_transform(X)
# NOTE(review): the imputer (which also sees the "BAD" column) and the scaler
# are both fitted on the complete data set before any train/validation split,
# so scores computed on X_scaler later may be optimistic. Consider fitting
# them per fold (e.g. inside a sklearn Pipeline) if unbiased estimates matter.
# Classifiers from scikit-learn:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import BaggingClassifier
from sklearn.neural_network import MLPClassifier
# Gradient-boosting libraries (LightGBM, XGBoost and CatBoost):
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
# Instantiate the estimators with default hyper-parameters. Every estimator
# that accepts a seed gets the same one, so the comparison between models can
# be reproduced from run to run (GaussianNB has no random_state parameter).
SEED = 29
ran = RandomForestClassifier(random_state=SEED)
gbm = LGBMClassifier(random_state=SEED)
log = LogisticRegression(random_state=SEED)
gbc = GradientBoostingClassifier(random_state=SEED)
xgb = XGBClassifier(random_state=SEED)
ext = ExtraTreesClassifier(random_state=SEED)
ada = AdaBoostClassifier(random_state=SEED)
gnb = GaussianNB()
bag = BaggingClassifier(random_state=SEED)
nnn = MLPClassifier(random_state=SEED)
cat = CatBoostClassifier(random_state=SEED)
# Classifiers to compare, in the order they will appear in the results table:
models = [ran, gbm, log, gbc, xgb, ext, ada, gnb, bag, nnn, cat]
# Cross-validate every classifier and collect Recall / ROC-AUC statistics:
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
# 3 stratified folds repeated 4 times -> 12 scores per metric and model:
cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=4, random_state=29)
recall_mean = []
recall_sd = []
auc_mean = []
auc_sd = []
import numpy as np
for mod in models:
    # A single cross-validation pass evaluates both metrics on the same
    # fitted models, instead of refitting every fold once per metric.
    scores = cross_validate(mod, X_scaler, Y, scoring=["recall", "roc_auc"], cv=cv, n_jobs=-1)
    recall_scores = scores["test_recall"]
    auc_scores = scores["test_roc_auc"]
    # Recall metric:
    recall_mean.append(recall_scores.mean())
    recall_sd.append(np.std(recall_scores))
    # AUC metric:
    auc_mean.append(auc_scores.mean())
    auc_sd.append(np.std(auc_scores))
# One row per classifier, labelled with its class name:
df_results = pd.DataFrame({
    "Classifier": [type(mod).__name__ for mod in models],
    "Recall_mean": recall_mean,
    "Recall_sd": recall_sd,
    "AUC_mean": auc_mean,
    "AUC_sd": auc_sd,
})
# Report results:
print(df_results)
Chúng ta có thể tinh chỉnh tham số bằng Bayesian Optimization để Recall tăng lên 0.7400 (từ 0.7176, tức mức tăng tương đối khoảng 3% - khá là khiêm tốn) như sau:
# Define the space of hyper-parameters to search:
from skopt.space import Integer
from skopt.space import Real
from skopt.space import Categorical
from skopt.utils import use_named_args
from skopt import gp_minimize
# The optimizer reports its best point as a plain list in this exact order,
# and that list is read by position further down, so do not reorder, add or
# remove dimensions without updating the code that consumes result_opt.x.
search_space = [
    Categorical(["binary:logistic"], name="objective"),
    Integer(2, 20, name="max_depth"),
    Integer(2, 20, name="min_child_weight"),
    Integer(10, 1000, name="n_estimators"),
    # XGBoost documents the step-size shrinkage as lying in [0, 1], so the
    # search is capped at 1.0 rather than 100.
    Real(1e-3, 1.0, "log-uniform", name="learning_rate"),
    # NOTE(review): XGBoost documents "eta" as an alias of "learning_rate";
    # searching both is redundant. The dimension is kept only so that the
    # positional layout of result_opt.x stays unchanged.
    Real(1e-2, 1.0, "log-uniform", name="eta"),
    Real(0.05, 0.8, name="gamma"),
    Real(0.1, 0.9, name="subsample"),
    Real(0.5, 1, name="colsample_bytree"),
]
# Objective function handed to the optimizer:
@use_named_args(search_space)
def evaluate_model(**params):
    """Return the negated mean cross-validated Recall for one configuration.

    The decorator maps a point of ``search_space`` onto keyword arguments
    named after its dimensions; those are applied to a fresh XGBClassifier,
    which is then scored with Recall on ``X_scaler``/``Y`` using the shared
    ``cv`` splitter. The mean is negated because the optimizer minimizes.
    """
    # Same seed as the final model that is rebuilt from the best
    # configuration, so the tuned score refers to a comparable model.
    model = XGBClassifier(random_state=29)
    model.set_params(**params)
    recalls = cross_val_score(model, X_scaler, Y, cv=cv, n_jobs=-1, scoring="recall")
    return -recalls.mean()
# Run the search over search_space, minimizing evaluate_model:
result_opt = gp_minimize(
    func=evaluate_model,
    dimensions=search_space,
    random_state=29,
    verbose=True,
)
# The objective is the negated Recall, so flip the sign back for reporting:
best_recall = -1 * result_opt.fun
print("Best Recall: %.3f" % best_recall)
# Best point found, as a list ordered like search_space:
print("Best Parameters: %s" % result_opt.x)
Huấn luyện lại XGBoost với tham số tối ưu tìm được. Recall trên test data là 0.7580:
# Rebuild XGBoost from the best configuration found by the optimizer.
# result_opt.x is positional: index i holds the value of the i-th dimension
# declared in search_space, cast here to the type the estimator expects.
best_param = result_opt.x
best_xgb = XGBClassifier(
    objective=str(best_param[0]),
    max_depth=int(best_param[1]),
    min_child_weight=int(best_param[2]),
    n_estimators=int(best_param[3]),
    learning_rate=float(best_param[4]),
    eta=float(best_param[5]),
    gamma=float(best_param[6]),
    subsample=float(best_param[7]),
    colsample_bytree=float(best_param[8]),
    random_state=29,
    n_jobs=-1,
)
# Hold out 30% of the rows for testing. The split is stratified on the
# target so both parts keep the same class proportions, in line with the
# stratified folds used during cross-validation.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X_scaler, Y, test_size=0.3, random_state=1, stratify=Y
)
# NOTE(review): the hyper-parameters were tuned by cross-validation on the
# full X_scaler/Y, which includes these test rows, so this hold-out score is
# not a fully independent estimate.
# Train the tuned XGBClassifier and predict the hold-out rows:
best_xgb.fit(X_train, y_train)
pred = best_xgb.predict(X_test)
# Recall on the hold-out set (printed explicitly; a bare expression in the
# middle of a script would be discarded):
from sklearn.metrics import recall_score
test_recall = recall_score(y_test, pred)
print("Test Recall: %.4f" % test_recall)
# Confusion matrix on the hold-out set:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, pred)
print(cm)
