XGBoost Model
library(data.table)
library(xgboost)
library(mlr)
library(mlrMBO)
library(smoof)
library(pROC)
# --- Load XGBoost data ---
# Folder that holds the input .rds files read below.
OUTPUT_DIR <- "/Users/amalianimeskern/Library/CloudStorage/OneDrive-ErasmusUniversityRotterdam/Freddie Mac Data"
# Training and validation tables, each stored as an .rds file in OUTPUT_DIR.
train_xgb <- readRDS(file = file.path(OUTPUT_DIR, "train_xgb.rds"))
valid_xgb <- readRDS(file = file.path(OUTPUT_DIR, "valid_xgb.rds"))
# --- Xgb matrices ---
# Every column except the loan id, the reporting period and the target is a
# model feature. The feature list is derived from the training table and
# reused for the validation table so both matrices have the same columns in
# the same order.
xgb_features <- setdiff(
  names(train_xgb),
  c("loan_sequence_number", "monthly_reporting_period", "default_next_12m")
)
# Training matrix, labelled with the default_next_12m target.
dtrain <- xgb.DMatrix(
  data = as.matrix(train_xgb[, xgb_features, with = FALSE]),
  label = train_xgb[["default_next_12m"]]
)
# Validation matrix, built the same way from the validation table.
dvalid <- xgb.DMatrix(
  data = as.matrix(valid_xgb[, xgb_features, with = FALSE]),
  label = valid_xgb[["default_next_12m"]]
)
# --- Objective function ---
# Objective for the hyperparameter search. For one candidate `x` (eta,
# max_depth, subsample, colsample_bytree) it trains a booster on `dtrain`
# with early stopping monitored on `dvalid`, and returns the negated
# best_score so that minimising the objective maximises the AUC.
obj_fun <- makeSingleObjectiveFunction(
  name = "xgb_auc",
  minimize = TRUE,
  # Search space: one entry per tuned hyperparameter.
  par.set = makeParamSet(
    makeNumericParam("eta", lower = 0.01, upper = 0.3),
    makeIntegerParam("max_depth", lower = 3, upper = 8),
    makeNumericParam("subsample", lower = 0.5, upper = 1.0),
    makeNumericParam("colsample_bytree", lower = 0.5, upper = 1.0)
  ),
  fn = function(x) {
    booster <- xgb.train(
      params = list(
        objective = "binary:logistic",
        eval_metric = "auc",
        eta = x["eta"],
        # max_depth is coerced to integer before being handed to xgboost.
        max_depth = as.integer(x["max_depth"]),
        subsample = x["subsample"],
        colsample_bytree = x["colsample_bytree"],
        nthread = 4
      ),
      data = dtrain,
      # Upper bound on rounds; early stopping ends training after 50 rounds
      # without improvement on the watchlist entry.
      nrounds = 5000,
      early_stopping_rounds = 50,
      watchlist = list(valid = dvalid),
      verbose = 0
    )
    # Sign flipped because the objective is declared with minimize = TRUE.
    -booster$best_score
  }
)
# --- Bayesian optimisation ---
# Seed the RNG before anything random happens. generateDesign() draws the
# initial design at random, so seeding only after it left the starting
# points, and therefore the whole search, irreproducible between runs.
set.seed(123)
ctrl <- makeMBOControl()
# Stop after 20 model-based iterations (on top of the initial design).
ctrl <- setMBOControlTermination(ctrl, iters = 20)
# Initial design: 10 points drawn over the objective's parameter set.
design <- generateDesign(n = 10, par.set = getParamSet(obj_fun))
bayes_result <- mbo(obj_fun, design = design, control = ctrl, show.info = TRUE)
# --- Best hyperparameters ---
# `x` holds the best parameter setting found and `y` the matching objective
# value. The objective is the negated AUC, so the sign is flipped back to
# report it on the AUC scale.
best_params <- bayes_result[["x"]]
best_auc <- -1 * bayes_result[["y"]]
# Auto-print both values.
best_params
best_auc
# --- Retrain final model with best hyperparameters ---
# Same fixed settings as in the objective function, with the four tuned
# values taken from best_params.
final_params <- c(
  list(
    objective = "binary:logistic",
    eval_metric = "auc"
  ),
  best_params[c("eta", "max_depth", "subsample", "colsample_bytree")],
  list(nthread = 4)
)
# Refit on the training matrix with early stopping on the validation matrix,
# this time printing the evaluation log (verbose = 1).
final_model <- xgb.train(
  params = final_params,
  data = dtrain,
  nrounds = 5000,
  early_stopping_rounds = 50,
  watchlist = list(valid = dvalid),
  verbose = 1
)
# --- Validation AUC ---
# Score the validation matrix with the final model.
valid_preds <- predict(final_model, dvalid)
# Pin the class order and the comparison direction explicitly. With pROC's
# default (direction = "auto") the direction is chosen from the data, which
# can silently flip the curve and report an AUC that is never below 0.5.
# The target is used as the binary:logistic label, so it is taken to be
# coded 0 (control) / 1 (case); higher predictions mean "case".
# roc/auc are namespace-qualified because mlr also exports an `auc` object.
valid_auc <- as.numeric(
  pROC::auc(
    pROC::roc(
      response = valid_xgb$default_next_12m,
      predictor = valid_preds,
      levels = c(0, 1),
      direction = "<",
      quiet = TRUE
    )
  )
)
# One-row summary: AUC rounded to 6 decimals and the early-stopped iteration.
data.table(validation_auc = round(valid_auc, 6),
           best_nrounds = final_model$best_iteration)
# --- Save ---
# Persist the fitted model, the tuned hyperparameters and the full
# optimisation result as .rds files in OUTPUT_DIR, in that order.
invisible(Map(
  saveRDS,
  list(final_model, best_params, bayes_result),
  file.path(
    OUTPUT_DIR,
    c("xgb_model.rds", "best_hyperparams.rds", "bayesopt_results.rds")
  )
))
XGBoost Results
library(data.table)
library(xgboost)
# Folder that holds the saved .rds artefacts read below.
OUTPUT_DIR <- "/Users/amalianimeskern/Library/CloudStorage/OneDrive-ErasmusUniversityRotterdam/Freddie Mac Data"
# Restore the fitted model, the tuned hyperparameters and the optimisation
# result from their .rds files.
final_model <- readRDS(file = file.path(OUTPUT_DIR, "xgb_model.rds"))
best_params <- readRDS(file = file.path(OUTPUT_DIR, "best_hyperparams.rds"))
bayes_result <- readRDS(file = file.path(OUTPUT_DIR, "bayesopt_results.rds"))
# Auto-print the tuned hyperparameters restored from best_hyperparams.rds.
# The `##` lines that follow appear to be the recorded console output.
best_params
## $eta
## [1] 0.01905691
##
## $max_depth
## [1] 6
##
## $subsample
## [1] 0.8491454
##
## $colsample_bytree
## [1] 0.9303162
# One-row summary of the saved model. Both columns are read from final_model
# so the AUC and the round count describe the same fit. The earlier version
# took the AUC from -bayes_result$y, which comes from a training run inside
# mbo(), and paired it with the round count of the separately retrained
# final_model.
data.table(
  validation_auc = round(final_model$best_score, 6),
  best_nrounds = final_model$best_iteration
)
# NOTE(review): the recorded output below was produced with -bayes_result$y
# as the AUC; re-run to refresh it.
## validation_auc best_nrounds
## <num> <num>
## 1: 0.848728 1099