# XGBoost Model

library(data.table)
library(xgboost)
library(mlr)
library(mlrMBO)
library(smoof)
library(pROC)


OUTPUT_DIR <- "/Users/amalianimeskern/Library/CloudStorage/OneDrive-ErasmusUniversityRotterdam/Freddie Mac Data"

# --- Load XGBoost data ---
# Training and validation tables are read from OUTPUT_DIR.
train_xgb <- readRDS(file = file.path(OUTPUT_DIR, "train_xgb.rds"))
valid_xgb <- readRDS(file = file.path(OUTPUT_DIR, "valid_xgb.rds"))

# --- Xgb matrices ---
# Every column except the two identifier columns and the target is a feature.
xgb_features <- setdiff(
  names(train_xgb),
  c("loan_sequence_number", "monthly_reporting_period", "default_next_12m")
)

# The feature list derived from the training table is applied to both tables,
# so the two matrices share the same columns in the same order.
dtrain <- xgb.DMatrix(
  data  = as.matrix(train_xgb[, xgb_features, with = FALSE]),
  label = train_xgb[["default_next_12m"]]
)
dvalid <- xgb.DMatrix(
  data  = as.matrix(valid_xgb[, xgb_features, with = FALSE]),
  label = valid_xgb[["default_next_12m"]]
)

# --- Objective function ---
# Objective for the hyperparameter search: fit XGBoost with one candidate
# configuration and score it on the validation set.
#   x       : candidate point; its elements are looked up by name ("eta",
#             "max_depth", "subsample", "colsample_bytree").
#   returns : the negated best validation AUC, because the objective is
#             declared with minimize = TRUE while a higher AUC is better.
obj_fun <- makeSingleObjectiveFunction(
  name     = "xgb_auc",
  minimize = TRUE,
  par.set  = makeParamSet(
    makeNumericParam("eta",              lower = 0.01, upper = 0.3),
    makeIntegerParam("max_depth",        lower = 3,    upper = 8),
    makeNumericParam("subsample",        lower = 0.5,  upper = 1.0),
    makeNumericParam("colsample_bytree", lower = 0.5,  upper = 1.0)
  ),
  fn = function(x) {
    candidate <- list(
      objective        = "binary:logistic",
      eval_metric      = "auc",
      eta              = x["eta"],
      max_depth        = as.integer(x["max_depth"]),
      subsample        = x["subsample"],
      colsample_bytree = x["colsample_bytree"],
      nthread          = 4
    )

    # nrounds is only an upper bound: training stops once the validation
    # metric has not improved for 50 consecutive rounds.
    booster <- xgb.train(
      params                = candidate,
      data                  = dtrain,
      watchlist             = list(valid = dvalid),
      nrounds               = 5000,
      early_stopping_rounds = 50,
      verbose               = 0
    )

    -booster$best_score
  }
)

# --- Bayesian optimisation ---
# Seed the RNG before any random step. generateDesign() draws the initial
# design at random, so seeding only before mbo() would leave the 10 starting
# points (and therefore the whole search) different on every run.
# Note: results will differ from runs made with the old seed placement, but
# they are now repeatable.
set.seed(123)

# 20 model-based iterations on top of a 10-point initial design.
ctrl <- makeMBOControl()
ctrl <- setMBOControlTermination(ctrl, iters = 20)
design <- generateDesign(n = 10, par.set = getParamSet(obj_fun))

bayes_result <- mbo(obj_fun, design = design, control = ctrl, show.info = TRUE)

# --- Best hyperparameters ---
# The optimiser minimised the negated AUC, so negate its best objective value
# again to recover the AUC itself.
best_params <- bayes_result[["x"]]
best_auc    <- -1 * bayes_result[["y"]]

# Display both at top level.
best_params
best_auc

# --- Retrain final model with best hyperparameters ---
# Fixed settings wrapped around the four tuned values taken from best_params,
# in the same order as in the search objective.
final_params <- c(
  list(objective = "binary:logistic", eval_metric = "auc"),
  best_params[c("eta", "max_depth", "subsample", "colsample_bytree")],
  list(nthread = 4)
)

# Same training setup as during the search (5000-round cap, early stopping
# after 50 rounds without improvement on the validation set), but with
# per-round progress printed.
final_model <- xgb.train(
  params                = final_params,
  data                  = dtrain,
  watchlist             = list(valid = dvalid),
  nrounds               = 5000,
  early_stopping_rounds = 50,
  verbose               = 1
)

# --- Validation AUC ---
# Score the validation set with the final model and compute its ROC AUC.
valid_preds <- predict(final_model, dvalid)

# levels/direction are fixed explicitly: pROC's default direction = "auto"
# picks whichever orientation gives the larger AUC, so it can never report a
# value below 0.5. With 0 = control, 1 = case and "<" (controls score lower),
# the AUC is the plain ranking AUC.
# Assumes default_next_12m is coded 0/1 -- TODO confirm.
# Calls are namespace-qualified so the result does not depend on pROC being
# attached after mlr, which also exports an object named `auc`.
valid_auc <- as.numeric(
  pROC::auc(
    pROC::roc(
      response  = valid_xgb$default_next_12m,
      predictor = valid_preds,
      levels    = c(0, 1),
      direction = "<",
      quiet     = TRUE
    )
  )
)

data.table(validation_auc = round(valid_auc, 6),
           best_nrounds   = final_model$best_iteration)

# --- Save ---
# Write the fitted model, the chosen hyperparameters and the full optimisation
# result to OUTPUT_DIR, one .rds file each (objects and file names pair up
# position by position). invisible() keeps the list of NULLs returned by Map()
# from being printed.
invisible(Map(
  saveRDS,
  list(final_model, best_params, bayes_result),
  file.path(
    OUTPUT_DIR,
    c("xgb_model.rds", "best_hyperparams.rds", "bayesopt_results.rds")
  )
))

# XGBoost Results

library(data.table)
library(xgboost)

OUTPUT_DIR <- "/Users/amalianimeskern/Library/CloudStorage/OneDrive-ErasmusUniversityRotterdam/Freddie Mac Data"

# Reload the stored artefacts from OUTPUT_DIR: the optimisation result, the
# selected hyperparameters and the fitted model.
bayes_result <- readRDS(file = file.path(OUTPUT_DIR, "bayesopt_results.rds"))
best_params  <- readRDS(file = file.path(OUTPUT_DIR, "best_hyperparams.rds"))
final_model  <- readRDS(file = file.path(OUTPUT_DIR, "xgb_model.rds"))

# Selected hyperparameters (recorded output follows).
best_params
## $eta
## [1] 0.01905691
## 
## $max_depth
## [1] 6
## 
## $subsample
## [1] 0.8491454
## 
## $colsample_bytree
## [1] 0.9303162
# Summary of the saved model.
# validation_auc is read from final_model itself, so it comes from the same
# fit as best_nrounds. The best objective value found during the search comes
# from a separate training run and is reported in its own column instead of
# being labelled as the final model's AUC.
data.table(
  validation_auc    = round(final_model$best_score, 6),
  bayesopt_best_auc = round(-bayes_result$y, 6),
  best_nrounds      = final_model$best_iteration
)
# NOTE: the output below was recorded before this change, when the
# validation_auc column held -bayes_result$y. Re-run to refresh it.
##    validation_auc best_nrounds
##             <num>        <num>
## 1:       0.848728         1099