One hot encoded categorical variables (same set as XGBoost
Model)
library(data.table)
library(keras3)
library(pROC)
OUTPUT_DIR <- "/Users/amalianimeskern/Library/CloudStorage/OneDrive-ErasmusUniversityRotterdam/Freddie Mac Data"
set.seed(123)
tensorflow::tf$random$set_seed(123L)
# --- load data ---
train_xgb <- readRDS(file.path(OUTPUT_DIR, "train_xgb.rds"))
valid_xgb <- readRDS(file.path(OUTPUT_DIR, "valid_xgb.rds"))
test_xgb <- readRDS(file.path(OUTPUT_DIR, "test_xgb.rds"))
target <- "default_next_12m"
drop_cols <- c(
"loan_sequence_number",
"monthly_reporting_period",
target
)
feature_cols <- setdiff(names(train_xgb), drop_cols)
# --- preprocessing: mean imputation and standardisation ---
train_means <- sapply(train_xgb[, ..feature_cols], function(x) mean(x, na.rm = TRUE))
train_sds <- sapply(train_xgb[, ..feature_cols], function(x) sd(x, na.rm = TRUE))
train_sds[train_sds < 1e-8] <- 1
prepare_nn_data <- function(dt, feature_cols, target, train_means, train_sds) {
X <- as.matrix(dt[, ..feature_cols])
y <- dt[[target]]
for (j in seq_along(feature_cols)) {
na_idx <- is.na(X[, j])
if (any(na_idx)) {
X[na_idx, j] <- train_means[j]
}
}
X <- sweep(X, 2, train_means, "-")
X <- sweep(X, 2, train_sds, "/")
list(X = X, y = y)
}
train_prep <- prepare_nn_data(train_xgb, feature_cols, target, train_means, train_sds)
valid_prep <- prepare_nn_data(valid_xgb, feature_cols, target, train_means, train_sds)
test_prep <- prepare_nn_data(test_xgb, feature_cols, target, train_means, train_sds)
rm(train_xgb, valid_xgb, test_xgb)
gc()
n_features <- length(feature_cols)
# --- model-building function ---
build_nn_model <- function(n_features,
units1,
units2,
units3,
dropout1,
dropout2,
dropout3,
learning_rate) {
model <- keras_model_sequential(input_shape = n_features) |>
layer_batch_normalization() |>
layer_dense(units = units1, activation = "relu") |>
layer_batch_normalization() |>
layer_dropout(rate = dropout1) |>
layer_dense(units = units2, activation = "relu") |>
layer_batch_normalization() |>
layer_dropout(rate = dropout2) |>
layer_dense(units = units3, activation = "relu") |>
layer_batch_normalization() |>
layer_dropout(rate = dropout3) |>
layer_dense(units = 1, activation = "sigmoid")
model |> compile(
optimizer = optimizer_adam(learning_rate = learning_rate),
loss = "binary_crossentropy",
metrics = list(metric_auc(name = "auc"))
)
model
}
# --- hyperparameter grid ---
param_grid <- expand.grid(
architecture = c("64-32-16", "128-64-32"),
dropout_base = c(0.2, 0.3),
learning_rate = c(1e-3, 5e-4),
batch_size = c(4096),
stringsAsFactors = FALSE
)
print(param_grid)
# --- hyperparameter tuning ---
results <- data.table()
best_auc <- -Inf
best_model <- NULL
best_config <- NULL
for (i in seq_len(nrow(param_grid))) {
config <- param_grid[i, ]
arch <- as.integer(strsplit(config$architecture, "-")[[1]])
dropout1 <- config$dropout_base
dropout2 <- max(config$dropout_base - 0.1, 0)
dropout3 <- max(config$dropout_base - 0.2, 0)
model <- build_nn_model(
n_features = n_features,
units1 = arch[1],
units2 = arch[2],
units3 = arch[3],
dropout1 = dropout1,
dropout2 = dropout2,
dropout3 = dropout3,
learning_rate = config$learning_rate
)
callbacks <- list(
callback_early_stopping(
monitor = "val_auc",
mode = "max",
patience = 5,
min_delta = 0.0005,
restore_best_weights = TRUE
),
callback_reduce_lr_on_plateau(
monitor = "val_auc",
mode = "max",
factor = 0.5,
patience = 2
)
)
history <- model |> fit(
x = train_prep$X,
y = train_prep$y,
validation_data = list(valid_prep$X, valid_prep$y),
epochs = 50,
batch_size = config$batch_size,
callbacks = callbacks,
verbose = 1
)
valid_probs <- as.numeric(model |> predict(valid_prep$X, verbose = 0))
valid_auc <- as.numeric(auc(roc(valid_prep$y, valid_probs, quiet = TRUE)))
result_i <- data.table(
model_id = i,
architecture = config$architecture,
dropout_base = config$dropout_base,
dropout1 = dropout1,
dropout2 = dropout2,
dropout3 = dropout3,
learning_rate = config$learning_rate,
batch_size = config$batch_size,
validation_auc = valid_auc
)
results <- rbind(results, result_i)
print(result_i)
if (valid_auc > best_auc) {
best_auc <- valid_auc
best_model <- model
best_config <- result_i
}
gc()
}
# --- tuning results ---
results <- results[order(-validation_auc)]
print(results)
print(best_config)
# --- test set evaluation ---
nn_probs <- as.numeric(best_model |> predict(test_prep$X, verbose = 0))
nn_labels <- test_prep$y
roc_nn <- roc(nn_labels, nn_probs, quiet = TRUE)
auc_nn <- as.numeric(auc(roc_nn))
ci_nn <- ci.auc(roc_nn)
gini_nn <- 2 * auc_nn - 1
print(round(auc_nn, 4))
print(round(ci_nn[c(1, 3)], 4))
print(round(gini_nn, 4))
# --- save ---
saveRDS(results, file.path(OUTPUT_DIR, "nn_tuning_results.rds"))
saveRDS(best_config, file.path(OUTPUT_DIR, "nn_best_config.rds"))
saveRDS(nn_probs, file.path(OUTPUT_DIR, "nn_test_probs_tuned.rds"))
best_model |> save_model(file.path(OUTPUT_DIR, "nn_model_tuned.keras"))
# --- test set evaluation ---
nn_probs <- as.numeric(best_model |> predict(test_prep$X, verbose = 0))
nn_labels <- test_prep$y
roc_nn <- roc(nn_labels, nn_probs, quiet = TRUE)
auc_nn <- as.numeric(auc(roc_nn))
ci_nn <- ci.auc(roc_nn)
gini_nn <- 2 * auc_nn - 1
print(round(auc_nn, 4))
print
# --- save ---
saveRDS(results, file.path(OUTPUT_DIR, "nn_tuning_results.rds"))
saveRDS(best_config, file.path(OUTPUT_DIR, "nn_best_config.rds"))
saveRDS(nn_probs, file.path(OUTPUT_DIR, "nn_test_probs_tuned.rds"))
best_model |> save_model(file.path(OUTPUT_DIR, "nn_model_tuned.keras"))