This analysis develops and compares three credit risk models for predicting loan defaults: logistic regression, random forest, and XGBoost.
The models are evaluated using the Area Under the ROC Curve (AUC) and other classification metrics. We examine whether modern machine-learning approaches outperform traditional logistic regression, and we establish an optimal probability threshold for credit decisions.
Exercise 1.2
Load the credit risk application data, run the necessary preprocessing steps, and train logistic regression, random forest, and XGBoost models. Compare the models' performance using AUC.
Key Questions:
- Do the machine-learning models (random forest, XGBoost) outperform logistic regression in out-of-sample (test) AUC?
- How much does each model overfit, measured as the gap between training and test AUC?
- What probability threshold should be used for credit decisions?
# Data manipulation
library(dplyr)
library(tidyr)
library(tibble)
# Visualization
library(ggplot2)
library(gridExtra)
library(corrplot)
library(scales)
# Machine learning
library(caret)
library(glmnet)
library(randomForest)
library(xgboost)
# Model evaluation
library(pROC)
library(ROCR)
library(MLmetrics)
# Data preprocessing
library(recipes)
# Load credit default data
credit_data <- read.csv("/Users/ashutoshverma/Downloads/Projects/Applied Finance/Assignment Risk/HW/default_data.csv")
# Display data structure
str(credit_data)
## 'data.frame': 690 obs. of 16 variables:
## $ A1 : chr "b" "a" "a" "b" ...
## $ A2 : chr "30.83" "58.67" "24.50" "27.83" ...
## $ A3 : num 0 4.46 0.5 1.54 5.62 ...
## $ A4 : chr "u" "u" "u" "u" ...
## $ A5 : chr "g" "g" "g" "g" ...
## $ A6 : chr "w" "q" "q" "w" ...
## $ A7 : chr "v" "h" "h" "v" ...
## $ A8 : num 1.25 3.04 1.5 3.75 1.71 ...
## $ A9 : chr "t" "t" "t" "t" ...
## $ A10 : chr "t" "t" "f" "t" ...
## $ A11 : int 1 6 0 5 0 0 0 0 0 0 ...
## $ A12 : chr "f" "f" "f" "t" ...
## $ A13 : chr "g" "g" "g" "g" ...
## $ A14 : chr "00202" "00043" "00280" "00100" ...
## $ A15 : int 0 560 824 3 0 0 31285 1349 314 1442 ...
## $ default: int 0 0 0 0 0 0 0 0 0 0 ...
# Data dimensions
dim(credit_data)
## [1] 690 16
# Missing values summary
missing_summary <- data.frame(
Variable = names(credit_data),
Missing_Count = sapply(credit_data, function(x) sum(is.na(x))),
Missing_Percent = round(sapply(credit_data, function(x) sum(is.na(x)) / length(x) * 100), 2)
)
missing_summary <- missing_summary[missing_summary$Missing_Count > 0, ]
if (nrow(missing_summary) > 0) {
print(missing_summary)
}
# Data types
data_types <- data.frame(
Variable = names(credit_data),
Type = sapply(credit_data, class)
)
print(data_types)
## Variable Type
## A1 A1 character
## A2 A2 character
## A3 A3 numeric
## A4 A4 character
## A5 A5 character
## A6 A6 character
## A7 A7 character
## A8 A8 numeric
## A9 A9 character
## A10 A10 character
## A11 A11 integer
## A12 A12 character
## A13 A13 character
## A14 A14 character
## A15 A15 integer
## default default integer
# Identify target variable (assuming it's named 'default' or similar)
target_col <- names(credit_data)[grep("default|target|y|status|label",
names(credit_data), ignore.case = TRUE)]
if (length(target_col) == 0) {
target_col <- names(credit_data)[ncol(credit_data)]
} else if (length(target_col) > 1) {
target_col <- target_col[1]
}
# Create a copy with standardized target name
credit_data$default <- as.numeric(credit_data[[target_col]])
# Ensure binary encoding
unique_vals <- unique(credit_data$default)
if (length(unique_vals) > 2) {
credit_data$default <- as.integer(credit_data$default == max(credit_data$default))
}
# Distribution
default_table <- table(credit_data$default)
default_prop <- prop.table(default_table)
# Ensure we have both classes
if (length(default_table) < 2) {
stop("Error: Target variable must have at least 2 classes")
}
# Visualization
par(mfrow = c(1, 2))
barplot(default_table,
main = "Default Distribution (Count)",
xlab = "Default Status",
ylab = "Frequency",
col = c("lightgreen", "salmon"),
names.arg = c("Non-Default", "Default"))
barplot(default_prop * 100,
main = "Default Distribution (Percentage)",
xlab = "Default Status",
ylab = "Percentage",
col = c("lightgreen", "salmon"),
names.arg = c("Non-Default", "Default"))par(mfrow = c(1, 1))
# Create summary table
target_summary <- data.frame(
Class = c("Non-Default", "Default", "Imbalance Ratio"),
Count = c(default_table[1], default_table[2], NA),
Percentage = c(
sprintf("%.2f%%", default_prop[1] * 100),
sprintf("%.2f%%", default_prop[2] * 100),
sprintf("%.2f:1", default_table[1] / default_table[2])
)
)
print(target_summary)
## Class Count Percentage
## 0 Non-Default 307 44.49%
## 1 Default 383 55.51%
## Imbalance Ratio NA 0.80:1
# Numeric variables summary
numeric_vars <- sapply(credit_data, is.numeric)
summary(credit_data[, numeric_vars])
## A3 A8 A11 A15
## Min. : 0.000 Min. : 0.000 Min. : 0.0 Min. : 0.0
## 1st Qu.: 1.000 1st Qu.: 0.165 1st Qu.: 0.0 1st Qu.: 0.0
## Median : 2.750 Median : 1.000 Median : 0.0 Median : 5.0
## Mean : 4.759 Mean : 2.223 Mean : 2.4 Mean : 1017.4
## 3rd Qu.: 7.207 3rd Qu.: 2.625 3rd Qu.: 3.0 3rd Qu.: 395.5
## Max. :28.000 Max. :28.500 Max. :67.0 Max. :100000.0
## default
## Min. :0.0000
## 1st Qu.:0.0000
## Median :1.0000
## Mean :0.5551
## 3rd Qu.:1.0000
## Max. :1.0000
# Correlation matrix for numeric variables
numeric_data <- credit_data[, sapply(credit_data, is.numeric)]
# Remove target from correlation if it exists
if ("default" %in% names(numeric_data)) {
numeric_data_no_target <- numeric_data[, names(numeric_data) != "default"]
} else {
numeric_data_no_target <- numeric_data
}
if (ncol(numeric_data_no_target) > 1) {
cor_matrix <- cor(numeric_data_no_target, use = "complete.obs")
corrplot(cor_matrix,
method = "color",
type = "upper",
tl.col = "black",
tl.srt = 45,
addCoef.col = "black",
number.cex = 0.7,
title = "Feature Correlation Matrix",
mar = c(0, 0, 2, 0))
}
# Set seed for reproducibility
set.seed(123)
# Create train-test split (70-30)
train_index <- createDataPartition(credit_data$default, p = 0.7, list = FALSE)
train_data <- credit_data[train_index, ]
test_data <- credit_data[-train_index, ]
# Create split summary table
split_summary <- data.frame(
Dataset = c("Training", "Test"),
`Sample Size` = c(nrow(train_data), nrow(test_data)),
`Default Rate (%)` = c(
round(mean(train_data$default) * 100, 2),
round(mean(test_data$default) * 100, 2)
),
check.names = FALSE
)
print(split_summary)
## Dataset Sample Size Default Rate (%)
## 1 Training 483 55.07
## 2 Test 207 56.52
# Identify feature columns (exclude target)
feature_cols <- names(credit_data)[names(credit_data) != "default" &
names(credit_data) != target_col]
# Ensure we have features
if (length(feature_cols) == 0) {
stop("No feature columns found. Please check the data structure.")
}
# Handle categorical variables if present
categorical_cols <- names(train_data)[sapply(train_data, function(x)
is.character(x) | is.factor(x))]
categorical_cols <- categorical_cols[categorical_cols != "default" &
categorical_cols != target_col]
if (length(categorical_cols) > 0) {
# Convert to factors and then to numeric
for (col in categorical_cols) {
train_data[[col]] <- as.numeric(as.factor(train_data[[col]]))
test_data[[col]] <- as.numeric(as.factor(test_data[[col]]))
}
}
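One caveat with this encoding: converting the training and test sets with as.factor() separately can assign different integer codes to the same category if the two sets contain different levels. A minimal sketch of a training-anchored alternative (an assumption about safer practice, not what the chunk above does):
# Alternative sketch (would replace the loop above): fix the level ordering on
# the training data and reuse it for the test set, so identical categories
# always map to identical integer codes.
encode_with_train_levels <- function(train_x, test_x) {
  lev <- sort(unique(as.character(train_x)))
  list(train = as.numeric(factor(as.character(train_x), levels = lev)),
       test  = as.numeric(factor(as.character(test_x),  levels = lev)))
}
# e.g. enc <- encode_with_train_levels(train_data[[col]], test_data[[col]])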
# Feature scaling (standardization)
scale_params <- list()
for (col in feature_cols) {
if (is.numeric(train_data[[col]])) {
mean_val <- mean(train_data[[col]], na.rm = TRUE)
sd_val <- sd(train_data[[col]], na.rm = TRUE)
# Avoid division by zero
if (sd_val == 0 || is.na(sd_val)) {
sd_val <- 1
}
scale_params[[col]] <- list(mean = mean_val, sd = sd_val)
# Scale both train and test using training parameters
train_data[[col]] <- (train_data[[col]] - scale_params[[col]]$mean) /
scale_params[[col]]$sd
test_data[[col]] <- (test_data[[col]] - scale_params[[col]]$mean) /
scale_params[[col]]$sd
}
}
# Handle any remaining missing values
train_data[is.na(train_data)] <- 0
test_data[is.na(test_data)] <- 0
# Prepare feature matrices
X_train <- as.matrix(train_data[, feature_cols])
y_train <- as.numeric(train_data$default)
X_test <- as.matrix(test_data[, feature_cols])
y_test <- as.numeric(test_data$default)
# Ensure binary encoding (0 and 1)
y_train <- as.integer(y_train > 0)
y_test <- as.integer(y_test > 0)
# Create validation summary table
preprocessing_summary <- data.frame(
Metric = c("Number of Features", "Training Samples", "Test Samples",
"Training Default Rate (%)", "Test Default Rate (%)"),
Value = c(ncol(X_train), nrow(X_train), nrow(X_test),
round(mean(y_train) * 100, 2), round(mean(y_test) * 100, 2))
)
print(preprocessing_summary)
## Metric Value
## 1 Number of Features 15.00
## 2 Training Samples 483.00
## 3 Test Samples 207.00
## 4 Training Default Rate (%) 55.07
## 5 Test Default Rate (%) 56.52
# Train logistic regression
logit_model <- glm(default ~ .,
data = train_data[, c(feature_cols, "default")],
family = binomial(link = "logit"))
# Model summary
summary(logit_model)
##
## Call:
## glm(formula = default ~ ., family = binomial(link = "logit"),
## data = train_data[, c(feature_cols, "default")])
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.381710 0.179729 2.124 0.033686 *
## A1 -0.030061 0.152967 -0.197 0.844206
## A2 0.073523 0.155724 0.472 0.636831
## A3 -0.003596 0.153996 -0.023 0.981372
## A4 1.297203 0.346195 3.747 0.000179 ***
## A5 -0.860543 0.365080 -2.357 0.018417 *
## A6 -0.267528 0.145272 -1.842 0.065540 .
## A7 -0.062500 0.152071 -0.411 0.681080
## A8 -0.298186 0.175250 -1.701 0.088852 .
## A9 -1.782323 0.187052 -9.529 < 2e-16 ***
## A10 -0.185154 0.198763 -0.932 0.351580
## A11 -0.602810 0.332517 -1.813 0.069852 .
## A12 0.138786 0.148916 0.932 0.351349
## A13 0.060530 0.150690 0.402 0.687918
## A14 0.079072 0.155931 0.507 0.612090
## A15 -1.377839 0.504121 -2.733 0.006273 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 664.60 on 482 degrees of freedom
## Residual deviance: 330.59 on 467 degrees of freedom
## AIC: 362.59
##
## Number of Fisher Scoring iterations: 6
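The later chunks store and reuse predicted probabilities via predictions_list (for example, predictions_list$logit_test in the ROC comparison), but the chunk that creates the list and the logistic-regression predictions is not shown. A minimal sketch of the assumed step:
# Assumed step: initialize the prediction store and add logistic regression
# predicted default probabilities for the training and test sets
predictions_list <- list()
predictions_list$logit_train <- predict(logit_model,
                                        newdata = train_data[, feature_cols],
                                        type = "response")
predictions_list$logit_test <- predict(logit_model,
                                       newdata = test_data[, feature_cols],
                                       type = "response")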
# Convert target to factor for classification
train_data$default_factor <- as.factor(train_data$default)
test_data$default_factor <- as.factor(test_data$default)
# Train Random Forest
set.seed(123)
rf_model <- randomForest(
x = X_train,
y = train_data$default_factor,
ntree = 500,
mtry = floor(sqrt(ncol(X_train))),
importance = TRUE,
nodesize = 10
)
# Model summary
print(rf_model)
##
## Call:
## randomForest(x = X_train, y = train_data$default_factor, ntree = 500, mtry = floor(sqrt(ncol(X_train))), nodesize = 10, importance = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 13.25%
## Confusion matrix:
## 0 1 class.error
## 0 189 28 0.1290323
## 1 36 230 0.1353383
# Predictions (probability of class 1)
rf_pred_train <- predict(rf_model, newdata = X_train, type = "prob")[, 2]
rf_pred_test <- predict(rf_model, newdata = X_test, type = "prob")[, 2]
# Store predictions
predictions_list$rf_train <- rf_pred_train
predictions_list$rf_test <- rf_pred_test
# Feature importance
importance_df <- data.frame(
Feature = rownames(importance(rf_model)),
MeanDecreaseGini = importance(rf_model)[, "MeanDecreaseGini"]
)
importance_df <- importance_df[order(-importance_df$MeanDecreaseGini), ]
print(head(importance_df, 10))
## Feature MeanDecreaseGini
## A9 A9 60.051774
## A8 A8 19.400619
## A11 A11 19.225536
## A15 A15 14.086413
## A3 A3 11.883894
## A14 A14 10.480419
## A10 A10 10.425688
## A6 A6 9.076357
## A2 A2 8.917857
## A7 A7 4.006698
# Plot feature importance
top_features <- head(importance_df, 15)
ggplot(top_features, aes(x = reorder(Feature, MeanDecreaseGini),
y = MeanDecreaseGini)) +
geom_bar(stat = "identity", fill = "steelblue") +
coord_flip() +
labs(title = "Random Forest: Top 15 Feature Importance",
x = "Feature",
y = "Mean Decrease in Gini") +
theme_minimal() +
  theme(text = element_text(size = 11))
# Prepare data for XGBoost
dtrain <- xgb.DMatrix(data = X_train, label = y_train)
dtest <- xgb.DMatrix(data = X_test, label = y_test)
# Set parameters
xgb_params <- list(
objective = "binary:logistic",
eval_metric = "auc",
eta = 0.1,
max_depth = 6,
subsample = 0.8,
colsample_bytree = 0.8,
min_child_weight = 1
)
# Train XGBoost with cross-validation to find optimal rounds
set.seed(123)
xgb_cv <- xgb.cv(
params = xgb_params,
data = dtrain,
nrounds = 200,
nfold = 5,
early_stopping_rounds = 20,
verbose = 0
)
# Get optimal number of rounds
best_iteration <- xgb_cv$best_iteration
# Handle edge case where best_iteration might be NULL
if (is.null(best_iteration) || length(best_iteration) == 0) {
best_iteration <- xgb_cv$niter
}
# Train final model
set.seed(123)
xgb_model <- xgb.train(
params = xgb_params,
data = dtrain,
nrounds = best_iteration,
verbose = 0
)
# Predictions
xgb_pred_train <- predict(xgb_model, dtrain)
xgb_pred_test <- predict(xgb_model, dtest)
# Store predictions
predictions_list$xgb_train <- xgb_pred_train
predictions_list$xgb_test <- xgb_pred_test
# Feature importance
xgb_importance <- xgb.importance(
feature_names = colnames(X_train),
model = xgb_model
)
print(head(xgb_importance, 10))
## Feature Gain Cover Frequency
## <char> <num> <num> <num>
## 1: A9 0.43530843 0.14021329 0.03630363
## 2: A14 0.09256480 0.11832459 0.13531353
## 3: A3 0.08526433 0.14370353 0.16666667
## 4: A15 0.08204162 0.11490088 0.11056106
## 5: A8 0.06840300 0.10603257 0.12458746
## 6: A2 0.06709319 0.11495080 0.16584158
## 7: A6 0.04920152 0.09034140 0.10313531
## 8: A11 0.03804709 0.05002304 0.02640264
## 9: A4 0.02348613 0.03604928 0.02475248
## 10: A10 0.01779498 0.01950123 0.01650165
# Plot XGBoost feature importance (only if we have importance scores)
if (nrow(xgb_importance) > 0) {
xgb.plot.importance(importance_matrix = head(xgb_importance, 15),
main = "XGBoost: Top 15 Feature Importance")
}
# Calculate ROC curves
roc_logit <- roc(y_test, predictions_list$logit_test, quiet = TRUE)
roc_rf <- roc(y_test, predictions_list$rf_test, quiet = TRUE)
roc_xgb <- roc(y_test, predictions_list$xgb_test, quiet = TRUE)
# Plot ROC curves
plot(roc_logit, col = "red", lwd = 2, main = "ROC Curves Comparison")
plot(roc_rf, col = "blue", lwd = 2, add = TRUE)
plot(roc_xgb, col = "green3", lwd = 2, add = TRUE)
abline(a = 0, b = 1, lty = 2, col = "gray")
legend("bottomright",
legend = c(
sprintf("Logistic Regression (AUC = %.4f)", auc(roc_logit)),
sprintf("Random Forest (AUC = %.4f)", auc(roc_rf)),
sprintf("XGBoost (AUC = %.4f)", auc(roc_xgb))
),
col = c("red", "blue", "green3"),
lwd = 2,
bty = "n")# Create AUC comparison table
auc_results <- data.frame(
Model = c("Logistic Regression", "Random Forest", "XGBoost"),
`Training AUC` = c(
auc(roc(y_train, predictions_list$logit_train, quiet = TRUE)),
auc(roc(y_train, predictions_list$rf_train, quiet = TRUE)),
auc(roc(y_train, predictions_list$xgb_train, quiet = TRUE))
),
`Test AUC` = c(
auc(roc_logit),
auc(roc_rf),
auc(roc_xgb)
),
check.names = FALSE
)
# Calculate overfitting metric
auc_results$`Overfitting Gap` <- auc_results$`Training AUC` - auc_results$`Test AUC`
# Round values
auc_results$`Training AUC` <- round(auc_results$`Training AUC`, 4)
auc_results$`Test AUC` <- round(auc_results$`Test AUC`, 4)
auc_results$`Overfitting Gap` <- round(auc_results$`Overfitting Gap`, 4)
print(auc_results)
## Model Training AUC Test AUC Overfitting Gap
## 1 Logistic Regression 0.9231 0.9556 -0.0324
## 2 Random Forest 0.9856 0.9645 0.0211
## 3 XGBoost 0.9996 0.9538 0.0459
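The test-set AUCs are close; to judge whether the Random Forest's edge over logistic regression is more than sampling noise, the two correlated ROC curves can be compared with a DeLong test from pROC. A brief sketch (not part of the original analysis):
# Sketch: paired DeLong test for two ROC curves evaluated on the same test set
roc.test(roc_rf, roc_logit, method = "delong")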
# Visualize AUC comparison
auc_long <- auc_results %>%
select(Model, `Training AUC`, `Test AUC`) %>%
pivot_longer(cols = c(`Training AUC`, `Test AUC`),
names_to = "Dataset",
values_to = "AUC")
ggplot(auc_long, aes(x = Model, y = AUC, fill = Dataset)) +
geom_bar(stat = "identity", position = "dodge") +
geom_text(aes(label = sprintf("%.4f", AUC)),
position = position_dodge(width = 0.9),
vjust = -0.5, size = 3.5) +
scale_fill_manual(values = c("Training AUC" = "lightblue",
"Test AUC" = "steelblue")) +
ylim(0, 1) +
labs(title = "Model Performance Comparison: AUC",
y = "Area Under the Curve (AUC)",
x = "") +
theme_minimal() +
theme(legend.position = "top",
        axis.text.x = element_text(angle = 0, hjust = 0.5))
# Select best model based on test AUC
best_model_idx <- which.max(auc_results$`Test AUC`)
best_model_name <- auc_results$Model[best_model_idx]
# Get predictions from best model
if (best_model_name == "Logistic Regression") {
best_predictions <- predictions_list$logit_test
} else if (best_model_name == "Random Forest") {
best_predictions <- predictions_list$rf_test
} else {
best_predictions <- predictions_list$xgb_test
}
# Calculate metrics across different thresholds
thresholds <- seq(0.1, 0.9, by = 0.05)
threshold_metrics <- data.frame(
Threshold = thresholds,
Accuracy = numeric(length(thresholds)),
Precision = numeric(length(thresholds)),
Recall = numeric(length(thresholds)),
F1_Score = numeric(length(thresholds)),
Specificity = numeric(length(thresholds)),
FPR = numeric(length(thresholds))
)
for (i in seq_along(thresholds)) {
thresh <- thresholds[i]
pred_class <- ifelse(best_predictions >= thresh, 1, 0)
# Confusion matrix elements
tp <- sum(pred_class == 1 & y_test == 1)
tn <- sum(pred_class == 0 & y_test == 0)
fp <- sum(pred_class == 1 & y_test == 0)
fn <- sum(pred_class == 0 & y_test == 1)
# Calculate metrics
threshold_metrics$Accuracy[i] <- (tp + tn) / (tp + tn + fp + fn)
threshold_metrics$Precision[i] <- ifelse(tp + fp > 0, tp / (tp + fp), 0)
threshold_metrics$Recall[i] <- ifelse(tp + fn > 0, tp / (tp + fn), 0)
threshold_metrics$F1_Score[i] <- ifelse(
threshold_metrics$Precision[i] + threshold_metrics$Recall[i] > 0,
2 * (threshold_metrics$Precision[i] * threshold_metrics$Recall[i]) /
(threshold_metrics$Precision[i] + threshold_metrics$Recall[i]),
0
)
threshold_metrics$Specificity[i] <- ifelse(tn + fp > 0, tn / (tn + fp), 0)
threshold_metrics$FPR[i] <- ifelse(fp + tn > 0, fp / (fp + tn), 0)
}
# Find optimal thresholds
optimal_f1_idx <- which.max(threshold_metrics$F1_Score)
optimal_accuracy_idx <- which.max(threshold_metrics$Accuracy)
# Youden's Index: Sensitivity + Specificity - 1
threshold_metrics$Youden <- threshold_metrics$Recall + threshold_metrics$Specificity - 1
optimal_youden_idx <- which.max(threshold_metrics$Youden)
# Create summary table
threshold_summary <- data.frame(
Method = c("Max F1-Score", "Max Accuracy", "Youden's Index"),
Threshold = c(thresholds[optimal_f1_idx],
thresholds[optimal_accuracy_idx],
thresholds[optimal_youden_idx]),
Value = c(threshold_metrics$F1_Score[optimal_f1_idx],
threshold_metrics$Accuracy[optimal_accuracy_idx],
threshold_metrics$Youden[optimal_youden_idx])
)
threshold_summary$Threshold <- round(threshold_summary$Threshold, 2)
threshold_summary$Value <- round(threshold_summary$Value, 4)
print(threshold_summary)
## Method Threshold Value
## 1 Max F1-Score 0.45 0.9212
## 2 Max Accuracy 0.45 0.9082
## 3 Youden's Index 0.45 0.8043
# Plot threshold metrics
threshold_long <- threshold_metrics %>%
select(Threshold, Accuracy, Precision, Recall, F1_Score, Specificity) %>%
pivot_longer(cols = -Threshold, names_to = "Metric", values_to = "Value")
ggplot(threshold_long, aes(x = Threshold, y = Value, color = Metric)) +
geom_line(lwd = 1.2) +
geom_vline(xintercept = thresholds[optimal_f1_idx],
linetype = "dashed", color = "red", alpha = 0.5) +
geom_vline(xintercept = thresholds[optimal_youden_idx],
linetype = "dashed", color = "blue", alpha = 0.5) +
labs(title = "Performance Metrics Across Probability Thresholds",
subtitle = sprintf("Red line = Optimal F1 (%.2f) | Blue line = Optimal Youden (%.2f)",
thresholds[optimal_f1_idx], thresholds[optimal_youden_idx]),
x = "Probability Threshold",
y = "Metric Value") +
theme_minimal() +
theme(legend.position = "right") +
  scale_color_brewer(palette = "Set1")
# Define costs (these should be adjusted based on business context)
cost_fp <- 100 # Cost of False Positive (rejecting good customer)
cost_fn <- 1000 # Cost of False Negative (accepting bad customer who defaults)
# Calculate expected cost for each threshold
threshold_metrics$Expected_Cost <- numeric(length(thresholds))
for (i in seq_along(thresholds)) {
thresh <- thresholds[i]
pred_class <- ifelse(best_predictions >= thresh, 1, 0)
fp <- sum(pred_class == 1 & y_test == 0)
fn <- sum(pred_class == 0 & y_test == 1)
threshold_metrics$Expected_Cost[i] <- (fp * cost_fp + fn * cost_fn) / length(y_test)
}
# Find cost-optimal threshold
optimal_cost_idx <- which.min(threshold_metrics$Expected_Cost)
# Create cost summary
cost_summary <- data.frame(
Parameter = c("Cost of False Positive ($)",
"Cost of False Negative ($)",
"Optimal Threshold",
"Expected Cost per Loan ($)"),
Value = c(cost_fp, cost_fn,
thresholds[optimal_cost_idx],
round(threshold_metrics$Expected_Cost[optimal_cost_idx], 2))
)
print(cost_summary)
## Parameter Value
## 1 Cost of False Positive ($) 100.00
## 2 Cost of False Negative ($) 1000.00
## 3 Optimal Threshold 0.30
## 4 Expected Cost per Loan ($) 10.14
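Since the two costs are placeholders, a quick sensitivity check shows how the cost-optimal threshold shifts as the assumed FN-to-FP cost ratio changes; a minimal sketch with illustrative ratios:
# Sketch: cost-optimal threshold for several assumed FN:FP cost ratios
# (FP cost normalized to 1; the ratios below are illustrative, not calibrated)
cost_ratios <- c(2, 5, 10, 20)
sapply(cost_ratios, function(r) {
  exp_cost <- sapply(thresholds, function(th) {
    pred <- as.integer(best_predictions >= th)
    fp <- sum(pred == 1 & y_test == 0)
    fn <- sum(pred == 0 & y_test == 1)
    (fp + fn * r) / length(y_test)
  })
  thresholds[which.min(exp_cost)]
})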
# Plot cost analysis
ggplot(threshold_metrics, aes(x = Threshold, y = Expected_Cost)) +
geom_line(color = "darkred", lwd = 1.2) +
geom_point(data = threshold_metrics[optimal_cost_idx, ],
aes(x = Threshold, y = Expected_Cost),
color = "red", size = 4) +
geom_vline(xintercept = thresholds[optimal_cost_idx],
linetype = "dashed", color = "red", alpha = 0.5) +
labs(title = "Expected Cost vs. Probability Threshold",
subtitle = sprintf("Optimal threshold: %.2f (minimizes expected cost)",
thresholds[optimal_cost_idx]),
x = "Probability Threshold",
y = "Expected Cost per Loan ($)") +
  theme_minimal()
# Select recommended threshold based on business context
# Using Youden's Index as it balances sensitivity and specificity
recommended_threshold <- thresholds[optimal_youden_idx]
# Calculate final performance metrics
pred_class_final <- ifelse(best_predictions >= recommended_threshold, 1, 0)
# Confusion matrix
conf_matrix <- table(Predicted = pred_class_final, Actual = y_test)
print(conf_matrix)
## Actual
## Predicted 0 1
## 0 77 6
## 1 13 111
# Calculate final metrics
tp <- conf_matrix[2, 2]
tn <- conf_matrix[1, 1]
fp <- conf_matrix[2, 1]
fn <- conf_matrix[1, 2]
final_metrics <- data.frame(
Metric = c("Accuracy", "Precision", "Recall", "Specificity",
"F1-Score", "False Positive Rate"),
Value = c(
(tp + tn) / sum(conf_matrix),
tp / (tp + fp),
tp / (tp + fn),
tn / (tn + fp),
2 * tp / (2 * tp + fp + fn),
fp / (fp + tn)
)
)
final_metrics$Value <- round(final_metrics$Value, 4)
print(final_metrics)
## Metric Value
## 1 Accuracy 0.9082
## 2 Precision 0.8952
## 3 Recall 0.9487
## 4 Specificity 0.8556
## 5 F1-Score 0.9212
## 6 False Positive Rate 0.1444
# Rank models by test AUC
auc_ranked <- auc_results[order(-auc_results$`Test AUC`), ]
print(auc_ranked)
## Model Training AUC Test AUC Overfitting Gap
## 2 Random Forest 0.9856 0.9645 0.0211
## 1 Logistic Regression 0.9231 0.9556 -0.0324
## 3 XGBoost 0.9996 0.9538 0.0459
Analysis:
Random Forest achieves the highest test AUC (0.9645), followed by logistic regression (0.9556) and XGBoost (0.9538). Logistic regression shows no overfitting (its test AUC is slightly above its training AUC), while XGBoost has the largest train-test gap (0.0459), indicating the most overfitting despite near-perfect training performance. The modern ML approaches therefore offer only a modest AUC gain over logistic regression on this data set.
Recommended Threshold: 0.45
Justification: The 0.45 cutoff maximizes Youden's Index (and also the F1-score and accuracy on the threshold grid considered), balancing sensitivity and specificity. At this threshold the selected model attains 90.8% accuracy, 94.9% recall, 85.6% specificity, and an F1-score of 0.92 on the test set. If the cost-based view is preferred (false negatives assumed ten times as costly as false positives), the lower threshold of 0.30 minimizes expected cost per loan, flagging more applicants as risky at the expense of rejecting more good customers.
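As a hypothetical illustration of how the recommended cutoff would be applied when scoring new applications (the probabilities below are made-up examples, not model output):
# Hypothetical example: applying the 0.45 cutoff to assumed predicted
# default probabilities for three new applicants
new_probs <- c(0.12, 0.47, 0.81)
data.frame(Predicted_Default_Prob = new_probs,
           Decision = ifelse(new_probs >= 0.45, "Decline", "Approve"))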
sessionInfo()
## R version 4.4.1 (2024-06-14)
## Platform: aarch64-apple-darwin20
## Running under: macOS 15.6.1
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.0
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: Europe/Warsaw
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] recipes_1.3.1 MLmetrics_1.1.3 ROCR_1.0-11
## [4] pROC_1.18.5 xgboost_3.1.2.1 randomForest_4.7-1.2
## [7] glmnet_4.1-8 Matrix_1.7-0 caret_7.0-1
## [10] lattice_0.22-6 scales_1.4.0 corrplot_0.95
## [13] gridExtra_2.3 ggplot2_4.0.1 tibble_3.2.1
## [16] tidyr_1.3.1 dplyr_1.1.4
##
## loaded via a namespace (and not attached):
## [1] shape_1.4.6.1 gtable_0.3.6 xfun_0.54
## [4] bslib_0.9.0 vctrs_0.6.5 tools_4.4.1
## [7] generics_0.1.4 stats4_4.4.1 parallel_4.4.1
## [10] ModelMetrics_1.2.2.2 pkgconfig_2.0.3 data.table_1.17.0
## [13] RColorBrewer_1.1-3 S7_0.2.0 lifecycle_1.0.4
## [16] compiler_4.4.1 farver_2.1.2 stringr_1.5.1
## [19] codetools_0.2-20 htmltools_0.5.8.1 class_7.3-22
## [22] sass_0.4.10 yaml_2.3.10 prodlim_2025.04.28
## [25] pillar_1.10.2 jquerylib_0.1.4 MASS_7.3-60.2
## [28] cachem_1.1.0 gower_1.0.2 iterators_1.0.14
## [31] rpart_4.1.23 foreach_1.5.2 nlme_3.1-164
## [34] parallelly_1.44.0 lava_1.8.2 tidyselect_1.2.1
## [37] digest_0.6.37 stringi_1.8.4 future_1.49.0
## [40] reshape2_1.4.4 purrr_1.2.0 listenv_0.9.1
## [43] labeling_0.4.3 splines_4.4.1 fastmap_1.2.0
## [46] grid_4.4.1 cli_3.6.5 magrittr_2.0.4
## [49] survival_3.6-4 future.apply_1.20.1 withr_3.0.2
## [52] lubridate_1.9.4 timechange_0.3.0 rmarkdown_2.30
## [55] globals_0.18.0 nnet_7.3-19 timeDate_4041.110
## [58] evaluate_1.0.5 knitr_1.49 hardhat_1.4.2
## [61] rlang_1.1.6 Rcpp_1.1.1 glue_1.8.0
## [64] ipred_0.9-15 rstudioapi_0.17.1 jsonlite_2.0.0
## [67] R6_2.6.1 plyr_1.8.9
Report generated on 2026-01-30