Cross-Validation Simulation Analysis

Setup and Data Generation

set.seed(0326)
library(caret)
library(ggplot2)
n <- 1000  # observations
p <- 20    # predictors
# Binary outcome: P(y = 1) = 1/3, so roughly a 67/33 class imbalance
y <- rbinom(n, 1, prob = 1/3)
# Initialize predictor matrix
X <- matrix(nrow = n, ncol = p)

# Predictors with three tiers of effect size
for (j in 1:p) {
  if (j <= 8) {
    # Moderate signal (X1-X8)
    X[, j] <- ifelse(y == 1, 
                     rnorm(n, mean = 0.35, sd = 1),
                     rnorm(n, mean = -0.18, sd = 1))
  } else if (j <= 12) {
    # Weak signal (X9-X12)
    X[, j] <- ifelse(y == 1, 
                     rnorm(n, mean = 0.20, sd = 1),
                     rnorm(n, mean = -0.10, sd = 1))
  } else {
    # Pure noise (X13-X20)
    X[, j] <- rnorm(n, mean = 0, sd = 1)
  }
}
# Assemble the analysis data frame (outcome as a factor for caret)
colnames(X) <- paste0("X", 1:p)
df <- data.frame(y = factor(y, levels = c(0, 1)), X)
cat("\nProportions:\n", prop.table(table(df$y)), "\n")
## 
## Proportions:
##  0.683 0.317
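
Before modeling, it is worth confirming that the simulated predictors carry the intended signal. A quick diagnostic (hypothetical, not part of the original analysis) is the class-wise mean difference per predictor, which should sit near 0.53 for X1-X8, 0.30 for X9-X12, and 0 for the noise columns:

# Class-1 mean minus class-0 mean for every predictor
signal_check <- sapply(df[, -1], function(x) diff(tapply(x, df$y, mean)))
round(signal_check, 2)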

Full Dataset Model Evaluation

# Fit logistic regression on the full dataset (in-sample evaluation)
model <- glm(y ~ ., data = df, family = "binomial")

# In-sample predicted probabilities, classified at a 0.5 threshold
pred_prob <- predict(model, type = "response")
pred_class <- ifelse(pred_prob > 0.5, 1, 0)

# Confusion matrix (caret treats the first factor level, 0, as the positive class)
cm <- confusionMatrix(factor(pred_class, levels = c(0, 1)), df$y)
print(cm)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 614 104
##          1  69 213
##                                         
##                Accuracy : 0.827         
##                  95% CI : (0.8021, 0.85)
##     No Information Rate : 0.683         
##     P-Value [Acc > NIR] : < 2.2e-16     
##                                         
##                   Kappa : 0.5883        
##                                         
##  Mcnemar's Test P-Value : 0.009739      
##                                         
##             Sensitivity : 0.8990        
##             Specificity : 0.6719        
##          Pos Pred Value : 0.8552        
##          Neg Pred Value : 0.7553        
##              Prevalence : 0.6830        
##          Detection Rate : 0.6140        
##    Detection Prevalence : 0.7180        
##       Balanced Accuracy : 0.7854        
##                                         
##        'Positive' Class : 0             
## 
sensitivity <- cm$byClass["Sensitivity"]
specificity <- cm$byClass["Specificity"]
balanced_accuracy <- (sensitivity + specificity) / 2
cat("\nBalanced Accuracy:", round(balanced_accuracy, 3), "\n")
## 
## Balanced Accuracy: 0.785
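
As a quick cross-check, confusionMatrix() already reports this quantity in its byClass slot, so the manual average can be verified against caret's own value:

# Should equal the manual (sensitivity + specificity) / 2 above
cm$byClass["Balanced Accuracy"]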

Repeated Subsampling CV (1000 iterations)

Each iteration draws 73 of the 1000 observations without replacement (a subsample rather than a bootstrap resample, since replace = FALSE) and runs 5-fold CV within that subsample.

# trainControl is identical in every iteration, so define it once up front
train_control <- trainControl(
  method = "cv",
  number = 5,
  savePredictions = "final",
  classProbs = TRUE,
  summaryFunction = twoClassSummary
)

# Preallocate a list for the per-iteration summaries
# (growing a data frame with rbind() inside a loop is needlessly slow)
sampling_results_list <- vector("list", 1000)

# Repeat the subsampling 1000 times
for (sampling in 1:1000) {
  # Draw 73 of the 1000 observations without replacement
  # (a subsample, not a bootstrap resample)
  sample_indices <- sample(1:nrow(df), size = 73, replace = FALSE)
  df_sample <- df[sample_indices, ]
  
  # caret's classProbs option requires syntactically valid factor level names
  df_sample$y <- factor(df_sample$y, levels = c(0, 1), labels = c("Class0", "Class1"))
  
  # Fit logistic regression with 5-fold CV; with ~58 training rows and 20
  # predictors, glm() may not converge cleanly, hence suppressWarnings()
  cv_model <- suppressWarnings(train(
    y ~ .,
    data = df_sample,
    method = "glm",
    family = "binomial",
    trControl = train_control,
    metric = "ROC"
  ))
  
  # Held-out predictions from each fold (savePredictions = "final")
  cv_predictions <- cv_model$pred
  
  # Compute per-fold metrics
  fold_results <- data.frame()
  
  for (fold in 1:5) {
    fold_data <- cv_predictions[cv_predictions$Resample == paste0("Fold", fold), ]
    
    # Calculate accuracy
    accuracy <- mean(fold_data$pred == fold_data$obs)
    error_rate <- 1 - accuracy
    
    cm_fold <- suppressWarnings(confusionMatrix(fold_data$pred, fold_data$obs))
    sensitivity <- cm_fold$byClass["Sensitivity"]
    specificity <- cm_fold$byClass["Specificity"]
    balanced_acc <- (sensitivity + specificity) / 2
    
    fold_results <- rbind(fold_results, data.frame(
      Fold = fold,
      Accuracy = accuracy,
      Error_Rate = error_rate,
      Sensitivity = sensitivity,
      Specificity = specificity,
      Balanced_Accuracy = balanced_acc
    ))
  }
  
  # Summarize balanced accuracy across the 5 folds
  mean_balanced_acc <- mean(fold_results$Balanced_Accuracy, na.rm = TRUE)
  sd_balanced_acc <- sd(fold_results$Balanced_Accuracy, na.rm = TRUE)
  min_balanced_acc <- min(fold_results$Balanced_Accuracy, na.rm = TRUE)
  max_balanced_acc <- max(fold_results$Balanced_Accuracy, na.rm = TRUE)
  range_balanced_acc <- max_balanced_acc - min_balanced_acc
  
  # Store this iteration's summary
  sampling_results_list[[sampling]] <- data.frame(
    Sampling = sampling,
    Mean_Balanced_Accuracy = mean_balanced_acc,
    SD_Balanced_Accuracy = sd_balanced_acc,
    Min_Balanced_Accuracy = min_balanced_acc,
    Max_Balanced_Accuracy = max_balanced_acc,
    Range_Balanced_Accuracy = range_balanced_acc
  )
}

# Combine the 1000 per-iteration summaries into one data frame
sampling_results <- do.call(rbind, sampling_results_list)
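
Each of the 1000 iterations calls train(), which fits five fold models plus one final model, so the loop fits roughly 6,000 glm models and can take a while. caret will distribute resampling across any registered foreach backend; a minimal sketch using doParallel follows (assuming 4 worker cores; note that parallel execution changes the random-number stream, so results will not exactly reproduce the sequential run):

library(doParallel)

cl <- makePSOCKcluster(4)  # assumed core count; adjust to your machine
registerDoParallel(cl)

# ... re-run the subsampling loop here: train() automatically spreads
# its CV folds across the registered workers ...

stopCluster(cl)
registerDoSEQ()  # restore sequential foreach execution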

Results

Summary Statistics

cat("Overall Average Mean Balanced Accuracy:", 
    round(mean(sampling_results$Mean_Balanced_Accuracy, na.rm = TRUE), 3), "\n")
## Overall Average Mean Balanced Accuracy: 0.667
cat("Overall Average SD Balanced Accuracy:", 
    round(mean(sampling_results$SD_Balanced_Accuracy, na.rm = TRUE), 3), "\n")
## Overall Average SD Balanced Accuracy: 0.127
cat("Overall Average Range Balanced Accuracy:", 
    round(mean(sampling_results$Range_Balanced_Accuracy, na.rm = TRUE), 3), "\n")
## Overall Average Range Balanced Accuracy: 0.314
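
Averages hide the tails; quantiles of the per-iteration range show how often a single small-sample CV run yields wildly discordant fold estimates (a supplementary summary, not part of the original output):

# Quantiles of the fold-to-fold range across the 1000 iterations
quantile(sampling_results$Range_Balanced_Accuracy,
         probs = c(0.05, 0.25, 0.50, 0.75, 0.95), na.rm = TRUE)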

Balanced Accuracy Range Across Iterations

ggplot(sampling_results, aes(x = Sampling)) +
  geom_ribbon(aes(ymin = Min_Balanced_Accuracy, ymax = Max_Balanced_Accuracy), 
              alpha = 0.3, fill = "steelblue") +
  geom_line(aes(y = Mean_Balanced_Accuracy), color = "darkblue", linewidth = 0.5) +
  geom_hline(yintercept = mean(sampling_results$Mean_Balanced_Accuracy, na.rm = TRUE), 
             color = "red", linetype = "dashed", linewidth = 1) +
  labs(title = "Balanced Accuracy Across 1000 Subsampling Iterations",
       subtitle = paste0("Average Range: ", 
                         round(mean(sampling_results$Range_Balanced_Accuracy, na.rm = TRUE), 3)),
       x = "Subsampling Iteration",
       y = "Balanced Accuracy",
       caption = "Blue ribbon = min to max across 5 folds | Blue line = mean | Red dashed = overall average") +
  theme_minimal() +
  theme(plot.title = element_text(face = "bold", size = 14),
        plot.subtitle = element_text(size = 12))
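
For a direct view of how the fold-to-fold range is distributed, a histogram is a natural companion plot (a sketch; the bin width of 0.025 is an arbitrary choice):

ggplot(sampling_results, aes(x = Range_Balanced_Accuracy)) +
  geom_histogram(binwidth = 0.025, fill = "steelblue", color = "white") +
  geom_vline(xintercept = mean(sampling_results$Range_Balanced_Accuracy, na.rm = TRUE),
             color = "red", linetype = "dashed", linewidth = 1) +
  labs(title = "Distribution of Fold-to-Fold Balanced Accuracy Range",
       x = "Range (max - min balanced accuracy across 5 folds)",
       y = "Count") +
  theme_minimal()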