Cross-Validation Simulation Analysis
Setup and Data Generation
set.seed(0326)
library(caret)
library(ggplot2)
n <- 1000 # observations
p <- 20 # predictors
# outcome variable (66% unbalanced)
y <- rbinom(n, 1, prob = 1/3)
# Initialize predictor matrix
X <- matrix(nrow = n, ncol = p)
# predictors with different effect sizes
for (j in 1:p) {
if (j <= 8) {
# Moderate
X[, j] <- ifelse(y == 1,
rnorm(n, mean = 0.35, sd = 1),
rnorm(n, mean = -0.18, sd = 1))
} else if (j <= 12) {
# Weak
X[, j] <- ifelse(y == 1,
rnorm(n, mean = 0.20, sd = 1),
rnorm(n, mean = -0.10, sd = 1))
} else {
# Noise
X[, j] <- rnorm(n, mean = 0, sd = 1)
}
}
# df
colnames(X) <- paste0("X", 1:p)
df <- data.frame(y = factor(y, levels = c(0, 1)), X)
cat("\nProportions:\n", prop.table(table(df$y)), "\n")
##
## Proportions:
## 0.683 0.317
Full Dataset Model Evaluation
# logistic regression
model <- glm(y ~ ., data = df, family = "binomial")
# Predictions
pred_prob <- predict(model, type = "response")
pred_class <- ifelse(pred_prob > 0.5, 1, 0)
# balanced accuracy
cm <- confusionMatrix(factor(pred_class, levels = c(0, 1)), df$y)
print(cm)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 614 104
## 1 69 213
##
## Accuracy : 0.827
## 95% CI : (0.8021, 0.85)
## No Information Rate : 0.683
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5883
##
## Mcnemar's Test P-Value : 0.009739
##
## Sensitivity : 0.8990
## Specificity : 0.6719
## Pos Pred Value : 0.8552
## Neg Pred Value : 0.7553
## Prevalence : 0.6830
## Detection Rate : 0.6140
## Detection Prevalence : 0.7180
## Balanced Accuracy : 0.7854
##
## 'Positive' Class : 0
##
sensitivity <- cm$byClass["Sensitivity"]
specificity <- cm$byClass["Specificity"]
balanced_accuracy <- (sensitivity + specificity) / 2
cat("\nBalanced Accuracy:", round(balanced_accuracy, 3), "\n")
##
## Balanced Accuracy: 0.785
Bootstrap CV (1000 iterations)
# Store results across 1000 bootstraps
sampling_results <- data.frame()
# Repeat sampling 1000 times
for (sampling in 1:1000) {
# Randomly sample 73 individuals
sample_indices <- sample(1:nrow(df), size = 73, replace = FALSE)
df_sample <- df[sample_indices, ]
# Define training control for 5-fold CV
train_control <- trainControl(
method = "cv",
number = 5,
savePredictions = "final",
classProbs = TRUE,
summaryFunction = twoClassSummary
)
# Rename factor levels
df_sample$y <- factor(df_sample$y, levels = c(0, 1), labels = c("Class0", "Class1"))
# Train logistic regression model
cv_model <- suppressWarnings(train(
y ~ .,
data = df_sample,
method = "glm",
family = "binomial",
trControl = train_control,
metric = "ROC"
))
# Extract predictions from each fold
cv_predictions <- cv_model$pred
# Calculate accuracy for each fold
fold_results <- data.frame()
for (fold in 1:5) {
fold_data <- cv_predictions[cv_predictions$Resample == paste0("Fold", fold), ]
# Calculate accuracy
accuracy <- mean(fold_data$pred == fold_data$obs)
error_rate <- 1 - accuracy
cm_fold <- suppressWarnings(confusionMatrix(fold_data$pred, fold_data$obs))
sensitivity <- cm_fold$byClass["Sensitivity"]
specificity <- cm_fold$byClass["Specificity"]
balanced_acc <- (sensitivity + specificity) / 2
fold_results <- rbind(fold_results, data.frame(
Fold = fold,
Accuracy = accuracy,
Error_Rate = error_rate,
Sensitivity = sensitivity,
Specificity = specificity,
Balanced_Accuracy = balanced_acc
))
}
# Calculate mean and SD
mean_balanced_acc <- mean(fold_results$Balanced_Accuracy, na.rm = TRUE)
sd_balanced_acc <- sd(fold_results$Balanced_Accuracy, na.rm = TRUE)
min_balanced_acc <- min(fold_results$Balanced_Accuracy, na.rm = TRUE)
max_balanced_acc <- max(fold_results$Balanced_Accuracy, na.rm = TRUE)
range_balanced_acc <- max_balanced_acc - min_balanced_acc
# Store results
sampling_results <- rbind(sampling_results, data.frame(
Sampling = sampling,
Mean_Balanced_Accuracy = mean_balanced_acc,
SD_Balanced_Accuracy = sd_balanced_acc,
Min_Balanced_Accuracy = min_balanced_acc,
Max_Balanced_Accuracy = max_balanced_acc,
Range_Balanced_Accuracy = range_balanced_acc
))
}
Results
Summary Statistics
cat("Overall Average Mean Balanced Accuracy:",
round(mean(sampling_results$Mean_Balanced_Accuracy, na.rm = TRUE), 3), "\n")
## Overall Average Mean Balanced Accuracy: 0.667
cat("Overall Average SD Balanced Accuracy:",
round(mean(sampling_results$SD_Balanced_Accuracy, na.rm = TRUE), 3), "\n")
## Overall Average SD Balanced Accuracy: 0.127
cat("Overall Average Range Balanced Accuracy:",
round(mean(sampling_results$Range_Balanced_Accuracy, na.rm = TRUE), 3), "\n")
## Overall Average Range Balanced Accuracy: 0.314
Distribution of Balanced Accuracy Range
ggplot(sampling_results, aes(x = Sampling)) +
geom_ribbon(aes(ymin = Min_Balanced_Accuracy, ymax = Max_Balanced_Accuracy),
alpha = 0.3, fill = "steelblue") +
geom_line(aes(y = Mean_Balanced_Accuracy), color = "darkblue", size = 0.5) +
geom_hline(yintercept = mean(sampling_results$Mean_Balanced_Accuracy, na.rm = TRUE),
color = "red", linetype = "dashed", size = 1) +
labs(title = "Balanced Accuracy Across 1000 Bootstrap Iterations",
subtitle = paste0("Average Range: ",
round(mean(sampling_results$Range_Balanced_Accuracy, na.rm = TRUE), 3)),
x = "Bootstrap Iteration",
y = "Balanced Accuracy",
caption = "Blue ribbon = min to max across 5 folds | Blue line = mean | Red dashed = overall average") +
theme_minimal() +
theme(plot.title = element_text(face = "bold", size = 14),
plot.subtitle = element_text(size = 12))
