Cross-Validation Simulation Analysis

Setup and Data Generation

Load Required Libraries

set.seed(0326)
library(caret)

## Warning: package 'caret' was built under R version 4.2.3

## Loading required package: ggplot2

## Loading required package: lattice

Define Parameters

n <- 1000  # observations
p <- 20    # predictors

Generate Outcome Variable

Create an unbalanced outcome variable with approximately 66% class 0 and 33% class 1:

y <- rbinom(n, 1, prob = 1/3)

Initialize Predictor Matrix

X <- matrix(nrow = n, ncol = p)

Generate Predictors from null to moderate strength

for (j in 1:p) {
  if (j <= 8) {
    # Moderate predictors (X1-X8)
    X[, j] <- ifelse(y == 1, 
                     rnorm(n, mean = 0.35, sd = 1),
                     rnorm(n, mean = -0.18, sd = 1))
  } else if (j <= 12) {
    # Weak predictors (X9-X12)
    X[, j] <- ifelse(y == 1, 
                     rnorm(n, mean = 0.20, sd = 1),
                     rnorm(n, mean = -0.10, sd = 1))
  } else {
    # Noise predictors (X13-X20)
    X[, j] <- rnorm(n, mean = 0, sd = 1)
  }
}

Create Data Frame

colnames(X) <- paste0("X", 1:p)
df <- data.frame(y = factor(y, levels = c(0, 1)), X)

Check Class Proportions

cat("\nProportions:\n", prop.table(table(df$y)), "\n")

## 
## Proportions:
##  0.683 0.317

Full Dataset Model Evaluation

Fit Logistic Regression Model

model <- glm(y ~ ., data = df, family = "binomial")

Generate Predictions

pred_prob <- predict(model, type = "response")
pred_class <- ifelse(pred_prob > 0.5, 1, 0)

Calculate Balanced Accuracy

cm <- confusionMatrix(factor(pred_class, levels = c(0, 1)), df$y)
print(cm)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 614 104
##          1  69 213
##                                         
##                Accuracy : 0.827         
##                  95% CI : (0.8021, 0.85)
##     No Information Rate : 0.683         
##     P-Value [Acc > NIR] : < 2.2e-16     
##                                         
##                   Kappa : 0.5883        
##                                         
##  Mcnemar's Test P-Value : 0.009739      
##                                         
##             Sensitivity : 0.8990        
##             Specificity : 0.6719        
##          Pos Pred Value : 0.8552        
##          Neg Pred Value : 0.7553        
##              Prevalence : 0.6830        
##          Detection Rate : 0.6140        
##    Detection Prevalence : 0.7180        
##       Balanced Accuracy : 0.7854        
##                                         
##        'Positive' Class : 0             
##

sensitivity <- cm$byClass["Sensitivity"]
specificity <- cm$byClass["Specificity"]
balanced_accuracy <- (sensitivity + specificity) / 2
cat("\nBalanced Accuracy:", round(balanced_accuracy, 3), "\n")

## 
## Balanced Accuracy: 0.785

Cross-Validation with Repeated Sampling

Initialize Results Storage

sampling_results <- data.frame()

Repeat Sampling 10 Times

for (sampling in 1:10) {
  # Randomly sample 73 individuals 
  sample_indices <- sample(1:nrow(df), size = 73, replace = FALSE)
  df_sample <- df[sample_indices, ]
  
  # Check class distribution in sample
  print(prop.table(table(df_sample$y)))
  
  # Define training control for 5-fold CV
  train_control <- trainControl(
    method = "cv",
    number = 5,
    savePredictions = "final",
    classProbs = TRUE,
    summaryFunction = twoClassSummary
  )
  
  # Rename factor levels for caret (needs non-numeric levels)
  df_sample$y <- factor(df_sample$y, levels = c(0, 1), labels = c("Class0", "Class1"))
  
  # Train logistic regression model with 5-fold CV
  cv_model <- train(
    y ~ .,
    data = df_sample,
    method = "glm",
    family = "binomial",
    trControl = train_control,
    metric = "ROC"
  )
  
  # Extract predictions from each fold
  cv_predictions <- cv_model$pred
  
  # Calculate accuracy for each fold
  fold_results <- data.frame()
  
  for (fold in 1:5) {
    fold_data <- cv_predictions[cv_predictions$Resample == paste0("Fold", fold), ]
    
    # Calculate metrics
    accuracy <- mean(fold_data$pred == fold_data$obs)
    error_rate <- 1 - accuracy
    
    cm_fold <- confusionMatrix(fold_data$pred, fold_data$obs)
    sensitivity <- cm_fold$byClass["Sensitivity"]
    specificity <- cm_fold$byClass["Specificity"]
    balanced_acc <- (sensitivity + specificity) / 2
    
    fold_results <- rbind(fold_results, data.frame(
      Fold = fold,
      Accuracy = accuracy,
      Error_Rate = error_rate,
      Sensitivity = sensitivity,
      Specificity = specificity,
      Balanced_Accuracy = balanced_acc
    ))
  }
  
  # Calculate mean and SD for this sampling
  mean_balanced_acc <- mean(fold_results$Balanced_Accuracy)
  sd_balanced_acc <- sd(fold_results$Balanced_Accuracy)
  
  # Store results
  sampling_results <- rbind(sampling_results, data.frame(
    Sampling = sampling,
    Mean_Balanced_Accuracy = mean_balanced_acc,
    SD_Balanced_Accuracy = sd_balanced_acc
  ))
}

## 
##         0         1 
## 0.6849315 0.3150685

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## 
##         0         1 
## 0.6575342 0.3424658

## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## 
##         0         1 
## 0.7260274 0.2739726

## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## 
##        0        1 
## 0.630137 0.369863

## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## 
##        0        1 
## 0.630137 0.369863

## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## 
##         0         1 
## 0.7123288 0.2876712

## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## 
##        0        1 
## 0.630137 0.369863

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## 
##         0         1 
## 0.6575342 0.3424658 
## 
##         0         1 
## 0.6575342 0.3424658

## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## 
##         0         1 
## 0.6986301 0.3013699

## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

Results Summary

Display Summary Table

print(sampling_results, row.names = FALSE)

##  Sampling Mean_Balanced_Accuracy SD_Balanced_Accuracy
##         1              0.5900000           0.13986601
##         2              0.7055556           0.16063147
##         3              0.5431818           0.11903104
##         4              0.6188889           0.13936585
##         5              0.6566667           0.15842590
##         6              0.6163636           0.09921437
##         7              0.6600000           0.22809598
##         8              0.6133333           0.14554071
##         9              0.6455556           0.07395361
##        10              0.6477273           0.07554896

Calculate Overall Averages

print(mean(sampling_results$Mean_Balanced_Accuracy))

## [1] 0.6297273

print(mean(sampling_results$SD_Balanced_Accuracy))

## [1] 0.1339674

CV Simulation

Matt Mattoni

2026-03-25

Cross-Validation Simulation Analysis

Setup and Data Generation

Load Required Libraries

Define Parameters

Generate Outcome Variable

Initialize Predictor Matrix

Generate Predictors from null to moderate strength

Create Data Frame

Check Class Proportions

Full Dataset Model Evaluation

Fit Logistic Regression Model

Generate Predictions

Calculate Balanced Accuracy

Cross-Validation with Repeated Sampling

Initialize Results Storage

Repeat Sampling 10 Times

Results Summary

Display Summary Table

Calculate Overall Averages