Cross-Validation Simulation Analysis

Setup and Data Generation

Load Required Libraries

set.seed(0326)
library(caret)
## Warning: package 'caret' was built under R version 4.2.3
## Loading required package: ggplot2
## Loading required package: lattice

Define Parameters

n <- 1000  # observations
p <- 20    # predictors

Generate Outcome Variable

Create an unbalanced outcome variable with approximately 66% class 0 and 33% class 1:

y <- rbinom(n, 1, prob = 1/3)

Initialize Predictor Matrix

X <- matrix(nrow = n, ncol = p)

Generate Predictors from null to moderate strength

for (j in 1:p) {
  if (j <= 8) {
    # Moderate predictors (X1-X8)
    X[, j] <- ifelse(y == 1, 
                     rnorm(n, mean = 0.35, sd = 1),
                     rnorm(n, mean = -0.18, sd = 1))
  } else if (j <= 12) {
    # Weak predictors (X9-X12)
    X[, j] <- ifelse(y == 1, 
                     rnorm(n, mean = 0.20, sd = 1),
                     rnorm(n, mean = -0.10, sd = 1))
  } else {
    # Noise predictors (X13-X20)
    X[, j] <- rnorm(n, mean = 0, sd = 1)
  }
}

Create Data Frame

colnames(X) <- paste0("X", 1:p)
df <- data.frame(y = factor(y, levels = c(0, 1)), X)

Check Class Proportions

cat("\nProportions:\n", prop.table(table(df$y)), "\n")
## 
## Proportions:
##  0.683 0.317

Full Dataset Model Evaluation

Fit Logistic Regression Model

model <- glm(y ~ ., data = df, family = "binomial")

Generate Predictions

pred_prob <- predict(model, type = "response")
pred_class <- ifelse(pred_prob > 0.5, 1, 0)

Calculate Balanced Accuracy

cm <- confusionMatrix(factor(pred_class, levels = c(0, 1)), df$y)
print(cm)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 614 104
##          1  69 213
##                                         
##                Accuracy : 0.827         
##                  95% CI : (0.8021, 0.85)
##     No Information Rate : 0.683         
##     P-Value [Acc > NIR] : < 2.2e-16     
##                                         
##                   Kappa : 0.5883        
##                                         
##  Mcnemar's Test P-Value : 0.009739      
##                                         
##             Sensitivity : 0.8990        
##             Specificity : 0.6719        
##          Pos Pred Value : 0.8552        
##          Neg Pred Value : 0.7553        
##              Prevalence : 0.6830        
##          Detection Rate : 0.6140        
##    Detection Prevalence : 0.7180        
##       Balanced Accuracy : 0.7854        
##                                         
##        'Positive' Class : 0             
## 
sensitivity <- cm$byClass["Sensitivity"]
specificity <- cm$byClass["Specificity"]
balanced_accuracy <- (sensitivity + specificity) / 2
cat("\nBalanced Accuracy:", round(balanced_accuracy, 3), "\n")
## 
## Balanced Accuracy: 0.785

Cross-Validation with Repeated Sampling

Initialize Results Storage

sampling_results <- data.frame()

Repeat Sampling 10 Times

for (sampling in 1:10) {
  # Randomly sample 73 individuals 
  sample_indices <- sample(1:nrow(df), size = 73, replace = FALSE)
  df_sample <- df[sample_indices, ]
  
  # Check class distribution in sample
  print(prop.table(table(df_sample$y)))
  
  # Define training control for 5-fold CV
  train_control <- trainControl(
    method = "cv",
    number = 5,
    savePredictions = "final",
    classProbs = TRUE,
    summaryFunction = twoClassSummary
  )
  
  # Rename factor levels for caret (needs non-numeric levels)
  df_sample$y <- factor(df_sample$y, levels = c(0, 1), labels = c("Class0", "Class1"))
  
  # Train logistic regression model with 5-fold CV
  cv_model <- train(
    y ~ .,
    data = df_sample,
    method = "glm",
    family = "binomial",
    trControl = train_control,
    metric = "ROC"
  )
  
  # Extract predictions from each fold
  cv_predictions <- cv_model$pred
  
  # Calculate accuracy for each fold
  fold_results <- data.frame()
  
  for (fold in 1:5) {
    fold_data <- cv_predictions[cv_predictions$Resample == paste0("Fold", fold), ]
    
    # Calculate metrics
    accuracy <- mean(fold_data$pred == fold_data$obs)
    error_rate <- 1 - accuracy
    
    cm_fold <- confusionMatrix(fold_data$pred, fold_data$obs)
    sensitivity <- cm_fold$byClass["Sensitivity"]
    specificity <- cm_fold$byClass["Specificity"]
    balanced_acc <- (sensitivity + specificity) / 2
    
    fold_results <- rbind(fold_results, data.frame(
      Fold = fold,
      Accuracy = accuracy,
      Error_Rate = error_rate,
      Sensitivity = sensitivity,
      Specificity = specificity,
      Balanced_Accuracy = balanced_acc
    ))
  }
  
  # Calculate mean and SD for this sampling
  mean_balanced_acc <- mean(fold_results$Balanced_Accuracy)
  sd_balanced_acc <- sd(fold_results$Balanced_Accuracy)
  
  # Store results
  sampling_results <- rbind(sampling_results, data.frame(
    Sampling = sampling,
    Mean_Balanced_Accuracy = mean_balanced_acc,
    SD_Balanced_Accuracy = sd_balanced_acc
  ))
}
## 
##         0         1 
## 0.6849315 0.3150685
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## 
##         0         1 
## 0.6575342 0.3424658
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## 
##         0         1 
## 0.7260274 0.2739726
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## 
##        0        1 
## 0.630137 0.369863
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## 
##        0        1 
## 0.630137 0.369863
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## 
##         0         1 
## 0.7123288 0.2876712
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## 
##        0        1 
## 0.630137 0.369863
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## 
##         0         1 
## 0.6575342 0.3424658 
## 
##         0         1 
## 0.6575342 0.3424658
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## 
##         0         1 
## 0.6986301 0.3013699
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

Results Summary

Display Summary Table

print(sampling_results, row.names = FALSE)
##  Sampling Mean_Balanced_Accuracy SD_Balanced_Accuracy
##         1              0.5900000           0.13986601
##         2              0.7055556           0.16063147
##         3              0.5431818           0.11903104
##         4              0.6188889           0.13936585
##         5              0.6566667           0.15842590
##         6              0.6163636           0.09921437
##         7              0.6600000           0.22809598
##         8              0.6133333           0.14554071
##         9              0.6455556           0.07395361
##        10              0.6477273           0.07554896

Calculate Overall Averages

print(mean(sampling_results$Mean_Balanced_Accuracy))
## [1] 0.6297273
print(mean(sampling_results$SD_Balanced_Accuracy))
## [1] 0.1339674