set.seed(0326)
library(caret)
## Warning: package 'caret' was built under R version 4.2.3
## Loading required package: ggplot2
## Loading required package: lattice
n <- 1000 # observations
p <- 20 # predictors
Create an unbalanced outcome variable with approximately 66% class 0 and 33% class 1:
y <- rbinom(n, 1, prob = 1/3)
X <- matrix(nrow = n, ncol = p)
for (j in 1:p) {
if (j <= 8) {
# Moderate predictors (X1-X8)
X[, j] <- ifelse(y == 1,
rnorm(n, mean = 0.35, sd = 1),
rnorm(n, mean = -0.18, sd = 1))
} else if (j <= 12) {
# Weak predictors (X9-X12)
X[, j] <- ifelse(y == 1,
rnorm(n, mean = 0.20, sd = 1),
rnorm(n, mean = -0.10, sd = 1))
} else {
# Noise predictors (X13-X20)
X[, j] <- rnorm(n, mean = 0, sd = 1)
}
}
colnames(X) <- paste0("X", 1:p)
df <- data.frame(y = factor(y, levels = c(0, 1)), X)
cat("\nProportions:\n", prop.table(table(df$y)), "\n")
##
## Proportions:
## 0.683 0.317
model <- glm(y ~ ., data = df, family = "binomial")
pred_prob <- predict(model, type = "response")
pred_class <- ifelse(pred_prob > 0.5, 1, 0)
cm <- confusionMatrix(factor(pred_class, levels = c(0, 1)), df$y)
print(cm)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 614 104
## 1 69 213
##
## Accuracy : 0.827
## 95% CI : (0.8021, 0.85)
## No Information Rate : 0.683
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5883
##
## Mcnemar's Test P-Value : 0.009739
##
## Sensitivity : 0.8990
## Specificity : 0.6719
## Pos Pred Value : 0.8552
## Neg Pred Value : 0.7553
## Prevalence : 0.6830
## Detection Rate : 0.6140
## Detection Prevalence : 0.7180
## Balanced Accuracy : 0.7854
##
## 'Positive' Class : 0
##
sensitivity <- cm$byClass["Sensitivity"]
specificity <- cm$byClass["Specificity"]
balanced_accuracy <- (sensitivity + specificity) / 2
cat("\nBalanced Accuracy:", round(balanced_accuracy, 3), "\n")
##
## Balanced Accuracy: 0.785
sampling_results <- data.frame()
for (sampling in 1:10) {
# Randomly sample 73 individuals
sample_indices <- sample(1:nrow(df), size = 73, replace = FALSE)
df_sample <- df[sample_indices, ]
# Check class distribution in sample
print(prop.table(table(df_sample$y)))
# Define training control for 5-fold CV
train_control <- trainControl(
method = "cv",
number = 5,
savePredictions = "final",
classProbs = TRUE,
summaryFunction = twoClassSummary
)
# Rename factor levels for caret (needs non-numeric levels)
df_sample$y <- factor(df_sample$y, levels = c(0, 1), labels = c("Class0", "Class1"))
# Train logistic regression model with 5-fold CV
cv_model <- train(
y ~ .,
data = df_sample,
method = "glm",
family = "binomial",
trControl = train_control,
metric = "ROC"
)
# Extract predictions from each fold
cv_predictions <- cv_model$pred
# Calculate accuracy for each fold
fold_results <- data.frame()
for (fold in 1:5) {
fold_data <- cv_predictions[cv_predictions$Resample == paste0("Fold", fold), ]
# Calculate metrics
accuracy <- mean(fold_data$pred == fold_data$obs)
error_rate <- 1 - accuracy
cm_fold <- confusionMatrix(fold_data$pred, fold_data$obs)
sensitivity <- cm_fold$byClass["Sensitivity"]
specificity <- cm_fold$byClass["Specificity"]
balanced_acc <- (sensitivity + specificity) / 2
fold_results <- rbind(fold_results, data.frame(
Fold = fold,
Accuracy = accuracy,
Error_Rate = error_rate,
Sensitivity = sensitivity,
Specificity = specificity,
Balanced_Accuracy = balanced_acc
))
}
# Calculate mean and SD for this sampling
mean_balanced_acc <- mean(fold_results$Balanced_Accuracy)
sd_balanced_acc <- sd(fold_results$Balanced_Accuracy)
# Store results
sampling_results <- rbind(sampling_results, data.frame(
Sampling = sampling,
Mean_Balanced_Accuracy = mean_balanced_acc,
SD_Balanced_Accuracy = sd_balanced_acc
))
}
##
## 0 1
## 0.6849315 0.3150685
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##
## 0 1
## 0.6575342 0.3424658
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##
## 0 1
## 0.7260274 0.2739726
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##
## 0 1
## 0.630137 0.369863
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##
## 0 1
## 0.630137 0.369863
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##
## 0 1
## 0.7123288 0.2876712
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##
## 0 1
## 0.630137 0.369863
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##
## 0 1
## 0.6575342 0.3424658
##
## 0 1
## 0.6575342 0.3424658
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##
## 0 1
## 0.6986301 0.3013699
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
print(sampling_results, row.names = FALSE)
## Sampling Mean_Balanced_Accuracy SD_Balanced_Accuracy
## 1 0.5900000 0.13986601
## 2 0.7055556 0.16063147
## 3 0.5431818 0.11903104
## 4 0.6188889 0.13936585
## 5 0.6566667 0.15842590
## 6 0.6163636 0.09921437
## 7 0.6600000 0.22809598
## 8 0.6133333 0.14554071
## 9 0.6455556 0.07395361
## 10 0.6477273 0.07554896
print(mean(sampling_results$Mean_Balanced_Accuracy))
## [1] 0.6297273
print(mean(sampling_results$SD_Balanced_Accuracy))
## [1] 0.1339674