library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ lubridate 1.9.2 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::lift() masks caret::lift()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(mlbench)
data(Sonar, package = "mlbench")
set.seed(530306627)
fold <- createFolds(Sonar$Class, k = 10)
test.fold.sizes <- vapply(fold, length, numeric(1))
# For convenience, put the 10 fold CV sets into a list
split.data.into.folds <- function(x, dat, n = nrow(dat)) {
which.group = factor(ifelse(seq(n) %in% x, "Test", "Training"))
split(dat, which.group)
}
train_test <- lapply(fold, split.data.into.folds, dat = Sonar, n = nrow(Sonar))
predict.knn <- function(x, k = 5, label.name = "Class") {
predicted <- class::knn(train = x$Training %>% dplyr::select(-all_of(label.name)),
test = x$Test %>% dplyr::select(-all_of(label.name)),
cl = x$Training[[label.name]], k = k)
observed <- x$Test[[label.name]]
list(predicted = predicted, observed = observed)
}
knn.output <- lapply(train_test, predict.knn)
confuse_mat <- lapply(knn.output, function(x) confusionMatrix(x$predicted, x$observed))
acc_estimate <- vapply(confuse_mat, function(c) c[["overall"]][["Accuracy"]], numeric(1))
class_metric <- vapply(confuse_mat, function(c) c[["byClass"]], numeric(11))
all_metric <- rbind(Accuracy = acc_estimate, class_metric)
rel_metric <- all_metric[row.names(all_metric) %in% c("Accuracy", "Sensitivity", "Specificity", "F1"), ]
avg_metric <- apply(rel_metric, 1, mean)
avg_metric
## Accuracy Sensitivity Specificity F1
## 0.7788095 0.8636364 0.6833333 0.8059570
Question 2
m <- 100
multi_folds <- lapply(seq(m), function(i) createFolds(Sonar[["Class"]], k = 10))
cross_val_compute <- function(fold, k = 5, dat = Sonar)
{
train_test <- lapply(fold, split.data.into.folds, dat = dat, n = nrow(dat))
knn.output <- lapply(train_test, predict.knn)
confuse_mat <- lapply(knn.output, function(x) confusionMatrix(x$predicted,
x$observed))
acc_estimate <- vapply(confuse_mat, function(c) c[["overall"]][["Accuracy"]], numeric(1))
class_metric <- vapply(confuse_mat, function(c) c[["byClass"]], numeric(11))
all_metric <- rbind(Accuracy = acc_estimate, class_metric)
rel_metric <- all_metric[row.names(all_metric) %in% c("Accuracy", "Sensitivity", "Specificity", "F1"), ]
avg_metric <- apply(rel_metric, 1, mean)
avg_metric
}
repeat_cross_val <- vapply(multi_folds, cross_val_compute, numeric(4))
dat <- as.data.frame(t(repeat_cross_val))
head(dat)
## Accuracy Sensitivity Specificity F1
## 1 0.7877922 0.8636364 0.6988889 0.8132861
## 2 0.8027922 0.8727273 0.7244444 0.8261837
## 3 0.7864069 0.8818182 0.6755556 0.8165426
## 4 0.8180952 0.8825758 0.7444444 0.8373401
## 5 0.8030952 0.8734848 0.7233333 0.8257659
## 6 0.8170996 0.8818182 0.7433333 0.8345907
plot.dat <- dat %>% pivot_longer(everything(), names_to = "Metric", values_to = "Value")
ggplot(plot.dat) + geom_boxplot(aes(x = Metric, y = Value)) +
ggtitle("Box plots for Repeated Cross-Validation Estimates")
