Tutorial Week 6.1

library(caret)

## Loading required package: ggplot2

## Loading required package: lattice

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.2     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.0

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ purrr::lift()   masks caret::lift()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(mlbench)

data(Sonar, package = "mlbench")

set.seed(530306627)
fold <- createFolds(Sonar$Class, k = 10)
test.fold.sizes <- vapply(fold, length, numeric(1))
# For convenience, put the 10 fold CV sets into a list
split.data.into.folds <- function(x, dat, n = nrow(dat)) {
    which.group = factor(ifelse(seq(n) %in% x, "Test", "Training"))
    split(dat, which.group)
}
train_test <- lapply(fold, split.data.into.folds, dat = Sonar, n = nrow(Sonar))

predict.knn <- function(x, k = 5, label.name = "Class") {
    predicted <- class::knn(train = x$Training %>% dplyr::select(-all_of(label.name)),
                     test = x$Test %>% dplyr::select(-all_of(label.name)),
                     cl = x$Training[[label.name]], k = k)
    observed <- x$Test[[label.name]]
    list(predicted = predicted, observed = observed)
}
knn.output <- lapply(train_test, predict.knn)

confuse_mat <- lapply(knn.output, function(x) confusionMatrix(x$predicted, x$observed))
acc_estimate <- vapply(confuse_mat, function(c) c[["overall"]][["Accuracy"]], numeric(1))
class_metric <- vapply(confuse_mat, function(c) c[["byClass"]], numeric(11))
all_metric <- rbind(Accuracy = acc_estimate, class_metric)
rel_metric <- all_metric[row.names(all_metric) %in% c("Accuracy", "Sensitivity", "Specificity", "F1"), ]
avg_metric <- apply(rel_metric, 1, mean)
avg_metric

##    Accuracy Sensitivity Specificity          F1 
##   0.7788095   0.8636364   0.6833333   0.8059570

Question 2

m <- 100
multi_folds <- lapply(seq(m), function(i) createFolds(Sonar[["Class"]], k = 10))
cross_val_compute <- function(fold, k = 5, dat = Sonar)
{
    train_test <- lapply(fold, split.data.into.folds, dat = dat, n = nrow(dat))
    knn.output <- lapply(train_test, predict.knn)
    confuse_mat <- lapply(knn.output, function(x) confusionMatrix(x$predicted,
                                                                       x$observed))
    acc_estimate <- vapply(confuse_mat, function(c) c[["overall"]][["Accuracy"]], numeric(1))
    class_metric <- vapply(confuse_mat, function(c) c[["byClass"]], numeric(11))
    all_metric <- rbind(Accuracy = acc_estimate, class_metric)
    rel_metric <- all_metric[row.names(all_metric) %in% c("Accuracy", "Sensitivity", "Specificity", "F1"), ]
    avg_metric <- apply(rel_metric, 1, mean)
    avg_metric
}
repeat_cross_val <- vapply(multi_folds, cross_val_compute, numeric(4))
dat <- as.data.frame(t(repeat_cross_val))
head(dat)

##    Accuracy Sensitivity Specificity        F1
## 1 0.7877922   0.8636364   0.6988889 0.8132861
## 2 0.8027922   0.8727273   0.7244444 0.8261837
## 3 0.7864069   0.8818182   0.6755556 0.8165426
## 4 0.8180952   0.8825758   0.7444444 0.8373401
## 5 0.8030952   0.8734848   0.7233333 0.8257659
## 6 0.8170996   0.8818182   0.7433333 0.8345907

plot.dat <- dat %>% pivot_longer(everything(), names_to = "Metric", values_to = "Value")
ggplot(plot.dat) + geom_boxplot(aes(x = Metric, y = Value)) +
  ggtitle("Box plots for Repeated Cross-Validation Estimates")

Tutorial Week 6.1

2023-09-10

Question 2