make_blobs <- function(
  n_samples = 40, n_features = 2,
  # By default, class 1 is centered at (0, 0) and class 2 at (1, 1)
  cluster_centers = matrix(c(0, 0, 1, 1), nrow = 2, byrow = TRUE),
  # By default, the two features are uncorrelated with variance = 1
  cluster_covar = matrix(c(1, 0, 0, 1), nrow = 2),
  dist = c("norm", "t"), t_df = 5
) {
  if (ncol(cluster_centers) != n_features) {
    stop("Dimensionality of centers must equal number of features")
  }
  if ((nrow(cluster_covar) != n_features) ||
      (ncol(cluster_covar) != n_features)) {
    stop("Dimensionality of covariance matrix must match number of features")
  }
  dist <- match.arg(dist)
  # Divide the `n_samples` points as evenly as possible among the classes,
  # one class per row of `cluster_centers`
  categories <- rep(1:nrow(cluster_centers), length.out = n_samples)
  # Draw zero-centered noise, then shift each point to its class center
  if (dist == "norm") {
    points <- MASS::mvrnorm(n = n_samples, mu = rep(0, n_features),
                            Sigma = cluster_covar)
  } else if (dist == "t") {
    points <- mvtnorm::rmvt(n = n_samples, delta = rep(0, n_features),
                            df = t_df, sigma = cluster_covar)
  }
  points <- points + cluster_centers[categories, ]
  # Feature names assume the default two-dimensional case
  colnames(points) <- c("x", "y")
  as_tibble(points) %>%
    bind_cols(category = factor(categories))
}
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr) # For unnest()
library(tibble) # For tribble()
# make_blobs() is defined above and calls MASS::mvrnorm() and mvtnorm::rmvt()
# via `::`, so no additional package needs to be attached for it
set.seed(22)
tribble(
  ~scenario, ~data,
  "Scenario 1", make_blobs(n_samples = 40),
  "Scenario 2", make_blobs(n_samples = 40,
                           cluster_covar = matrix(c(1, -0.5, -0.5, 1),
                                                  nrow = 2)),
  "Scenario 3", make_blobs(n_samples = 100,
                           cluster_covar = matrix(c(1, -0.5, -0.5, 1),
                                                  nrow = 2),
                           dist = "t")
) %>%
  unnest(data) %>%
  ggplot(aes(x, y, color = category, shape = category)) +
  geom_point(size = 3) +
  facet_wrap(~scenario)
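As an aside (this snippet is not in the original post), the even class split inside `make_blobs()` comes from recycling the row indices of `cluster_centers`; with the default two classes and 40 samples, each class gets 20 points:

# rep(..., length.out = n_samples) recycles the class labels 1, 2, 1, 2, ...
table(rep(1:2, length.out = 40))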

sim_linear_train <- tribble(
  ~ scenario, ~ n_samples, ~ corr, ~ dist,
  "Scenario 1", 40, 0.0, "norm",
  "Scenario 2", 40, -0.5, "norm",
  "Scenario 3", 40, -0.5, "t"
) %>%
  # 100 simulated training sets per scenario
  crossing(sim = 1:100) %>%
  rowwise() %>%
  mutate(
    train_data = list(make_blobs(
      n_samples = n_samples,
      cluster_covar = matrix(c(1, corr, corr, 1), nrow = 2),
      dist = dist
    ))
  ) %>%
  ungroup()

sim_linear_test <- sim_linear_train %>%
  distinct(scenario, corr, dist) %>%
  rowwise() %>%
  mutate(
    # A single large (1000-point) test set per scenario
    test_data = list(make_blobs(
      n_samples = 1000,
      cluster_covar = matrix(c(1, corr, corr, 1), nrow = 2),
      dist = dist
    ))
  ) %>%
  ungroup()
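A quick dimension check (not shown in the original) confirms the setup of 3 scenarios, each with 100 simulated training sets and a single large test set:

nrow(sim_linear_train) # 3 scenarios x 100 simulations = 300 training sets
nrow(sim_linear_test)  # one 1000-point test set per scenario = 3 rows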
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.3.0 ──
## ✔ broom 1.0.8 ✔ recipes 1.2.1
## ✔ dials 1.4.0 ✔ rsample 1.2.1
## ✔ infer 1.0.7 ✔ tune 1.3.0
## ✔ modeldata 1.4.0 ✔ workflows 1.2.0
## ✔ parsnip 1.3.1 ✔ workflowsets 1.1.0
## ✔ purrr 1.0.4 ✔ yardstick 1.3.2
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ recipes::step() masks stats::step()
library(discrim) # this needs to be loaded separately for `discrim_*()`
##
## Attaching package: 'discrim'
## The following object is masked from 'package:dials':
##
## smoothness
models <- tribble(
  ~ model_label, ~ model,
  "KNN-1", nearest_neighbor(mode = "classification", neighbors = 1),
  "KNN-CV", nearest_neighbor(mode = "classification", neighbors = tune()),
  "LDA", discrim_linear(),
  "Logistic", logistic_reg(),
  "NBayes", naive_Bayes(engine = "klaR") %>%
    # The klaR engine's `usekernel` argument defaults to TRUE (kernel density
    # estimates); set it to FALSE to use Gaussian densities, as in the text
    set_args(usekernel = FALSE),
  "QDA", discrim_quad()
)
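As a quick check (not part of the original output), a spec can be pulled out of the `models` tibble and printed; the naive Bayes spec, for example, should list `usekernel = FALSE` among its engine arguments:

# Print the NBayes spec to confirm the engine argument was set
models %>%
  filter(model_label == "NBayes") %>%
  pull(model) %>%
  pluck(1)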
# A helper function for fitting a model on a training set and computing
# accuracy on a testing set
calc_test_accuracy <- function(model_label, train_data, test_data, model) {
  wf <- workflow() %>%
    add_recipe(recipe(category ~ x + y, data = train_data)) %>%
    add_model(model)
  if (model_label == "KNN-CV") {
    # 5-fold cross-validation
    train_data_folds <- vfold_cv(train_data, v = 5)
    tune_res <- wf %>%
      tune_grid(
        resamples = train_data_folds,
        # Try 1 to 10 neighbors
        grid = tibble(neighbors = 1:10)
      )
    # Overwrite the workflow with the best `neighbors` value by CV accuracy
    # (recent versions of tune require the metric to be passed by name)
    wf <- finalize_workflow(wf, select_best(tune_res, metric = "accuracy"))
  }
  wf %>%
    fit(data = train_data) %>%
    augment(test_data) %>%
    accuracy(truth = category, estimate = .pred_class) %>%
    pull(.estimate)
}
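For illustration (this call is not in the original post), the helper returns a single accuracy value. For example, fitting LDA to one freshly simulated training set and scoring it on a larger test set:

# Hypothetical one-off check: make_blobs() draws new random data here, so the
# value varies, and the extra draws would shift the RNG state used below
calc_test_accuracy(
  model_label = "LDA",
  train_data = make_blobs(n_samples = 40),
  test_data = make_blobs(n_samples = 1000),
  model = discrim_linear()
)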
library(furrr)
## Loading required package: future
plan(multisession, workers = 1) # Single worker here; increase `workers` to run the simulations in parallel
library(tictoc)
tic()
sim_linear_res <- sim_linear_train %>%
  left_join(sim_linear_test %>% dplyr::select(scenario, test_data),
            by = "scenario") %>%
  # Pair every simulated training set with every model
  crossing(models) %>%
  mutate(
    test_accuracy = future_pmap_dbl(
      list(model_label, train_data, test_data, model),
      calc_test_accuracy
    ),
    test_error = 1 - test_accuracy
  )
## → A | warning: No event observations were detected in `truth` with event level '1'.
## → A | warning: No control observations were detected in `truth` with control level '2'.
## There were issues with some computations   A: x1
## Warning: There were 564 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `test_accuracy = future_pmap_dbl(...)`.
## Caused by warning in `FUN()`:
## ! Numerical 0 probability for all classes with observation 235
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 563 remaining warnings.
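The error distributions can also be summarised numerically before plotting (this summary is not shown in the original), for example as the median test error by scenario and model:

sim_linear_res %>%
  group_by(scenario, model_label) %>%
  summarise(median_test_error = median(test_error), .groups = "drop") %>%
  arrange(scenario, median_test_error)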
library(forcats)
sim_linear_res %>%
  mutate(model_label = fct_inorder(model_label),
         # Give the two KNN variants the same fill color
         model_fill = fct_collapse(model_label, KNN = c("KNN-1", "KNN-CV"))) %>%
  ggplot(aes(x = model_label, y = test_error)) +
  geom_boxplot(aes(fill = model_fill), show.legend = FALSE) +
  facet_wrap(~ scenario, nrow = 1, scales = "free_y") +
  theme(axis.text.x = element_text(angle = 90)) +
  labs(x = NULL, y = "test error")
