make_blobs <- function(
  n_samples = 40, n_features = 2,
  # By default, class 1 is centered at (0, 0) and class 2 at (1, 1)
  cluster_centers = matrix(c(0, 0, 1, 1), nrow = 2, byrow = TRUE),
  # By default, the two features are uncorrelated with variance = 1
  cluster_covar = matrix(c(1, 0, 0, 1), nrow = 2),
  dist = c("norm", "t"), t_df = 5
) {
  if (ncol(cluster_centers) != n_features) {
    stop("Dimensionality of centers must equal number of features")
  }
  if ((nrow(cluster_covar) != n_features) ||
      (ncol(cluster_covar) != n_features)) {
    stop("Dimensionality of covariance matrix must match number of features")
  }
  dist <- match.arg(dist)
  # Divide the `n_samples` points as evenly as possible among the classes,
  # one class per row of `cluster_centers`
  categories <- rep(1:nrow(cluster_centers), length.out = n_samples)
  # Draw zero-centered noise, then shift each point to its class center
  if (dist == "norm") {
    points <- MASS::mvrnorm(n = n_samples, mu = rep(0, n_features),
                            Sigma = cluster_covar)
  } else if (dist == "t") {
    points <- mvtnorm::rmvt(n = n_samples, delta = rep(0, n_features),
                            df = t_df, sigma = cluster_covar)
  }
  points <- points + cluster_centers[categories, ]
  # Feature names assume the default two-dimensional case
  colnames(points) <- c("x", "y")
  as_tibble(points) %>%
    bind_cols(category = factor(categories))
}
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr) # For unnest()
library(tibble) # For tribble()
# make_blobs() is defined above and calls MASS::mvrnorm() and mvtnorm::rmvt()
# via `::`, so no additional package needs to be attached for it
set.seed(22)
tribble(
  ~scenario, ~data,
  "Scenario 1", make_blobs(n_samples = 40),
  "Scenario 2", make_blobs(n_samples = 40,
                           cluster_covar = matrix(c(1, -0.5, -0.5, 1),
                                                  nrow = 2)),
  "Scenario 3", make_blobs(n_samples = 100,
                           cluster_covar = matrix(c(1, -0.5, -0.5, 1),
                                                  nrow = 2),
                           dist = "t")
) %>%
  unnest(data) %>%
  ggplot(aes(x, y, color = category, shape = category)) +
  geom_point(size = 3) +
  facet_wrap(~scenario)
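As an aside (this snippet is not in the original post), the even class split inside `make_blobs()` comes from recycling the row indices of `cluster_centers`; with the default two classes and 40 samples, each class gets 20 points:

# rep(..., length.out = n_samples) recycles the class labels 1, 2, 1, 2, ...
table(rep(1:2, length.out = 40))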

sim_linear_train <- tribble(
  ~ scenario, ~ n_samples, ~ corr, ~ dist,
  "Scenario 1", 40, 0.0, "norm",
  "Scenario 2", 40, -0.5, "norm",
  "Scenario 3", 40, -0.5, "t"
) %>%
  # 100 simulated training sets per scenario
  crossing(sim = 1:100) %>%
  rowwise() %>%
  mutate(
    train_data = list(make_blobs(
      n_samples = n_samples,
      cluster_covar = matrix(c(1, corr, corr, 1), nrow = 2),
      dist = dist
    ))
  ) %>%
  ungroup()

sim_linear_test <- sim_linear_train %>%
  distinct(scenario, corr, dist) %>%
  rowwise() %>%
  mutate(
    # A single large (1000-point) test set per scenario
    test_data = list(make_blobs(
      n_samples = 1000,
      cluster_covar = matrix(c(1, corr, corr, 1), nrow = 2),
      dist = dist
    ))
  ) %>%
  ungroup()
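A quick dimension check (not shown in the original) confirms the setup of 3 scenarios, each with 100 simulated training sets and a single large test set:

nrow(sim_linear_train) # 3 scenarios x 100 simulations = 300 training sets
nrow(sim_linear_test)  # one 1000-point test set per scenario = 3 rows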
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.3.0 ──
## ✔ broom 1.0.8 ✔ recipes 1.2.1
## ✔ dials 1.4.0 ✔ rsample 1.2.1
## ✔ infer 1.0.7 ✔ tune 1.3.0
## ✔ modeldata 1.4.0 ✔ workflows 1.2.0
## ✔ parsnip 1.3.1 ✔ workflowsets 1.1.0
## ✔ purrr 1.0.4 ✔ yardstick 1.3.2
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ recipes::step() masks stats::step()
library(discrim) # this needs to be loaded separately for `discrim_*()`
##
## Attaching package: 'discrim'
## The following object is masked from 'package:dials':
##
## smoothness
models <- tribble(
  ~ model_label, ~ model,
  "KNN-1", nearest_neighbor(mode = "classification", neighbors = 1),
  "KNN-CV", nearest_neighbor(mode = "classification", neighbors = tune()),
  "LDA", discrim_linear(),
  "Logistic", logistic_reg(),
  "NBayes", naive_Bayes(engine = "klaR") %>%
    # The klaR engine's `usekernel` argument defaults to TRUE (kernel density
    # estimates); set it to FALSE to use Gaussian densities, as in the text
    set_args(usekernel = FALSE),
  "QDA", discrim_quad()
)
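As a quick check (not part of the original output), a spec can be pulled out of the `models` tibble and printed; the naive Bayes spec, for example, should list `usekernel = FALSE` among its engine arguments:

# Print the NBayes spec to confirm the engine argument was set
models %>%
  filter(model_label == "NBayes") %>%
  pull(model) %>%
  pluck(1)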
# A helper function for fitting a model on a training set and computing
# accuracy on a testing set
calc_test_accuracy <- function(model_label, train_data, test_data, model) {
  wf <- workflow() %>%
    add_recipe(recipe(category ~ x + y, data = train_data)) %>%
    add_model(model)
  if (model_label == "KNN-CV") {
    # 5-fold cross-validation
    train_data_folds <- vfold_cv(train_data, v = 5)
    tune_res <- wf %>%
      tune_grid(
        resamples = train_data_folds,
        # Try 1 to 10 neighbors
        grid = tibble(neighbors = 1:10)
      )
    # Overwrite the workflow with the best `neighbors` value by CV accuracy
    # (recent versions of tune require the metric to be passed by name)
    wf <- finalize_workflow(wf, select_best(tune_res, metric = "accuracy"))
  }
  wf %>%
    fit(data = train_data) %>%
    augment(test_data) %>%
    accuracy(truth = category, estimate = .pred_class) %>%
    pull(.estimate)
}
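For illustration (this call is not in the original post), the helper returns a single accuracy value. For example, fitting LDA to one freshly simulated training set and scoring it on a larger test set:

# Hypothetical one-off check: make_blobs() draws new random data here, so the
# value varies, and the extra draws would shift the RNG state used below
calc_test_accuracy(
  model_label = "LDA",
  train_data = make_blobs(n_samples = 40),
  test_data = make_blobs(n_samples = 1000),
  model = discrim_linear()
)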
library(furrr)
## Loading required package: future
plan(multisession, workers = 1) # Single worker here; increase `workers` to run the simulations in parallel
library(tictoc)
tic()
sim_linear_res <- sim_linear_train %>%
  left_join(sim_linear_test %>% dplyr::select(scenario, test_data),
            by = "scenario") %>%
  # Pair every simulated training set with every model
  crossing(models) %>%
  mutate(
    test_accuracy = future_pmap_dbl(
      list(model_label, train_data, test_data, model),
      calc_test_accuracy
    ),
    test_error = 1 - test_accuracy
  )
## → A | warning: No event observations were detected in `truth` with event level '1'.
## → A | warning: No control observations were detected in `truth` with control level '2'.
## There were issues with some computations   A: x1
## Warning: There were 564 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `test_accuracy = future_pmap_dbl(...)`.
## Caused by warning in `FUN()`:
## ! Numerical 0 probability for all classes with observation 235
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 563 remaining warnings.
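The error distributions can also be summarised numerically before plotting (this summary is not shown in the original), for example as the median test error by scenario and model:

sim_linear_res %>%
  group_by(scenario, model_label) %>%
  summarise(median_test_error = median(test_error), .groups = "drop") %>%
  arrange(scenario, median_test_error)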
library(forcats)
sim_linear_res %>%
  mutate(model_label = fct_inorder(model_label),
         # Give the two KNN variants the same fill color
         model_fill = fct_collapse(model_label, KNN = c("KNN-1", "KNN-CV"))) %>%
  ggplot(aes(x = model_label, y = test_error)) +
  geom_boxplot(aes(fill = model_fill), show.legend = FALSE) +
  facet_wrap(~ scenario, nrow = 1, scales = "free_y") +
  theme(axis.text.x = element_text(angle = 90)) +
  labs(x = NULL, y = "test error")
