library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(gt)
library(tidymodels)

## ── Attaching packages ────────────────────────────────────── tidymodels 1.3.0 ──

## ✔ broom        1.0.8     ✔ rsample      1.3.0
## ✔ dials        1.4.0     ✔ tibble       3.2.1
## ✔ ggplot2      3.5.1     ✔ tidyr        1.3.1
## ✔ infer        1.0.7     ✔ tune         1.3.0
## ✔ modeldata    1.4.0     ✔ workflows    1.2.0
## ✔ parsnip      1.3.1     ✔ workflowsets 1.1.0
## ✔ purrr        1.0.4     ✔ yardstick    1.3.2
## ✔ recipes      1.2.1

## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter()  masks stats::filter()
## ✖ dplyr::lag()     masks stats::lag()
## ✖ recipes::step()  masks stats::step()

library(stringr)

## 
## Attaching package: 'stringr'

## The following object is masked from 'package:recipes':
## 
##     fixed

library(patchwork)

the example in Figure 4.4

4.4.1 Linear Discriminant Analysis for

p

mu1 <- -1.25
mu2 <- 1.25
sigma1 <- 1
sigma2 <- 1
bayes_boundary <- (mu1 + mu2) / 2
p1 <- ggplot(data = tibble(x = seq(-4, 4, 0.1)), aes(x)) +
  stat_function(fun = dnorm, args = list(mean = mu1, sd = sigma1),
                geom = "line", size = 1.5, color = "forestgreen") +
  stat_function(fun = dnorm, args = list(mean = mu2, sd = sigma2),
                geom = "line", size = 1.5, color = "orchid") +
  geom_vline(xintercept = bayes_boundary, lty = 2, size = 1.5) +
  theme(axis.title.y = element_blank(), 
        axis.text.y = element_blank(),
        axis.ticks.y = element_blank())

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

set.seed(42)
d <- tribble(
  ~class, ~x,
  1, rnorm(20, mean = mu1, sd = sigma1),
  2, rnorm(20, mean = mu2, sd = sigma2)
) %>%
  unnest(x)
lda_boundary <-
  (mean(filter(d, class == 1)$x) + mean(filter(d, class == 2)$x)) / 2

p2 <- d %>%
  ggplot(aes(x, fill = factor(class), color = factor(class))) +
  geom_histogram(bins = 13, alpha = 0.5, position = "identity") +
  geom_vline(xintercept = bayes_boundary, lty = 2, size = 1.5) +
  geom_vline(xintercept = lda_boundary, lty = 1, size = 1.5) +
  scale_fill_manual(values = c("#50C878", "#B784A7")) +  # emerald & mauve
  scale_color_manual(values = c("#50C878", "#B784A7")) +
  theme(legend.position = "none")
p1 | p2

set.seed(2021)
d <- tribble(
  ~class, ~x,
  1, rnorm(1e3, mean = mu1, sd = sigma1),
  2, rnorm(1e3, mean = mu2, sd = sigma2)
) %>%
  unnest(x)
# The LDA boundary must be recomputed with the new data
lda_boundary <-
  (mean(filter(d, class == 1)$x) + mean(filter(d, class == 2)$x)) / 2

d %>%
  mutate(
    bayes_class = ifelse(x > bayes_boundary, 1, 2),
    lda_class = ifelse(x > lda_boundary, 1, 2)
  ) %>%
  summarise(
    `Bayes error rate` = mean(class == bayes_class),
    `LDA error rate` = mean(class == lda_class)
  )

## # A tibble: 1 × 2
##   `Bayes error rate` `LDA error rate`
##                <dbl>            <dbl>
## 1              0.104            0.107

4.4.2 Linear Discriminant Analysis for

p > 1

d <- crossing(x1 = seq(-2, 2, 0.1), x2 = seq(-2, 2, 0.1))
d1 <- d %>%
  bind_cols(
    prob = mvtnorm::dmvnorm(
      x = as.matrix(d),
      mean = c(0, 0), sigma = matrix(c(1, 0, 0, 1), nrow = 2)
    )
  )
d2 <- d %>%
  bind_cols(
    prob = mvtnorm::dmvnorm(
      x = as.matrix(d),
      mean = c(0, 0), sigma = matrix(c(1, 0.7, 0.7, 1), nrow = 2)
    )
  )
p1 <- d1 %>%
  ggplot(aes(x = x1, y = x2)) +
  geom_tile(aes(fill = prob)) +
  scale_x_continuous(expand = c(0, 0)) +
  scale_y_continuous(expand = c(0, 0)) +
  theme(legend.position = "none")
p2 <- d2 %>%
  ggplot(aes(x = x1, y = x2)) +
  geom_tile(aes(fill = prob)) +
  scale_x_continuous(expand = c(0, 0)) +
  scale_y_continuous(expand = c(0, 0)) +
  theme(legend.position = "none")
  p1 | p2

library(ISLR)  # or ISLR2 depending on your version
data(Default)
head(Default)

##   default student   balance    income
## 1      No      No  729.5265 44361.625
## 2      No     Yes  817.1804 12106.135
## 3      No      No 1073.5492 31767.139
## 4      No      No  529.2506 35704.494
## 5      No      No  785.6559 38463.496
## 6      No     Yes  919.5885  7491.559

lda_default_balance_student <- MASS::lda(default ~ balance + student, data = Default)
lda_default_balance_student

## Call:
## lda(default ~ balance + student, data = Default)
## 
## Prior probabilities of groups:
##     No    Yes 
## 0.9667 0.0333 
## 
## Group means:
##       balance studentYes
## No   803.9438  0.2914037
## Yes 1747.8217  0.3813814
## 
## Coefficients of linear discriminants:
##                     LD1
## balance     0.002244397
## studentYes -0.249059498

library(ISLR)
library(MASS)

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:patchwork':
## 
##     area

## The following object is masked from 'package:dplyr':
## 
##     select

data(Default)

lda_default_balance_student <- lda(default ~ balance + student, data = Default)

lda_default_balance_student

## Call:
## lda(default ~ balance + student, data = Default)
## 
## Prior probabilities of groups:
##     No    Yes 
## 0.9667 0.0333 
## 
## Group means:
##       balance studentYes
## No   803.9438  0.2914037
## Yes 1747.8217  0.3813814
## 
## Coefficients of linear discriminants:
##                     LD1
## balance     0.002244397
## studentYes -0.249059498

# The MASS package has a `select` function that overwrite `dplyr`,
#  fix that before it becomes a problem
select <- dplyr::select

mean(
  predict(lda_default_balance_student,
          newdata = Default)$class != Default$default
)

## [1] 0.0275

lda_pred <- bind_cols(
  pred_default = predict(lda_default_balance_student,
                         newdata = Default)$class,
  Default
)
lda_pred %>%
  count(pred_default, default) %>%
  pivot_wider(names_from = default, values_from = n, values_fill = 0) %>%
  mutate(Total = No + Yes) %>%
  gt(rowname_col = "pred_default") %>%
  tab_spanner(label = "True default status", columns = everything()) %>%
  tab_stubhead("Predicted") %>%
  grand_summary_rows(
    columns = c(No, Yes, Total),
    fns = list(Total = ~round(sum(.), 0))
  )

Predicted	True default status
Predicted	No	Yes	Total
No	9644	252	9896
Yes	23	81	104
Total	9667	333	10000

lda_posterior <- predict(lda_default_balance_student, newdata = Default)$posterior
head(lda_posterior)

##          No         Yes
## 1 0.9968680 0.003131975
## 2 0.9971925 0.002807531
## 3 0.9843970 0.015603046
## 4 0.9987769 0.001223133
## 5 0.9959254 0.004074582
## 6 0.9954627 0.004537289

lda_pred_20 <- bind_cols(
  Default,  # Make sure to use `Default` (uppercase D)
  posterior_prob_default = lda_posterior[, 2]
) %>%
  mutate(
    pred_default = ifelse(posterior_prob_default > 0.2, "Yes", "No")
  )
lda_pred_20 %>%
  count(pred_default, default) %>%
  pivot_wider(names_from = default, values_from = n, values_fill = 0) %>%
  mutate(Total = No + Yes) %>%
  gt(rowname_col = "pred_default") %>%
  tab_spanner(label = "True default status", columns = everything()) %>%
  tab_stubhead("Predicted") %>%
  grand_summary_rows(
    columns = c(No, Yes, Total),
    fns = list(Total = ~round(sum(.), 0))
  )

Predicted	True default status
Predicted	No	Yes	Total
No	9432	138	9570
Yes	235	195	430
Total	9667	333	10000

lda_roc <-
  yardstick::roc_curve(
    lda_pred_20,
    # Specify the class probability and the truth variables
    posterior_prob_default, truth = default,
    # This argument specifies which level of truth (default) is considered
    #  "positive", so it will flip the ROC curve vertically
    event_level = "second"
  )
autoplot(lda_roc)

yardstick::roc_auc(
  lda_pred_20,
  posterior_prob_default, truth = default,
  event_level = "second"
)

## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 roc_auc binary         0.950

glm_default_balance_student <- glm(default ~ balance + student,
                                   data = Default,
                                   family = binomial)

summary(glm_default_balance_student)

## 
## Call:
## glm(formula = default ~ balance + student, family = binomial, 
##     data = Default)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.075e+01  3.692e-01 -29.116  < 2e-16 ***
## balance      5.738e-03  2.318e-04  24.750  < 2e-16 ***
## studentYes  -7.149e-01  1.475e-01  -4.846 1.26e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2920.6  on 9999  degrees of freedom
## Residual deviance: 1571.7  on 9997  degrees of freedom
## AIC: 1577.7
## 
## Number of Fisher Scoring iterations: 8

glm_pred <- bind_cols(
  Default,  # Use the correct dataset name
  glm_prob_default = predict(
    glm_default_balance_student,
    newdata = Default, type = "response"
  )
)
yardstick::roc_auc(
  glm_pred,
  glm_prob_default, truth = default,
  event_level = "second"
)

## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 roc_auc binary         0.950

4.4.4 Naive Bayes

# Make sure klaR is loaded
library(klaR)

# Fit Naive Bayes model
nb_default <- NaiveBayes(default ~ balance + student, data = Default)

nb_pred <- bind_cols(
  Default,
  nb_prob_default = predict(nb_default, newdata = Default)$posterior[,2]
)

nb_pred <- nb_pred %>%
  mutate(
    pred_default_0.5 = ifelse(nb_prob_default > 0.5, "Yes", "No"),
    pred_default_0.2 = ifelse(nb_prob_default > 0.2, "Yes", "No")
  )
nb_pred %>%
  count(pred_default_0.5, default) %>%
  pivot_wider(names_from = default, values_from = n) %>%
  mutate(Total = No + Yes) %>%
  gt(rowname_col = "pred_default_0.5") %>%
  tab_spanner(label = "True default status", columns = everything()) %>%
  tab_stubhead("Predicted") %>%
  grand_summary_rows(fns = list(Total = ~round(sum(.), 0)))

Predicted	True default status
Predicted	No	Yes	Total
No	9621	244	9865
Yes	46	89	135
Total	9667	333	10000

nb_pred %>%
  count(pred_default_0.2, default) %>%
  pivot_wider(names_from = default, values_from = n) %>%
  mutate(Total = No + Yes) %>%
  gt(rowname_col = "pred_default_0.2") %>%
  tab_spanner(label = "True default status", columns = everything()) %>%
  tab_stubhead("Predicted") %>%
  grand_summary_rows(fns = list(Total = ~round(sum(.), 0)))

Predicted	True default status
Predicted	No	Yes	Total
No	9339	130	9469
Yes	328	203	531
Total	9667	333	10000

nb_pred %>%
  select(default, pred_default_0.2, pred_default_0.5) %>%
  pivot_longer(c(pred_default_0.5, pred_default_0.2),
               names_to = "threshold", values_to = "pred_default") %>%
  mutate(threshold = as.numeric(str_remove(threshold, "pred_default_"))) %>%
  group_by(threshold) %>%
  summarise(
    overall_error = mean(default != pred_default),
    sensitivity = sum(default == "Yes" & pred_default == "Yes") /
      sum(default == "Yes"),
    specificity = sum(default == "No" & pred_default == "No") /
      sum(default == "No"),
    .groups = "drop"
  ) %>%
  mutate(across(everything(), scales::percent)) %>%
  gt()

threshold	overall_error	sensitivity	specificity
20%	4.6%	61%	96.6%
50%	2.9%	27%	99.5%

4.5 A Comparison of Classification Methods

4.5.2 An Empirical Comparison

make_blobs <- function(
  n_samples = 40, n_features = 2,
  # By default, class 1 is centered at (0, 0) and class 2 at (1, 1)
  cluster_centers = matrix(c(0, 0, 1, 1), nrow = 2, byrow = TRUE),
  # By default, the two features are uncorrelated with variance = 1
  cluster_covar = matrix(c(1, 0, 0, 1), nrow = 2),
  dist = c("norm", "t"), t_df = 5
) {
  if (ncol(cluster_centers) != n_features) {
    stop("Dimensionality of centers must equal number of features")
  }
  if ((nrow(cluster_covar) != n_features) |
      (ncol(cluster_covar) != n_features)) {
    stop("Dimensionality of covariance matrix must match number of features")
  }
  dist <- match.arg(dist)
  
  # Equally divides each of `n_samples` into the different categories according
  #  to the number of provided classes
  categories <- rep(1:nrow(cluster_centers), length.out = n_samples)
  
  if (dist == "norm") {
    points <- MASS::mvrnorm(n = n_samples, mu = c(0, 0), Sigma = cluster_covar)
  } else if (dist == "t") {
    points <- mvtnorm::rmvt(n = n_samples, delta = c(0, 0), df = t_df,
                            sigma = cluster_covar)
  }
  points <- points + cluster_centers[categories, ]
  
  colnames(points) <- c("x", "y")
  as_tibble(points) %>%
    bind_cols(category = factor(categories))
}

install.packages("dunnr")

## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)

## Warning: package 'dunnr' is not available for this version of R
## 
## A version of this package for your version of R might be available elsewhere,
## see the ideas at
## https://cran.r-project.org/doc/manuals/r-patched/R-admin.html#Installing-packages

set.seed(22)
library(ggplot2)
library(tidyr)

# Create your example data
tribble(
  ~scenario, ~data,
  "Scenario 1", make_blobs(n_samples = 40),
  "Scenario 2", make_blobs(n_samples = 40,
                           cluster_covar = matrix(c(1, -0.5, -0.5, 1),
                                                  nrow = 2)),
  "Scenario 3", make_blobs(n_samples = 100,
                           cluster_covar = matrix(c(1, -0.5, -0.5, 1),
                                                  nrow = 2),
                           dist = "t")
) %>%
  unnest(data) %>%
  ggplot(aes(x, y, color = category, shape = category)) +
  geom_point(size = 3) +
  facet_wrap(~scenario) +
  theme(
    strip.background = element_rect(color = "black", fill = "lightgray", size = 1, linetype = "solid"),
    strip.text = element_text(face = "bold")
  )

## Warning: The `size` argument of `element_rect()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

sim_linear_train <- tribble(
  ~ scenario, ~ n_samples, ~ corr, ~ dist,
  "Scenario 1", 40, 0.0, "norm",
  "Scenario 2", 40, -0.5, "norm",
  "Scenario 3", 40, -0.5, "t"
) %>%
  crossing(sim = 1:100) %>%
  rowwise() %>%
  mutate(
    train_data = list(make_blobs(
      n_samples = n_samples,
      cluster_covar = matrix(c(1, corr, corr, 1), nrow = 2),
      dist = dist
    ))
  ) %>%
  ungroup()

sim_linear_test <- sim_linear_train %>%
  distinct(scenario, corr, dist) %>%
  rowwise() %>%
  mutate(
    test_data = list(make_blobs(
      n_samples = 1000,
      cluster_covar = matrix(c(1, corr, corr, 1), nrow = 2),
      dist = dist
    ))
  ) %>%
  ungroup()

library(tidymodels)
library(discrim) # this needs to be loaded separately for `discrim_*()`

## 
## Attaching package: 'discrim'

## The following object is masked from 'package:dials':
## 
##     smoothness

models <- tribble(
  ~ model_label, ~ model,
  "KNN-1", nearest_neighbor(mode = "classification", neighbors = 1),
  "KNN-CV", nearest_neighbor(mode = "classification", neighbors = tune()),
  "LDA", discrim_linear(),
  "Logistic", logistic_reg(),
  "NBayes", naive_Bayes(engine = "klaR") %>%
    # The klaR engine has an argument usekernel that is always TRUE
    # We have to set it to FALSE to not use KDE, and instead use Gaussian
    #  distributions, as in the text
    set_args(usekernel = FALSE),
  "QDA", discrim_quad()
)

# A helper function for fitting on a training set and getting accuracy from
#  a testing set
calc_test_accuracy <- function(model_label, train_data, test_data, model) {
  wf <- workflow() %>%
    add_recipe(recipe(category ~ x + y, data = train_data)) %>%
    add_model(model)
    
  if (model_label == "KNN-CV") {
    # 5 fold cross-validation
    train_data_folds <- vfold_cv(train_data, v = 5)
    tune_res <- wf %>%
      tune_grid(
        resamples = train_data_folds,
        # Try 1 to 10 neighbors
        grid = tibble(neighbors = 1:10)
      )
    # Overwrite the workflow with the best `neighbors` value by CV accuracy
    wf <- finalize_workflow(wf, select_best(tune_res, "accuracy")) 
  }
  
  wf %>%
    fit(data = train_data) %>%
    augment(test_data) %>%
    accuracy(truth = category, estimate = .pred_class) %>%
    pull(.estimate)
}

# Install and load required packages
install.packages("tictoc")  # For timing code execution

## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)

library(tictoc)
library(dplyr)              # Data manipulation
library(purrr)              # Functional programming (pmap_dbl)
library(tidyr)              # Data tidying (crossing)
library(ggplot2)            # Visualization (if needed)
library(kknn)               # K-Nearest Neighbors model (if used)
# Evaluate model performance

install.packages("parsnip")

## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)

install.packages("recipes")  # If you are working with recipes for model workflows

## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)

library(parsnip)
library(recipes)
# Example data
sim_linear_train <- data.frame(
  scenario = rep(1:3, each = 5),
  train_data = rep(list(data.frame(x = rnorm(5), y = rnorm(5))), 3)
)

sim_linear_test <- data.frame(
  scenario = rep(1:3, each = 5),
  test_data = rep(list(data.frame(x = rnorm(5), y = rnorm(5))), 3)
)

models <- data.frame(model_label = c("KNN", "SVM"))

# Function to calculate test accuracy (as an example)
calc_test_accuracy <- function(model_label, train_data, test_data, model) {
  # Placeholder logic for calculating test accuracy
  return(runif(1))  # Example: return a random accuracy for now
}

# Create the result

colnames(sim_linear_test)

## [1] "scenario"      "test_data.x"   "test_data.y"   "test_data.x.1"
## [5] "test_data.y.1" "test_data.x.2" "test_data.y.2"

str(sim_linear_test$test_data)

##  NULL

sim_linear_test_clean <- sim_linear_test %>%
  rename(test_data = test_data.x) %>%
  select(scenario, test_data)

# View the result

sim_linear_test_clean <- sim_linear_test %>%
  rename(test_data = test_data.x) %>%
  select(scenario, test_data)

4.8 Exercises

13. Predict returns with Weekly

weekly <- ISLR2::Weekly

skimr::skim(weekly)

Data summary
Name	weekly
Number of rows	1089
Number of columns	9
_______________________
Column type frequency:
factor	1
numeric	8
________________________
Group variables	None

Variable type: factor

skim_variable	n_missing	complete_rate	ordered	n_unique	top_counts
Direction	0	1	FALSE	2	Up: 605, Dow: 484

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
Year	1	2000.05	6.03	1990.00	1995.00	2000.00	2005.00	2010.00	▇▆▆▆▆
Lag1	1	0.15	2.36	-18.20	-1.15	0.24	1.41	12.03	▁▁▆▇▁
Lag2	1	0.15	2.36	-18.20	-1.15	0.24	1.41	12.03	▁▁▆▇▁
Lag3	1	0.15	2.36	-18.20	-1.16	0.24	1.41	12.03	▁▁▆▇▁
Lag4	1	0.15	2.36	-18.20	-1.16	0.24	1.41	12.03	▁▁▆▇▁
Lag5	1	0.14	2.36	-18.20	-1.17	0.23	1.41	12.03	▁▁▆▇▁
Volume	1	1.57	1.69	0.09	0.33	1.00	2.05	9.33	▇▂▁▁▁
Today	1	0.15	2.36	-18.20	-1.15	0.24	1.41	12.03	▁▁▆▇▁

weekly %>%
  ggplot(aes(x = factor(Year), y = Volume)) +
  geom_line() +
  theme_minimal()

install.packages("forcats")

## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)

library(forcats)
weekly <- weekly %>% mutate(Direction = fct_rev(Direction))

library(gt)
library(tidymodels)

lr_weekly_fit <- logistic_reg() %>%
  set_engine("glm") %>%
  fit(Direction ~ Year + Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume, data = weekly)
# Load the required libraries
library(broom)
library(gt)

# Tidy the model and display it with gt
lr_weekly_tidy <- tidy(lr_weekly_fit)
lr_weekly_tidy %>%
  gt()

term	estimate	std.error	statistic	p.value
(Intercept)	-17.225822230	37.89052190	-0.45462088	0.6493820
Year	0.008499918	0.01899083	0.44758011	0.6544563
Lag1	0.040687571	0.02644652	1.53848459	0.1239302
Lag2	-0.059448637	0.02697031	-2.20422531	0.0275085
Lag3	0.015477987	0.02670309	0.57963289	0.5621622
Lag4	0.027316278	0.02648478	1.03139539	0.3023554
Lag5	0.014022185	0.02640947	0.53095285	0.5954515
Volume	-0.003256253	0.06883640	-0.04730423	0.9622708

lr_weekly_fit_conf_mat <- augment(lr_weekly_fit, weekly) %>%
  conf_mat(truth = Direction, estimate = .pred_class)
lr_weekly_fit_conf_mat

##           Truth
## Prediction  Up Down
##       Up   558  428
##       Down  47   56

weekly_train <- weekly %>% filter(Year <= 2008)
weekly_test <- weekly %>% filter(Year > 2008)

lr_weekly_fit_lag2 <-
  logistic_reg() %>%
  fit(Direction ~ Lag2, data = weekly_train)
lr_weekly_fit_lag2_conf_mat <-
  augment(lr_weekly_fit_lag2, weekly_test) %>%
  conf_mat(truth = Direction, estimate = .pred_class)
lr_weekly_fit_lag2_conf_mat

##           Truth
## Prediction Up Down
##       Up   56   34
##       Down  5    9

install.packages("discrim")

## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)

library(discrim)
model_fits <- list(
  "logistic" = lr_weekly_fit_lag2,
  "LDA" = discrim_linear() %>% fit(Direction ~ Lag2, data = weekly_train),
  "QDA" = discrim_quad() %>% fit(Direction ~ Lag2, data = weekly_train),
  "KNN1" = nearest_neighbor(mode = "classification", neighbors = 1) %>%
    fit(Direction ~ Lag2, data = weekly_train),
  "NB" = naive_Bayes() %>% fit(Direction ~ Lag2, data = weekly_train)
)

weekly_metrics <- metric_set(accuracy, sens, spec, ppv)
imap_dfr(
  model_fits,
  ~augment(.x, new_data = weekly_test) %>%
    weekly_metrics(truth = Direction, estimate = .pred_class),
  .id = "model"
) %>%
  select(model, .metric, .estimate) %>%
  pivot_wider(names_from = .metric, values_from = .estimate) %>%
  gt(rowname_col = "model") %>%
  fmt_percent(columns = -model)

	accuracy	sens	spec	ppv
logistic	62.50%	91.80%	20.93%	62.22%
LDA	62.50%	91.80%	20.93%	62.22%
QDA	58.65%	100.00%	0.00%	58.65%
KNN1	50.00%	49.18%	51.16%	58.82%
NB	60.58%	91.80%	16.28%	60.87%

14. Predict gas mileage with Auto

auto <- ISLR2::Auto %>%
  mutate(mpg01 = ifelse(mpg > median(mpg), 1, 0),
         mpg01 = factor(mpg01))
glimpse(auto)

## Rows: 392
## Columns: 10
## $ mpg          <dbl> 18, 15, 18, 16, 17, 15, 14, 14, 14, 15, 15, 14, 15, 14, 2…
## $ cylinders    <int> 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 4, 6, 6, 6, 4, …
## $ displacement <dbl> 307, 350, 318, 304, 302, 429, 454, 440, 455, 390, 383, 34…
## $ horsepower   <int> 130, 165, 150, 150, 140, 198, 220, 215, 225, 190, 170, 16…
## $ weight       <int> 3504, 3693, 3436, 3433, 3449, 4341, 4354, 4312, 4425, 385…
## $ acceleration <dbl> 12.0, 11.5, 11.0, 12.0, 10.5, 10.0, 9.0, 8.5, 10.0, 8.5, …
## $ year         <int> 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 7…
## $ origin       <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 3, …
## $ name         <fct> chevrolet chevelle malibu, buick skylark 320, plymouth sa…
## $ mpg01        <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, …

auto <- auto %>%
  mutate(origin = factor(origin, levels = 1:3,
                         labels = c("American", "European", "Japanese")))

auto %>%
  select(-name, -origin, -mpg) %>%
  pivot_longer(-mpg01, names_to = "var", values_to = "val") %>%
  ggplot(aes(y = mpg01, x = val)) +
  geom_boxplot(aes(fill = factor(mpg01))) +
  facet_wrap(~var, scales = "free_x") +
  theme(
    legend.position = "none",
    strip.background = element_rect(color = "black", size = 1),  # Adds border around facet labels
    strip.text = element_text(size = 12)  # Optional: Adjusts the text size of facet labels
  )

auto %>%
  count(origin, mpg01) %>%
  ggplot(aes(y = origin, x = mpg01)) +
  geom_tile(aes(fill = n)) +
  geom_text(aes(label = n), color = "white") +
  scale_x_discrete(expand = c(0, 0)) +
  scale_y_discrete(expand = c(0, 0)) +
  theme(legend.position = "none")

install.packages("RColorBrewer")

## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)

library(RColorBrewer)
auto %>% select(-name, -mpg01, -origin) %>%
  corrr::correlate(method = "pearson", quiet = TRUE) %>%
  gt(rowname_col = "term") %>%
  gt::fmt_missing(columns = everything(), missing_text = "") %>%
  gt::data_color(
    columns = everything(),
    colors = scales::col_numeric(
      palette = brewer.pal(5, "RdBu"),  # Using a diverging palette (e.g., "RdBu")
      domain = c(-1, 1)
    )
  ) %>%
  gt::fmt_number(columns = everything(), decimals = 2)

## Warning: Since gt v0.6.0 `fmt_missing()` is deprecated and will soon be removed.
## ℹ Use `sub_missing()` instead.
## This warning is displayed once every 8 hours.

## Warning: Since gt v0.9.0, the `colors` argument has been deprecated.
## • Please use the `fn` argument instead.
## This warning is displayed once every 8 hours.

	mpg	cylinders	displacement	horsepower	weight	acceleration	year
mpg		−0.78	−0.81	−0.78	−0.83	0.42	0.58
cylinders	−0.78		0.95	0.84	0.90	−0.50	−0.35
displacement	−0.81	0.95		0.90	0.93	−0.54	−0.37
horsepower	−0.78	0.84	0.90		0.86	−0.69	−0.42
weight	−0.83	0.90	0.93	0.86		−0.42	−0.31
acceleration	0.42	−0.50	−0.54	−0.69	−0.42		0.29
year	0.58	−0.35	−0.37	−0.42	−0.31	0.29

set.seed(49)
auto_split <- initial_split(auto, prop = 3 / 4)
auto_train <- training(auto_split)
auto_test <- testing(auto_split)

auto_train %>% count(mpg01)

##   mpg01   n
## 1     0 148
## 2     1 146

auto_test %>% count(mpg01)

##   mpg01  n
## 1     0 48
## 2     1 50

auto_recipe <- recipe(
  mpg01 ~ cylinders + displacement + horsepower + weight + acceleration +
    year + origin,
  data = auto_train
) %>%
  # Normalize numerical predictors to work with KNN
  step_normalize(all_numeric_predictors()) %>%
  step_dummy(origin)
auto_workflow <- workflow() %>%
  add_recipe(auto_recipe)

model_fits <-
  list(
    "LDA" = auto_workflow %>%
      add_model(discrim_linear()) %>%
      fit(data = auto_train),
    "QDA" = auto_workflow %>%
      add_model(discrim_quad()) %>%
      fit(data = auto_train),
    "logistic" = auto_workflow %>%
      add_model(logistic_reg()) %>%
      fit(data = auto_train),
    "NB" = auto_workflow %>%
      add_model(naive_Bayes()) %>%
      fit(data = auto_train),
    "KNN1" = auto_workflow %>%
      add_model(nearest_neighbor(mode = "classification", neighbors = 1)) %>%
      fit(data = auto_train),
    "KNN3" = auto_workflow %>%
      add_model(nearest_neighbor(mode = "classification", neighbors = 3)) %>%
      fit(data = auto_train),
    "KNN5" = auto_workflow %>%
      add_model(nearest_neighbor(mode = "classification", neighbors = 5)) %>%
      fit(data = auto_train),
    "KNN7" = auto_workflow %>%
      add_model(nearest_neighbor(mode = "classification", neighbors = 7)) %>%
      fit(data = auto_train)
  )
auto_metrics <- metric_set(accuracy, sens, spec, ppv)
imap_dfr(
  model_fits,
  ~augment(.x, new_data = auto_test) %>%
    auto_metrics(truth = mpg01, estimate = .pred_class),
  .id = "model"
) %>%
  select(model, .metric, .estimate) %>%
  pivot_wider(names_from = .metric, values_from = .estimate) %>%
  arrange(desc(accuracy)) %>%
  gt(rowname_col = "model") %>%
  fmt_percent(columns = -model, decimals = 1)

	accuracy	sens	spec	ppv
LDA	90.8%	87.5%	94.0%	93.3%
logistic	90.8%	91.7%	90.0%	89.8%
QDA	89.8%	85.4%	94.0%	93.2%
NB	89.8%	87.5%	92.0%	91.3%
KNN1	89.8%	93.8%	86.0%	86.5%
KNN3	89.8%	93.8%	86.0%	86.5%
KNN5	89.8%	91.7%	88.0%	88.0%
KNN7	89.8%	91.7%	88.0%	88.0%

16. Predict crime rate with Boston

boston <- ISLR2::Boston %>%
  mutate(
    crim01 = ifelse(crim > median(crim), 1, 0),
    crim01 = factor(crim01),
    # Convert the binary chas variable to TRUE/FALSE
    chas = chas == 1
  )
glimpse(boston)

## Rows: 506
## Columns: 14
## $ crim    <dbl> 0.00632, 0.02731, 0.02729, 0.03237, 0.06905, 0.02985, 0.08829,…
## $ zn      <dbl> 18.0, 0.0, 0.0, 0.0, 0.0, 0.0, 12.5, 12.5, 12.5, 12.5, 12.5, 1…
## $ indus   <dbl> 2.31, 7.07, 7.07, 2.18, 2.18, 2.18, 7.87, 7.87, 7.87, 7.87, 7.…
## $ chas    <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,…
## $ nox     <dbl> 0.538, 0.469, 0.469, 0.458, 0.458, 0.458, 0.524, 0.524, 0.524,…
## $ rm      <dbl> 6.575, 6.421, 7.185, 6.998, 7.147, 6.430, 6.012, 6.172, 5.631,…
## $ age     <dbl> 65.2, 78.9, 61.1, 45.8, 54.2, 58.7, 66.6, 96.1, 100.0, 85.9, 9…
## $ dis     <dbl> 4.0900, 4.9671, 4.9671, 6.0622, 6.0622, 6.0622, 5.5605, 5.9505…
## $ rad     <int> 1, 2, 2, 3, 3, 3, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,…
## $ tax     <dbl> 296, 242, 242, 222, 222, 222, 311, 311, 311, 311, 311, 311, 31…
## $ ptratio <dbl> 15.3, 17.8, 17.8, 18.7, 18.7, 18.7, 15.2, 15.2, 15.2, 15.2, 15…
## $ lstat   <dbl> 4.98, 9.14, 4.03, 2.94, 5.33, 5.21, 12.43, 19.15, 29.93, 17.10…
## $ medv    <dbl> 24.0, 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9, 15…
## $ crim01  <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,…

boston %>%
  select(-chas) %>%
  pivot_longer(-crim01, names_to = "var", values_to = "val") %>%
  ggplot(aes(y = crim01, x = val)) +
  geom_boxplot(aes(fill = factor(crim01))) +
  facet_wrap(~var, scales = "free_x") +
  theme(
    legend.position = "none",
    strip.background = element_rect(color = "black", size = 1),  # Add borders to facet labels
    strip.text = element_text(size = 12)  # Adjust text size of facet labels
  )

boston %>%
  count(chas, crim01) %>%
  ggplot(aes(y = chas, x = crim01)) +
  geom_tile(aes(fill = n)) +
  geom_text(aes(label = n), color = "white") +
  scale_x_discrete(expand = c(0, 0)) +
  scale_y_discrete(expand = c(0, 0)) +
  theme(legend.position = "none")

set.seed(98)
# By default, splits 3:1
boston_split <- initial_split(boston)
boston_train <- training(boston_split)
boston_test <- testing(boston_split)

boston_models <- list(
  "LDA" = discrim_linear(),
  "QDA" = discrim_quad(),
  "logistic" = logistic_reg(),
  "NB" = naive_Bayes(),
  "KNN1" = nearest_neighbor(mode = "classification", neighbors = 1),
  "KNN3" = nearest_neighbor(mode = "classification", neighbors = 3),
  "KNN5" = nearest_neighbor(mode = "classification", neighbors = 5),
  "KNN7" = nearest_neighbor(mode = "classification", neighbors = 7)
)
boston_recs <- list(
  "rec1" = recipe(
    crim01 ~ age + dis + indus + lstat + medv + nox + ptratio + rad + tax + zn,
    data = boston_train
  ) %>%
    step_normalize(all_numeric_predictors()),
  # Drop medv and lstat
  "rec2" = recipe(
    crim01 ~ age + dis + indus + nox + ptratio + rad + tax + zn,
    data = boston_train
  ) %>%
    step_normalize(all_numeric_predictors()),
  # Drop ptratio and tax
  "rec3" = recipe(
    crim01 ~ age + dis + indus + nox  + rad + zn,
    data = boston_train
  ) %>%
    step_normalize(all_numeric_predictors())
)

boston_fits <-
  map(
    boston_models,
    function(model) {
      map(
        boston_recs,
        ~workflow() %>%
          add_model(model) %>%
          add_recipe(.x) %>%
          fit(data = boston_train)
      )
    }
  )

boston_metrics <- metric_set(accuracy, sens, spec, ppv)
imap_dfr(
  boston_fits,
  function(fit, y) {
    imap_dfr(
      fit,
      ~augment(.x, new_data = boston_test) %>%
        boston_metrics(truth = crim01, estimate = .pred_class),
      .id = "recipe"
    )
  },
  .id = "model"
) %>%
  select(model, recipe, .metric, .estimate) %>%
  pivot_wider(names_from = .metric, values_from = .estimate) %>%
  arrange(recipe, desc(accuracy)) %>%
  group_by(recipe) %>%
  gt(rowname_col = "model") %>%
  fmt_percent(columns = -model, decimals = 1)

	accuracy	sens	spec	ppv
rec1
QDA	95.3%	94.6%	95.8%	94.6%
KNN1	94.5%	98.2%	91.5%	90.2%
KNN3	94.5%	98.2%	91.5%	90.2%
KNN5	94.5%	98.2%	91.5%	90.2%
KNN7	94.5%	98.2%	91.5%	90.2%
logistic	92.1%	98.2%	87.3%	85.9%
NB	85.0%	80.4%	88.7%	84.9%
LDA	83.5%	91.1%	77.5%	76.1%
rec2
KNN5	96.1%	100.0%	93.0%	91.8%
KNN1	95.3%	98.2%	93.0%	91.7%
KNN3	95.3%	98.2%	93.0%	91.7%
KNN7	95.3%	98.2%	93.0%	91.7%
QDA	94.5%	94.6%	94.4%	93.0%
logistic	89.0%	87.5%	90.1%	87.5%
LDA	86.6%	96.4%	78.9%	78.3%
NB	84.3%	80.4%	87.3%	83.3%
rec3
KNN5	95.3%	98.2%	93.0%	91.7%
KNN1	94.5%	98.2%	91.5%	90.2%
KNN3	94.5%	98.2%	91.5%	90.2%
KNN7	94.5%	96.4%	93.0%	91.5%
logistic	89.0%	87.5%	90.1%	87.5%
QDA	86.6%	91.1%	83.1%	81.0%
NB	86.6%	76.8%	94.4%	91.5%
LDA	84.3%	91.1%	78.9%	77.3%

boston_recs$rec2$term_info

## # A tibble: 9 × 4
##   variable type      role      source  
##   <chr>    <list>    <chr>     <chr>   
## 1 age      <chr [2]> predictor original
## 2 dis      <chr [2]> predictor original
## 3 indus    <chr [2]> predictor original
## 4 nox      <chr [2]> predictor original
## 5 ptratio  <chr [2]> predictor original
## 6 rad      <chr [2]> predictor original
## 7 tax      <chr [2]> predictor original
## 8 zn       <chr [2]> predictor original
## 9 crim01   <chr [3]> outcome   original

Exercise 05: AFS 3A 113035124

Khishigmaa Bold-Erdene

2025-04-01

the example in Figure 4.4

4.4.1 Linear Discriminant Analysis for

p

4.4.2 Linear Discriminant Analysis for

4.4.4 Naive Bayes

4.5 A Comparison of Classification Methods

4.5.2 An Empirical Comparison

4.8 Exercises

13. Predict returns with Weekly

14. Predict gas mileage with Auto

16. Predict crime rate with Boston