# 4.4.1 Linear Discriminant Analysis for p = 1
# One-predictor example: each class is described by a normal density with its
# own mean; both classes use a standard deviation of 1.
sigma1 <- 1
sigma2 <- 1
mu1 <- -1.25
mu2 <- 1.25
# The Bayes decision boundary used below is the midpoint of the two means.
bayes_boundary <- (mu2 + mu1) / 2
library(ggplot2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.4 ✔ tibble 3.2.1
## ✔ purrr 1.0.4 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidyverse) # Loads ggplot2, dplyr, and other useful packages
# Means and standard deviations of the two class densities, restated so this
# chunk can be run on its own.
mu1 <- -1.25
sigma1 <- 1
mu2 <- 1.25
sigma2 <- 1
# Midpoint between the class means, drawn as the Bayes boundary in the plots.
bayes_boundary <- 0.5 * (mu1 + mu2)
# Left panel: the two class-conditional normal density curves on [-4, 4] with
# the Bayes decision boundary drawn as a dashed vertical line.
# Line thickness is set with `linewidth`; using `size` for lines has been
# deprecated since ggplot2 3.4.0 and triggers a lifecycle warning.
p1 <- ggplot(data = tibble(x = seq(-4, 4, 0.1)), aes(x)) +
  stat_function(
    fun = dnorm, args = list(mean = mu1, sd = sigma1),
    geom = "line", linewidth = 1.5, color = "green" # placeholder color
  ) +
  stat_function(
    fun = dnorm, args = list(mean = mu2, sd = sigma2),
    geom = "line", linewidth = 1.5, color = "purple" # placeholder color
  ) +
  geom_vline(xintercept = bayes_boundary, lty = 2, linewidth = 1.5) +
  # Manually remove the y-axis text, ticks and title
  theme(
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    axis.title.y = element_blank()
  )
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Draw 20 observations from each class density (class 1 first, then class 2,
# so the random number stream is consumed in the same order as before).
set.seed(42)
d <- tibble(
  class = rep(c(1, 2), each = 20),
  x = c(
    rnorm(20, mean = mu1, sd = sigma1),
    rnorm(20, mean = mu2, sd = sigma2)
  )
)
# Estimated boundary: midpoint between the two sample means.
lda_boundary <- with(d, (mean(x[class == 1]) + mean(x[class == 2])) / 2)
# Right panel: overlaid histograms of the simulated sample by class, with the
# Bayes boundary (dashed line) and the boundary estimated from the sample
# means (solid line).
p2 <- d %>%
  ggplot(aes(x, fill = factor(class), color = factor(class))) +
  geom_histogram(bins = 13, alpha = 0.5, position = "identity") +
  # `linewidth` replaces the `size` aesthetic for lines, which has been
  # deprecated since ggplot2 3.4.0
  geom_vline(xintercept = bayes_boundary, lty = 2, linewidth = 1.5) +
  geom_vline(xintercept = lda_boundary, lty = 1, linewidth = 1.5) +
  scale_fill_manual(values = c("green", "purple")) +
  scale_color_manual(values = c("green", "purple")) +
  theme(legend.position = "none")
# Install patchwork only when it is not already available, so re-running the
# script does not reinstall the package (or require network access) each time.
if (!requireNamespace("patchwork", quietly = TRUE)) {
  install.packages("patchwork")
}
library(patchwork)
# Combine the two plots side by side (patchwork's `|` operator)
p1 | p2

# Larger sample used to compare error rates: 1000 draws per class, class 1
# drawn before class 2 so the random number stream is consumed in the same
# order as before.
set.seed(2021)
d <- tibble(
  class = rep(c(1, 2), each = 1e3),
  x = c(
    rnorm(1e3, mean = mu1, sd = sigma1),
    rnorm(1e3, mean = mu2, sd = sigma2)
  )
)
# The LDA boundary must be recomputed from the new data: it is the midpoint
# between the two sample means.
lda_boundary <- with(d, (mean(x[class == 1]) + mean(x[class == 2])) / 2)
# Classify every simulated point with both boundaries and report the share of
# misclassified observations.
# Class 1 has the smaller mean (mu1 < mu2), so points to the right of a
# boundary are assigned to class 2. An observation is an error when its true
# class differs from the predicted one. (The previous version assigned the
# classes the other way round and counted matches, which gave the same numbers
# but left the *_class columns mislabelled.)
d %>%
  mutate(
    bayes_class = ifelse(x > bayes_boundary, 2, 1),
    lda_class = ifelse(x > lda_boundary, 2, 1)
  ) %>%
  summarise(
    `Bayes error rate` = mean(class != bayes_class),
    `LDA error rate` = mean(class != lda_class)
  )
## # A tibble: 1 × 2
## `Bayes error rate` `LDA error rate`
## <dbl> <dbl>
## 1 0.104 0.107
# 4.4.2 Linear Discriminant Analysis for p > 1
# Evaluation grid: every combination of x1 and x2 from -2 to 2 in steps of 0.1
d <- crossing(x1 = seq(-2, 2, 0.1), x2 = seq(-2, 2, 0.1))
# Bivariate normal density at each grid point, zero means and identity
# covariance matrix
d1 <- mutate(
  d,
  prob = mvtnorm::dmvnorm(as.matrix(d), mean = c(0, 0), sigma = diag(2))
)
# Same means and unit variances, but with covariance 0.7 between x1 and x2
d2 <- mutate(
  d,
  prob = mvtnorm::dmvnorm(
    as.matrix(d),
    mean = c(0, 0),
    sigma = rbind(c(1, 0.7), c(0.7, 1))
  )
)
# Both panels use identical layers, so build them with one helper: a tile plot
# of `prob` over the (x1, x2) grid with no axis padding and no legend.
plot_density_tiles <- function(grid) {
  ggplot(grid, aes(x = x1, y = x2)) +
    geom_tile(aes(fill = prob)) +
    scale_x_continuous(expand = c(0, 0)) +
    scale_y_continuous(expand = c(0, 0)) +
    theme(legend.position = "none")
}
p1 <- plot_density_tiles(d1)
p2 <- plot_density_tiles(d2)
# Show the two densities side by side
p1 | p2

#4.8 Exercises
# Take the Weekly data set from ISLR2 and print a skimr summary of its columns
weekly <- ISLR2::Weekly
weekly |> skimr::skim()
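## Data summary
##
## | Name                     | weekly |
## | Number of rows           | 1089   |
## | Number of columns        | 9      |
## | _______________________  |        |
## | Column type frequency:   |        |
## | factor                   | 1      |
## | numeric                  | 8      |
## | ________________________ |        |
## | Group variables          | None   |
##
## Variable type: factor
##
## | skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts        |
## |---------------|-----------|---------------|---------|----------|-------------------|
## | Direction     | 0         | 1             | FALSE   | 2        | Up: 605, Dow: 484 |
##
## Variable type: numeric
##
## | skim_variable | n_missing | complete_rate | mean    | sd   | p0      | p25     | p50     | p75     | p100    | hist  |
## |---------------|-----------|---------------|---------|------|---------|---------|---------|---------|---------|-------|
## | Year          | 0         | 1             | 2000.05 | 6.03 | 1990.00 | 1995.00 | 2000.00 | 2005.00 | 2010.00 | ▇▆▆▆▆ |
## | Lag1          | 0         | 1             | 0.15    | 2.36 | -18.20  | -1.15   | 0.24    | 1.41    | 12.03   | ▁▁▆▇▁ |
## | Lag2          | 0         | 1             | 0.15    | 2.36 | -18.20  | -1.15   | 0.24    | 1.41    | 12.03   | ▁▁▆▇▁ |
## | Lag3          | 0         | 1             | 0.15    | 2.36 | -18.20  | -1.16   | 0.24    | 1.41    | 12.03   | ▁▁▆▇▁ |
## | Lag4          | 0         | 1             | 0.15    | 2.36 | -18.20  | -1.16   | 0.24    | 1.41    | 12.03   | ▁▁▆▇▁ |
## | Lag5          | 0         | 1             | 0.14    | 2.36 | -18.20  | -1.17   | 0.23    | 1.41    | 12.03   | ▁▁▆▇▁ |
## | Volume        | 0         | 1             | 1.57    | 1.69 | 0.09    | 0.33    | 1.00    | 2.05    | 9.33    | ▇▂▁▁▁ |
## | Today         | 0         | 1             | 0.15    | 2.36 | -18.20  | -1.15   | 0.24    | 1.41    | 12.03   | ▁▁▆▇▁ |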
# 14. Predict gas mileage with Auto
# a. Create a binary outcome mpg01
# Add mpg01 to the Auto data: 1 when a car's mpg is above the sample median,
# 0 otherwise, stored as a factor. Then inspect the resulting columns.
auto <- ISLR2::Auto |>
  mutate(mpg01 = factor(as.numeric(mpg > median(mpg))))
glimpse(auto)
## Rows: 392
## Columns: 10
## $ mpg <dbl> 18, 15, 18, 16, 17, 15, 14, 14, 14, 15, 15, 14, 15, 14, 2…
## $ cylinders <int> 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 4, 6, 6, 6, 4, …
## $ displacement <dbl> 307, 350, 318, 304, 302, 429, 454, 440, 455, 390, 383, 34…
## $ horsepower <int> 130, 165, 150, 150, 140, 198, 220, 215, 225, 190, 170, 16…
## $ weight <int> 3504, 3693, 3436, 3433, 3449, 4341, 4354, 4312, 4425, 385…
## $ acceleration <dbl> 12.0, 11.5, 11.0, 12.0, 10.5, 10.0, 9.0, 8.5, 10.0, 8.5, …
## $ year <int> 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 7…
## $ origin <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 3, …
## $ name <fct> chevrolet chevelle malibu, buick skylark 320, plymouth sa…
## $ mpg01 <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, …
# Turn the integer origin codes 1, 2, 3 into a labelled factor
auto <- mutate(
  auto,
  origin = factor(
    origin,
    levels = 1:3,
    labels = c("American", "European", "Japanese")
  )
)
#16. Predict crime rate with Boston
# Add crim01 to the Boston data: 1 when crim is above its sample median,
# 0 otherwise, stored as a factor. Then inspect the resulting columns.
boston <- ISLR2::Boston |>
  mutate(
    crim01 = factor(as.numeric(crim > median(crim))),
    # Recode the binary chas variable as TRUE/FALSE
    chas = chas == 1
  )
glimpse(boston)
## Rows: 506
## Columns: 14
## $ crim <dbl> 0.00632, 0.02731, 0.02729, 0.03237, 0.06905, 0.02985, 0.08829,…
## $ zn <dbl> 18.0, 0.0, 0.0, 0.0, 0.0, 0.0, 12.5, 12.5, 12.5, 12.5, 12.5, 1…
## $ indus <dbl> 2.31, 7.07, 7.07, 2.18, 2.18, 2.18, 7.87, 7.87, 7.87, 7.87, 7.…
## $ chas <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,…
## $ nox <dbl> 0.538, 0.469, 0.469, 0.458, 0.458, 0.458, 0.524, 0.524, 0.524,…
## $ rm <dbl> 6.575, 6.421, 7.185, 6.998, 7.147, 6.430, 6.012, 6.172, 5.631,…
## $ age <dbl> 65.2, 78.9, 61.1, 45.8, 54.2, 58.7, 66.6, 96.1, 100.0, 85.9, 9…
## $ dis <dbl> 4.0900, 4.9671, 4.9671, 6.0622, 6.0622, 6.0622, 5.5605, 5.9505…
## $ rad <int> 1, 2, 2, 3, 3, 3, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,…
## $ tax <dbl> 296, 242, 242, 222, 222, 222, 311, 311, 311, 311, 311, 311, 31…
## $ ptratio <dbl> 15.3, 17.8, 17.8, 18.7, 18.7, 18.7, 15.2, 15.2, 15.2, 15.2, 15…
## $ lstat <dbl> 4.98, 9.14, 4.03, 2.94, 5.33, 5.21, 12.43, 19.15, 29.93, 17.10…
## $ medv <dbl> 24.0, 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9, 15…
## $ crim01 <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,…