library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6      ✔ purrr   0.3.4 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.4.1 
## ✔ readr   2.1.2      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(ISLR)
library(moderndive)
library(skimr)
library(ggthemes)
library(class)
library(modelr)
# Fix the RNG seed so the random train/test splits drawn later are repeatable.
set.seed(123)

Exercises

11.

# Print per-column summary statistics for the Auto data.
summary(Auto)
##       mpg          cylinders      displacement     horsepower        weight    
##  Min.   : 9.00   Min.   :3.000   Min.   : 68.0   Min.   : 46.0   Min.   :1613  
##  1st Qu.:17.00   1st Qu.:4.000   1st Qu.:105.0   1st Qu.: 75.0   1st Qu.:2225  
##  Median :22.75   Median :4.000   Median :151.0   Median : 93.5   Median :2804  
##  Mean   :23.45   Mean   :5.472   Mean   :194.4   Mean   :104.5   Mean   :2978  
##  3rd Qu.:29.00   3rd Qu.:8.000   3rd Qu.:275.8   3rd Qu.:126.0   3rd Qu.:3615  
##  Max.   :46.60   Max.   :8.000   Max.   :455.0   Max.   :230.0   Max.   :5140  
##                                                                                
##   acceleration        year           origin                      name    
##  Min.   : 8.00   Min.   :70.00   Min.   :1.000   amc matador       :  5  
##  1st Qu.:13.78   1st Qu.:73.00   1st Qu.:1.000   ford pinto        :  5  
##  Median :15.50   Median :76.00   Median :1.000   toyota corolla    :  5  
##  Mean   :15.54   Mean   :75.98   Mean   :1.577   amc gremlin       :  4  
##  3rd Qu.:17.02   3rd Qu.:79.00   3rd Qu.:2.000   amc hornet        :  4  
##  Max.   :24.80   Max.   :82.00   Max.   :3.000   chevrolet chevette:  4  
##                                                  (Other)           :365

a.

# Add the binary response `mpg01`: 1L when a car's mpg is above the sample
# mean of mpg, 0L otherwise.
Auto <- mutate(Auto, mpg01 = as.integer(mpg > mean(mpg)))
# Print the augmented data.
Auto

b.

# Histogram of `year`, with bars filled by mpg01 class.
Auto |>
  ggplot(mapping = aes(x = year, fill = as.factor(mpg01))) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Histogram of `cylinders`, with bars filled by mpg01 class.
Auto |>
  ggplot(mapping = aes(x = cylinders, fill = as.factor(mpg01))) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Histogram of `weight`, with bars filled by mpg01 class.
Auto |>
  ggplot(mapping = aes(x = weight, fill = as.factor(mpg01))) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Histogram of `horsepower`, with bars filled by mpg01 class.
Auto |>
  ggplot(mapping = aes(x = horsepower, fill = as.factor(mpg01))) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Bar chart of row counts per `origin` code, with bars filled by mpg01 class.
Auto |>
  ggplot(mapping = aes(x = origin, fill = as.factor(mpg01))) +
  geom_bar()

It looks like older, heavy, American cars with lots of horsepower/cylinders tend to be below the mean miles per gallon.

c.

# Tag every row with an id so the held-out rows can be identified after
# sampling.
Auto <- mutate(Auto, id = row_number())
# Randomly draw 70% of the rows for training; the rows whose id is not in the
# training sample form the test set.
train_1 <- sample_frac(Auto, 0.7)
test_1 <- anti_join(Auto, train_1, by = "id")
# Print both splits.
train_1
test_1

f.

# Logistic regression of mpg01 on weight, horsepower and cylinders, fitted on
# the training split.
glm_model <- glm(
  mpg01 ~ weight + horsepower + cylinders,
  data = train_1,
  family = binomial
)
rows <- nrow(test_1)
# Score the test split: predicted probability -> class label at a 0.5 cut-off
# -> 1/0 flag for a correct prediction -> share of test rows predicted wrongly.
test_1 |>
  add_predictions(glm_model, var = "pred_prob", type = "response") |>
  mutate(
    prediction = as.numeric(pred_prob > 0.5),
    right = as.numeric(prediction == mpg01)
  ) |>
  summarise(error = 1 - sum(right) / rows)

g.

# Build a fresh 70/30 random split for the KNN classifier.
Auto <- mutate(Auto, id = row_number())
train_2 <- sample_frac(Auto, 0.7)
test_2 <- anti_join(Auto, train_2, by = "id")
# Pull out the true class labels before the tables are cut down to predictors.
train_2_true <- train_2[["mpg01"]]
test_2_true <- test_2[["mpg01"]]
# Keep only the three predictor columns handed to knn().
# NOTE(review): these columns are on very different scales and are passed to
# knn() unscaled -- confirm whether standardising them first is intended.
train_2 <- select(train_2, weight, horsepower, cylinders)
test_2 <- select(test_2, weight, horsepower, cylinders)

# Test-set misclassification rate of a k-nearest-neighbours classifier.
#
# Arguments:
#   x             Number of neighbours, passed to knn() as `k`.
#   train, test   Predictor tables for the training and test observations.
#   train_labels  True classes for the rows of `train`.
#   test_labels   True classes for the rows of `test`.
#
# The data arguments default to the split objects defined at the top level of
# this script (train_2, test_2, train_2_true, test_2_true), so the one-argument
# call `my_knn(k)` behaves as before; passing them explicitly removes the
# hidden dependency on those globals.
#
# Returns: 1 - accuracy, i.e. the proportion of test rows whose predicted
# class differs from `test_labels`.
my_knn <- function(x,
                   train = train_2,
                   test = test_2,
                   train_labels = train_2_true,
                   test_labels = test_2_true) {
  knn_pred <- knn(train, test, train_labels, k = x)
  # Share of test rows classified correctly.
  accuracy <- mean(knn_pred == test_labels)
  1 - accuracy
}
# Evaluate the KNN test error for k = 1..50 and draw it as a line against k.
tibble(k = seq_len(50)) |>
  mutate(error = map_dbl(k, \(n_neighbours) my_knn(n_neighbours))) |>
  ggplot(mapping = aes(x = k, y = error)) +
  geom_line()