library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6      ✔ purrr   0.3.4 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.4.1 
## ✔ readr   2.1.2      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(ISLR)
library(moderndive)
library(skimr)
library(ggthemes)
library(class)
library(modelr)
set.seed(123)
dfC <- Carseats
glimpse(dfC)
## Rows: 400
## Columns: 11
## $ Sales       <dbl> 9.50, 11.22, 10.06, 7.40, 4.15, 10.81, 6.63, 11.85, 6.54, …
## $ CompPrice   <dbl> 138, 111, 113, 117, 141, 124, 115, 136, 132, 132, 121, 117…
## $ Income      <dbl> 73, 48, 35, 100, 64, 113, 105, 81, 110, 113, 78, 94, 35, 2…
## $ Advertising <dbl> 11, 16, 10, 4, 3, 13, 0, 15, 0, 0, 9, 4, 2, 11, 11, 5, 0, …
## $ Population  <dbl> 276, 260, 269, 466, 340, 501, 45, 425, 108, 131, 150, 503,…
## $ Price       <dbl> 120, 83, 80, 97, 128, 72, 108, 120, 124, 124, 100, 94, 136…
## $ ShelveLoc   <fct> Bad, Good, Medium, Medium, Bad, Bad, Medium, Good, Medium,…
## $ Age         <dbl> 42, 65, 59, 55, 38, 78, 71, 67, 76, 76, 26, 50, 62, 53, 52…
## $ Education   <dbl> 17, 10, 12, 14, 13, 16, 15, 10, 10, 17, 10, 13, 18, 18, 18…
## $ Urban       <fct> Yes, Yes, Yes, Yes, Yes, No, Yes, Yes, No, No, No, Yes, Ye…
## $ US          <fct> Yes, Yes, Yes, Yes, No, Yes, No, Yes, No, Yes, Yes, Yes, N…

Quiz

Visually Exploring Data

ggplot(dfC, aes(Sales, fill=ShelveLoc)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(dfC, aes(CompPrice, fill=ShelveLoc)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(dfC, aes(Income, fill=ShelveLoc)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(dfC, aes(Age, fill=ShelveLoc)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Programming Quiz

set.seed(1)
dfC <- dfC |> mutate(id = row_number(), ShelveLoc01 = ifelse(ShelveLoc == "Good", 1, 0))
train <- dfC |> sample_frac(0.8)
test <- dfC |> anti_join(train, by = "id")
glm_model = glm(ShelveLoc01 ~ Sales+Income+Age, data = train, family = binomial)
rows <- nrow(test)
test |>
  add_predictions(glm_model, var="pred_prob", type="response") |>
  mutate(prediction = ifelse(pred_prob > 0.8, 1, 0)) |>
  mutate(right = ifelse(prediction == ShelveLoc01, 1, 0)) |>
  summarise(error = 1-sum(right)/rows)

Programming Bonus

Selected variables from visual exploration above. Sales has a clear trend where “good”s are on one side of the graph. Age seems to have a concentration of “good” in the middle of the graph. Income seems to be a bit more spread but with more on the lower end.

set.seed(1)
dfC <- dfC |> mutate(id = row_number(), ShelveLoc01 = ifelse(ShelveLoc == "Good", 1, 0))
train <- dfC |> sample_frac(0.8)
test <- dfC |> anti_join(train, by = "id")
train_true <- train$ShelveLoc01
test_true <- test$ShelveLoc01
train <- train |> select(Sales, Income, Age)
test <- test |> select(Sales, Income, Age)
knn_pred <- knn(train, test, train_true, k = 5)
error <- 1-mean(knn_pred == test_true)
error
## [1] 0.125