library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.4.1
## ✔ readr 2.1.2 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(ISLR)
library(moderndive)
library(skimr)
library(ggthemes)
library(class)
library(modelr)
# Fix the RNG seed so the random train/test splits drawn later with
# sample_frac() are reproducible from run to run.
set.seed(123)
# Column-by-column summary of Auto. Auto is not defined in this file;
# presumably it is the data set shipped with the ISLR package -- confirm.
summary(Auto)
## mpg cylinders displacement horsepower weight
## Min. : 9.00 Min. :3.000 Min. : 68.0 Min. : 46.0 Min. :1613
## 1st Qu.:17.00 1st Qu.:4.000 1st Qu.:105.0 1st Qu.: 75.0 1st Qu.:2225
## Median :22.75 Median :4.000 Median :151.0 Median : 93.5 Median :2804
## Mean :23.45 Mean :5.472 Mean :194.4 Mean :104.5 Mean :2978
## 3rd Qu.:29.00 3rd Qu.:8.000 3rd Qu.:275.8 3rd Qu.:126.0 3rd Qu.:3615
## Max. :46.60 Max. :8.000 Max. :455.0 Max. :230.0 Max. :5140
##
## acceleration year origin name
## Min. : 8.00 Min. :70.00 Min. :1.000 amc matador : 5
## 1st Qu.:13.78 1st Qu.:73.00 1st Qu.:1.000 ford pinto : 5
## Median :15.50 Median :76.00 Median :1.000 toyota corolla : 5
## Mean :15.54 Mean :75.98 Mean :1.577 amc gremlin : 4
## 3rd Qu.:17.02 3rd Qu.:79.00 3rd Qu.:2.000 amc hornet : 4
## Max. :24.80 Max. :82.00 Max. :3.000 chevrolet chevette: 4
## (Other) :365
# Binary response: 1 when a car's mpg is above the sample mean, 0 otherwise.
Auto <- Auto |> mutate(mpg01 = as.integer(mpg > mean(mpg)))
Auto

# Distribution of each candidate predictor, filled by mpg01.
# Every histogram gets an explicit binwidth instead of the default 30 bins:
# year and cylinders only take whole-number values, so a width of 1 gives one
# bar per value; weight and horsepower use round widths that suit their range.
ggplot(Auto, aes(year, fill = as.factor(mpg01))) +
  geom_histogram(binwidth = 1)
ggplot(Auto, aes(cylinders, fill = as.factor(mpg01))) +
  geom_histogram(binwidth = 1)
ggplot(Auto, aes(weight, fill = as.factor(mpg01))) +
  geom_histogram(binwidth = 250)
ggplot(Auto, aes(horsepower, fill = as.factor(mpg01))) +
  geom_histogram(binwidth = 10)
# origin is a category code, so count rows per code rather than binning.
ggplot(Auto, aes(origin, fill = as.factor(mpg01))) +
  geom_bar()
Looks like older, heavier American cars with more horsepower and more cylinders tend to fall below the mean miles per gallon.
# Give every row an id so the rows not drawn for training can be recovered
# with an anti-join.
Auto <- mutate(Auto, id = row_number())
train_1 <- sample_frac(Auto, 0.7)
test_1 <- anti_join(Auto, train_1, by = "id")
train_1
test_1

# Logistic regression of the high-mpg indicator on three predictors.
glm_model <- glm(
  mpg01 ~ weight + horsepower + cylinders,
  data = train_1,
  family = binomial
)

# Hold-out error: classify as 1 when the fitted probability exceeds 0.5 and
# report the share of test rows that were classified incorrectly.
rows <- nrow(test_1)
test_1 |>
  add_predictions(glm_model, var = "pred_prob", type = "response") |>
  mutate(
    prediction = ifelse(pred_prob > 0.5, 1, 0),
    right = ifelse(prediction == mpg01, 1, 0)
  ) |>
  summarise(error = 1 - sum(right) / rows)
# Tag each row with an id so the hold-out rows can be recovered with an
# anti-join. Re-running this on a table that already has the column simply
# rewrites the same 1..n values.
Auto <- Auto |> mutate(id = row_number())
train_2 <- Auto |> sample_frac(0.7)
test_2 <- Auto |> anti_join(train_2, by = "id")

# True class labels, kept apart because knn() takes them as a separate argument.
train_2_true <- train_2$mpg01
test_2_true <- test_2$mpg01

# knn() measures Euclidean distance, so a predictor on a large scale (weight is
# in the thousands) would swamp horsepower and cylinders. Standardise each
# predictor with the TRAINING mean and sd, and reuse those same constants for
# the test set so that no test-set information enters the transformation.
knn_predictors <- c("weight", "horsepower", "cylinders")
knn_center <- vapply(train_2[knn_predictors], mean, numeric(1))
knn_spread <- vapply(train_2[knn_predictors], sd, numeric(1))
train_2 <- train_2[knn_predictors] |>
  scale(center = knn_center, scale = knn_spread) |>
  as.data.frame()
test_2 <- test_2[knn_predictors] |>
  scale(center = knn_center, scale = knn_spread) |>
  as.data.frame()
#' Hold-out misclassification rate of a k-nearest-neighbour classifier.
#'
#' @param x Number of neighbours, passed to `knn()` as `k`.
#' @param train,test Predictor tables for the training and test cases.
#'   Default to the script-level `train_2` and `test_2`.
#' @param train_labels,test_labels True classes of the training and test
#'   cases. Default to the script-level `train_2_true` and `test_2_true`.
#' @return The proportion of test cases whose predicted class differs from the
#'   true class (a single number between 0 and 1).
#'
#' The defaults are looked up when the function is called, so a one-argument
#' call such as `map_dbl(k, my_knn)` keeps working as long as those objects
#' exist at that point.
my_knn <- function(x,
                   train = train_2,
                   test = test_2,
                   train_labels = train_2_true,
                   test_labels = test_2_true) {
  knn_pred <- knn(train, test, train_labels, k = x)
  # Share of mismatches is the error rate directly; no accuracy detour needed.
  mean(knn_pred != test_labels)
}
# Evaluate my_knn() for every k from 1 to 50 and draw the resulting error
# rate as a line over k.
ggplot(
  mutate(tibble(k = 1:50), error = map_dbl(k, my_knn)),
  aes(x = k, y = error)
) +
  geom_line()