library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom        1.0.5      ✔ rsample      1.2.0 
## ✔ dials        1.2.1      ✔ tune         1.1.2 
## ✔ infer        1.0.6      ✔ workflows    1.1.4 
## ✔ modeldata    1.3.0      ✔ workflowsets 1.0.1 
## ✔ parsnip      1.2.0      ✔ yardstick    1.3.0 
## ✔ recipes      1.0.10     
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Search for functions across packages at https://www.tidymodels.org/find/
library(kknn)

boston <- readr::read_csv(file = 'boston.csv')
## Rows: 506 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (16): lon, lat, cmedv, crim, zn, indus, chas, nox, rm, age, dis, rad, ta...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# #3 ----------------------------------------------------------------------
view(boston)

# Regression
# #5,6, 7 ----------------------------------------------------------------------

set.seed(123)
boston_split <- initial_split(boston, prop = 0.7, strata = cmedv)
boston_train <- training(boston_split)
boston_test  <- testing(boston_split)

boston_split
## <Training/Testing/Total>
## <352/154/506>
boston_test
## # A tibble: 154 × 16
##      lon   lat cmedv   crim    zn indus  chas   nox    rm   age   dis   rad
##    <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 -70.9  42.3  34.7 0.0273   0    7.07     0 0.469  7.18  61.1  4.97     2
##  2 -70.9  42.3  36.2 0.0690   0    2.18     0 0.458  7.15  54.2  6.06     3
##  3 -70.9  42.3  28.7 0.0298   0    2.18     0 0.458  6.43  58.7  6.06     3
##  4 -70.9  42.3  22.9 0.0883  12.5  7.87     0 0.524  6.01  66.6  5.56     5
##  5 -70.9  42.3  16.5 0.211   12.5  7.87     0 0.524  5.63 100    6.08     5
##  6 -70.9  42.3  15   0.225   12.5  7.87     0 0.524  6.38  94.3  6.35     5
##  7 -71.0  42.3  18.2 0.638    0    8.14     0 0.538  6.10  84.5  4.46     4
##  8 -71.0  42.3  13.6 1.25     0    8.14     0 0.538  5.57  98.1  3.80     4
##  9 -71.0  42.3  16.6 0.672    0    8.14     0 0.538  5.81  90.3  4.68     4
## 10 -71.0  42.3  18.4 0.773    0    8.14     0 0.538  6.50  94.4  4.45     4
## # ℹ 144 more rows
## # ℹ 4 more variables: tax <dbl>, ptratio <dbl>, b <dbl>, lstat <dbl>
boston_train
## # A tibble: 352 × 16
##      lon   lat cmedv  crim    zn indus  chas   nox    rm   age   dis   rad   tax
##    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 -71.0  42.3  15.2 1.23      0  8.14     0 0.538  6.14  91.7  3.98     4   307
##  2 -71.0  42.3  14.5 0.988     0  8.14     0 0.538  5.81 100    4.10     4   307
##  3 -71.0  42.3  15.6 0.750     0  8.14     0 0.538  5.92  94.1  4.40     4   307
##  4 -71.0  42.3  13.9 0.841     0  8.14     0 0.538  5.60  85.7  4.45     4   307
##  5 -71.0  42.3  14.8 0.956     0  8.14     0 0.538  6.05  88.8  4.45     4   307
##  6 -71.0  42.3  13.2 1.39      0  8.14     0 0.538  5.95  82    3.99     4   307
##  7 -71.0  42.3  13.1 1.15      0  8.14     0 0.538  5.70  95    3.79     4   307
##  8 -71.0  42.3  13.5 1.61      0  8.14     0 0.538  6.10  96.9  3.76     4   307
##  9 -71.0  42.3  16.6 0.229     0  6.91     0 0.448  6.03  85.5  5.69     3   233
## 10 -71.0  42.3  14.4 0.254     0  6.91     0 0.448  5.40  95.3  5.87     3   233
## # ℹ 342 more rows
## # ℹ 3 more variables: ptratio <dbl>, b <dbl>, lstat <dbl>
view(boston_train)
view(boston_test)

# Do the distributions line up? 
ggplot(boston_train, aes(x = cmedv)) + 
  geom_line(stat = "density", trim = TRUE) + 
  geom_line(data = boston_test, stat = "density", trim = TRUE, col = "red")

# #8 ----------------------------------------------------------------------

boston_lm1 <- linear_reg() %>%
  set_engine('lm') %>%
  fit(cmedv ~ rm + crim, data = boston_train)

boston_lm1 %>% predict(boston_test)
## # A tibble: 154 × 1
##    .pred
##    <dbl>
##  1  31.2
##  2  30.8
##  3  24.7
##  4  21.1
##  5  17.8
##  6  24.2
##  7  21.7
##  8  17.0
##  9  19.3
## 10  25.1
## # ℹ 144 more rows
# compute the RMSE
boston_lm1 %>%
  predict(boston_test) %>%
  bind_cols(boston_test %>% select(cmedv)) %>%
  rmse(truth = cmedv, estimate = .pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard        6.48
# #9 ----------------------------------------------------------------------

boston_lm2 <- linear_reg() %>%
  set_engine('lm') %>%
  fit(cmedv ~ ., data = boston_train)

boston_lm2
## parsnip model object
## 
## 
## Call:
## stats::lm(formula = cmedv ~ ., data = data)
## 
## Coefficients:
## (Intercept)          lon          lat         crim           zn        indus  
##  -6.080e+02   -5.648e+00    5.537e+00   -8.304e-02    3.318e-02   -1.426e-02  
##        chas          nox           rm          age          dis          rad  
##   2.282e+00   -1.165e+01    4.369e+00   -9.215e-03   -1.259e+00    2.721e-01  
##         tax      ptratio            b        lstat  
##  -1.212e-02   -8.739e-01    1.231e-02   -4.786e-01
boston_lm2 %>%
  predict(boston_test) %>%
  bind_cols(boston_test %>% select(cmedv)) %>%
  rmse(truth = cmedv, estimate = .pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard        4.83
# #10 ---------------------------------------------------------------------

knn <- nearest_neighbor() %>%
  set_engine('kknn') %>%
  set_mode("regression") %>%
  fit( cmedv ~ . , data = boston_train)
# compute the RMSE on the test data
knn %>%
  predict(boston_test) %>%
  bind_cols(boston_test %>% select(cmedv)) %>%
  rmse(truth = cmedv, estimate = .pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard        3.37