library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom 1.0.5 ✔ rsample 1.2.0
## ✔ dials 1.2.1 ✔ tune 1.1.2
## ✔ infer 1.0.6 ✔ workflows 1.1.4
## ✔ modeldata 1.3.0 ✔ workflowsets 1.0.1
## ✔ parsnip 1.2.0 ✔ yardstick 1.3.0
## ✔ recipes 1.0.10
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Search for functions across packages at https://www.tidymodels.org/find/
library(kknn)
boston <- readr::read_csv(file = 'boston.csv')
## Rows: 506 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (16): lon, lat, cmedv, crim, zn, indus, chas, nox, rm, age, dis, rad, ta...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# #3 ----------------------------------------------------------------------
view(boston)
# Regression
# #5,6, 7 ----------------------------------------------------------------------
set.seed(123)
boston_split <- initial_split(boston, prop = 0.7, strata = cmedv)
boston_train <- training(boston_split)
boston_test <- testing(boston_split)
boston_split
## <Training/Testing/Total>
## <352/154/506>
boston_test
## # A tibble: 154 × 16
## lon lat cmedv crim zn indus chas nox rm age dis rad
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 -70.9 42.3 34.7 0.0273 0 7.07 0 0.469 7.18 61.1 4.97 2
## 2 -70.9 42.3 36.2 0.0690 0 2.18 0 0.458 7.15 54.2 6.06 3
## 3 -70.9 42.3 28.7 0.0298 0 2.18 0 0.458 6.43 58.7 6.06 3
## 4 -70.9 42.3 22.9 0.0883 12.5 7.87 0 0.524 6.01 66.6 5.56 5
## 5 -70.9 42.3 16.5 0.211 12.5 7.87 0 0.524 5.63 100 6.08 5
## 6 -70.9 42.3 15 0.225 12.5 7.87 0 0.524 6.38 94.3 6.35 5
## 7 -71.0 42.3 18.2 0.638 0 8.14 0 0.538 6.10 84.5 4.46 4
## 8 -71.0 42.3 13.6 1.25 0 8.14 0 0.538 5.57 98.1 3.80 4
## 9 -71.0 42.3 16.6 0.672 0 8.14 0 0.538 5.81 90.3 4.68 4
## 10 -71.0 42.3 18.4 0.773 0 8.14 0 0.538 6.50 94.4 4.45 4
## # ℹ 144 more rows
## # ℹ 4 more variables: tax <dbl>, ptratio <dbl>, b <dbl>, lstat <dbl>
boston_train
## # A tibble: 352 × 16
## lon lat cmedv crim zn indus chas nox rm age dis rad tax
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 -71.0 42.3 15.2 1.23 0 8.14 0 0.538 6.14 91.7 3.98 4 307
## 2 -71.0 42.3 14.5 0.988 0 8.14 0 0.538 5.81 100 4.10 4 307
## 3 -71.0 42.3 15.6 0.750 0 8.14 0 0.538 5.92 94.1 4.40 4 307
## 4 -71.0 42.3 13.9 0.841 0 8.14 0 0.538 5.60 85.7 4.45 4 307
## 5 -71.0 42.3 14.8 0.956 0 8.14 0 0.538 6.05 88.8 4.45 4 307
## 6 -71.0 42.3 13.2 1.39 0 8.14 0 0.538 5.95 82 3.99 4 307
## 7 -71.0 42.3 13.1 1.15 0 8.14 0 0.538 5.70 95 3.79 4 307
## 8 -71.0 42.3 13.5 1.61 0 8.14 0 0.538 6.10 96.9 3.76 4 307
## 9 -71.0 42.3 16.6 0.229 0 6.91 0 0.448 6.03 85.5 5.69 3 233
## 10 -71.0 42.3 14.4 0.254 0 6.91 0 0.448 5.40 95.3 5.87 3 233
## # ℹ 342 more rows
## # ℹ 3 more variables: ptratio <dbl>, b <dbl>, lstat <dbl>
view(boston_train)
view(boston_test)
# Do the distributions line up?
ggplot(boston_train, aes(x = cmedv)) +
geom_line(stat = "density", trim = TRUE) +
geom_line(data = boston_test, stat = "density", trim = TRUE, col = "red")

# #8 ----------------------------------------------------------------------
boston_lm1 <- linear_reg() %>%
set_engine('lm') %>%
fit(cmedv ~ rm + crim, data = boston_train)
boston_lm1 %>% predict(boston_test)
## # A tibble: 154 × 1
## .pred
## <dbl>
## 1 31.2
## 2 30.8
## 3 24.7
## 4 21.1
## 5 17.8
## 6 24.2
## 7 21.7
## 8 17.0
## 9 19.3
## 10 25.1
## # ℹ 144 more rows
# compute the RMSE
boston_lm1 %>%
predict(boston_test) %>%
bind_cols(boston_test %>% select(cmedv)) %>%
rmse(truth = cmedv, estimate = .pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 6.48
# #9 ----------------------------------------------------------------------
boston_lm2 <- linear_reg() %>%
set_engine('lm') %>%
fit(cmedv ~ ., data = boston_train)
boston_lm2
## parsnip model object
##
##
## Call:
## stats::lm(formula = cmedv ~ ., data = data)
##
## Coefficients:
## (Intercept) lon lat crim zn indus
## -6.080e+02 -5.648e+00 5.537e+00 -8.304e-02 3.318e-02 -1.426e-02
## chas nox rm age dis rad
## 2.282e+00 -1.165e+01 4.369e+00 -9.215e-03 -1.259e+00 2.721e-01
## tax ptratio b lstat
## -1.212e-02 -8.739e-01 1.231e-02 -4.786e-01
boston_lm2 %>%
predict(boston_test) %>%
bind_cols(boston_test %>% select(cmedv)) %>%
rmse(truth = cmedv, estimate = .pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 4.83
# #10 ---------------------------------------------------------------------
knn <- nearest_neighbor() %>%
set_engine('kknn') %>%
set_mode("regression") %>%
fit( cmedv ~ . , data = boston_train)
# compute the RMSE on the test data
knn %>%
predict(boston_test) %>%
bind_cols(boston_test %>% select(cmedv)) %>%
rmse(truth = cmedv, estimate = .pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 3.37