library(readr)
boston <- read_csv("~/Documents/BANA 4080/data/boston.csv")
## Rows: 506 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (16): lon, lat, cmedv, crim, zn, indus, chas, nox, rm, age, dis, rad, ta...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
boston
## # A tibble: 506 × 16
##      lon   lat cmedv    crim    zn indus  chas   nox    rm   age   dis   rad
##    <dbl> <dbl> <dbl>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 -71.0  42.3  24   0.00632  18    2.31     0 0.538  6.58  65.2  4.09     1
##  2 -71.0  42.3  21.6 0.0273    0    7.07     0 0.469  6.42  78.9  4.97     2
##  3 -70.9  42.3  34.7 0.0273    0    7.07     0 0.469  7.18  61.1  4.97     2
##  4 -70.9  42.3  33.4 0.0324    0    2.18     0 0.458  7.00  45.8  6.06     3
##  5 -70.9  42.3  36.2 0.0690    0    2.18     0 0.458  7.15  54.2  6.06     3
##  6 -70.9  42.3  28.7 0.0298    0    2.18     0 0.458  6.43  58.7  6.06     3
##  7 -70.9  42.3  22.9 0.0883   12.5  7.87     0 0.524  6.01  66.6  5.56     5
##  8 -70.9  42.3  22.1 0.145    12.5  7.87     0 0.524  6.17  96.1  5.95     5
##  9 -70.9  42.3  16.5 0.211    12.5  7.87     0 0.524  5.63 100    6.08     5
## 10 -70.9  42.3  18.9 0.170    12.5  7.87     0 0.524  6.00  85.9  6.59     5
## # ℹ 496 more rows
## # ℹ 4 more variables: tax <dbl>, ptratio <dbl>, b <dbl>, lstat <dbl>
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom        1.0.6     ✔ recipes      1.1.0
## ✔ dials        1.3.0     ✔ rsample      1.2.1
## ✔ dplyr        1.1.4     ✔ tibble       3.2.1
## ✔ ggplot2      3.5.1     ✔ tidyr        1.3.1
## ✔ infer        1.0.7     ✔ tune         1.2.1
## ✔ modeldata    1.4.0     ✔ workflows    1.1.4
## ✔ parsnip      1.2.1     ✔ workflowsets 1.1.0
## ✔ purrr        1.0.2     ✔ yardstick    1.3.1
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard()  masks scales::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Learn how to get started at https://www.tidymodels.org/start/
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ scales::col_factor() masks readr::col_factor()
## ✖ purrr::discard()     masks scales::discard()
## ✖ dplyr::filter()      masks stats::filter()
## ✖ stringr::fixed()     masks recipes::fixed()
## ✖ dplyr::lag()         masks stats::lag()
## ✖ yardstick::spec()    masks readr::spec()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(vip)
## 
## Attaching package: 'vip'
## 
## The following object is masked from 'package:utils':
## 
##     vi
# Part 1 

set.seed(123)
split <- initial_split(boston, prop = 0.7, strata = cmedv)
boston_train <- training(split)
boston_test <- testing(split)
# Step 2. create our feature engineering recipe
boston_recipe <- recipe(cmedv ~ ., data = boston_train) %>%
  step_YeoJohnson(all_nominal_predictors()) %>%
  step_normalize(all_numeric_predictors())
# Step 3. create resampling object
set.seed(123)
kfolds <- vfold_cv(boston_train, v = 5, strata = cmedv)
# Step 4. create our model object
reg_mod <- linear_reg(mixture = tune(), penalty = tune()) %>%
set_engine('glmnet')
# Step 5. create our hyperparameter search grid
reg_grid <- grid_regular(mixture(), penalty(c(-10,5)), levels = 10)
# Step 6. create our workflow object
boston_wf <- workflow() %>%
add_recipe(boston_recipe) %>%
add_model(reg_mod)
# Step 7. perform hyperparamter search
tuning_results <- boston_wf %>%
tune_grid(resamples = kfolds, grid = reg_grid)
## → A | warning: A correlation computation is required, but `estimate` is constant and has 0
##                standard deviation, resulting in a divide by 0 error. `NA` will be returned.
## There were issues with some computations   A: x1There were issues with some computations   A: x2There were issues with some computations   A: x3There were issues with some computations   A: x4There were issues with some computations   A: x5There were issues with some computations   A: x5
# Step 8. assess results
tuning_results %>%
collect_metrics() %>%
filter(.metric == "rmse") %>%
arrange(mean)
## # A tibble: 100 × 8
##          penalty mixture .metric .estimator  mean     n std_err .config         
##            <dbl>   <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>           
##  1 0.0215          1     rmse    standard    4.93     5   0.172 Preprocessor1_M…
##  2 0.0215          0.889 rmse    standard    4.93     5   0.172 Preprocessor1_M…
##  3 0.0215          0.778 rmse    standard    4.93     5   0.172 Preprocessor1_M…
##  4 0.0215          0.667 rmse    standard    4.93     5   0.173 Preprocessor1_M…
##  5 0.0215          0.556 rmse    standard    4.93     5   0.173 Preprocessor1_M…
##  6 0.0215          0.444 rmse    standard    4.94     5   0.173 Preprocessor1_M…
##  7 0.0215          0.333 rmse    standard    4.94     5   0.173 Preprocessor1_M…
##  8 0.0000000001    0.778 rmse    standard    4.94     5   0.173 Preprocessor1_M…
##  9 0.00000000464   0.778 rmse    standard    4.94     5   0.173 Preprocessor1_M…
## 10 0.000000215     0.778 rmse    standard    4.94     5   0.173 Preprocessor1_M…
## # ℹ 90 more rows
autoplot(tuning_results)

# Step 1. finalize our workflow object with the optimal hyperparameter values
best_hyperparameters <- select_best(tuning_results, metric = "rmse")
final_wf <- workflow() %>%
  add_recipe(boston_recipe) %>%
  add_model(reg_mod) %>%
  finalize_workflow(best_hyperparameters)
# Step 2. fit our final workflow object across the full training set data
final_fit <- final_wf %>%
fit(data = boston_train)
# Step 3. plot the top 10 most influential features
final_fit %>%
extract_fit_parsnip() %>%
vip()