library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom        1.0.6     ✔ recipes      1.1.0
## ✔ dials        1.3.0     ✔ rsample      1.2.1
## ✔ dplyr        1.1.4     ✔ tibble       3.2.1
## ✔ ggplot2      3.5.1     ✔ tidyr        1.3.1
## ✔ infer        1.0.7     ✔ tune         1.2.1
## ✔ modeldata    1.4.0     ✔ workflows    1.1.4
## ✔ parsnip      1.2.1     ✔ workflowsets 1.1.0
## ✔ purrr        1.0.2     ✔ yardstick    1.3.1
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter()  masks stats::filter()
## ✖ dplyr::lag()     masks stats::lag()
## ✖ recipes::step()  masks stats::step()
## • Learn how to get started at https://www.tidymodels.org/start/
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.5
## ✔ lubridate 1.9.3     ✔ stringr   1.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ readr::col_factor() masks scales::col_factor()
## ✖ purrr::discard()    masks scales::discard()
## ✖ dplyr::filter()     masks stats::filter()
## ✖ stringr::fixed()    masks recipes::fixed()
## ✖ dplyr::lag()        masks stats::lag()
## ✖ readr::spec()       masks yardstick::spec()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(vip)
## 
## Attaching package: 'vip'
## 
## The following object is masked from 'package:utils':
## 
##     vi
library(readr)
library(Matrix)
## Warning: package 'Matrix' was built under R version 4.4.2
## 
## Attaching package: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
boston <- read_csv("boston.csv")
## Rows: 506 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (16): lon, lat, cmedv, crim, zn, indus, chas, nox, rm, age, dis, rad, ta...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
  1. Split the data into a training set and test set using a 70-30% split.
set.seed(123)
split <- initial_split(boston, prop = 0.70, strata = cmedv)
boston_train <- training(split)
boston_test <- testing(split)
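As a quick sanity check (an addition here, not part of the assignment), we can confirm the stratified split produced roughly 70/30 proportions:

# About 354 training rows and 152 test rows out of 506 (exact counts
# depend on how the strata are allocated).
nrow(boston_train)
nrow(boston_test)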
  2. Create a recipe that will model cmedv as a function of all predictor variables. Apply the following feature engineering steps in this order:
  1. Normalize all numeric predictor variables using a Yeo-Johnson transformation

  2. Standardize all numeric predictor variables

boston_recipe <- recipe(cmedv ~ ., data = boston_train) %>%
  step_YeoJohnson(all_numeric_predictors()) %>%
  step_normalize(all_numeric_predictors())
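To see what the recipe will actually produce, one optional check (not in the original) is to prep it on the training data and bake it; glimpse() then shows the Yeo-Johnson-transformed, centered, and scaled predictors:

# Estimate the recipe's parameters on the training set and apply them.
boston_recipe %>%
  prep() %>%
  bake(new_data = NULL) %>%   # new_data = NULL returns the processed training set
  glimpse()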
  3. Create a 5-fold cross validation resampling object.
set.seed(123)
kfolds <- vfold_cv(boston_train, v = 5, strata = cmedv)
  4. Create a regularized regression model object that:
  1. Contains tuning placeholders for the mixture and penalty arguments

  2. Sets the engine to use the glmnet package.

reg_mod <- linear_reg(mixture = tune(), penalty = tune()) %>%
  set_engine("glmnet")
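If you want to confirm how parsnip will map these placeholders onto the engine, translate() prints the underlying glmnet call template, with tune() left as placeholders (an optional check, not required by the assignment):

reg_mod %>% translate()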
  5. Create our hyperparameter search grid that:
  1. Searches across default values for mixture

  2. Searches across values ranging from -10 to 5 for penalty (penalty() treats this range as log10 units, i.e., 10^-10 to 10^5)

  3. Will search across 10 values for each of these hyperparameters (levels)

reg_grid <- grid_regular(penalty(range = c(-10, 5)), mixture(), levels = 10)
reg_grid
## # A tibble: 100 × 2
##     penalty mixture
##       <dbl>   <dbl>
##  1 1   e-10       0
##  2 4.64e- 9       0
##  3 2.15e- 7       0
##  4 1   e- 5       0
##  5 4.64e- 4       0
##  6 2.15e- 2       0
##  7 1   e+ 0       0
##  8 4.64e+ 1       0
##  9 2.15e+ 3       0
## 10 1   e+ 5       0
## # ℹ 90 more rows
  6. Create a workflow object that combines our recipe object with our model object.
boston_wf <- workflow() %>%
  add_recipe(boston_recipe) %>%
  add_model(reg_mod)
  7. Perform a hyperparameter search.
tuning_results <- boston_wf %>%
  tune_grid(resamples = kfolds, grid = reg_grid)
## Warning: package 'glmnet' was built under R version 4.4.2
## → A | warning: A correlation computation is required, but `estimate` is constant and has 0
##                standard deviation, resulting in a divide by 0 error. `NA` will be returned.
## There were issues with some computations   A: x5
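This warning is expected: for candidates with an extreme penalty the model predicts a constant, so the correlation underlying R-squared is undefined on those folds. To inspect the captured warnings, tune provides show_notes() (an optional step, not in the original):

# Print the warnings/errors recorded during tuning.
show_notes(tuning_results)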
  8. Assess the results.
tuning_results %>%
  collect_metrics() %>%
  filter(.metric == "rmse") %>%
  arrange(mean)
## # A tibble: 100 × 8
##          penalty mixture .metric .estimator  mean     n std_err .config         
##            <dbl>   <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>           
##  1 0.0215          1     rmse    standard    4.46     5   0.235 Preprocessor1_M…
##  2 0.0215          0.889 rmse    standard    4.46     5   0.235 Preprocessor1_M…
##  3 0.0215          0.778 rmse    standard    4.46     5   0.235 Preprocessor1_M…
##  4 0.0215          0.667 rmse    standard    4.47     5   0.234 Preprocessor1_M…
##  5 0.0215          0.556 rmse    standard    4.47     5   0.234 Preprocessor1_M…
##  6 0.0215          0.444 rmse    standard    4.47     5   0.234 Preprocessor1_M…
##  7 0.0000000001    0.667 rmse    standard    4.47     5   0.233 Preprocessor1_M…
##  8 0.00000000464   0.667 rmse    standard    4.47     5   0.233 Preprocessor1_M…
##  9 0.000000215     0.667 rmse    standard    4.47     5   0.233 Preprocessor1_M…
## 10 0.00001         0.667 rmse    standard    4.47     5   0.233 Preprocessor1_M…
## # ℹ 90 more rows
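A more compact view (added here as an alternative, not in the original) is show_best(), which returns only the top candidates for a chosen metric:

# Top 5 penalty/mixture combinations by cross-validated RMSE.
show_best(tuning_results, metric = "rmse", n = 5)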

Assess the plot below of our hyperparameter search results. What do the results tell you? Does the amount of regularization (the size of the penalty) have a larger influence on the RMSE, or does the type of penalty applied (the mixture) have a larger influence?

autoplot(tuning_results)

  9. Finalize our workflow object with the optimal hyperparameter values.
best_hyperparameters <- select_best(tuning_results, metric = "rmse")

final_wf <- boston_wf %>%
  finalize_workflow(best_hyperparameters)
  10. Fit our final workflow object on the full training set.
final_fit <- final_wf %>%
  fit(data = boston_train)
  11. Plot the top 10 most influential features.
final_fit %>%
  extract_fit_parsnip() %>%
  vip()
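The assignment stops at the training fit, but since we made a 70-30 split, a natural follow-up (sketched here as an addition) is to score the finalized workflow on the held-out test set with last_fit(), which refits on the training data and evaluates on the test data in one step:

# A sketch, not part of the assignment: evaluate on the 30% test set.
final_res <- last_fit(final_wf, split)
collect_metrics(final_res)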

Part 2: Tuning a Regularized Classification Model

library(kernlab)
## 
## Attaching package: 'kernlab'
## The following object is masked from 'package:purrr':
## 
##     cross
## The following object is masked from 'package:ggplot2':
## 
##     alpha
## The following object is masked from 'package:scales':
## 
##     alpha
data(spam)
  1. Split the data into a training set and test set using a 70-30% split.
set.seed(123)
split <- initial_split(spam, prop = 0.70, strata = type)
spam_train <- training(split)
spam_test <- testing(split)
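As an added check (not part of the assignment), we can confirm that stratified sampling preserved the spam/nonspam class balance in the training set:

# Class proportions in the training data.
spam_train %>%
  count(type) %>%
  mutate(prop = n / sum(n))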
  2. Create a recipe that will model type as a function of all predictor variables. Apply the following feature engineering steps in this order:
  1. Normalize all numeric predictor variables using a Yeo-Johnson transformation

  2. Standardize all numeric predictor variables

spam_recipe <- recipe(type ~ ., data = spam_train) %>%
  step_YeoJohnson(all_numeric_predictors()) %>%
  step_normalize(all_numeric_predictors())
  3. Create a 5-fold cross validation resampling object.
set.seed(123)
kfolds <- vfold_cv(spam_train, v = 5, strata = type)
  4. Create a regularized logistic regression model object that:
  1. Contains tuning placeholders for the mixture and penalty arguments

  2. Sets the engine to use the glmnet package.

  3. Sets the mode to be a classification model

logit_mod <- logistic_reg(mixture = tune(), penalty = tune()) %>%
  set_engine("glmnet") %>%
  set_mode("classification")
  5. Create our hyperparameter search grid that:
  1. Searches across default values for mixture

  2. Searches across values ranging from -10 to 5 for penalty (penalty() treats this range as log10 units, i.e., 10^-10 to 10^5)

  3. Will search across 10 values for each of these hyperparameters (levels)

logit_grid <- grid_regular(mixture(), penalty(range = c(-10, 5)), levels = 10)
  6. Create a workflow object that combines our recipe object with our model object.
spam_wf <- workflow() %>%
  add_recipe(spam_recipe) %>%
  add_model(logit_mod)
  7. Perform a hyperparameter search.
tuning_results <- spam_wf %>%
  tune_grid(resamples = kfolds, grid = logit_grid)
  8. Assess the results.
tuning_results %>%
  collect_metrics() %>%
  filter(.metric == "roc_auc") %>%
  arrange(desc(mean))
## # A tibble: 100 × 8
##         penalty mixture .metric .estimator  mean     n std_err .config          
##           <dbl>   <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>            
##  1 0.000464       1     roc_auc binary     0.979     5 0.00352 Preprocessor1_Mo…
##  2 0.000464       0.889 roc_auc binary     0.979     5 0.00349 Preprocessor1_Mo…
##  3 0.000464       0.778 roc_auc binary     0.979     5 0.00347 Preprocessor1_Mo…
##  4 0.000464       0.667 roc_auc binary     0.979     5 0.00344 Preprocessor1_Mo…
##  5 0.000464       0.222 roc_auc binary     0.979     5 0.00329 Preprocessor1_Mo…
##  6 0.000464       0.333 roc_auc binary     0.979     5 0.00333 Preprocessor1_Mo…
##  7 0.000464       0.556 roc_auc binary     0.979     5 0.00342 Preprocessor1_Mo…
##  8 0.000464       0.111 roc_auc binary     0.979     5 0.00328 Preprocessor1_Mo…
##  9 0.000464       0.444 roc_auc binary     0.979     5 0.00339 Preprocessor1_Mo…
## 10 0.0000000001   0.111 roc_auc binary     0.979     5 0.00335 Preprocessor1_Mo…
## # ℹ 90 more rows
autoplot(tuning_results)

  9. Finalize our workflow object with the optimal hyperparameter values.

best_hyperparameters <- select_best(tuning_results, metric = "roc_auc")
final_wf <- spam_wf %>%
  finalize_workflow(best_hyperparameters)
  10. Fit our final workflow object on the full training set.
final_fit <- final_wf %>%
  fit(data = spam_train)
  11. Plot the top 10 most influential features.
final_fit %>%
  extract_fit_parsnip() %>%
  vip()

Part 3: Tuning a MARS Classification Model

  1. Split the data into a training set and test set using a 70-30% split.
set.seed(123)
split <- initial_split(spam, prop = 0.70, strata = type)
spam_train <- training(split)
spam_test <- testing(split)
  2. Create a recipe that will model type as a function of all predictor variables. Apply the following feature engineering steps in this order:
  1. Normalize all numeric predictor variables using a Yeo-Johnson transformation

  2. Standardize all numeric predictor variables

spam_recipe <- recipe(type ~ ., data = spam_train) %>%
  step_YeoJohnson(all_numeric_predictors()) %>%
  step_normalize(all_numeric_predictors())
  3. Create a 5-fold cross validation resampling object.
set.seed(123)
kfolds <- vfold_cv(spam_train, v = 5, strata = type)
  4. Create a MARS model object that:
  1. Contains tuning placeholders for the num_terms and prod_degree arguments

  2. Sets the mode to be a classification model.

mars_mod <- mars(num_terms = tune(), prod_degree = tune()) %>%
  set_mode("classification")
  5. Create our hyperparameter search grid that:
  1. Searches across values ranging from 1 to 30 for num_terms

  2. Searches across default values for prod_degree

  3. Will search across 25 values for each of these hyperparameters (levels)

mars_grid <- grid_regular(num_terms(range = c(1, 30)), prod_degree(), levels = 25)
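One detail worth noting: prod_degree() only spans the values 1 and 2, so even with levels = 25 the full grid contains 25 × 2 = 50 candidates, which is why the metrics tibble below has 50 rows. A quick check (added here):

nrow(mars_grid)                 # 50 candidate combinations
count(mars_grid, prod_degree)   # 25 num_terms values at each degree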
  6. Create a workflow object that combines our recipe object with our model object.
spam_wf <- workflow() %>%
  add_recipe(spam_recipe) %>%
  add_model(mars_mod)
  7. Perform a hyperparameter search.
tuning_results <- spam_wf %>%
  tune_grid(resamples = kfolds, grid = mars_grid)
## Warning: package 'earth' was built under R version 4.4.2
## Warning: package 'plotmo' was built under R version 4.4.2
## → A | warning: glm.fit: algorithm did not converge, glm.fit: fitted probabilities numerically 0 or 1 occurred, the glm algorithm did not converge for response "spam"
## There were issues with some computations   A: x2
  8. Assess the results.
tuning_results %>%
  collect_metrics() %>%
  filter(.metric == "roc_auc") %>%
  arrange(desc(mean))
## # A tibble: 50 × 8
##    num_terms prod_degree .metric .estimator  mean     n std_err .config         
##        <int>       <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>           
##  1        25           2 roc_auc binary     0.975     5 0.00339 Preprocessor1_M…
##  2        26           2 roc_auc binary     0.975     5 0.00339 Preprocessor1_M…
##  3        27           2 roc_auc binary     0.975     5 0.00339 Preprocessor1_M…
##  4        28           2 roc_auc binary     0.975     5 0.00339 Preprocessor1_M…
##  5        30           2 roc_auc binary     0.975     5 0.00339 Preprocessor1_M…
##  6        22           2 roc_auc binary     0.975     5 0.00338 Preprocessor1_M…
##  7        20           2 roc_auc binary     0.975     5 0.00333 Preprocessor1_M…
##  8        21           2 roc_auc binary     0.975     5 0.00327 Preprocessor1_M…
##  9        23           2 roc_auc binary     0.975     5 0.00335 Preprocessor1_M…
## 10        16           2 roc_auc binary     0.975     5 0.00335 Preprocessor1_M…
## # ℹ 40 more rows
autoplot(tuning_results)

  9. Finalize our workflow object with the optimal hyperparameter values.
best_hyperparameters <- select_best(tuning_results, metric = "roc_auc")

final_wf <- spam_wf %>%
  finalize_workflow(best_hyperparameters)
  10. Fit our final workflow object on the full training set.
final_fit <- final_wf %>%
  fit(data = spam_train)
  11. Plot the top 10 most influential features.
final_fit %>%
  extract_fit_parsnip() %>%
  vip()
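As in the earlier parts, a final optional step (a sketch, not required by the assignment) is to evaluate the finalized MARS workflow on the held-out test set:

# A sketch, not part of the assignment: score the tuned MARS model on spam_test.
mars_test_res <- last_fit(final_wf, split, metrics = metric_set(roc_auc, accuracy))
collect_metrics(mars_test_res)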