library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom 1.0.6 ✔ recipes 1.1.0
## ✔ dials 1.3.0 ✔ rsample 1.2.1
## ✔ dplyr 1.1.4 ✔ tibble 3.2.1
## ✔ ggplot2 3.5.1 ✔ tidyr 1.3.1
## ✔ infer 1.0.7 ✔ tune 1.2.1
## ✔ modeldata 1.4.0 ✔ workflows 1.1.4
## ✔ parsnip 1.2.1 ✔ workflowsets 1.1.0
## ✔ purrr 1.0.2 ✔ yardstick 1.3.1
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ recipes::step() masks stats::step()
## • Learn how to get started at https://www.tidymodels.org/start/
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ lubridate 1.9.3 ✔ stringr 1.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ readr::col_factor() masks scales::col_factor()
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ stringr::fixed() masks recipes::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ readr::spec() masks yardstick::spec()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(vip)
##
## Attaching package: 'vip'
##
## The following object is masked from 'package:utils':
##
## vi
library(readr)
library(Matrix)
## Warning: package 'Matrix' was built under R version 4.4.2
##
## Attaching package: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
boston <- read_csv("boston.csv")
## Rows: 506 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (16): lon, lat, cmedv, crim, zn, indus, chas, nox, rm, age, dis, rad, ta...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
set.seed(123)
split <- initial_split(boston, 0.70, strata = cmedv)
boston_train <- training(split)
boston <- testing(split)
boston_recipe <- recipe(cmedv ~ ., data = boston_train) %>%
step_YeoJohnson(all_numeric_predictors()) %>%
step_normalize(all_numeric_predictors())
set.seed(123)
kfolds <- vfold_cv(boston_train, v = 5, strata = cmedv)
Contains tuning placeholders for the mixture and penalty arguments
Sets the engine to use the glmnet package.
reg_mod <- linear_reg(mixture = tune(), penalty = tune()) %>%
set_engine("glmnet")
Searches across default values for mixture
Searches across values ranging from -10 to 5 for penalty
Will search across 10 values for each of these hyperparameters (levels)
reg_grid <- grid_regular(penalty(range = c(-10, 5)), mixture(), levels = 10)
reg_grid
## # A tibble: 100 × 2
## penalty mixture
## <dbl> <dbl>
## 1 1 e-10 0
## 2 4.64e- 9 0
## 3 2.15e- 7 0
## 4 1 e- 5 0
## 5 4.64e- 4 0
## 6 2.15e- 2 0
## 7 1 e+ 0 0
## 8 4.64e+ 1 0
## 9 2.15e+ 3 0
## 10 1 e+ 5 0
## # ℹ 90 more rows
boston_wf <- workflow() %>%
add_recipe(boston_recipe) %>%
add_model(reg_mod)
tuning_results <- boston_wf %>%
tune_grid(resamples = kfolds, grid = reg_grid)
## Warning: package 'glmnet' was built under R version 4.4.2
## → A | warning: A correlation computation is required, but `estimate` is constant and has 0
## standard deviation, resulting in a divide by 0 error. `NA` will be returned.
## There were issues with some computations A: x1There were issues with some computations A: x2There were issues with some computations A: x3There were issues with some computations A: x4There were issues with some computations A: x5There were issues with some computations A: x5
tuning_results %>%
collect_metrics() %>%
filter(.metric == "rmse") %>%
arrange(mean)
## # A tibble: 100 × 8
## penalty mixture .metric .estimator mean n std_err .config
## <dbl> <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 0.0215 1 rmse standard 4.46 5 0.235 Preprocessor1_M…
## 2 0.0215 0.889 rmse standard 4.46 5 0.235 Preprocessor1_M…
## 3 0.0215 0.778 rmse standard 4.46 5 0.235 Preprocessor1_M…
## 4 0.0215 0.667 rmse standard 4.47 5 0.234 Preprocessor1_M…
## 5 0.0215 0.556 rmse standard 4.47 5 0.234 Preprocessor1_M…
## 6 0.0215 0.444 rmse standard 4.47 5 0.234 Preprocessor1_M…
## 7 0.0000000001 0.667 rmse standard 4.47 5 0.233 Preprocessor1_M…
## 8 0.00000000464 0.667 rmse standard 4.47 5 0.233 Preprocessor1_M…
## 9 0.000000215 0.667 rmse standard 4.47 5 0.233 Preprocessor1_M…
## 10 0.00001 0.667 rmse standard 4.47 5 0.233 Preprocessor1_M…
## # ℹ 90 more rows
Assess this plot regarding our hyperparameter search results. What do the results tell you? Does the amount of regularization (size of the penalty) have a larger influence on the RMSE or does the type of penalty applied (mixture) have a larger influence?
autoplot(tuning_results)
best_hyperparameters <- select_best(tuning_results, metric = "rmse")
final_wf <- workflow() %>%
add_recipe(boston_recipe) %>%
add_model(reg_mod) %>%
finalize_workflow(best_hyperparameters)
final_fit <- final_wf %>%
fit(data = boston_train)
final_fit %>%
extract_fit_parsnip() %>%
vip()
library(kernlab)
##
## Attaching package: 'kernlab'
## The following object is masked from 'package:purrr':
##
## cross
## The following object is masked from 'package:ggplot2':
##
## alpha
## The following object is masked from 'package:scales':
##
## alpha
data(spam)
set.seed(123)
split <- initial_split(spam, prop = 0.70, strata = type)
spam_train <- training(split)
spam_test <- testing(split)
Normalize all numeric predictor variables using a Yeo-Johnson transformation
Standardize all numeric predictor variables
spam_recipe <- recipe(type ~ ., data = spam_train) %>%
step_YeoJohnson(all_numeric_predictors()) %>%
step_normalize(all_numeric_predictors())
set.seed(123)
kfolds <- vfold_cv(spam_train, v = 5, strata = type)
Contains tuning placeholders for the mixture and penalty arguments
Sets the engine to use the glmnet package.
Sets the mode to be a classification model
logit_mod <- logistic_reg(mixture = tune(), penalty = tune()) %>%
set_engine("glmnet") %>%
set_mode("classification")
Searches across default values for mixture
Searches across values ranging from -10 to 5 for penalty
Will search across 10 values for each of these hyperparameters (levels)
logit_grid <- grid_regular(mixture (), penalty(range = c(-10, 5)), levels = 10)
spam_wf <- workflow() %>%
add_recipe(spam_recipe) %>%
add_model(logit_mod)
tuning_results <- spam_wf %>%
tune_grid(resamples = kfolds, grid = logit_grid)
tuning_results %>%
collect_metrics() %>%
filter(.metric == "roc_auc") %>%
arrange(desc(mean))
## # A tibble: 100 × 8
## penalty mixture .metric .estimator mean n std_err .config
## <dbl> <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 0.000464 1 roc_auc binary 0.979 5 0.00352 Preprocessor1_Mo…
## 2 0.000464 0.889 roc_auc binary 0.979 5 0.00349 Preprocessor1_Mo…
## 3 0.000464 0.778 roc_auc binary 0.979 5 0.00347 Preprocessor1_Mo…
## 4 0.000464 0.667 roc_auc binary 0.979 5 0.00344 Preprocessor1_Mo…
## 5 0.000464 0.222 roc_auc binary 0.979 5 0.00329 Preprocessor1_Mo…
## 6 0.000464 0.333 roc_auc binary 0.979 5 0.00333 Preprocessor1_Mo…
## 7 0.000464 0.556 roc_auc binary 0.979 5 0.00342 Preprocessor1_Mo…
## 8 0.000464 0.111 roc_auc binary 0.979 5 0.00328 Preprocessor1_Mo…
## 9 0.000464 0.444 roc_auc binary 0.979 5 0.00339 Preprocessor1_Mo…
## 10 0.0000000001 0.111 roc_auc binary 0.979 5 0.00335 Preprocessor1_Mo…
## # ℹ 90 more rows
autoplot(tuning_results)
1. finalize our workflow object with the optimal hyperparameter
values
best_hyperparameters <- select_best(tuning_results, metric = "roc_auc")
final_wf <- workflow() %>%
add_recipe(spam_recipe) %>%
add_model(logit_mod) %>%
finalize_workflow(best_hyperparameters)
final_fit <- final_wf %>%
fit(data = spam_train)
final_fit %>%
extract_fit_parsnip() %>%
vip()
set.seed(123)
split <- initial_split(spam, prop = 0.70, strata = type)
spam_train <- training(split)
spam_test <- testing(split)
Normalize all numeric predictor variables using a Yeo-Johnson transformation
Standardize all numeric predictor variables
spam_recipe <- recipe(type ~ ., data = spam_train) %>%
step_YeoJohnson(all_numeric_predictors()) %>%
step_normalize(all_numeric_predictors())
set.seed(123)
kfolds <- vfold_cv(spam_train, v = 5, strata = type)
Contains tuning placeholders for the num_terms and prod_degree arguments
Sets the mode to be a classification model.
mars_mod <- mars(num_terms = tune(), prod_degree = tune()) %>%
set_mode("classification")
Searches across values ranging from 1 to 30 for num_terms
Searches across default values for prod_degree
Will search across 25 values for each of these hyperparameters (levels)
mars_grid <- grid_regular(num_terms(range = c(1,30)), prod_degree(), levels = 25)
spam_wf <- workflow() %>%
add_recipe(spam_recipe) %>%
add_model(mars_mod)
tuning_results <- spam_wf %>%
tune_grid(resamples = kfolds, grid = mars_grid)
## Warning: package 'earth' was built under R version 4.4.2
## Warning: package 'plotmo' was built under R version 4.4.2
## → A | warning: glm.fit: algorithm did not converge, glm.fit: fitted probabilities numerically 0 or 1 occurred, the glm algorithm did not converge for response "spam"
## There were issues with some computations A: x1There were issues with some computations A: x2There were issues with some computations A: x2
tuning_results %>%
collect_metrics() %>%
filter(.metric == "roc_auc") %>%
arrange(desc(mean))
## # A tibble: 50 × 8
## num_terms prod_degree .metric .estimator mean n std_err .config
## <int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 25 2 roc_auc binary 0.975 5 0.00339 Preprocessor1_M…
## 2 26 2 roc_auc binary 0.975 5 0.00339 Preprocessor1_M…
## 3 27 2 roc_auc binary 0.975 5 0.00339 Preprocessor1_M…
## 4 28 2 roc_auc binary 0.975 5 0.00339 Preprocessor1_M…
## 5 30 2 roc_auc binary 0.975 5 0.00339 Preprocessor1_M…
## 6 22 2 roc_auc binary 0.975 5 0.00338 Preprocessor1_M…
## 7 20 2 roc_auc binary 0.975 5 0.00333 Preprocessor1_M…
## 8 21 2 roc_auc binary 0.975 5 0.00327 Preprocessor1_M…
## 9 23 2 roc_auc binary 0.975 5 0.00335 Preprocessor1_M…
## 10 16 2 roc_auc binary 0.975 5 0.00335 Preprocessor1_M…
## # ℹ 40 more rows
autoplot(tuning_results)
best_hyperparameters <- select_best(tuning_results, metric = "roc_auc")
final_wf <- workflow() %>%
add_recipe(spam_recipe) %>%
add_model(mars_mod) %>%
finalize_workflow(best_hyperparameters)
final_fit <- final_wf %>%
fit(data = spam_train)
final_fit %>%
extract_fit_parsnip() %>%
vip()