# Helper packages
library(tidyverse)   # for data wrangling & plotting

# Modeling packages
library(tidymodels)  # for data splitting, recipes, tuning & model fitting
library(earth)       # engine used by parsnip to fit MARS models

# Model interpretability packages
library(vip)         # for variable importance
library(pdp)         # for variable relationships
boston <- read_csv("data/boston.csv")
set.seed(123)
split <- initial_split(boston, prop = .7, strata = cmedv)
boston_train <- training(split)
boston_test <- testing(split)
boston_recipe <- recipe(cmedv ~ ., data = boston_train) %>%
  step_YeoJohnson(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors())
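To verify what these preprocessing steps actually produce, you can prep the recipe on the training data and bake it. This is an optional sanity check, not part of the tuning pipeline:

# Prep the recipe and peek at the processed training data
boston_recipe %>%
  prep() %>%
  bake(new_data = NULL) %>%
  glimpse()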
set.seed(123)
kfolds <- vfold_cv(boston_train, v = 5, strata = cmedv)
reg_mod <- linear_reg(mixture = tune(), penalty = tune()) %>%
  set_engine("glmnet")
reg_grid <- grid_regular(mixture(), penalty(range = c(-10,5)), levels = 10)
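Note that penalty() is defined on the log10 scale, so range = c(-10, 5) spans values from 1e-10 to 1e5. A quick way to confirm the candidate values the grid contains:

# 10 penalty values crossed with 10 mixture values = 100 candidates
reg_grid %>%
  distinct(penalty) %>%
  arrange(penalty)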
boston_wf <- workflow() %>%
  add_recipe(boston_recipe) %>%
  add_model(reg_mod)
tuning_results <- boston_wf %>%
  tune_grid(resamples = kfolds, grid = reg_grid)
tuning_results %>%
  collect_metrics() %>%
  filter(.metric == "rmse") %>%
  arrange(mean)
## # A tibble: 100 × 8
## penalty mixture .metric .estimator mean n std_err .config
## <dbl> <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 0.0215 1 rmse standard 4.46 5 0.235 Preprocessor1_M…
## 2 0.0215 0.889 rmse standard 4.46 5 0.235 Preprocessor1_M…
## 3 0.0215 0.778 rmse standard 4.46 5 0.235 Preprocessor1_M…
## 4 0.0215 0.667 rmse standard 4.47 5 0.234 Preprocessor1_M…
## 5 0.0215 0.556 rmse standard 4.47 5 0.234 Preprocessor1_M…
## 6 0.0215 0.444 rmse standard 4.47 5 0.234 Preprocessor1_M…
## 7 0.0000000001 0.667 rmse standard 4.47 5 0.233 Preprocessor1_M…
## 8 0.00000000464 0.667 rmse standard 4.47 5 0.233 Preprocessor1_M…
## 9 0.000000215 0.667 rmse standard 4.47 5 0.233 Preprocessor1_M…
## 10 0.00001 0.667 rmse standard 4.47 5 0.233 Preprocessor1_M…
## # … with 90 more rows
autoplot(tuning_results)
# Finalize our workflow object with the optimal hyperparameter values
best_hyperparameters <- select_best(tuning_results, metric = "rmse")
final_wf <- workflow() %>%
  add_recipe(boston_recipe) %>%
  add_model(reg_mod) %>%
  finalize_workflow(best_hyperparameters)
final_fit <- final_wf %>%
  fit(data = boston_train)
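Because glmnet fits a linear model, we can also inspect the estimated coefficients directly. A minimal sketch using the tidy() method that parsnip/broom provide for glmnet fits:

# Coefficients at the selected penalty value
final_fit %>%
  extract_fit_parsnip() %>%
  tidy()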
final_fit %>%
  extract_fit_parsnip() %>%
  vip()
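As a last step, it is common to evaluate the finalized workflow on the held-out test set. One way, sketched below, is last_fit(), which refits on the training data and computes metrics on the test data in a single call:

# Fit on boston_train, evaluate on boston_test
final_wf %>%
  last_fit(split) %>%
  collect_metrics()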
# Part 2: Tuning a Regularized Classification Model
library(kernlab)
data(spam)
set.seed(123) # for reproducibility
split <- initial_split(spam, prop = .7, strata = type)
spam_train <- training(split)
spam_test <- testing(split)
spam_recipe <- recipe(type ~ ., data = spam_train) %>%
  step_YeoJohnson(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors())
set.seed(123)
kfolds <- vfold_cv(spam_train, v = 5, strata = type)
logit_mod <- logistic_reg(mixture = tune(), penalty = tune()) %>%
  set_engine("glmnet") %>%
  set_mode("classification")
logit_grid <- grid_regular(mixture(), penalty(range = c(-10,5)), levels = 10)
spam_wf <- workflow() %>%
  add_recipe(spam_recipe) %>%
  add_model(logit_mod)
tuning_results <- spam_wf %>%
  tune_grid(resamples = kfolds, grid = logit_grid)
tuning_results %>%
  collect_metrics() %>%
  filter(.metric == "roc_auc") %>%
  arrange(desc(mean))
## # A tibble: 100 × 8
## penalty mixture .metric .estimator mean n std_err .config
## <dbl> <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 0.000464 1 roc_auc binary 0.979 5 0.00352 Preprocessor1_Mo…
## 2 0.000464 0.889 roc_auc binary 0.979 5 0.00349 Preprocessor1_Mo…
## 3 0.000464 0.778 roc_auc binary 0.979 5 0.00347 Preprocessor1_Mo…
## 4 0.000464 0.667 roc_auc binary 0.979 5 0.00344 Preprocessor1_Mo…
## 5 0.000464 0.222 roc_auc binary 0.979 5 0.00329 Preprocessor1_Mo…
## 6 0.000464 0.333 roc_auc binary 0.979 5 0.00333 Preprocessor1_Mo…
## 7 0.000464 0.556 roc_auc binary 0.979 5 0.00342 Preprocessor1_Mo…
## 8 0.000464 0.111 roc_auc binary 0.979 5 0.00328 Preprocessor1_Mo…
## 9 0.000464 0.444 roc_auc binary 0.979 5 0.00339 Preprocessor1_Mo…
## 10 0.0000000001 0.111 roc_auc binary 0.979 5 0.00335 Preprocessor1_Mo…
## # … with 90 more rows
autoplot(tuning_results)
best_hyperparameters <- select_best(tuning_results, metric = "roc_auc")
final_wf <- workflow() %>%
  add_recipe(spam_recipe) %>%
  add_model(logit_mod) %>%
  finalize_workflow(best_hyperparameters)
final_fit <- final_wf %>%
  fit(data = spam_train)
final_fit %>%
  extract_fit_parsnip() %>%
  vip()
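To see how the tuned classifier generalizes, you can score the test set directly. The sketch below computes the test ROC AUC; note that yardstick treats the first factor level (nonspam in this data) as the event by default:

# Test-set ROC AUC from predicted class probabilities
final_fit %>%
  predict(spam_test, type = "prob") %>%
  bind_cols(spam_test %>% select(type)) %>%
  roc_auc(truth = type, .pred_nonspam)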
# Part 3: Tuning a MARS Classification Model
library(kernlab)
data(spam)
set.seed(123) # for reproducibility
split <- initial_split(spam, prop = .7, strata = type)
spam_train <- training(split)
spam_test <- testing(split)
spam_recipe <- recipe(type ~ ., data = spam_train) %>%
  step_YeoJohnson(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors())
set.seed(123)
kfolds <- vfold_cv(spam_train, v = 5, strata = type)
mars_mod <- mars(num_terms = tune(), prod_degree = tune()) %>%
  set_engine("earth") %>%
  set_mode("classification")
mars_grid <- grid_regular(num_terms(range = c(1,30)), prod_degree(), levels = 25)
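Since prod_degree() only takes the values 1 and 2, the grid crosses 25 num_terms values with 2 product degrees for 50 candidate models, even though levels = 25:

# Confirm the grid size
nrow(mars_grid)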
spam_wf <- workflow() %>%
  add_recipe(spam_recipe) %>%
  add_model(mars_mod)
tuning_results <- spam_wf %>%
  tune_grid(resamples = kfolds, grid = mars_grid)
tuning_results %>%
  collect_metrics() %>%
  filter(.metric == "roc_auc") %>%
  arrange(desc(mean))
## # A tibble: 50 × 8
## num_terms prod_degree .metric .estimator mean n std_err .config
## <int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 25 2 roc_auc binary 0.975 5 0.00339 Preprocessor1_M…
## 2 26 2 roc_auc binary 0.975 5 0.00339 Preprocessor1_M…
## 3 27 2 roc_auc binary 0.975 5 0.00339 Preprocessor1_M…
## 4 28 2 roc_auc binary 0.975 5 0.00339 Preprocessor1_M…
## 5 30 2 roc_auc binary 0.975 5 0.00339 Preprocessor1_M…
## 6 22 2 roc_auc binary 0.975 5 0.00339 Preprocessor1_M…
## 7 20 2 roc_auc binary 0.975 5 0.00334 Preprocessor1_M…
## 8 21 2 roc_auc binary 0.975 5 0.00328 Preprocessor1_M…
## 9 23 2 roc_auc binary 0.975 5 0.00335 Preprocessor1_M…
## 10 19 2 roc_auc binary 0.975 5 0.00330 Preprocessor1_M…
## # … with 40 more rows
autoplot(tuning_results)
best_hyperparameters <- select_best(tuning_results, metric = "roc_auc")
final_wf <- workflow() %>%
  add_recipe(spam_recipe) %>%
  add_model(mars_mod) %>%
  finalize_workflow(best_hyperparameters)
final_fit <- final_wf %>%
  fit(data = spam_train)
final_fit %>%
  extract_fit_parsnip() %>%
  vip()
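Finally, a quick test-set check for the MARS classifier. This sketch builds a confusion matrix from the hard class predictions:

# Confusion matrix on the held-out test set
final_fit %>%
  predict(spam_test) %>%
  bind_cols(spam_test %>% select(type)) %>%
  conf_mat(truth = type, estimate = .pred_class)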