library(tidymodels)
library(tidyverse)
library(baguette)
library(vip)
library(pdp)
library(kernlab)
data(spam)
# split data into train / test sets
set.seed(123)
split <- initial_split(spam, prop = 0.7, strata = type)
spam_train <- training(split)
spam_test <- testing(split)
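As an optional sanity check on the stratified split, class proportions in the two sets can be compared (a small sketch using dplyr, not required output):
# optional sanity check: the stratified split should give similar
# spam/nonspam proportions in the training and test sets
bind_rows(
  train = count(spam_train, type),
  test = count(spam_test, type),
  .id = "set"
) %>%
  group_by(set) %>%
  mutate(prop = n / sum(n))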
Part 1: Decision trees
#1
# Step 1: create decision tree model object
dt_mod <- decision_tree(mode = "classification") %>%
  set_engine("rpart")
# Step 2: create model recipe
model_recipe <- recipe(type ~ ., data = spam_train)
# Step 3: fit model workflow
dt_fit <- workflow() %>%
  add_recipe(model_recipe) %>%
  add_model(dt_mod) %>%
  fit(data = spam_train)
#2
# plot the fitted tree; roundint = FALSE avoids the warning about
# rpart.plot not being able to retrieve the original training data
dt_fit %>%
  extract_fit_engine() %>%
  rpart.plot::rpart.plot(roundint = FALSE)

#3
# create resampling procedure
set.seed(123)
kfold <- vfold_cv(spam_train, v = 5)
# train model
dt_results <- fit_resamples(dt_mod, model_recipe, kfold)
# model results
collect_metrics(dt_results)
## # A tibble: 2 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 accuracy binary 0.890 5 0.00510 Preprocessor1_Model1
## 2 roc_auc binary 0.903 5 0.00808 Preprocessor1_Model1
#4
dt_mod <- decision_tree(
  mode = "classification",
  cost_complexity = tune(),
  tree_depth = tune(),
  min_n = tune()
) %>%
  set_engine("rpart")
# create the hyperparameter grid
dt_hyper_grid <- grid_regular(
  cost_complexity(),
  tree_depth(),
  min_n(),
  levels = 5
)
# train our model across the hyperparameter grid
set.seed(123)
dt_results <- tune_grid(dt_mod, model_recipe, resamples = kfold, grid = dt_hyper_grid)
# get best results
show_best(dt_results, metric = "roc_auc", n = 5)
## # A tibble: 5 × 9
## cost_complexity tree_depth min_n .metric .estima…¹ mean n std_err .config
## <dbl> <int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 0.0000000001 15 21 roc_auc binary 0.943 5 0.00417 Prepro…
## 2 0.0000000178 15 21 roc_auc binary 0.943 5 0.00417 Prepro…
## 3 0.00000316 15 21 roc_auc binary 0.943 5 0.00417 Prepro…
## 4 0.0000000001 11 21 roc_auc binary 0.937 5 0.00539 Prepro…
## 5 0.0000000178 11 21 roc_auc binary 0.937 5 0.00539 Prepro…
## # … with abbreviated variable name ¹.estimator
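To see how ROC AUC varies across the whole grid rather than just the top five rows, the tune results can be plotted (an optional sketch; autoplot() is a convenience method for tune results):
# optional: visualize CV ROC AUC across the tuning grid
autoplot(dt_results, metric = "roc_auc")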
#5
# get best hyperparameter values
dt_best_model <- select_best(dt_results, metric = 'roc_auc')
# put together final workflow
dt_final_wf <- workflow() %>%
  add_recipe(model_recipe) %>%
  add_model(dt_mod) %>%
  finalize_workflow(dt_best_model)
# fit final workflow across entire training data
dt_final_fit <- dt_final_wf %>%
  fit(data = spam_train)
# plot feature importance
dt_final_fit %>%
  extract_fit_parsnip() %>%
  vip(num_features = 20)
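Since pdp is loaded, a partial dependence sketch for one influential predictor could follow the importance plot; charExclamation is an assumed example from kernlab's spam column names, and the profile uses pdp's default of the first factor level:
# optional sketch: partial dependence of the predicted class probability on
# one predictor (charExclamation assumed; substitute any variable from the
# vip plot above)
dt_final_fit %>%
  extract_fit_engine() %>%
  pdp::partial(
    pred.var = "charExclamation",
    train = select(spam_train, -type),
    prob = TRUE
  ) %>%
  autoplot()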

Part 2: Bagging
#1
# create bagged CART model object and
# start with 5 bagged trees
bag_mod <- bag_tree() %>%
  set_engine("rpart", times = 5) %>%
  set_mode("classification")
# train model
bag_results <- fit_resamples(bag_mod, model_recipe, kfold)
# model results
collect_metrics(bag_results)
## # A tibble: 2 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 accuracy binary 0.927 5 0.00436 Preprocessor1_Model1
## 2 roc_auc binary 0.961 5 0.00356 Preprocessor1_Model1
#2
# create bagged CART model object with
# tuning option set for number of bagged trees
bag_mod <- bag_tree() %>%
  set_engine("rpart", times = tune()) %>%
  set_mode("classification")
# create the hyperparameter grid
bag_hyper_grid <- expand.grid(times = c(5, 25, 50, 100, 200, 300))
# train our model across the hyperparameter grid
set.seed(123)
bag_results <- tune_grid(bag_mod, model_recipe, resamples = kfold, grid = bag_hyper_grid)
# model results
show_best(bag_results, metric = "roc_auc")
## # A tibble: 5 × 7
## times .metric .estimator mean n std_err .config
## <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 200 roc_auc binary 0.975 5 0.00305 Preprocessor1_Model5
## 2 300 roc_auc binary 0.975 5 0.00296 Preprocessor1_Model6
## 3 100 roc_auc binary 0.975 5 0.00316 Preprocessor1_Model4
## 4 50 roc_auc binary 0.973 5 0.00261 Preprocessor1_Model3
## 5 25 roc_auc binary 0.973 5 0.00298 Preprocessor1_Model2
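The flattening of ROC AUC beyond roughly 100 bags is easier to see in a quick plot of the collected metrics (an optional sketch using the metrics already computed):
# optional: mean CV ROC AUC as a function of the number of bagged trees
collect_metrics(bag_results) %>%
  filter(.metric == "roc_auc") %>%
  ggplot(aes(x = times, y = mean)) +
  geom_line() +
  geom_point() +
  labs(x = "Number of bagged trees", y = "ROC AUC (5-fold CV mean)")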
Part 3: Random forests
#1
# create random forest model object and
# use the ranger package for the engine
rf_mod <- rand_forest(mode = "classification") %>%
  set_engine("ranger")
# train model
rf_results <- fit_resamples(rf_mod, model_recipe, kfold)
# model results
collect_metrics(rf_results)
## # A tibble: 2 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 accuracy binary 0.946 5 0.00205 Preprocessor1_Model1
## 2 roc_auc binary 0.983 5 0.00262 Preprocessor1_Model1
#2
# create random forest model object with tuning option
rf_mod <- rand_forest(
  mode = "classification",
  trees = tune(),
  mtry = tune(),
  min_n = tune()
) %>%
  set_engine("ranger", importance = "impurity")
# create the hyperparameter grid
rf_hyper_grid <- grid_regular(
  trees(range = c(50, 800)),
  mtry(range = c(2, 50)),
  min_n(range = c(1, 20)),
  levels = 5
)
# train our model across the hyperparameter grid
set.seed(123)
rf_results <- tune_grid(rf_mod, model_recipe, resamples = kfold, grid = rf_hyper_grid)
# model results
show_best(rf_results, metric = "roc_auc")
## # A tibble: 5 × 9
## mtry trees min_n .metric .estimator mean n std_err .config
## <int> <int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 2 237 1 roc_auc binary 0.984 5 0.00230 Preprocessor1_Model0…
## 2 2 800 1 roc_auc binary 0.984 5 0.00214 Preprocessor1_Model0…
## 3 2 800 5 roc_auc binary 0.984 5 0.00226 Preprocessor1_Model0…
## 4 2 612 10 roc_auc binary 0.984 5 0.00215 Preprocessor1_Model0…
## 5 2 425 5 roc_auc binary 0.984 5 0.00220 Preprocessor1_Model0…
#3
# get optimal hyperparameters
rf_best_hyperparameters <- select_best(rf_results, metric = "roc_auc")
# create final workflow object
final_rf_wf <- workflow() %>%
  add_recipe(model_recipe) %>%
  add_model(rf_mod) %>%
  finalize_workflow(rf_best_hyperparameters)
# fit final workflow object
rf_final_fit <- final_rf_wf %>%
  fit(data = spam_train)
# plot feature importance
rf_final_fit %>%
  extract_fit_parsnip() %>%
  vip(num_features = 20)
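The write-up stops at the training fit; as an optional final check, the held-out test set from the initial split could be scored with last_fit():
# optional: refit the finalized workflow on the full training set and
# evaluate once on the held-out test data
rf_test_results <- last_fit(final_rf_wf, split)
collect_metrics(rf_test_results)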
