library(tidymodels)
library(tidyverse)
library(baguette)
library(vip)
library(pdp)
library(kernlab)
data(spam)
# split data into train / test sets
set.seed(123)
split <- initial_split(spam, prop = 0.7, strata = type)
spam_train <- training(split)
spam_test <- testing(split)
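As an optional sanity check on the stratified split, class proportions in the two sets can be compared (a small sketch using dplyr, not required output):
# optional sanity check: the stratified split should give similar
# spam/nonspam proportions in the training and test sets
bind_rows(
  train = count(spam_train, type),
  test = count(spam_test, type),
  .id = "set"
) %>%
  group_by(set) %>%
  mutate(prop = n / sum(n))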
Part 1: Decision trees
#1
# Step 1: create decision tree model object
dt_mod <- decision_tree(mode = "classification") %>%
  set_engine("rpart")
# Step 2: create model recipe
model_recipe <- recipe(type ~ ., data = spam_train)
# Step 3: fit model workflow
dt_fit <- workflow() %>%
  add_recipe(model_recipe) %>%
  add_model(dt_mod) %>%
  fit(data = spam_train)
#2
# plot the fitted tree; roundint = FALSE avoids the warning about
# rpart.plot not being able to retrieve the original training data
dt_fit %>%
  extract_fit_engine() %>%
  rpart.plot::rpart.plot(roundint = FALSE)

#3
# create resampling procedure
set.seed(123)
kfold <- vfold_cv(spam_train, v = 5)
# train model
dt_results <- fit_resamples(dt_mod, model_recipe, kfold)
# model results
collect_metrics(dt_results)
## # A tibble: 2 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 accuracy binary 0.890 5 0.00510 Preprocessor1_Model1
## 2 roc_auc binary 0.903 5 0.00808 Preprocessor1_Model1
#4
dt_mod <- decision_tree(
  mode = "classification",
  cost_complexity = tune(),
  tree_depth = tune(),
  min_n = tune()
) %>%
  set_engine("rpart")
# create the hyperparameter grid
dt_hyper_grid <- grid_regular(
  cost_complexity(),
  tree_depth(),
  min_n(),
  levels = 5
)
# train our model across the hyperparameter grid
set.seed(123)
dt_results <- tune_grid(dt_mod, model_recipe, resamples = kfold, grid = dt_hyper_grid)
# get best results
show_best(dt_results, metric = "roc_auc", n = 5)
## # A tibble: 5 × 9
## cost_complexity tree_depth min_n .metric .estima…¹ mean n std_err .config
## <dbl> <int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 0.0000000001 15 21 roc_auc binary 0.943 5 0.00417 Prepro…
## 2 0.0000000178 15 21 roc_auc binary 0.943 5 0.00417 Prepro…
## 3 0.00000316 15 21 roc_auc binary 0.943 5 0.00417 Prepro…
## 4 0.0000000001 11 21 roc_auc binary 0.937 5 0.00539 Prepro…
## 5 0.0000000178 11 21 roc_auc binary 0.937 5 0.00539 Prepro…
## # … with abbreviated variable name ¹.estimator
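To see how ROC AUC varies across the whole grid rather than just the top five rows, the tune results can be plotted (an optional sketch; autoplot() is a convenience method for tune results):
# optional: visualize CV ROC AUC across the tuning grid
autoplot(dt_results, metric = "roc_auc")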
#5
# get best hyperparameter values
dt_best_model <- select_best(dt_results, metric = 'roc_auc')
# put together final workflow
dt_final_wf <- workflow() %>%
  add_recipe(model_recipe) %>%
  add_model(dt_mod) %>%
  finalize_workflow(dt_best_model)
# fit final workflow across entire training data
dt_final_fit <- dt_final_wf %>%
  fit(data = spam_train)
# plot feature importance
dt_final_fit %>%
  extract_fit_parsnip() %>%
  vip(num_features = 20)
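Since pdp is loaded, a partial dependence sketch for one influential predictor could follow the importance plot; charExclamation is an assumed example from kernlab's spam column names, and the profile uses pdp's default of the first factor level:
# optional sketch: partial dependence of the predicted class probability on
# one predictor (charExclamation assumed; substitute any variable from the
# vip plot above)
dt_final_fit %>%
  extract_fit_engine() %>%
  pdp::partial(
    pred.var = "charExclamation",
    train = select(spam_train, -type),
    prob = TRUE
  ) %>%
  autoplot()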

Part 2: Bagging
#1
# create bagged CART model object and
# start with 5 bagged trees
bag_mod <- bag_tree() %>%
  set_engine("rpart", times = 5) %>%
  set_mode("classification")
# train model
bag_results <- fit_resamples(bag_mod, model_recipe, kfold)
# model results
collect_metrics(bag_results)
## # A tibble: 2 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 accuracy binary 0.927 5 0.00436 Preprocessor1_Model1
## 2 roc_auc binary 0.961 5 0.00356 Preprocessor1_Model1
#2
# create bagged CART model object with
# tuning option set for number of bagged trees
bag_mod <- bag_tree() %>%
  set_engine("rpart", times = tune()) %>%
  set_mode("classification")
# create the hyperparameter grid
bag_hyper_grid <- expand.grid(times = c(5, 25, 50, 100, 200, 300))
# train our model across the hyperparameter grid
set.seed(123)
bag_results <- tune_grid(bag_mod, model_recipe, resamples = kfold, grid = bag_hyper_grid)
# model results
show_best(bag_results, metric = "roc_auc")
## # A tibble: 5 × 7
## times .metric .estimator mean n std_err .config
## <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 200 roc_auc binary 0.975 5 0.00305 Preprocessor1_Model5
## 2 300 roc_auc binary 0.975 5 0.00296 Preprocessor1_Model6
## 3 100 roc_auc binary 0.975 5 0.00316 Preprocessor1_Model4
## 4 50 roc_auc binary 0.973 5 0.00261 Preprocessor1_Model3
## 5 25 roc_auc binary 0.973 5 0.00298 Preprocessor1_Model2
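The flattening of ROC AUC beyond roughly 100 bags is easier to see in a quick plot of the collected metrics (an optional sketch using the metrics already computed):
# optional: mean CV ROC AUC as a function of the number of bagged trees
collect_metrics(bag_results) %>%
  filter(.metric == "roc_auc") %>%
  ggplot(aes(x = times, y = mean)) +
  geom_line() +
  geom_point() +
  labs(x = "Number of bagged trees", y = "ROC AUC (5-fold CV mean)")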
Part 3: Random forests
#1
# create random forest model object and
# use the ranger package for the engine
rf_mod <- rand_forest(mode = "classification") %>%
  set_engine("ranger")
# train model
rf_results <- fit_resamples(rf_mod, model_recipe, kfold)
# model results
collect_metrics(rf_results)
## # A tibble: 2 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 accuracy binary 0.946 5 0.00205 Preprocessor1_Model1
## 2 roc_auc binary 0.983 5 0.00262 Preprocessor1_Model1
#2
# create random forest model object with tuning option
rf_mod <- rand_forest(
  mode = "classification",
  trees = tune(),
  mtry = tune(),
  min_n = tune()
) %>%
  set_engine("ranger", importance = "impurity")
# create the hyperparameter grid
rf_hyper_grid <- grid_regular(
  trees(range = c(50, 800)),
  mtry(range = c(2, 50)),
  min_n(range = c(1, 20)),
  levels = 5
)
# train our model across the hyperparameter grid
set.seed(123)
rf_results <- tune_grid(rf_mod, model_recipe, resamples = kfold, grid = rf_hyper_grid)
# model results
show_best(rf_results, metric = "roc_auc")
## # A tibble: 5 × 9
## mtry trees min_n .metric .estimator mean n std_err .config
## <int> <int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 2 237 1 roc_auc binary 0.984 5 0.00230 Preprocessor1_Model0…
## 2 2 800 1 roc_auc binary 0.984 5 0.00214 Preprocessor1_Model0…
## 3 2 800 5 roc_auc binary 0.984 5 0.00226 Preprocessor1_Model0…
## 4 2 612 10 roc_auc binary 0.984 5 0.00215 Preprocessor1_Model0…
## 5 2 425 5 roc_auc binary 0.984 5 0.00220 Preprocessor1_Model0…
#3
# get optimal hyperparameters
rf_best_hyperparameters <- select_best(rf_results, metric = "roc_auc")
# create final workflow object
final_rf_wf <- workflow() %>%
  add_recipe(model_recipe) %>%
  add_model(rf_mod) %>%
  finalize_workflow(rf_best_hyperparameters)
# fit final workflow object
rf_final_fit <- final_rf_wf %>%
  fit(data = spam_train)
# plot feature importance
rf_final_fit %>%
  extract_fit_parsnip() %>%
  vip(num_features = 20)
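The write-up stops at the training fit; as an optional final check, the held-out test set from the initial split could be scored with last_fit():
# optional: refit the finalized workflow on the full training set and
# evaluate once on the held-out test data
rf_test_results <- last_fit(final_rf_wf, split)
collect_metrics(rf_test_results)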
