An Introduction to Statistical Learning (2nd ed.)

Chapter 08

Tree-based Models

library(tidymodels)
library(ISLR)
library(rpart.plot)
library(vip)

theme_set(theme_bw())
carseat <- tibble(Carseats)

glimpse(carseat)
Rows: 400
Columns: 11
$ Sales       <dbl> 9.50, 11.22, 10.06, 7.40, 4.15, 10.81, 6.63, 11.85, 6.54, ~
$ CompPrice   <dbl> 138, 111, 113, 117, 141, 124, 115, 136, 132, 132, 121, 117~
$ Income      <dbl> 73, 48, 35, 100, 64, 113, 105, 81, 110, 113, 78, 94, 35, 2~
$ Advertising <dbl> 11, 16, 10, 4, 3, 13, 0, 15, 0, 0, 9, 4, 2, 11, 11, 5, 0, ~
$ Population  <dbl> 276, 260, 269, 466, 340, 501, 45, 425, 108, 131, 150, 503,~
$ Price       <dbl> 120, 83, 80, 97, 128, 72, 108, 120, 124, 124, 100, 94, 136~
$ ShelveLoc   <fct> Bad, Good, Medium, Medium, Bad, Bad, Medium, Good, Medium,~
$ Age         <dbl> 42, 65, 59, 55, 38, 78, 71, 67, 76, 76, 26, 50, 62, 53, 52~
$ Education   <dbl> 17, 10, 12, 14, 13, 16, 15, 10, 10, 17, 10, 13, 18, 18, 18~
$ Urban       <fct> Yes, Yes, Yes, Yes, Yes, No, Yes, Yes, No, No, No, Yes, Ye~
$ US          <fct> Yes, Yes, Yes, Yes, No, Yes, No, Yes, No, Yes, Yes, Yes, N~
knitr::kable(head(carseat))
Sales CompPrice Income Advertising Population Price ShelveLoc Age Education Urban US
9.50 138 73 11 276 120 Bad 42 17 Yes Yes
11.22 111 48 16 260 83 Good 65 10 Yes Yes
10.06 113 35 10 269 80 Medium 59 12 Yes Yes
7.40 117 100 4 466 97 Medium 55 14 Yes Yes
4.15 141 64 3 340 128 Bad 38 13 Yes No
10.81 124 113 13 501 72 Bad 78 16 No Yes
carseat <- carseat %>% 
  mutate(High = factor(if_else(Sales <=8, "No", "Yes"))) %>% 
  select(-Sales)

set.seed(2021)
car_split <- initial_split(carseat, prop = .75, strata = High)
car_train <- training(car_split)
car_test <- testing(car_split)
car_folds <- vfold_cv(car_train, v = 10)

Classification Tree

tree_clas_model <-
  decision_tree(cost_complexity = tune()) %>%
  set_engine('rpart') %>%
  set_mode('classification')

class_tree_wf <- 
  workflow() %>% 
  add_model(tree_clas_model) %>% 
  add_formula(High ~ .)
param_grid <- grid_regular(cost_complexity(range = c(-3, -1)), levels = 10)

tree_class_tune <- 
  tune_grid(
    class_tree_wf,
    resamples = car_folds,
    grid = param_grid,
    metrics = metric_set(accuracy, roc_auc, mn_log_loss),
    control = control_grid(save_pred = T)
  )
autoplot(tree_class_tune)

tree_class_tune %>% collect_metrics()
# A tibble: 30 x 7
   cost_complexity .metric     .estimator  mean     n std_err .config           
             <dbl> <chr>       <chr>      <dbl> <int>   <dbl> <chr>             
 1         0.001   accuracy    binary     0.73     10  0.0246 Preprocessor1_Mod~
 2         0.001   mn_log_loss binary     1.16     10  0.312  Preprocessor1_Mod~
 3         0.001   roc_auc     binary     0.790    10  0.0372 Preprocessor1_Mod~
 4         0.00167 accuracy    binary     0.73     10  0.0246 Preprocessor1_Mod~
 5         0.00167 mn_log_loss binary     1.16     10  0.312  Preprocessor1_Mod~
 6         0.00167 roc_auc     binary     0.790    10  0.0372 Preprocessor1_Mod~
 7         0.00278 accuracy    binary     0.73     10  0.0246 Preprocessor1_Mod~
 8         0.00278 mn_log_loss binary     1.16     10  0.312  Preprocessor1_Mod~
 9         0.00278 roc_auc     binary     0.790    10  0.0372 Preprocessor1_Mod~
10         0.00464 accuracy    binary     0.727    10  0.0237 Preprocessor1_Mod~
# ... with 20 more rows
tree_class_tune %>% collect_predictions()
# A tibble: 3,000 x 8
   id     .pred_class  .row cost_complexity .pred_No .pred_Yes High  .config    
   <chr>  <fct>       <int>           <dbl>    <dbl>     <dbl> <fct> <chr>      
 1 Fold01 No             18           0.001    0.814    0.186  No    Preprocess~
 2 Fold01 No             26           0.001    0.814    0.186  No    Preprocess~
 3 Fold01 No             30           0.001    0.814    0.186  No    Preprocess~
 4 Fold01 No             31           0.001    0.814    0.186  No    Preprocess~
 5 Fold01 Yes            63           0.001    0.3      0.7    No    Preprocess~
 6 Fold01 No             78           0.001    0.905    0.0952 No    Preprocess~
 7 Fold01 No             83           0.001    0.905    0.0952 No    Preprocess~
 8 Fold01 No            107           0.001    0.814    0.186  No    Preprocess~
 9 Fold01 No            109           0.001    0.905    0.0952 No    Preprocess~
10 Fold01 Yes           111           0.001    0.176    0.824  No    Preprocess~
# ... with 2,990 more rows
tree_class_tune %>% show_best(metric = "accuracy")
# A tibble: 5 x 7
  cost_complexity .metric  .estimator  mean     n std_err .config              
            <dbl> <chr>    <chr>      <dbl> <int>   <dbl> <chr>                
1         0.0359  accuracy binary     0.743    10  0.0277 Preprocessor1_Model08
2         0.0129  accuracy binary     0.737    10  0.0213 Preprocessor1_Model06
3         0.0215  accuracy binary     0.733    10  0.0228 Preprocessor1_Model07
4         0.001   accuracy binary     0.73     10  0.0246 Preprocessor1_Model01
5         0.00167 accuracy binary     0.73     10  0.0246 Preprocessor1_Model02
best_class_tree <- tree_class_tune %>% select_best(metric = "accuracy")
class_tree_final_wf <-
  class_tree_wf %>% 
  finalize_workflow(best_class_tree)
class_tree_fit <- 
  fit(class_tree_final_wf, car_train)

class_tree_pred <- augment(class_tree_fit, car_test)
class_tree_pred
# A tibble: 100 x 14
   CompPrice Income Advertising Population Price ShelveLoc   Age Education Urban
       <dbl>  <dbl>       <dbl>      <dbl> <dbl> <fct>     <dbl>     <dbl> <fct>
 1       111     48          16        260    83 Good         65        10 Yes  
 2       113     35          10        269    80 Medium       59        12 Yes  
 3       115    105           0         45   108 Medium       71        15 Yes  
 4       132    113           0        131   124 Medium       76        17 No   
 5       122     35           2        393   136 Medium       62        18 Yes  
 6       149     95           5        400   144 Medium       76        18 No   
 7       128     46           6        497   138 Medium       42        13 Yes  
 8       121     31           0        292   109 Medium       79        10 Yes  
 9       107    115          11        496   131 Good         50        11 No   
10       103     74           0        359    97 Bad          55        11 Yes  
# ... with 90 more rows, and 5 more variables: US <fct>, High <fct>,
#   .pred_class <fct>, .pred_No <dbl>, .pred_Yes <dbl>
class_tree_pred %>% conf_mat(truth = High, estimate = .pred_class)
          Truth
Prediction No Yes
       No  42   8
       Yes 17  33
class_tree_pred %>% accuracy(truth = High, estimate = .pred_class)
# A tibble: 1 x 3
  .metric  .estimator .estimate
  <chr>    <chr>          <dbl>
1 accuracy binary          0.75
class_tree_pred %>% roc_auc(truth = High, estimate = .pred_No)
# A tibble: 1 x 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 roc_auc binary         0.733
class_tree_pred %>% roc_curve(truth = High, estimate = .pred_No) %>% autoplot()

class_tree_fit %>% 
  extract_fit_engine() %>% 
  rpart.plot()

Bagging and Random Forest

Here we apply bagging and random forests to the Boston data set. We will be using the randomForest package as the engine. A bagging model is the same as a random forest where mtry is equal to the number of predictors. We can specify the mtry to be .cols() which means that the number of columns in the predictor matrix is used. This is useful if you want to make the specification more general and useable to many different data sets. .cols() is one of many descriptors in the parsnip package. We also set importance = TRUE in set_engine() to tell the engine to save the information regarding variable importance. This is needed for this engine if we want to use the vip package later.

bagg_model <- 
  rand_forest(mtry = .cols()) %>% 
  set_engine("randomForest", importance = T) %>% 
  set_mode("classification")

bagg_fit <- fit(bagg_model, High ~ ., data = car_train)
bagg_fit
parsnip model object

Fit time:  140ms 

Call:
 randomForest(x = maybe_data_frame(x), y = y, mtry = min_cols(~.cols(),      x), importance = ~T) 
               Type of random forest: classification
                     Number of trees: 500
No. of variables tried at each split: 10

        OOB estimate of  error rate: 20.33%
Confusion matrix:
     No Yes class.error
No  150  27   0.1525424
Yes  34  89   0.2764228
bagg_pred <- augment(bagg_fit, car_test) %>% print()
# A tibble: 100 x 14
   CompPrice Income Advertising Population Price ShelveLoc   Age Education Urban
       <dbl>  <dbl>       <dbl>      <dbl> <dbl> <fct>     <dbl>     <dbl> <fct>
 1       111     48          16        260    83 Good         65        10 Yes  
 2       113     35          10        269    80 Medium       59        12 Yes  
 3       115    105           0         45   108 Medium       71        15 Yes  
 4       132    113           0        131   124 Medium       76        17 No   
 5       122     35           2        393   136 Medium       62        18 Yes  
 6       149     95           5        400   144 Medium       76        18 No   
 7       128     46           6        497   138 Medium       42        13 Yes  
 8       121     31           0        292   109 Medium       79        10 Yes  
 9       107    115          11        496   131 Good         50        11 No   
10       103     74           0        359    97 Bad          55        11 Yes  
# ... with 90 more rows, and 5 more variables: US <fct>, High <fct>,
#   .pred_class <fct>, .pred_No <dbl>, .pred_Yes <dbl>
bagg_pred %>% accuracy(truth = High, estimate = .pred_class)
# A tibble: 1 x 3
  .metric  .estimator .estimate
  <chr>    <chr>          <dbl>
1 accuracy binary          0.84
bagg_pred %>% roc_auc(truth = High, estimate = .pred_No) 
# A tibble: 1 x 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 roc_auc binary         0.931
bagg_pred %>% roc_curve(truth = High, estimate = .pred_No) %>% autoplot()

vip(bagg_fit)

rf_model <- 
  rand_forest(mtry = ceiling(sqrt(ncol(carseat)))) %>% 
  set_engine("randomForest", importance = T) %>% 
  set_mode("classification")

rf_fit <- fit(rf_model, High ~ ., data = car_train)
rf_fit
parsnip model object

Fit time:  130ms 

Call:
 randomForest(x = maybe_data_frame(x), y = y, mtry = min_cols(~ceiling(sqrt(ncol(carseat))),      x), importance = ~T) 
               Type of random forest: classification
                     Number of trees: 500
No. of variables tried at each split: 4

        OOB estimate of  error rate: 22%
Confusion matrix:
     No Yes class.error
No  152  25   0.1412429
Yes  41  82   0.3333333
rf_pred <- augment(rf_fit, car_test) %>% print()
# A tibble: 100 x 14
   CompPrice Income Advertising Population Price ShelveLoc   Age Education Urban
       <dbl>  <dbl>       <dbl>      <dbl> <dbl> <fct>     <dbl>     <dbl> <fct>
 1       111     48          16        260    83 Good         65        10 Yes  
 2       113     35          10        269    80 Medium       59        12 Yes  
 3       115    105           0         45   108 Medium       71        15 Yes  
 4       132    113           0        131   124 Medium       76        17 No   
 5       122     35           2        393   136 Medium       62        18 Yes  
 6       149     95           5        400   144 Medium       76        18 No   
 7       128     46           6        497   138 Medium       42        13 Yes  
 8       121     31           0        292   109 Medium       79        10 Yes  
 9       107    115          11        496   131 Good         50        11 No   
10       103     74           0        359    97 Bad          55        11 Yes  
# ... with 90 more rows, and 5 more variables: US <fct>, High <fct>,
#   .pred_class <fct>, .pred_No <dbl>, .pred_Yes <dbl>
rf_pred %>% accuracy(truth = High, estimate = .pred_class)
# A tibble: 1 x 3
  .metric  .estimator .estimate
  <chr>    <chr>          <dbl>
1 accuracy binary          0.86
rf_pred %>% roc_auc(truth = High, estimate = .pred_No) 
# A tibble: 1 x 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 roc_auc binary         0.939
rf_pred %>% roc_curve(truth = High, estimate = .pred_No) %>% autoplot()

vip(rf_fit)

Boosting

boost_model <- 
  boost_tree(trees = 5000, tree_depth = 4) %>% 
  set_engine("xgboost") %>% 
  set_mode("classification")

xgb_fit <- fit(boost_model, High ~ ., data = car_train)
[19:58:01] WARNING: amalgamation/../src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
xgb_fit
parsnip model object

Fit time:  1.6s 
##### xgb.Booster
raw: 2.8 Mb 
call:
  xgboost::xgb.train(params = list(eta = 0.3, max_depth = 4, gamma = 0, 
    colsample_bytree = 1, colsample_bynode = 1, min_child_weight = 1, 
    subsample = 1, objective = "binary:logistic"), data = x$data, 
    nrounds = 5000, watchlist = x$watchlist, verbose = 0, nthread = 1)
params (as set within xgb.train):
  eta = "0.3", max_depth = "4", gamma = "0", colsample_bytree = "1", colsample_bynode = "1", min_child_weight = "1", subsample = "1", objective = "binary:logistic", nthread = "1", validate_parameters = "TRUE"
xgb.attributes:
  niter
callbacks:
  cb.evaluation.log()
# of features: 14 
niter: 5000
nfeatures : 14 
evaluation_log:
    iter training_logloss
       1         0.573747
       2         0.486670
---                      
    4999         0.008865
    5000         0.008865
xgb_pred <- augment(xgb_fit, car_test) %>% print()
# A tibble: 100 x 14
   CompPrice Income Advertising Population Price ShelveLoc   Age Education Urban
       <dbl>  <dbl>       <dbl>      <dbl> <dbl> <fct>     <dbl>     <dbl> <fct>
 1       111     48          16        260    83 Good         65        10 Yes  
 2       113     35          10        269    80 Medium       59        12 Yes  
 3       115    105           0         45   108 Medium       71        15 Yes  
 4       132    113           0        131   124 Medium       76        17 No   
 5       122     35           2        393   136 Medium       62        18 Yes  
 6       149     95           5        400   144 Medium       76        18 No   
 7       128     46           6        497   138 Medium       42        13 Yes  
 8       121     31           0        292   109 Medium       79        10 Yes  
 9       107    115          11        496   131 Good         50        11 No   
10       103     74           0        359    97 Bad          55        11 Yes  
# ... with 90 more rows, and 5 more variables: US <fct>, High <fct>,
#   .pred_class <fct>, .pred_No <dbl>, .pred_Yes <dbl>
xgb_pred %>% conf_mat(truth = High, estimate = .pred_class)
          Truth
Prediction No Yes
       No  54   9
       Yes  5  32
xgb_pred %>% accuracy(truth = High, estimate = .pred_class)
# A tibble: 1 x 3
  .metric  .estimator .estimate
  <chr>    <chr>          <dbl>
1 accuracy binary          0.86
xgb_pred %>% roc_auc(truth = High, estimate = .pred_No) 
# A tibble: 1 x 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 roc_auc binary         0.927
xgb_pred %>% roc_curve(truth = High, estimate = .pred_No) %>% autoplot()

vip(xgb_fit)

tree_acc <- class_tree_pred %>% accuracy(truth = High, estimate = .pred_class) %>% 
  mutate(model = "Clasif. Tree")
tree_roc <- class_tree_pred %>% roc_auc(truth = High, estimate = .pred_No) %>% 
  mutate(model = "Clasif. Tree")

bagg_acc <- bagg_pred %>% accuracy(truth = High, estimate = .pred_class) %>% 
  mutate(model = "Bagging")
bagg_roc <- bagg_pred %>% roc_auc(truth = High, estimate = .pred_No) %>% 
  mutate(model = "Bagging")

rf_acc <- rf_pred %>% accuracy(truth = High, estimate = .pred_class) %>% 
  mutate(model = "RandomForest")
rf_roc <- rf_pred %>% roc_auc(truth = High, estimate = .pred_No) %>% 
  mutate(model = "RandomForest")

xgb_acc <- xgb_pred %>% accuracy(truth = High, estimate = .pred_class) %>% 
  mutate(model = "xgboost")
xgb_roc <- xgb_pred %>% roc_auc(truth = High, estimate = .pred_No)  %>% 
  mutate(model = "xgboost")

summary_models <- bind_rows(tree_acc, tree_roc,
          bagg_acc, bagg_roc,
          rf_acc, rf_roc,
          xgb_acc, xgb_roc) %>% 
  relocate(model) %>% 
  print()
# A tibble: 8 x 4
  model        .metric  .estimator .estimate
  <chr>        <chr>    <chr>          <dbl>
1 Clasif. Tree accuracy binary         0.75 
2 Clasif. Tree roc_auc  binary         0.733
3 Bagging      accuracy binary         0.84 
4 Bagging      roc_auc  binary         0.931
5 RandomForest accuracy binary         0.86 
6 RandomForest roc_auc  binary         0.939
7 xgboost      accuracy binary         0.86 
8 xgboost      roc_auc  binary         0.927
summary_models %>% 
  select(model, .metric, .estimate) %>% 
  filter(.metric == "accuracy") %>% 
  arrange(desc(.estimate))
# A tibble: 4 x 3
  model        .metric  .estimate
  <chr>        <chr>        <dbl>
1 RandomForest accuracy      0.86
2 xgboost      accuracy      0.86
3 Bagging      accuracy      0.84
4 Clasif. Tree accuracy      0.75
summary_models %>% 
  select(model, .metric, .estimate) %>% 
  filter(.metric == "roc_auc") %>% 
  arrange(desc(.estimate))
# A tibble: 4 x 3
  model        .metric .estimate
  <chr>        <chr>       <dbl>
1 RandomForest roc_auc     0.939
2 Bagging      roc_auc     0.931
3 xgboost      roc_auc     0.927
4 Clasif. Tree roc_auc     0.733

An Introduction to Statistcial Learning

ISLR tidymodels Labs

– END

sessionInfo()
R version 4.1.0 (2021-05-18)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 10 x64 (build 19043)

Matrix products: default

locale:
[1] LC_COLLATE=Spanish_Mexico.1252  LC_CTYPE=Spanish_Mexico.1252   
[3] LC_MONETARY=Spanish_Mexico.1252 LC_NUMERIC=C                   
[5] LC_TIME=Spanish_Mexico.1252    

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] vip_0.3.2          rpart.plot_3.1.0   rpart_4.1-15       ISLR_1.2          
 [5] yardstick_0.0.8    workflowsets_0.0.2 workflows_0.2.3    tune_0.1.6        
 [9] tidyr_1.1.3        tibble_3.1.3       rsample_0.1.0      recipes_0.1.16    
[13] purrr_0.3.4        parsnip_0.1.7      modeldata_0.1.1    infer_0.5.4       
[17] ggplot2_3.3.5      dplyr_1.0.7        dials_0.0.9        scales_1.1.1      
[21] broom_0.7.8        tidymodels_0.1.3  

loaded via a namespace (and not attached):
 [1] lubridate_1.7.10    DiceDesign_1.9      tools_4.1.0        
 [4] backports_1.2.1     bslib_0.2.5.1       utf8_1.2.2         
 [7] R6_2.5.0            DBI_1.1.1           colorspace_2.0-2   
[10] nnet_7.3-16         withr_2.4.2         tidyselect_1.1.1   
[13] gridExtra_2.3       prettyunits_1.1.1   compiler_4.1.0     
[16] cli_3.0.0           labeling_0.4.2      sass_0.4.0         
[19] randomForest_4.6-14 stringr_1.4.0       digest_0.6.27      
[22] rmarkdown_2.9       pkgconfig_2.0.3     htmltools_0.5.1.1  
[25] parallelly_1.26.1   lhs_1.1.1           highr_0.9          
[28] rlang_0.4.11        rstudioapi_0.13     jquerylib_0.1.4    
[31] generics_0.1.0      farver_2.1.0        jsonlite_1.7.2     
[34] magrittr_2.0.1      Matrix_1.3-3        Rcpp_1.0.7         
[37] munsell_0.5.0       fansi_0.5.0         GPfit_1.0-8        
[40] lifecycle_1.0.0     furrr_0.2.3         stringi_1.6.2      
[43] pROC_1.17.0.1       yaml_2.2.1          MASS_7.3-54        
[46] plyr_1.8.6          grid_4.1.0          parallel_4.1.0     
[49] listenv_0.8.0       crayon_1.4.1        lattice_0.20-44    
[52] splines_4.1.0       knitr_1.33          pillar_1.6.2       
[55] xgboost_1.4.1.1     codetools_0.2-18    glue_1.4.2         
[58] evaluate_0.14       data.table_1.14.0   vctrs_0.3.8        
[61] foreach_1.5.1       gtable_0.3.0        future_1.21.0      
[64] assertthat_0.2.1    xfun_0.24           gower_0.2.2        
[67] prodlim_2019.11.13  class_7.3-19        survival_3.2-11    
[70] timeDate_3043.102   iterators_1.0.13    hardhat_0.1.6      
[73] lava_1.6.9          globals_0.14.0      ellipsis_0.3.2     
[76] ipred_0.9-11