An Introduction to Statistical Learning (2nd ed.)

Labs

Chapter 08 Tree-based Models

library(tidymodels)
library(ISLR)
library(rpart.plot)
library(vip)

theme_set(theme_bw())

carseat <- tibble(Carseats)

glimpse(carseat)

Rows: 400
Columns: 11
$ Sales       <dbl> 9.50, 11.22, 10.06, 7.40, 4.15, 10.81, 6.63, 11.85, 6.54, ~
$ CompPrice   <dbl> 138, 111, 113, 117, 141, 124, 115, 136, 132, 132, 121, 117~
$ Income      <dbl> 73, 48, 35, 100, 64, 113, 105, 81, 110, 113, 78, 94, 35, 2~
$ Advertising <dbl> 11, 16, 10, 4, 3, 13, 0, 15, 0, 0, 9, 4, 2, 11, 11, 5, 0, ~
$ Population  <dbl> 276, 260, 269, 466, 340, 501, 45, 425, 108, 131, 150, 503,~
$ Price       <dbl> 120, 83, 80, 97, 128, 72, 108, 120, 124, 124, 100, 94, 136~
$ ShelveLoc   <fct> Bad, Good, Medium, Medium, Bad, Bad, Medium, Good, Medium,~
$ Age         <dbl> 42, 65, 59, 55, 38, 78, 71, 67, 76, 76, 26, 50, 62, 53, 52~
$ Education   <dbl> 17, 10, 12, 14, 13, 16, 15, 10, 10, 17, 10, 13, 18, 18, 18~
$ Urban       <fct> Yes, Yes, Yes, Yes, Yes, No, Yes, Yes, No, No, No, Yes, Ye~
$ US          <fct> Yes, Yes, Yes, Yes, No, Yes, No, Yes, No, Yes, Yes, Yes, N~

knitr::kable(head(carseat))

Sales	CompPrice	Income	Advertising	Population	Price	ShelveLoc	Age	Education	Urban	US
9.50	138	73	11	276	120	Bad	42	17	Yes	Yes
11.22	111	48	16	260	83	Good	65	10	Yes	Yes
10.06	113	35	10	269	80	Medium	59	12	Yes	Yes
7.40	117	100	4	466	97	Medium	55	14	Yes	Yes
4.15	141	64	3	340	128	Bad	38	13	Yes	No
10.81	124	113	13	501	72	Bad	78	16	No	Yes

carseat <- carseat %>% 
  mutate(High = factor(if_else(Sales <=8, "No", "Yes"))) %>% 
  select(-Sales)

set.seed(2021)
car_split <- initial_split(carseat, prop = .75, strata = High)
car_train <- training(car_split)
car_test <- testing(car_split)

car_folds <- vfold_cv(car_train, v = 10)

Classification Tree

tree_clas_model <-
  decision_tree(cost_complexity = tune()) %>%
  set_engine('rpart') %>%
  set_mode('classification')

class_tree_wf <- 
  workflow() %>% 
  add_model(tree_clas_model) %>% 
  add_formula(High ~ .)

param_grid <- grid_regular(cost_complexity(range = c(-3, -1)), levels = 10)

tree_class_tune <- 
  tune_grid(
    class_tree_wf,
    resamples = car_folds,
    grid = param_grid,
    metrics = metric_set(accuracy, roc_auc, mn_log_loss),
    control = control_grid(save_pred = T)
  )

autoplot(tree_class_tune)

tree_class_tune %>% collect_metrics()

# A tibble: 30 x 7
   cost_complexity .metric     .estimator  mean     n std_err .config           
             <dbl> <chr>       <chr>      <dbl> <int>   <dbl> <chr>             
 1         0.001   accuracy    binary     0.73     10  0.0246 Preprocessor1_Mod~
 2         0.001   mn_log_loss binary     1.16     10  0.312  Preprocessor1_Mod~
 3         0.001   roc_auc     binary     0.790    10  0.0372 Preprocessor1_Mod~
 4         0.00167 accuracy    binary     0.73     10  0.0246 Preprocessor1_Mod~
 5         0.00167 mn_log_loss binary     1.16     10  0.312  Preprocessor1_Mod~
 6         0.00167 roc_auc     binary     0.790    10  0.0372 Preprocessor1_Mod~
 7         0.00278 accuracy    binary     0.73     10  0.0246 Preprocessor1_Mod~
 8         0.00278 mn_log_loss binary     1.16     10  0.312  Preprocessor1_Mod~
 9         0.00278 roc_auc     binary     0.790    10  0.0372 Preprocessor1_Mod~
10         0.00464 accuracy    binary     0.727    10  0.0237 Preprocessor1_Mod~
# ... with 20 more rows

tree_class_tune %>% collect_predictions()

# A tibble: 3,000 x 8
   id     .pred_class  .row cost_complexity .pred_No .pred_Yes High  .config    
   <chr>  <fct>       <int>           <dbl>    <dbl>     <dbl> <fct> <chr>      
 1 Fold01 No             18           0.001    0.814    0.186  No    Preprocess~
 2 Fold01 No             26           0.001    0.814    0.186  No    Preprocess~
 3 Fold01 No             30           0.001    0.814    0.186  No    Preprocess~
 4 Fold01 No             31           0.001    0.814    0.186  No    Preprocess~
 5 Fold01 Yes            63           0.001    0.3      0.7    No    Preprocess~
 6 Fold01 No             78           0.001    0.905    0.0952 No    Preprocess~
 7 Fold01 No             83           0.001    0.905    0.0952 No    Preprocess~
 8 Fold01 No            107           0.001    0.814    0.186  No    Preprocess~
 9 Fold01 No            109           0.001    0.905    0.0952 No    Preprocess~
10 Fold01 Yes           111           0.001    0.176    0.824  No    Preprocess~
# ... with 2,990 more rows

tree_class_tune %>% show_best(metric = "accuracy")

# A tibble: 5 x 7
  cost_complexity .metric  .estimator  mean     n std_err .config              
            <dbl> <chr>    <chr>      <dbl> <int>   <dbl> <chr>                
1         0.0359  accuracy binary     0.743    10  0.0277 Preprocessor1_Model08
2         0.0129  accuracy binary     0.737    10  0.0213 Preprocessor1_Model06
3         0.0215  accuracy binary     0.733    10  0.0228 Preprocessor1_Model07
4         0.001   accuracy binary     0.73     10  0.0246 Preprocessor1_Model01
5         0.00167 accuracy binary     0.73     10  0.0246 Preprocessor1_Model02

best_class_tree <- tree_class_tune %>% select_best(metric = "accuracy")

class_tree_final_wf <-
  class_tree_wf %>% 
  finalize_workflow(best_class_tree)

class_tree_fit <- 
  fit(class_tree_final_wf, car_train)

class_tree_pred <- augment(class_tree_fit, car_test)
class_tree_pred

# A tibble: 100 x 14
   CompPrice Income Advertising Population Price ShelveLoc   Age Education Urban
       <dbl>  <dbl>       <dbl>      <dbl> <dbl> <fct>     <dbl>     <dbl> <fct>
 1       111     48          16        260    83 Good         65        10 Yes  
 2       113     35          10        269    80 Medium       59        12 Yes  
 3       115    105           0         45   108 Medium       71        15 Yes  
 4       132    113           0        131   124 Medium       76        17 No   
 5       122     35           2        393   136 Medium       62        18 Yes  
 6       149     95           5        400   144 Medium       76        18 No   
 7       128     46           6        497   138 Medium       42        13 Yes  
 8       121     31           0        292   109 Medium       79        10 Yes  
 9       107    115          11        496   131 Good         50        11 No   
10       103     74           0        359    97 Bad          55        11 Yes  
# ... with 90 more rows, and 5 more variables: US <fct>, High <fct>,
#   .pred_class <fct>, .pred_No <dbl>, .pred_Yes <dbl>

class_tree_pred %>% conf_mat(truth = High, estimate = .pred_class)

          Truth
Prediction No Yes
       No  42   8
       Yes 17  33

class_tree_pred %>% accuracy(truth = High, estimate = .pred_class)

# A tibble: 1 x 3
  .metric  .estimator .estimate
  <chr>    <chr>          <dbl>
1 accuracy binary          0.75

class_tree_pred %>% roc_auc(truth = High, estimate = .pred_No)

# A tibble: 1 x 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 roc_auc binary         0.733

class_tree_pred %>% roc_curve(truth = High, estimate = .pred_No) %>% autoplot()

class_tree_fit %>% 
  extract_fit_engine() %>% 
  rpart.plot()

Bagging and Random Forest

Here we apply bagging and random forests to the Boston data set. We will be using the randomForest package as the engine. A bagging model is the same as a random forest where mtry is equal to the number of predictors. We can specify the mtry to be .cols() which means that the number of columns in the predictor matrix is used. This is useful if you want to make the specification more general and useable to many different data sets. .cols() is one of many descriptors in the parsnip package. We also set importance = TRUE in set_engine() to tell the engine to save the information regarding variable importance. This is needed for this engine if we want to use the vip package later.

bagg_model <- 
  rand_forest(mtry = .cols()) %>% 
  set_engine("randomForest", importance = T) %>% 
  set_mode("classification")

bagg_fit <- fit(bagg_model, High ~ ., data = car_train)
bagg_fit

parsnip model object

Fit time:  140ms 

Call:
 randomForest(x = maybe_data_frame(x), y = y, mtry = min_cols(~.cols(),      x), importance = ~T) 
               Type of random forest: classification
                     Number of trees: 500
No. of variables tried at each split: 10

        OOB estimate of  error rate: 20.33%
Confusion matrix:
     No Yes class.error
No  150  27   0.1525424
Yes  34  89   0.2764228

bagg_pred <- augment(bagg_fit, car_test) %>% print()

# A tibble: 100 x 14
   CompPrice Income Advertising Population Price ShelveLoc   Age Education Urban
       <dbl>  <dbl>       <dbl>      <dbl> <dbl> <fct>     <dbl>     <dbl> <fct>
 1       111     48          16        260    83 Good         65        10 Yes  
 2       113     35          10        269    80 Medium       59        12 Yes  
 3       115    105           0         45   108 Medium       71        15 Yes  
 4       132    113           0        131   124 Medium       76        17 No   
 5       122     35           2        393   136 Medium       62        18 Yes  
 6       149     95           5        400   144 Medium       76        18 No   
 7       128     46           6        497   138 Medium       42        13 Yes  
 8       121     31           0        292   109 Medium       79        10 Yes  
 9       107    115          11        496   131 Good         50        11 No   
10       103     74           0        359    97 Bad          55        11 Yes  
# ... with 90 more rows, and 5 more variables: US <fct>, High <fct>,
#   .pred_class <fct>, .pred_No <dbl>, .pred_Yes <dbl>

bagg_pred %>% accuracy(truth = High, estimate = .pred_class)

# A tibble: 1 x 3
  .metric  .estimator .estimate
  <chr>    <chr>          <dbl>
1 accuracy binary          0.84

bagg_pred %>% roc_auc(truth = High, estimate = .pred_No)

# A tibble: 1 x 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 roc_auc binary         0.931

bagg_pred %>% roc_curve(truth = High, estimate = .pred_No) %>% autoplot()

vip(bagg_fit)

rf_model <- 
  rand_forest(mtry = ceiling(sqrt(ncol(carseat)))) %>% 
  set_engine("randomForest", importance = T) %>% 
  set_mode("classification")

rf_fit <- fit(rf_model, High ~ ., data = car_train)
rf_fit

parsnip model object

Fit time:  130ms 

Call:
 randomForest(x = maybe_data_frame(x), y = y, mtry = min_cols(~ceiling(sqrt(ncol(carseat))),      x), importance = ~T) 
               Type of random forest: classification
                     Number of trees: 500
No. of variables tried at each split: 4

        OOB estimate of  error rate: 22%
Confusion matrix:
     No Yes class.error
No  152  25   0.1412429
Yes  41  82   0.3333333

rf_pred <- augment(rf_fit, car_test) %>% print()

# A tibble: 100 x 14
   CompPrice Income Advertising Population Price ShelveLoc   Age Education Urban
       <dbl>  <dbl>       <dbl>      <dbl> <dbl> <fct>     <dbl>     <dbl> <fct>
 1       111     48          16        260    83 Good         65        10 Yes  
 2       113     35          10        269    80 Medium       59        12 Yes  
 3       115    105           0         45   108 Medium       71        15 Yes  
 4       132    113           0        131   124 Medium       76        17 No   
 5       122     35           2        393   136 Medium       62        18 Yes  
 6       149     95           5        400   144 Medium       76        18 No   
 7       128     46           6        497   138 Medium       42        13 Yes  
 8       121     31           0        292   109 Medium       79        10 Yes  
 9       107    115          11        496   131 Good         50        11 No   
10       103     74           0        359    97 Bad          55        11 Yes  
# ... with 90 more rows, and 5 more variables: US <fct>, High <fct>,
#   .pred_class <fct>, .pred_No <dbl>, .pred_Yes <dbl>

rf_pred %>% accuracy(truth = High, estimate = .pred_class)

# A tibble: 1 x 3
  .metric  .estimator .estimate
  <chr>    <chr>          <dbl>
1 accuracy binary          0.86

rf_pred %>% roc_auc(truth = High, estimate = .pred_No)

# A tibble: 1 x 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 roc_auc binary         0.939

rf_pred %>% roc_curve(truth = High, estimate = .pred_No) %>% autoplot()

vip(rf_fit)

Boosting

boost_model <- 
  boost_tree(trees = 5000, tree_depth = 4) %>% 
  set_engine("xgboost") %>% 
  set_mode("classification")

xgb_fit <- fit(boost_model, High ~ ., data = car_train)

[19:58:01] WARNING: amalgamation/../src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.

xgb_fit

parsnip model object

Fit time:  1.6s 
##### xgb.Booster
raw: 2.8 Mb 
call:
  xgboost::xgb.train(params = list(eta = 0.3, max_depth = 4, gamma = 0, 
    colsample_bytree = 1, colsample_bynode = 1, min_child_weight = 1, 
    subsample = 1, objective = "binary:logistic"), data = x$data, 
    nrounds = 5000, watchlist = x$watchlist, verbose = 0, nthread = 1)
params (as set within xgb.train):
  eta = "0.3", max_depth = "4", gamma = "0", colsample_bytree = "1", colsample_bynode = "1", min_child_weight = "1", subsample = "1", objective = "binary:logistic", nthread = "1", validate_parameters = "TRUE"
xgb.attributes:
  niter
callbacks:
  cb.evaluation.log()
# of features: 14 
niter: 5000
nfeatures : 14 
evaluation_log:
    iter training_logloss
       1         0.573747
       2         0.486670
---                      
    4999         0.008865
    5000         0.008865

xgb_pred <- augment(xgb_fit, car_test) %>% print()

# A tibble: 100 x 14
   CompPrice Income Advertising Population Price ShelveLoc   Age Education Urban
       <dbl>  <dbl>       <dbl>      <dbl> <dbl> <fct>     <dbl>     <dbl> <fct>
 1       111     48          16        260    83 Good         65        10 Yes  
 2       113     35          10        269    80 Medium       59        12 Yes  
 3       115    105           0         45   108 Medium       71        15 Yes  
 4       132    113           0        131   124 Medium       76        17 No   
 5       122     35           2        393   136 Medium       62        18 Yes  
 6       149     95           5        400   144 Medium       76        18 No   
 7       128     46           6        497   138 Medium       42        13 Yes  
 8       121     31           0        292   109 Medium       79        10 Yes  
 9       107    115          11        496   131 Good         50        11 No   
10       103     74           0        359    97 Bad          55        11 Yes  
# ... with 90 more rows, and 5 more variables: US <fct>, High <fct>,
#   .pred_class <fct>, .pred_No <dbl>, .pred_Yes <dbl>

xgb_pred %>% conf_mat(truth = High, estimate = .pred_class)

          Truth
Prediction No Yes
       No  54   9
       Yes  5  32

xgb_pred %>% accuracy(truth = High, estimate = .pred_class)

# A tibble: 1 x 3
  .metric  .estimator .estimate
  <chr>    <chr>          <dbl>
1 accuracy binary          0.86

xgb_pred %>% roc_auc(truth = High, estimate = .pred_No)

# A tibble: 1 x 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 roc_auc binary         0.927

xgb_pred %>% roc_curve(truth = High, estimate = .pred_No) %>% autoplot()

vip(xgb_fit)

tree_acc <- class_tree_pred %>% accuracy(truth = High, estimate = .pred_class) %>% 
  mutate(model = "Clasif. Tree")
tree_roc <- class_tree_pred %>% roc_auc(truth = High, estimate = .pred_No) %>% 
  mutate(model = "Clasif. Tree")

bagg_acc <- bagg_pred %>% accuracy(truth = High, estimate = .pred_class) %>% 
  mutate(model = "Bagging")
bagg_roc <- bagg_pred %>% roc_auc(truth = High, estimate = .pred_No) %>% 
  mutate(model = "Bagging")

rf_acc <- rf_pred %>% accuracy(truth = High, estimate = .pred_class) %>% 
  mutate(model = "RandomForest")
rf_roc <- rf_pred %>% roc_auc(truth = High, estimate = .pred_No) %>% 
  mutate(model = "RandomForest")

xgb_acc <- xgb_pred %>% accuracy(truth = High, estimate = .pred_class) %>% 
  mutate(model = "xgboost")
xgb_roc <- xgb_pred %>% roc_auc(truth = High, estimate = .pred_No)  %>% 
  mutate(model = "xgboost")

summary_models <- bind_rows(tree_acc, tree_roc,
          bagg_acc, bagg_roc,
          rf_acc, rf_roc,
          xgb_acc, xgb_roc) %>% 
  relocate(model) %>% 
  print()

# A tibble: 8 x 4
  model        .metric  .estimator .estimate
  <chr>        <chr>    <chr>          <dbl>
1 Clasif. Tree accuracy binary         0.75 
2 Clasif. Tree roc_auc  binary         0.733
3 Bagging      accuracy binary         0.84 
4 Bagging      roc_auc  binary         0.931
5 RandomForest accuracy binary         0.86 
6 RandomForest roc_auc  binary         0.939
7 xgboost      accuracy binary         0.86 
8 xgboost      roc_auc  binary         0.927

summary_models %>% 
  select(model, .metric, .estimate) %>% 
  filter(.metric == "accuracy") %>% 
  arrange(desc(.estimate))

# A tibble: 4 x 3
  model        .metric  .estimate
  <chr>        <chr>        <dbl>
1 RandomForest accuracy      0.86
2 xgboost      accuracy      0.86
3 Bagging      accuracy      0.84
4 Clasif. Tree accuracy      0.75

summary_models %>% 
  select(model, .metric, .estimate) %>% 
  filter(.metric == "roc_auc") %>% 
  arrange(desc(.estimate))

# A tibble: 4 x 3
  model        .metric .estimate
  <chr>        <chr>       <dbl>
1 RandomForest roc_auc     0.939
2 Bagging      roc_auc     0.931
3 xgboost      roc_auc     0.927
4 Clasif. Tree roc_auc     0.733

An Introduction to Statistcial Learning

ISLR tidymodels Labs

– END

sessionInfo()

R version 4.1.0 (2021-05-18)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 10 x64 (build 19043)

Matrix products: default

locale:
[1] LC_COLLATE=Spanish_Mexico.1252  LC_CTYPE=Spanish_Mexico.1252   
[3] LC_MONETARY=Spanish_Mexico.1252 LC_NUMERIC=C                   
[5] LC_TIME=Spanish_Mexico.1252    

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] vip_0.3.2          rpart.plot_3.1.0   rpart_4.1-15       ISLR_1.2          
 [5] yardstick_0.0.8    workflowsets_0.0.2 workflows_0.2.3    tune_0.1.6        
 [9] tidyr_1.1.3        tibble_3.1.3       rsample_0.1.0      recipes_0.1.16    
[13] purrr_0.3.4        parsnip_0.1.7      modeldata_0.1.1    infer_0.5.4       
[17] ggplot2_3.3.5      dplyr_1.0.7        dials_0.0.9        scales_1.1.1      
[21] broom_0.7.8        tidymodels_0.1.3  

loaded via a namespace (and not attached):
 [1] lubridate_1.7.10    DiceDesign_1.9      tools_4.1.0        
 [4] backports_1.2.1     bslib_0.2.5.1       utf8_1.2.2         
 [7] R6_2.5.0            DBI_1.1.1           colorspace_2.0-2   
[10] nnet_7.3-16         withr_2.4.2         tidyselect_1.1.1   
[13] gridExtra_2.3       prettyunits_1.1.1   compiler_4.1.0     
[16] cli_3.0.0           labeling_0.4.2      sass_0.4.0         
[19] randomForest_4.6-14 stringr_1.4.0       digest_0.6.27      
[22] rmarkdown_2.9       pkgconfig_2.0.3     htmltools_0.5.1.1  
[25] parallelly_1.26.1   lhs_1.1.1           highr_0.9          
[28] rlang_0.4.11        rstudioapi_0.13     jquerylib_0.1.4    
[31] generics_0.1.0      farver_2.1.0        jsonlite_1.7.2     
[34] magrittr_2.0.1      Matrix_1.3-3        Rcpp_1.0.7         
[37] munsell_0.5.0       fansi_0.5.0         GPfit_1.0-8        
[40] lifecycle_1.0.0     furrr_0.2.3         stringi_1.6.2      
[43] pROC_1.17.0.1       yaml_2.2.1          MASS_7.3-54        
[46] plyr_1.8.6          grid_4.1.0          parallel_4.1.0     
[49] listenv_0.8.0       crayon_1.4.1        lattice_0.20-44    
[52] splines_4.1.0       knitr_1.33          pillar_1.6.2       
[55] xgboost_1.4.1.1     codetools_0.2-18    glue_1.4.2         
[58] evaluate_0.14       data.table_1.14.0   vctrs_0.3.8        
[61] foreach_1.5.1       gtable_0.3.0        future_1.21.0      
[64] assertthat_0.2.1    xfun_0.24           gower_0.2.2        
[67] prodlim_2019.11.13  class_7.3-19        survival_3.2-11    
[70] timeDate_3043.102   iterators_1.0.13    hardhat_0.1.6      
[73] lava_1.6.9          globals_0.14.0      ellipsis_0.3.2     
[76] ipred_0.9-11

08. Tree-Based Models

GS

23/8/2021

An Introduction to Statistical Learning (2nd ed.)

Labs

Chapter 08

Tree-based Models

Classification Tree

Bagging and Random Forest

Boosting