library(tidymodels)
library(ISLR)
library(rpart.plot)
library(vip)
theme_set(theme_bw())
carseat <- tibble(Carseats)
glimpse(carseat)
Rows: 400
Columns: 11
$ Sales <dbl> 9.50, 11.22, 10.06, 7.40, 4.15, 10.81, 6.63, 11.85, 6.54, ~
$ CompPrice <dbl> 138, 111, 113, 117, 141, 124, 115, 136, 132, 132, 121, 117~
$ Income <dbl> 73, 48, 35, 100, 64, 113, 105, 81, 110, 113, 78, 94, 35, 2~
$ Advertising <dbl> 11, 16, 10, 4, 3, 13, 0, 15, 0, 0, 9, 4, 2, 11, 11, 5, 0, ~
$ Population <dbl> 276, 260, 269, 466, 340, 501, 45, 425, 108, 131, 150, 503,~
$ Price <dbl> 120, 83, 80, 97, 128, 72, 108, 120, 124, 124, 100, 94, 136~
$ ShelveLoc <fct> Bad, Good, Medium, Medium, Bad, Bad, Medium, Good, Medium,~
$ Age <dbl> 42, 65, 59, 55, 38, 78, 71, 67, 76, 76, 26, 50, 62, 53, 52~
$ Education <dbl> 17, 10, 12, 14, 13, 16, 15, 10, 10, 17, 10, 13, 18, 18, 18~
$ Urban <fct> Yes, Yes, Yes, Yes, Yes, No, Yes, Yes, No, No, No, Yes, Ye~
$ US <fct> Yes, Yes, Yes, Yes, No, Yes, No, Yes, No, Yes, Yes, Yes, N~
knitr::kable(head(carseat))
| Sales | CompPrice | Income | Advertising | Population | Price | ShelveLoc | Age | Education | Urban | US |
|---|---|---|---|---|---|---|---|---|---|---|
| 9.50 | 138 | 73 | 11 | 276 | 120 | Bad | 42 | 17 | Yes | Yes |
| 11.22 | 111 | 48 | 16 | 260 | 83 | Good | 65 | 10 | Yes | Yes |
| 10.06 | 113 | 35 | 10 | 269 | 80 | Medium | 59 | 12 | Yes | Yes |
| 7.40 | 117 | 100 | 4 | 466 | 97 | Medium | 55 | 14 | Yes | Yes |
| 4.15 | 141 | 64 | 3 | 340 | 128 | Bad | 38 | 13 | Yes | No |
| 10.81 | 124 | 113 | 13 | 501 | 72 | Bad | 78 | 16 | No | Yes |
carseat <- carseat %>%
mutate(High = factor(if_else(Sales <=8, "No", "Yes"))) %>%
select(-Sales)
set.seed(2021)
car_split <- initial_split(carseat, prop = .75, strata = High)
car_train <- training(car_split)
car_test <- testing(car_split)
car_folds <- vfold_cv(car_train, v = 10)
tree_clas_model <-
decision_tree(cost_complexity = tune()) %>%
set_engine('rpart') %>%
set_mode('classification')
class_tree_wf <-
workflow() %>%
add_model(tree_clas_model) %>%
add_formula(High ~ .)
param_grid <- grid_regular(cost_complexity(range = c(-3, -1)), levels = 10)
tree_class_tune <-
tune_grid(
class_tree_wf,
resamples = car_folds,
grid = param_grid,
metrics = metric_set(accuracy, roc_auc, mn_log_loss),
control = control_grid(save_pred = T)
)
autoplot(tree_class_tune)
tree_class_tune %>% collect_metrics()
# A tibble: 30 x 7
cost_complexity .metric .estimator mean n std_err .config
<dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
1 0.001 accuracy binary 0.73 10 0.0246 Preprocessor1_Mod~
2 0.001 mn_log_loss binary 1.16 10 0.312 Preprocessor1_Mod~
3 0.001 roc_auc binary 0.790 10 0.0372 Preprocessor1_Mod~
4 0.00167 accuracy binary 0.73 10 0.0246 Preprocessor1_Mod~
5 0.00167 mn_log_loss binary 1.16 10 0.312 Preprocessor1_Mod~
6 0.00167 roc_auc binary 0.790 10 0.0372 Preprocessor1_Mod~
7 0.00278 accuracy binary 0.73 10 0.0246 Preprocessor1_Mod~
8 0.00278 mn_log_loss binary 1.16 10 0.312 Preprocessor1_Mod~
9 0.00278 roc_auc binary 0.790 10 0.0372 Preprocessor1_Mod~
10 0.00464 accuracy binary 0.727 10 0.0237 Preprocessor1_Mod~
# ... with 20 more rows
tree_class_tune %>% collect_predictions()
# A tibble: 3,000 x 8
id .pred_class .row cost_complexity .pred_No .pred_Yes High .config
<chr> <fct> <int> <dbl> <dbl> <dbl> <fct> <chr>
1 Fold01 No 18 0.001 0.814 0.186 No Preprocess~
2 Fold01 No 26 0.001 0.814 0.186 No Preprocess~
3 Fold01 No 30 0.001 0.814 0.186 No Preprocess~
4 Fold01 No 31 0.001 0.814 0.186 No Preprocess~
5 Fold01 Yes 63 0.001 0.3 0.7 No Preprocess~
6 Fold01 No 78 0.001 0.905 0.0952 No Preprocess~
7 Fold01 No 83 0.001 0.905 0.0952 No Preprocess~
8 Fold01 No 107 0.001 0.814 0.186 No Preprocess~
9 Fold01 No 109 0.001 0.905 0.0952 No Preprocess~
10 Fold01 Yes 111 0.001 0.176 0.824 No Preprocess~
# ... with 2,990 more rows
tree_class_tune %>% show_best(metric = "accuracy")
# A tibble: 5 x 7
cost_complexity .metric .estimator mean n std_err .config
<dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
1 0.0359 accuracy binary 0.743 10 0.0277 Preprocessor1_Model08
2 0.0129 accuracy binary 0.737 10 0.0213 Preprocessor1_Model06
3 0.0215 accuracy binary 0.733 10 0.0228 Preprocessor1_Model07
4 0.001 accuracy binary 0.73 10 0.0246 Preprocessor1_Model01
5 0.00167 accuracy binary 0.73 10 0.0246 Preprocessor1_Model02
best_class_tree <- tree_class_tune %>% select_best(metric = "accuracy")
class_tree_final_wf <-
class_tree_wf %>%
finalize_workflow(best_class_tree)
class_tree_fit <-
fit(class_tree_final_wf, car_train)
class_tree_pred <- augment(class_tree_fit, car_test)
class_tree_pred
# A tibble: 100 x 14
CompPrice Income Advertising Population Price ShelveLoc Age Education Urban
<dbl> <dbl> <dbl> <dbl> <dbl> <fct> <dbl> <dbl> <fct>
1 111 48 16 260 83 Good 65 10 Yes
2 113 35 10 269 80 Medium 59 12 Yes
3 115 105 0 45 108 Medium 71 15 Yes
4 132 113 0 131 124 Medium 76 17 No
5 122 35 2 393 136 Medium 62 18 Yes
6 149 95 5 400 144 Medium 76 18 No
7 128 46 6 497 138 Medium 42 13 Yes
8 121 31 0 292 109 Medium 79 10 Yes
9 107 115 11 496 131 Good 50 11 No
10 103 74 0 359 97 Bad 55 11 Yes
# ... with 90 more rows, and 5 more variables: US <fct>, High <fct>,
# .pred_class <fct>, .pred_No <dbl>, .pred_Yes <dbl>
class_tree_pred %>% conf_mat(truth = High, estimate = .pred_class)
Truth
Prediction No Yes
No 42 8
Yes 17 33
class_tree_pred %>% accuracy(truth = High, estimate = .pred_class)
# A tibble: 1 x 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 accuracy binary 0.75
class_tree_pred %>% roc_auc(truth = High, estimate = .pred_No)
# A tibble: 1 x 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 roc_auc binary 0.733
class_tree_pred %>% roc_curve(truth = High, estimate = .pred_No) %>% autoplot()
class_tree_fit %>%
extract_fit_engine() %>%
rpart.plot()
Here we apply bagging and random forests to the Boston data set. We will be using the randomForest package as the engine. A bagging model is the same as a random forest where mtry is equal to the number of predictors. We can specify the mtry to be .cols() which means that the number of columns in the predictor matrix is used. This is useful if you want to make the specification more general and useable to many different data sets. .cols() is one of many descriptors in the parsnip package. We also set importance = TRUE in set_engine() to tell the engine to save the information regarding variable importance. This is needed for this engine if we want to use the vip package later.
bagg_model <-
rand_forest(mtry = .cols()) %>%
set_engine("randomForest", importance = T) %>%
set_mode("classification")
bagg_fit <- fit(bagg_model, High ~ ., data = car_train)
bagg_fit
parsnip model object
Fit time: 140ms
Call:
randomForest(x = maybe_data_frame(x), y = y, mtry = min_cols(~.cols(), x), importance = ~T)
Type of random forest: classification
Number of trees: 500
No. of variables tried at each split: 10
OOB estimate of error rate: 20.33%
Confusion matrix:
No Yes class.error
No 150 27 0.1525424
Yes 34 89 0.2764228
bagg_pred <- augment(bagg_fit, car_test) %>% print()
# A tibble: 100 x 14
CompPrice Income Advertising Population Price ShelveLoc Age Education Urban
<dbl> <dbl> <dbl> <dbl> <dbl> <fct> <dbl> <dbl> <fct>
1 111 48 16 260 83 Good 65 10 Yes
2 113 35 10 269 80 Medium 59 12 Yes
3 115 105 0 45 108 Medium 71 15 Yes
4 132 113 0 131 124 Medium 76 17 No
5 122 35 2 393 136 Medium 62 18 Yes
6 149 95 5 400 144 Medium 76 18 No
7 128 46 6 497 138 Medium 42 13 Yes
8 121 31 0 292 109 Medium 79 10 Yes
9 107 115 11 496 131 Good 50 11 No
10 103 74 0 359 97 Bad 55 11 Yes
# ... with 90 more rows, and 5 more variables: US <fct>, High <fct>,
# .pred_class <fct>, .pred_No <dbl>, .pred_Yes <dbl>
bagg_pred %>% accuracy(truth = High, estimate = .pred_class)
# A tibble: 1 x 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 accuracy binary 0.84
bagg_pred %>% roc_auc(truth = High, estimate = .pred_No)
# A tibble: 1 x 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 roc_auc binary 0.931
bagg_pred %>% roc_curve(truth = High, estimate = .pred_No) %>% autoplot()
vip(bagg_fit)
rf_model <-
rand_forest(mtry = ceiling(sqrt(ncol(carseat)))) %>%
set_engine("randomForest", importance = T) %>%
set_mode("classification")
rf_fit <- fit(rf_model, High ~ ., data = car_train)
rf_fit
parsnip model object
Fit time: 130ms
Call:
randomForest(x = maybe_data_frame(x), y = y, mtry = min_cols(~ceiling(sqrt(ncol(carseat))), x), importance = ~T)
Type of random forest: classification
Number of trees: 500
No. of variables tried at each split: 4
OOB estimate of error rate: 22%
Confusion matrix:
No Yes class.error
No 152 25 0.1412429
Yes 41 82 0.3333333
rf_pred <- augment(rf_fit, car_test) %>% print()
# A tibble: 100 x 14
CompPrice Income Advertising Population Price ShelveLoc Age Education Urban
<dbl> <dbl> <dbl> <dbl> <dbl> <fct> <dbl> <dbl> <fct>
1 111 48 16 260 83 Good 65 10 Yes
2 113 35 10 269 80 Medium 59 12 Yes
3 115 105 0 45 108 Medium 71 15 Yes
4 132 113 0 131 124 Medium 76 17 No
5 122 35 2 393 136 Medium 62 18 Yes
6 149 95 5 400 144 Medium 76 18 No
7 128 46 6 497 138 Medium 42 13 Yes
8 121 31 0 292 109 Medium 79 10 Yes
9 107 115 11 496 131 Good 50 11 No
10 103 74 0 359 97 Bad 55 11 Yes
# ... with 90 more rows, and 5 more variables: US <fct>, High <fct>,
# .pred_class <fct>, .pred_No <dbl>, .pred_Yes <dbl>
rf_pred %>% accuracy(truth = High, estimate = .pred_class)
# A tibble: 1 x 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 accuracy binary 0.86
rf_pred %>% roc_auc(truth = High, estimate = .pred_No)
# A tibble: 1 x 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 roc_auc binary 0.939
rf_pred %>% roc_curve(truth = High, estimate = .pred_No) %>% autoplot()
vip(rf_fit)
boost_model <-
boost_tree(trees = 5000, tree_depth = 4) %>%
set_engine("xgboost") %>%
set_mode("classification")
xgb_fit <- fit(boost_model, High ~ ., data = car_train)
[19:58:01] WARNING: amalgamation/../src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
xgb_fit
parsnip model object
Fit time: 1.6s
##### xgb.Booster
raw: 2.8 Mb
call:
xgboost::xgb.train(params = list(eta = 0.3, max_depth = 4, gamma = 0,
colsample_bytree = 1, colsample_bynode = 1, min_child_weight = 1,
subsample = 1, objective = "binary:logistic"), data = x$data,
nrounds = 5000, watchlist = x$watchlist, verbose = 0, nthread = 1)
params (as set within xgb.train):
eta = "0.3", max_depth = "4", gamma = "0", colsample_bytree = "1", colsample_bynode = "1", min_child_weight = "1", subsample = "1", objective = "binary:logistic", nthread = "1", validate_parameters = "TRUE"
xgb.attributes:
niter
callbacks:
cb.evaluation.log()
# of features: 14
niter: 5000
nfeatures : 14
evaluation_log:
iter training_logloss
1 0.573747
2 0.486670
---
4999 0.008865
5000 0.008865
xgb_pred <- augment(xgb_fit, car_test) %>% print()
# A tibble: 100 x 14
CompPrice Income Advertising Population Price ShelveLoc Age Education Urban
<dbl> <dbl> <dbl> <dbl> <dbl> <fct> <dbl> <dbl> <fct>
1 111 48 16 260 83 Good 65 10 Yes
2 113 35 10 269 80 Medium 59 12 Yes
3 115 105 0 45 108 Medium 71 15 Yes
4 132 113 0 131 124 Medium 76 17 No
5 122 35 2 393 136 Medium 62 18 Yes
6 149 95 5 400 144 Medium 76 18 No
7 128 46 6 497 138 Medium 42 13 Yes
8 121 31 0 292 109 Medium 79 10 Yes
9 107 115 11 496 131 Good 50 11 No
10 103 74 0 359 97 Bad 55 11 Yes
# ... with 90 more rows, and 5 more variables: US <fct>, High <fct>,
# .pred_class <fct>, .pred_No <dbl>, .pred_Yes <dbl>
xgb_pred %>% conf_mat(truth = High, estimate = .pred_class)
Truth
Prediction No Yes
No 54 9
Yes 5 32
xgb_pred %>% accuracy(truth = High, estimate = .pred_class)
# A tibble: 1 x 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 accuracy binary 0.86
xgb_pred %>% roc_auc(truth = High, estimate = .pred_No)
# A tibble: 1 x 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 roc_auc binary 0.927
xgb_pred %>% roc_curve(truth = High, estimate = .pred_No) %>% autoplot()
vip(xgb_fit)
tree_acc <- class_tree_pred %>% accuracy(truth = High, estimate = .pred_class) %>%
mutate(model = "Clasif. Tree")
tree_roc <- class_tree_pred %>% roc_auc(truth = High, estimate = .pred_No) %>%
mutate(model = "Clasif. Tree")
bagg_acc <- bagg_pred %>% accuracy(truth = High, estimate = .pred_class) %>%
mutate(model = "Bagging")
bagg_roc <- bagg_pred %>% roc_auc(truth = High, estimate = .pred_No) %>%
mutate(model = "Bagging")
rf_acc <- rf_pred %>% accuracy(truth = High, estimate = .pred_class) %>%
mutate(model = "RandomForest")
rf_roc <- rf_pred %>% roc_auc(truth = High, estimate = .pred_No) %>%
mutate(model = "RandomForest")
xgb_acc <- xgb_pred %>% accuracy(truth = High, estimate = .pred_class) %>%
mutate(model = "xgboost")
xgb_roc <- xgb_pred %>% roc_auc(truth = High, estimate = .pred_No) %>%
mutate(model = "xgboost")
summary_models <- bind_rows(tree_acc, tree_roc,
bagg_acc, bagg_roc,
rf_acc, rf_roc,
xgb_acc, xgb_roc) %>%
relocate(model) %>%
print()
# A tibble: 8 x 4
model .metric .estimator .estimate
<chr> <chr> <chr> <dbl>
1 Clasif. Tree accuracy binary 0.75
2 Clasif. Tree roc_auc binary 0.733
3 Bagging accuracy binary 0.84
4 Bagging roc_auc binary 0.931
5 RandomForest accuracy binary 0.86
6 RandomForest roc_auc binary 0.939
7 xgboost accuracy binary 0.86
8 xgboost roc_auc binary 0.927
summary_models %>%
select(model, .metric, .estimate) %>%
filter(.metric == "accuracy") %>%
arrange(desc(.estimate))
# A tibble: 4 x 3
model .metric .estimate
<chr> <chr> <dbl>
1 RandomForest accuracy 0.86
2 xgboost accuracy 0.86
3 Bagging accuracy 0.84
4 Clasif. Tree accuracy 0.75
summary_models %>%
select(model, .metric, .estimate) %>%
filter(.metric == "roc_auc") %>%
arrange(desc(.estimate))
# A tibble: 4 x 3
model .metric .estimate
<chr> <chr> <dbl>
1 RandomForest roc_auc 0.939
2 Bagging roc_auc 0.931
3 xgboost roc_auc 0.927
4 Clasif. Tree roc_auc 0.733
An Introduction to Statistcial Learning
– END
sessionInfo()
R version 4.1.0 (2021-05-18)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 10 x64 (build 19043)
Matrix products: default
locale:
[1] LC_COLLATE=Spanish_Mexico.1252 LC_CTYPE=Spanish_Mexico.1252
[3] LC_MONETARY=Spanish_Mexico.1252 LC_NUMERIC=C
[5] LC_TIME=Spanish_Mexico.1252
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] vip_0.3.2 rpart.plot_3.1.0 rpart_4.1-15 ISLR_1.2
[5] yardstick_0.0.8 workflowsets_0.0.2 workflows_0.2.3 tune_0.1.6
[9] tidyr_1.1.3 tibble_3.1.3 rsample_0.1.0 recipes_0.1.16
[13] purrr_0.3.4 parsnip_0.1.7 modeldata_0.1.1 infer_0.5.4
[17] ggplot2_3.3.5 dplyr_1.0.7 dials_0.0.9 scales_1.1.1
[21] broom_0.7.8 tidymodels_0.1.3
loaded via a namespace (and not attached):
[1] lubridate_1.7.10 DiceDesign_1.9 tools_4.1.0
[4] backports_1.2.1 bslib_0.2.5.1 utf8_1.2.2
[7] R6_2.5.0 DBI_1.1.1 colorspace_2.0-2
[10] nnet_7.3-16 withr_2.4.2 tidyselect_1.1.1
[13] gridExtra_2.3 prettyunits_1.1.1 compiler_4.1.0
[16] cli_3.0.0 labeling_0.4.2 sass_0.4.0
[19] randomForest_4.6-14 stringr_1.4.0 digest_0.6.27
[22] rmarkdown_2.9 pkgconfig_2.0.3 htmltools_0.5.1.1
[25] parallelly_1.26.1 lhs_1.1.1 highr_0.9
[28] rlang_0.4.11 rstudioapi_0.13 jquerylib_0.1.4
[31] generics_0.1.0 farver_2.1.0 jsonlite_1.7.2
[34] magrittr_2.0.1 Matrix_1.3-3 Rcpp_1.0.7
[37] munsell_0.5.0 fansi_0.5.0 GPfit_1.0-8
[40] lifecycle_1.0.0 furrr_0.2.3 stringi_1.6.2
[43] pROC_1.17.0.1 yaml_2.2.1 MASS_7.3-54
[46] plyr_1.8.6 grid_4.1.0 parallel_4.1.0
[49] listenv_0.8.0 crayon_1.4.1 lattice_0.20-44
[52] splines_4.1.0 knitr_1.33 pillar_1.6.2
[55] xgboost_1.4.1.1 codetools_0.2-18 glue_1.4.2
[58] evaluate_0.14 data.table_1.14.0 vctrs_0.3.8
[61] foreach_1.5.1 gtable_0.3.0 future_1.21.0
[64] assertthat_0.2.1 xfun_0.24 gower_0.2.2
[67] prodlim_2019.11.13 class_7.3-19 survival_3.2-11
[70] timeDate_3043.102 iterators_1.0.13 hardhat_0.1.6
[73] lava_1.6.9 globals_0.14.0 ellipsis_0.3.2
[76] ipred_0.9-11