# install the mlba package from GitHub if it is not yet available
if (!require(mlba)) {
  library(devtools)
  install_github("gedeck/mlba/mlba", force=TRUE)
}

Ensembles

Bagging and Boosting in R

Combining Propensities

library(tidyverse)
library(adabag)
library(rpart)
library(caret)

set.seed(1)
# load and preprocess the data
bank.df <- mlba::UniversalBank %>%
    select(-c(ID, ZIP.Code)) %>%
    mutate(
      Personal.Loan = factor(Personal.Loan, levels=c(0, 1), labels=c("No", "Yes"))
    )

# partition the data
train.index <- sample(nrow(bank.df), 0.6*nrow(bank.df))
train.df <- bank.df[train.index, ]
holdout.df <- bank.df[-train.index, ]

# single tree (rpart)
tr <- rpart(Personal.Loan ~ ., data=train.df)

# bagging and boosting using adabag
bag <- bagging(Personal.Loan ~ ., data=train.df)
boost <- boosting(Personal.Loan ~ ., data=train.df)
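adabag's bagging() and boosting() each fit an ensemble of mfinal trees (100 by default in current versions of the package). For quicker experimentation the ensemble size can be reduced; mfinal=25 below is an illustrative choice, not part of the original code.

# a smaller ensemble trains faster; mfinal=25 is an arbitrary illustrative value
bag.small <- bagging(Personal.Loan ~ ., data=train.df, mfinal=25)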

# bagging and boosting using randomForest and xgboost with parameter tuning
bag.rf <- train(Personal.Loan ~ ., data=train.df, method="rf")
boost.xgb <- train(Personal.Loan ~ ., data=train.df, method="xgbTree", verbosity=0)
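caret's train() tunes each method over a default grid of hyperparameters. As a quick check (not in the original code), the selected settings can be inspected via the bestTune element:

# hyperparameters chosen by caret's default tuning
bag.rf$bestTune
boost.xgb$bestTune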
library(ROCR)
rocCurveData <- function(prob, data) {
  predob <- prediction(prob, data$Personal.Loan)
  perf <- performance(predob, "tpr", "fpr")
  # performance() puts the x-axis measure (fpr) in x.values and tpr in y.values
  data.frame(fpr=perf@x.values[[1]], tpr=perf@y.values[[1]])
}
performance.df <- rbind(
  cbind(rocCurveData(predict(tr, holdout.df, type="prob")[,"Yes"], holdout.df), model="Single tree"),
  cbind(rocCurveData(predict(bag, holdout.df)$prob[, 2], holdout.df), model="Bagging"),
  cbind(rocCurveData(predict(boost, holdout.df)$prob[, 2], holdout.df), model="Boosting")
)
colors <- c("Single tree"="grey", "Bagging"="blue", "Boosting"="tomato")
g1 <- ggplot(performance.df, aes(x=fpr, y=tpr, color=model)) +
  geom_line() +
  scale_color_manual(values=colors) +
  geom_segment(aes(x=0, y=0, xend=1, yend=1), color="grey", linetype="dashed") +
  labs(x="1 - Specificity", y="Sensitivity", color="Model")
g1
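As a minimal sketch of combining propensities (not part of the original figure code), the three models' holdout propensities for "Yes" can be averaged into a single ensemble propensity and classified with the usual 0.5 cutoff:

# average the holdout propensities of the three models
avg.prop <- (predict(tr, holdout.df, type="prob")[, "Yes"] +
             predict(bag, holdout.df)$prob[, 2] +
             predict(boost, holdout.df)$prob[, 2]) / 3
# convert the averaged propensity into class predictions
ensemble.pred <- factor(ifelse(avg.prop > 0.5, "Yes", "No"), levels=c("No", "Yes"))
caret::confusionMatrix(ensemble.pred, holdout.df$Personal.Loan)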

performance.df <- rbind(
  cbind(rocCurveData(predict(tr, holdout.df, type="prob")[,"Yes"], holdout.df),
        model="Single tree"),
  cbind(rocCurveData(predict(bag.rf, holdout.df, type="prob")[,"Yes"], holdout.df),
        model="Random forest"),
  cbind(rocCurveData(predict(boost.xgb, holdout.df, type="prob")[,"Yes"], holdout.df),
        model="xgboost")
)
colors <- c("Single tree"="grey", "Random forest"="blue", "xgboost"="tomato")
g2 <- ggplot(performance.df, aes(x=fpr, y=tpr, color=model)) +
  geom_line() +
  scale_color_manual(values=colors) +
  geom_segment(aes(x=0, y=0, xend=1, yend=1), color="grey", linetype="dashed") +
  labs(x="1 - Specificity", y="Sensitivity", color="Model")
g2

library(gridExtra)
g <- arrangeGrob(g1 + theme_bw(), g2 + theme_bw(), ncol=2, widths=c(0.49, 0.51))
# create the output directory if it does not exist yet
dir.create(file.path(getwd(), "figures", "chapter_13"), recursive=TRUE, showWarnings=FALSE)
ggsave(file=file.path(getwd(), "figures", "chapter_13", "bagging-boosting.pdf"),
       g, width=8, height=3, units="in")

Automated Machine Learning (AutoML)

AutoML: Explore and Clean Data

library(tidyverse)

# load and preprocess the data
bank.df <- mlba::UniversalBank %>%
  # drop ID and zip code columns
  select(-c(ID, ZIP.Code)) %>%
  # convert Personal.Loan to a factor with labels No and Yes
  mutate(Personal.Loan = factor(Personal.Loan, levels=c(0, 1), labels=c("No", "Yes")))

# partition the data
set.seed(1)
idx <- caret::createDataPartition(bank.df$Personal.Loan, p=0.6, list=FALSE)
train.df <- bank.df[idx, ]
holdout.df <- bank.df[-idx, ]
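Personal.Loan is heavily imbalanced (roughly 90% "No"), which motivates the second AutoML run with balance_classes=TRUE below. A quick tabulation (not in the original code) shows the imbalance:

# class distribution in the training partition
prop.table(table(train.df$Personal.Loan))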
library(h2o)

# Start the H2O cluster (locally)
h2o.init()
#>  Connection successful!
#> 
#> R is connected to the H2O cluster: 
#>     H2O cluster uptime:         6 minutes 15 seconds 
#>     H2O cluster timezone:       America/Toronto 
#>     H2O data parsing timezone:  UTC 
#>     H2O cluster version:        3.40.0.1 
#>     H2O cluster version age:    1 month and 25 days 
#>     H2O cluster name:           H2O_started_from_R_gokul_vrh496 
#>     H2O cluster total nodes:    1 
#>     H2O cluster total memory:   7.91 GB 
#>     H2O cluster total cores:    16 
#>     H2O cluster allowed cores:  16 
#>     H2O cluster healthy:        TRUE 
#>     H2O Connection ip:          localhost 
#>     H2O Connection port:        54321 
#>     H2O Connection proxy:       NA 
#>     H2O Internal Security:      FALSE 
#>     R Version:                  R version 4.2.3 (2023-03-15 ucrt)
train.h2o <- as.h2o(train.df)
holdout.h2o <- as.h2o(holdout.df)

AutoML: Choose Features and Machine Learning Methods

# identify outcome and predictors
y <- "Personal.Loan"
x <- setdiff(names(train.df), y)

# run AutoML for 20 base models
aml <- h2o.automl(x=x, y=y, training_frame=train.h2o,
                  max_models=20, exclude_algos=c("DeepLearning"),
                  seed=1)
#> 11:00:01.550: AutoML: XGBoost is not available; skipping it.
aml.balanced <- h2o.automl(x=x, y=y, training_frame=train.h2o,
                           max_models=20, exclude_algos=c("DeepLearning"),
                           balance_classes=TRUE,
                           seed=1)
#> 11:00:46.709: AutoML: XGBoost is not available; skipping it.
aml
#> AutoML Details
#> ==============
#> Project Name: AutoML_1_20230403_110001 
#> Leader Model ID: StackedEnsemble_AllModels_1_AutoML_1_20230403_110001 
#> Algorithm: stackedensemble 
#> 
#> Total Number of Models Trained: 22 
#> Start Time: 2023-04-03 11:00:02 UTC 
#> End Time: 2023-04-03 11:00:36 UTC 
#> Duration: 35 s
#> 
#> Leaderboard
#> ===========
#>                                                   model_id       auc    logloss
#> 1     StackedEnsemble_AllModels_1_AutoML_1_20230403_110001 0.9979163 0.03245920
#> 2             GBM_grid_1_AutoML_1_20230403_110001_model_12 0.9974253 0.04187360
#> 3  StackedEnsemble_BestOfFamily_1_AutoML_1_20230403_110001 0.9973856 0.03620766
#> 4              GBM_grid_1_AutoML_1_20230403_110001_model_5 0.9972678 0.03951980
#> 5                           GBM_4_AutoML_1_20230403_110001 0.9970207 0.04387762
#> 6              GBM_grid_1_AutoML_1_20230403_110001_model_2 0.9969048 0.04140080
#> 7                           GBM_3_AutoML_1_20230403_110001 0.9967493 0.04059709
#> 8                           GBM_5_AutoML_1_20230403_110001 0.9965687 0.04425404
#> 9                           GBM_2_AutoML_1_20230403_110001 0.9964798 0.04167867
#> 10                          DRF_1_AutoML_1_20230403_110001 0.9964209 0.05035308
#>        aucpr mean_per_class_error       rmse        mse
#> 1  0.9843821           0.03903945 0.09671344 0.00935349
#> 2  0.9805166           0.03430740 0.10647014 0.01133589
#> 3  0.9805034           0.04169739 0.10278329 0.01056440
#> 4  0.9788479           0.06707842 0.10808600 0.01168258
#> 5  0.9784953           0.05174533 0.10987843 0.01207327
#> 6  0.9767822           0.07018191 0.10917959 0.01192018
#> 7  0.9797131           0.05821350 0.10470703 0.01096356
#> 8  0.9778616           0.05200651 0.10778991 0.01161867
#> 9  0.9776190           0.05255961 0.10581536 0.01119689
#> 10 0.9778407           0.03264811 0.11239728 0.01263315
#> 
#> [22 rows x 7 columns]
aml.balanced
#> AutoML Details
#> ==============
#> Project Name: AutoML_2_20230403_110046 
#> Leader Model ID: StackedEnsemble_AllModels_1_AutoML_2_20230403_110046 
#> Algorithm: stackedensemble 
#> 
#> Total Number of Models Trained: 22 
#> Start Time: 2023-04-03 11:00:47 UTC 
#> End Time: 2023-04-03 11:01:25 UTC 
#> Duration: 38 s
#> 
#> Leaderboard
#> ===========
#>                                                   model_id       auc    logloss
#> 1     StackedEnsemble_AllModels_1_AutoML_2_20230403_110046 0.9976077 0.03544409
#> 2                           GBM_1_AutoML_2_20230403_110046 0.9975898 0.04557709
#> 3  StackedEnsemble_BestOfFamily_1_AutoML_2_20230403_110046 0.9974656 0.03729750
#> 4                           GBM_2_AutoML_2_20230403_110046 0.9969714 0.05812751
#> 5              GBM_grid_1_AutoML_2_20230403_110046_model_2 0.9967077 0.04897035
#> 6              GBM_grid_1_AutoML_2_20230403_110046_model_5 0.9965086 0.05892540
#> 7                           GBM_3_AutoML_2_20230403_110046 0.9964433 0.06149567
#> 8                           GBM_4_AutoML_2_20230403_110046 0.9963537 0.06534853
#> 9                           DRF_1_AutoML_2_20230403_110046 0.9962967 0.09839131
#> 10             GBM_grid_1_AutoML_2_20230403_110046_model_9 0.9960835 0.05500576
#>        aucpr mean_per_class_error      rmse        mse
#> 1  0.9817983           0.05411136 0.1023414 0.01047375
#> 2  0.9808815           0.03914700 0.1147291 0.01316277
#> 3  0.9800965           0.03641224 0.1058710 0.01120868
#> 4  0.9785022           0.06571104 0.1187951 0.01411229
#> 5  0.9757267           0.06955199 0.1147946 0.01317779
#> 6  0.9744478           0.06260755 0.1161645 0.01349420
#> 7  0.9791149           0.05374263 0.1203961 0.01449522
#> 8  0.9780848           0.05529437 0.1236499 0.01528930
#> 9  0.9734061           0.03925455 0.1786314 0.03190919
#> 10 0.9781456           0.04735128 0.1200517 0.01441241
#> 
#> [22 rows x 7 columns]
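Only the top ten models are printed above. The full leaderboard can be retrieved as an H2O frame with h2o.get_leaderboard(); in current h2o versions, extra_columns="ALL" adds metadata such as training time (this retrieval step is not part of the original code):

# full leaderboard with additional metadata columns
lb <- h2o.get_leaderboard(aml, extra_columns="ALL")
as.data.frame(lb)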

AutoML: Evaluate Model Performance

h2o.confusionMatrix(aml@leader, holdout.h2o)
#> Confusion Matrix (vertical: actual; across: predicted)  for max f1 @ threshold = 0.612097120745797:
#>          No Yes    Error      Rate
#> No     1805   3 0.001659   =3/1808
#> Yes      16 176 0.083333   =16/192
#> Totals 1821 179 0.009500  =19/2000
h2o.confusionMatrix(aml.balanced@leader, holdout.h2o)
#> Confusion Matrix (vertical: actual; across: predicted)  for max f1 @ threshold = 0.598595135802623:
#>          No Yes    Error      Rate
#> No     1803   5 0.002765   =5/1808
#> Yes      14 178 0.072917   =14/192
#> Totals 1817 183 0.009500  =19/2000
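The confusion matrices above are reported at the F1-maximizing threshold. Threshold-independent holdout metrics can be obtained from h2o.performance(); the AUC comparison below is a sketch using standard h2o accessors (not part of the original code):

# holdout AUC for the two leader models
h2o.auc(h2o.performance(aml@leader, newdata=holdout.h2o))
h2o.auc(h2o.performance(aml.balanced@leader, newdata=holdout.h2o))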

Explaining Model Predictions

Explaining Model Predictions: LIME

cases <- c('3055', '3358', # predicted Yes
           '2', '1178')    # predicted No
explainer <- lime::lime(train.df, aml@leader, bin_continuous=TRUE, quantile_bins=FALSE)
explanations <- lime::explain(holdout.df[cases,], explainer, n_labels=1, n_features=8)
lime::plot_features(explanations, ncol=2)

pdf(file=file.path(getwd(), "figures", "chapter_13", "lime-analysis.pdf"),
    width=7, height=6)
    lime::plot_features(explanations, ncol=2)
dev.off()
#> png 
#>   2
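For an overview across many cases at once, lime also offers a compact heatmap view; the call below is a sketch using the same explanations object (not part of the original code):

# heatmap of feature weights across all explained cases
lime::plot_explanations(explanations)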