if (!require(mlba)) {
library(devtools)
install_github("gedeck/mlba/mlba", force=TRUE)
}
library(tidyverse)
library(adabag)
library(rpart)
library(caret)
set.seed(1)
# load and preprocess the data
bank.df <- mlba::UniversalBank %>%
select(-c(ID, ZIP.Code)) %>%
mutate(
Personal.Loan = factor(Personal.Loan, levels=c(0, 1), labels=c("No", "Yes"))
)
# partition the data
train.index <- sample(nrow(bank.df), 0.6*nrow(bank.df))
train.df <- bank.df[train.index, ]
holdout.df <- bank.df[-train.index, ]
# single tree (rpart)
tr <- rpart(Personal.Loan ~ ., data=train.df)
# bagging and boosting using adabag
bag <- bagging(Personal.Loan ~ ., data=train.df)
boost <- boosting(Personal.Loan ~ ., data=train.df)
# bagging and boosting using randomForest and xgboost with parameter tuning
bag.rf <- train(Personal.Loan ~ ., data=train.df, method="rf")
boost.xgb <- train(Personal.Loan ~ ., data=train.df, method="xgbTree", verbosity=0)
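# not in the original workflow: a quick sketch to inspect the tuned
# hyperparameters chosen by caret and the resulting holdout accuracy
bag.rf$bestTune    # selected mtry for the random forest
boost.xgb$bestTune # selected xgboost parameters
caret::confusionMatrix(predict(bag.rf, holdout.df), holdout.df$Personal.Loan)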
library(ROCR)
rocCurveData <- function(prob, data) {
  predob <- prediction(prob, data$Personal.Loan)
  perf <- performance(predob, "tpr", "fpr")
  # performance() stores the x-measure (fpr) in @x.values and the
  # y-measure (tpr) in @y.values
  return (data.frame(fpr=perf@x.values[[1]], tpr=perf@y.values[[1]]))
}
performance.df <- rbind(
cbind(rocCurveData(predict(tr, holdout.df, type="prob")[,"Yes"], holdout.df), model="Single tree"),
cbind(rocCurveData(predict(bag, holdout.df)$prob[, 2], holdout.df), model="Bagging"),
cbind(rocCurveData(predict(boost, holdout.df)$prob[, 2], holdout.df), model="Boosting")
)
colors <- c("Single tree"="grey", "Bagging"="blue", "Boosting"="tomato")
g1 <- ggplot(performance.df, aes(x=fpr, y=tpr, color=model)) +
  geom_line() +
  scale_color_manual(values=colors) +
  geom_segment(aes(x=0, y=0, xend=1, yend=1), color="grey", linetype="dashed") +
  labs(x="1 - Specificity", y="Sensitivity", color="Model")
g1
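# optional sketch: summarize each ROC curve by its AUC; aucValue is a
# helper defined here for illustration (not part of ROCR)
aucValue <- function(prob, data) {
  predob <- prediction(prob, data$Personal.Loan)
  return (performance(predob, "auc")@y.values[[1]])
}
aucValue(predict(tr, holdout.df, type="prob")[,"Yes"], holdout.df)
aucValue(predict(bag, holdout.df)$prob[, 2], holdout.df)
aucValue(predict(boost, holdout.df)$prob[, 2], holdout.df)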
performance.df <- rbind(
cbind(rocCurveData(predict(tr, holdout.df, type="prob")[,"Yes"], holdout.df),
model="Single tree"),
cbind(rocCurveData(predict(bag.rf, holdout.df, type="prob")[,"Yes"], holdout.df),
model="Random forest"),
cbind(rocCurveData(predict(boost.xgb, holdout.df, type="prob")[,"Yes"], holdout.df),
model="xgboost")
)
colors <- c("Single tree"="grey", "Random forest"="blue", "xgboost"="tomato")
g2 <- ggplot(performance.df, aes(x=fpr, y=tpr, color=model)) +
  geom_line() +
  scale_color_manual(values=colors) +
  geom_segment(aes(x=0, y=0, xend=1, yend=1), color="grey", linetype="dashed") +
  labs(x="1 - Specificity", y="Sensitivity", color="Model")
g2
library(gridExtra)
g <- arrangeGrob(g1 + theme_bw(), g2 + theme_bw(), ncol=2, widths=c(0.49, 0.51))
ggsave(filename=file.path(getwd(), "figures", "chapter_13", "bagging-boosting.pdf"),
       g, width=8, height=3, units="in")
library(tidyverse)
# load and preprocess the data
bank.df <- mlba::UniversalBank %>%
# Drop ID and zip code columns.
select(-c(ID, ZIP.Code)) %>%
# convert Personal.Loan to a factor with labels Yes and No
mutate(Personal.Loan = factor(Personal.Loan, levels=c(0, 1), labels=c("No", "Yes")))
# partition the data
set.seed(1)
idx <- caret::createDataPartition(bank.df$Personal.Loan, p=0.6, list=FALSE)
train.df <- bank.df[idx, ]
holdout.df <- bank.df[-idx, ]
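# optional check: unlike the earlier random split, createDataPartition
# samples within each outcome class, so class proportions should match
prop.table(table(train.df$Personal.Loan))
prop.table(table(holdout.df$Personal.Loan))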
library(h2o)
# Start the H2O cluster (locally)
h2o.init()
#> Connection successful!
#>
#> R is connected to the H2O cluster:
#> H2O cluster uptime: 6 minutes 15 seconds
#> H2O cluster timezone: America/Toronto
#> H2O data parsing timezone: UTC
#> H2O cluster version: 3.40.0.1
#> H2O cluster version age: 1 month and 25 days
#> H2O cluster name: H2O_started_from_R_gokul_vrh496
#> H2O cluster total nodes: 1
#> H2O cluster total memory: 7.91 GB
#> H2O cluster total cores: 16
#> H2O cluster allowed cores: 16
#> H2O cluster healthy: TRUE
#> H2O Connection ip: localhost
#> H2O Connection port: 54321
#> H2O Connection proxy: NA
#> H2O Internal Security: FALSE
#> R Version: R version 4.2.3 (2023-03-15 ucrt)
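# optional: h2o prints text progress bars during as.h2o() and h2o.automl();
# h2o.no_progress() suppresses them (progress output is omitted below)
# h2o.no_progress()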
train.h2o <- as.h2o(train.df)
holdout.h2o <- as.h2o(holdout.df)
# identify outcome and predictors
y <- "Personal.Loan"
x <- setdiff(names(train.df), y)
# run AutoML for 20 base models
aml <- h2o.automl(x=x, y=y, training_frame=train.h2o,
max_models=20, exclude_algos=c("DeepLearning"),
seed=1)
#> 11:00:01.550: AutoML: XGBoost is not available; skipping it.
aml.balanced <- h2o.automl(x=x, y=y, training_frame=train.h2o,
max_models=20, exclude_algos=c("DeepLearning"),
balance_classes=TRUE,
seed=1)
#> 11:00:46.709: AutoML: XGBoost is not available; skipping it.
aml
#> AutoML Details
#> ==============
#> Project Name: AutoML_1_20230403_110001
#> Leader Model ID: StackedEnsemble_AllModels_1_AutoML_1_20230403_110001
#> Algorithm: stackedensemble
#>
#> Total Number of Models Trained: 22
#> Start Time: 2023-04-03 11:00:02 UTC
#> End Time: 2023-04-03 11:00:36 UTC
#> Duration: 35 s
#>
#> Leaderboard
#> ===========
#> model_id auc logloss
#> 1 StackedEnsemble_AllModels_1_AutoML_1_20230403_110001 0.9979163 0.03245920
#> 2 GBM_grid_1_AutoML_1_20230403_110001_model_12 0.9974253 0.04187360
#> 3 StackedEnsemble_BestOfFamily_1_AutoML_1_20230403_110001 0.9973856 0.03620766
#> 4 GBM_grid_1_AutoML_1_20230403_110001_model_5 0.9972678 0.03951980
#> 5 GBM_4_AutoML_1_20230403_110001 0.9970207 0.04387762
#> 6 GBM_grid_1_AutoML_1_20230403_110001_model_2 0.9969048 0.04140080
#> 7 GBM_3_AutoML_1_20230403_110001 0.9967493 0.04059709
#> 8 GBM_5_AutoML_1_20230403_110001 0.9965687 0.04425404
#> 9 GBM_2_AutoML_1_20230403_110001 0.9964798 0.04167867
#> 10 DRF_1_AutoML_1_20230403_110001 0.9964209 0.05035308
#> aucpr mean_per_class_error rmse mse
#> 1 0.9843821 0.03903945 0.09671344 0.00935349
#> 2 0.9805166 0.03430740 0.10647014 0.01133589
#> 3 0.9805034 0.04169739 0.10278329 0.01056440
#> 4 0.9788479 0.06707842 0.10808600 0.01168258
#> 5 0.9784953 0.05174533 0.10987843 0.01207327
#> 6 0.9767822 0.07018191 0.10917959 0.01192018
#> 7 0.9797131 0.05821350 0.10470703 0.01096356
#> 8 0.9778616 0.05200651 0.10778991 0.01161867
#> 9 0.9776190 0.05255961 0.10581536 0.01119689
#> 10 0.9778407 0.03264811 0.11239728 0.01263315
#>
#> [22 rows x 7 columns]
aml.balanced
#> AutoML Details
#> ==============
#> Project Name: AutoML_2_20230403_110046
#> Leader Model ID: StackedEnsemble_AllModels_1_AutoML_2_20230403_110046
#> Algorithm: stackedensemble
#>
#> Total Number of Models Trained: 22
#> Start Time: 2023-04-03 11:00:47 UTC
#> End Time: 2023-04-03 11:01:25 UTC
#> Duration: 38 s
#>
#> Leaderboard
#> ===========
#> model_id auc logloss
#> 1 StackedEnsemble_AllModels_1_AutoML_2_20230403_110046 0.9976077 0.03544409
#> 2 GBM_1_AutoML_2_20230403_110046 0.9975898 0.04557709
#> 3 StackedEnsemble_BestOfFamily_1_AutoML_2_20230403_110046 0.9974656 0.03729750
#> 4 GBM_2_AutoML_2_20230403_110046 0.9969714 0.05812751
#> 5 GBM_grid_1_AutoML_2_20230403_110046_model_2 0.9967077 0.04897035
#> 6 GBM_grid_1_AutoML_2_20230403_110046_model_5 0.9965086 0.05892540
#> 7 GBM_3_AutoML_2_20230403_110046 0.9964433 0.06149567
#> 8 GBM_4_AutoML_2_20230403_110046 0.9963537 0.06534853
#> 9 DRF_1_AutoML_2_20230403_110046 0.9962967 0.09839131
#> 10 GBM_grid_1_AutoML_2_20230403_110046_model_9 0.9960835 0.05500576
#> aucpr mean_per_class_error rmse mse
#> 1 0.9817983 0.05411136 0.1023414 0.01047375
#> 2 0.9808815 0.03914700 0.1147291 0.01316277
#> 3 0.9800965 0.03641224 0.1058710 0.01120868
#> 4 0.9785022 0.06571104 0.1187951 0.01411229
#> 5 0.9757267 0.06955199 0.1147946 0.01317779
#> 6 0.9744478 0.06260755 0.1161645 0.01349420
#> 7 0.9791149 0.05374263 0.1203961 0.01449522
#> 8 0.9780848 0.05529437 0.1236499 0.01528930
#> 9 0.9734061 0.03925455 0.1786314 0.03190919
#> 10 0.9781456 0.04735128 0.1200517 0.01441241
#>
#> [22 rows x 7 columns]
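# optional sketch: the printed leaderboards show only the top 10 models;
# h2o.get_leaderboard() returns the full table for closer inspection
lb <- h2o.get_leaderboard(aml, extra_columns="ALL")
print(lb, n=nrow(lb))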
h2o.confusionMatrix(aml@leader, holdout.h2o)
#> Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.612097120745797:
#> No Yes Error Rate
#> No 1805 3 0.001659 =3/1808
#> Yes 16 176 0.083333 =16/192
#> Totals 1821 179 0.009500 =19/2000
h2o.confusionMatrix(aml.balanced@leader, holdout.h2o)
#> Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.598595135802623:
#> No Yes Error Rate
#> No 1803 5 0.002765 =5/1808
#> Yes 14 178 0.072917 =14/192
#> Totals 1817 183 0.009500 =19/2000
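# optional sketch: h2o.performance() scores the leader on the holdout data
# and exposes threshold-independent metrics such as AUC
perf <- h2o.performance(aml@leader, holdout.h2o)
h2o.auc(perf)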
cases <- c('3055', '3358', # predicted Yes
'2', '1178') # predicted No
explainer <- lime::lime(train.df, aml@leader, bin_continuous=TRUE, quantile_bins=FALSE)
explanations <- lime::explain(holdout.df[cases,], explainer, n_labels=1, n_features=8)
lime::plot_features(explanations, ncol=2)
pdf(file=file.path(getwd(), "figures", "chapter_13", "lime-analysis.pdf"),
width=7, height=6)
lime::plot_features(explanations, ncol=2)
dev.off()
#> png
#> 2
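# optional cleanup sketch: shut down the local H2O cluster started by
# h2o.init() once the analysis is complete
# h2o.shutdown(prompt=FALSE)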