Deep Learning using the H2O package
Mai Thi Nguyen
19 October 2018
# Load data
# NOTE: the global environment is intentionally not wiped with
# rm(list = ls()); doing so from inside a script destroys the caller's objects
# without resetting loaded packages or options. Run this script in a fresh R
# session when a clean state is required.
library(tidyverse)
library(magrittr)
library(caret)
library(tufte)
# Load the GermanCredit data set and work on a copy of it
data("GermanCredit")
sub_data <- GermanCredit
# Loading package h2o
library(h2o)
# Start (or connect to) a local H2O instance with the requested resources
h2o.init(nthreads = 6, max_mem_size = "8g")
##
## H2O is not running yet, starting it now...
##
## Note: In case of errors look at the following log files:
## C:\Users\MLCL~1.LOC\AppData\Local\Temp\Rtmpc1vWlA/h2o_mlcl_local_started_from_r.out
## C:\Users\MLCL~1.LOC\AppData\Local\Temp\Rtmpc1vWlA/h2o_mlcl_local_started_from_r.err
##
##
## Starting H2O JVM and connecting: Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 2 seconds 94 milliseconds
## H2O cluster timezone: Australia/Sydney
## H2O data parsing timezone: UTC
## H2O cluster version: 3.20.0.8
## H2O cluster version age: 28 days, 2 hours and 38 minutes
## H2O cluster name: H2O_started_from_R_mlcl.local_faf147
## H2O cluster total nodes: 1
## H2O cluster total memory: 7.11 GB
## H2O cluster total cores: 4
## H2O cluster allowed cores: 4
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## H2O API Extensions: Algos, AutoML, Core V3, Core V4
## R Version: R version 3.5.0 (2018-04-23)
# Converting data into h2o Object
# `%<>%` (magrittr) pipes sub_data into as.h2o() and assigns the result back
# to sub_data, so from here on sub_data refers to the H2O frame rather than
# the local data frame.
sub_data %<>% as.h2o()
##
|
| | 0%
|
|=================================================================| 100%
# Data preparation
# Response column, and every remaining column as a predictor
y <- "Class"
x <- colnames(sub_data) %>% setdiff(y)
# Seeded split with ratio 0.7; the first piece is used for training and the
# second for testing
id <- h2o.splitFrame(data = sub_data, ratios = 0.7, seed = 29)
train <- id[[1]]
test <- id[[2]]
# Model
deep_ln <- h2o.deeplearning(
  x = x,
  y = y,
  training_frame = train,
  model_id = "Deep_learning",
  # Network definition
  hidden = c(400, 400, 400, 400),
  activation = "TanhWithDropout",
  epochs = 1000,
  # Declare imbalanced data
  balance_classes = TRUE,
  # Cross-validation: 10 folds, stratified fold assignment
  nfolds = 10,
  fold_assignment = "Stratified",
  keep_cross_validation_predictions = FALSE,
  keep_cross_validation_fold_assignment = TRUE,
  # Evaluation metric and early-stopping condition
  stopping_metric = "AUC",
  stopping_tolerance = 0.001,
  stopping_rounds = 5,
  score_each_iteration = TRUE,
  overwrite_with_best_model = TRUE,
  # Remaining training options
  replicate_training_data = TRUE,
  variable_importances = TRUE,
  # Fixed seed together with reproducible mode
  reproducible = TRUE,
  seed = 123
)
##
|
| | 0%
|
| | 1%
|
|======== | 13%
|
|====================================== | 59%
|
|=================================================================| 100%
# Writing a function to see the results
#
# Reads `h2o_model@model$cross_validation_metrics_summary`, drops its `mean`
# and `sd` columns, transposes the remainder so that each metric becomes a
# column, converts every column to numeric and returns a data frame with the
# columns Accuracy, AUC, Precision, Specificity, Recall and Logloss.
results_df <- function(h2o_model) {
  cv_summary <- as.data.frame(h2o_model@model$cross_validation_metrics_summary)
  per_fold <- select(cv_summary, -mean, -sd)
  by_metric <- as.data.frame(t(per_fold))
  # Go through character first so that factor columns yield their printed
  # values rather than their integer codes
  numeric_metrics <- mutate_all(
    by_metric,
    function(column) as.numeric(as.character(column))
  )
  select(
    numeric_metrics,
    Accuracy = accuracy,
    AUC = auc,
    Precision = precision,
    Specificity = specificity,
    Recall = recall,
    Logloss = logloss
  )
}
# Using function: cross-validation metrics of the fitted model
outcome <- results_df(deep_ln)
# Outcome: one box plot per metric, each facet on its own scale
outcome %>%
  gather(key = Metrics, value = Values) %>%
  ggplot(
    mapping = aes(x = Metrics, y = Values, fill = Metrics, color = Metrics)
  ) +
  geom_boxplot(alpha = 0.3, show.legend = FALSE) +
  facet_wrap(~ Metrics, scales = "free") +
  labs(
    title = "Performance of Deep Learning model using H2o package ",
    caption = "Data Source: GermanCredit\nCreated by Jenny"
  ) +
  theme_minimal()

# Statistics summary
# Per-metric summary of the values in `outcome`. The statistics are scaled by
# 100 and rounded to two decimals inside summarise(), so the observation count
# N is reported as-is; a blanket scaling of all numeric columns would also
# multiply the count by 100.
outcome %>%
  gather(Metrics, Values) %>%
  group_by(Metrics) %>%
  summarise(
    Mean = round(100 * mean(Values), 2),
    Median = round(100 * median(Values), 2),
    Min = round(100 * min(Values), 2),
    Max = round(100 * max(Values), 2),
    SD = round(100 * sd(Values), 2),
    N = n()
  ) %>%
  knitr::kable(
    col.names = c("Criterion", "Mean", "Median", "Min", "Max", "SD", "N")
  )
| Criterion   |  Mean | Median |   Min |    Max |    SD |    N |
|:------------|------:|-------:|------:|-------:|------:|-----:|
| Accuracy    | 80.61 |  81.99 | 72.97 |  85.45 |  4.35 | 1000 |
| AUC         | 82.37 |  82.26 | 76.09 |  87.67 |  3.49 | 1000 |
| Logloss     | 54.83 |  54.32 | 42.64 |  74.04 | 10.57 | 1000 |
| Precision   | 82.01 |  84.04 | 69.49 |  88.89 |  6.89 | 1000 |
| Recall      | 93.18 |  94.01 | 84.48 | 100.00 |  4.55 | 1000 |
| Specificity | 52.09 |  51.08 | 35.71 |  73.08 | 11.23 | 1000 |
# Prediction function
#
# Scores `data` with `h2o_model` through h2o.predict(), converts the result to
# a data frame and returns its `predict` column.
# NOTE(review): this definition shares its name with the `predict` generic
# from stats and therefore masks it for code evaluated in this environment.
predict <- function(h2o_model, data) {
  scored <- as.data.frame(h2o.predict(h2o_model, data))
  pull(scored, predict)
}
# Test function: first few predictions on the test split
head(predict(deep_ln, test))
##
|
| | 0%
|
|=================================================================| 100%
## [1] Bad Bad Good Good Good Good
## Levels: Bad Good
# Accuracy when predict on test data
#
# Compares the predictions of `h2o_model` on `data` with the `Class` column of
# `data` via confusionMatrix(), treating "Bad" as the positive class.
my_confusion_matrix <- function(h2o_model, data) {
  predicted <- predict(h2o_model, data)
  actual <- pull(as.data.frame(data), Class)
  confusionMatrix(predicted, actual, positive = "Bad")
}
# Confusion matrix of the fitted model on the held-out `test` split
my_confusion_matrix(deep_ln, test)
##
|
| | 0%
|
|=================================================================| 100%
## Confusion Matrix and Statistics
##
## Reference
## Prediction Bad Good
## Bad 59 58
## Good 32 156
##
## Accuracy : 0.7049
## 95% CI : (0.6503, 0.7555)
## No Information Rate : 0.7016
## P-Value [Acc > NIR] : 0.478404
##
## Kappa : 0.3487
## Mcnemar's Test P-Value : 0.008408
##
## Sensitivity : 0.6484
## Specificity : 0.7290
## Pos Pred Value : 0.5043
## Neg Pred Value : 0.8298
## Prevalence : 0.2984
## Detection Rate : 0.1934
## Detection Prevalence : 0.3836
## Balanced Accuracy : 0.6887
##
## 'Positive' Class : Bad
##