Deep Learning Using the H2O Package

Mai Thi Nguyen

19 October 2018

# Load packages and data
# The workspace is deliberately not wiped with rm(list = ls()): that call would
# delete the caller's objects when this script is sourced, yet it would not
# reset attached packages or options. Run the script in a fresh R session
# instead.
library(tidyverse)
library(magrittr)
library(caret)
library(tufte)
# Load the GermanCredit data set and keep a working copy under its own name
data("GermanCredit")
sub_data <- GermanCredit


# Attach h2o, then start (or connect to) the local H2O cluster
library(h2o)
h2o.init(
  nthreads = 6,
  max_mem_size = "8g"
)

## 
## H2O is not running yet, starting it now...
## 
## Note:  In case of errors look at the following log files:
##     C:\Users\MLCL~1.LOC\AppData\Local\Temp\Rtmpc1vWlA/h2o_mlcl_local_started_from_r.out
##     C:\Users\MLCL~1.LOC\AppData\Local\Temp\Rtmpc1vWlA/h2o_mlcl_local_started_from_r.err
## 
## 
## Starting H2O JVM and connecting:  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         2 seconds 94 milliseconds 
##     H2O cluster timezone:       Australia/Sydney 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.20.0.8 
##     H2O cluster version age:    28 days, 2 hours and 38 minutes  
##     H2O cluster name:           H2O_started_from_R_mlcl.local_faf147 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   7.11 GB 
##     H2O cluster total cores:    4 
##     H2O cluster allowed cores:  4 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     H2O API Extensions:         Algos, AutoML, Core V3, Core V4 
##     R Version:                  R version 3.5.0 (2018-04-23)

# Converting data into h2o Object
# `%<>%` is magrittr's compound-assignment pipe: sub_data is piped into
# as.h2o() and the result is assigned back to sub_data.
sub_data  %<>% as.h2o()

## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=================================================================| 100%

# Data preparation: response column, predictor columns and the data split
y <- "Class"
x <- setdiff(colnames(sub_data), y)
# Split sub_data with ratio 0.7 and a fixed seed; the first piece is used for
# training, the second for testing
id <- h2o.splitFrame(data = sub_data, ratios = 0.7, seed = 29)
train <- id[[1L]]
test <- id[[2L]]


# Model: deep learning fit on the training frame with 10-fold cross-validation
deep_ln <- h2o.deeplearning(
  x = x,
  y = y,
  training_frame = train,
  model_id = "Deep_learning",
  # Network
  hidden = c(400, 400, 400, 400),
  activation = "TanhWithDropout",
  epochs = 1000,
  # Declare imbalanced data
  balance_classes = TRUE,
  # Cross-validation
  nfolds = 10,
  fold_assignment = "Stratified",
  keep_cross_validation_predictions = FALSE,
  keep_cross_validation_fold_assignment = TRUE,
  # Stopping and scoring
  stopping_metric = "AUC",
  stopping_tolerance = 0.001,
  stopping_rounds = 5,
  score_each_iteration = TRUE,
  overwrite_with_best_model = TRUE,
  # Extra output
  variable_importances = TRUE,
  # Other training options
  replicate_training_data = TRUE,
  reproducible = TRUE,
  seed = 123
)

## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |                                                                 |   1%
  |                                                                       
  |========                                                         |  13%
  |                                                                       
  |======================================                           |  59%
  |                                                                       
  |=================================================================| 100%

# Collect the per-fold cross-validation metrics of a fitted model.
#
# Args:
#   h2o_model: a model object whose `@model$cross_validation_metrics_summary`
#     has one row per metric (named by row name) and one column per fold,
#     optionally alongside `mean` and `sd` columns.
#
# Returns:
#   A data frame with one row per fold and the numeric columns Accuracy, AUC,
#   Precision, Specificity, Recall and Logloss.
#
# Stops with an explicit error when the model carries no cross-validation
# summary or when one of the six metrics is absent from it.
results_df <- function(h2o_model) {
  cv_summary <- h2o_model@model$cross_validation_metrics_summary
  if (is.null(cv_summary)) {
    stop("`h2o_model` has no cross-validation metrics summary ",
         "(was it trained with `nfolds`?).", call. = FALSE)
  }
  cv_summary <- as.data.frame(cv_summary)

  # Keep the fold columns only, then flip so that folds become rows and
  # metrics become columns.
  fold_cols <- setdiff(colnames(cv_summary), c("mean", "sd"))
  per_fold <- t(as.matrix(cv_summary[, fold_cols, drop = FALSE]))

  # Output column name -> metric name used in the summary
  metrics <- c(Accuracy = "accuracy",
               AUC = "auc",
               Precision = "precision",
               Specificity = "specificity",
               Recall = "recall",
               Logloss = "logloss")
  absent <- setdiff(metrics, colnames(per_fold))
  if (length(absent) > 0) {
    stop("Metrics missing from the cross-validation summary: ",
         paste(absent, collapse = ", "), call. = FALSE)
  }

  out <- as.data.frame(per_fold[, metrics, drop = FALSE],
                       stringsAsFactors = FALSE)
  # The summary values may arrive as text (or factors), so convert through
  # character to numeric.
  out[] <- lapply(out, function(col) as.numeric(as.character(col)))
  colnames(out) <- names(metrics)
  out
}



# Per-fold cross-validation metrics of the fitted model
outcome <- results_df(h2o_model = deep_ln)

# Box plot of every metric across the folds, one free-scaled panel per metric
gather(outcome, Metrics, Values) %>%
  ggplot(mapping = aes(x = Metrics, y = Values, fill = Metrics, color = Metrics)) +
  geom_boxplot(show.legend = FALSE, alpha = 0.3) +
  facet_wrap(~ Metrics, scales = "free") +
  theme_minimal() +
  labs(
    title = "Performance of Deep Learning model using H2o package ",
    caption = "Data Source: GermanCredit\nCreated by Jenny"
  )

# Statistics summary
# Mean, median, min, max and sd of every metric are rescaled by 100 and rounded
# to two decimals. N is the number of values per metric (one per fold) and is a
# count, so it is reported as-is and never rescaled.
outcome %>%
  gather(Metrics, Values) %>%
  group_by(Metrics) %>%
  summarise(
    Mean = round(100 * mean(Values), 2),
    Median = round(100 * median(Values), 2),
    Min = round(100 * min(Values), 2),
    Max = round(100 * max(Values), 2),
    SD = round(100 * sd(Values), 2),
    N = n()
  ) %>%
  knitr::kable(col.names = c("Criterion", "Mean", "Median", "Min", "Max", "SD", "N"))

Criterion	Mean	Median	Min	Max	SD	N
Accuracy	80.61	81.99	72.97	85.45	4.35	1000
AUC	82.37	82.26	76.09	87.67	3.49	1000
Logloss	54.83	54.32	42.64	74.04	10.57	1000
Precision	82.01	84.04	69.49	88.89	6.89	1000
Recall	93.18	94.01	84.48	100.00	4.55	1000
Specificity	52.09	51.08	35.71	73.08	11.23	1000

# Prediction helper
# Scores `data` with `h2o_model` through h2o.predict(), converts the result to
# a local data frame and returns only its `predict` column.
# NOTE(review): this helper shares its name with R's predict() generic and
# masks it in the workspace - confirm that is intended.
predict <- function(h2o_model, data) {
  scored <- as.data.frame(h2o.predict(h2o_model, data))
  pull(scored, predict)
}

# Try the helper on the test frame and show the first predictions
head(predict(h2o_model = deep_ln, data = test))

## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=================================================================| 100%

## [1] Bad  Bad  Good Good Good Good
## Levels: Bad Good

# Confusion matrix of a model's predictions against the observed labels
#
# Args:
#   h2o_model: model handed to predict() to score `data`.
#   data: frame to score; it must also contain the observed labels.
#   response: name of the column in `data` holding the observed class
#     (default "Class").
#   positive: factor level reported as the positive class (default "Bad").
#
# Returns:
#   The value of confusionMatrix() for predicted versus observed labels.
my_confusion_matrix <- function(h2o_model, data,
                                response = "Class", positive = "Bad") {
  observed_df <- as.data.frame(data)
  # Fail early with a clear message instead of passing NULL on as the reference
  if (!response %in% colnames(observed_df)) {
    stop("Column '", response, "' not found in `data`.", call. = FALSE)
  }
  confusionMatrix(predict(h2o_model, data),
                  observed_df[[response]],
                  positive = positive)
}

# Confusion matrix of the fitted model on the test frame
my_confusion_matrix(h2o_model = deep_ln, data = test)

## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=================================================================| 100%

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Bad Good
##       Bad   59   58
##       Good  32  156
##                                           
##                Accuracy : 0.7049          
##                  95% CI : (0.6503, 0.7555)
##     No Information Rate : 0.7016          
##     P-Value [Acc > NIR] : 0.478404        
##                                           
##                   Kappa : 0.3487          
##  Mcnemar's Test P-Value : 0.008408        
##                                           
##             Sensitivity : 0.6484          
##             Specificity : 0.7290          
##          Pos Pred Value : 0.5043          
##          Neg Pred Value : 0.8298          
##              Prevalence : 0.2984          
##          Detection Rate : 0.1934          
##    Detection Prevalence : 0.3836          
##       Balanced Accuracy : 0.6887          
##                                           
##        'Positive' Class : Bad             
##