Deep learning with H2O

# Pima Indians Diabetes Database
# Load the data set from the mlbench package and report its dimensions
# (rows, columns).
library(mlbench)
data(PimaIndiansDiabetes)
dim(PimaIndiansDiabetes)

## [1] 768   9

# Class labels of the outcome factor.
levels(PimaIndiansDiabetes$diabetes)

## [1] "neg" "pos"

# Preview the first rows of the raw data.
head(PimaIndiansDiabetes)

##   pregnant glucose pressure triceps insulin mass pedigree age diabetes
## 1        6     148       72      35       0 33.6    0.627  50      pos
## 2        1      85       66      29       0 26.6    0.351  31      neg
## 3        8     183       64       0       0 23.3    0.672  32      pos
## 4        1      89       66      23      94 28.1    0.167  21      neg
## 5        0     137       40      35     168 43.1    2.288  33      pos
## 6        5     116       74       0       0 25.6    0.201  30      neg

# Column types: eight numeric predictors followed by the `diabetes` factor.
str(PimaIndiansDiabetes)

## 'data.frame':    768 obs. of  9 variables:
##  $ pregnant: num  6 1 8 1 0 5 3 10 2 8 ...
##  $ glucose : num  148 85 183 89 137 116 78 115 197 125 ...
##  $ pressure: num  72 66 64 66 40 74 50 0 70 96 ...
##  $ triceps : num  35 29 0 23 35 0 32 0 45 0 ...
##  $ insulin : num  0 0 0 94 168 0 88 0 543 0 ...
##  $ mass    : num  33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
##  $ pedigree: num  0.627 0.351 0.672 0.167 2.288 ...
##  $ age     : num  50 31 32 21 33 30 26 29 53 54 ...
##  $ diabetes: Factor w/ 2 levels "neg","pos": 2 1 2 1 2 1 2 1 2 2 ...

# Standardize the eight predictor columns (everything except column 9, the
# `diabetes` outcome), then re-attach the outcome factor under the name `type`.
pima.scale <- as.data.frame(scale(PimaIndiansDiabetes[, -9]))
pima.scale[["type"]] <- PimaIndiansDiabetes[["diabetes"]]
str(pima.scale)

## 'data.frame':    768 obs. of  9 variables:
##  $ pregnant: num  0.64 -0.844 1.233 -0.844 -1.141 ...
##  $ glucose : num  0.848 -1.123 1.942 -0.998 0.504 ...
##  $ pressure: num  0.15 -0.16 -0.264 -0.16 -1.504 ...
##  $ triceps : num  0.907 0.531 -1.287 0.154 0.907 ...
##  $ insulin : num  -0.692 -0.692 -0.692 0.123 0.765 ...
##  $ mass    : num  0.204 -0.684 -1.103 -0.494 1.409 ...
##  $ pedigree: num  0.468 -0.365 0.604 -0.92 5.481 ...
##  $ age     : num  1.4251 -0.1905 -0.1055 -1.0409 -0.0205 ...
##  $ type    : Factor w/ 2 levels "neg","pos": 2 1 2 1 2 1 2 1 2 2 ...

# Show the working directory; the CSV below is written there (relative path).
getwd()

## [1] "C:/Users/HP/Documents"

# Persist the scaled data as a CSV (without row names) so it can be uploaded
# to H2O from disk, then attach the h2o package.
write.csv(pima.scale, file="pimaScale.csv", row.names=FALSE)
library(h2o)

## 
## ----------------------------------------------------------------------
## 
## Your next step is to start H2O:
##     > h2o.init()
## 
## For H2O package documentation, ask for help:
##     > ??h2o
## 
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit http://docs.h2o.ai
## 
## ----------------------------------------------------------------------

## 
## Attaching package: 'h2o'

## The following objects are masked from 'package:stats':
## 
##     cor, sd, var

## The following objects are masked from 'package:base':
## 
##     %*%, %in%, &&, ||, apply, as.factor, as.numeric, colnames,
##     colnames<-, ifelse, is.character, is.factor, is.numeric, log,
##     log10, log1p, log2, round, signif, trunc

# Connect to (or start) a local H2O cluster with default settings and keep the
# returned connection object.
localH2O <- h2o.init()

##  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         11 minutes 27 seconds 
##     H2O cluster version:        3.10.5.3 
##     H2O cluster version age:    1 month and 6 days  
##     H2O cluster name:           H2O_started_from_R_HP_ywg784 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   0.86 GB 
##     H2O cluster total cores:    4 
##     H2O cluster allowed cores:  4 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     R Version:                  R version 3.4.0 (2017-04-21)

# Print the details of the active H2O connection (address, port, session).
h2o.getConnection()

## IP Address: localhost 
## Port      : 54321 
## Session ID: _sid_ac3b 
## Key Count : 0

# Upload the CSV written earlier into the H2O cluster, storing it under the
# key "pima.hex".
path <- "pimaScale.csv"
pima.hex <- h2o.uploadFile(
  path = path,
  destination_frame = "pima.hex"
)

## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=================================================================| 100%

# The uploaded object is an H2OFrame handle rather than a local data.frame.
class(pima.hex)

## [1] "H2OFrame"

# Inspect the H2OFrame's attributes (id, dimensions, column types, preview).
str(pima.hex)

## Class 'H2OFrame' <environment: 0x0000000007fd5680> 
##  - attr(*, "op")= chr "Parse"
##  - attr(*, "id")= chr "pima.hex"
##  - attr(*, "eval")= logi FALSE
##  - attr(*, "nrow")= int 768
##  - attr(*, "ncol")= int 9
##  - attr(*, "types")=List of 9
##   ..$ : chr "real"
##   ..$ : chr "real"
##   ..$ : chr "real"
##   ..$ : chr "real"
##   ..$ : chr "real"
##   ..$ : chr "real"
##   ..$ : chr "real"
##   ..$ : chr "real"
##   ..$ : chr "enum"
##  - attr(*, "data")='data.frame': 10 obs. of  9 variables:
##   ..$ pregnant: num  0.64 -0.844 1.233 -0.844 -1.141 ...
##   ..$ glucose : num  0.848 -1.123 1.942 -0.998 0.504 ...
##   ..$ pressure: num  0.15 -0.16 -0.264 -0.16 -1.504 ...
##   ..$ triceps : num  0.907 0.531 -1.287 0.154 0.907 ...
##   ..$ insulin : num  -0.692 -0.692 -0.692 0.123 0.765 ...
##   ..$ mass    : num  0.204 -0.684 -1.103 -0.494 1.409 ...
##   ..$ pedigree: num  0.468 -0.365 0.604 -0.92 5.481 ...
##   ..$ age     : num  1.4251 -0.1905 -0.1055 -1.0409 -0.0205 ...
##   ..$ type    : Factor w/ 2 levels "neg","pos": 2 1 2 1 2 1 2 1 2 2

# Preview the first rows of the frame held in H2O.
head(pima.hex)

##     pregnant    glucose   pressure    triceps    insulin       mass
## 1  0.6395305  0.8477713  0.1495433  0.9066791 -0.6924393  0.2038799
## 2 -0.8443348 -1.1226647 -0.1604412  0.5305558 -0.6924393 -0.6839762
## 3  1.2330766  1.9424580 -0.2637694 -1.2873733 -0.6924393 -1.1025370
## 4 -0.8443348 -0.9975577 -0.1604412  0.1544326  0.1232213 -0.4937213
## 5 -1.1411079  0.5037269 -1.5037073  0.9066791  0.7653372  1.4088275
## 6  0.3427574 -0.1530851  0.2528715 -1.2873733 -0.6924393 -0.8108128
##     pedigree         age type
## 1  0.4681869  1.42506672  pos
## 2 -0.3648230 -0.19054773  neg
## 3  0.6040037 -0.10551539  pos
## 4 -0.9201630 -1.04087112  neg
## 5  5.4813370 -0.02048305  pos
## 6 -0.8175458 -0.27558007  neg

# Per-column summary computed by H2O; quantiles are approximate unless
# `exact_quantiles=TRUE` is passed.
summary(pima.hex)

## Warning in summary.H2OFrame(pima.hex): Approximated quantiles
## computed! If you are interested in exact quantiles, please pass the
## `exact_quantiles=TRUE` parameter.

##  pregnant             glucose              pressure            
##  Min.   :-1.141e+00   Min.   :-3.781e+00   Min.   :-3.570e+00  
##  1st Qu.:-8.485e-01   1st Qu.:-6.878e-01   1st Qu.:-3.683e-01  
##  Median :-2.532e-01   Median :-1.277e-01   Median : 1.485e-01  
##  Mean   :-8.559e-16   Mean   :-1.331e-16   Mean   : 1.115e-16  
##  3rd Qu.: 6.348e-01   3rd Qu.: 6.021e-01   3rd Qu.: 5.582e-01  
##  Max.   : 3.904e+00   Max.   : 2.443e+00   Max.   : 2.733e+00  
##  triceps              insulin              mass                
##  Min.   :-1.287e+00   Min.   :-6.924e-01   Min.   :-4.058e+00  
##  1st Qu.:-1.287e+00   1st Qu.:-6.924e-01   1st Qu.:-6.025e-01  
##  Median : 1.524e-01   Median :-4.318e-01   Median :-6.720e-03  
##  Mean   :-1.567e-16   Mean   : 5.584e-17   Mean   :-4.787e-17  
##  3rd Qu.: 7.172e-01   3rd Qu.: 4.105e-01   3rd Qu.: 5.805e-01  
##  Max.   : 4.919e+00   Max.   : 6.649e+00   Max.   : 4.453e+00  
##  pedigree             age                  type    
##  Min.   :-1.189e+00   Min.   :-1.041e+00   neg:500 
##  1st Qu.:-6.940e-01   1st Qu.:-7.858e-01   pos:268 
##  Median :-3.017e-01   Median :-3.623e-01           
##  Mean   : 3.524e-17   Mean   :-2.259e-16           
##  3rd Qu.: 4.600e-01   3rd Qu.: 6.581e-01           
##  Max.   : 5.880e+00   Max.   : 4.061e+00

# Split the rows roughly 70/30 using a seeded uniform random draw per row, and
# register each subset in the cluster under a fixed key.
rand <- h2o.runif(pima.hex, seed = 123)
train <- h2o.assign(pima.hex[rand <= 0.7, ], key = "train")
test <- h2o.assign(pima.hex[rand > 0.7, ], key = "test")

# Class counts of the outcome (column 9) in the training split.
h2o.table(train[,9])

##   type Count
## 1  neg   341
## 2  pos   185
## 
## [2 rows x 2 columns]

# Class counts of the outcome (column 9) in the test split.
h2o.table(test[,9])

##   type Count
## 1  neg   159
## 2  pos    83
## 
## [2 rows x 2 columns]

# List every argument of h2o.deeplearning together with its default value.
args(h2o.deeplearning)

## function (x, y, training_frame, model_id = NULL, validation_frame = NULL, 
##     nfolds = 0, keep_cross_validation_predictions = FALSE, keep_cross_validation_fold_assignment = FALSE, 
##     fold_assignment = c("AUTO", "Random", "Modulo", "Stratified"), 
##     fold_column = NULL, ignore_const_cols = TRUE, score_each_iteration = FALSE, 
##     weights_column = NULL, offset_column = NULL, balance_classes = FALSE, 
##     class_sampling_factors = NULL, max_after_balance_size = 5, 
##     max_hit_ratio_k = 0, checkpoint = NULL, pretrained_autoencoder = NULL, 
##     overwrite_with_best_model = TRUE, use_all_factor_levels = TRUE, 
##     standardize = TRUE, activation = c("Tanh", "TanhWithDropout", 
##         "Rectifier", "RectifierWithDropout", "Maxout", "MaxoutWithDropout"), 
##     hidden = c(200, 200), epochs = 10, train_samples_per_iteration = -2, 
##     target_ratio_comm_to_comp = 0.05, seed = -1, adaptive_rate = TRUE, 
##     rho = 0.99, epsilon = 1e-08, rate = 0.005, rate_annealing = 1e-06, 
##     rate_decay = 1, momentum_start = 0, momentum_ramp = 1e+06, 
##     momentum_stable = 0, nesterov_accelerated_gradient = TRUE, 
##     input_dropout_ratio = 0, hidden_dropout_ratios = NULL, l1 = 0, 
##     l2 = 0, max_w2 = 3.4028235e+38, initial_weight_distribution = c("UniformAdaptive", 
##         "Uniform", "Normal"), initial_weight_scale = 1, initial_weights = NULL, 
##     initial_biases = NULL, loss = c("Automatic", "CrossEntropy", 
##         "Quadratic", "Huber", "Absolute", "Quantile"), distribution = c("AUTO", 
##         "bernoulli", "multinomial", "gaussian", "poisson", "gamma", 
##         "tweedie", "laplace", "quantile", "huber"), quantile_alpha = 0.5, 
##     tweedie_power = 1.5, huber_alpha = 0.9, score_interval = 5, 
##     score_training_samples = 10000, score_validation_samples = 0, 
##     score_duty_cycle = 0.1, classification_stop = 0, regression_stop = 1e-06, 
##     stopping_rounds = 5, stopping_metric = c("AUTO", "deviance", 
##         "logloss", "MSE", "RMSE", "MAE", "RMSLE", "AUC", "lift_top_group", 
##         "misclassification", "mean_per_class_error"), stopping_tolerance = 0, 
##     max_runtime_secs = 0, score_validation_sampling = c("Uniform", 
##         "Stratified"), diagnostics = TRUE, fast_mode = TRUE, 
##     force_load_balance = TRUE, variable_importances = TRUE, replicate_training_data = TRUE, 
##     single_node_mode = FALSE, shuffle_training_data = FALSE, 
##     missing_values_handling = c("MeanImputation", "Skip"), quiet_mode = FALSE, 
##     autoencoder = FALSE, sparse = FALSE, col_major = FALSE, average_activation = 0, 
##     sparsity_beta = 0, max_categorical_features = 2147483647, 
##     reproducible = FALSE, export_weights_and_biases = FALSE, 
##     mini_batch_size = 1, categorical_encoding = c("AUTO", "Enum", 
##         "OneHotInternal", "OneHotExplicit", "Binary", "Eigen", 
##         "LabelEncoder", "SortByResponse", "EnumLimited"), elastic_averaging = FALSE, 
##     elastic_averaging_moving_rate = 0.9, elastic_averaging_regularization = 0.001) 
## NULL

# Fit a deep learning model on `train`, with predictors in columns 1-8 and the
# response in column 9; all unspecified settings keep their defaults.
# The held-out `test` frame is passed as the validation frame, so the
# "validation" metrics printed for this model are computed on it.
# NOTE(review): `reproducible` is left at its default (FALSE), so `seed` alone
# may not make repeated runs identical -- confirm against the H2O docs.
dlmodel <- h2o.deeplearning(
  x = 1:8,
  y = 9,
  training_frame = train,
  validation_frame = test,
  variable_importances = TRUE,
  seed = 123
)

## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |==============================================                   |  70%
  |                                                                       
  |=================================================================| 100%

# Print the model summary: layer table plus training and validation metrics.
dlmodel

## Model Details:
## ==============
## 
## H2OBinomialModel: deeplearning
## Model ID:  DeepLearning_model_R_1501960966284_7 
## Status of Neuron Layers: predicting type, 2-class classification, bernoulli distribution, CrossEntropy loss, 42,402 weights/biases, 506.0 KB, 5,260 training samples, mini-batch size 1
##   layer units      type dropout       l1       l2 mean_rate rate_rms
## 1     1     8     Input  0.00 %                                     
## 2     2   200 Rectifier  0.00 % 0.000000 0.000000  0.003426 0.001900
## 3     3   200 Rectifier  0.00 % 0.000000 0.000000  0.059771 0.143572
## 4     4     2   Softmax         0.000000 0.000000  0.001888 0.001131
##   momentum mean_weight weight_rms mean_bias bias_rms
## 1                                                   
## 2 0.000000   -0.004742   0.092210  0.391177 0.052695
## 3 0.000000   -0.006082   0.070035  0.984823 0.012891
## 4 0.000000   -0.029659   0.378094 -0.000008 0.000365
## 
## 
## H2OBinomialMetrics: deeplearning
## ** Reported on training data. **
## ** Metrics reported on full training frame **
## 
## MSE:  0.1774354
## RMSE:  0.4212309
## LogLoss:  0.6171341
## Mean Per-Class Error:  0.1956804
## AUC:  0.8825632
## Gini:  0.7651264
## 
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
##        neg pos    Error      Rate
## neg    261  80 0.234604   =80/341
## pos     29 156 0.156757   =29/185
## Totals 290 236 0.207224  =109/526
## 
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold    value idx
## 1                       max f1  0.041928 0.741093 227
## 2                       max f2  0.014685 0.829361 290
## 3                 max f0point5  0.146641 0.763975 154
## 4                 max accuracy  0.115444 0.823194 169
## 5                max precision  0.986608 1.000000   0
## 6                   max recall  0.000438 1.000000 390
## 7              max specificity  0.986608 1.000000   0
## 8             max absolute_mcc  0.113692 0.606873 171
## 9   max min_per_class_accuracy  0.054234 0.797654 213
## 10 max mean_per_class_accuracy  0.041928 0.804320 227
## 
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## H2OBinomialMetrics: deeplearning
## ** Reported on validation data. **
## ** Metrics reported on full validation frame **
## 
## MSE:  0.220593
## RMSE:  0.4696733
## LogLoss:  0.8821718
## Mean Per-Class Error:  0.2892703
## AUC:  0.7694173
## Gini:  0.5388346
## 
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
##        neg pos    Error     Rate
## neg     90  69 0.433962  =69/159
## pos     12  71 0.144578   =12/83
## Totals 102 140 0.334711  =81/242
## 
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold    value idx
## 1                       max f1  0.014637 0.636771 139
## 2                       max f2  0.004220 0.772994 178
## 3                 max f0point5  0.501319 0.646388  44
## 4                 max accuracy  0.501319 0.752066  44
## 5                max precision  0.994303 1.000000   0
## 6                   max recall  0.000051 1.000000 235
## 7              max specificity  0.994303 1.000000   0
## 8             max absolute_mcc  0.474066 0.415449  46
## 9   max min_per_class_accuracy  0.049525 0.698113 105
## 10 max mean_per_class_accuracy  0.021900 0.711525 130
## 
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`

# Full list of parameter values recorded on the fitted model, including the
# ones that were not set explicitly in the call.
dlmodel@allparameters

## $model_id
## [1] "DeepLearning_model_R_1501960966284_7"
## 
## $training_frame
## [1] "train"
## 
## $validation_frame
## [1] "test"
## 
## $nfolds
## [1] 0
## 
## $keep_cross_validation_predictions
## [1] FALSE
## 
## $keep_cross_validation_fold_assignment
## [1] FALSE
## 
## $fold_assignment
## [1] "AUTO"
## 
## $ignore_const_cols
## [1] TRUE
## 
## $score_each_iteration
## [1] FALSE
## 
## $balance_classes
## [1] FALSE
## 
## $max_after_balance_size
## [1] 5
## 
## $max_confusion_matrix_size
## [1] 20
## 
## $max_hit_ratio_k
## [1] 0
## 
## $overwrite_with_best_model
## [1] TRUE
## 
## $use_all_factor_levels
## [1] TRUE
## 
## $standardize
## [1] TRUE
## 
## $activation
## [1] "Rectifier"
## 
## $hidden
## [1] 200 200
## 
## $epochs
## [1] 10
## 
## $train_samples_per_iteration
## [1] -2
## 
## $target_ratio_comm_to_comp
## [1] 0.05
## 
## $seed
## [1] 123
## 
## $adaptive_rate
## [1] TRUE
## 
## $rho
## [1] 0.99
## 
## $epsilon
## [1] 1e-08
## 
## $rate
## [1] 0.005
## 
## $rate_annealing
## [1] 1e-06
## 
## $rate_decay
## [1] 1
## 
## $momentum_start
## [1] 0
## 
## $momentum_ramp
## [1] 1e+06
## 
## $momentum_stable
## [1] 0
## 
## $nesterov_accelerated_gradient
## [1] TRUE
## 
## $input_dropout_ratio
## [1] 0
## 
## $l1
## [1] 0
## 
## $l2
## [1] 0
## 
## $max_w2
## [1] 3.402823e+38
## 
## $initial_weight_distribution
## [1] "UniformAdaptive"
## 
## $initial_weight_scale
## [1] 1
## 
## $loss
## [1] "Automatic"
## 
## $distribution
## [1] "AUTO"
## 
## $quantile_alpha
## [1] 0.5
## 
## $tweedie_power
## [1] 1.5
## 
## $huber_alpha
## [1] 0.9
## 
## $score_interval
## [1] 5
## 
## $score_training_samples
## [1] 10000
## 
## $score_validation_samples
## [1] 0
## 
## $score_duty_cycle
## [1] 0.1
## 
## $classification_stop
## [1] 0
## 
## $regression_stop
## [1] 1e-06
## 
## $stopping_rounds
## [1] 5
## 
## $stopping_metric
## [1] "AUTO"
## 
## $stopping_tolerance
## [1] 0
## 
## $max_runtime_secs
## [1] 0
## 
## $score_validation_sampling
## [1] "Uniform"
## 
## $diagnostics
## [1] TRUE
## 
## $fast_mode
## [1] TRUE
## 
## $force_load_balance
## [1] TRUE
## 
## $variable_importances
## [1] TRUE
## 
## $replicate_training_data
## [1] TRUE
## 
## $single_node_mode
## [1] FALSE
## 
## $shuffle_training_data
## [1] FALSE
## 
## $missing_values_handling
## [1] "MeanImputation"
## 
## $quiet_mode
## [1] FALSE
## 
## $autoencoder
## [1] FALSE
## 
## $sparse
## [1] FALSE
## 
## $col_major
## [1] FALSE
## 
## $average_activation
## [1] 0
## 
## $sparsity_beta
## [1] 0
## 
## $max_categorical_features
## [1] 2147483647
## 
## $reproducible
## [1] FALSE
## 
## $export_weights_and_biases
## [1] FALSE
## 
## $mini_batch_size
## [1] 1
## 
## $categorical_encoding
## [1] "AUTO"
## 
## $elastic_averaging
## [1] FALSE
## 
## $elastic_averaging_moving_rate
## [1] 0.9
## 
## $elastic_averaging_regularization
## [1] 0.001
## 
## $x
## [1] "pregnant" "glucose"  "pressure" "triceps"  "insulin"  "mass"    
## [7] "pedigree" "age"     
## 
## $y
## [1] "type"

# Table of per-predictor importance scores stored on the fitted model.
dlmodel@model$variable_importances

## Variable Importances: 
##   variable relative_importance scaled_importance percentage
## 1  glucose            1.000000          1.000000   0.131849
## 2      age            0.998628          0.998628   0.131669
## 3 pressure            0.970514          0.970514   0.127962
## 4  insulin            0.970048          0.970048   0.127900
## 5     mass            0.956604          0.956604   0.126128
## 6 pregnant            0.948564          0.948564   0.125068
## 7 pedigree            0.895442          0.895442   0.118064
## 8  triceps            0.844606          0.844606   0.111361

# Score the test split with the fitted model; the result stays in the cluster.
dlPredict <- h2o.predict(dlmodel, newdata = test)

## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=================================================================| 100%

# Print the prediction frame: predicted class plus the `neg` and `pos`
# probability columns.
dlPredict

##   predict       neg          pos
## 1     neg 0.9998340 0.0001660238
## 2     pos 0.0626549 0.9373450986
## 3     neg 0.9998435 0.0001565213
## 4     pos 0.9659392 0.0340607757
## 5     pos 0.6810771 0.3189228946
## 6     neg 0.9941267 0.0058733392
## 
## [242 rows x 3 columns]

# Copy the predictions out of H2O into a local data.frame and preview them.
dlPred <- as.data.frame(dlPredict)
head(dlPred)

##   predict       neg          pos
## 1     neg 0.9998340 0.0001660238
## 2     pos 0.0626549 0.9373450986
## 3     neg 0.9998435 0.0001565213
## 4     pos 0.9659392 0.0340607757
## 5     pos 0.6810771 0.3189228946
## 6     neg 0.9941267 0.0058733392

Deep learning with H2O

Kushan De Silva

August 5, 2017