The data consist of surveys administered to people who live near a community health facility. The goal here is to predict the rate of hypertension among the residents given predictors such as age, race, gender, education and employment status.

# Run this script in a fresh R session instead of clearing the workspace in
# code. The former `rm(list = c())` call removed nothing: `c()` is NULL, so the
# `list` argument was empty and no objects were ever deleted.
library(h2o)
library(h2oEnsemble)
library(tidyverse)
# Start (or attach to) a local H2O cluster.
h2o.init(
  max_mem_size = "8G",  # upper bound on the memory H2O may allocate
  nthreads = -1         # -1 lets H2O use every core on this machine
)
 Connection successful!

R is connected to the H2O cluster: 
    H2O cluster uptime:         15 minutes 23 seconds 
    H2O cluster version:        3.14.0.3 
    H2O cluster version age:    15 days  
    H2O cluster name:           H2O_started_from_R_nanaakwasiabayieboateng_qlb223 
    H2O cluster total nodes:    1 
    H2O cluster total memory:   6.70 GB 
    H2O cluster total cores:    8 
    H2O cluster allowed cores:  8 
    H2O cluster healthy:        TRUE 
    H2O Connection ip:          localhost 
    H2O Connection port:        54321 
    H2O Connection proxy:       NA 
    H2O Internal Security:      FALSE 
    H2O API Extensions:         XGBoost, Algos, AutoML, Core V3, Core V4 
    R Version:                  R version 3.4.1 (2017-06-30) 
# Connect once more, naming host and port explicitly, and keep the returned
# connection object in `localH2O`. The status printed below shows the same
# cluster name and uptime as the first call, so this appears to attach to the
# cluster that is already running.
localH2O <- h2o.init(
  ip = "localhost",
  port = 54321,
  nthreads = -1,
  max_mem_size = "8G"
)
 Connection successful!

R is connected to the H2O cluster: 
    H2O cluster uptime:         15 minutes 23 seconds 
    H2O cluster version:        3.14.0.3 
    H2O cluster version age:    15 days  
    H2O cluster name:           H2O_started_from_R_nanaakwasiabayieboateng_qlb223 
    H2O cluster total nodes:    1 
    H2O cluster total memory:   6.70 GB 
    H2O cluster total cores:    8 
    H2O cluster allowed cores:  8 
    H2O cluster healthy:        TRUE 
    H2O Connection ip:          localhost 
    H2O Connection port:        54321 
    H2O Connection proxy:       NA 
    H2O Internal Security:      FALSE 
    H2O API Extensions:         XGBoost, Algos, AutoML, Core V3, Core V4 
    R Version:                  R version 3.4.1 (2017-06-30) 
# Absolute path to the survey CSV on the author's machine; adjust it before
# running the script anywhere else.
loan_csv <- "/Users/nanaakwasiabayieboateng/Documents/memphisclassesbooks/DataMiningscience/UICProject/Workbook3.csv"
# Upload and parse the CSV into an H2OFrame on the cluster.
datachicago <- h2o.importFile(path = loan_csv)

  |                                                                                               
  |                                                                                         |   0%
  |                                                                                               
  |=========================================================================================| 100%
# Number of rows and columns in the imported frame.
dim(datachicago)
[1] 454  17
# Preview the first rows, then list the column names.
head(datachicago)
h2o.names(datachicago)
 [1] "ID"                "Gender"            "Age"               "Hypertension"     
 [5] "Employment"        "Hispanic"          "Education"         "Hispanic_ancestry"
 [9] "race_white"        "race_black"        "race_Asian"        "race_pacific"     
[13] "race_Amer_Indian"  "race_other"        "race_dontknow"     "race_refused"     
[17] "Asian_ancestry"   
# Inspect the H2OFrame: the parsed type of every column plus a short preview
# of the data.
str(datachicago)
Class 'H2OFrame' <environment: 0x11413cdb8> 
 - attr(*, "op")= chr "Parse"
 - attr(*, "id")= chr "Workbook3.hex_sid_9881_8"
 - attr(*, "eval")= logi FALSE
 - attr(*, "nrow")= int 454
 - attr(*, "ncol")= int 17
 - attr(*, "types")=List of 17
  ..$ : chr "int"
  ..$ : chr "enum"
  ..$ : chr "int"
  ..$ : chr "enum"
  ..$ : chr "enum"
  ..$ : chr "enum"
  ..$ : chr "enum"
  ..$ : chr "enum"
  ..$ : chr "enum"
  ..$ : chr "enum"
  ..$ : chr "enum"
  ..$ : chr "enum"
  ..$ : chr "enum"
  ..$ : chr "enum"
  ..$ : chr "enum"
  ..$ : chr "enum"
  ..$ : chr "enum"
 - attr(*, "data")='data.frame':    10 obs. of  17 variables:
  ..$ ID               : num  1 2 3 4 5 6 7 8 9 10
  ..$ Gender           : Factor w/ 2 levels "Female","Male": 2 1 2 1 1 1 2 1 1 2
  ..$ Age              : num  24 76 47 42 24 39 31 25 33 32
  ..$ Hypertension     : Factor w/ 4 levels "NA","NO CODED RESPONSE APPLICABLE (SPECIFY)",..: 4 4 3 3 3 3 4 3 3 3
  ..$ Employment       : Factor w/ 9 levels "Employed for wages,",..: 8 7 1 1 1 1 1 6 1 6
  ..$ Hispanic         : Factor w/ 3 levels "NO CODED RESPONSE APPLICABLE (SPECIFY)",..: 2 2 2 2 2 2 2 2 2 2
  ..$ Education        : Factor w/ 24 levels "10th Grade","11th Grade",..: 15 19 15 15 15 20 15 15 20 20
  ..$ Hispanic_ancestry: Factor w/ 7 levels ".","Ecuadorian, or",..: 1 1 1 1 1 1 1 1 1 1
  ..$ race_white       : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2
  ..$ race_black       : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1
  ..$ race_Asian       : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1
  ..$ race_pacific     : Factor w/ 1 level "No": 1 1 1 1 1 1 1 1 1 1
  ..$ race_Amer_Indian : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1
  ..$ race_other       : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1
  ..$ race_dontknow    : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1
  ..$ race_refused     : Factor w/ 4 levels ".","Chinese,",..: 3 3 3 3 3 3 3 3 3 3
  ..$ Asian_ancestry   : Factor w/ 5 levels ".","Asian Indian,",..: 1 1 1 1 1 1 1 1 1 1
#==========================================================================================================
# Look at the structure of the data with str() and with the glimpse()
# function from the dplyr package
#==========================================================================================================
# Structure of the R-side copy of the survey data.
# NOTE(review): `data` is not created anywhere in the visible script (only the
# H2OFrame `datachicago` is); the output below reports a data.table, so it was
# presumably read separately, e.g. with data.table::fread() -- confirm and add
# that step.
str(data)
Classes ‘data.table’ and 'data.frame':  434 obs. of  17 variables:
 $ ID               : int  1 2 3 4 5 6 7 8 9 10 ...
 $ Gender           : chr  "Male" "Female" "Male" "Female" ...
 $ Age              : num  24 76 47 42 24 39 31 25 33 32 ...
 $ Hypertension     : chr  "Yes" "Yes" "No" "No" ...
 $ Employment       : chr  "Self-employed," "Primarily retired, or" "Employed for wages," "Employed for wages," ...
 $ Hispanic         : chr  "No" "No" "No" "No" ...
 $ Education        : chr  "Bachelor's Degree (Example: BA, AB, BS, BBA)" "High School Graduate" "Bachelor's Degree (Example: BA, AB, BS, BBA)" "Bachelor's Degree (Example: BA, AB, BS, BBA)" ...
 $ Hispanic_ancestry: chr  "." "." "." "." ...
 $ race_white       : chr  "Yes" "Yes" "Yes" "Yes" ...
 $ race_black       : chr  "No" "No" "No" "No" ...
 $ race_Asian       : chr  "No" "No" "No" "No" ...
 $ race_pacific     : chr  "No" "No" "No" "No" ...
 $ race_Amer_Indian : chr  "No" "No" "No" "No" ...
 $ race_other       : chr  "No" "No" "No" "No" ...
 $ race_dontknow    : chr  "No" "No" "No" "No" ...
 $ race_refused     : chr  "No" "No" "No" "No" ...
 $ Asian_ancestry   : chr  "<NA>" "<NA>" "<NA>" "<NA>" ...
 - attr(*, ".internal.selfref")=<externalptr> 
# Compact overview: one line per column with its type and leading values.
dplyr::glimpse(data)
Observations: 434
Variables: 17
$ ID                <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 2...
$ Gender            <chr> "Male", "Female", "Male", "Female", "Female", "Female", "Male", "Fem...
$ Age               <dbl> 24, 76, 47, 42, 24, 39, 31, 25, 33, 32, 24, 34, 26, 31, 24, 26, 34, ...
$ Hypertension      <chr> "Yes", "Yes", "No", "No", "No", "No", "Yes", "No", "No", "No", "No",...
$ Employment        <chr> "Self-employed,", "Primarily retired, or", "Employed for wages,", "E...
$ Hispanic          <chr> "No", "No", "No", "No", "No", "No", "No", "No", "No", "No", "No", "N...
$ Education         <chr> "Bachelor's Degree (Example: BA, AB, BS, BBA)", "High School Graduat...
$ Hispanic_ancestry <chr> ".", ".", ".", ".", ".", ".", ".", ".", ".", ".", ".", ".", ".", "."...
$ race_white        <chr> "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes"...
$ race_black        <chr> "No", "No", "No", "No", "No", "No", "No", "No", "No", "No", "No", "N...
$ race_Asian        <chr> "No", "No", "No", "No", "No", "No", "No", "No", "No", "No", "No", "N...
$ race_pacific      <chr> "No", "No", "No", "No", "No", "No", "No", "No", "No", "No", "No", "N...
$ race_Amer_Indian  <chr> "No", "No", "No", "No", "No", "No", "No", "No", "No", "No", "No", "N...
$ race_other        <chr> "No", "No", "No", "No", "No", "No", "No", "No", "No", "No", "No", "N...
$ race_dontknow     <chr> "No", "No", "No", "No", "No", "No", "No", "No", "No", "No", "No", "N...
$ race_refused      <chr> "No", "No", "No", "No", "No", "No", "No", "No", "No", "No", "No", "N...
$ Asian_ancestry    <chr> "<NA>", "<NA>", "<NA>", "<NA>", "<NA>", "<NA>", "<NA>", "<NA>", "<NA...
# Per-column summary (quantiles for numeric columns, length/class for
# character columns).
summary(data)
       ID           Gender               Age        Hypertension        Employment       
 Min.   :  1.0   Length:434         Min.   :19.00   Length:434         Length:434        
 1st Qu.:110.2   Class :character   1st Qu.:29.00   Class :character   Class :character  
 Median :221.5   Mode  :character   Median :39.00   Mode  :character   Mode  :character  
 Mean   :221.6                      Mean   :42.49                                        
 3rd Qu.:331.8                      3rd Qu.:54.75                                        
 Max.   :444.0                      Max.   :91.00                                        
   Hispanic          Education         Hispanic_ancestry   race_white         race_black       
 Length:434         Length:434         Length:434         Length:434         Length:434        
 Class :character   Class :character   Class :character   Class :character   Class :character  
 Mode  :character   Mode  :character   Mode  :character   Mode  :character   Mode  :character  
                                                                                               
                                                                                               
                                                                                               
  race_Asian        race_pacific       race_Amer_Indian    race_other        race_dontknow     
 Length:434         Length:434         Length:434         Length:434         Length:434        
 Class :character   Class :character   Class :character   Class :character   Class :character  
 Mode  :character   Mode  :character   Mode  :character   Mode  :character   Mode  :character  
                                                                                               
                                                                                               
                                                                                               
 race_refused       Asian_ancestry    
 Length:434         Length:434        
 Class :character   Class :character  
 Mode  :character   Mode  :character  
                                      
                                      
                                      
#==========================================================================================================
#count the missing values (NA) in each column
#==========================================================================================================
# Count the NA values in every column. The generic `is.na()` is called so that
# S3 dispatch picks the data-frame method, rather than naming the method
# `is.na.data.frame()` directly. Placeholder strings such as "." are ordinary
# text, not NA, and are therefore not counted here.
colSums(is.na(data))
               ID            Gender               Age      Hypertension        Employment 
                0                 0                 0                 0                 0 
         Hispanic         Education Hispanic_ancestry        race_white        race_black 
                0                 0                 0                 0                 0 
       race_Asian      race_pacific  race_Amer_Indian        race_other     race_dontknow 
                0                 0                 0                 0                 0 
     race_refused    Asian_ancestry 
                0                 0 
# Show the first few rows that contain at least one real NA.
head(data[!complete.cases(data), ])
# Show the rows whose Asian_ancestry holds something other than the "."
# placeholder.
data[which(data$Asian_ancestry != "."), ]
# Recode the "." placeholder as the literal string "<NA>". This is a text
# label, not a real NA value, so it later becomes an ordinary factor level.
data$Asian_ancestry <- ifelse(
  data$Asian_ancestry == ".",
  "<NA>",
  data$Asian_ancestry
)
# Keep only rows without any real NA. The "<NA>" label above is a string, so
# it does not cause rows to be dropped here.
data <- data[complete.cases(data), ]
#==========================================================================================================
#Remove rows whose answer is a "NO CODED RESPONSE APPLICABLE" code
#==========================================================================================================
#data%>%dplyr::filter(str_detect(Hypertension, "NO CODED RESPONSE APPLICABLE (SPECIFY)"))
# Build the modelling table: drop the ID column, remove respondents whose
# Hypertension, Employment, Hispanic or Education answer is one of the
# "NO CODED RESPONSE APPLICABLE" codes, and turn every character column into
# a factor.
ndata <- data %>%
  dplyr::select(-ID) %>%
  dplyr::filter(
    Hypertension != "NO CODED RESPONSE APPLICABLE (SPECIFY)",
    Employment != "NO CODED RESPONSE APPLICABLE (LEAVE NOTE FIRST)",
    Hispanic != "NO CODED RESPONSE APPLICABLE (SPECIFY)",
    Education != "NO CODED RESPONSE APPLICABLE (SPECIFY)"
  ) %>%
  mutate_if(is.character, as.factor)
str(ndata)
'data.frame':   424 obs. of  16 variables:
 $ Gender           : Factor w/ 2 levels "Female","Male": 2 1 2 1 1 1 2 1 1 2 ...
 $ Age              : num  24 76 47 42 24 39 31 25 33 32 ...
 $ Hypertension     : Factor w/ 2 levels "No","Yes": 2 2 1 1 1 1 2 1 1 1 ...
 $ Employment       : Factor w/ 8 levels "Employed for wages,",..: 7 6 1 1 1 1 1 5 1 5 ...
 $ Hispanic         : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
 $ Education        : Factor w/ 23 levels "10th Grade","11th Grade",..: 15 19 15 15 15 20 15 15 20 20 ...
 $ Hispanic_ancestry: Factor w/ 5 levels ".","Ecuadorian, or",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ race_white       : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
 $ race_black       : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
 $ race_Asian       : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
 $ race_pacific     : Factor w/ 1 level "No": 1 1 1 1 1 1 1 1 1 1 ...
 $ race_Amer_Indian : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
 $ race_other       : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
 $ race_dontknow    : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
 $ race_refused     : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
 $ Asian_ancestry   : Factor w/ 4 levels "<NA>","Asian Indian,",..: 1 1 1 1 1 1 1 1 1 1 ...
#==========================================================================================================
# import R object to the H2O cloud.
#convert r data to h2o object
#==========================================================================================================
# Copy the cleaned R data frame to the H2O cluster as an H2OFrame.
datah20 <- as.h2o(ndata)

  |                                                                                               
  |                                                                                         |   0%
  |                                                                                               
  |=========================================================================================| 100%
# Confirm the upload: column types as H2O sees them plus a short data preview.
str(datah20)
Class 'H2OFrame' <environment: 0x10ce545c8> 
 - attr(*, "op")= chr "Parse"
 - attr(*, "id")= chr "ndata"
 - attr(*, "eval")= logi FALSE
 - attr(*, "nrow")= int 424
 - attr(*, "ncol")= int 16
 - attr(*, "types")=List of 16
  ..$ : chr "enum"
  ..$ : chr "int"
  ..$ : chr "enum"
  ..$ : chr "enum"
  ..$ : chr "enum"
  ..$ : chr "enum"
  ..$ : chr "enum"
  ..$ : chr "enum"
  ..$ : chr "enum"
  ..$ : chr "enum"
  ..$ : chr "enum"
  ..$ : chr "enum"
  ..$ : chr "enum"
  ..$ : chr "enum"
  ..$ : chr "enum"
  ..$ : chr "enum"
 - attr(*, "data")='data.frame':    10 obs. of  16 variables:
  ..$ Gender           : Factor w/ 2 levels "Female","Male": 2 1 2 1 1 1 2 1 1 2
  ..$ Age              : num  24 76 47 42 24 39 31 25 33 32
  ..$ Hypertension     : Factor w/ 2 levels "No","Yes": 2 2 1 1 1 1 2 1 1 1
  ..$ Employment       : Factor w/ 8 levels "Employed for wages,",..: 7 6 1 1 1 1 1 5 1 5
  ..$ Hispanic         : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1
  ..$ Education        : Factor w/ 23 levels "10th Grade","11th Grade",..: 15 19 15 15 15 20 15 15 20 20
  ..$ Hispanic_ancestry: Factor w/ 5 levels ".","Ecuadorian, or",..: 1 1 1 1 1 1 1 1 1 1
  ..$ race_white       : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2
  ..$ race_black       : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1
  ..$ race_Asian       : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1
  ..$ race_pacific     : Factor w/ 1 level "No": 1 1 1 1 1 1 1 1 1 1
  ..$ race_Amer_Indian : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1
  ..$ race_other       : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1
  ..$ race_dontknow    : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1
  ..$ race_refused     : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1
  ..$ Asian_ancestry   : Factor w/ 4 levels "<NA>","Asian Indian,",..: 1 1 1 1 1 1 1 1 1 1
#==========================================================================================================
# Partition the data into training, validation and test sets
#==========================================================================================================
# Randomly split the frame into three parts: the two ratios give the shares of
# the first two parts and the remainder forms the third.
splits <- h2o.splitFrame(
  data = datah20,
  ratios = c(0.7, 0.15),  # roughly 70% / 15% / 15%
  seed = 1                # fixed seed so the split can be reproduced
)
train <- splits[[1]]
valid <- splits[[2]]
test <- splits[[3]]

# Response column, and the predictors (every other column of the frame).
y <- "Hypertension"
x <- setdiff(names(datah20), y)
#==========================================================================================================
#glm/logistic
#similar to R's glm, h2o.glm has the family argument
# 1. Let's start with a basic binomial Generalized Linear Model
# By default, h2o.glm uses a regularized, elastic net model
#==========================================================================================================
# Baseline binomial GLM fitted on the training split with default settings.
glm_fit1 <- h2o.glm(
  model_id = "glm_fit1",
  family = "binomial",
  training_frame = train,
  x = x,
  y = y
)
Dropping bad and constant columns: [race_pacific].

  |                                                                                               
  |                                                                                         |   0%
  |                                                                                               
  |=========================================================================================| 100%
#====================================================================================================================================
# Next we will do some automatic tuning by passing in a validation frame and setting 
# `lambda_search = TRUE`.  Since we are training a GLM with regularization, we should 
# try to find the right amount of regularization (to avoid overfitting).  The model 
# parameter, `lambda`, controls the amount of regularization in a GLM model and we can 
# find the optimal value for `lambda` automatically by setting `lambda_search = TRUE` 
# and passing in a validation frame (which is used to evaluate model performance using a 
# particular value of lambda).
#=====================================================================================================================================
# Second binomial GLM: same predictors and response, but with a lambda search
# turned on and the validation split supplied alongside the training split.
glm_fit2 <- h2o.glm(
  model_id = "glm_fit2",
  family = "binomial",
  lambda_search = TRUE,
  training_frame = train,
  validation_frame = valid,
  x = x,
  y = y
)
Dropping bad and constant columns: [race_pacific].

  |                                                                                               
  |                                                                                         |   0%
  |                                                                                               
  |=================                                                                        |  19%
  |                                                                                               
  |=========================================================================================| 100%
#==========================================================================================================
# Let's compare the performance of the two GLMs
#==========================================================================================================
# Score both GLMs on the held-out test split, then print the metrics of the
# first model.
glm_perf1 <- h2o.performance(glm_fit1, newdata = test)
glm_perf2 <- h2o.performance(glm_fit2, newdata = test)
glm_perf1
H2OBinomialMetrics: glm

MSE:  0.2021788
RMSE:  0.449643
LogLoss:  0.5969429
Mean Per-Class Error:  0.3113984
AUC:  0.7079906
Gini:  0.4159812
R^2:  0.1447194
Residual Deviance:  71.63315
AIC:  79.63315

Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
       No Yes    Error    Rate
No     22  15 0.405405  =15/37
Yes     5  18 0.217391   =5/23
Totals 27  33 0.333333  =20/60

Maximum Metrics: Maximum metrics at their respective thresholds
                        metric threshold    value idx
1                       max f1  0.239391 0.642857  27
2                       max f2  0.131403 0.782313  40
3                 max f0point5  0.550961 0.714286   9
4                 max accuracy  0.550961 0.750000   9
5                max precision  0.728177 1.000000   0
6                   max recall  0.131403 1.000000  40
7              max specificity  0.728177 1.000000   0
8             max absolute_mcc  0.550961 0.475239   9
9   max min_per_class_accuracy  0.320071 0.648649  22
10 max mean_per_class_accuracy  0.273119 0.693890  24

Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
# Print the test-set metrics of the lambda-search GLM.
glm_perf2
H2OBinomialMetrics: glm

MSE:  0.1965816
RMSE:  0.4433753
LogLoss:  0.5866738
Mean Per-Class Error:  0.3143361
AUC:  0.7285546
Gini:  0.4571093
R^2:  0.1683973
Residual Deviance:  70.40085
AIC:  98.40085

Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
       No Yes    Error    Rate
No     25  12 0.324324  =12/37
Yes     7  16 0.304348   =7/23
Totals 32  28 0.316667  =19/60

Maximum Metrics: Maximum metrics at their respective thresholds
                        metric threshold    value idx
1                       max f1  0.282829 0.627451  27
2                       max f2  0.088373 0.782313  53
3                 max f0point5  0.397360 0.689655  15
4                 max accuracy  0.397360 0.750000  15
5                max precision  0.814130 1.000000   0
6                   max recall  0.088373 1.000000  53
7              max specificity  0.814130 1.000000   0
8             max absolute_mcc  0.397360 0.454770  15
9   max min_per_class_accuracy  0.282829 0.675676  27
10 max mean_per_class_accuracy  0.397360 0.706816  15

Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
#==========================================================================================================
# Instead of printing the entire model performance metrics object, 
# it is probably easier to print just the metric that you are interested in comparing.
# Retrieve test set AUC
#==========================================================================================================
# Test-set AUC of the baseline GLM.
h2o.auc(glm_perf1)  
[1] 0.7079906
# Test-set AUC of the lambda-search GLM.
h2o.auc(glm_perf2)  
[1] 0.7285546
#==========================================================================================================
# Compare test AUC to the training AUC and validation AUC
#==========================================================================================================
# AUC of glm_fit2 measured on its training data.
h2o.auc(glm_fit2, train = TRUE)  
[1] 0.8268307
# AUC of glm_fit2 measured on the validation frame.
h2o.auc(glm_fit2, valid = TRUE) 
[1] 0.8009756
# Full set of validation metrics stored on the fitted model object.
glm_fit2@model$validation_metrics  
H2OBinomialMetrics: glm
** Reported on validation data. **

MSE:  0.175042
RMSE:  0.4183802
LogLoss:  0.522122
Mean Per-Class Error:  0.2443902
AUC:  0.8009756
Gini:  0.6019512
R^2:  0.2561143
Residual Deviance:  68.9201
AIC:  96.9201

Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
       No Yes    Error    Rate
No     39   2 0.048780   =2/41
Yes    11  14 0.440000  =11/25
Totals 50  16 0.196970  =13/66

Maximum Metrics: Maximum metrics at their respective thresholds
                        metric threshold    value idx
1                       max f1  0.656748 0.682927  15
2                       max f2  0.176934 0.789474  51
3                 max f0point5  0.706704 0.821918  11
4                 max accuracy  0.706704 0.803030  11
5                max precision  0.917858 1.000000   0
6                   max recall  0.124401 1.000000  58
7              max specificity  0.917858 1.000000   0
8             max absolute_mcc  0.706704 0.603692  11
9   max min_per_class_accuracy  0.467950 0.720000  28
10 max mean_per_class_accuracy  0.656748 0.755610  15

Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
#==========================================================================================================
# 2. Random Forest
# H2O's Random Forest (RF) implements a distributed version of the standard 
# Random Forest algorithm and variable importance measures.
# First we will train a basic Random Forest model with default parameters. 
# The Random Forest model will infer the response distribution from the response encoding. 
# A seed is required for reproducibility.
#==========================================================================================================
# Random forest with default settings, trained on the training split.
rf_fit1 <- h2o.randomForest(
  model_id = "rf_fit1",
  training_frame = train,
  x = x,
  y = y,
  seed = 1  # fixed seed for reproducibility
)
Dropping bad and constant columns: [race_pacific].

  |                                                                                               
  |                                                                                         |   0%
  |                                                                                               
  |==================                                                                       |  20%
  |                                                                                               
  |=========================================================================================| 100%
#==========================================================================================================
# Next we will increase the number of trees used in the forest by setting `ntrees = 100`.  
# The default number of trees in an H2O Random Forest is 50, so this RF will be twice as 
# big as the default.  Usually increasing the number of trees in a RF will increase 
# performance as well.  Unlike Gradient Boosting Machines (GBMs), Random Forests are fairly 
# resistant to (although not free from) overfitting.
# See the GBM example below for additional guidance on preventing overfitting using H2O's 
# early stopping functionality.
#==========================================================================================================
# Same random forest, but grown with 100 trees.
rf_fit2 <- h2o.randomForest(
  model_id = "rf_fit2",
  training_frame = train,
  # validation_frame = valid,  # only used if stopping_rounds > 0
  x = x,
  y = y,
  ntrees = 100,
  seed = 1  # fixed seed for reproducibility
)
Dropping bad and constant columns: [race_pacific].

  |                                                                                               
  |                                                                                         |   0%
  |                                                                                               
  |=====                                                                                    |   6%
  |                                                                                               
  |================================================================                         |  72%
  |                                                                                               
  |=========================================================================================| 100%
#==========================================================================================================
# Let's compare the performance of the two RFs
#==========================================================================================================
# Score both random forests on the held-out test split.
rf_perf1 <- h2o.performance(rf_fit1, newdata = test)
rf_perf2 <- h2o.performance(rf_fit2, newdata = test)
#==========================================================================================================
# Print model performance
#==========================================================================================================
# Test-set metrics of the default random forest.
rf_perf1
H2OBinomialMetrics: drf

MSE:  0.2063654
RMSE:  0.4542746
LogLoss:  0.6252983
Mean Per-Class Error:  0.2984724
AUC:  0.6921269
Gini:  0.3842538

Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
       No Yes    Error    Rate
No     31   6 0.162162   =6/37
Yes    10  13 0.434783  =10/23
Totals 41  19 0.266667  =16/60

Maximum Metrics: Maximum metrics at their respective thresholds
                        metric threshold    value idx
1                       max f1  0.365918 0.619048  18
2                       max f2  0.049115 0.761589  58
3                 max f0point5  0.365918 0.656566  18
4                 max accuracy  0.365918 0.733333  18
5                max precision  0.852486 1.000000   0
6                   max recall  0.049115 1.000000  58
7              max specificity  0.852486 1.000000   0
8             max absolute_mcc  0.615149 0.422781   5
9   max min_per_class_accuracy  0.300039 0.652174  25
10 max mean_per_class_accuracy  0.365918 0.701528  18

Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
# Test-set metrics of the 100-tree random forest.
rf_perf2
H2OBinomialMetrics: drf

MSE:  0.2098127
RMSE:  0.4580531
LogLoss:  0.6416177
Mean Per-Class Error:  0.3225617
AUC:  0.6780259
Gini:  0.3560517

Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
       No Yes    Error    Rate
No     26  11 0.297297  =11/37
Yes     8  15 0.347826   =8/23
Totals 34  26 0.316667  =19/60

Maximum Metrics: Maximum metrics at their respective thresholds
                        metric threshold    value idx
1                       max f1  0.294974 0.612245  25
2                       max f2  0.033693 0.756579  59
3                 max f0point5  0.626739 0.638298   5
4                 max accuracy  0.626739 0.716667   5
5                max precision  0.810297 1.000000   0
6                   max recall  0.033693 1.000000  59
7              max specificity  0.810297 1.000000   0
8             max absolute_mcc  0.626739 0.422781   5
9   max min_per_class_accuracy  0.294974 0.652174  25
10 max mean_per_class_accuracy  0.420257 0.679788  17

Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
#==========================================================================================================
# Retrieve test set AUC
#==========================================================================================================
# Test-set AUC of the default random forest.
h2o.auc(rf_perf1)  
[1] 0.6921269
# Test-set AUC of the 100-tree random forest.
h2o.auc(rf_perf2)  
[1] 0.6780259
#==========================================================================================================
# Cross-validate performance
# Rather than using held-out test set to evaluate model performance, a user may wish 
# to estimate model performance using cross-validation. Using the RF algorithm 
# (with default model parameters) as an example, we demonstrate how to perform k-fold 
# cross-validation using H2O. No custom code or loops are required, you simply specify 
# the number of desired folds in the nfolds argument.
# Cross-validation does not need a separate test set, so the full dataset (the H2OFrame 
# `datah20`) could be used as training_frame; the call below still trains on the `train` 
# split. Note that this will take approximately k (nfolds) times longer than training a 
# single RF model, since it will train k models in the cross-validation process (trained 
# on n(k-1)/k rows), in addition to the final model trained on the full training_frame 
# dataset with n rows.
#==========================================================================================================
# Random forest evaluated with 5-fold cross-validation; apart from the seed
# and the fold count, no model parameters are set.
rf_fit3 <- h2o.randomForest(
  x = x,
  y = y,
  training_frame = train,
  nfolds = 5,            # number of cross-validation folds
  seed = 1,              # fixed seed for reproducibility
  model_id = "rf_fit3"
)
Dropping bad and constant columns: [race_pacific].

  |                                                                                               
  |                                                                                         |   0%
  |                                                                                               
  |======                                                                                   |   7%
  |                                                                                               
  |==========================================                                               |  47%
  |                                                                                               
  |=====================================================================                    |  78%
  |                                                                                               
  |=========================================================================================| 100%
#==========================================================================================================
# To evaluate the cross-validated AUC, do the following:
#==========================================================================================================
h2o.auc(rf_fit3, xval = TRUE)  
[1] 0.789991
#==========================================================================================================
# 3. Gradient Boosting Machine
# H2O's Gradient Boosting Machine (GBM) offers a Stochastic GBM, which can 
# increase performance quite a bit compared to the original GBM implementation.
# Now we will train a basic GBM model
# The GBM model will infer the response distribution from the response encoding if not specified 
# explicitly through the `distribution` argument. A seed is required for reproducibility.
#==========================================================================================================
# Baseline GBM: only the seed is fixed, nothing else is tuned.
gbm_fit1 <- h2o.gbm(
  x = x,
  y = y,
  training_frame = train,
  seed = 1,
  model_id = "gbm_fit1"
)
Dropping bad and constant columns: [race_pacific].

  |                                                                                               
  |                                                                                         |   0%
  |                                                                                               
  |============                                                                             |  14%
  |                                                                                               
  |=========================================================================================| 100%
#==========================================================================================================
# Next we will increase the number of trees used in the GBM by setting `ntrees=500`.  
# The default number of trees in an H2O GBM is 50, so this GBM will be trained using ten times 
# the default.  Increasing the number of trees in a GBM is one way to increase performance 
# of the model, however, you have to be careful not to overfit your model to the training data 
# by using too many trees.  To automatically find the optimal number of trees, you must use 
# H2O's early stopping functionality.  This example will not do that, however, the following 
# example will.
#==========================================================================================================
# GBM with 500 trees and no early stopping.
# No validation_frame is passed: it would only be used if stopping_rounds > 0.
gbm_fit2 <- h2o.gbm(
  x = x,
  y = y,
  training_frame = train,
  ntrees = 500,
  seed = 1,
  model_id = "gbm_fit2"
)
Dropping bad and constant columns: [race_pacific].

  |                                                                                               
  |                                                                                         |   0%
  |                                                                                               
  |=                                                                                        |   1%
  |                                                                                               
  |=============                                                                            |  15%
  |                                                                                               
  |====================                                                                     |  23%
  |                                                                                               
  |==========================                                                               |  29%
  |                                                                                               
  |=====================================                                                    |  41%
  |                                                                                               
  |=========================================================================================| 100%
#============================================================================================
# We will again set `ntrees = 500`, however, this time we will use early stopping in order to 
# prevent overfitting (from too many trees).  All of H2O's algorithms have early stopping available, 
# however early stopping is not enabled by default (with the exception of Deep Learning).  
# There are several parameters that should be used to control early stopping.  The three that are 
# common to all the algorithms are: `stopping_rounds`, `stopping_metric` and `stopping_tolerance`.  
# The stopping metric is the metric by which you'd like to measure performance, and so we will choose 
# AUC here.  The `score_tree_interval` is a parameter specific to the Random Forest model and the GBM.  
# Setting `score_tree_interval = 5` will score the model after every five trees.  The parameters we 
# have set below specify that the model will stop training after there have been three scoring intervals 
# where the AUC has not increased more than 0.0005.  Since we have specified a validation frame, 
# the stopping tolerance will be computed on validation AUC rather than training AUC. 
#===============================================================================================
# GBM with up to 500 trees and AUC-based early stopping on the validation frame.
gbm_fit3 <- h2o.gbm(
  x = x,
  y = y,
  training_frame = train,
  validation_frame = valid,     # only used if stopping_rounds > 0
  model_id = "gbm_fit3",
  ntrees = 500,
  seed = 1,
  # --- early-stopping controls ---
  stopping_metric = "AUC",
  stopping_rounds = 3,          # scoring intervals without enough improvement
  stopping_tolerance = 0.0005,  # minimum AUC increase that counts as improvement
  score_tree_interval = 5       # score the model after every five trees
)
Dropping bad and constant columns: [race_pacific].

  |                                                                                               
  |                                                                                         |   0%
  |                                                                                               
  |==                                                                                       |   2%
  |                                                                                               
  |=========================================================================================| 100%
#==========================================================================================================
# Let's compare the performance of the three GBMs
#==========================================================================================================
# Score each of the three GBMs on the test frame.
gbm_perf1 <- h2o.performance(gbm_fit1, newdata = test)
gbm_perf2 <- h2o.performance(gbm_fit2, newdata = test)
gbm_perf3 <- h2o.performance(gbm_fit3, newdata = test)
#==========================================================================================================
# Print model performance
#==========================================================================================================
gbm_perf1
H2OBinomialMetrics: gbm

MSE:  0.2398655
RMSE:  0.4897607
LogLoss:  0.8364659
Mean Per-Class Error:  0.333725
AUC:  0.6169213
Gini:  0.2338425

Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
       No Yes    Error    Rate
No     30   7 0.189189   =7/37
Yes    11  12 0.478261  =11/23
Totals 41  19 0.300000  =18/60

Maximum Metrics: Maximum metrics at their respective thresholds
                        metric threshold    value idx
1                       max f1  0.314972 0.571429  18
2                       max f2  0.013415 0.761589  58
3                 max f0point5  0.738993 0.636364   7
4                 max accuracy  0.738993 0.716667   7
5                max precision  0.938118 1.000000   0
6                   max recall  0.013415 1.000000  58
7              max specificity  0.938118 1.000000   0
8             max absolute_mcc  0.738993 0.396644   7
9   max min_per_class_accuracy  0.152995 0.608696  27
10 max mean_per_class_accuracy  0.462635 0.671563  15

Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
gbm_perf2
H2OBinomialMetrics: gbm

MSE:  0.3094912
RMSE:  0.5563193
LogLoss:  1.784196
Mean Per-Class Error:  0.4324324
AUC:  0.626322
Gini:  0.2526439

Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
       No Yes    Error    Rate
No      5  32 0.864865  =32/37
Yes     0  23 0.000000   =0/23
Totals  5  55 0.533333  =32/60

Maximum Metrics: Maximum metrics at their respective thresholds
                        metric threshold    value idx
1                       max f1  0.000046 0.589744  54
2                       max f2  0.000046 0.782313  54
3                 max f0point5  0.872488 0.636364   7
4                 max accuracy  0.872488 0.716667   7
5                max precision  0.999979 1.000000   0
6                   max recall  0.000046 1.000000  54
7              max specificity  0.999979 1.000000   0
8             max absolute_mcc  0.872488 0.396644   7
9   max min_per_class_accuracy  0.008380 0.565217  28
10 max mean_per_class_accuracy  0.872488 0.638660   7

Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
gbm_perf3
H2OBinomialMetrics: gbm

MSE:  0.2238611
RMSE:  0.4731396
LogLoss:  0.7295884
Mean Per-Class Error:  0.306698
AUC:  0.6333725
Gini:  0.266745

Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
       No Yes    Error    Rate
No     32   5 0.135135   =5/37
Yes    11  12 0.478261  =11/23
Totals 43  17 0.266667  =16/60

Maximum Metrics: Maximum metrics at their respective thresholds
                        metric threshold    value idx
1                       max f1  0.437536 0.600000  16
2                       max f2  0.026584 0.756579  59
3                 max f0point5  0.489286 0.666667  12
4                 max accuracy  0.489286 0.733333  12
5                max precision  0.917153 1.000000   0
6                   max recall  0.026584 1.000000  59
7              max specificity  0.917153 1.000000   0
8             max absolute_mcc  0.489286 0.417428  12
9   max min_per_class_accuracy  0.211870 0.608696  24
10 max mean_per_class_accuracy  0.437536 0.693302  16

Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
#==========================================================================================================
# Retrieve test set AUC
#==========================================================================================================
h2o.auc(gbm_perf1)  
[1] 0.6169213
h2o.auc(gbm_perf2)  
[1] 0.626322
h2o.auc(gbm_perf3)  
[1] 0.6333725

To examine the scoring history, use the scoring_history method on a trained model.
If score_tree_interval is not specified, it will score at various intervals, as we can see for h2o.scoreHistory(gbm_fit2) below. However, regular 5-tree intervals are used for h2o.scoreHistory(gbm_fit3), because that model was trained with score_tree_interval = 5.
The gbm_fit2 was trained only using a training set (no validation set), so the scoring history is calculated for training set performance metrics only.

#==========================================================================================================
# To examine the scoring history, use the `scoring_history` method on a trained model.  
# If `score_tree_interval` is not specified, it will score at various intervals, as we can 
# see for `h2o.scoreHistory(gbm_fit2)` below.  However, regular 5-tree intervals are used 
# for `h2o.scoreHistory(gbm_fit3)`.  
# The `gbm_fit2` was trained only using a training set (no validation set), so the scoring 
# history is calculated for training set performance metrics only.
#==========================================================================================================
h2o.scoreHistory(gbm_fit2)
Scoring History: 
            timestamp   duration number_of_trees training_rmse training_logloss training_auc
1 2017-10-08 01:27:12  0.001 sec               0       0.47447          0.64254      0.50000
2 2017-10-08 01:27:12  0.009 sec               1       0.45247          0.59776      0.90156
3 2017-10-08 01:27:12  0.015 sec               2       0.43398          0.56188      0.90831
4 2017-10-08 01:27:12  0.021 sec               3       0.41807          0.53192      0.91432
5 2017-10-08 01:27:12  0.026 sec               4       0.40453          0.50680      0.91737
  training_lift training_classification_error
1       1.00000                       0.65772
2       2.92157                       0.17785
3       2.92157                       0.17114
4       2.92157                       0.16443
5       2.92157                       0.15436

---
              timestamp   duration number_of_trees training_rmse training_logloss training_auc
167 2017-10-08 01:27:16  3.849 sec             166       0.15266          0.09869      0.99797
168 2017-10-08 01:27:16  3.882 sec             167       0.15216          0.09817      0.99792
169 2017-10-08 01:27:16  3.916 sec             168       0.15174          0.09770      0.99792
170 2017-10-08 01:27:16  3.951 sec             169       0.15121          0.09716      0.99797
171 2017-10-08 01:27:16  3.991 sec             170       0.15070          0.09654      0.99802
172 2017-10-08 01:27:17  4.863 sec             500       0.10981          0.04117      0.99877
    training_lift training_classification_error
167       2.92157                       0.02685
168       2.92157                       0.02685
169       2.92157                       0.02685
170       2.92157                       0.02685
171       2.92157                       0.02685
172       2.92157                       0.02349
#==========================================================================================================
# When early stopping is used, we see that training stopped at 30 trees instead of the full 500.  
# Since we used a validation set in `gbm_fit3`, both training and validation performance metrics 
# are stored in the scoring history object.  Take a look at the validation AUC to observe that the 
# correct stopping tolerance was enforced.
#==========================================================================================================
h2o.scoreHistory(gbm_fit3)
Scoring History: 
            timestamp   duration number_of_trees training_rmse training_logloss training_auc
1 2017-10-08 01:27:18  0.001 sec               0       0.47447          0.64254      0.50000
2 2017-10-08 01:27:18  0.021 sec               5       0.39295          0.48547      0.92099
3 2017-10-08 01:27:18  0.045 sec              10       0.35296          0.41031      0.93035
4 2017-10-08 01:27:18  0.069 sec              15       0.33131          0.36613      0.93705
5 2017-10-08 01:27:18  0.090 sec              20       0.31451          0.33344      0.94768
6 2017-10-08 01:27:18  0.115 sec              25       0.29868          0.30512      0.95521
7 2017-10-08 01:27:18  0.137 sec              30       0.28750          0.28517      0.96141
  training_lift training_classification_error validation_rmse validation_logloss validation_auc
1       1.00000                       0.65772         0.48646            0.66638        0.50000
2       2.92157                       0.15436         0.45331            0.59915        0.72439
3       2.92157                       0.13758         0.46625            0.62275        0.70537
4       2.92157                       0.12416         0.48076            0.65590        0.67707
5       2.92157                       0.11074         0.48574            0.67175        0.67122
6       2.92157                       0.09732         0.49773            0.70904        0.66829
7       2.92157                       0.09060         0.50043            0.72274        0.68098
  validation_lift validation_classification_error
1         1.00000                         0.62121
2         1.32000                         0.42424
3         1.32000                         0.42424
4         0.00000                         0.39394
5         2.64000                         0.39394
6         2.64000                         0.36364
7         2.64000                         0.40909
#==========================================================================================================
# Look at scoring history for third GBM model
#==========================================================================================================
# Scoring-history curves for gbm_fit3 against the number of trees: AUC first,
# then logloss.
plot(gbm_fit3, metric = "AUC", timestep = "number_of_trees")

plot(gbm_fit3, metric = "logloss", timestep = "number_of_trees")

There is overfitting when the number of trees is greater than 5. This can be observed from the graph above, which shows that the training error continues to decrease while the validation error starts to increase after 5 trees. We will arrive at a better model by choosing the number of trees to be around 5 or fewer.

#==========================================================================================================
# 4. Deep Learning
# H2O's Deep Learning algorithm is a multilayer feed-forward artificial neural network.  
# It can also be used to train an autoencoder. In this example we will train 
# a standard supervised prediction model.
# Train a default DL
# First we will train a basic DL model with default parameters. The DL model will infer the response 
# distribution from the response encoding if it is not specified explicitly through the `distribution` 
# argument.  H2O's DL will not be reproducible if it is run on more than a single core, so in this example, 
# the performance metrics below may vary slightly from what you see on your machine.
# In H2O's DL, early stopping is enabled by default, so below, it will use the training set and 
# default stopping parameters to perform early stopping.
#==========================================================================================================
# Baseline deep learning model: only the seed is fixed, nothing else is tuned.
dl_fit1 <- h2o.deeplearning(
  x = x,
  y = y,
  training_frame = train,
  seed = 1,
  model_id = "dl_fit1"
)
Dropping bad and constant columns: [race_pacific].

  |                                                                                               
  |                                                                                         |   0%
  |                                                                                               
  |=======================================================================                  |  80%
  |                                                                                               
  |=========================================================================================| 100%
#==========================================================================================================
# Train a DL with new architecture and more epochs.
# Next we will increase the number of epochs used in the DL model by setting `epochs=20` (the default is 10).  
# Increasing the number of epochs in a deep neural net may increase performance of the model, however, 
# you have to be careful not to overfit your model to your training data.  To automatically find the optimal number of epochs, 
# you must use H2O's early stopping functionality.  Unlike the rest of the H2O algorithms, H2O's DL will 
# use early stopping by default, so for comparison we will first turn off early stopping.  We do this in the next example 
# by setting `stopping_rounds=0`.
#==========================================================================================================
# DL with the `hidden = c(10, 10)` architecture, 20 epochs and early stopping
# switched off. No validation_frame is passed: it would only be used if
# stopping_rounds > 0.
dl_fit2 <- h2o.deeplearning(
  x = x,
  y = y,
  training_frame = train,
  hidden = c(10, 10),
  epochs = 20,
  stopping_rounds = 0,   # disable early stopping
  seed = 1,
  model_id = "dl_fit2"
)
Dropping bad and constant columns: [race_pacific].

  |                                                                                               
  |                                                                                         |   0%
  |                                                                                               
  |==================                                                                       |  20%
  |                                                                                               
  |=========================================================================================| 100%
#==========================================================================================================
# Train a DL with early stopping
# This example will use the same hidden-layer architecture as `dl_fit2`, but with `epochs = 2`. This time, we will turn on 
# early stopping and specify the stopping criterion.  We will also pass a validation set, as is
# recommended for early stopping.
#==========================================================================================================
# DL with the `hidden = c(10, 10)` architecture, 2 epochs, a validation frame
# and an explicit AUC-based early-stopping criterion.
dl_fit3 <- h2o.deeplearning(
  x = x,
  y = y,
  training_frame = train,
  validation_frame = valid,     # in DL, early stopping is on by default
  model_id = "dl_fit3",
  hidden = c(10, 10),
  epochs = 2,
  seed = 1,
  # --- early-stopping controls ---
  stopping_metric = "AUC",
  stopping_rounds = 3,
  stopping_tolerance = 0.0005,
  score_interval = 1
)
Dropping bad and constant columns: [race_pacific].

  |                                                                                               
  |                                                                                         |   0%
  |                                                                                               
  |=========================================================================================| 100%
#==========================================================================================================
# Let's compare the performance of the three DL models
#==========================================================================================================
# Score each of the three DL models on the test frame.
dl_perf1 <- h2o.performance(dl_fit1, newdata = test)
dl_perf2 <- h2o.performance(dl_fit2, newdata = test)
dl_perf3 <- h2o.performance(dl_fit3, newdata = test)
#==========================================================================================================
# Print model performance
#==========================================================================================================
dl_perf1
H2OBinomialMetrics: deeplearning

MSE:  0.2413914
RMSE:  0.491316
LogLoss:  0.9163704
Mean Per-Class Error:  0.2849589
AUC:  0.6768508
Gini:  0.3537015

Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
       No Yes    Error    Rate
No     32   5 0.135135   =5/37
Yes    10  13 0.434783  =10/23
Totals 42  18 0.250000  =15/60

Maximum Metrics: Maximum metrics at their respective thresholds
                        metric threshold    value idx
1                       max f1  0.151863 0.634146  17
2                       max f2  0.007122 0.777027  55
3                 max f0point5  0.288452 0.746269  10
4                 max accuracy  0.288452 0.766667  10
5                max precision  0.783162 1.000000   0
6                   max recall  0.007122 1.000000  55
7              max specificity  0.783162 1.000000   0
8             max absolute_mcc  0.288452 0.512354  10
9   max min_per_class_accuracy  0.070772 0.594595  28
10 max mean_per_class_accuracy  0.174827 0.720329  14

Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
dl_perf2
H2OBinomialMetrics: deeplearning

MSE:  0.2209325
RMSE:  0.4700346
LogLoss:  0.7261213
Mean Per-Class Error:  0.2925969
AUC:  0.7003525
Gini:  0.4007051

Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
       No Yes    Error    Rate
No     25  12 0.324324  =12/37
Yes     6  17 0.260870   =6/23
Totals 31  29 0.300000  =18/60

Maximum Metrics: Maximum metrics at their respective thresholds
                        metric threshold    value idx
1                       max f1  0.136734 0.653846  28
2                       max f2  0.016174 0.761589  58
3                 max f0point5  0.364614 0.704225  11
4                 max accuracy  0.364614 0.750000  11
5                max precision  0.863757 1.000000   0
6                   max recall  0.016174 1.000000  58
7              max specificity  0.863757 1.000000   0
8             max absolute_mcc  0.364614 0.462774  11
9   max min_per_class_accuracy  0.153054 0.675676  27
10 max mean_per_class_accuracy  0.136734 0.707403  28

Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
dl_perf3
H2OBinomialMetrics: deeplearning

MSE:  0.2079788
RMSE:  0.4560469
LogLoss:  0.6326183
Mean Per-Class Error:  0.2820212
AUC:  0.7226792
Gini:  0.4453584

Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
       No Yes    Error    Rate
No     29   8 0.216216   =8/37
Yes     8  15 0.347826   =8/23
Totals 37  23 0.266667  =16/60

Maximum Metrics: Maximum metrics at their respective thresholds
                        metric threshold    value idx
1                       max f1  0.280369 0.652174  22
2                       max f2  0.045496 0.761589  58
3                 max f0point5  0.447421 0.666667  12
4                 max accuracy  0.447421 0.733333  12
5                max precision  0.868179 1.000000   0
6                   max recall  0.045496 1.000000  58
7              max specificity  0.868179 1.000000   0
8             max absolute_mcc  0.280369 0.435958  22
9   max min_per_class_accuracy  0.280369 0.652174  22
10 max mean_per_class_accuracy  0.280369 0.717979  22

Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
#==========================================================================================================
# Retrieve test set AUC
#==========================================================================================================
h2o.auc(dl_perf1)  
[1] 0.6768508
h2o.auc(dl_perf2)  
[1] 0.7003525
h2o.auc(dl_perf3)  
[1] 0.7226792
#==========================================================================================================
# Scoring history
#==========================================================================================================
h2o.scoreHistory(dl_fit3)
Scoring History: 
            timestamp   duration training_speed  epochs iterations    samples training_rmse
1 2017-10-08 01:27:25  0.000 sec                0.00000          0   0.000000              
2 2017-10-08 01:27:25  0.018 sec  21333 obs/sec 0.21477          1  64.000000       0.51330
3 2017-10-08 01:27:25  0.039 sec  32736 obs/sec 2.08725         10 622.000000       0.40466
  training_logloss training_auc training_lift training_classification_error validation_rmse
1                                                                                          
2          0.87313      0.56403       1.94771                       0.65772         0.55709
3          0.49752      0.82768       2.92157                       0.26174         0.43714
  validation_logloss validation_auc validation_lift validation_classification_error
1                                                                                  
2            1.00843        0.52488         0.00000                         0.60606
3            0.59696        0.72195         2.64000                         0.21212
#==========================================================================================================
# confusion matrix
#==========================================================================================================
h2o.confusionMatrix(dl_fit3)
Confusion Matrix (vertical: actual; across: predicted)  for max f1 @ threshold = 0.155333974647658:
        No Yes    Error     Rate
No     133  63 0.321429  =63/196
Yes     15  87 0.147059  =15/102
Totals 148 150 0.261745  =78/298
#==========================================================================================================
# model diagnostics
#==========================================================================================================
# Classification error from dl_fit3's scoring history, plotted against epochs.
plot(dl_fit3, metric = "classification_error", timestep = "epochs")

h2o.scoreHistory(dl_fit3)$epochs
[1] 0.0000000 0.2147651 2.0872483
h2o.scoreHistory(dl_fit3)$validation_classification_error
[1]       NaN 0.6060606 0.2121212
#==========================================================================================================
# Look at scoring history for third DL model
#==========================================================================================================
# The model starts to overfit as the number of epochs goes beyond 2. The training error continues to decrease whereas the 
# test error begins to increase.
plot(dl_fit3, 
     timestep = "epochs", 
     metric = "AUC")

#==========================================================================================================
# Refit the third DL model with 3-fold cross-validation, so that the CV models
# can be taken from the `dl_fit3` object
#==========================================================================================================
dl_fit3 <- h2o.deeplearning(
  model_id = "dl_fit3",
  x = x,
  y = y,
  training_frame = train,
  validation_frame = valid,  # in DL, early stopping is on by default
  nfolds = 3,
  hidden = c(10, 10),
  epochs = 2,
  seed = 1,
  # --- parameters used for early stopping ---
  score_interval = 1,
  stopping_rounds = 5,
  stopping_metric = "misclassification",  # commented-out alternative: "AUC"
  stopping_tolerance = 0.0005
)
Dropping bad and constant columns: [race_pacific].

  |                                                                                               
  |                                                                                         |   0%
  |                                                                                               
  |======================================================================                   |  79%
  |                                                                                               
  |=========================================================================================| 100%
# Fetch every cross-validation model of dl_fit3 by its model name.
# lapply() is used rather than sapply(): sapply() may simplify its result,
# whereas lapply() always returns a list, which is what the `[[` indexing
# below relies on.
cv_models <- lapply(dl_fit3@model$cross_validation_models,
                    function(cv_ref) h2o.getModel(cv_ref$name))
# Plot the scoring history over time: second CV model first, then the
# main dl_fit3 model for comparison
plot(cv_models[[2]],
     timestep = "epochs",
     metric = "classification_error")

plot(dl_fit3,
     timestep = "epochs",
     metric = "classification_error")

# Print the details of the first cross-validation model
cv_models[[1]]
Model Details:
==============

H2OBinomialModel: deeplearning
Model ID:  dl_fit3_cv_1 
Status of Neuron Layers: predicting Hypertension, 2-class classification, bernoulli distribution, CrossEntropy loss, 862 weights/biases, 17.6 KB, 388 training samples, mini-batch size 1
  layer units      type dropout       l1       l2 mean_rate rate_rms momentum mean_weight
1     1    72     Input  0.00 %                                                          
2     2    10 Rectifier  0.00 % 0.000000 0.000000  0.312233 0.455086 0.000000    0.002696
3     3    10 Rectifier  0.00 % 0.000000 0.000000  0.003896 0.004367 0.000000   -0.010156
4     4     2   Softmax         0.000000 0.000000  0.002739 0.001932 0.000000    0.597045
  weight_rms mean_bias bias_rms
1                              
2   0.161645  0.504851 0.057905
3   0.299093  0.996581 0.031065
4   1.652194 -0.000000 0.018216


H2OBinomialMetrics: deeplearning
** Reported on training data. **
** Metrics reported on temporary training frame with 298 samples **

MSE:  0.1505877
RMSE:  0.3880564
LogLoss:  0.4643866
Mean Per-Class Error:  0.220613
AUC:  0.8475962
Gini:  0.6951923

Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
        No Yes    Error     Rate
No     105  23 0.179688  =23/128
Yes     17  48 0.261538   =17/65
Totals 122  71 0.207254  =40/193

Maximum Metrics: Maximum metrics at their respective thresholds
                        metric threshold    value idx
1                       max f1  0.293097 0.705882  69
2                       max f2  0.116315 0.811170 114
3                 max f0point5  0.563696 0.727700  36
4                 max accuracy  0.499190 0.797927  45
5                max precision  0.897045 1.000000   0
6                   max recall  0.014746 1.000000 178
7              max specificity  0.897045 1.000000   0
8             max absolute_mcc  0.335698 0.549399  64
9   max min_per_class_accuracy  0.261208 0.765625  78
10 max mean_per_class_accuracy  0.293097 0.779387  69

Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
H2OBinomialMetrics: deeplearning
** Reported on validation data. **
** Metrics reported on full validation frame **

MSE:  0.2064037
RMSE:  0.4543167
LogLoss:  0.6735717
Mean Per-Class Error:  0.2724563
AUC:  0.7066773
Gini:  0.4133545

Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
       No Yes    Error     Rate
No     53  15 0.220588   =15/68
Yes    12  25 0.324324   =12/37
Totals 65  40 0.257143  =27/105

Maximum Metrics: Maximum metrics at their respective thresholds
                        metric threshold    value idx
1                       max f1  0.276746 0.649351  38
2                       max f2  0.012913 0.755102  94
3                 max f0point5  0.276746 0.634518  38
4                 max accuracy  0.276746 0.742857  38
5                max precision  0.902949 1.000000   0
6                   max recall  0.012913 1.000000  94
7              max specificity  0.902949 1.000000   0
8             max absolute_mcc  0.276746 0.447676  38
9   max min_per_class_accuracy  0.211203 0.702703  44
10 max mean_per_class_accuracy  0.276746 0.727544  38

Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`

As an alternative to manual tuning, or “hand tuning”, we can use the h2o.grid() function to perform either a Cartesian or Random Grid Search (RGS). Random Grid Search is usually a quicker way to find a good model, so we will provide an example of how to use H2O’s Random Grid Search on a DNN. One handy feature of RGS is that you can specify how long you would like to execute the grid for – this can be based on a time, number of models, or a performance-metric-based stopping criterion. In the example below, we will train the DNN grid for 600 seconds (10 minutes). First define a grid of Deep Learning hyperparameters and specify the search_criteria.

#==========================================================================================================
# Deep Learning random grid search (activation, l1, l2)
#==========================================================================================================
# h2o.grid() replaces manual ("hand") tuning with either a Cartesian or a
# Random Grid Search (RGS). RGS is usually the quicker route to a good model,
# so it is the strategy demonstrated here on a DNN.
# An RGS run can be bounded by time, by a number of models, or by a
# performance-metric-based stopping criterion; this one is bounded by time:
# 600 seconds (10 minutes).

# Candidate values for each hyperparameter, and the search criteria.
activation_opt <- c("Rectifier", "Maxout", "Tanh")
l1_opt <- c(0, 1e-5, 1e-4, 1e-3, 1e-2)
l2_opt <- c(0, 1e-5, 1e-4, 1e-3, 1e-2)
hyper_params <- list(
  activation = activation_opt,
  l1 = l1_opt,
  l2 = l2_opt
)
search_criteria <- list(
  strategy = "RandomDiscrete",
  max_runtime_secs = 600
)

# Models are compared on a validation partition instead of by
# cross-validation (which is "better" but takes longer): the train frame is
# split into an 80% piece and a 20% piece.
splits <- h2o.splitFrame(data = train, ratios = 0.8, seed = 1)

# Train the random grid. Fixed non-default parameters such as
# hidden = c(20, 20) are passed straight to h2o.grid().
dl_grid <- h2o.grid(
  "deeplearning",
  grid_id = "dl_grid",
  x = x,
  y = y,
  training_frame = splits[[1]],
  validation_frame = splits[[2]],
  hidden = c(20, 20),
  seed = 1,
  hyper_params = hyper_params,
  search_criteria = search_criteria
)

  |                                                                                               
  |                                                                                         |   0%
  |                                                                                               
  |                                                                                         |   1%
  |                                                                                               
  |=                                                                                        |   1%
# With the grid trained, collect its results ordered by the chosen
# performance metric -- here accuracy, highest first.
dl_gridperf <- h2o.getGrid(
  grid_id = "dl_grid",
  sort_by = "accuracy",
  decreasing = TRUE
)
print(dl_gridperf)
H2O Grid Details
================

Grid ID: dl_grid 
Used hyper parameters: 
  -  activation 
  -  l1 
  -  l2 
Number of models: 75 
Number of failed models: 0 

Hyper-Parameter Search Summary: ordered by decreasing accuracy
  activation     l1     l2        model_ids           accuracy
1     Maxout 1.0E-5  0.001 dl_grid_model_59 0.7818181818181819
2     Maxout 1.0E-5    0.0 dl_grid_model_44 0.7818181818181819
3     Maxout  0.001  0.001 dl_grid_model_19 0.7818181818181819
4     Maxout   0.01    0.0 dl_grid_model_43 0.7818181818181819
5  Rectifier   0.01 1.0E-4 dl_grid_model_64 0.7818181818181819

---
   activation     l1     l2        model_ids            accuracy
70       Tanh 1.0E-4 1.0E-5 dl_grid_model_62  0.5272727272727273
71       Tanh   0.01 1.0E-4 dl_grid_model_35  0.5272727272727273
72       Tanh    0.0 1.0E-4 dl_grid_model_68   0.509090909090909
73       Tanh 1.0E-5 1.0E-4 dl_grid_model_56   0.509090909090909
74       Tanh    0.0    0.0 dl_grid_model_63   0.509090909090909
75       Tanh  0.001    0.0 dl_grid_model_38 0.49090909090909096
# Take the model_id listed first in the sorted grid and load that model.
best_dl_model_id <- dl_gridperf@model_ids[[1]]
best_dl <- h2o.getModel(best_dl_model_id)
# Score the top model on the test frame to get an honest estimate of its
# performance, then report the test MSE.
best_dl_perf <- h2o.performance(best_dl, newdata = test)
h2o.mse(best_dl_perf)
[1] 0.2421704
#==========================================================================================================
# 5. Naive Bayes model
# Naive Bayes (NB) does not usually beat an algorithm like Random Forest or
# GBM, yet it remains popular, especially in the text domain (for instance
# when the input is text encoded as "Bag of Words"). NB handles binary or
# multiclass classification only, not regression, so the response must be a
# factor instead of a numeric.
# We start with a basic NB model that uses the default parameters.
#==========================================================================================================
nb_fit1 <- h2o.naiveBayes(
  model_id = "nb_fit1",
  training_frame = train,
  x = x,
  y = y
)
Dropping bad and constant columns: [race_pacific].

  |                                                                                               
  |                                                                                         |   0%
  |                                                                                               
  |=========================================================================================| 100%
#==========================================================================================================
# NB model with Laplace smoothing
# The amount of Laplace smoothing is one of the few tunable parameters of
# Naive Bayes. H2O applies no Laplace smoothing by default; this fit sets
# laplace = 6.
#==========================================================================================================
nb_fit2 <- h2o.naiveBayes(
  model_id = "nb_fit2",
  training_frame = train,
  x = x,
  y = y,
  laplace = 6
)
Dropping bad and constant columns: [race_pacific].

  |                                                                                               
  |                                                                                         |   0%
  |                                                                                               
  |=========================================================================================| 100%
#==========================================================================================================
# Score both NB models on the test frame so they can be compared
#==========================================================================================================
nb_perf1 <- h2o.performance(nb_fit1, newdata = test)
nb_perf2 <- h2o.performance(nb_fit2, newdata = test)
#==========================================================================================================
# Print the test-set performance of the default NB model
#==========================================================================================================
nb_perf1
H2OBinomialMetrics: naivebayes

MSE:  0.1797905
RMSE:  0.4240172
LogLoss:  0.7532077
Mean Per-Class Error:  0.2226792
AUC:  0.7121034
Gini:  0.4242068

Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
       No Yes    Error    Rate
No     35   2 0.054054   =2/37
Yes     9  14 0.391304   =9/23
Totals 44  16 0.183333  =11/60

Maximum Metrics: Maximum metrics at their respective thresholds
                        metric threshold    value idx
1                       max f1  0.608819 0.717949  15
2                       max f2  0.003638 0.761589  58
3                 max f0point5  0.608819 0.804598  15
4                 max accuracy  0.608819 0.816667  15
5                max precision  0.999242 1.000000   0
6                   max recall  0.003638 1.000000  58
7              max specificity  0.999242 1.000000   0
8             max absolute_mcc  0.608819 0.609805  15
9   max min_per_class_accuracy  0.313477 0.695652  22
10 max mean_per_class_accuracy  0.608819 0.777321  15

Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
# Print the test-set performance object of the second NB model
nb_perf2
H2OBinomialMetrics: naivebayes

MSE:  0.1774692
RMSE:  0.4212709
LogLoss:  0.5977346
Mean Per-Class Error:  0.2361927
AUC:  0.7614571
Gini:  0.5229142

Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
       No Yes    Error    Rate
No     34   3 0.081081   =3/37
Yes     9  14 0.391304   =9/23
Totals 43  17 0.200000  =12/60

Maximum Metrics: Maximum metrics at their respective thresholds
                        metric threshold    value idx
1                       max f1  0.437234 0.700000  16
2                       max f2  0.009638 0.756579  59
3                 max f0point5  0.437234 0.769231  16
4                 max accuracy  0.437234 0.800000  16
5                max precision  0.993291 1.000000   0
6                   max recall  0.009638 1.000000  59
7              max specificity  0.993291 1.000000   0
8             max absolute_mcc  0.437234 0.569276  16
9   max min_per_class_accuracy  0.211192 0.702703  27
10 max mean_per_class_accuracy  0.437234 0.763807  16

Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
---
title: "Deep Learning with H2O"
output: html_notebook
author: Nana Boateng
df_print: paged
Time: '`r Sys.time()`'
date: "`r format(Sys.time(), '%B %d, %Y')`"
---



```{r setup, include=FALSE}
# Turn on knitr chunk caching for every chunk in the document
knitr::opts_chunk$set(cache = TRUE)
```


The data consist of surveys administered to people who live near a community health facility. The goal here is to predict the rate of hypertension among the residents given predictors such as age, race, gender, education and employment status.

```{r}
# The original `rm(list = c())` removed nothing (its list of names is empty),
# so it is dropped here instead of being turned into a workspace wipe.

library(h2o)
library(h2oEnsemble)
library(tidyverse)

# Start (or connect to) the local H2O cluster once and keep the connection.
# nthreads = -1 means use all cores on the machine; max_mem_size is the
# maximum memory to allocate to H2O. The original called h2o.init() twice with
# the same settings; one call is enough.
localH2O <- h2o.init(ip = "localhost", port = 54321,
                     nthreads = -1, max_mem_size = "8G")

# Load the survey file into H2O and take a first look at it.
loan_csv <- "/Users/nanaakwasiabayieboateng/Documents/memphisclassesbooks/DataMiningscience/UICProject/Workbook3.csv"
datachicago <- h2o.importFile(loan_csv)
dim(datachicago)

head(datachicago)

h2o.names(datachicago)

str(datachicago)

# NOTE(review): the following chunks operate on an R data frame called `data`
# that this notebook never creates (without it, `data` is the utils::data
# function). It is assumed to be the same CSV read into R -- confirm.
data <- readr::read_csv(loan_csv)


```



```{r}
#==========================================================================================================
# Inspect the structure of the data: base str(), glimpse() from the dplyr
# package, and a column-wise summary()
#==========================================================================================================
str(data)
dplyr::glimpse(data)
summary(data)


```




```{r}
#==========================================================================================================
# Check missing values and remove incomplete rows
#==========================================================================================================

# Number of missing values per column. The is.na() generic is used instead of
# calling the S3 method is.na.data.frame() directly.
colSums(is.na(data))

# First few rows that contain at least one missing value
data[!complete.cases(data), ] %>% head()

# Rows whose Asian_ancestry is something other than the "." placeholder
data[which(data$Asian_ancestry != "."), ]

# NOTE(review): "." is replaced by the literal text "<NA>", not by a missing
# value, so complete.cases() below does not treat these rows as incomplete.
# Behavior is left unchanged here -- confirm whether NA_character_ was meant.
data$Asian_ancestry <- ifelse(data$Asian_ancestry == ".", "<NA>", data$Asian_ancestry)

data <- data[complete.cases(data), ]

#==========================================================================================================
# Drop the ID column and the rows answered "NO CODED RESPONSE APPLICABLE ..."
#==========================================================================================================

#data%>%dplyr::filter(str_detect(Hypertension, "NO CODED RESPONSE APPLICABLE (SPECIFY)"))

ndata <- data %>%
  dplyr::select(-ID) %>%
  dplyr::filter(
    Hypertension != "NO CODED RESPONSE APPLICABLE (SPECIFY)",
    Employment != "NO CODED RESPONSE APPLICABLE (LEAVE NOTE FIRST)",
    Hispanic != "NO CODED RESPONSE APPLICABLE (SPECIFY)",
    Education != "NO CODED RESPONSE APPLICABLE (SPECIFY)"
  )

# Convert every character column to a factor
ndata <- mutate_if(ndata, is.character, as.factor)

str(ndata)
```


```{r}
#==========================================================================================================
# Import the cleaned R object into the H2O cloud, i.e. convert the R data to
# an H2O object, and show its structure
#==========================================================================================================
datah20 <- as.h2o(ndata)
str(datah20)

```


```{r}
#==========================================================================================================
# Partition the data into training, validation and test sets
# (70%, 15% and 15% chunks)
#==========================================================================================================

# Setting a seed guarantees a reproducible partition.
splits <- h2o.splitFrame(datah20, ratios = c(0.7, 0.15), seed = 1)
train <- splits[[1]]
valid <- splits[[2]]
test <- splits[[3]]

# Response variable, with every remaining column used as a predictor
y <- "Hypertension"
x <- setdiff(names(datah20), y)
```



```{r}
#==========================================================================================================
# 1. GLM / logistic regression
# Like R's glm, h2o.glm has a `family` argument. We start with a basic
# binomial Generalized Linear Model; by default h2o.glm fits a regularized,
# elastic net model.
#==========================================================================================================

glm_fit1 <- h2o.glm(
  model_id = "glm_fit1",
  family = "binomial",
  training_frame = train,
  x = x,
  y = y
)

```






```{r}
#==========================================================================================================
# Automatic tuning of the GLM regularization
# A regularized GLM needs the right amount of regularization to avoid
# overfitting. The model parameter `lambda` controls that amount, and its
# optimal value can be found automatically by setting `lambda_search = TRUE`
# and passing in a validation frame, which is used to evaluate model
# performance for each particular value of lambda.
#==========================================================================================================

glm_fit2 <- h2o.glm(
  model_id = "glm_fit2",
  family = "binomial",
  training_frame = train,
  validation_frame = valid,
  x = x,
  y = y,
  lambda_search = TRUE
)

```




```{r}
#==========================================================================================================
# Compare the two GLMs on the test frame
#==========================================================================================================

glm_perf1 <- h2o.performance(glm_fit1, newdata = test)
glm_perf2 <- h2o.performance(glm_fit2, newdata = test)

# Print the complete performance object of each model
glm_perf1
glm_perf2


```



```{r}
#==========================================================================================================
# Printing the whole performance object is verbose; it is usually easier to
# print only the metric being compared. Retrieve the test set AUC of each GLM.
#==========================================================================================================

h2o.auc(glm_perf1)
h2o.auc(glm_perf2)

```



```{r}
#==========================================================================================================
# Training AUC and validation AUC of glm_fit2, for comparison with the test
# AUC, followed by its full validation metrics
#==========================================================================================================

h2o.auc(glm_fit2, train = TRUE)
h2o.auc(glm_fit2, valid = TRUE)

glm_fit2@model[["validation_metrics"]]



```



```{r}
#==========================================================================================================
# 2. Random Forest
# H2O's Random Forest (RF) is a distributed implementation of the standard
# Random Forest algorithm and its variable importance measures.
# We first train a basic RF with default parameters. The model infers the
# response distribution from the response encoding, and a seed is required
# for reproducibility.
#==========================================================================================================

rf_fit1 <- h2o.randomForest(
  model_id = "rf_fit1",
  training_frame = train,
  x = x,
  y = y,
  seed = 1
)


```




```{r}
#==========================================================================================================
# Larger forest: `ntrees = 100`
# The author notes the H2O Random Forest default is 50 trees, which makes this
# RF twice as big as the default. More trees usually improve RF performance,
# and unlike Gradient Boosting Machines (GBMs), Random Forests are fairly
# resistant to (although not free from) overfitting.
# The GBM example further down shows how H2O's early stopping functionality
# helps to prevent overfitting.
#==========================================================================================================

rf_fit2 <- h2o.randomForest(
  model_id = "rf_fit2",
  training_frame = train,
  # validation_frame = valid,  # only used if stopping_rounds > 0
  x = x,
  y = y,
  ntrees = 100,
  seed = 1
)

#==========================================================================================================
# Score the two RFs on the test frame so they can be compared
#==========================================================================================================

rf_perf1 <- h2o.performance(rf_fit1, newdata = test)
rf_perf2 <- h2o.performance(rf_fit2, newdata = test)

```





```{r}
#==========================================================================================================
# Print the test-set performance of both RFs
#==========================================================================================================

rf_perf1
rf_perf2

#==========================================================================================================
# Retrieve the test set AUC of both RFs
#==========================================================================================================

h2o.auc(rf_perf1)
h2o.auc(rf_perf2)


```




```{r}
#==========================================================================================================
# Cross-validated performance
# Instead of scoring on a held-out test set, model performance can be
# estimated with k-fold cross-validation. H2O needs no custom code or loops
# for this: the number of folds is simply given through `nfolds`. An RF with
# default model parameters serves as the example.
# The model below is cross-validated on the `train` frame. It takes roughly
# k (nfolds) times longer than a single RF, because k models are trained in
# the cross-validation process (each on n(k-1)/k rows) in addition to the
# final model trained on all n rows of training_frame.
#==========================================================================================================

rf_fit3 <- h2o.randomForest(
  model_id = "rf_fit3",
  training_frame = train,
  x = x,
  y = y,
  nfolds = 5,
  seed = 1
)

#==========================================================================================================
# Cross-validated AUC of the model
#==========================================================================================================

h2o.auc(rf_fit3, xval = TRUE)


```





```{r}

#==========================================================================================================
# 3. Gradient Boosting Machine
# H2O's Gradient Boosting Machine (GBM) offers a Stochastic GBM, which can
# increase performance quite a bit compared to the original GBM
# implementation.
#
# We begin with a basic GBM. Unless `distribution` is given explicitly, the
# model infers the response distribution from the response encoding. A seed
# is required for reproducibility.
#==========================================================================================================

gbm_fit1 <- h2o.gbm(
  model_id = "gbm_fit1",
  training_frame = train,
  x = x,
  y = y,
  seed = 1
)


#==========================================================================================================
# Larger GBM: `ntrees = 500`
# The author notes the H2O GBM default is 50 trees, so this model is trained
# with ten times the default. More trees are one way to improve a GBM, but too
# many trees can overfit the training data. Finding the optimal number of
# trees automatically requires H2O's early stopping functionality; this fit
# does not use it, the next one does.
#==========================================================================================================

gbm_fit2 <- h2o.gbm(
  model_id = "gbm_fit2",
  training_frame = train,
  # validation_frame = valid,  # only used if stopping_rounds > 0
  x = x,
  y = y,
  ntrees = 500,
  seed = 1
)



#============================================================================================
# GBM with `ntrees = 500` again, this time with early stopping to prevent
# overfitting (from too many trees). Every H2O algorithm offers early
# stopping, but it is not enabled by default (Deep Learning being the
# exception). Three parameters common to all the algorithms control it:
# `stopping_rounds`, `stopping_metric` and `stopping_tolerance`.
# The stopping metric is the metric by which performance is measured; AUC is
# chosen here. `score_tree_interval` is specific to the Random Forest model
# and the GBM; `score_tree_interval = 5` scores the model after every five
# trees. With the settings below, training stops after three scoring
# intervals in which the AUC has not increased by more than 0.0005. Because a
# validation frame is supplied, the stopping tolerance is computed on
# validation AUC rather than training AUC.
#===============================================================================================

gbm_fit3 <- h2o.gbm(
  model_id = "gbm_fit3",
  training_frame = train,
  validation_frame = valid,  # only used if stopping_rounds > 0
  x = x,
  y = y,
  ntrees = 500,
  seed = 1,
  # --- parameters used for early stopping ---
  score_tree_interval = 5,
  stopping_rounds = 3,
  stopping_metric = "AUC",
  stopping_tolerance = 0.0005
)

#==========================================================================================================
# Score the three GBMs on the test frame so they can be compared
#==========================================================================================================

gbm_perf1 <- h2o.performance(gbm_fit1, newdata = test)
gbm_perf2 <- h2o.performance(gbm_fit2, newdata = test)
gbm_perf3 <- h2o.performance(gbm_fit3, newdata = test)

```





```{r}
#==========================================================================================================
# Print the test-set performance of the three GBMs
#==========================================================================================================

gbm_perf1
gbm_perf2
gbm_perf3

#==========================================================================================================
# Retrieve the test set AUC of the three GBMs
#==========================================================================================================

h2o.auc(gbm_perf1)
h2o.auc(gbm_perf2)
h2o.auc(gbm_perf3)

```



To examine the scoring history, call `h2o.scoreHistory()` on a trained model.  
If `score_tree_interval` is not specified, the model is scored at various intervals, as we can 
see for `h2o.scoreHistory(gbm_fit2)` below.  However, regular 5-tree intervals are used 
for `h2o.scoreHistory(gbm_fit3)`.  
The `gbm_fit2` model was trained only using a training set (no validation set), so the scoring 
history is calculated for training set performance metrics only.


```{r}
#==========================================================================================================
# Scoring history of the second GBM.
# NOTE(review): the accompanying text describes gbm_fit2 as trained without a validation set, so
# only training metrics are expected in this table -- confirm against the gbm_fit2 definition.
#==========================================================================================================

h2o.scoreHistory(gbm_fit2)

#==========================================================================================================
# Scoring history of the third GBM (early stopping, validation frame supplied).
# Both training and validation metrics are recorded, one row per 5-tree scoring event.
# Early stopping should end training before the full 500 trees; read the last number_of_trees
# value in the table to see where it stopped, and check the validation AUC column to confirm the
# stopping tolerance was enforced.
# NOTE(review): an earlier version of this comment quoted 105 trees -- re-confirm for this data.
#==========================================================================================================

h2o.scoreHistory(gbm_fit3)

#==========================================================================================================
# Plot the scoring history of the third GBM against the number of trees,
# first for AUC and then for logloss.
#==========================================================================================================

plot(gbm_fit3, timestep = "number_of_trees", metric = "AUC")
plot(gbm_fit3, timestep = "number_of_trees", metric = "logloss")



```

There is overfitting when the number of trees is greater than 5. This can be observed from the graph above, which shows that the training error continues to decrease while the validation error starts to increase after 5 trees. We will arrive at a better model by choosing the number of trees to be less than 5.




```{r}
#==========================================================================================================
# 4. Deep Learning
# H2O's Deep Learning (DL) algorithm is a multilayer feed-forward artificial neural network.
# It can also train an autoencoder; here it is used as a standard supervised prediction model.
#
# Baseline DL model: default parameters only.
#  * The response distribution is inferred from the response encoding because `distribution`
#    is not set explicitly.
#  * H2O's DL is not reproducible when run on more than a single core, so the metrics reported
#    further down may differ slightly from run to run even with a fixed seed.
#  * Early stopping is enabled by default in H2O's DL; with no validation frame supplied it is
#    driven by the training set and the default stopping parameters.
#==========================================================================================================

dl_fit1 <- h2o.deeplearning(
  x = x,
  y = y,
  model_id = "dl_fit1",
  training_frame = train,
  seed = 1
)

#==========================================================================================================
# DL with a new architecture and more epochs.
# The number of epochs used by the DL model is raised to `epochs = 20` (the default is 10) and
# the network is given two hidden layers of 10 units each.
# More epochs may improve performance, but they also increase the risk of overfitting the
# training data; early stopping is the automatic way to find a suitable number of epochs.
# Unlike the other H2O algorithms, DL uses early stopping by default, so for comparison it is
# switched off here with `stopping_rounds = 0`. No validation frame is passed because nothing
# would use it for stopping.
#==========================================================================================================

dl_fit2 <- h2o.deeplearning(
  x = x,
  y = y,
  model_id = "dl_fit2",
  training_frame = train,
  seed = 1,
  hidden = c(10, 10),
  epochs = 20,
  stopping_rounds = 0  # 0 disables early stopping
)


#==========================================================================================================
# DL with early stopping.
# Uses the same hidden-layer architecture as `dl_fit2` (two layers of 10 units), but with
# `epochs = 2` instead of 20, early stopping turned on with an explicit stopping criterion,
# and a validation frame supplied, as is recommended for early stopping.
#==========================================================================================================

dl_fit3 <- h2o.deeplearning(
  x = x,
  y = y,
  model_id = "dl_fit3",
  training_frame = train,
  validation_frame = valid,     # frame on which the stopping metric is evaluated
  seed = 1,
  hidden = c(10, 10),
  epochs = 2,
  stopping_metric = "AUC",      # early stopping: metric to monitor
  stopping_rounds = 3,          # early stopping: scoring events without a gain
  stopping_tolerance = 0.0005,  # early stopping: minimum improvement required
  score_interval = 1            # early stopping: scoring frequency setting
)


```




```{r}
#==========================================================================================================
# Compare the three DL models: compute performance metrics for each one on the `test` frame
#==========================================================================================================

dl_perf1 <- h2o.performance(dl_fit1, newdata = test)
dl_perf2 <- h2o.performance(dl_fit2, newdata = test)
dl_perf3 <- h2o.performance(dl_fit3, newdata = test)


#==========================================================================================================

# Print model performance
# Each object below is the h2o.performance() result for one DL model scored on the `test` frame;
# evaluating it at top level prints the full metrics report.
#==========================================================================================================


dl_perf1
dl_perf2
dl_perf3


#==========================================================================================================

# Retrieve test set AUC
# h2o.auc() extracts just the AUC value from each performance object, so the three
# DL models can be compared on a single number.
#==========================================================================================================


h2o.auc(dl_perf1)  
h2o.auc(dl_perf2)  
h2o.auc(dl_perf3)  


```




```{r}
#==========================================================================================================

# Scoring history
# Prints the table of metrics recorded for dl_fit3 at each scoring event during training.
#==========================================================================================================



h2o.scoreHistory(dl_fit3)

#==========================================================================================================

# Confusion matrix
# Called on the model itself, with no newdata or valid argument.
# NOTE(review): this presumably reports the training-data confusion matrix rather than the
# validation or test one -- confirm against the h2o.confusionMatrix documentation.
#==========================================================================================================


h2o.confusionMatrix(dl_fit3)

```



```{r}
#==========================================================================================================
# Model diagnostics for the third DL model
#==========================================================================================================

# Classification error over the course of training, with epochs on the x-axis.
plot(dl_fit3, timestep = "epochs", metric = "classification_error")

# The raw numbers behind the plot: the epoch at each scoring event and the
# classification error measured on the validation frame at that event.
h2o.scoreHistory(dl_fit3)[["epochs"]]
h2o.scoreHistory(dl_fit3)[["validation_classification_error"]]
```




```{r}
#==========================================================================================================
# AUC scoring history for the third DL model
#==========================================================================================================
# Author's reading of the plot: the model starts to overfit once training goes beyond 2 epochs.
# The training error continues to decrease whereas the error on the validation frame begins
# to increase.
# NOTE(review): dl_fit3 is trained with epochs = 2, so re-confirm this observation against the
# current plot.

plot(dl_fit3, timestep = "epochs", metric = "AUC")
```




```{r}
#==========================================================================================================
# Re-train the third DL model with 3-fold cross-validation and inspect the per-fold CV models
#==========================================================================================================

# Same R variable and model_id ("dl_fit3") as the earlier fit, so the earlier R object is
# overwritten. Differences from the earlier fit: nfolds = 3, the stopping metric is
# "misclassification" (instead of "AUC"), and stopping_rounds is 5 (instead of 3).
dl_fit3 <- h2o.deeplearning(x = x,
                            y = y,
                            training_frame = train,
                            model_id = "dl_fit3",
                            validation_frame = valid,  # in DL, early stopping is on by default
                            epochs = 2,
                            nfolds = 3,                # 3-fold cross-validation
                            stopping_metric = "misclassification",  # used for early stopping
                            hidden = c(10, 10),
                            score_interval = 1,           # used for early stopping
                            stopping_rounds = 5,          # used for early stopping
                            stopping_tolerance = 0.0005,  # used for early stopping
                            seed = 1)

# Fetch the model object for each cross-validation fold. Each entry of
# `cross_validation_models` is a reference whose `name` field is the fold model's id.
# lapply() is used rather than sapply() so the result is always a list, which is what the
# `[[` indexing below relies on, regardless of how sapply() would try to simplify.
cv_models <- lapply(dl_fit3@model$cross_validation_models,
                    function(cv_ref) h2o.getModel(cv_ref$name))

# Classification error by epoch for the second fold's model ...
plot(cv_models[[2]],
     timestep = "epochs",
     metric = "classification_error")

# ... and for the main model, for comparison.
plot(dl_fit3,
     timestep = "epochs",
     metric = "classification_error")

# Print the full summary of the first fold's model.
cv_models[[1]]
```

 As an alternative to manual tuning, or “hand tuning”, we can use the h2o.grid() function to perform either a Cartesian or Random Grid Search (RGS). Random Grid Search is usually a quicker way to find a good model, so we will provide an example of how to use H2O’s Random Grid Search on a DNN.
 One handy feature of RGS is that you can specify how long you would like to execute the grid for – this can be based on a time, number of models, or a performance-metric-based stopping criterion. In the example below, we will train the DNN grid for 600 seconds (10 minutes).
First define a grid of Deep Learning hyperparameters and specify the search_criteria.



```{r}
#==========================================================================================================
# Deep Learning random grid search: search space, search criteria and data split
#==========================================================================================================

# Candidate values for each hyperparameter explored by the grid.
activation_opt <- c("Rectifier", "Maxout", "Tanh")
l1_opt <- c(0, 0.00001, 0.0001, 0.001, 0.01)
l2_opt <- c(0, 0.00001, 0.0001, 0.001, 0.01)

hyper_params <- list(
  activation = activation_opt,
  l1 = l1_opt,
  l2 = l2_opt
)

# Random discrete search, limited to 600 seconds of total runtime.
search_criteria <- list(
  strategy = "RandomDiscrete",
  max_runtime_secs = 600
)

# Models in the grid are compared on a validation split instead of by cross-validation
# (which is "better" but takes longer): the training frame is divided into an 80% piece
# for training and a 20% piece for validation.
splits <- h2o.splitFrame(data = train, ratios = 0.8, seed = 1)

# Train the random grid on the 80% split and validate on the 20% split.
# Fixed non-default parameters (here hidden = c(20, 20) and the seed) are passed directly to
# h2o.grid() and apply to every model in the grid.
dl_grid <- h2o.grid(
  algorithm = "deeplearning",
  grid_id = "dl_grid",
  x = x,
  y = y,
  training_frame = splits[[1]],
  validation_frame = splits[[2]],
  seed = 1,
  hidden = c(20, 20),
  hyper_params = hyper_params,
  search_criteria = search_criteria
)

# Collect the trained grid and rank its models by accuracy, best first.
dl_gridperf <- h2o.getGrid(grid_id = "dl_grid", sort_by = "accuracy", decreasing = TRUE)
print(dl_gridperf)
```




```{r}
# Take the first model id from the sorted grid (the grid was sorted by accuracy,
# decreasing) and fetch the corresponding model object.
best_dl_model_id <- dl_gridperf@model_ids[[1]]
best_dl <- h2o.getModel(model_id = best_dl_model_id)

# Score the selected model on the `test` frame, which was not used to train or rank the
# grid, to get an honest estimate of its performance, and report the test-set MSE.
best_dl_perf <- h2o.performance(best_dl, newdata = test)
h2o.mse(best_dl_perf)

```




```{r}
#==========================================================================================================
# 5. Naive Bayes model
# Naive Bayes (NB) does not usually beat an algorithm like a Random Forest or GBM, but it
# remains popular, especially in the text domain (for example when the input is text encoded
# as a "Bag of Words"). NB handles binary or multiclass classification only, not regression,
# so the response must be a factor rather than a numeric.
#
# Baseline NB model: default parameters only.
#==========================================================================================================

nb_fit1 <- h2o.naiveBayes(
  x = x,
  y = y,
  model_id = "nb_fit1",
  training_frame = train
)


#==========================================================================================================
# NB model with Laplace smoothing.
# The amount of Laplace smoothing is one of the few tunable parameters of Naive Bayes.
# H2O's Naive Bayes applies no Laplace smoothing by default; here it is set to 6.
#==========================================================================================================

nb_fit2 <- h2o.naiveBayes(
  x = x,
  y = y,
  model_id = "nb_fit2",
  training_frame = train,
  laplace = 6
)

#==========================================================================================================
# Compare the two NB models: compute performance metrics for each one on the `test` frame
#==========================================================================================================

nb_perf1 <- h2o.performance(nb_fit1, newdata = test)
nb_perf2 <- h2o.performance(nb_fit2, newdata = test)


#==========================================================================================================

# Print model performance
# Each object below is the h2o.performance() result for one NB model scored on the `test` frame
# (nb_perf1: default settings, nb_perf2: laplace = 6); evaluating it at top level prints the
# full metrics report.
#==========================================================================================================


nb_perf1
nb_perf2


```

