Madrid Mean Temperature Forecast

Read in data, select mean temperature and divide into training and forecasting sets (75:25)

suppressPackageStartupMessages(library(knitr))
suppressPackageStartupMessages(library(caret))
suppressPackageStartupMessages(library(gmodels))
suppressPackageStartupMessages(library(lattice))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(gridExtra))
suppressPackageStartupMessages(library(Kmisc))
suppressPackageStartupMessages(library(ROCR))
suppressPackageStartupMessages(library(corrplot))
# Load the daily Madrid weather observations from the working directory.
madrid <- read.csv(file = "madrid.csv")

# Keep the first and third columns only; the code below reads them as
# temp$CET and temp$Mean.TemperatureC.
temp <- madrid[, c(1L, 3L)]

require(forecast)
require(tseries)

# Position of the 75% cut point in the series.
split_at <- round(length(temp$CET) * .75)

# Training set: everything up to and including the cut point.
xtraintemp <- window(temp$Mean.TemperatureC, end = split_at)

# Forecasting set: everything from the cut point onwards (the cut-point
# observation itself therefore appears in both sets).
xforetemp <- window(temp$Mean.TemperatureC, start = split_at)

Plot Training and Forecasting sets

# Training portion of the series.
plot(xtraintemp, type = "l",
     main = "Madrid mean daily temperature (training set)",
     ylab = "Degrees Celsius", xlab = "Day", col = "blue")

# Forecasting (hold-out) portion of the series.
plot(xforetemp, type = "l",
     main = "Madrid mean daily temperature (forecasting set)",
     ylab = "Degrees Celsius", xlab = "Day", col = "red")

# Every observation as points, with the two subsets drawn on top.
plot(temp$Mean.TemperatureC, type = "p",
     main = "Madrid mean daily temperature, all data",
     ylab = "Degrees Celsius", xlab = "Day")

# Day indices are derived from the subset lengths instead of being
# hard-coded, so the overlay stays aligned if the data or split changes.
# The forecasting set starts on the last training day.
train_days <- seq_along(xtraintemp)
fore_days <- seq(from = length(xtraintemp), length.out = length(xforetemp))
lines(y = xtraintemp, x = train_days, col = "blue")
lines(y = xforetemp, x = fore_days, col = "red")

Simple forecasting methods:

Arithmetic Mean:

# Short aliases used by the forecasting code that follows.
xt <- xtraintemp              # training series
xf <- xforetemp               # hold-out series
x <- temp$Mean.TemperatureC   # full series

# Mean-method forecast over a horizon as long as the hold-out series.
# NOTE: the result is stored under the name `mean`, which shadows base::mean
# as a variable; the name is kept because the accuracy table below uses it.
mean <- meanf(xt, h = length(xf))

# Forecast plot with the full observed series drawn over it.
plot(mean, main = "Arithmetic Mean Method", ylab = "Level", xlab = "Day")
lines(x)

Naive / Random Walk method (with 80% and 95% prediction intervals)

# Naive (random walk) forecast over the hold-out horizon.
rw2 <- rwf(xt, h = length(xf))

plot(rw2, main = "Naïve or Random Walk Method", ylab = "Temperature", xlab = "Day")
lines(x)  # overlay the full observed series

# Random walk with drift. `TRUE` is spelled out because `T` is an ordinary
# variable that can be reassigned.
rwd <- rwf(xt, drift = TRUE, h = length(xf))
plot(rwd, main = "Random Walk with Drift Method", ylab = "Temperature", xlab = "Day")
lines(x)

# ARIMA fit on the training series with an empty seasonal specification.
# NOTE(review): `ari` is not referenced again in the visible part of the file.
ari <- Arima(xt, seasonal = c())

require(knitr)

Assessing Accuracy

Mean method

# Accuracy measures of the mean-method forecast against the hold-out series,
# rendered as a table (training-set and test-set rows).
kable(accuracy(mean,xf))
ME RMSE MAE MPE MAPE MASE ACF1 Theil’s U
Training set 0.000000 7.503106 6.457223 -Inf Inf 4.222349 0.9625232 NA
Test set 1.258595 7.828909 6.799465 -Inf Inf 4.446139 0.9659514 NaN

Random Walk method

# Accuracy measures of the random-walk forecast against the hold-out series,
# rendered as a table (training-set and test-set rows).
kable(accuracy(rw2,xf))
ME RMSE MAE MPE MAPE MASE ACF1 Theil’s U
Training set 0.0019596 2.048501 1.529297 NaN Inf 1.000000 -0.0982006 NA
Test set -1.3973005 7.852402 6.812793 -Inf Inf 4.454854 0.9659514 NaN

Random walk with drift method

# Accuracy measures of the random-walk-with-drift forecast against the
# hold-out series, rendered as a table (training-set and test-set rows).
kable(accuracy(rwd,xf))
ME RMSE MAE MPE MAPE MASE ACF1 Theil’s U
Training set 0.000000 2.048500 1.529737 NaN Inf 1.000288 -0.0982006 NA
Test set -3.067886 8.371454 7.161636 -Inf Inf 4.682961 0.9663510 NaN

Moving Average Methods

Simple Moving Average SMA

# Moving averages of the training series with orders 2, 5, 30 and 120.
sma2 <- ma(xt, order = 2)
sma5 <- ma(xt, order = 5)
sma30 <- ma(xt, order = 30)
sma120 <- ma(xt, order = 120)

# One figure per order: the full series in grey with the smoothed training
# series drawn over it in blue.
sma_series <- list(SMA2 = sma2, SMA5 = sma5, SMA30 = sma30, SMA120 = sma120)
for (sma_label in names(sma_series)) {
  plot(x,
       main = paste0("Simple Moving Average ", sma_label),
       ylab = "Level", xlab = "Day", col = "grey")
  lines(sma_series[[sma_label]], col = "blue")
}

# Forecast each smoothed series over the hold-out horizon and plot it with the
# full observed series on top. The four previously copy-pasted plot calls are
# driven from one list so they cannot drift apart, and `robust = TRUE` is
# spelled out because `T` is a reassignable variable.
sma_models <- list(SMA2 = sma2, SMA5 = sma5, SMA30 = sma30, SMA120 = sma120)
for (sma_label in names(sma_models)) {
  sma_forecast <- forecast(sma_models[[sma_label]], h = length(xf), robust = TRUE)
  plot(sma_forecast,
       main = paste0("Forecast Using Simple Moving Average ", sma_label),
       ylab = "Level", xlab = "Day", col = 4)
  lines(x, col = 3)
}

# For each moving-average series: forecast the hold-out horizon and plot the
# actual hold-out values (red) between the first pair of forecast interval
# bounds (blue; labelled 80% in the titles).
#
# Changes from the four copy-pasted stanzas this replaces:
#   * the hold-out index range is derived from the series lengths instead of
#     the literal 5109:6812;
#   * `robust = TRUE` instead of the reassignable `T`;
#   * axis labels are given explicitly rather than defaulting to the deparsed
#     expressions.
# `f`, `f.up`, `f.low` and `x.plot` are still assigned at top level, so after
# the loop they hold the SMA120 values exactly as before.
sma_models <- list(SMA2 = sma2, SMA5 = sma5, SMA30 = sma30, SMA120 = sma120)

# Observed values over the hold-out period (starts on the last training day).
x.plot <- x[seq(from = length(xt), length.out = length(xf))]
day <- seq_along(x.plot)

for (sma_label in names(sma_models)) {
  f <- forecast(sma_models[[sma_label]], h = length(xf), robust = TRUE)

  # First column of each bounds matrix = first interval level.
  f.up <- f$upper[, 1]
  f.low <- f$lower[, 1]

  plot(x = day, y = x.plot, type = "l",
       ylim = c(min(f.low), max(f.up)),
       col = "red", xlab = "Day", ylab = "Degrees Celsius",
       main = paste0("80% Confidence Interval for ", sma_label,
                     " Forecast vs Actual Values"))
  lines(x = day, y = f.low, col = "blue")
  lines(x = day, y = f.up, col = "blue")
}

ETS forecast

# Fit an ETS model to the training series.
fit.ets <- ets(xt)

# Forecast over the hold-out horizon.
fr.ets <- forecast(fit.ets, h = length(xf))

# Plot the stored forecast (previously it was recomputed just for plotting)
# and overlay the full observed series.
plot(fr.ets)
lines(x)

Seasonality in the data, and variations around it

require(ggplot2)
# Mean temperature as a ts object with 30 observations per period, after
# dropping missing values.
temp_ma <- ts(na.omit(temp$Mean.TemperatureC), frequency = 30)

# STL decomposition with a periodic seasonal window.
decomp <- stl(temp_ma, s.window = "periodic")

plot(decomp)

# Stack the second and third decomposition components in one figure
# (trend and remainder, per the documented column layout of stl()).
par(mfrow = c(2, 1))
plot(decomp$time.series[, 2])
plot(decomp$time.series[, 3])

# Number of missing values per column. colSums(is.na()) works on the data
# frame directly, whereas apply() first coerces it to a character matrix.
# The outer parentheses print the result.
(cols_withNa <- colSums(is.na(madrid)))
##                        CET           Max.TemperatureC 
##                          0                          2 
##          Mean.TemperatureC           Min.TemperatureC 
##                          3                          2 
##                 Dew.PointC             MeanDew.PointC 
##                          2                          2 
##              Min.DewpointC               Max.Humidity 
##                          2                          2 
##              Mean.Humidity               Min.Humidity 
##                          2                          2 
##  Max.Sea.Level.PressurehPa Mean.Sea.Level.PressurehPa 
##                          0                          0 
##  Min.Sea.Level.PressurehPa           Max.VisibilityKm 
##                          0                        940 
##          Mean.VisibilityKm           Min.VisibilitykM 
##                        940                        940 
##         Max.Wind.SpeedKm.h        Mean.Wind.SpeedKm.h 
##                          0                          0 
##         Max.Gust.SpeedKm.h            Precipitationmm 
##                       3306                          0 
##                 CloudCover                     Events 
##                       1372                          0 
##             WindDirDegrees 
##                          0
require(dplyr)
require(tidyr)

# Drop the columns with many missing values (visibility, gust speed and
# cloud cover, per the NA counts above).
madrid <- madrid %>%
  select(-c(Max.VisibilityKm, Mean.VisibilityKm, Min.VisibilitykM,
            Max.Gust.SpeedKm.h, CloudCover))

# Keep only rows without any missing value.
madrid.f <- madrid[complete.cases(madrid), ]

dat <- madrid.f
par(mfrow = c(1, 1))  # back to one plot per figure

# Names of the factor columns. is.factor() is used instead of comparing
# class() to a string, which misbehaves for columns with more than one class.
factor_vars <- names(dat)[vapply(dat, is.factor, logical(1))]

# Parse the date column and rename it.
dat$CET <- as.Date(dat$CET)
names(dat)[1] <- "Date"
require(stringr)

# 1 when the day's Events text mentions "Rain", otherwise 0.
Rain <- ifelse(str_detect(dat$Events, "Rain"), 1, 0)

# RainTomorrow[i] is 1 when it rains on day i + 1; the last day has no
# successor and stays 0.
#
# This replaces a loop over `1:length(Rain)-1`, which R parses as
# `(1:length(Rain)) - 1`, i.e. 0, 1, ..., n - 1: it only worked because the
# assignment to index 0 is silently ignored. The shift below gives the same
# values without relying on that.
n_days <- length(Rain)
RainTomorrow <- numeric(n_days)
if (n_days > 1) {
  RainTomorrow[seq_len(n_days - 1)] <- ifelse(Rain[-1] == 1, 1, 0)
}

dat$RainTomorrow <- RainTomorrow



factor_vars <- names(which(sapply(dat, class) == "factor"))

require(corrplot)

# Numeric matrix of everything except the date and the event text. Columns
# are selected by name rather than by position (previously -c(1, 17)), so the
# code does not break silently if the column order changes.
matrixdat <- as.matrix(dat[, setdiff(names(dat), c("Date", "Events")), drop = FALSE])

# Standardise every column except the 0/1 RainTomorrow indicator.
predictor_cols <- setdiff(colnames(matrixdat), "RainTomorrow")
matrixdat[, predictor_cols] <- scale(matrixdat[, predictor_cols])

# Correlation matrix and its plot.
datcor <- cor(matrixdat)
corrplot(datcor)

md <- as.data.frame(matrixdat)

# Factor columns, kept for reference.
factor_vars <- names(dat)[vapply(dat, is.factor, logical(1))]

# Numeric predictors are chosen by asking each column whether it is numeric,
# not by "everything that is not a factor". Since R 4.0.0 read.csv() keeps
# text columns as character, so the old rule would let `Events` through and
# make cor() fail.
numeric_vars <- names(dat)[vapply(dat, is.numeric, logical(1))]
numeric_vars <- setdiff(numeric_vars, c("RainTomorrow", "Date"))
numeric_vars
##  [1] "Max.TemperatureC"           "Mean.TemperatureC"         
##  [3] "Min.TemperatureC"           "Dew.PointC"                
##  [5] "MeanDew.PointC"             "Min.DewpointC"             
##  [7] "Max.Humidity"               "Mean.Humidity"             
##  [9] "Min.Humidity"               "Max.Sea.Level.PressurehPa" 
## [11] "Mean.Sea.Level.PressurehPa" "Min.Sea.Level.PressurehPa" 
## [13] "Max.Wind.SpeedKm.h"         "Mean.Wind.SpeedKm.h"       
## [15] "Precipitationmm"            "WindDirDegrees"
# Correlation matrix of the numeric predictors, drawn with corrplot.
# drop = FALSE keeps a data frame even if only one predictor is listed.
numeric_vars_mat <- as.matrix(dat[, numeric_vars, drop = FALSE])
numeric_vars_cor <- cor(x = numeric_vars_mat)
corrplot(corr = numeric_vars_cor)

# Recode the 0/1 indicator as text labels. The "N0" spelling (with a zero) is
# kept because the recorded model output further down uses it.
dat$RainTomorrow <- ifelse(dat$RainTomorrow == 1, "Yes", "N0")

# One boxplot per numeric predictor, grouped by RainTomorrow. The column is
# looked up with the `.data` pronoun instead of eval(parse(text = ...)), and
# the colour aesthetic refers to the column rather than to `dat$...`.
gp <- lapply(numeric_vars, function(x) {
  ggplot(data = dat,
         aes(x = RainTomorrow, y = .data[[x]], col = RainTomorrow)) +
    geom_boxplot() +
    xlab("RainTomorrow") +
    ylab(x) +
    ggtitle("") +
    theme(legend.position = "none")
})

# Arrange the plots on 2 x 2 pages; chunk() supplies the plot indices for each
# page (four per page, going by the chunk1..chunk4 output).
grob_plots <- lapply(chunk(1, length(gp), 4), function(x) {
  marrangeGrob(grobs = lapply(gp[x], ggplotGrob), nrow = 2, ncol = 2)
})
grob_plots
## $chunk1

## 
## $chunk2

## 
## $chunk3

## 
## $chunk4

# One density plot per numeric predictor, coloured by RainTomorrow. The
# column is looked up with the `.data` pronoun instead of
# eval(parse(text = ...)).
gp <- lapply(numeric_vars, function(x) {
  ggplot(data = dat, aes(x = .data[[x]], col = RainTomorrow)) +
    geom_density() +
    xlab(x) +
    ggtitle(paste(x, "density", sep = " "))
})

# Arrange the plots on 2 x 2 pages.
grob_plots <- lapply(chunk(1, length(gp), 4), function(x) {
  marrangeGrob(grobs = lapply(gp[x], ggplotGrob), nrow = 2, ncol = 2)
})
grob_plots
## $chunk1

## 
## $chunk2

## 
## $chunk3

## 
## $chunk4

# Same-day rain indicator, recoded as text labels. The "N0" spelling (with a
# zero) is kept because the recorded model output further down uses it.
dat$RainToday <- Rain
dat$RainToday <- ifelse(dat$RainToday == 1, "Yes", "N0")

# One boxplot per numeric predictor, grouped by RainToday. The column is
# looked up with the `.data` pronoun instead of eval(parse(text = ...)), and
# the colour aesthetic refers to the column rather than to `dat$...`.
gp <- lapply(numeric_vars, function(x) {
  ggplot(data = dat,
         aes(x = RainToday, y = .data[[x]], col = RainToday)) +
    geom_boxplot() +
    xlab("RainToday") +
    ylab(x) +
    ggtitle("") +
    theme(legend.position = "none")
})

# Arrange the plots on 2 x 2 pages.
grob_plots <- lapply(chunk(1, length(gp), 4), function(x) {
  marrangeGrob(grobs = lapply(gp[x], ggplotGrob), nrow = 2, ncol = 2)
})
grob_plots
## $chunk1

## 
## $chunk2

## 
## $chunk3

## 
## $chunk4

# One density plot per numeric predictor, coloured by RainToday. The column
# is looked up with the `.data` pronoun instead of eval(parse(text = ...)).
gp <- lapply(numeric_vars, function(x) {
  ggplot(data = dat, aes(x = .data[[x]], col = RainToday)) +
    geom_density() +
    xlab(x) +
    ggtitle(paste(x, "density", sep = " "))
})

# Arrange the plots on 2 x 2 pages.
grob_plots <- lapply(chunk(1, length(gp), 4), function(x) {
  marrangeGrob(grobs = lapply(gp[x], ggplotGrob), nrow = 2, ncol = 2)
})
grob_plots
## $chunk1

## 
## $chunk2

## 
## $chunk3

## 
## $chunk4

# Rain_in2days[i] is 1 when RainToday is "Yes" two rows later; the last two
# rows have no such successor and stay 0. Vectorised shift in place of the
# element-by-element loop over 1:length(...).
n_days <- length(dat$RainToday)
Rain_in2days <- numeric(n_days)
if (n_days > 2) {
  Rain_in2days[seq_len(n_days - 2)] <-
    ifelse(dat$RainToday[-(1:2)] == "Yes", 1, 0)
}

# Store the indicator and recode it as text labels ("N0" spelling kept to
# match the other rain columns).
dat$Rain_in2days <- Rain_in2days
dat$Rain_in2days <- ifelse(dat$Rain_in2days == 1, "Yes", "N0")

# One boxplot per numeric predictor, grouped by Rain_in2days. The column is
# looked up with the `.data` pronoun instead of eval(parse(text = ...)), and
# the colour aesthetic refers to the column rather than to `dat$...`.
gp <- lapply(numeric_vars, function(x) {
  ggplot(data = dat,
         aes(x = Rain_in2days, y = .data[[x]], col = Rain_in2days)) +
    geom_boxplot() +
    xlab("Rain in 2 days") +
    ylab(x) +
    ggtitle("") +
    theme(legend.position = "none")
})

# Arrange the plots on 2 x 2 pages.
grob_plots <- lapply(chunk(1, length(gp), 4), function(x) {
  marrangeGrob(grobs = lapply(gp[x], ggplotGrob), nrow = 2, ncol = 2)
})
grob_plots
## $chunk1

## 
## $chunk2

## 
## $chunk3

## 
## $chunk4

# One density plot per numeric predictor, coloured by Rain_in2days. The
# column is looked up with the `.data` pronoun instead of
# eval(parse(text = ...)).
gp <- lapply(numeric_vars, function(x) {
  ggplot(data = dat, aes(x = .data[[x]], col = Rain_in2days)) +
    geom_density() +
    xlab(x) +
    ggtitle(paste(x, "density", sep = " "))
})

# Arrange the plots on 2 x 2 pages.
grob_plots <- lapply(chunk(1, length(gp), 4), function(x) {
  marrangeGrob(grobs = lapply(gp[x], ggplotGrob), nrow = 2, ncol = 2)
})
grob_plots
## $chunk1

## 
## $chunk2

## 
## $chunk3

## 
## $chunk4

require(kohonen)
# Predictors fed to the self-organising map: entries 2, 5, 8, 11, 14 and 15 of
# numeric_vars (the "Mean.*" measurements and precipitation, going by the
# printed numeric_vars listing).
som_vars <- numeric_vars[c(2, 5, 8, 11, 14, 15)]

# Standardise all numeric predictors, then keep the selected columns.
somdat <- scale(as.matrix(dat[numeric_vars]))
somdat <- somdat[, som_vars]

# 5 x 5 hexagonal map.
som_grid <- somgrid(xdim = 5, ydim = 5, topo = "hexagonal")

# Train for 100 iterations with the learning rate going from 0.05 to 0.01.
som_model <- som(
  somdat,
  grid = som_grid,
  rlen = 100,
  alpha = c(0.05, 0.01),
  keep.data = TRUE
)

# Diagnostic plots of the fitted map, one per plot type.
for (som_plot_type in c("changes", "count", "dist.neighbours", "codes")) {
  plot(som_model, type = som_plot_type)
}

MACHINE LEARNING

# Calendar month ("01".."12") and four-digit year of each observation, as text.
obs_date <- as.Date(dat$Date)
dat$Month <- format(obs_date, "%m")
dat$Year <- format(obs_date, "%Y")

library(h2o)
# Start (or connect to) a local H2O instance.
h2o.init(
  nthreads = -1,        # -1: use all cores on the machine
  max_mem_size = "4G"   # maximum memory to allocate to H2O
)
## 
## H2O is not running yet, starting it now...
## 
## Note:  In case of errors look at the following log files:
##     C:\Users\GERHAR~1\AppData\Local\Temp\RtmpqG2RrY/h2o_Gerhard_Viljoen_started_from_r.out
##     C:\Users\GERHAR~1\AppData\Local\Temp\RtmpqG2RrY/h2o_Gerhard_Viljoen_started_from_r.err
## 
## 
## Starting H2O JVM and connecting: . Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         3 seconds 540 milliseconds 
##     H2O cluster version:        3.16.0.2 
##     H2O cluster version age:    1 month and 29 days  
##     H2O cluster name:           H2O_started_from_R_Gerhard_Viljoen_pbn723 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   3.56 GB 
##     H2O cluster total cores:    4 
##     H2O cluster allowed cores:  4 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     H2O API Extensions:         Algos, AutoML, Core V3, Core V4 
##     R Version:                  R version 3.4.3 (2017-11-30)
#h2o.no_progress()

# Convert the categorical columns to factors in one pass.
categorical_cols <- c("Events", "RainToday", "RainTomorrow",
                      "Rain_in2days", "Year", "Month")
dat[categorical_cols] <- lapply(dat[categorical_cols], as.factor)

# Remove the event text, the precipitation amount and the date before
# modelling.
# NOTE(review): presumably dropped because they reveal or duplicate the rain
# targets -- confirm.
dat <- dat %>%
  select(-c(Events, Precipitationmm, Date))

# Upload the data frame to the H2O cluster.
dat.hex <- as.h2o(dat)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=================================================================| 100%
#dat.hex[,-c(18:20)] <- scale(dat.hex[,-c(18:20)])

# Split the H2O frame into training (70%), validation (15%) and test (the
# remaining 15%) frames; the fixed seed makes the split reproducible.
splits <- h2o.splitFrame(dat.hex, ratios = c(0.7, 0.15), seed = 1)

train <- splits[[1]]
valid <- splits[[2]]
test <- splits[[3]]

# The three response columns.
y1 <- "RainToday"
y2 <- "RainTomorrow"
y3 <- "Rain_in2days"

# Predictors: every column that is not one of the responses.
# Note that this reuses the name `x`, replacing the temperature series.
x <- setdiff(names(dat.hex), c(y1, y2, y3))

print(x)
##  [1] "Max.TemperatureC"           "Mean.TemperatureC"         
##  [3] "Min.TemperatureC"           "Dew.PointC"                
##  [5] "MeanDew.PointC"             "Min.DewpointC"             
##  [7] "Max.Humidity"               "Mean.Humidity"             
##  [9] "Min.Humidity"               "Max.Sea.Level.PressurehPa" 
## [11] "Mean.Sea.Level.PressurehPa" "Min.Sea.Level.PressurehPa" 
## [13] "Max.Wind.SpeedKm.h"         "Mean.Wind.SpeedKm.h"       
## [15] "WindDirDegrees"             "Month"                     
## [17] "Year"

RAIN TODAY

Basic Binomial Generalized Linear Model

# Baseline binomial GLM for the RainToday response, fitted on the training
# frame only.
glm_fit1 <- h2o.glm(
  x = x,
  y = y1,
  training_frame = train,
  model_id = "glm_fit1",
  family = "binomial"  # like stats::glm, h2o.glm takes a family argument
)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=================================================================| 100%
# Second GLM with automatic tuning of the regularisation strength.
# `lambda` controls how much regularisation the GLM applies (to avoid
# overfitting). With `lambda_search = TRUE` and a validation frame, h2o
# searches for a suitable `lambda`, using the validation frame to evaluate
# model performance for each candidate value.
glm_fit2 <- h2o.glm(
  x = x,
  y = y1,
  training_frame = train,
  model_id = "glm_fit2",
  validation_frame = valid,
  family = "binomial",
  lambda_search = TRUE
)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=                                                                |   1%
  |                                                                       
  |=================================================================| 100%
# Score both GLMs on the held-out test frame so they can be compared.
glm_perf1 <- h2o.performance(glm_fit1, newdata = test)
glm_perf2 <- h2o.performance(glm_fit2, newdata = test)

# Full test-set metrics for the baseline GLM.
glm_perf1
## H2OBinomialMetrics: glm
## 
## MSE:  0.06893514
## RMSE:  0.262555
## LogLoss:  0.2314325
## Mean Per-Class Error:  0.1571695
## AUC:  0.9437533
## Gini:  0.8875065
## R^2:  0.5792355
## Residual Deviance:  473.0479
## AIC:  561.0479
## 
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
##         N0 Yes    Error      Rate
## N0     779  32 0.039457   =32/811
## Yes     58 153 0.274882   =58/211
## Totals 837 185 0.088063  =90/1022
## 
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold    value idx
## 1                       max f1  0.508712 0.772727 144
## 2                       max f2  0.179051 0.824698 240
## 3                 max f0point5  0.617765 0.810489 118
## 4                 max accuracy  0.508712 0.911937 144
## 5                max precision  0.999272 1.000000   0
## 6                   max recall  0.001937 1.000000 392
## 7              max specificity  0.999272 1.000000   0
## 8             max absolute_mcc  0.508712 0.720796 144
## 9   max min_per_class_accuracy  0.211979 0.865598 226
## 10 max mean_per_class_accuracy  0.179051 0.876774 240
## 
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
# Full test-set metrics for the lambda-search GLM (glm_fit2).
glm_perf2
## H2OBinomialMetrics: glm
## 
## MSE:  0.0690964
## RMSE:  0.2628619
## LogLoss:  0.2316171
## Mean Per-Class Error:  0.1548962
## AUC:  0.9438117
## Gini:  0.8876234
## R^2:  0.5782512
## Residual Deviance:  473.4253
## AIC:  563.4253
## 
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
##         N0 Yes    Error      Rate
## N0     775  36 0.044390   =36/811
## Yes     56 155 0.265403   =56/211
## Totals 831 191 0.090020  =92/1022
## 
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold    value idx
## 1                       max f1  0.477549 0.771144 152
## 2                       max f2  0.161318 0.826162 247
## 3                 max f0point5  0.656427 0.814095 119
## 4                 max accuracy  0.507839 0.910959 147
## 5                max precision  0.999323 1.000000   0
## 6                   max recall  0.001606 1.000000 392
## 7              max specificity  0.999323 1.000000   0
## 8             max absolute_mcc  0.507839 0.717992 147
## 9   max min_per_class_accuracy  0.218971 0.872038 221
## 10 max mean_per_class_accuracy  0.161318 0.877294 247
## 
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
# Instead of printing the entire model performance metrics object, 
# it is usually easier to print just the metric being compared.
# Retrieve the test-set AUC of each GLM
h2o.auc(glm_perf1)  
## [1] 0.9437533
h2o.auc(glm_perf2)  
## [1] 0.9438117
# Compare the test AUC with the training AUC and the validation AUC of glm_fit2
h2o.auc(glm_fit2, train = TRUE)  
## [1] 0.9443575
h2o.auc(glm_fit2, valid = TRUE)  
## [1] 0.950664
#glm_fit2@model$validation_metrics
# Test-set confusion matrices at the threshold that maximises F1
h2o.confusionMatrix(glm_perf1)
## Confusion Matrix (vertical: actual; across: predicted)  for max f1 @ threshold = 0.508711960103498:
##         N0 Yes    Error      Rate
## N0     779  32 0.039457   =32/811
## Yes     58 153 0.274882   =58/211
## Totals 837 185 0.088063  =90/1022
h2o.confusionMatrix(glm_perf2)
## Confusion Matrix (vertical: actual; across: predicted)  for max f1 @ threshold = 0.477548815356572:
##         N0 Yes    Error      Rate
## N0     775  36 0.044390   =36/811
## Yes     56 155 0.265403   =56/211
## Totals 831 191 0.090020  =92/1022

RANDOM FOREST

# Baseline random forest for the RainToday response, fitted on the training
# frame; the seed fixes the random number stream.
rf_fit1 <- h2o.randomForest(
  x = x,
  y = y1,
  training_frame = train,
  model_id = "rf_fit1",
  seed = 1
)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |==============================================================   |  96%
  |                                                                       
  |=================================================================| 100%
# Random forest with early stopping: a very large tree budget is allowed, and
# training is scored every 5 trees and stopped according to the
# stopping_* settings below.
rf_fit2 <- h2o.randomForest(
  x = x,
  y = y1,
  training_frame = train,
  model_id = "rf_fit2",
  validation_frame = valid,     # only used if stopping_rounds > 0
  ntrees = 100000,
  score_tree_interval = 5,      # early stopping: scoring frequency
  stopping_rounds = 3,          # early stopping: rounds considered
  stopping_metric = "AUC",      # early stopping: metric watched
  stopping_tolerance = 0.0005,  # early stopping: tolerance
  seed = 1
)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=================================================================| 100%
# Score both random forests on the held-out test frame so they can be compared
rf_perf1 <- h2o.performance(model = rf_fit1,
                            newdata = test)
rf_perf2 <- h2o.performance(model = rf_fit2,
                            newdata = test)

# Print the test-set AUC of each forest

h2o.auc(rf_perf1)
## [1] 0.9336464
h2o.auc(rf_perf2) 
## [1] 0.9369803
## [1] 0.9369803
# Cross-validated random forest: same settings as rf_fit1, plus 5-fold
# cross-validation on the training frame.
rf_fit3 <- h2o.randomForest(
  x = x,
  y = y1,
  training_frame = train,
  model_id = "rf_fit3",
  seed = 1,
  nfolds = 5
)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=====                                                            |   8%
  |                                                                       
  |=========================================                        |  64%
  |                                                                       
  |======================================================           |  83%
  |                                                                       
  |=================================================================| 100%
# Cross-validated AUC of rf_fit3 (xval = TRUE selects the cross-validation metrics)
h2o.auc(rf_fit3, xval = TRUE) 
## [1] 0.9245813
# Test-set confusion matrices of the first two forests at the max-F1 threshold
h2o.confusionMatrix(rf_perf1)
## Confusion Matrix (vertical: actual; across: predicted)  for max f1 @ threshold = 0.42:
##         N0 Yes    Error       Rate
## N0     764  47 0.057953    =47/811
## Yes     53 158 0.251185    =53/211
## Totals 817 205 0.097847  =100/1022
h2o.confusionMatrix(rf_perf2)
## Confusion Matrix (vertical: actual; across: predicted)  for max f1 @ threshold = 0.4125:
##         N0 Yes    Error       Rate
## N0     764  47 0.057953    =47/811
## Yes     55 156 0.260664    =55/211
## Totals 819 203 0.099804  =102/1022

Gradient Boosting Machine

# Baseline gradient boosting machine for the RainToday response, fitted on the
# training frame; the seed fixes the random number stream.
gbm_fit1 <- h2o.gbm(
  x = x,
  y = y1,
  training_frame = train,
  model_id = "gbm_fit1",
  seed = 1
)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=================================================================| 100%
# GBM with early stopping and a budget of up to 500 trees: training is scored
# every 5 trees and stopped according to the stopping_* settings below.
gbm_fit2 <- h2o.gbm(
  x = x,
  y = y1,
  training_frame = train,
  model_id = "gbm_fit2",
  validation_frame = valid,     # only used if stopping_rounds > 0
  ntrees = 500,
  score_tree_interval = 5,      # early stopping: scoring frequency
  stopping_rounds = 3,          # early stopping: rounds considered
  stopping_metric = "AUC",      # early stopping: metric watched
  stopping_tolerance = 0.0005,  # early stopping: tolerance
  seed = 1
)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=                                                                |   2%
  |                                                                       
  |=================================================================| 100%
# Same early-stopping GBM as gbm_fit2, but with a budget of up to 50000 trees.
gbm_fit3 <- h2o.gbm(
  x = x,
  y = y1,
  training_frame = train,
  model_id = "gbm_fit3",
  validation_frame = valid,     # only used if stopping_rounds > 0
  ntrees = 50000,
  score_tree_interval = 5,      # early stopping: scoring frequency
  stopping_rounds = 3,          # early stopping: rounds considered
  stopping_metric = "AUC",      # early stopping: metric watched
  stopping_tolerance = 0.0005,  # early stopping: tolerance
  seed = 1
)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=================================================================| 100%
# Score the three GBMs on the held-out test frame.
gbm_perf1 <- h2o.performance(gbm_fit1, newdata = test)
gbm_perf2 <- h2o.performance(gbm_fit2, newdata = test)
gbm_perf3 <- h2o.performance(gbm_fit3, newdata = test)

# Test-set AUC of each GBM (the two early-stopped models score identically)
h2o.auc(gbm_perf1)  
## [1] 0.9456116
h2o.auc(gbm_perf2) 
## [1] 0.946944
h2o.auc(gbm_perf3) 
## [1] 0.946944
# Per-checkpoint training and validation metrics of gbm_fit2
h2o.scoreHistory(gbm_fit2)
## Scoring History: 
##             timestamp   duration number_of_trees training_rmse
## 1 2018-01-28 21:01:19  0.000 sec               0       0.41424
## 2 2018-01-28 21:01:19  0.032 sec               5       0.33477
## 3 2018-01-28 21:01:19  0.063 sec              10       0.29373
## 4 2018-01-28 21:01:19  0.107 sec              15       0.26949
## 5 2018-01-28 21:01:19  0.153 sec              20       0.25432
##   training_logloss training_auc training_lift
## 1          0.52690      0.50000       1.00000
## 2          0.37441      0.94039       4.54554
## 3          0.30527      0.95214       4.54554
## 4          0.26394      0.95846       4.54554
## 5          0.23658      0.96453       4.54554
##   training_classification_error validation_rmse validation_logloss
## 1                       0.78000         0.42489            0.54709
## 2                       0.10081         0.35010            0.39967
## 3                       0.08913         0.31656            0.34011
## 4                       0.08328         0.30122            0.31031
## 5                       0.07765         0.29461            0.29490
##   validation_auc validation_lift validation_classification_error
## 1        0.50000         1.00000                         0.76406
## 2        0.91773         4.23830                         0.11345
## 3        0.92434         4.23830                         0.11546
## 4        0.92637         4.23830                         0.10542
## 5        0.92795         4.23830                         0.10643
## 
## ---
##              timestamp   duration number_of_trees training_rmse
## 23 2018-01-28 21:01:19  0.775 sec             110       0.15847
## 24 2018-01-28 21:01:19  0.822 sec             115       0.15490
## 25 2018-01-28 21:01:19  0.853 sec             120       0.15165
## 26 2018-01-28 21:01:19  0.885 sec             125       0.14784
## 27 2018-01-28 21:01:19  0.916 sec             130       0.14553
## 28 2018-01-28 21:01:20  0.947 sec             135       0.14212
##    training_logloss training_auc training_lift
## 23          0.10434      0.99553       4.54554
## 24          0.10088      0.99605       4.54554
## 25          0.09789      0.99654       4.54554
## 26          0.09458      0.99699       4.54554
## 27          0.09248      0.99721       4.54554
## 28          0.08946      0.99750       4.54554
##    training_classification_error validation_rmse validation_logloss
## 23                       0.02588         0.27654            0.25339
## 24                       0.02379         0.27586            0.25214
## 25                       0.02254         0.27565            0.25178
## 26                       0.02108         0.27558            0.25205
## 27                       0.01941         0.27584            0.25245
## 28                       0.01774         0.27580            0.25253
##    validation_auc validation_lift validation_classification_error
## 23        0.94168         4.23830                         0.10843
## 24        0.94262         4.23830                         0.10743
## 25        0.94275         4.23830                         0.10643
## 26        0.94252         4.23830                         0.10643
## 27        0.94240         4.23830                         0.10743
## 28        0.94250         4.23830                         0.10643
# Per-checkpoint training and validation metrics of gbm_fit3
h2o.scoreHistory(gbm_fit3)
## Scoring History: 
##             timestamp   duration number_of_trees training_rmse
## 1 2018-01-28 21:01:20  0.000 sec               0       0.41424
## 2 2018-01-28 21:01:20  0.047 sec               5       0.33477
## 3 2018-01-28 21:01:20  0.078 sec              10       0.29373
## 4 2018-01-28 21:01:20  0.125 sec              15       0.26949
## 5 2018-01-28 21:01:20  0.156 sec              20       0.25432
##   training_logloss training_auc training_lift
## 1          0.52690      0.50000       1.00000
## 2          0.37441      0.94039       4.54554
## 3          0.30527      0.95214       4.54554
## 4          0.26394      0.95846       4.54554
## 5          0.23658      0.96453       4.54554
##   training_classification_error validation_rmse validation_logloss
## 1                       0.78000         0.42489            0.54709
## 2                       0.10081         0.35010            0.39967
## 3                       0.08913         0.31656            0.34011
## 4                       0.08328         0.30122            0.31031
## 5                       0.07765         0.29461            0.29490
##   validation_auc validation_lift validation_classification_error
## 1        0.50000         1.00000                         0.76406
## 2        0.91773         4.23830                         0.11345
## 3        0.92434         4.23830                         0.11546
## 4        0.92637         4.23830                         0.10542
## 5        0.92795         4.23830                         0.10643
## 
## ---
##              timestamp   duration number_of_trees training_rmse
## 23 2018-01-28 21:01:21  0.737 sec             110       0.15847
## 24 2018-01-28 21:01:21  0.762 sec             115       0.15490
## 25 2018-01-28 21:01:21  0.794 sec             120       0.15165
## 26 2018-01-28 21:01:21  0.825 sec             125       0.14784
## 27 2018-01-28 21:01:21  0.856 sec             130       0.14553
## 28 2018-01-28 21:01:21  0.888 sec             135       0.14212
##    training_logloss training_auc training_lift
## 23          0.10434      0.99553       4.54554
## 24          0.10088      0.99605       4.54554
## 25          0.09789      0.99654       4.54554
## 26          0.09458      0.99699       4.54554
## 27          0.09248      0.99721       4.54554
## 28          0.08946      0.99750       4.54554
##    training_classification_error validation_rmse validation_logloss
## 23                       0.02588         0.27654            0.25339
## 24                       0.02379         0.27586            0.25214
## 25                       0.02254         0.27565            0.25178
## 26                       0.02108         0.27558            0.25205
## 27                       0.01941         0.27584            0.25245
## 28                       0.01774         0.27580            0.25253
##    validation_auc validation_lift validation_classification_error
## 23        0.94168         4.23830                         0.10843
## 24        0.94262         4.23830                         0.10743
## 25        0.94275         4.23830                         0.10643
## 26        0.94252         4.23830                         0.10643
## 27        0.94240         4.23830                         0.10743
## 28        0.94250         4.23830                         0.10643
# Scoring-history curves for gbm_fit3, indexed by the number of trees.
# First plot: AUC.
plot(gbm_fit3, metric = "AUC", timestep = "number_of_trees")

# Second plot: logloss.
plot(gbm_fit3, metric = "logloss", timestep = "number_of_trees")

# Small GBM with only 5 trees and a fixed seed for reproducibility.
# model_id mirrors the R variable name (as the dl_fit* models do) so this
# model gets its own key in the H2O cluster instead of reusing "gbm_fit1".
gbm_fit4 <- h2o.gbm(
  x = x,
  y = y1,
  training_frame = train,
  model_id = "gbm_fit4",
  ntrees = 5,
  seed = 1
)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=================================================================| 100%
# Score gbm_fit4 on the test frame, then report its AUC.
gbm_perf4 <- h2o.performance(newdata = test, model = gbm_fit4)

h2o.auc(gbm_perf4)
## [1] 0.922321
# Confusion matrices for the four GBM performance objects, printed one after
# another for comparison. Per the echoed headers, each matrix is reported at
# the threshold that maximises F1 for that model.
h2o.confusionMatrix(gbm_perf1)
## Confusion Matrix (vertical: actual; across: predicted)  for max f1 @ threshold = 0.387829013222046:
##         N0 Yes    Error      Rate
## N0     766  45 0.055487   =45/811
## Yes     49 162 0.232227   =49/211
## Totals 815 207 0.091977  =94/1022
h2o.confusionMatrix(gbm_perf2)
## Confusion Matrix (vertical: actual; across: predicted)  for max f1 @ threshold = 0.360968775857713:
##         N0 Yes    Error      Rate
## N0     762  49 0.060419   =49/811
## Yes     46 165 0.218009   =46/211
## Totals 808 214 0.092955  =95/1022
h2o.confusionMatrix(gbm_perf3)
## Confusion Matrix (vertical: actual; across: predicted)  for max f1 @ threshold = 0.360968775857713:
##         N0 Yes    Error      Rate
## N0     762  49 0.060419   =49/811
## Yes     46 165 0.218009   =46/211
## Totals 808 214 0.092955  =95/1022
h2o.confusionMatrix(gbm_perf4)
## Confusion Matrix (vertical: actual; across: predicted)  for max f1 @ threshold = 0.250636786717922:
##         N0 Yes    Error       Rate
## N0     741  70 0.086313    =70/811
## Yes     50 161 0.236967    =50/211
## Totals 791 231 0.117417  =120/1022

Deep Learning

# Baseline deep learning model: no tuning arguments are set, only a fixed seed.
dl_fit1 <- h2o.deeplearning(
  training_frame = train,
  x = x,
  y = y1,
  model_id = "dl_fit1",
  seed = 1
)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |====================                                             |  30%
  |                                                                       
  |================================                                 |  50%
  |                                                                       
  |==============================================                   |  70%
  |                                                                       
  |==========================================================       |  90%
  |                                                                       
  |=================================================================| 100%
# Second deep learning model: two hidden layers of 10 units, 20 epochs, and
# early stopping switched off (stopping_rounds = 0).
dl_fit2 <- h2o.deeplearning(
  training_frame = train,
  # validation_frame = valid,  # only used if stopping_rounds > 0
  x = x,
  y = y1,
  model_id = "dl_fit2",
  hidden = c(10, 10),
  epochs = 20,
  stopping_rounds = 0,  # disable early stopping
  seed = 1
)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |==============================================                   |  70%
  |                                                                       
  |=================================================================| 100%
# Third deep learning model: same two 10-unit hidden layers, up to 200 epochs,
# with a validation frame supplied and AUC-based early stopping configured.
dl_fit3 <- h2o.deeplearning(
  training_frame = train,
  validation_frame = valid,  # in DL, early stopping is on by default
  x = x,
  y = y1,
  model_id = "dl_fit3",
  hidden = c(10, 10),
  epochs = 200,
  # The next four arguments control early stopping.
  score_interval = 1,
  stopping_metric = "AUC",
  stopping_rounds = 3,
  stopping_tolerance = 0.005,
  seed = 1
)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |===                                                              |   5%
  |                                                                       
  |====================================                             |  55%
  |                                                                       
  |==============================================================   |  95%
  |                                                                       
  |=================================================================| 100%
# Evaluate each of the first three deep learning models on the test frame.
dl_perf1 <- h2o.performance(newdata = test, model = dl_fit1)
dl_perf2 <- h2o.performance(newdata = test, model = dl_fit2)
dl_perf3 <- h2o.performance(newdata = test, model = dl_fit3)

# AUC of each deep learning model on the test frame; the trailing values are
# the ones echoed by this run.
h2o.auc(dl_perf1)  # 0.9449045
## [1] 0.9449045
h2o.auc(dl_perf2)  # 0.9446298
## [1] 0.9446298
h2o.auc(dl_perf3) # 0.9404866
## [1] 0.9404866
# AUC of dl_fit3 across training epochs.
plot(dl_fit3, metric = "AUC", timestep = "epochs")

# Fourth deep learning model: a deeper seven-layer network, up to 200 epochs,
# with AUC-based early stopping and a stopping tolerance of 0.0005.
dl_fit4 <- h2o.deeplearning(
  training_frame = train,
  validation_frame = valid,  # in DL, early stopping is on by default
  x = x,
  y = y1,
  model_id = "dl_fit4",
  hidden = c(32, 64, 32, 128, 32, 64, 32),
  epochs = 200,
  # The next four arguments control early stopping.
  score_interval = 1,
  stopping_metric = "AUC",
  stopping_rounds = 3,
  stopping_tolerance = 0.0005,
  seed = 1
)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |===                                                              |   5%
  |                                                                       
  |==========                                                       |  15%
  |                                                                       
  |================                                                 |  25%
  |                                                                       
  |=================================================================| 100%
# Score dl_fit4 on the test frame, then report its AUC.
dl_perf4 <- h2o.performance(newdata = test, model = dl_fit4)

h2o.auc(dl_perf4)
## [1] 0.9396889
# Confusion matrices for the four deep learning performance objects, printed
# one after another for comparison. Per the echoed headers, each matrix is
# reported at the threshold that maximises F1 for that model.
h2o.confusionMatrix(dl_perf1)
## Confusion Matrix (vertical: actual; across: predicted)  for max f1 @ threshold = 0.487012442242802:
##         N0 Yes    Error      Rate
## N0     766  45 0.055487   =45/811
## Yes     47 164 0.222749   =47/211
## Totals 813 209 0.090020  =92/1022
h2o.confusionMatrix(dl_perf2)
## Confusion Matrix (vertical: actual; across: predicted)  for max f1 @ threshold = 0.44685584564985:
##         N0 Yes    Error      Rate
## N0     765  46 0.056720   =46/811
## Yes     48 163 0.227488   =48/211
## Totals 813 209 0.091977  =94/1022
h2o.confusionMatrix(dl_perf3)
## Confusion Matrix (vertical: actual; across: predicted)  for max f1 @ threshold = 0.464838685057443:
##         N0 Yes    Error      Rate
## N0     774  37 0.045623   =37/811
## Yes     50 161 0.236967   =50/211
## Totals 824 198 0.085127  =87/1022
h2o.confusionMatrix(dl_perf4)
## Confusion Matrix (vertical: actual; across: predicted)  for max f1 @ threshold = 0.338893274470836:
##         N0 Yes    Error       Rate
## N0     730  81 0.099877    =81/811
## Yes     31 180 0.146919    =31/211
## Totals 761 261 0.109589  =112/1022

CARTESIAN GRID SEARCHES

# Hyper-parameter values for the full grid:
# 2 * 3 * 2 * 3 = 36 combinations in total.
gbm_params11 <- list(
  learn_rate = c(0.01, 0.1),
  max_depth = c(3, 5, 9),
  sample_rate = c(0.8, 1),
  col_sample_rate = c(0.2, 0.5, 1)
)

# Fit one 100-tree GBM per combination, with the validation frame supplied
# so each model is scored on it.
gbm_grid11 <- h2o.grid(
  "gbm",
  x = x,
  y = y1,
  grid_id = "gbm_grid11",
  training_frame = train,
  validation_frame = valid,
  ntrees = 100,
  seed = 1,
  hyper_params = gbm_params11
)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |==                                                               |   3%
  |                                                                       
  |===                                                              |   5%
  |                                                                       
  |=====                                                            |   8%
  |                                                                       
  |=======                                                          |  10%
  |                                                                       
  |========                                                         |  13%
  |                                                                       
  |==========                                                       |  15%
  |                                                                       
  |===========                                                      |  17%
  |                                                                       
  |=============                                                    |  19%
  |                                                                       
  |==============                                                   |  22%
  |                                                                       
  |================                                                 |  24%
  |                                                                       
  |=================                                                |  26%
  |                                                                       
  |==================                                               |  28%
  |                                                                       
  |====================                                             |  30%
  |                                                                       
  |=====================                                            |  32%
  |                                                                       
  |======================                                           |  34%
  |                                                                       
  |========================                                         |  36%
  |                                                                       
  |=========================                                        |  38%
  |                                                                       
  |==========================                                       |  40%
  |                                                                       
  |===========================                                      |  41%
  |                                                                       
  |============================                                     |  43%
  |                                                                       
  |=============================                                    |  45%
  |                                                                       
  |==============================                                   |  47%
  |                                                                       
  |===============================                                  |  48%
  |                                                                       
  |================================                                 |  50%
  |                                                                       
  |==================================                               |  52%
  |                                                                       
  |====================================                             |  55%
  |                                                                       
  |======================================                           |  58%
  |                                                                       
  |========================================                         |  61%
  |                                                                       
  |=========================================                        |  63%
  |                                                                       
  |===========================================                      |  66%
  |                                                                       
  |============================================                     |  68%
  |                                                                       
  |==============================================                   |  70%
  |                                                                       
  |===============================================                  |  72%
  |                                                                       
  |=================================================                |  75%
  |                                                                       
  |==================================================               |  77%
  |                                                                       
  |===================================================              |  79%
  |                                                                       
  |=====================================================            |  81%
  |                                                                       
  |======================================================           |  83%
  |                                                                       
  |========================================================         |  85%
  |                                                                       
  |=========================================================        |  87%
  |                                                                       
  |==========================================================       |  89%
  |                                                                       
  |===========================================================      |  91%
  |                                                                       
  |============================================================     |  92%
  |                                                                       
  |=============================================================    |  94%
  |                                                                       
  |==============================================================   |  96%
  |                                                                       
  |================================================================ |  98%
  |                                                                       
  |=================================================================|  99%
  |                                                                       
  |=================================================================| 100%
# Fetch gbm_grid11 with its models ranked from highest to lowest AUC,
# then print the summary table.
gbm_gridperf11 <- h2o.getGrid(
  grid_id = "gbm_grid11",
  decreasing = TRUE,
  sort_by = "auc"
)
print(gbm_gridperf11)
## H2O Grid Details
## ================
## 
## Grid ID: gbm_grid11 
## Used hyper parameters: 
##   -  col_sample_rate 
##   -  learn_rate 
##   -  max_depth 
##   -  sample_rate 
## Number of models: 36 
## Number of failed models: 0 
## 
## Hyper-Parameter Search Summary: ordered by decreasing auc
##   col_sample_rate learn_rate max_depth sample_rate           model_ids
## 1             0.5        0.1         5         1.0 gbm_grid11_model_28
## 2             1.0        0.1         5         0.8 gbm_grid11_model_11
## 3             0.2        0.1         5         1.0 gbm_grid11_model_27
## 4             1.0        0.1         3         1.0 gbm_grid11_model_23
## 5             1.0        0.1         9         0.8 gbm_grid11_model_17
##                  auc
## 1 0.9448122571085079
## 2 0.9439399446417088
## 3  0.943707887158554
## 4 0.9422736041602594
## 5 0.9420946682696341
## 
## ---
##    col_sample_rate learn_rate max_depth sample_rate           model_ids
## 31             1.0       0.01         3         0.8  gbm_grid11_model_2
## 32             1.0       0.01         3         1.0 gbm_grid11_model_20
## 33             0.5       0.01         3         0.8  gbm_grid11_model_1
## 34             0.5       0.01         3         1.0 gbm_grid11_model_19
## 35             0.2       0.01         3         0.8  gbm_grid11_model_0
## 36             0.2       0.01         3         1.0 gbm_grid11_model_18
##                   auc
## 31  0.916940196270305
## 32 0.9167696479995527
## 33 0.9148684541616574
## 34 0.9134425587832359
## 35 0.9127324069673163
## 36 0.9125422875835267
# Candidate values for the random search: a much denser lattice than a full
# grid could cover, so only a sample of combinations is trained.
gbm_params21 <- list(
  learn_rate = seq(from = 0.001, to = 0.1, by = 0.001),
  max_depth = seq(from = 2, to = 10, by = 1),
  sample_rate = seq(from = 0.3, to = 1.0, by = 0.05),
  col_sample_rate = seq(from = 0.1, to = 1.0, by = 0.05)
)

# Draw combinations at random and stop after 36 models.
search_criteria21 <- list(
  strategy = "RandomDiscrete",
  max_models = 36
)

# Train and validate the sampled GBMs (100 trees each).
gbm_grid21 <- h2o.grid(
  "gbm",
  x = x,
  y = y1,
  grid_id = "gbm_grid21",
  training_frame = train,
  validation_frame = valid,
  ntrees = 100,
  seed = 1,
  hyper_params = gbm_params21,
  search_criteria = search_criteria21
)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=                                                                |   2%
  |                                                                       
  |===                                                              |   4%
  |                                                                       
  |====                                                             |   5%
  |                                                                       
  |=====                                                            |   8%
  |                                                                       
  |======                                                           |  10%
  |                                                                       
  |========                                                         |  12%
  |                                                                       
  |=========                                                        |  14%
  |                                                                       
  |==========                                                       |  16%
  |                                                                       
  |============                                                     |  18%
  |                                                                       
  |=============                                                    |  20%
  |                                                                       
  |==============                                                   |  22%
  |                                                                       
  |================                                                 |  25%
  |                                                                       
  |==================                                               |  27%
  |                                                                       
  |===================                                              |  29%
  |                                                                       
  |====================                                             |  32%
  |                                                                       
  |======================                                           |  34%
  |                                                                       
  |=======================                                          |  36%
  |                                                                       
  |========================                                         |  38%
  |                                                                       
  |=========================                                        |  39%
  |                                                                       
  |===========================                                      |  42%
  |                                                                       
  |=============================                                    |  44%
  |                                                                       
  |==============================                                   |  47%
  |                                                                       
  |================================                                 |  49%
  |                                                                       
  |==================================                               |  52%
  |                                                                       
  |===================================                              |  54%
  |                                                                       
  |====================================                             |  56%
  |                                                                       
  |======================================                           |  58%
  |                                                                       
  |=======================================                          |  60%
  |                                                                       
  |========================================                         |  62%
  |                                                                       
  |==========================================                       |  64%
  |                                                                       
  |===========================================                      |  66%
  |                                                                       
  |============================================                     |  68%
  |                                                                       
  |==============================================                   |  70%
  |                                                                       
  |===============================================                  |  73%
  |                                                                       
  |================================================                 |  75%
  |                                                                       
  |==================================================               |  76%
  |                                                                       
  |===================================================              |  78%
  |                                                                       
  |=====================================================            |  81%
  |                                                                       
  |======================================================           |  83%
  |                                                                       
  |=======================================================          |  85%
  |                                                                       
  |========================================================         |  87%
  |                                                                       
  |=========================================================        |  88%
  |                                                                       
  |==========================================================       |  90%
  |                                                                       
  |===========================================================      |  92%
  |                                                                       
  |=============================================================    |  93%
  |                                                                       
  |==============================================================   |  95%
  |                                                                       
  |===============================================================  |  97%
  |                                                                       
  |================================================================ |  99%
  |                                                                       
  |=================================================================| 100%
# Fetch gbm_grid21 with its models ranked from highest to lowest AUC,
# then print the summary table.
gbm_gridperf21 <- h2o.getGrid(
  grid_id = "gbm_grid21",
  decreasing = TRUE,
  sort_by = "auc"
)
print(gbm_gridperf21)
## H2O Grid Details
## ================
## 
## Grid ID: gbm_grid21 
## Used hyper parameters: 
##   -  col_sample_rate 
##   -  learn_rate 
##   -  max_depth 
##   -  sample_rate 
## Number of models: 36 
## Number of failed models: 0 
## 
## Hyper-Parameter Search Summary: ordered by decreasing auc
##   col_sample_rate learn_rate max_depth sample_rate           model_ids
## 1            0.65      0.059         9         0.5 gbm_grid21_model_22
## 2             0.4      0.067         7        0.75 gbm_grid21_model_34
## 3             0.5      0.052         8         0.6 gbm_grid21_model_19
## 4             0.4      0.062         6         0.9  gbm_grid21_model_2
## 5             0.7      0.052         4         0.8 gbm_grid21_model_21
##                  auc
## 1 0.9422344619341851
## 2 0.9416724914026896
## 3  0.941339782481058
## 4 0.9406212430452652
## 5 0.9401599239522465
## 
## ---
##    col_sample_rate learn_rate max_depth sample_rate           model_ids
## 31             0.4      0.079         2        0.45 gbm_grid21_model_18
## 32            0.85      0.012         6         0.3 gbm_grid21_model_32
## 33             0.7      0.011         4        0.95  gbm_grid21_model_8
## 34             0.9       0.01         4        0.55  gbm_grid21_model_6
## 35             0.4      0.042         2         0.5 gbm_grid21_model_16
## 36            0.15      0.008         2        0.65 gbm_grid21_model_11
##                   auc
## 31 0.9301031677244388
## 32 0.9257220342774066
## 33 0.9240165515698828
## 34 0.9237201890010345
## 35    0.9235440489837
## 36 0.8947465540861689
# Updated random search over GBM hyper-parameters (grid "gbm_grid31").

# Candidate values for each tuned hyper-parameter; the random search below
# draws combinations from these sequences.
gbm_params31 <- list(learn_rate = seq(0.02, 0.05, 0.0001),
                     max_depth = seq(2, 10, 1),
                     sample_rate = seq(0.6, 0.9, 0.005),
                     col_sample_rate = seq(0.5, 0.8, 0.005))

# Random discrete search capped at 36 models. The search needs its own seed
# for the sampled combinations to be repeatable between runs; the `seed`
# argument given to h2o.grid() below is a model parameter and (per the H2O
# grid-search documentation) does not seed the search itself.
search_criteria31 <- list(strategy = "RandomDiscrete",
                          max_models = 36,
                          seed = 1)

# Train and validate a grid of GBMs
gbm_grid31 <- h2o.grid("gbm", x = x, y = y1,
                       grid_id = "gbm_grid31",
                       training_frame = train,
                       validation_frame = valid,
                       ntrees = 1000,
                       seed = 1,
                       hyper_params = gbm_params31,
                       search_criteria = search_criteria31)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |                                                                 |   1%
  |                                                                       
  |=                                                                |   1%
  |                                                                       
  |==                                                               |   3%
  |                                                                       
  |==                                                               |   4%
  |                                                                       
  |===                                                              |   4%
  |                                                                       
  |===                                                              |   5%
  |                                                                       
  |====                                                             |   6%
  |                                                                       
  |====                                                             |   7%
  |                                                                       
  |=====                                                            |   7%
  |                                                                       
  |=====                                                            |   8%
  |                                                                       
  |======                                                           |   9%
  |                                                                       
  |=======                                                          |  10%
  |                                                                       
  |=======                                                          |  11%
  |                                                                       
  |=======                                                          |  12%
  |                                                                       
  |========                                                         |  12%
  |                                                                       
  |=========                                                        |  13%
  |                                                                       
  |=========                                                        |  14%
  |                                                                       
  |==========                                                       |  15%
  |                                                                       
  |==========                                                       |  16%
  |                                                                       
  |===========                                                      |  17%
  |                                                                       
  |============                                                     |  18%
  |                                                                       
  |============                                                     |  19%
  |                                                                       
  |=============                                                    |  19%
  |                                                                       
  |=============                                                    |  20%
  |                                                                       
  |==============                                                   |  22%
  |                                                                       
  |===============                                                  |  22%
  |                                                                       
  |===============                                                  |  23%
  |                                                                       
  |================                                                 |  24%
  |                                                                       
  |================                                                 |  25%
  |                                                                       
  |=================                                                |  26%
  |                                                                       
  |==================                                               |  28%
  |                                                                       
  |===================                                              |  29%
  |                                                                       
  |===================                                              |  30%
  |                                                                       
  |====================                                             |  30%
  |                                                                       
  |====================                                             |  31%
  |                                                                       
  |=====================                                            |  32%
  |                                                                       
  |=====================                                            |  33%
  |                                                                       
  |======================                                           |  33%
  |                                                                       
  |======================                                           |  34%
  |                                                                       
  |=======================                                          |  35%
  |                                                                       
  |=======================                                          |  36%
  |                                                                       
  |========================                                         |  36%
  |                                                                       
  |========================                                         |  37%
  |                                                                       
  |=========================                                        |  38%
  |                                                                       
  |=========================                                        |  39%
  |                                                                       
  |==========================                                       |  39%
  |                                                                       
  |==========================                                       |  40%
  |                                                                       
  |===========================                                      |  41%
  |                                                                       
  |===========================                                      |  42%
  |                                                                       
  |============================                                     |  42%
  |                                                                       
  |============================                                     |  43%
  |                                                                       
  |=============================                                    |  44%
  |                                                                       
  |=============================                                    |  45%
  |                                                                       
  |==============================                                   |  46%
  |                                                                       
  |===============================                                  |  47%
  |                                                                       
  |===============================                                  |  48%
  |                                                                       
  |================================                                 |  49%
  |                                                                       
  |=================================                                |  50%
  |                                                                       
  |=================================                                |  51%
  |                                                                       
  |==================================                               |  53%
  |                                                                       
  |===================================                              |  53%
  |                                                                       
  |===================================                              |  54%
  |                                                                       
  |====================================                             |  55%
  |                                                                       
  |====================================                             |  56%
  |                                                                       
  |=====================================                            |  56%
  |                                                                       
  |=====================================                            |  57%
  |                                                                       
  |======================================                           |  58%
  |                                                                       
  |======================================                           |  59%
  |                                                                       
  |=======================================                          |  59%
  |                                                                       
  |=======================================                          |  60%
  |                                                                       
  |========================================                         |  61%
  |                                                                       
  |========================================                         |  62%
  |                                                                       
  |=========================================                        |  63%
  |                                                                       
  |==========================================                       |  64%
  |                                                                       
  |==========================================                       |  65%
  |                                                                       
  |===========================================                      |  66%
  |                                                                       
  |===========================================                      |  67%
  |                                                                       
  |============================================                     |  67%
  |                                                                       
  |============================================                     |  68%
  |                                                                       
  |=============================================                    |  70%
  |                                                                       
  |==============================================                   |  70%
  |                                                                       
  |==============================================                   |  71%
  |                                                                       
  |===============================================                  |  72%
  |                                                                       
  |===============================================                  |  73%
  |                                                                       
  |================================================                 |  74%
  |                                                                       
  |=================================================                |  75%
  |                                                                       
  |=================================================                |  76%
  |                                                                       
  |==================================================               |  76%
  |                                                                       
  |==================================================               |  77%
  |                                                                       
  |===================================================              |  78%
  |                                                                       
  |===================================================              |  79%
  |                                                                       
  |====================================================             |  80%
  |                                                                       
  |====================================================             |  81%
  |                                                                       
  |=====================================================            |  81%
  |                                                                       
  |======================================================           |  82%
  |                                                                       
  |======================================================           |  84%
  |                                                                       
  |=======================================================          |  84%
  |                                                                       
  |=======================================================          |  85%
  |                                                                       
  |========================================================         |  86%
  |                                                                       
  |========================================================         |  87%
  |                                                                       
  |=========================================================        |  87%
  |                                                                       
  |=========================================================        |  88%
  |                                                                       
  |==========================================================       |  88%
  |                                                                       
  |==========================================================       |  89%
  |                                                                       
  |==========================================================       |  90%
  |                                                                       
  |===========================================================      |  90%
  |                                                                       
  |============================================================     |  92%
  |                                                                       
  |============================================================     |  93%
  |                                                                       
  |=============================================================    |  94%
  |                                                                       
  |==============================================================   |  95%
  |                                                                       
  |==============================================================   |  96%
  |                                                                       
  |===============================================================  |  96%
  |                                                                       
  |===============================================================  |  97%
  |                                                                       
  |===============================================================  |  98%
  |                                                                       
  |================================================================ |  98%
  |                                                                       
  |================================================================ |  99%
  |                                                                       
  |=================================================================|  99%
  |                                                                       
  |=================================================================| 100%
# Fetch grid "gbm_grid31" with its models ordered by decreasing AUC,
# then print the ranked hyper-parameter summary.
gbm_gridperf31 <- h2o.getGrid(
  grid_id = "gbm_grid31",
  decreasing = TRUE,
  sort_by = "auc"
)
print(gbm_gridperf31)
## H2O Grid Details
## ================
## 
## Grid ID: gbm_grid31 
## Used hyper parameters: 
##   -  col_sample_rate 
##   -  learn_rate 
##   -  max_depth 
##   -  sample_rate 
## Number of models: 36 
## Number of failed models: 0 
## 
## Hyper-Parameter Search Summary: ordered by decreasing auc
##   col_sample_rate learn_rate max_depth sample_rate           model_ids
## 1           0.585     0.0436         3       0.785  gbm_grid31_model_5
## 2            0.64     0.0404         3       0.775 gbm_grid31_model_20
## 3            0.53     0.0499         2        0.61 gbm_grid31_model_29
## 4            0.53     0.0345         3        0.69 gbm_grid31_model_32
## 5           0.625     0.0367         2        0.78  gbm_grid31_model_0
##                  auc
## 1 0.9524198283333799
## 2  0.950801017697878
## 3 0.9500852741353761
## 4 0.9498755836385494
## 5 0.9495037324908435
## 
## ---
##    col_sample_rate learn_rate max_depth sample_rate           model_ids
## 31           0.625     0.0365        10       0.685 gbm_grid31_model_13
## 32           0.615      0.036         9       0.685 gbm_grid31_model_31
## 33           0.715     0.0352        10        0.89  gbm_grid31_model_6
## 34            0.69     0.0353         9        0.64 gbm_grid31_model_10
## 35             0.5     0.0285         8       0.785 gbm_grid31_model_21
## 36           0.645     0.0372         9        0.85 gbm_grid31_model_22
##                   auc
## 31  0.942597925462018
## 32 0.9424497441775939
## 33 0.9421953197081108
## 34 0.9421869320882377
## 35 0.9419408952386278
## 36 0.9415187183716834
# Take the first model id from the grid results, which were requested sorted
# by decreasing AUC, and fetch the corresponding model object.
best_gbm_model_id <- gbm_gridperf31@model_ids[[1]]
best_gbm <- h2o.getModel(best_gbm_model_id)

# Now let's evaluate the model performance on a test set
# so we get an honest estimate of top model performance
best_gbm_perf <- h2o.performance(model = best_gbm, 
                                 newdata = test)
h2o.auc(best_gbm_perf) # test-set AUC of the top-ranked grid model
## [1] 0.9476657
# Confusion matrix computed from the same test-set performance object
h2o.confusionMatrix(best_gbm_perf)
## Confusion Matrix (vertical: actual; across: predicted)  for max f1 @ threshold = 0.496636246222322:
##         N0 Yes    Error      Rate
## N0     779  32 0.039457   =32/811
## Yes     56 155 0.265403   =56/211
## Totals 835 187 0.086106  =88/1022

RAIN TOMORROW

Basic Binomial Generalized Linear Model

# Rebind the response name: every model call below passes `y = y1`, so from
# here on they all use whatever `y2` holds.
y1 <- y2

# First GLM: binomial family, trained on `train`, no other options set.
glm_fit10 <- h2o.glm(
  x = x,
  y = y1,
  training_frame = train,
  family = "binomial", # similar to R's glm, h2o.glm has the family argument
  model_id = "glm_fit10"
)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=================================================================| 100%
# Second GLM: same predictors and response, now with automatic tuning.
# A regularized GLM needs the right amount of regularization to avoid
# overfitting, and the `lambda` parameter controls that amount. Setting
# `lambda_search = TRUE` and passing a validation frame lets H2O find the
# optimal `lambda` automatically; the validation frame is used to evaluate
# model performance for each particular value of lambda.
glm_fit20 <- h2o.glm(
  x = x,
  y = y1,
  training_frame = train,
  validation_frame = valid,
  family = "binomial",
  lambda_search = TRUE,
  model_id = "glm_fit20"
)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |============                                                     |  19%
  |                                                                       
  |=================================================================| 100%
# Score both GLMs on the test frame so they can be compared
glm_perf10 <- h2o.performance(glm_fit10, newdata = test)
glm_perf20 <- h2o.performance(glm_fit20, newdata = test)

# Show the full metrics object for the first GLM
glm_perf10
## H2OBinomialMetrics: glm
## 
## MSE:  0.1359599
## RMSE:  0.3687274
## LogLoss:  0.4285203
## Mean Per-Class Error:  0.26428
## AUC:  0.7990735
## Gini:  0.5981469
## R^2:  0.2228153
## Residual Deviance:  875.8955
## AIC:  961.8955
## 
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
##         N0 Yes    Error       Rate
## N0     640 151 0.190898   =151/791
## Yes     78 153 0.337662    =78/231
## Totals 718 304 0.224070  =229/1022
## 
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold    value idx
## 1                       max f1  0.246202 0.571963 196
## 2                       max f2  0.126803 0.675583 280
## 3                 max f0point5  0.410343 0.584795 115
## 4                 max accuracy  0.448265 0.817025 103
## 5                max precision  0.998373 1.000000   0
## 6                   max recall  0.014665 1.000000 392
## 7              max specificity  0.998373 1.000000   0
## 8             max absolute_mcc  0.309329 0.436817 161
## 9   max min_per_class_accuracy  0.197032 0.722944 230
## 10 max mean_per_class_accuracy  0.246202 0.735720 196
## 
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
# Show the full test-set metrics object for the lambda-search GLM (glm_fit20)
glm_perf20
## H2OBinomialMetrics: glm
## 
## MSE:  0.1349278
## RMSE:  0.3673252
## LogLoss:  0.4249275
## Mean Per-Class Error:  0.2618473
## AUC:  0.8038157
## Gini:  0.6076313
## R^2:  0.2287153
## Residual Deviance:  868.5518
## AIC:  918.5518
## 
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
##         N0 Yes    Error       Rate
## N0     637 154 0.194690   =154/791
## Yes     76 155 0.329004    =76/231
## Totals 713 309 0.225049  =230/1022
## 
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold    value idx
## 1                       max f1  0.241526 0.574074 190
## 2                       max f2  0.118447 0.680159 283
## 3                 max f0point5  0.356828 0.601704 125
## 4                 max accuracy  0.358210 0.821918 124
## 5                max precision  0.992446 1.000000   0
## 6                   max recall  0.024746 1.000000 388
## 7              max specificity  0.992446 1.000000   0
## 8             max absolute_mcc  0.356828 0.451257 125
## 9   max min_per_class_accuracy  0.191526 0.724399 223
## 10 max mean_per_class_accuracy  0.223121 0.739130 201
## 
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
# Instead of printing the entire model performance metrics object, 
# it is probably easier to print just the metric that you are interested in comparing.
# Retrieve test set AUC
h2o.auc(glm_perf10)  
## [1] 0.7990735
h2o.auc(glm_perf20)  
## [1] 0.8038157
# Compare test AUC to the training AUC and validation AUC:
# training AUC is read from glm_fit10, validation AUC from glm_fit20
# (the model that was given a validation frame).
h2o.auc(glm_fit10, train = TRUE)  
## [1] 0.8101513
h2o.auc(glm_fit20, valid = TRUE)  
## [1] 0.7799993
# Full validation metrics, if needed: glm_fit20@model$validation_metrics
# Test-set confusion matrices for the two GLMs
h2o.confusionMatrix(glm_perf10)
## Confusion Matrix (vertical: actual; across: predicted)  for max f1 @ threshold = 0.246201735705228:
##         N0 Yes    Error       Rate
## N0     640 151 0.190898   =151/791
## Yes     78 153 0.337662    =78/231
## Totals 718 304 0.224070  =229/1022
h2o.confusionMatrix(glm_perf20)
## Confusion Matrix (vertical: actual; across: predicted)  for max f1 @ threshold = 0.241526093530235:
##         N0 Yes    Error       Rate
## N0     637 154 0.194690   =154/791
## Yes     76 155 0.329004    =76/231
## Totals 713 309 0.225049  =230/1022

RANDOM FOREST

# First random forest: trained on `train` with a fixed seed and no other
# options set.
rf_fit10 <- h2o.randomForest(
  x = x,
  y = y1,
  training_frame = train,
  seed = 1,
  model_id = "rf_fit10"
)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |======                                                           |  10%
  |                                                                       
  |=================================================================| 100%
# Second random forest: a very large tree budget combined with the
# early-stopping settings grouped at the end of the call.
rf_fit20 <- h2o.randomForest(
  x = x,
  y = y1,
  training_frame = train,
  validation_frame = valid,     # only used if stopping_rounds > 0
  model_id = "rf_fit20",
  seed = 1,
  ntrees = 100000,
  # The four settings below are all used for early stopping
  score_tree_interval = 5,
  stopping_rounds = 3,
  stopping_metric = "AUC",
  stopping_tolerance = 0.0005
)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=================================================================| 100%
# Score both random forests on the test frame so they can be compared
rf_perf10 <- h2o.performance(rf_fit10, newdata = test)
rf_perf20 <- h2o.performance(rf_fit20, newdata = test)

# Print the test-set AUC of each random forest (rf_fit10 first, then rf_fit20)

h2o.auc(rf_perf10)
## [1] 0.7925252
h2o.auc(rf_perf20) 
## [1] 0.7900871
# Cross-validated random forest: same settings as rf_fit10 plus 5 folds.
rf_fit30 <- h2o.randomForest(
  x = x,
  y = y1,
  training_frame = train,
  model_id = "rf_fit30",
  nfolds = 5,
  seed = 1
)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |====================================================             |  79%
  |                                                                       
  |=================================================================| 100%
# Score the cross-validated random forest on the `test` frame.
rf_perf30 <- h2o.performance(model = rf_fit30, newdata = test)

# Print model performance: AUC of rf_fit30 from the performance object built
# on the `test` frame.

h2o.auc(rf_perf30)
## [1] 0.7925252
# To evaluate the cross-validated AUC, do the following: query the fitted
# model itself (not the test-set performance object) with `xval = TRUE`.
h2o.auc(rf_fit30, xval = TRUE) 
## [1] 0.7838314
# Confusion matrix for rf_fit10 from its `test`-frame performance object. As
# the printed header states, it is reported at the max-F1 threshold.
h2o.confusionMatrix(rf_perf10)
## Confusion Matrix (vertical: actual; across: predicted)  for max f1 @ threshold = 0.35:
##         N0 Yes    Error       Rate
## N0     701  90 0.113780    =90/791
## Yes    101 130 0.437229   =101/231
## Totals 802 220 0.186888  =191/1022
# Confusion matrix for the early-stopped forest (rf_fit20) on the `test` frame.
h2o.confusionMatrix(rf_perf20)
## Confusion Matrix (vertical: actual; across: predicted)  for max f1 @ threshold = 0.328571428571429:
##         N0 Yes    Error       Rate
## N0     683 108 0.136536   =108/791
## Yes     94 137 0.406926    =94/231
## Totals 777 245 0.197652  =202/1022
# Confusion matrix for the cross-validated forest (rf_fit30) on the `test`
# frame.
h2o.confusionMatrix(rf_perf30)
## Confusion Matrix (vertical: actual; across: predicted)  for max f1 @ threshold = 0.35:
##         N0 Yes    Error       Rate
## N0     701  90 0.113780    =90/791
## Yes    101 130 0.437229   =101/231
## Totals 802 220 0.186888  =191/1022

Gradient Boosting Machine

# Baseline GBM: every tuning parameter is left at the H2O default; only the
# seed is pinned.
gbm_fit10 <- h2o.gbm(
  x = x,
  y = y1,
  training_frame = train,
  model_id = "gbm_fit10",
  seed = 1
)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |==========                                                       |  16%
  |                                                                       
  |=================================================================| 100%
# GBM capped at 500 trees with AUC-based early stopping enabled, scored
# against the `valid` frame.
gbm_fit20 <- h2o.gbm(
  x = x,
  y = y1,
  training_frame = train,
  validation_frame = valid,
  model_id = "gbm_fit20",
  seed = 1,
  ntrees = 500,                # upper bound on the number of trees
  # Early-stopping controls
  score_tree_interval = 5,     # score after every 5 trees
  stopping_rounds = 3,
  stopping_metric = "AUC",
  stopping_tolerance = 0.0005
)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=                                                                |   2%
  |                                                                       
  |=================================================================| 100%
# Same configuration as gbm_fit20 except for the tree cap, which is raised
# from 500 to 50000; early stopping settings are unchanged.
gbm_fit30 <- h2o.gbm(
  x = x,
  y = y1,
  training_frame = train,
  validation_frame = valid,
  model_id = "gbm_fit30",
  seed = 1,
  ntrees = 50000,              # upper bound on the number of trees
  # Early-stopping controls
  score_tree_interval = 5,     # score after every 5 trees
  stopping_rounds = 3,
  stopping_metric = "AUC",
  stopping_tolerance = 0.0005
)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=================================================================| 100%
# Score the three GBMs on the `test` frame.
gbm_perf10 <- h2o.performance(model = gbm_fit10, newdata = test)
gbm_perf20 <- h2o.performance(model = gbm_fit20, newdata = test)
gbm_perf30 <- h2o.performance(model = gbm_fit30, newdata = test)

# AUC of each GBM, taken from the performance objects built on the `test`
# frame (default, 500-tree cap, 50000-tree cap).
h2o.auc(gbm_perf10)  
## [1] 0.7963589
h2o.auc(gbm_perf20) 
## [1] 0.7972182
h2o.auc(gbm_perf30) 
## [1] 0.7972182
# Scoring history of gbm_fit20: one row per scoring checkpoint with training
# and validation metrics.
h2o.scoreHistory(gbm_fit20)
## Scoring History: 
##              timestamp   duration number_of_trees training_rmse
## 1  2018-01-28 21:08:02  0.000 sec               0       0.41096
## 2  2018-01-28 21:08:02  0.015 sec               5       0.37158
## 3  2018-01-28 21:08:02  0.046 sec              10       0.35093
## 4  2018-01-28 21:08:02  0.078 sec              15       0.33775
## 5  2018-01-28 21:08:02  0.093 sec              20       0.32816
## 6  2018-01-28 21:08:02  0.124 sec              25       0.32067
## 7  2018-01-28 21:08:02  0.156 sec              30       0.31465
## 8  2018-01-28 21:08:02  0.171 sec              35       0.31121
## 9  2018-01-28 21:08:02  0.203 sec              40       0.30703
## 10 2018-01-28 21:08:02  0.234 sec              45       0.30324
## 11 2018-01-28 21:08:02  0.265 sec              50       0.29859
## 12 2018-01-28 21:08:02  0.296 sec              55       0.29613
## 13 2018-01-28 21:08:02  0.312 sec              60       0.29363
##    training_logloss training_auc training_lift
## 1           0.52076      0.50000       1.00000
## 2           0.43782      0.85058       4.36813
## 3           0.39786      0.86818       4.45332
## 4           0.37192      0.88191       4.64694
## 5           0.35322      0.89190       4.64694
## 6           0.33779      0.90331       4.64694
## 7           0.32618      0.91141       4.64694
## 8           0.31933      0.91535       4.64694
## 9           0.31161      0.92052       4.64694
## 10          0.30461      0.92575       4.64694
## 11          0.29689      0.93082       4.64694
## 12          0.29253      0.93309       4.64694
## 13          0.28826      0.93563       4.64694
##    training_classification_error validation_rmse validation_logloss
## 1                        0.78480         0.42711            0.55150
## 2                        0.18284         0.39843            0.48816
## 3                        0.17178         0.38800            0.46590
## 4                        0.16427         0.38517            0.45884
## 5                        0.15508         0.38292            0.45369
## 6                        0.15362         0.38299            0.45288
## 7                        0.13525         0.38296            0.45279
## 8                        0.13066         0.38296            0.45271
## 9                        0.12252         0.38309            0.45297
## 10                       0.12106         0.38425            0.45511
## 11                       0.12711         0.38380            0.45496
## 12                       0.11396         0.38363            0.45491
## 13                       0.12336         0.38409            0.45626
##    validation_auc validation_lift validation_classification_error
## 1         0.50000         1.00000                         0.76104
## 2         0.78273         4.18487                         0.29819
## 3         0.78540         4.18487                         0.26606
## 4         0.78459         3.76639                         0.28213
## 5         0.78665         4.18487                         0.26205
## 6         0.78797         4.18487                         0.26908
## 7         0.78821         4.18487                         0.23996
## 8         0.78910         4.18487                         0.25803
## 9         0.79017         3.76639                         0.23795
## 10        0.78935         3.34790                         0.23594
## 11        0.78948         3.76639                         0.27610
## 12        0.79032         3.34790                         0.24096
## 13        0.78976         3.76639                         0.27108
# Scoring history of gbm_fit30. In the recorded output it ends at 60 trees,
# exactly like gbm_fit20, even though its tree cap is 50000 rather than 500.
h2o.scoreHistory(gbm_fit30)
## Scoring History: 
##              timestamp   duration number_of_trees training_rmse
## 1  2018-01-28 21:08:03  0.000 sec               0       0.41096
## 2  2018-01-28 21:08:03  0.015 sec               5       0.37158
## 3  2018-01-28 21:08:03  0.049 sec              10       0.35093
## 4  2018-01-28 21:08:03  0.075 sec              15       0.33775
## 5  2018-01-28 21:08:03  0.090 sec              20       0.32816
## 6  2018-01-28 21:08:03  0.122 sec              25       0.32067
## 7  2018-01-28 21:08:03  0.153 sec              30       0.31465
## 8  2018-01-28 21:08:03  0.168 sec              35       0.31121
## 9  2018-01-28 21:08:03  0.200 sec              40       0.30703
## 10 2018-01-28 21:08:03  0.231 sec              45       0.30324
## 11 2018-01-28 21:08:03  0.257 sec              50       0.29859
## 12 2018-01-28 21:08:03  0.274 sec              55       0.29613
## 13 2018-01-28 21:08:03  0.290 sec              60       0.29363
##    training_logloss training_auc training_lift
## 1           0.52076      0.50000       1.00000
## 2           0.43782      0.85058       4.36813
## 3           0.39786      0.86818       4.45332
## 4           0.37192      0.88191       4.64694
## 5           0.35322      0.89190       4.64694
## 6           0.33779      0.90331       4.64694
## 7           0.32618      0.91141       4.64694
## 8           0.31933      0.91535       4.64694
## 9           0.31161      0.92052       4.64694
## 10          0.30461      0.92575       4.64694
## 11          0.29689      0.93082       4.64694
## 12          0.29253      0.93309       4.64694
## 13          0.28826      0.93563       4.64694
##    training_classification_error validation_rmse validation_logloss
## 1                        0.78480         0.42711            0.55150
## 2                        0.18284         0.39843            0.48816
## 3                        0.17178         0.38800            0.46590
## 4                        0.16427         0.38517            0.45884
## 5                        0.15508         0.38292            0.45369
## 6                        0.15362         0.38299            0.45288
## 7                        0.13525         0.38296            0.45279
## 8                        0.13066         0.38296            0.45271
## 9                        0.12252         0.38309            0.45297
## 10                       0.12106         0.38425            0.45511
## 11                       0.12711         0.38380            0.45496
## 12                       0.11396         0.38363            0.45491
## 13                       0.12336         0.38409            0.45626
##    validation_auc validation_lift validation_classification_error
## 1         0.50000         1.00000                         0.76104
## 2         0.78273         4.18487                         0.29819
## 3         0.78540         4.18487                         0.26606
## 4         0.78459         3.76639                         0.28213
## 5         0.78665         4.18487                         0.26205
## 6         0.78797         4.18487                         0.26908
## 7         0.78821         4.18487                         0.23996
## 8         0.78910         4.18487                         0.25803
## 9         0.79017         3.76639                         0.23795
## 10        0.78935         3.34790                         0.23594
## 11        0.78948         3.76639                         0.27610
## 12        0.79032         3.34790                         0.24096
## 13        0.78976         3.76639                         0.27108
# Learning curve of gbm_fit20: AUC against the number of trees.
plot(gbm_fit20, timestep = "number_of_trees", metric = "AUC")

# Learning curve of gbm_fit30: logloss against the number of trees.
plot(gbm_fit30, timestep = "number_of_trees", metric = "logloss")

# Deliberately tiny GBM: only 5 trees, no validation frame, no early stopping
# arguments.
gbm_fit40 <- h2o.gbm(
  x = x,
  y = y1,
  training_frame = train,
  model_id = "gbm_fit40",
  seed = 1,
  ntrees = 5
)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=================================================================| 100%
# Score the 5-tree GBM on the `test` frame.
gbm_perf40 <- h2o.performance(model = gbm_fit40, newdata = test)

# AUC of the 5-tree GBM from its `test`-frame performance object.
h2o.auc(gbm_perf40) 
## [1] 0.7773026
# Confusion matrices for the four GBMs follow, starting with the default
# model; each is reported at the max-F1 threshold named in its printed header.
h2o.confusionMatrix(gbm_perf10)
## Confusion Matrix (vertical: actual; across: predicted)  for max f1 @ threshold = 0.303536886664258:
##         N0 Yes    Error       Rate
## N0     688 103 0.130215   =103/791
## Yes     97 134 0.419913    =97/231
## Totals 785 237 0.195695  =200/1022
# Confusion matrix for gbm_fit20 (500-tree cap, early stopping) on `test`.
h2o.confusionMatrix(gbm_perf20)
## Confusion Matrix (vertical: actual; across: predicted)  for max f1 @ threshold = 0.279072940794731:
##         N0 Yes    Error       Rate
## N0     671 120 0.151707   =120/791
## Yes     92 139 0.398268    =92/231
## Totals 763 259 0.207436  =212/1022
# Confusion matrix for gbm_fit30 (50000-tree cap, early stopping) on `test`.
h2o.confusionMatrix(gbm_perf30)
## Confusion Matrix (vertical: actual; across: predicted)  for max f1 @ threshold = 0.279072940794731:
##         N0 Yes    Error       Rate
## N0     671 120 0.151707   =120/791
## Yes     92 139 0.398268    =92/231
## Totals 763 259 0.207436  =212/1022
# Confusion matrix for gbm_fit40 (5 trees) on `test`.
h2o.confusionMatrix(gbm_perf40)
## Confusion Matrix (vertical: actual; across: predicted)  for max f1 @ threshold = 0.210171705078586:
##         N0 Yes    Error       Rate
## N0     600 191 0.241466   =191/791
## Yes     72 159 0.311688    =72/231
## Totals 672 350 0.257339  =263/1022

Deep Learning

# Baseline deep learning model: every tuning parameter is left at the H2O
# default; only the seed is pinned.
# NOTE(review): H2O's documentation says a deep learning seed only gives
# repeatable results when training is single-threaded (`reproducible = TRUE`)
# - confirm whether exact repeatability matters for this analysis.
dl_fit10 <- h2o.deeplearning(
  x = x,
  y = y1,
  training_frame = train,
  model_id = "dl_fit10",
  seed = 1
)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |====================                                             |  30%
  |                                                                       
  |================================                                 |  50%
  |                                                                       
  |====================================================             |  80%
  |                                                                       
  |=================================================================| 100%
# Small network (two hidden layers of 10 units) trained for 20 epochs with
# early stopping switched off. No validation frame is passed because nothing
# here needs one.
dl_fit20 <- h2o.deeplearning(
  x = x,
  y = y1,
  training_frame = train,
  model_id = "dl_fit20",
  seed = 1,
  hidden = c(10, 10),
  epochs = 20,
  stopping_rounds = 0          # 0 disables early stopping
)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=================================================================| 100%
# Same two-layer network as dl_fit20, but with up to 200 epochs and AUC-based
# early stopping scored against the `valid` frame.
# NOTE(review): the tolerance here is 0.005, ten times the 0.0005 used by the
# other early-stopped models in this file - confirm that this is intentional.
dl_fit30 <- h2o.deeplearning(
  x = x,
  y = y1,
  training_frame = train,
  validation_frame = valid,
  model_id = "dl_fit30",
  seed = 1,
  hidden = c(10, 10),
  epochs = 200,                # upper bound on training epochs
  # Early-stopping controls
  score_interval = 1,
  stopping_rounds = 3,
  stopping_metric = "AUC",
  stopping_tolerance = 0.005
)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=============                                                    |  20%
  |                                                                       
  |=================================================                |  75%
  |                                                                       
  |=================================================================| 100%
# Score the first three deep learning models on the `test` frame.
dl_perf10 <- h2o.performance(model = dl_fit10, newdata = test)
dl_perf20 <- h2o.performance(model = dl_fit20, newdata = test)
dl_perf30 <- h2o.performance(model = dl_fit30, newdata = test)

# AUC of each deep learning model from its `test`-frame performance object.
# The inline values are the ones printed by this run.
h2o.auc(dl_perf10)  # 0.7993088
## [1] 0.7993088
h2o.auc(dl_perf20)  # 0.7977928
## [1] 0.7977928
h2o.auc(dl_perf30) # 0.7907575
## [1] 0.7907575
# Learning curve of dl_fit30: AUC against training epochs.
plot(dl_fit30, timestep = "epochs", metric = "AUC")

# Deeper network (seven hidden layers) with up to 200 epochs and AUC-based
# early stopping scored against the `valid` frame.
dl_fit40 <- h2o.deeplearning(
  x = x,
  y = y1,
  training_frame = train,
  validation_frame = valid,
  model_id = "dl_fit40",
  seed = 1,
  hidden = c(32, 64, 32, 128, 32, 64, 32),
  epochs = 200,                # upper bound on training epochs
  # Early-stopping controls
  score_interval = 1,
  stopping_rounds = 3,
  stopping_metric = "AUC",
  stopping_tolerance = 0.0005
)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |======                                                           |  10%
  |                                                                       
  |=============                                                    |  20%
  |                                                                       
  |=================================================================| 100%
# Score the seven-layer network on the `test` frame.
dl_perf40 <- h2o.performance(model = dl_fit40, newdata = test)

# AUC of the seven-layer network from its `test`-frame performance object.
h2o.auc(dl_perf40)
## [1] 0.7963151
# Confusion matrices for the four deep learning models follow, starting with
# the default model; each is reported at the max-F1 threshold named in its
# printed header.
h2o.confusionMatrix(dl_perf10)
## Confusion Matrix (vertical: actual; across: predicted)  for max f1 @ threshold = 0.270812603089717:
##         N0 Yes    Error       Rate
## N0     653 138 0.174463   =138/791
## Yes     85 146 0.367965    =85/231
## Totals 738 284 0.218200  =223/1022
# Confusion matrix for dl_fit20 (20 epochs, no early stopping) on `test`.
h2o.confusionMatrix(dl_perf20)
## Confusion Matrix (vertical: actual; across: predicted)  for max f1 @ threshold = 0.356675378728843:
##         N0 Yes    Error       Rate
## N0     702  89 0.112516    =89/791
## Yes    105 126 0.454545   =105/231
## Totals 807 215 0.189824  =194/1022
# Confusion matrix for dl_fit30 (two-layer network, early stopping) on `test`.
h2o.confusionMatrix(dl_perf30)
## Confusion Matrix (vertical: actual; across: predicted)  for max f1 @ threshold = 0.403758161696188:
##         N0 Yes    Error       Rate
## N0     693  98 0.123894    =98/791
## Yes     98 133 0.424242    =98/231
## Totals 791 231 0.191781  =196/1022
# Confusion matrix for dl_fit40 (seven-layer network) on `test`.
h2o.confusionMatrix(dl_perf40)
## Confusion Matrix (vertical: actual; across: predicted)  for max f1 @ threshold = 0.377268149879096:
##         N0 Yes    Error       Rate
## N0     649 142 0.179520   =142/791
## Yes     80 151 0.346320    =80/231
## Totals 729 293 0.217221  =222/1022

GRID SEARCHES (CARTESIAN AND RANDOM)

# Hyper-parameter values for the exhaustive GBM grid:
# 2 x 3 x 2 x 3 = 36 combinations.
gbm_params10 <- list(
  learn_rate = c(0.01, 0.1),
  max_depth = c(3, 5, 9),
  sample_rate = c(0.8, 1.0),
  col_sample_rate = c(0.2, 0.5, 1.0)
)

# Train one 100-tree GBM per combination in gbm_params10, each with the
# `valid` frame attached as validation data.
gbm_grid10 <- h2o.grid(
  "gbm",
  grid_id = "gbm_grid10",
  x = x,
  y = y1,
  training_frame = train,
  validation_frame = valid,
  hyper_params = gbm_params10,
  ntrees = 100,
  seed = 1
)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |==                                                               |   3%
  |                                                                       
  |====                                                             |   6%
  |                                                                       
  |======                                                           |   9%
  |                                                                       
  |========                                                         |  12%
  |                                                                       
  |=========                                                        |  15%
  |                                                                       
  |===========                                                      |  17%
  |                                                                       
  |=============                                                    |  20%
  |                                                                       
  |==============                                                   |  22%
  |                                                                       
  |================                                                 |  24%
  |                                                                       
  |=================                                                |  26%
  |                                                                       
  |===================                                              |  29%
  |                                                                       
  |====================                                             |  31%
  |                                                                       
  |======================                                           |  33%
  |                                                                       
  |=======================                                          |  35%
  |                                                                       
  |========================                                         |  37%
  |                                                                       
  |=========================                                        |  39%
  |                                                                       
  |==========================                                       |  41%
  |                                                                       
  |===========================                                      |  42%
  |                                                                       
  |=============================                                    |  44%
  |                                                                       
  |==============================                                   |  46%
  |                                                                       
  |===============================                                  |  48%
  |                                                                       
  |================================                                 |  50%
  |                                                                       
  |==================================                               |  52%
  |                                                                       
  |===================================                              |  54%
  |                                                                       
  |=====================================                            |  58%
  |                                                                       
  |=======================================                          |  60%
  |                                                                       
  |=========================================                        |  63%
  |                                                                       
  |===========================================                      |  65%
  |                                                                       
  |============================================                     |  68%
  |                                                                       
  |==============================================                   |  70%
  |                                                                       
  |===============================================                  |  72%
  |                                                                       
  |=================================================                |  75%
  |                                                                       
  |==================================================               |  77%
  |                                                                       
  |====================================================             |  79%
  |                                                                       
  |=====================================================            |  81%
  |                                                                       
  |======================================================           |  83%
  |                                                                       
  |========================================================         |  86%
  |                                                                       
  |=========================================================        |  87%
  |                                                                       
  |==========================================================       |  89%
  |                                                                       
  |===========================================================      |  91%
  |                                                                       
  |============================================================     |  92%
  |                                                                       
  |=============================================================    |  94%
  |                                                                       
  |==============================================================   |  96%
  |                                                                       
  |================================================================ |  98%
  |                                                                       
  |=================================================================|  99%
  |                                                                       
  |=================================================================| 100%
# Fetch the finished grid with its models ranked from highest to lowest AUC,
# then print the summary.
gbm_gridperf10 <- h2o.getGrid(
  grid_id = "gbm_grid10",
  sort_by = "auc",
  decreasing = TRUE
)
print(gbm_gridperf10)
## H2O Grid Details
## ================
## 
## Grid ID: gbm_grid10 
## Used hyper parameters: 
##   -  col_sample_rate 
##   -  learn_rate 
##   -  max_depth 
##   -  sample_rate 
## Number of models: 36 
## Number of failed models: 0 
## 
## Hyper-Parameter Search Summary: ordered by decreasing auc
##   col_sample_rate learn_rate max_depth sample_rate           model_ids
## 1             0.2       0.01         9         0.8 gbm_grid10_model_12
## 2             0.2       0.01         9         1.0 gbm_grid10_model_30
## 3             0.2        0.1         5         1.0 gbm_grid10_model_27
## 4             0.5       0.01         9         1.0 gbm_grid10_model_31
## 5             1.0        0.1         9         0.8 gbm_grid10_model_17
##                  auc
## 1 0.7956198310458749
## 2  0.793009024190151
## 3 0.7909164985255316
## 4 0.7895445777255494
## 5 0.7890456974346466
## 
## ---
##    col_sample_rate learn_rate max_depth sample_rate           model_ids
## 31             0.5       0.01         3         1.0 gbm_grid10_model_19
## 32             1.0        0.1         9         1.0 gbm_grid10_model_35
## 33             1.0       0.01         3         1.0 gbm_grid10_model_20
## 34             0.5        0.1         9         0.8 gbm_grid10_model_16
## 35             0.2       0.01         3         1.0 gbm_grid10_model_18
## 36             1.0       0.01         9         1.0 gbm_grid10_model_32
##                   auc
## 31 0.7789295137580098
## 32 0.7788020221281124
## 33 0.7786052415689231
## 34 0.7776019378727745
## 35  0.776806500964502
## 36 0.7628655683909447
# Random grid search: a much finer hyper-parameter space than gbm_params10,
# from which only a limited number of combinations is sampled.
gbm_params20 <- list(
  learn_rate = seq(from = 0.001, to = 0.1, by = 0.001),
  max_depth = seq(from = 2, to = 10, by = 1),
  sample_rate = seq(from = 0.3, to = 1.0, by = 0.05),
  col_sample_rate = seq(from = 0.1, to = 1.0, by = 0.05)
)

# Sample combinations at random and stop after 36 models, the same model
# count as the exhaustive grid gbm_grid10.
search_criteria20 <- list(
  strategy = "RandomDiscrete",
  max_models = 36
)

# Train and validate a randomly sampled grid of 100-tree GBMs: combinations
# come from gbm_params20 and the search is bounded by search_criteria20.
gbm_grid20 <- h2o.grid(
  "gbm",
  grid_id = "gbm_grid20",
  x = x,
  y = y1,
  training_frame = train,
  validation_frame = valid,
  hyper_params = gbm_params20,
  search_criteria = search_criteria20,
  ntrees = 100,
  seed = 1
)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |==                                                               |   3%
  |                                                                       
  |===                                                              |   5%
  |                                                                       
  |=====                                                            |   7%
  |                                                                       
  |======                                                           |  10%
  |                                                                       
  |========                                                         |  12%
  |                                                                       
  |=========                                                        |  15%
  |                                                                       
  |===========                                                      |  16%
  |                                                                       
  |============                                                     |  19%
  |                                                                       
  |=============                                                    |  21%
  |                                                                       
  |===============                                                  |  23%
  |                                                                       
  |=================                                                |  26%
  |                                                                       
  |==================                                               |  28%
  |                                                                       
  |===================                                              |  30%
  |                                                                       
  |=====================                                            |  32%
  |                                                                       
  |======================                                           |  34%
  |                                                                       
  |=======================                                          |  35%
  |                                                                       
  |========================                                         |  37%
  |                                                                       
  |=========================                                        |  39%
  |                                                                       
  |===========================                                      |  42%
  |                                                                       
  |============================                                     |  44%
  |                                                                       
  |==============================                                   |  46%
  |                                                                       
  |===============================                                  |  48%
  |                                                                       
  |=================================                                |  50%
  |                                                                       
  |==================================                               |  52%
  |                                                                       
  |===================================                              |  54%
  |                                                                       
  |=====================================                            |  56%
  |                                                                       
  |======================================                           |  58%
  |                                                                       
  |=======================================                          |  61%
  |                                                                       
  |=========================================                        |  63%
  |                                                                       
  |==========================================                       |  64%
  |                                                                       
  |============================================                     |  67%
  |                                                                       
  |=============================================                    |  69%
  |                                                                       
  |==============================================                   |  71%
  |                                                                       
  |================================================                 |  74%
  |                                                                       
  |=================================================                |  75%
  |                                                                       
  |==================================================               |  77%
  |                                                                       
  |===================================================              |  79%
  |                                                                       
  |=====================================================            |  81%
  |                                                                       
  |======================================================           |  83%
  |                                                                       
  |========================================================         |  86%
  |                                                                       
  |=========================================================        |  88%
  |                                                                       
  |==========================================================       |  90%
  |                                                                       
  |============================================================     |  92%
  |                                                                       
  |=============================================================    |  94%
  |                                                                       
  |==============================================================   |  96%
  |                                                                       
  |================================================================ |  98%
  |                                                                       
  |=================================================================| 100%
# Fetch the "gbm_grid20" search results with the models ranked best-first by AUC,
# then print the ranking.
gbm_gridperf20 <- h2o.getGrid(
  grid_id = "gbm_grid20",
  sort_by = "auc",
  decreasing = TRUE
)
print(gbm_gridperf20)
## H2O Grid Details
## ================
## 
## Grid ID: gbm_grid20 
## Used hyper parameters: 
##   -  col_sample_rate 
##   -  learn_rate 
##   -  max_depth 
##   -  sample_rate 
## Number of models: 36 
## Number of failed models: 0 
## 
## Hyper-Parameter Search Summary: ordered by decreasing auc
##   col_sample_rate learn_rate max_depth sample_rate           model_ids
## 1             0.4      0.092         8        0.65 gbm_grid20_model_15
## 2            0.25      0.027         7        0.95  gbm_grid20_model_4
## 3             1.0      0.043         6        0.55 gbm_grid20_model_28
## 4            0.25      0.008         7         0.9  gbm_grid20_model_2
## 5            0.45       0.02        10         1.0 gbm_grid20_model_10
##                  auc
## 1 0.7939125518281192
## 2 0.7936658832398394
## 3 0.7928926187889405
## 4 0.7923909669408661
## 5 0.7922080441675351
## 
## ---
##    col_sample_rate learn_rate max_depth sample_rate           model_ids
## 31             0.1      0.026         2         0.8 gbm_grid20_model_26
## 32            0.75      0.064         7        0.45 gbm_grid20_model_34
## 33            0.95      0.019         8         1.0 gbm_grid20_model_12
## 34            0.35      0.005         3         1.0 gbm_grid20_model_23
## 35            0.65      0.088         9         0.5  gbm_grid20_model_6
## 36             0.8      0.098         6         0.4 gbm_grid20_model_32
##                   auc
## 31  0.777762688188732
## 32 0.7777322010598434
## 33  0.776640207534201
## 34 0.7765432030331921
## 35 0.7752405711625019
## 36 0.7729429502671782
# UPDATE: hyper-parameter space for the "gbm_grid30" random search.
gbm_params30 <- list(
  learn_rate      = seq(0.02, 0.05, by = 0.0001),
  max_depth       = seq(2, 10, by = 1),
  sample_rate     = seq(0.6, 0.9, by = 0.005),
  col_sample_rate = seq(0.5, 0.8, by = 0.005)
)

# Draw combinations at random from the space above, capped at 36 models.
search_criteria30 <- list(strategy = "RandomDiscrete", max_models = 36)

# Train and validate a grid of GBMs
gbm_grid30 <- h2o.grid(
  algorithm = "gbm",
  grid_id = "gbm_grid30",
  x = x,
  y = y1,
  training_frame = train,
  validation_frame = valid,
  ntrees = 1000,
  seed = 1,
  hyper_params = gbm_params30,
  search_criteria = search_criteria30
)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |                                                                 |   1%
  |                                                                       
  |=                                                                |   1%
  |                                                                       
  |=                                                                |   2%
  |                                                                       
  |==                                                               |   2%
  |                                                                       
  |==                                                               |   3%
  |                                                                       
  |==                                                               |   4%
  |                                                                       
  |===                                                              |   4%
  |                                                                       
  |===                                                              |   5%
  |                                                                       
  |====                                                             |   6%
  |                                                                       
  |====                                                             |   7%
  |                                                                       
  |=====                                                            |   7%
  |                                                                       
  |=====                                                            |   8%
  |                                                                       
  |======                                                           |   9%
  |                                                                       
  |======                                                           |  10%
  |                                                                       
  |=======                                                          |  11%
  |                                                                       
  |========                                                         |  12%
  |                                                                       
  |=========                                                        |  13%
  |                                                                       
  |=========                                                        |  14%
  |                                                                       
  |==========                                                       |  15%
  |                                                                       
  |==========                                                       |  16%
  |                                                                       
  |===========                                                      |  16%
  |                                                                       
  |===========                                                      |  17%
  |                                                                       
  |============                                                     |  18%
  |                                                                       
  |============                                                     |  19%
  |                                                                       
  |=============                                                    |  19%
  |                                                                       
  |=============                                                    |  20%
  |                                                                       
  |==============                                                   |  21%
  |                                                                       
  |==============                                                   |  22%
  |                                                                       
  |===============                                                  |  22%
  |                                                                       
  |===============                                                  |  23%
  |                                                                       
  |===============                                                  |  24%
  |                                                                       
  |================                                                 |  25%
  |                                                                       
  |=================                                                |  25%
  |                                                                       
  |=================                                                |  26%
  |                                                                       
  |==================                                               |  27%
  |                                                                       
  |==================                                               |  28%
  |                                                                       
  |===================                                              |  29%
  |                                                                       
  |====================                                             |  30%
  |                                                                       
  |====================                                             |  31%
  |                                                                       
  |=====================                                            |  32%
  |                                                                       
  |=====================                                            |  33%
  |                                                                       
  |======================                                           |  33%
  |                                                                       
  |======================                                           |  34%
  |                                                                       
  |=======================                                          |  35%
  |                                                                       
  |=======================                                          |  36%
  |                                                                       
  |========================                                         |  36%
  |                                                                       
  |========================                                         |  37%
  |                                                                       
  |=========================                                        |  39%
  |                                                                       
  |==========================                                       |  39%
  |                                                                       
  |==========================                                       |  40%
  |                                                                       
  |===========================                                      |  41%
  |                                                                       
  |===========================                                      |  42%
  |                                                                       
  |============================                                     |  42%
  |                                                                       
  |============================                                     |  43%
  |                                                                       
  |=============================                                    |  44%
  |                                                                       
  |=============================                                    |  45%
  |                                                                       
  |===============================                                  |  47%
  |                                                                       
  |===============================                                  |  48%
  |                                                                       
  |=================================                                |  50%
  |                                                                       
  |=================================                                |  51%
  |                                                                       
  |==================================                               |  52%
  |                                                                       
  |==================================                               |  53%
  |                                                                       
  |===================================                              |  53%
  |                                                                       
  |===================================                              |  54%
  |                                                                       
  |====================================                             |  55%
  |                                                                       
  |====================================                             |  56%
  |                                                                       
  |=====================================                            |  56%
  |                                                                       
  |=====================================                            |  57%
  |                                                                       
  |======================================                           |  58%
  |                                                                       
  |======================================                           |  59%
  |                                                                       
  |=======================================                          |  59%
  |                                                                       
  |========================================                         |  61%
  |                                                                       
  |========================================                         |  62%
  |                                                                       
  |=========================================                        |  63%
  |                                                                       
  |==========================================                       |  64%
  |                                                                       
  |==========================================                       |  65%
  |                                                                       
  |===========================================                      |  67%
  |                                                                       
  |============================================                     |  67%
  |                                                                       
  |============================================                     |  68%
  |                                                                       
  |=============================================                    |  69%
  |                                                                       
  |=============================================                    |  70%
  |                                                                       
  |==============================================                   |  70%
  |                                                                       
  |==============================================                   |  71%
  |                                                                       
  |===============================================                  |  72%
  |                                                                       
  |===============================================                  |  73%
  |                                                                       
  |================================================                 |  73%
  |                                                                       
  |=================================================                |  75%
  |                                                                       
  |=================================================                |  76%
  |                                                                       
  |==================================================               |  77%
  |                                                                       
  |===================================================              |  78%
  |                                                                       
  |===================================================              |  79%
  |                                                                       
  |====================================================             |  81%
  |                                                                       
  |=====================================================            |  81%
  |                                                                       
  |=====================================================            |  82%
  |                                                                       
  |======================================================           |  83%
  |                                                                       
  |======================================================           |  84%
  |                                                                       
  |=======================================================          |  84%
  |                                                                       
  |=======================================================          |  85%
  |                                                                       
  |========================================================         |  86%
  |                                                                       
  |========================================================         |  87%
  |                                                                       
  |=========================================================        |  87%
  |                                                                       
  |=========================================================        |  88%
  |                                                                       
  |==========================================================       |  89%
  |                                                                       
  |==========================================================       |  90%
  |                                                                       
  |============================================================     |  92%
  |                                                                       
  |============================================================     |  93%
  |                                                                       
  |=============================================================    |  95%
  |                                                                       
  |==============================================================   |  95%
  |                                                                       
  |===============================================================  |  96%
  |                                                                       
  |===============================================================  |  97%
  |                                                                       
  |===============================================================  |  98%
  |                                                                       
  |================================================================ |  98%
  |                                                                       
  |================================================================ |  99%
  |                                                                       
  |=================================================================| 100%
# Fetch the "gbm_grid30" search results with the models ranked best-first by AUC,
# then print the ranking.
gbm_gridperf30 <- h2o.getGrid(
  grid_id = "gbm_grid30",
  sort_by = "auc",
  decreasing = TRUE
)
print(gbm_gridperf30)
## H2O Grid Details
## ================
## 
## Grid ID: gbm_grid30 
## Used hyper parameters: 
##   -  col_sample_rate 
##   -  learn_rate 
##   -  max_depth 
##   -  sample_rate 
## Number of models: 36 
## Number of failed models: 0 
## 
## Hyper-Parameter Search Summary: ordered by decreasing auc
##   col_sample_rate learn_rate max_depth sample_rate           model_ids
## 1           0.785     0.0265         3        0.79 gbm_grid30_model_33
## 2           0.665     0.0331         8        0.85  gbm_grid30_model_9
## 3           0.655     0.0274         7        0.75 gbm_grid30_model_19
## 4            0.71     0.0399         9        0.79  gbm_grid30_model_6
## 5           0.555     0.0375         9        0.69 gbm_grid30_model_18
##                  auc
## 1 0.7850103101926786
## 2 0.7845751757167246
## 3 0.7840458082969336
## 4 0.7834471519478504
## 5 0.7832060264739141
## 
## ---
##    col_sample_rate learn_rate max_depth sample_rate           model_ids
## 31            0.54      0.042         6        0.74 gbm_grid30_model_15
## 32           0.765     0.0493         8         0.6 gbm_grid30_model_11
## 33           0.525     0.0489         4       0.885 gbm_grid30_model_21
## 34            0.54     0.0401         5       0.705 gbm_grid30_model_27
## 35           0.785      0.042         5       0.755 gbm_grid30_model_34
## 36            0.77     0.0487         5       0.815  gbm_grid30_model_3
##                   auc
## 31 0.7754290370501762
## 32 0.7746613157136205
## 33 0.7731452739407108
## 34  0.771820469612647
## 35 0.7707229329726614
## 36 0.7675190128821977
# Take the first model id of the gbm_grid30 results; the grid was fetched
# sorted by decreasing AUC, so this is its top-ranked model.
# NOTE(review): in the grid summaries printed earlier, the best gbm_grid20
# model has AUC 0.7939 while the best gbm_grid30 model has 0.7850 -- confirm
# that picking the winner from gbm_grid30 is intended.
best_gbm_model_id <- gbm_gridperf30@model_ids[[1]]
best_gbm <- h2o.getModel(best_gbm_model_id)

# Now let's evaluate the model performance on a test set
# so we get an honest estimate of top model performance
best_gbm_perf <- h2o.performance(model = best_gbm, 
                                 newdata = test)
h2o.auc(best_gbm_perf) # test-set AUC; the value formerly noted here (0.683855910541) was stale -- see the printed result
## [1] 0.7976642
h2o.confusionMatrix(best_gbm_perf)
## Confusion Matrix (vertical: actual; across: predicted)  for max f1 @ threshold = 0.255673344156405:
##         N0 Yes    Error       Rate
## N0     666 125 0.158028   =125/791
## Yes     85 146 0.367965    =85/231
## Totals 751 271 0.205479  =210/1022

RAIN IN 2 DAYS

Basic Binomial Generalized Linear Model

# Point the response name y1 at y3; the models fitted in this section all
# pass y = y1.
y1 <- y3

# Baseline binomial GLM on the training frame
# (similar to R's glm, h2o.glm has the family argument).
glm_fit105 <- h2o.glm(
  x = x,
  y = y1,
  training_frame = train,
  model_id = "glm_fit105",
  family = "binomial"
)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=================================================================| 100%
# Next we do some automatic tuning. A GLM with regularization needs the right
# amount of regularization to avoid overfitting, and the model parameter
# `lambda` controls that amount. Setting `lambda_search = TRUE` and passing a
# validation frame lets h2o find the optimal `lambda` automatically: the
# validation frame is used to evaluate model performance for each candidate
# value of lambda.
glm_fit205 <- h2o.glm(
  x = x,
  y = y1,
  training_frame = train,
  validation_frame = valid,
  model_id = "glm_fit205",
  family = "binomial",
  lambda_search = TRUE
)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |================                                                 |  25%
  |                                                                       
  |=================================================================| 100%
# Score both GLMs on the test frame so their performance can be compared.
glm_perf105 <- h2o.performance(model = glm_fit105, newdata = test)
glm_perf205 <- h2o.performance(model = glm_fit205, newdata = test)

# Print the full performance object of the first GLM (glm_fit105)
glm_perf105
## H2OBinomialMetrics: glm
## 
## MSE:  0.1554299
## RMSE:  0.394246
## LogLoss:  0.4807781
## Mean Per-Class Error:  0.335744
## AUC:  0.7075338
## Gini:  0.4150676
## R^2:  0.09756708
## Residual Deviance:  982.7104
## AIC:  1074.71
## 
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
##         N0 Yes    Error       Rate
## N0     501 295 0.370603   =295/796
## Yes     68 158 0.300885    =68/226
## Totals 569 453 0.355186  =363/1022
## 
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold    value idx
## 1                       max f1  0.199328 0.465390 224
## 2                       max f2  0.093995 0.613884 335
## 3                 max f0point5  0.338541 0.430108 111
## 4                 max accuracy  0.644194 0.786693  11
## 5                max precision  0.906639 1.000000   0
## 6                   max recall  0.032023 1.000000 394
## 7              max specificity  0.906639 1.000000   0
## 8             max absolute_mcc  0.237015 0.277072 192
## 9   max min_per_class_accuracy  0.208790 0.646985 216
## 10 max mean_per_class_accuracy  0.199328 0.664256 224
## 
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
# Print the full performance object of the lambda-search GLM (glm_fit205)
glm_perf205
## H2OBinomialMetrics: glm
## 
## MSE:  0.1551997
## RMSE:  0.3939539
## LogLoss:  0.4795676
## Mean Per-Class Error:  0.3349824
## AUC:  0.7110914
## Gini:  0.4221828
## R^2:  0.09890381
## Residual Deviance:  980.2362
## AIC:  1042.236
## 
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
##         N0 Yes    Error       Rate
## N0     548 248 0.311558   =248/796
## Yes     81 145 0.358407    =81/226
## Totals 629 393 0.321918  =329/1022
## 
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold    value idx
## 1                       max f1  0.214892 0.468498 206
## 2                       max f2  0.113418 0.623522 324
## 3                 max f0point5  0.335966 0.429062 111
## 4                 max accuracy  0.576065 0.786693  15
## 5                max precision  0.903696 1.000000   0
## 6                   max recall  0.035596 1.000000 397
## 7              max specificity  0.903696 1.000000   0
## 8             max absolute_mcc  0.254013 0.284842 172
## 9   max min_per_class_accuracy  0.204259 0.655779 217
## 10 max mean_per_class_accuracy  0.214892 0.665018 206
## 
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
# Instead of printing the entire model performance metrics object, 
# it is probably easier to print just the metric that you are interested in comparing.
# Retrieve test set AUC
h2o.auc(glm_perf105)  
## [1] 0.7075338
h2o.auc(glm_perf205)  
## [1] 0.7110914
# Compare test AUC to the training AUC (taken from glm_fit105) and the
# validation AUC (taken from glm_fit205, the model fitted with a validation frame)
h2o.auc(glm_fit105, train = TRUE)  
## [1] 0.7259714
h2o.auc(glm_fit205, valid = TRUE)  
## [1] 0.7131017
# NOTE(review): the disabled line below refers to `glm_fit2`, a name not used
# in this section (the models here are glm_fit105 / glm_fit205) -- confirm or remove.
#glm_fit2@model$validation_metrics
# Confusion matrices on the test set for both GLMs
h2o.confusionMatrix(glm_perf105)
## Confusion Matrix (vertical: actual; across: predicted)  for max f1 @ threshold = 0.19932800273972:
##         N0 Yes    Error       Rate
## N0     501 295 0.370603   =295/796
## Yes     68 158 0.300885    =68/226
## Totals 569 453 0.355186  =363/1022
h2o.confusionMatrix(glm_perf205)
## Confusion Matrix (vertical: actual; across: predicted)  for max f1 @ threshold = 0.214891693551094:
##         N0 Yes    Error       Rate
## N0     548 248 0.311558   =248/796
## Yes     81 145 0.358407    =81/226
## Totals 629 393 0.321918  =329/1022

RANDOM FOREST

# Baseline random forest: only the predictors, response, training frame and a
# fixed seed are supplied; every other setting is left at its default.
rf_fit105 <- h2o.randomForest(
  x = x,
  y = y1,
  training_frame = train,
  model_id = "rf_fit105",
  seed = 1
)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |========                                                         |  12%
  |                                                                       
  |=================================================================| 100%
# Second random forest: a very large tree budget combined with early stopping
# that is scored against the `valid` frame.
rf_fit205 <- h2o.randomForest(
  x = x,
  y = y1,
  training_frame = train,
  model_id = "rf_fit205",
  validation_frame = valid, # only used if stopping_rounds > 0
  ntrees = 100000,
  # Early-stopping controls
  score_tree_interval = 5,
  stopping_rounds = 3,
  stopping_metric = "AUC",
  stopping_tolerance = 0.0005,
  seed = 1
)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=================================================================| 100%
# Score both random forests on the same `test` frame so they can be compared.
rf_perf105 <- h2o.performance(model = rf_fit105, newdata = test)
rf_perf205 <- h2o.performance(model = rf_fit205, newdata = test)

# Report the test AUC of each forest.
h2o.auc(rf_perf105)
## [1] 0.7107746
h2o.auc(rf_perf205)
## [1] 0.7180287
# Cross-validated random forest: same inputs and seed as rf_fit105, plus
# 5-fold cross-validation.
rf_fit305 <- h2o.randomForest(
  x = x,
  y = y1,
  training_frame = train,
  model_id = "rf_fit305",
  nfolds = 5,
  seed = 1
)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |==============================================                   |  70%
  |                                                                       
  |======================================================           |  83%
  |                                                                       
  |=================================================================| 100%
# Score the cross-validated forest on the `test` frame.
rf_perf305 <- h2o.performance(model = rf_fit305, newdata = test)

# Test AUC of the cross-validated forest.
h2o.auc(rf_perf305)
## [1] 0.7107746
# Cross-validated AUC, read from the model itself rather than from a
# performance object.
h2o.auc(rf_fit305, xval = TRUE)
## [1] 0.6987228
# Confusion matrices for the three random-forest performance objects; per the
# output headers each is reported at that model's max-F1 threshold.
h2o.confusionMatrix(rf_perf105)
## Confusion Matrix (vertical: actual; across: predicted)  for max f1 @ threshold = 0.246666666269302:
##         N0 Yes    Error       Rate
## N0     551 245 0.307789   =245/796
## Yes     82 144 0.362832    =82/226
## Totals 633 389 0.319961  =327/1022
h2o.confusionMatrix(rf_perf205)
## Confusion Matrix (vertical: actual; across: predicted)  for max f1 @ threshold = 0.216748768091202:
##         N0 Yes    Error       Rate
## N0     509 287 0.360553   =287/796
## Yes     67 159 0.296460    =67/226
## Totals 576 446 0.346380  =354/1022
h2o.confusionMatrix(rf_perf305)
## Confusion Matrix (vertical: actual; across: predicted)  for max f1 @ threshold = 0.246666666269302:
##         N0 Yes    Error       Rate
## N0     551 245 0.307789   =245/796
## Yes     82 144 0.362832    =82/226
## Totals 633 389 0.319961  =327/1022

Gradient Boosting Machine

# Baseline GBM: only the predictors, response, training frame and a fixed seed
# are supplied; every other setting is left at its default.
gbm_fit105 <- h2o.gbm(
  x = x,
  y = y1,
  training_frame = train,
  model_id = "gbm_fit105",
  seed = 1
)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=========                                                        |  14%
  |                                                                       
  |=================================================================| 100%
# GBM with a 500-tree budget and early stopping scored against `valid`.
gbm_fit205 <- h2o.gbm(
  x = x,
  y = y1,
  training_frame = train,
  model_id = "gbm_fit205",
  validation_frame = valid, # only used if stopping_rounds > 0
  ntrees = 500,
  # Early-stopping controls
  score_tree_interval = 5,
  stopping_rounds = 3,
  stopping_metric = "AUC",
  stopping_tolerance = 0.0005,
  seed = 1
)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=                                                                |   2%
  |                                                                       
  |=================================================================| 100%
# Same configuration as gbm_fit205 except for a much larger tree budget
# (50000); early stopping is again scored against `valid`.
gbm_fit305 <- h2o.gbm(
  x = x,
  y = y1,
  training_frame = train,
  model_id = "gbm_fit305",
  validation_frame = valid, # only used if stopping_rounds > 0
  ntrees = 50000,
  # Early-stopping controls
  score_tree_interval = 5,
  stopping_rounds = 3,
  stopping_metric = "AUC",
  stopping_tolerance = 0.0005,
  seed = 1
)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=================================================================| 100%
# Score the three GBMs on the `test` frame.
gbm_perf105 <- h2o.performance(model = gbm_fit105, newdata = test)
gbm_perf205 <- h2o.performance(model = gbm_fit205, newdata = test)
gbm_perf305 <- h2o.performance(model = gbm_fit305, newdata = test)

# Test AUC of each GBM.
h2o.auc(gbm_perf105)
## [1] 0.6992985
h2o.auc(gbm_perf205)
## [1] 0.702278
h2o.auc(gbm_perf305)
## [1] 0.702278
# Scoring history of the 500-tree-budget model (one row per scoring event).
h2o.scoreHistory(gbm_fit205)
## Scoring History: 
##              timestamp   duration number_of_trees training_rmse
## 1  2018-01-28 21:14:30  0.000 sec               0       0.41225
## 2  2018-01-28 21:14:30  0.031 sec               5       0.38988
## 3  2018-01-28 21:14:30  0.062 sec              10       0.37549
## 4  2018-01-28 21:14:30  0.078 sec              15       0.36573
## 5  2018-01-28 21:14:30  0.109 sec              20       0.35862
## 6  2018-01-28 21:14:30  0.140 sec              25       0.35329
## 7  2018-01-28 21:14:30  0.168 sec              30       0.34800
## 8  2018-01-28 21:14:30  0.199 sec              35       0.34195
## 9  2018-01-28 21:14:30  0.215 sec              40       0.33859
## 10 2018-01-28 21:14:30  0.246 sec              45       0.33531
##    training_logloss training_auc training_lift
## 1           0.52318      0.50000       1.00000
## 2           0.47292      0.79247       3.62497
## 3           0.44268      0.81789       4.22284
## 4           0.42239      0.83553       4.12686
## 5           0.40750      0.84855       4.31881
## 6           0.39658      0.85866       4.31881
## 7           0.38639      0.86996       4.31881
## 8           0.37493      0.88326       4.60673
## 9           0.36888      0.88902       4.60673
## 10          0.36309      0.89384       4.60673
##    training_classification_error validation_rmse validation_logloss
## 1                        0.78293         0.42367            0.54481
## 2                        0.25631         0.40934            0.51132
## 3                        0.26717         0.40521            0.50237
## 4                        0.22313         0.40353            0.49790
## 5                        0.20643         0.40284            0.49550
## 6                        0.18743         0.40306            0.49584
## 7                        0.18076         0.40283            0.49501
## 8                        0.17074         0.40350            0.49761
## 9                        0.16114         0.40454            0.50025
## 10                       0.15300         0.40466            0.50008
##    validation_auc validation_lift validation_classification_error
## 1         0.50000         1.00000                         0.76606
## 2         0.70837         2.95939                         0.33936
## 3         0.70214         3.41974                         0.34739
## 4         0.70733         3.41974                         0.36546
## 5         0.71179         3.41974                         0.35442
## 6         0.71274         2.13734                         0.36345
## 7         0.71541         2.56481                         0.34538
## 8         0.71037         2.56481                         0.32329
## 9         0.70734         2.99227                         0.32129
## 10        0.70937         2.13734                         0.32229
# Scoring history of the 50000-tree-budget model (one row per scoring event).
h2o.scoreHistory(gbm_fit305)
## Scoring History: 
##              timestamp   duration number_of_trees training_rmse
## 1  2018-01-28 21:14:32  0.000 sec               0       0.41225
## 2  2018-01-28 21:14:32  0.016 sec               5       0.38988
## 3  2018-01-28 21:14:32  0.060 sec              10       0.37549
## 4  2018-01-28 21:14:32  0.075 sec              15       0.36573
## 5  2018-01-28 21:14:32  0.107 sec              20       0.35862
## 6  2018-01-28 21:14:32  0.138 sec              25       0.35329
## 7  2018-01-28 21:14:32  0.169 sec              30       0.34800
## 8  2018-01-28 21:14:32  0.200 sec              35       0.34195
## 9  2018-01-28 21:14:32  0.216 sec              40       0.33859
## 10 2018-01-28 21:14:32  0.247 sec              45       0.33531
##    training_logloss training_auc training_lift
## 1           0.52318      0.50000       1.00000
## 2           0.47292      0.79247       3.62497
## 3           0.44268      0.81789       4.22284
## 4           0.42239      0.83553       4.12686
## 5           0.40750      0.84855       4.31881
## 6           0.39658      0.85866       4.31881
## 7           0.38639      0.86996       4.31881
## 8           0.37493      0.88326       4.60673
## 9           0.36888      0.88902       4.60673
## 10          0.36309      0.89384       4.60673
##    training_classification_error validation_rmse validation_logloss
## 1                        0.78293         0.42367            0.54481
## 2                        0.25631         0.40934            0.51132
## 3                        0.26717         0.40521            0.50237
## 4                        0.22313         0.40353            0.49790
## 5                        0.20643         0.40284            0.49550
## 6                        0.18743         0.40306            0.49584
## 7                        0.18076         0.40283            0.49501
## 8                        0.17074         0.40350            0.49761
## 9                        0.16114         0.40454            0.50025
## 10                       0.15300         0.40466            0.50008
##    validation_auc validation_lift validation_classification_error
## 1         0.50000         1.00000                         0.76606
## 2         0.70837         2.95939                         0.33936
## 3         0.70214         3.41974                         0.34739
## 4         0.70733         3.41974                         0.36546
## 5         0.71179         3.41974                         0.35442
## 6         0.71274         2.13734                         0.36345
## 7         0.71541         2.56481                         0.34538
## 8         0.71037         2.56481                         0.32329
## 9         0.70734         2.99227                         0.32129
## 10        0.70937         2.13734                         0.32229
# Plot the scoring history against the number of trees: AUC for gbm_fit205,
# logloss for gbm_fit305.
plot(gbm_fit205, timestep = "number_of_trees", metric = "AUC")

plot(gbm_fit305, timestep = "number_of_trees", metric = "logloss")

# Deliberately small GBM: only 5 trees, no validation frame or early stopping.
gbm_fit405 <- h2o.gbm(
  x = x,
  y = y1,
  training_frame = train,
  model_id = "gbm_fit405",
  ntrees = 5,
  seed = 1
)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=================================================================| 100%
# Score the 5-tree GBM on the `test` frame and report its AUC.
gbm_perf405 <- h2o.performance(model = gbm_fit405, newdata = test)

h2o.auc(gbm_perf405)
## [1] 0.6973529
# Confusion matrices for the four GBM performance objects; per the output
# headers each is reported at that model's max-F1 threshold.
h2o.confusionMatrix(gbm_perf105)
## Confusion Matrix (vertical: actual; across: predicted)  for max f1 @ threshold = 0.237339910626288:
##         N0 Yes    Error       Rate
## N0     583 213 0.267588   =213/796
## Yes     93 133 0.411504    =93/226
## Totals 676 346 0.299413  =306/1022
h2o.confusionMatrix(gbm_perf205)
## Confusion Matrix (vertical: actual; across: predicted)  for max f1 @ threshold = 0.183380894744191:
##         N0 Yes    Error       Rate
## N0     506 290 0.364322   =290/796
## Yes     71 155 0.314159    =71/226
## Totals 577 445 0.353229  =361/1022
h2o.confusionMatrix(gbm_perf305)
## Confusion Matrix (vertical: actual; across: predicted)  for max f1 @ threshold = 0.183380894744191:
##         N0 Yes    Error       Rate
## N0     506 290 0.364322   =290/796
## Yes     71 155 0.314159    =71/226
## Totals 577 445 0.353229  =361/1022
h2o.confusionMatrix(gbm_perf405)
## Confusion Matrix (vertical: actual; across: predicted)  for max f1 @ threshold = 0.233304352675186:
##         N0 Yes    Error       Rate
## N0     593 203 0.255025   =203/796
## Yes     98 128 0.433628    =98/226
## Totals 691 331 0.294521  =301/1022

Deep Learning

# Baseline deep-learning model: only the predictors, response, training frame
# and a fixed seed are supplied; every other setting is left at its default.
dl_fit105 <- h2o.deeplearning(
  x = x,
  y = y1,
  training_frame = train,
  model_id = "dl_fit105",
  seed = 1
)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |====================                                             |  30%
  |                                                                       
  |=======================================                          |  60%
  |                                                                       
  |==========================================================       |  90%
  |                                                                       
  |=================================================================| 100%
# Two hidden layers of 10 units, 20 epochs, with early stopping switched off.
dl_fit205 <- h2o.deeplearning(
  x = x,
  y = y1,
  training_frame = train,
  model_id = "dl_fit205",
  # validation_frame = valid,  # only used if stopping_rounds > 0
  hidden = c(10, 10),
  epochs = 20,
  stopping_rounds = 0, # disable early stopping
  seed = 1
)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=================================================================| 100%
# Same 10x10 architecture with up to 200 epochs and early stopping scored
# against the `valid` frame.
dl_fit305 <- h2o.deeplearning(
  x = x,
  y = y1,
  training_frame = train,
  model_id = "dl_fit305",
  validation_frame = valid, # in DL, early stopping is on by default
  hidden = c(10, 10),
  epochs = 200,
  # Early-stopping controls
  score_interval = 1,
  stopping_rounds = 3,
  stopping_metric = "AUC",
  stopping_tolerance = 0.005,
  seed = 1
)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |===                                                              |   5%
  |                                                                       
  |================================                                 |  50%
  |                                                                       
  |==============================================================   |  95%
  |                                                                       
  |=================================================================| 100%
# Score the first three deep-learning models on the `test` frame.
dl_perf105 <- h2o.performance(model = dl_fit105, newdata = test)
dl_perf205 <- h2o.performance(model = dl_fit205, newdata = test)
dl_perf305 <- h2o.performance(model = dl_fit305, newdata = test)

# Test AUC of each model; the recorded values are the `## [1]` lines.
h2o.auc(dl_perf105)  # default settings
## [1] 0.7017638
h2o.auc(dl_perf205)  # 10x10 hidden layers, 20 epochs, no early stopping
## [1] 0.702253
h2o.auc(dl_perf305)  # 10x10 hidden layers, early stopping on `valid`
## [1] 0.705541
# Plot the AUC scoring history of dl_fit305 against training epochs.
plot(dl_fit305, timestep = "epochs", metric = "AUC")

# Deeper network (seven hidden layers) with up to 200 epochs and early
# stopping scored against the `valid` frame; the stopping tolerance here is
# 0.0005.
dl_fit405 <- h2o.deeplearning(
  x = x,
  y = y1,
  training_frame = train,
  model_id = "dl_fit405",
  validation_frame = valid, # in DL, early stopping is on by default
  hidden = c(32, 64, 32, 128, 32, 64, 32),
  epochs = 200,
  # Early-stopping controls
  score_interval = 1,
  stopping_rounds = 3,
  stopping_metric = "AUC",
  stopping_tolerance = 0.0005,
  seed = 1
)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |======                                                           |  10%
  |                                                                       
  |=============                                                    |  20%
  |                                                                       
  |=================================================================| 100%
# Score the dl_fit405 model on the `test` frame.
dl_perf405 <- h2o.performance(model = dl_fit405,
                              newdata = test)

# Test AUC of dl_fit405. This call previously read `dl_perf40`, an object that
# is not created in this section, so the reported AUC did not belong to this
# model. NOTE(review): the recorded `## [1]` value that follows predates this
# fix and should be regenerated by re-running the document.
h2o.auc(dl_perf405)
## [1] 0.7963151
# Confusion matrices for the four deep-learning performance objects; per the
# output headers each is reported at that model's max-F1 threshold.
h2o.confusionMatrix(dl_perf105)
## Confusion Matrix (vertical: actual; across: predicted)  for max f1 @ threshold = 0.139217562640086:
##         N0 Yes    Error       Rate
## N0     377 419 0.526382   =419/796
## Yes     28 198 0.123894    =28/226
## Totals 405 617 0.437378  =447/1022
h2o.confusionMatrix(dl_perf205)
## Confusion Matrix (vertical: actual; across: predicted)  for max f1 @ threshold = 0.24193354973481:
##         N0 Yes    Error       Rate
## N0     496 300 0.376884   =300/796
## Yes     68 158 0.300885    =68/226
## Totals 564 458 0.360078  =368/1022
h2o.confusionMatrix(dl_perf305)
## Confusion Matrix (vertical: actual; across: predicted)  for max f1 @ threshold = 0.15720004889004:
##         N0 Yes    Error       Rate
## N0     509 287 0.360553   =287/796
## Yes     70 156 0.309735    =70/226
## Totals 579 443 0.349315  =357/1022
h2o.confusionMatrix(dl_perf405)
## Confusion Matrix (vertical: actual; across: predicted)  for max f1 @ threshold = 0.212065363158288:
##         N0 Yes    Error       Rate
## N0     616 180 0.226131   =180/796
## Yes    106 120 0.469027   =106/226
## Totals 722 300 0.279843  =286/1022

CARTESIAN GRID SEARCHES

# Hyper-parameter values for the Cartesian search: 2 x 3 x 2 x 3 = 36
# combinations in total.
gbm_params105 <- list(
  learn_rate = c(0.01, 0.1),
  max_depth = c(3, 5, 9),
  sample_rate = c(0.8, 1.0),
  col_sample_rate = c(0.2, 0.5, 1.0)
)

# Train one 100-tree GBM per combination, scoring each against `valid`.
gbm_grid105 <- h2o.grid(
  "gbm",
  x = x,
  y = y1,
  grid_id = "gbm_grid105",
  training_frame = train,
  validation_frame = valid,
  ntrees = 100,
  seed = 1,
  hyper_params = gbm_params105
)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |==                                                               |   3%
  |                                                                       
  |====                                                             |   6%
  |                                                                       
  |======                                                           |   9%
  |                                                                       
  |========                                                         |  12%
  |                                                                       
  |=========                                                        |  14%
  |                                                                       
  |===========                                                      |  17%
  |                                                                       
  |============                                                     |  19%
  |                                                                       
  |==============                                                   |  21%
  |                                                                       
  |===============                                                  |  23%
  |                                                                       
  |=================                                                |  26%
  |                                                                       
  |==================                                               |  28%
  |                                                                       
  |====================                                             |  30%
  |                                                                       
  |=====================                                            |  33%
  |                                                                       
  |======================                                           |  35%
  |                                                                       
  |========================                                         |  37%
  |                                                                       
  |=========================                                        |  38%
  |                                                                       
  |==========================                                       |  40%
  |                                                                       
  |===========================                                      |  42%
  |                                                                       
  |============================                                     |  43%
  |                                                                       
  |==============================                                   |  45%
  |                                                                       
  |===============================                                  |  47%
  |                                                                       
  |================================                                 |  49%
  |                                                                       
  |=================================                                |  50%
  |                                                                       
  |===================================                              |  53%
  |                                                                       
  |====================================                             |  56%
  |                                                                       
  |======================================                           |  59%
  |                                                                       
  |=======================================                          |  60%
  |                                                                       
  |========================================                         |  61%
  |                                                                       
  |=========================================                        |  64%
  |                                                                       
  |===========================================                      |  66%
  |                                                                       
  |============================================                     |  68%
  |                                                                       
  |==============================================                   |  71%
  |                                                                       
  |===============================================                  |  73%
  |                                                                       
  |=================================================                |  75%
  |                                                                       
  |==================================================               |  78%
  |                                                                       
  |====================================================             |  80%
  |                                                                       
  |======================================================           |  82%
  |                                                                       
  |=======================================================          |  85%
  |                                                                       
  |========================================================         |  87%
  |                                                                       
  |==========================================================       |  89%
  |                                                                       
  |===========================================================      |  91%
  |                                                                       
  |============================================================     |  92%
  |                                                                       
  |=============================================================    |  94%
  |                                                                       
  |===============================================================  |  96%
  |                                                                       
  |================================================================ |  98%
  |                                                                       
  |=================================================================| 100%
# Fetch the finished grid with its models ordered by AUC, highest first, and
# print the summary table.
gbm_gridperf105 <- h2o.getGrid(
  grid_id = "gbm_grid105",
  sort_by = "auc",
  decreasing = TRUE
)
print(gbm_gridperf105)
## H2O Grid Details
## ================
## 
## Grid ID: gbm_grid105 
## Used hyper parameters: 
##   -  col_sample_rate 
##   -  learn_rate 
##   -  max_depth 
##   -  sample_rate 
## Number of models: 36 
## Number of failed models: 0 
## 
## Hyper-Parameter Search Summary: ordered by decreasing auc
##   col_sample_rate learn_rate max_depth sample_rate            model_ids
## 1             0.2       0.01         5         1.0 gbm_grid105_model_24
## 2             1.0       0.01         5         0.8  gbm_grid105_model_8
## 3             1.0       0.01         3         0.8  gbm_grid105_model_2
## 4             0.5       0.01         3         0.8  gbm_grid105_model_1
## 5             0.5       0.01         3         1.0 gbm_grid105_model_19
##                  auc
## 1  0.716080076949471
## 2 0.7155457056232738
## 3 0.7145332125841635
## 4 0.7144572756062303
## 5 0.7143447763796623
## 
## ---
##    col_sample_rate learn_rate max_depth sample_rate            model_ids
## 31             0.5        0.1         5         0.8 gbm_grid105_model_10
## 32             0.2        0.1         9         1.0 gbm_grid105_model_33
## 33             0.2        0.1         9         0.8 gbm_grid105_model_15
## 34             0.5        0.1         9         1.0 gbm_grid105_model_34
## 35             0.5        0.1         9         0.8 gbm_grid105_model_16
## 36             1.0        0.1         9         1.0 gbm_grid105_model_35
##                   auc
## 31   0.68331186473093
## 32 0.6826059320842168
## 33 0.6742416145888998
## 34 0.6728888113894218
## 35 0.6702703918910558
## 36 0.6691172748187356
# Hyper-parameter ranges for the random search. The full Cartesian product of
# these sequences is far too large to enumerate, so it is sampled instead.
gbm_params205 <- list(learn_rate = seq(0.001, 0.1, 0.001),
                      max_depth = seq(2, 10, 1),
                      sample_rate = seq(0.3, 1.0, 0.05),
                      col_sample_rate = seq(0.1, 1.0, 0.05))

# Random discrete search, capped at 36 models.
search_criteria205 <- list(strategy = "RandomDiscrete",
                           max_models = 36)

# Train and validate a grid of GBMs.
# The call previously passed `gbm_params20` / `search_criteria20`, which are
# not the objects defined above; it now uses the `...205` lists so this grid
# actually reflects the settings declared in this block.
gbm_grid205 <- h2o.grid("gbm", x = x, y = y1,
                        grid_id = "gbm_grid205",
                        training_frame = train,
                        validation_frame = valid,
                        ntrees = 100,
                        seed = 1,
                        hyper_params = gbm_params205,
                        search_criteria = search_criteria205)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=                                                                |   2%
  |                                                                       
  |===                                                              |   4%
  |                                                                       
  |====                                                             |   7%
  |                                                                       
  |======                                                           |   9%
  |                                                                       
  |=======                                                          |  11%
  |                                                                       
  |=========                                                        |  14%
  |                                                                       
  |==========                                                       |  16%
  |                                                                       
  |===========                                                      |  17%
  |                                                                       
  |============                                                     |  19%
  |                                                                       
  |=============                                                    |  20%
  |                                                                       
  |===============                                                  |  22%
  |                                                                       
  |================                                                 |  25%
  |                                                                       
  |=================                                                |  27%
  |                                                                       
  |===================                                              |  29%
  |                                                                       
  |====================                                             |  31%
  |                                                                       
  |======================                                           |  33%
  |                                                                       
  |=======================                                          |  36%
  |                                                                       
  |=========================                                        |  38%
  |                                                                       
  |===========================                                      |  41%
  |                                                                       
  |============================                                     |  43%
  |                                                                       
  |=============================                                    |  45%
  |                                                                       
  |===============================                                  |  47%
  |                                                                       
  |================================                                 |  49%
  |                                                                       
  |==================================                               |  52%
  |                                                                       
  |===================================                              |  54%
  |                                                                       
  |=====================================                            |  57%
  |                                                                       
  |======================================                           |  59%
  |                                                                       
  |========================================                         |  61%
  |                                                                       
  |=========================================                        |  64%
  |                                                                       
  |===========================================                      |  66%
  |                                                                       
  |============================================                     |  68%
  |                                                                       
  |=============================================                    |  70%
  |                                                                       
  |===============================================                  |  72%
  |                                                                       
  |================================================                 |  74%
  |                                                                       
  |=================================================                |  76%
  |                                                                       
  |===================================================              |  78%
  |                                                                       
  |=====================================================            |  81%
  |                                                                       
  |======================================================           |  83%
  |                                                                       
  |=======================================================          |  85%
  |                                                                       
  |=========================================================        |  87%
  |                                                                       
  |==========================================================       |  89%
  |                                                                       
  |============================================================     |  92%
  |                                                                       
  |=============================================================    |  94%
  |                                                                       
  |===============================================================  |  97%
  |                                                                       
  |=================================================================| 100%
# Fetch grid "gbm_grid205" with its models sorted by AUC, highest first,
# then print the resulting hyper-parameter summary.
gbm_gridperf205 <- h2o.getGrid(
  grid_id = "gbm_grid205",
  sort_by = "auc",
  decreasing = TRUE
)
print(gbm_gridperf205)
## H2O Grid Details
## ================
## 
## Grid ID: gbm_grid205 
## Used hyper parameters: 
##   -  col_sample_rate 
##   -  learn_rate 
##   -  max_depth 
##   -  sample_rate 
## Number of models: 36 
## Number of failed models: 0 
## 
## Hyper-Parameter Search Summary: ordered by decreasing auc
##   col_sample_rate learn_rate max_depth sample_rate            model_ids
## 1            0.65       0.01         6         0.5  gbm_grid205_model_7
## 2             0.4      0.052         3        0.65 gbm_grid205_model_13
## 3            0.35      0.069         9         0.3  gbm_grid205_model_3
## 4             0.3       0.05         3         0.8 gbm_grid205_model_35
## 5             0.7      0.036         5        0.45 gbm_grid205_model_22
##                  auc
## 1 0.7185184976853284
## 2 0.7173428807676948
## 3 0.7166116357950039
## 4 0.7165047615297644
## 5 0.7153713318220937
## 
## ---
##    col_sample_rate learn_rate max_depth sample_rate            model_ids
## 31             0.8      0.072        10        0.75  gbm_grid205_model_6
## 32            0.25       0.05         8        0.75 gbm_grid205_model_25
## 33             0.9      0.087         6         0.5  gbm_grid205_model_9
## 34            0.65      0.049         9         0.3 gbm_grid205_model_23
## 35             0.8      0.082         7         0.8 gbm_grid205_model_15
## 36             0.1      0.079        10        0.85  gbm_grid205_model_1
##                   auc
## 31 0.6932792962048386
## 32 0.6931864843429202
## 33 0.6887905770647826
## 34 0.6852468514278963
## 35 0.6816496886583904
## 36 0.6714066340793907
#UPDATE

# Narrowed hyper-parameter search space for the "305" random search: tighter
# ranges (and finer steps) for learning rate and row/column sampling rates,
# with the same tree-depth range as before.
gbm_params305 <- list(
  learn_rate = seq(0.02, 0.05, 0.0001),
  max_depth = seq(2, 10, 1),
  sample_rate = seq(0.6, 0.9, 0.005),
  col_sample_rate = seq(0.5, 0.8, 0.005)
)

# Random discrete search over the space above, capped at 36 models.
search_criteria305 <- list(
  strategy = "RandomDiscrete",
  max_models = 36
)

# Train and validate a grid of GBMs (1000 trees per model).
# `x`, `y1`, `train` and `valid` are defined earlier in the file.
# Fix: pass the 305 search space and criteria defined just above. The original
# call referenced gbm_params30 / search_criteria30, which left these two
# definitions unused and tied this grid to objects from an earlier section.
gbm_grid305 <- h2o.grid(
  "gbm",
  x = x,
  y = y1,
  grid_id = "gbm_grid305",
  training_frame = train,
  validation_frame = valid,
  ntrees = 1000,
  seed = 1,
  hyper_params = gbm_params305,
  search_criteria = search_criteria305
)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |                                                                 |   1%
  |                                                                       
  |=                                                                |   1%
  |                                                                       
  |==                                                               |   3%
  |                                                                       
  |==                                                               |   4%
  |                                                                       
  |===                                                              |   5%
  |                                                                       
  |====                                                             |   6%
  |                                                                       
  |====                                                             |   7%
  |                                                                       
  |=====                                                            |   8%
  |                                                                       
  |======                                                           |   9%
  |                                                                       
  |=======                                                          |  11%
  |                                                                       
  |=======                                                          |  12%
  |                                                                       
  |========                                                         |  12%
  |                                                                       
  |=========                                                        |  13%
  |                                                                       
  |=========                                                        |  14%
  |                                                                       
  |=========                                                        |  15%
  |                                                                       
  |==========                                                       |  15%
  |                                                                       
  |===========                                                      |  17%
  |                                                                       
  |===========                                                      |  18%
  |                                                                       
  |============                                                     |  18%
  |                                                                       
  |============                                                     |  19%
  |                                                                       
  |=============                                                    |  20%
  |                                                                       
  |==============                                                   |  21%
  |                                                                       
  |===============                                                  |  22%
  |                                                                       
  |===============                                                  |  23%
  |                                                                       
  |================                                                 |  24%
  |                                                                       
  |================                                                 |  25%
  |                                                                       
  |=================                                                |  25%
  |                                                                       
  |=================                                                |  26%
  |                                                                       
  |==================                                               |  27%
  |                                                                       
  |==================                                               |  28%
  |                                                                       
  |===================                                              |  29%
  |                                                                       
  |===================                                              |  30%
  |                                                                       
  |====================                                             |  31%
  |                                                                       
  |=====================                                            |  32%
  |                                                                       
  |======================                                           |  33%
  |                                                                       
  |======================                                           |  34%
  |                                                                       
  |=======================                                          |  36%
  |                                                                       
  |========================                                         |  36%
  |                                                                       
  |========================                                         |  37%
  |                                                                       
  |=========================                                        |  38%
  |                                                                       
  |=========================                                        |  39%
  |                                                                       
  |==========================                                       |  39%
  |                                                                       
  |==========================                                       |  40%
  |                                                                       
  |===========================                                      |  41%
  |                                                                       
  |===========================                                      |  42%
  |                                                                       
  |============================                                     |  42%
  |                                                                       
  |============================                                     |  43%
  |                                                                       
  |=============================                                    |  45%
  |                                                                       
  |==============================                                   |  46%
  |                                                                       
  |===============================                                  |  47%
  |                                                                       
  |===============================                                  |  48%
  |                                                                       
  |================================                                 |  48%
  |                                                                       
  |================================                                 |  49%
  |                                                                       
  |================================                                 |  50%
  |                                                                       
  |=================================                                |  50%
  |                                                                       
  |=================================                                |  51%
  |                                                                       
  |==================================                               |  52%
  |                                                                       
  |==================================                               |  53%
  |                                                                       
  |===================================                              |  53%
  |                                                                       
  |===================================                              |  54%
  |                                                                       
  |====================================                             |  55%
  |                                                                       
  |====================================                             |  56%
  |                                                                       
  |=====================================                            |  56%
  |                                                                       
  |======================================                           |  58%
  |                                                                       
  |======================================                           |  59%
  |                                                                       
  |=======================================                          |  60%
  |                                                                       
  |========================================                         |  61%
  |                                                                       
  |========================================                         |  62%
  |                                                                       
  |=========================================                        |  63%
  |                                                                       
  |==========================================                       |  64%
  |                                                                       
  |==========================================                       |  65%
  |                                                                       
  |===========================================                      |  67%
  |                                                                       
  |============================================                     |  67%
  |                                                                       
  |============================================                     |  68%
  |                                                                       
  |=============================================                    |  69%
  |                                                                       
  |=============================================                    |  70%
  |                                                                       
  |==============================================                   |  70%
  |                                                                       
  |===============================================                  |  72%
  |                                                                       
  |===============================================                  |  73%
  |                                                                       
  |================================================                 |  74%
  |                                                                       
  |=================================================                |  75%
  |                                                                       
  |=================================================                |  76%
  |                                                                       
  |==================================================               |  76%
  |                                                                       
  |==================================================               |  77%
  |                                                                       
  |===================================================              |  78%
  |                                                                       
  |===================================================              |  79%
  |                                                                       
  |====================================================             |  80%
  |                                                                       
  |=====================================================            |  81%
  |                                                                       
  |======================================================           |  83%
  |                                                                       
  |======================================================           |  84%
  |                                                                       
  |=======================================================          |  84%
  |                                                                       
  |=======================================================          |  85%
  |                                                                       
  |========================================================         |  86%
  |                                                                       
  |========================================================         |  87%
  |                                                                       
  |=========================================================        |  87%
  |                                                                       
  |=========================================================        |  88%
  |                                                                       
  |==========================================================       |  89%
  |                                                                       
  |==========================================================       |  90%
  |                                                                       
  |===========================================================      |  90%
  |                                                                       
  |===========================================================      |  91%
  |                                                                       
  |============================================================     |  92%
  |                                                                       
  |============================================================     |  93%
  |                                                                       
  |=============================================================    |  93%
  |                                                                       
  |=============================================================    |  94%
  |                                                                       
  |==============================================================   |  95%
  |                                                                       
  |==============================================================   |  96%
  |                                                                       
  |===============================================================  |  97%
  |                                                                       
  |===============================================================  |  98%
  |                                                                       
  |================================================================ |  98%
  |                                                                       
  |=================================================================| 100%
# Fetch grid "gbm_grid305" with its models sorted by AUC, highest first,
# then print the resulting hyper-parameter summary.
gbm_gridperf305 <- h2o.getGrid(
  grid_id = "gbm_grid305",
  sort_by = "auc",
  decreasing = TRUE
)
print(gbm_gridperf305)
## H2O Grid Details
## ================
## 
## Grid ID: gbm_grid305 
## Used hyper parameters: 
##   -  col_sample_rate 
##   -  learn_rate 
##   -  max_depth 
##   -  sample_rate 
## Number of models: 36 
## Number of failed models: 0 
## 
## Hyper-Parameter Search Summary: ordered by decreasing auc
##   col_sample_rate learn_rate max_depth sample_rate            model_ids
## 1            0.63     0.0255         2       0.815  gbm_grid305_model_3
## 2           0.705     0.0356         2        0.68 gbm_grid305_model_19
## 3           0.705      0.039         2        0.65 gbm_grid305_model_22
## 4            0.63     0.0488         2       0.605 gbm_grid305_model_23
## 5            0.77     0.0356         2         0.6 gbm_grid305_model_20
##                  auc
## 1 0.7092879361454391
## 2 0.7022736093689356
## 3 0.7019136118439185
## 4 0.6977877027095438
## 5 0.6957514667086665
## 
## ---
##    col_sample_rate learn_rate max_depth sample_rate            model_ids
## 31           0.715     0.0264         7       0.685 gbm_grid305_model_34
## 32           0.535      0.032         5       0.745 gbm_grid305_model_29
## 33            0.52     0.0443         5         0.8 gbm_grid305_model_35
## 34           0.515     0.0327         7       0.715 gbm_grid305_model_18
## 35           0.725     0.0456         6         0.9 gbm_grid305_model_26
## 36             0.7     0.0444         4       0.625 gbm_grid305_model_30
##                   auc
## 31 0.6719128805989459
## 32 0.6704925778635272
## 33 0.6701213304158534
## 34 0.6698035201007994
## 35 0.6688135269070025
## 36 0.6649604283970548
# Take the top-ranked model of the grid retrieved just above. gbm_gridperf305
# is sorted by decreasing AUC, so the first model id is the best one.
# Fix: the original indexed gbm_gridperf30, a grid from an earlier section,
# so the model evaluated here was not the winner of this search.
# NOTE(review): if evaluating the earlier grid was intentional, revert this.
best_gbm_model_id <- gbm_gridperf305@model_ids[[1]]
best_gbm <- h2o.getModel(best_gbm_model_id)

# Now let's evaluate the model performance on a test set
# so we get an honest estimate of top model performance
best_gbm_perf <- h2o.performance(
  model = best_gbm,
  newdata = test
)
# Print the test-set AUC. The previously hard-coded value in this comment did
# not match the printed result, so it has been removed.
h2o.auc(best_gbm_perf)
## [1] 0.7976642
# Print the confusion matrix of the test-set performance object.
h2o.confusionMatrix(object = best_gbm_perf)
## Confusion Matrix (vertical: actual; across: predicted)  for max f1 @ threshold = 0.255673344156405:
##         N0 Yes    Error       Rate
## N0     666 125 0.158028   =125/791
## Yes     85 146 0.367965    =85/231
## Totals 751 271 0.205479  =210/1022