suppressPackageStartupMessages(library(knitr))
suppressPackageStartupMessages(library(caret))
suppressPackageStartupMessages(library(gmodels))
suppressPackageStartupMessages(library(lattice))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(gridExtra))
suppressPackageStartupMessages(library(Kmisc)) # provides chunk(), used below to page through plot grids
suppressPackageStartupMessages(library(ROCR))
suppressPackageStartupMessages(library(corrplot))
madrid <- read.csv("madrid.csv")
temp <- madrid[, c(1, 3)]  # keep the date (CET) and Mean.TemperatureC columns
require(forecast)
require(tseries)
# Hold out the last 25% of days for forecast evaluation
xtraintemp <- window(temp$Mean.TemperatureC, end = round(length(temp$CET) * 0.75))
xforetemp <- window(temp$Mean.TemperatureC, start = round(length(temp$CET) * 0.75))
plot(xtraintemp,type="l",main="Madrid mean daily temperature (training set)",ylab="Degrees Celsius",xlab="Day",col="blue")
plot(xforetemp,type="l",main="Madrid mean daily temperature (forecasting set)",ylab="Degrees Celsius",xlab="Day",col="red")
plot(temp$Mean.TemperatureC,type="p",main="Madrid mean daily temperature, all data",ylab="Degrees Celsius",xlab="Day")
lines(y = xtraintemp, x = 1:5109, col = "blue")   # training portion (first 75%)
lines(y = xforetemp, x = 5109:6812, col = "red")  # held-out portion (last 25%)
xt <- xtraintemp
xf <- xforetemp
x <- temp$Mean.TemperatureC
fc_mean <- meanf(xt, h = length(xf))  # benchmark: every horizon forecast as the training-set mean
plot(fc_mean, main = "Arithmetic Mean Method", ylab = "Level", xlab = "Day")
lines(x)
rw2 <- rwf(xt, h = length(xf))  # naïve benchmark: carry the last observation forward
plot(rw2, main = "Naïve or Random Walk Method", ylab = "Temperature", xlab = "Day")
lines(x)
rwd <- rwf(xt, drift = TRUE, h = length(xf))  # random walk with drift
plot(rwd, main = "Random Walk with Drift Method", ylab = "Temperature", xlab = "Day")
lines(x)
# An ARIMA benchmark as well; the original call passed an empty seasonal
# specification (seasonal = c()), which is not a valid order, so auto.arima()
# is used to select the order instead.
ari <- auto.arima(xt)
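# A sketch of carrying the ARIMA benchmark through the same accuracy
# comparison as the other methods (illustrative; not part of the original output).
fr.ari <- forecast(ari, h = length(xf))
kable(accuracy(fr.ari, xf))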
require(knitr)
kable(accuracy(fc_mean, xf))
|              | ME       | RMSE     | MAE      | MPE  | MAPE | MASE     | ACF1      | Theil’s U |
|--------------|----------|----------|----------|------|------|----------|-----------|-----------|
| Training set | 0.000000 | 7.503106 | 6.457223 | -Inf | Inf  | 4.222349 | 0.9625232 | NA        |
| Test set     | 1.258595 | 7.828909 | 6.799465 | -Inf | Inf  | 4.446139 | 0.9659514 | NaN       |
kable(accuracy(rw2,xf))
|              | ME         | RMSE     | MAE      | MPE  | MAPE | MASE     | ACF1       | Theil’s U |
|--------------|------------|----------|----------|------|------|----------|------------|-----------|
| Training set | 0.0019596  | 2.048501 | 1.529297 | NaN  | Inf  | 1.000000 | -0.0982006 | NA        |
| Test set     | -1.3973005 | 7.852402 | 6.812793 | -Inf | Inf  | 4.454854 | 0.9659514  | NaN       |
kable(accuracy(rwd,xf))
|              | ME        | RMSE     | MAE      | MPE  | MAPE | MASE     | ACF1       | Theil’s U |
|--------------|-----------|----------|----------|------|------|----------|------------|-----------|
| Training set | 0.000000  | 2.048500 | 1.529737 | NaN  | Inf  | 1.000288 | -0.0982006 | NA        |
| Test set     | -3.067886 | 8.371454 | 7.161636 | -Inf | Inf  | 4.682961 | 0.9663510  | NaN       |
# Simple moving averages with 2-, 5-, 30- and 120-day windows
sma2 <- ma(xt, 2)
sma5 <- ma(xt, 5)
sma30 <- ma(xt, 30)
sma120 <- ma(xt, 120)
plot(x,main="Simple Moving Average SMA2",ylab="Level",xlab="Day",col="grey")
lines(sma2,col="blue")
plot(x,main="Simple Moving Average SMA5",ylab="Level",xlab="Day",col="grey")
lines(sma5,col="blue")
plot(x,main="Simple Moving Average SMA30",ylab="Level",xlab="Day",col="grey")
lines(sma30,col="blue")
plot(x,main="Simple Moving Average SMA120",ylab="Level",xlab="Day",col="grey")
lines(sma120,col="blue")
plot(forecast(sma2, h = length(xf), robust = TRUE), main = "Forecast Using Simple Moving Average SMA2", ylab = "Level", xlab = "Day", col = 4)
lines(x, col = 3)
plot(forecast(sma5, h = length(xf), robust = TRUE), main = "Forecast Using Simple Moving Average SMA5", ylab = "Level", xlab = "Day", col = 4)
lines(x, col = 3)
plot(forecast(sma30, h = length(xf), robust = TRUE), main = "Forecast Using Simple Moving Average SMA30", ylab = "Level", xlab = "Day", col = 4)
lines(x, col = 3)
plot(forecast(sma120, h = length(xf), robust = TRUE), main = "Forecast Using Simple Moving Average SMA120", ylab = "Level", xlab = "Day", col = 4)
lines(x, col = 3)
# Extract the 80% prediction-interval bounds (first column of upper/lower)
# and compare them with the actual values over the forecast horizon
f <- forecast(sma2, h = length(xf), robust = TRUE)
f.up <- f$upper[, 1]
f.low <- f$lower[, 1]
x.plot <- x[5109:6812]
plot(y = x.plot, type = "l", x = 1:length(x.plot), ylim = c(min(f.low), max(f.up)), col = "red", main = "80% Confidence Interval for SMA2 Forecast vs Actual Values")
lines(y = f.low, x = 1:length(x.plot), col = "blue")
lines(y = f.up, x = 1:length(x.plot), col = "blue")
f <- forecast(sma5, h = length(xf), robust = TRUE)
f.up <- f$upper[, 1]
f.low <- f$lower[, 1]
x.plot <- x[5109:6812]
plot(y = x.plot, type = "l", x = 1:length(x.plot), ylim = c(min(f.low), max(f.up)), col = "red", main = "80% Confidence Interval for SMA5 Forecast vs Actual Values")
lines(y = f.low, x = 1:length(x.plot), col = "blue")
lines(y = f.up, x = 1:length(x.plot), col = "blue")
f <- forecast(sma30, h = length(xf), robust = TRUE)
f.up <- f$upper[, 1]
f.low <- f$lower[, 1]
x.plot <- x[5109:6812]
plot(y = x.plot, type = "l", x = 1:length(x.plot), ylim = c(min(f.low), max(f.up)), col = "red", main = "80% Confidence Interval for SMA30 Forecast vs Actual Values")
lines(y = f.low, x = 1:length(x.plot), col = "blue")
lines(y = f.up, x = 1:length(x.plot), col = "blue")
f <- forecast(sma120, h = length(xf), robust = TRUE)
f.up <- f$upper[, 1]
f.low <- f$lower[, 1]
x.plot <- x[5109:6812]
plot(y = x.plot, type = "l", x = 1:length(x.plot), ylim = c(min(f.low), max(f.up)), col = "red", main = "80% Confidence Interval for SMA120 Forecast vs Actual Values")
lines(y = f.low, x = 1:length(x.plot), col = "blue")
lines(y = f.up, x = 1:length(x.plot), col = "blue")
# Exponential smoothing state-space model (ETS); the model form is selected automatically
fit.ets <- ets(xt)
fr.ets <- forecast(fit.ets, h = length(xf))
plot(fr.ets)
lines(x)
require(ggplot2)
temp_ma <- ts(na.omit(temp$Mean.TemperatureC), frequency = 30)  # treat ~30 days as one seasonal cycle
decomp <- stl(temp_ma, s.window = "periodic")
plot(decomp)
par(mfrow = c(2, 1))
plot(decomp$time.series[, 2])  # trend component
plot(decomp$time.series[, 3])  # remainder component
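# With tseries loaded, an augmented Dickey-Fuller test is a quick companion
# check on the stationarity of the series (a sketch only).
adf.test(temp_ma, alternative = "stationary")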
(cols_withNa <- apply(madrid, 2, function(x) sum(is.na(x))))
## CET Max.TemperatureC
## 0 2
## Mean.TemperatureC Min.TemperatureC
## 3 2
## Dew.PointC MeanDew.PointC
## 2 2
## Min.DewpointC Max.Humidity
## 2 2
## Mean.Humidity Min.Humidity
## 2 2
## Max.Sea.Level.PressurehPa Mean.Sea.Level.PressurehPa
## 0 0
## Min.Sea.Level.PressurehPa Max.VisibilityKm
## 0 940
## Mean.VisibilityKm Min.VisibilitykM
## 940 940
## Max.Wind.SpeedKm.h Mean.Wind.SpeedKm.h
## 0 0
## Max.Gust.SpeedKm.h Precipitationmm
## 3306 0
## CloudCover Events
## 1372 0
## WindDirDegrees
## 0
require(dplyr)
require(tidyr)
# Drop the columns dominated by missing values, then keep complete rows only
madrid <- madrid %>% select(-c(Max.VisibilityKm, Mean.VisibilityKm, Min.VisibilitykM, Max.Gust.SpeedKm.h, CloudCover))
madrid.f <- madrid[complete.cases(madrid),]
dat <- madrid.f
par(mfrow=c(1,1))
dat$CET <- as.Date(dat$CET)
names(dat)[1] <- "Date"
require(stringr)
Rain <- ifelse(str_detect(dat$Events, "Rain"), 1, 0)  # did it rain today?
# Flag whether it rains tomorrow: day i gets 1 when day i + 1 has rain
RainTomorrow <- numeric(length(Rain))
for (i in 1:(length(Rain) - 1)) {
  if (Rain[i + 1] == 1) {
    RainTomorrow[i] <- 1
  }
}
dat$RainTomorrow <- RainTomorrow
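# Equivalently, without a loop: tomorrow's flag is today's rain indicator
# shifted back one position (dplyr is already loaded; default = 0 matches
# the loop's treatment of the final day).
dat$RainTomorrow <- dplyr::lead(Rain, default = 0)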
require(corrplot)
matrixdat <- as.matrix(dat[, -c(1, 17)])           # drop the Date and Events columns
matrixdat[, -c(17)] <- scale(matrixdat[, -c(17)])  # scale everything except the RainTomorrow indicator
datcor <- cor(matrixdat)
corrplot(datcor)
md <- as.data.frame(matrixdat)
factor_vars <- names(which(sapply(dat, class) == "factor"))
numeric_vars <- setdiff(colnames(dat), factor_vars)
numeric_vars <- setdiff(numeric_vars, c("RainTomorrow","Date"))
numeric_vars
## [1] "Max.TemperatureC" "Mean.TemperatureC"
## [3] "Min.TemperatureC" "Dew.PointC"
## [5] "MeanDew.PointC" "Min.DewpointC"
## [7] "Max.Humidity" "Mean.Humidity"
## [9] "Min.Humidity" "Max.Sea.Level.PressurehPa"
## [11] "Mean.Sea.Level.PressurehPa" "Min.Sea.Level.PressurehPa"
## [13] "Max.Wind.SpeedKm.h" "Mean.Wind.SpeedKm.h"
## [15] "Precipitationmm" "WindDirDegrees"
numeric_vars_mat <- as.matrix(dat[, numeric_vars, drop=FALSE])
numeric_vars_cor <- cor(numeric_vars_mat)
corrplot(numeric_vars_cor)
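# The plot shows heavy collinearity among the temperature, dew-point and
# humidity variables. caret's findCorrelation() (caret is loaded above)
# flags candidates for removal -- a diagnostic sketch, not a step the
# original analysis takes; 0.9 is an arbitrary cutoff.
high_cor <- findCorrelation(numeric_vars_cor, cutoff = 0.9)
numeric_vars[high_cor]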
dat$RainTomorrow <- ifelse(dat$RainTomorrow==1,"Yes","N0")
gp <- invisible(lapply(numeric_vars, function(x) {
  ggplot(data = dat, aes_string(x = "RainTomorrow", y = x, col = "RainTomorrow")) +
    geom_boxplot() + xlab("RainTomorrow") + ylab(x) + ggtitle("") +
    theme(legend.position = "none")}))
grob_plots <- invisible(lapply(chunk(1, length(gp), 4), function(x) {
marrangeGrob(grobs=lapply(gp[x], ggplotGrob), nrow=2, ncol=2)}))
grob_plots
## $chunk1
##
## $chunk2
##
## $chunk3
##
## $chunk4
gp <- invisible(lapply(numeric_vars, function(x) {
  ggplot(data = dat, aes_string(x = x, col = "RainTomorrow")) + geom_density() +
    xlab(x) + ggtitle(paste(x, "density", sep = " "))}))
grob_plots <- invisible(lapply(chunk(1, length(gp), 4), function(x) {
marrangeGrob(grobs=lapply(gp[x], ggplotGrob), nrow=2, ncol=2)}))
grob_plots
## $chunk1
##
## $chunk2
##
## $chunk3
##
## $chunk4
dat$RainToday <- Rain
dat$RainToday <- ifelse(dat$RainToday==1,"Yes","N0")
gp <- invisible(lapply(numeric_vars, function(x) {
  ggplot(data = dat, aes_string(x = "RainToday", y = x, col = "RainToday")) +
    geom_boxplot() + xlab("RainToday") + ylab(x) + ggtitle("") +
    theme(legend.position = "none")}))
grob_plots <- invisible(lapply(chunk(1, length(gp), 4), function(x) {
marrangeGrob(grobs=lapply(gp[x], ggplotGrob), nrow=2, ncol=2)}))
grob_plots
## $chunk1
##
## $chunk2
##
## $chunk3
##
## $chunk4
gp <- invisible(lapply(numeric_vars, function(x) {
  ggplot(data = dat, aes_string(x = x, col = "RainToday")) + geom_density() +
    xlab(x) + ggtitle(paste(x, "density", sep = " "))}))
grob_plots <- invisible(lapply(chunk(1, length(gp), 4), function(x) {
marrangeGrob(grobs=lapply(gp[x], ggplotGrob), nrow=2, ncol=2)}))
grob_plots
## $chunk1
##
## $chunk2
##
## $chunk3
##
## $chunk4
# Flag whether it rains in two days: day i gets 1 when day i + 2 has rain
Rain_in2days <- numeric(length(dat$RainToday))
for (i in 1:(length(dat$RainToday) - 2)) {
  Rain_in2days[i] <- ifelse(dat$RainToday[i + 2] == "Yes", 1, 0)
}
dat$Rain_in2days <- Rain_in2days
dat$Rain_in2days <- ifelse(dat$Rain_in2days==1,"Yes","N0")
gp <- invisible(lapply(numeric_vars, function(x) {
  ggplot(data = dat, aes_string(x = "Rain_in2days", y = x, col = "Rain_in2days")) +
    geom_boxplot() + xlab("Rain in 2 days") + ylab(x) + ggtitle("") +
    theme(legend.position = "none")}))
grob_plots <- invisible(lapply(chunk(1, length(gp), 4), function(x) {
marrangeGrob(grobs=lapply(gp[x], ggplotGrob), nrow=2, ncol=2)}))
grob_plots
## $chunk1
##
## $chunk2
##
## $chunk3
##
## $chunk4
gp <- invisible(lapply(numeric_vars, function(x) {
  ggplot(data = dat, aes_string(x = x, col = "Rain_in2days")) + geom_density() +
    xlab(x) + ggtitle(paste(x, "density", sep = " "))}))
grob_plots <- invisible(lapply(chunk(1, length(gp), 4), function(x) {
marrangeGrob(grobs=lapply(gp[x], ggplotGrob), nrow=2, ncol=2)}))
grob_plots
## $chunk1
##
## $chunk2
##
## $chunk3
##
## $chunk4
require(kohonen)
somdat <- dat[numeric_vars]
somdat <- scale(as.matrix(somdat))
# keep the 'mean' variant of each weather variable, plus precipitation
somdat <- somdat[, c(2, 5, 8, 11, 14, 15)]
som_grid <- somgrid(xdim = 5, ydim=5, topo="hexagonal")
som_model <- som(somdat,
grid=som_grid,
rlen=100,
alpha=c(0.05,0.01),
keep.data = TRUE)
plot(som_model, type="changes")
plot(som_model, type="count")
plot(som_model, type="dist.neighbours")
plot(som_model, type="codes")
dat$Month <- format(as.Date(dat$Date), "%m")
dat$Year <- format(as.Date(dat$Date), "%Y")
library(h2o)
h2o.init(nthreads = -1, #Number of threads -1 means use all cores on your machine
max_mem_size = "4G") #max mem size is the maximum memory to allocate to H2O
##
## H2O is not running yet, starting it now...
##
## Note: In case of errors look at the following log files:
## C:\Users\GERHAR~1\AppData\Local\Temp\RtmpqG2RrY/h2o_Gerhard_Viljoen_started_from_r.out
## C:\Users\GERHAR~1\AppData\Local\Temp\RtmpqG2RrY/h2o_Gerhard_Viljoen_started_from_r.err
##
##
## Starting H2O JVM and connecting: . Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 3 seconds 540 milliseconds
## H2O cluster version: 3.16.0.2
## H2O cluster version age: 1 month and 29 days
## H2O cluster name: H2O_started_from_R_Gerhard_Viljoen_pbn723
## H2O cluster total nodes: 1
## H2O cluster total memory: 3.56 GB
## H2O cluster total cores: 4
## H2O cluster allowed cores: 4
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## H2O API Extensions: Algos, AutoML, Core V3, Core V4
## R Version: R version 3.4.3 (2017-11-30)
#h2o.no_progress()
dat$Events <- as.factor(dat$Events)
dat$RainToday <- as.factor(dat$RainToday)
dat$RainTomorrow <- as.factor(dat$RainTomorrow)
dat$Rain_in2days <- as.factor(dat$Rain_in2days)
dat$Year <- as.factor(dat$Year)
dat$Month <- as.factor(dat$Month)
dat <- dat %>% select(-c(Events,Precipitationmm,Date))
dat.hex <- as.h2o(dat)
#dat.hex[,-c(18:20)] <- scale(dat.hex[,-c(18:20)])
# Partition the data into training, validation and test sets
splits <- h2o.splitFrame(data = dat.hex,
ratios = c(0.7, 0.15), #partition data into 70%, 15%, 15% chunks
seed = 1) #setting a seed will guarantee reproducibility
train <- splits[[1]]
valid <- splits[[2]]
test <- splits[[3]]
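# Sanity-check the split: h2o.splitFrame uses approximate ratios, so the
# partition sizes will be close to (not exactly) 70/15/15.
sapply(list(train = train, valid = valid, test = test), h2o.nrow)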
y1 <- "RainToday"
y2 <- "RainTomorrow"
y3 <- "Rain_in2days"
x <- setdiff(names(dat.hex), c(y1,y2,y3))
print(x)
## [1] "Max.TemperatureC" "Mean.TemperatureC"
## [3] "Min.TemperatureC" "Dew.PointC"
## [5] "MeanDew.PointC" "Min.DewpointC"
## [7] "Max.Humidity" "Mean.Humidity"
## [9] "Min.Humidity" "Max.Sea.Level.PressurehPa"
## [11] "Mean.Sea.Level.PressurehPa" "Min.Sea.Level.PressurehPa"
## [13] "Max.Wind.SpeedKm.h" "Mean.Wind.SpeedKm.h"
## [15] "WindDirDegrees" "Month"
## [17] "Year"
glm_fit1 <- h2o.glm(x = x,
y = y1,
training_frame = train,
model_id = "glm_fit1",
family = "binomial") #similar to R's glm, h2o.glm has the family argument
# Next we will do some automatic tuning by passing in a validation frame and setting
# `lambda_search = TRUE`. Since we are training a GLM with regularization, we should
# try to find the right amount of regularization (to avoid overfitting). The model
# parameter, `lambda`, controls the amount of regularization in a GLM model and we can
# find the optimal value for `lambda` automatically by setting `lambda_search = TRUE`
# and passing in a validation frame (which is used to evaluate model performance using a
# particular value of lambda).
glm_fit2 <- h2o.glm(x = x,
y = y1,
training_frame = train,
model_id = "glm_fit2",
validation_frame = valid,
family = "binomial",
lambda_search = TRUE)
# Let's compare the performance of the two GLMs
glm_perf1 <- h2o.performance(model = glm_fit1,
newdata = test)
glm_perf2 <- h2o.performance(model = glm_fit2,
newdata = test)
# Print model performance
glm_perf1
## H2OBinomialMetrics: glm
##
## MSE: 0.06893514
## RMSE: 0.262555
## LogLoss: 0.2314325
## Mean Per-Class Error: 0.1571695
## AUC: 0.9437533
## Gini: 0.8875065
## R^2: 0.5792355
## Residual Deviance: 473.0479
## AIC: 561.0479
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## N0 Yes Error Rate
## N0 779 32 0.039457 =32/811
## Yes 58 153 0.274882 =58/211
## Totals 837 185 0.088063 =90/1022
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.508712 0.772727 144
## 2 max f2 0.179051 0.824698 240
## 3 max f0point5 0.617765 0.810489 118
## 4 max accuracy 0.508712 0.911937 144
## 5 max precision 0.999272 1.000000 0
## 6 max recall 0.001937 1.000000 392
## 7 max specificity 0.999272 1.000000 0
## 8 max absolute_mcc 0.508712 0.720796 144
## 9 max min_per_class_accuracy 0.211979 0.865598 226
## 10 max mean_per_class_accuracy 0.179051 0.876774 240
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
glm_perf2
## H2OBinomialMetrics: glm
##
## MSE: 0.0690964
## RMSE: 0.2628619
## LogLoss: 0.2316171
## Mean Per-Class Error: 0.1548962
## AUC: 0.9438117
## Gini: 0.8876234
## R^2: 0.5782512
## Residual Deviance: 473.4253
## AIC: 563.4253
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## N0 Yes Error Rate
## N0 775 36 0.044390 =36/811
## Yes 56 155 0.265403 =56/211
## Totals 831 191 0.090020 =92/1022
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.477549 0.771144 152
## 2 max f2 0.161318 0.826162 247
## 3 max f0point5 0.656427 0.814095 119
## 4 max accuracy 0.507839 0.910959 147
## 5 max precision 0.999323 1.000000 0
## 6 max recall 0.001606 1.000000 392
## 7 max specificity 0.999323 1.000000 0
## 8 max absolute_mcc 0.507839 0.717992 147
## 9 max min_per_class_accuracy 0.218971 0.872038 221
## 10 max mean_per_class_accuracy 0.161318 0.877294 247
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
# Instead of printing the entire model performance metrics object,
# it is probably easier to print just the metric that you are interested in comparing.
# Retrieve test set AUC
h2o.auc(glm_perf1)
## [1] 0.9437533
h2o.auc(glm_perf2)
## [1] 0.9438117
# Compare test AUC to the training AUC and validation AUC
h2o.auc(glm_fit2, train = TRUE)
## [1] 0.9443575
h2o.auc(glm_fit2, valid = TRUE)
## [1] 0.950664
#glm_fit2@model$validation_metrics
h2o.confusionMatrix(glm_perf1)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.508711960103498:
## N0 Yes Error Rate
## N0 779 32 0.039457 =32/811
## Yes 58 153 0.274882 =58/211
## Totals 837 185 0.088063 =90/1022
h2o.confusionMatrix(glm_perf2)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.477548815356572:
## N0 Yes Error Rate
## N0 775 36 0.044390 =36/811
## Yes 56 155 0.265403 =56/211
## Totals 831 191 0.090020 =92/1022
rf_fit1 <- h2o.randomForest(x = x,
y = y1,
training_frame = train,
model_id = "rf_fit1",
seed = 1)
rf_fit2 <- h2o.randomForest(x = x,
y = y1,
training_frame = train,
model_id = "rf_fit2",
validation_frame = valid, #only used if stopping_rounds > 0
ntrees = 100000,
score_tree_interval = 5, #used for early stopping
stopping_rounds = 3, #used for early stopping
stopping_metric = "AUC", #used for early stopping
stopping_tolerance = 0.0005, #used for early stopping
seed = 1)
#Let's compare the performance of the two RFs
rf_perf1 <- h2o.performance(model = rf_fit1,
newdata = test)
rf_perf2 <- h2o.performance(model = rf_fit2,
newdata = test)
# Print model performance
h2o.auc(rf_perf1)
## [1] 0.9336464
h2o.auc(rf_perf2)
## [1] 0.9369803
# Cross-validate with 5 folds
rf_fit3 <- h2o.randomForest(x = x,
y = y1,
training_frame = train,
model_id = "rf_fit3",
seed = 1,
nfolds = 5)
# To evaluate the cross-validated AUC, do the following:
h2o.auc(rf_fit3, xval = TRUE)
## [1] 0.9245813
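# Per-fold detail is also stored on the model object (slot layout as in
# h2o 3.16; shown here as a sketch).
rf_fit3@model$cross_validation_metrics_summary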
h2o.confusionMatrix(rf_perf1)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.42:
## N0 Yes Error Rate
## N0 764 47 0.057953 =47/811
## Yes 53 158 0.251185 =53/211
## Totals 817 205 0.097847 =100/1022
h2o.confusionMatrix(rf_perf2)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.4125:
## N0 Yes Error Rate
## N0 764 47 0.057953 =47/811
## Yes 55 156 0.260664 =55/211
## Totals 819 203 0.099804 =102/1022
gbm_fit1 <- h2o.gbm(x = x,
y = y1,
training_frame = train,
model_id = "gbm_fit1",
seed = 1)
gbm_fit2 <- h2o.gbm(x = x,
y = y1,
training_frame = train,
model_id = "gbm_fit2",
validation_frame = valid, #only used if stopping_rounds > 0
ntrees = 500,
score_tree_interval = 5, #used for early stopping
stopping_rounds = 3, #used for early stopping
stopping_metric = "AUC", #used for early stopping
stopping_tolerance = 0.0005, #used for early stopping
seed = 1)
gbm_fit3 <- h2o.gbm(x = x,
y = y1,
training_frame = train,
model_id = "gbm_fit3",
validation_frame = valid, #only used if stopping_rounds > 0
ntrees = 50000,
score_tree_interval = 5, #used for early stopping
stopping_rounds = 3, #used for early stopping
stopping_metric = "AUC", #used for early stopping
stopping_tolerance = 0.0005, #used for early stopping
seed = 1)
gbm_perf1 <- h2o.performance(model = gbm_fit1,
newdata = test)
gbm_perf2 <- h2o.performance(model = gbm_fit2,
newdata = test)
gbm_perf3 <- h2o.performance(model = gbm_fit3,
newdata = test)
h2o.auc(gbm_perf1)
## [1] 0.9456116
h2o.auc(gbm_perf2)
## [1] 0.946944
h2o.auc(gbm_perf3)
## [1] 0.946944
h2o.scoreHistory(gbm_fit2)
## Scoring History:
## timestamp duration number_of_trees training_rmse
## 1 2018-01-28 21:01:19 0.000 sec 0 0.41424
## 2 2018-01-28 21:01:19 0.032 sec 5 0.33477
## 3 2018-01-28 21:01:19 0.063 sec 10 0.29373
## 4 2018-01-28 21:01:19 0.107 sec 15 0.26949
## 5 2018-01-28 21:01:19 0.153 sec 20 0.25432
## training_logloss training_auc training_lift
## 1 0.52690 0.50000 1.00000
## 2 0.37441 0.94039 4.54554
## 3 0.30527 0.95214 4.54554
## 4 0.26394 0.95846 4.54554
## 5 0.23658 0.96453 4.54554
## training_classification_error validation_rmse validation_logloss
## 1 0.78000 0.42489 0.54709
## 2 0.10081 0.35010 0.39967
## 3 0.08913 0.31656 0.34011
## 4 0.08328 0.30122 0.31031
## 5 0.07765 0.29461 0.29490
## validation_auc validation_lift validation_classification_error
## 1 0.50000 1.00000 0.76406
## 2 0.91773 4.23830 0.11345
## 3 0.92434 4.23830 0.11546
## 4 0.92637 4.23830 0.10542
## 5 0.92795 4.23830 0.10643
##
## ---
## timestamp duration number_of_trees training_rmse
## 23 2018-01-28 21:01:19 0.775 sec 110 0.15847
## 24 2018-01-28 21:01:19 0.822 sec 115 0.15490
## 25 2018-01-28 21:01:19 0.853 sec 120 0.15165
## 26 2018-01-28 21:01:19 0.885 sec 125 0.14784
## 27 2018-01-28 21:01:19 0.916 sec 130 0.14553
## 28 2018-01-28 21:01:20 0.947 sec 135 0.14212
## training_logloss training_auc training_lift
## 23 0.10434 0.99553 4.54554
## 24 0.10088 0.99605 4.54554
## 25 0.09789 0.99654 4.54554
## 26 0.09458 0.99699 4.54554
## 27 0.09248 0.99721 4.54554
## 28 0.08946 0.99750 4.54554
## training_classification_error validation_rmse validation_logloss
## 23 0.02588 0.27654 0.25339
## 24 0.02379 0.27586 0.25214
## 25 0.02254 0.27565 0.25178
## 26 0.02108 0.27558 0.25205
## 27 0.01941 0.27584 0.25245
## 28 0.01774 0.27580 0.25253
## validation_auc validation_lift validation_classification_error
## 23 0.94168 4.23830 0.10843
## 24 0.94262 4.23830 0.10743
## 25 0.94275 4.23830 0.10643
## 26 0.94252 4.23830 0.10643
## 27 0.94240 4.23830 0.10743
## 28 0.94250 4.23830 0.10643
h2o.scoreHistory(gbm_fit3)
## Scoring History:
## timestamp duration number_of_trees training_rmse
## 1 2018-01-28 21:01:20 0.000 sec 0 0.41424
## 2 2018-01-28 21:01:20 0.047 sec 5 0.33477
## 3 2018-01-28 21:01:20 0.078 sec 10 0.29373
## 4 2018-01-28 21:01:20 0.125 sec 15 0.26949
## 5 2018-01-28 21:01:20 0.156 sec 20 0.25432
## training_logloss training_auc training_lift
## 1 0.52690 0.50000 1.00000
## 2 0.37441 0.94039 4.54554
## 3 0.30527 0.95214 4.54554
## 4 0.26394 0.95846 4.54554
## 5 0.23658 0.96453 4.54554
## training_classification_error validation_rmse validation_logloss
## 1 0.78000 0.42489 0.54709
## 2 0.10081 0.35010 0.39967
## 3 0.08913 0.31656 0.34011
## 4 0.08328 0.30122 0.31031
## 5 0.07765 0.29461 0.29490
## validation_auc validation_lift validation_classification_error
## 1 0.50000 1.00000 0.76406
## 2 0.91773 4.23830 0.11345
## 3 0.92434 4.23830 0.11546
## 4 0.92637 4.23830 0.10542
## 5 0.92795 4.23830 0.10643
##
## ---
## timestamp duration number_of_trees training_rmse
## 23 2018-01-28 21:01:21 0.737 sec 110 0.15847
## 24 2018-01-28 21:01:21 0.762 sec 115 0.15490
## 25 2018-01-28 21:01:21 0.794 sec 120 0.15165
## 26 2018-01-28 21:01:21 0.825 sec 125 0.14784
## 27 2018-01-28 21:01:21 0.856 sec 130 0.14553
## 28 2018-01-28 21:01:21 0.888 sec 135 0.14212
## training_logloss training_auc training_lift
## 23 0.10434 0.99553 4.54554
## 24 0.10088 0.99605 4.54554
## 25 0.09789 0.99654 4.54554
## 26 0.09458 0.99699 4.54554
## 27 0.09248 0.99721 4.54554
## 28 0.08946 0.99750 4.54554
## training_classification_error validation_rmse validation_logloss
## 23 0.02588 0.27654 0.25339
## 24 0.02379 0.27586 0.25214
## 25 0.02254 0.27565 0.25178
## 26 0.02108 0.27558 0.25205
## 27 0.01941 0.27584 0.25245
## 28 0.01774 0.27580 0.25253
## validation_auc validation_lift validation_classification_error
## 23 0.94168 4.23830 0.10843
## 24 0.94262 4.23830 0.10743
## 25 0.94275 4.23830 0.10643
## 26 0.94252 4.23830 0.10643
## 27 0.94240 4.23830 0.10743
## 28 0.94250 4.23830 0.10643
plot(gbm_fit3,
timestep = "number_of_trees",
metric = "AUC")
plot(gbm_fit3,
timestep = "number_of_trees",
metric = "logloss")
gbm_fit4 <- h2o.gbm(x = x,
y = y1,
training_frame = train,
model_id = "gbm_fit4",
ntrees = 5,
seed = 1)
gbm_perf4 <- h2o.performance(model = gbm_fit4,
newdata = test)
h2o.auc(gbm_perf4)
## [1] 0.922321
h2o.confusionMatrix(gbm_perf1)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.387829013222046:
## N0 Yes Error Rate
## N0 766 45 0.055487 =45/811
## Yes 49 162 0.232227 =49/211
## Totals 815 207 0.091977 =94/1022
h2o.confusionMatrix(gbm_perf2)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.360968775857713:
## N0 Yes Error Rate
## N0 762 49 0.060419 =49/811
## Yes 46 165 0.218009 =46/211
## Totals 808 214 0.092955 =95/1022
h2o.confusionMatrix(gbm_perf3)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.360968775857713:
## N0 Yes Error Rate
## N0 762 49 0.060419 =49/811
## Yes 46 165 0.218009 =46/211
## Totals 808 214 0.092955 =95/1022
h2o.confusionMatrix(gbm_perf4)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.250636786717922:
## N0 Yes Error Rate
## N0 741 70 0.086313 =70/811
## Yes 50 161 0.236967 =50/211
## Totals 791 231 0.117417 =120/1022
dl_fit1 <- h2o.deeplearning(x = x,
y = y1,
training_frame = train,
model_id = "dl_fit1",
seed = 1)
dl_fit2 <- h2o.deeplearning(x = x,
y = y1,
training_frame = train,
model_id = "dl_fit2",
#validation_frame = valid, #only used if stopping_rounds > 0
epochs = 20,
hidden= c(10,10),
stopping_rounds = 0, # disable early stopping
seed = 1)
dl_fit3 <- h2o.deeplearning(x = x,
y = y1,
training_frame = train,
model_id = "dl_fit3",
validation_frame = valid, #in DL, early stopping is on by default
epochs = 200,
hidden = c(10,10),
score_interval = 1, #used for early stopping
stopping_rounds = 3, #used for early stopping
stopping_metric = "AUC", #used for early stopping
stopping_tolerance = 0.005, #used for early stopping
seed = 1)
dl_perf1 <- h2o.performance(model = dl_fit1,
newdata = test)
dl_perf2 <- h2o.performance(model = dl_fit2,
newdata = test)
dl_perf3 <- h2o.performance(model = dl_fit3,
newdata = test)
h2o.auc(dl_perf1)
## [1] 0.9449045
h2o.auc(dl_perf2)
## [1] 0.9446298
h2o.auc(dl_perf3)
## [1] 0.9404866
plot(dl_fit3,
timestep = "epochs",
metric = "AUC")
dl_fit4 <- h2o.deeplearning(x = x,
y = y1,
training_frame = train,
model_id = "dl_fit4",
validation_frame = valid, #in DL, early stopping is on by default
epochs = 200,
hidden = c(32,64,32,128,32,64,32),
score_interval = 1, #used for early stopping
stopping_rounds = 3, #used for early stopping
stopping_metric = "AUC", #used for early stopping
stopping_tolerance = 0.0005, #used for early stopping
seed = 1)
dl_perf4 <- h2o.performance(model = dl_fit4,
newdata = test)
h2o.auc(dl_perf4)
## [1] 0.9396889
h2o.confusionMatrix(dl_perf1)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.487012442242802:
## N0 Yes Error Rate
## N0 766 45 0.055487 =45/811
## Yes 47 164 0.222749 =47/211
## Totals 813 209 0.090020 =92/1022
h2o.confusionMatrix(dl_perf2)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.44685584564985:
## N0 Yes Error Rate
## N0 765 46 0.056720 =46/811
## Yes 48 163 0.227488 =48/211
## Totals 813 209 0.091977 =94/1022
h2o.confusionMatrix(dl_perf3)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.464838685057443:
## N0 Yes Error Rate
## N0 774 37 0.045623 =37/811
## Yes 50 161 0.236967 =50/211
## Totals 824 198 0.085127 =87/1022
h2o.confusionMatrix(dl_perf4)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.338893274470836:
## N0 Yes Error Rate
## N0 730 81 0.099877 =81/811
## Yes 31 180 0.146919 =31/211
## Totals 761 261 0.109589 =112/1022
gbm_params11 <- list(learn_rate = c(0.01, 0.1),
max_depth = c(3, 5, 9),
sample_rate = c(0.8, 1.0),
col_sample_rate = c(0.2, 0.5, 1.0))
gbm_grid11 <- h2o.grid("gbm", x = x, y = y1,
grid_id = "gbm_grid11",
training_frame = train,
validation_frame = valid,
ntrees = 100,
seed = 1,
hyper_params = gbm_params11)
gbm_gridperf11 <- h2o.getGrid(grid_id = "gbm_grid11",
sort_by = "auc",
decreasing = TRUE)
print(gbm_gridperf11)
## H2O Grid Details
## ================
##
## Grid ID: gbm_grid11
## Used hyper parameters:
## - col_sample_rate
## - learn_rate
## - max_depth
## - sample_rate
## Number of models: 36
## Number of failed models: 0
##
## Hyper-Parameter Search Summary: ordered by decreasing auc
## col_sample_rate learn_rate max_depth sample_rate model_ids
## 1 0.5 0.1 5 1.0 gbm_grid11_model_28
## 2 1.0 0.1 5 0.8 gbm_grid11_model_11
## 3 0.2 0.1 5 1.0 gbm_grid11_model_27
## 4 1.0 0.1 3 1.0 gbm_grid11_model_23
## 5 1.0 0.1 9 0.8 gbm_grid11_model_17
## auc
## 1 0.9448122571085079
## 2 0.9439399446417088
## 3 0.943707887158554
## 4 0.9422736041602594
## 5 0.9420946682696341
##
## ---
## col_sample_rate learn_rate max_depth sample_rate model_ids
## 31 1.0 0.01 3 0.8 gbm_grid11_model_2
## 32 1.0 0.01 3 1.0 gbm_grid11_model_20
## 33 0.5 0.01 3 0.8 gbm_grid11_model_1
## 34 0.5 0.01 3 1.0 gbm_grid11_model_19
## 35 0.2 0.01 3 0.8 gbm_grid11_model_0
## 36 0.2 0.01 3 1.0 gbm_grid11_model_18
## auc
## 31 0.916940196270305
## 32 0.9167696479995527
## 33 0.9148684541616574
## 34 0.9134425587832359
## 35 0.9127324069673163
## 36 0.9125422875835267
gbm_params21 <- list(learn_rate = seq(0.001, 0.1, 0.001),
max_depth = seq(2, 10, 1),
sample_rate = seq(0.3, 1.0, 0.05),
col_sample_rate = seq(0.1, 1.0, 0.05))
search_criteria21 <- list(strategy = "RandomDiscrete",
max_models = 36)
# Train and validate a grid of GBMs
gbm_grid21 <- h2o.grid("gbm", x = x, y = y1,
grid_id = "gbm_grid21",
training_frame = train,
validation_frame = valid,
ntrees = 100,
seed = 1,
hyper_params = gbm_params21,
search_criteria = search_criteria21)
gbm_gridperf21 <- h2o.getGrid(grid_id = "gbm_grid21",
sort_by = "auc",
decreasing = TRUE)
print(gbm_gridperf21)
## H2O Grid Details
## ================
##
## Grid ID: gbm_grid21
## Used hyper parameters:
## - col_sample_rate
## - learn_rate
## - max_depth
## - sample_rate
## Number of models: 36
## Number of failed models: 0
##
## Hyper-Parameter Search Summary: ordered by decreasing auc
## col_sample_rate learn_rate max_depth sample_rate model_ids
## 1 0.65 0.059 9 0.5 gbm_grid21_model_22
## 2 0.4 0.067 7 0.75 gbm_grid21_model_34
## 3 0.5 0.052 8 0.6 gbm_grid21_model_19
## 4 0.4 0.062 6 0.9 gbm_grid21_model_2
## 5 0.7 0.052 4 0.8 gbm_grid21_model_21
## auc
## 1 0.9422344619341851
## 2 0.9416724914026896
## 3 0.941339782481058
## 4 0.9406212430452652
## 5 0.9401599239522465
##
## ---
## col_sample_rate learn_rate max_depth sample_rate model_ids
## 31 0.4 0.079 2 0.45 gbm_grid21_model_18
## 32 0.85 0.012 6 0.3 gbm_grid21_model_32
## 33 0.7 0.011 4 0.95 gbm_grid21_model_8
## 34 0.9 0.01 4 0.55 gbm_grid21_model_6
## 35 0.4 0.042 2 0.5 gbm_grid21_model_16
## 36 0.15 0.008 2 0.65 gbm_grid21_model_11
## auc
## 31 0.9301031677244388
## 32 0.9257220342774066
## 33 0.9240165515698828
## 34 0.9237201890010345
## 35 0.9235440489837
## 36 0.8947465540861689
# Narrow the hyperparameter ranges around the best settings found by the previous random search
gbm_params31 <- list(learn_rate = seq(0.02, 0.05, 0.0001),
max_depth = seq(2, 10, 1),
sample_rate = seq(0.6, 0.9, 0.005),
col_sample_rate = seq(0.5, 0.8, 0.005))
search_criteria31 <- list(strategy = "RandomDiscrete",
max_models = 36)
# Train and validate a grid of GBMs
gbm_grid31<- h2o.grid("gbm", x = x, y = y1,
grid_id = "gbm_grid31",
training_frame = train,
validation_frame = valid,
ntrees = 1000,
seed = 1,
hyper_params = gbm_params31,
search_criteria = search_criteria31)
gbm_gridperf31 <- h2o.getGrid(grid_id = "gbm_grid31",
sort_by = "auc",
decreasing = TRUE)
print(gbm_gridperf31)
## H2O Grid Details
## ================
##
## Grid ID: gbm_grid31
## Used hyper parameters:
## - col_sample_rate
## - learn_rate
## - max_depth
## - sample_rate
## Number of models: 36
## Number of failed models: 0
##
## Hyper-Parameter Search Summary: ordered by decreasing auc
## col_sample_rate learn_rate max_depth sample_rate model_ids
## 1 0.585 0.0436 3 0.785 gbm_grid31_model_5
## 2 0.64 0.0404 3 0.775 gbm_grid31_model_20
## 3 0.53 0.0499 2 0.61 gbm_grid31_model_29
## 4 0.53 0.0345 3 0.69 gbm_grid31_model_32
## 5 0.625 0.0367 2 0.78 gbm_grid31_model_0
## auc
## 1 0.9524198283333799
## 2 0.950801017697878
## 3 0.9500852741353761
## 4 0.9498755836385494
## 5 0.9495037324908435
##
## ---
## col_sample_rate learn_rate max_depth sample_rate model_ids
## 31 0.625 0.0365 10 0.685 gbm_grid31_model_13
## 32 0.615 0.036 9 0.685 gbm_grid31_model_31
## 33 0.715 0.0352 10 0.89 gbm_grid31_model_6
## 34 0.69 0.0353 9 0.64 gbm_grid31_model_10
## 35 0.5 0.0285 8 0.785 gbm_grid31_model_21
## 36 0.645 0.0372 9 0.85 gbm_grid31_model_22
## auc
## 31 0.942597925462018
## 32 0.9424497441775939
## 33 0.9421953197081108
## 34 0.9421869320882377
## 35 0.9419408952386278
## 36 0.9415187183716834
best_gbm_model_id <- gbm_gridperf31@model_ids[[1]]
best_gbm <- h2o.getModel(best_gbm_model_id)
# Now let's evaluate the model performance on a test set
# so we get an honest estimate of top model performance
best_gbm_perf <- h2o.performance(model = best_gbm,
newdata = test)
h2o.auc(best_gbm_perf)
## [1] 0.9476657
h2o.confusionMatrix(best_gbm_perf)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.496636246222322:
## N0 Yes Error Rate
## N0 779 32 0.039457 =32/811
## Yes 56 155 0.265403 =56/211
## Totals 835 187 0.086106 =88/1022
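# The tuned GBM from the narrowed search is the strongest RainToday model;
# a sketch of persisting it for later reuse (the path is illustrative).
model_path <- h2o.saveModel(object = best_gbm, path = "models", force = TRUE)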
# Switch the target to RainTomorrow and rerun the same modelling pipeline
y1 <- y2
glm_fit10 <- h2o.glm(x = x,
y = y1,
training_frame = train,
model_id = "glm_fit10",
family = "binomial") #similar to R's glm, h2o.glm has the family argument
# As before, tune the regularization automatically with lambda_search = TRUE and a validation frame.
glm_fit20 <- h2o.glm(x = x,
y = y1,
training_frame = train,
model_id = "glm_fit20",
validation_frame = valid,
family = "binomial",
lambda_search = TRUE)
# Let's compare the performance of the two GLMs
glm_perf10 <- h2o.performance(model = glm_fit10,
newdata = test)
glm_perf20 <- h2o.performance(model = glm_fit20,
newdata = test)
# Print model performance
glm_perf10
## H2OBinomialMetrics: glm
##
## MSE: 0.1359599
## RMSE: 0.3687274
## LogLoss: 0.4285203
## Mean Per-Class Error: 0.26428
## AUC: 0.7990735
## Gini: 0.5981469
## R^2: 0.2228153
## Residual Deviance: 875.8955
## AIC: 961.8955
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## N0 Yes Error Rate
## N0 640 151 0.190898 =151/791
## Yes 78 153 0.337662 =78/231
## Totals 718 304 0.224070 =229/1022
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.246202 0.571963 196
## 2 max f2 0.126803 0.675583 280
## 3 max f0point5 0.410343 0.584795 115
## 4 max accuracy 0.448265 0.817025 103
## 5 max precision 0.998373 1.000000 0
## 6 max recall 0.014665 1.000000 392
## 7 max specificity 0.998373 1.000000 0
## 8 max absolute_mcc 0.309329 0.436817 161
## 9 max min_per_class_accuracy 0.197032 0.722944 230
## 10 max mean_per_class_accuracy 0.246202 0.735720 196
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
glm_perf20
## H2OBinomialMetrics: glm
##
## MSE: 0.1349278
## RMSE: 0.3673252
## LogLoss: 0.4249275
## Mean Per-Class Error: 0.2618473
## AUC: 0.8038157
## Gini: 0.6076313
## R^2: 0.2287153
## Residual Deviance: 868.5518
## AIC: 918.5518
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## N0 Yes Error Rate
## N0 637 154 0.194690 =154/791
## Yes 76 155 0.329004 =76/231
## Totals 713 309 0.225049 =230/1022
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.241526 0.574074 190
## 2 max f2 0.118447 0.680159 283
## 3 max f0point5 0.356828 0.601704 125
## 4 max accuracy 0.358210 0.821918 124
## 5 max precision 0.992446 1.000000 0
## 6 max recall 0.024746 1.000000 388
## 7 max specificity 0.992446 1.000000 0
## 8 max absolute_mcc 0.356828 0.451257 125
## 9 max min_per_class_accuracy 0.191526 0.724399 223
## 10 max mean_per_class_accuracy 0.223121 0.739130 201
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
# Retrieve test set AUC
h2o.auc(glm_perf10)
## [1] 0.7990735
h2o.auc(glm_perf20)
## [1] 0.8038157
# Compare test AUC to the training AUC and validation AUC
h2o.auc(glm_fit10, train = TRUE)
## [1] 0.8101513
h2o.auc(glm_fit20, valid = TRUE)
## [1] 0.7799993
#glm_fit2@model$validation_metrics
h2o.confusionMatrix(glm_perf10)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.246201735705228:
## N0 Yes Error Rate
## N0 640 151 0.190898 =151/791
## Yes 78 153 0.337662 =78/231
## Totals 718 304 0.224070 =229/1022
h2o.confusionMatrix(glm_perf20)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.241526093530235:
## N0 Yes Error Rate
## N0 637 154 0.194690 =154/791
## Yes 76 155 0.329004 =76/231
## Totals 713 309 0.225049 =230/1022
rf_fit10 <- h2o.randomForest(x = x,
y = y1,
training_frame = train,
model_id = "rf_fit10",
seed = 1)
rf_fit20 <- h2o.randomForest(x = x,
y = y1,
training_frame = train,
model_id = "rf_fit20",
validation_frame = valid, #only used if stopping_rounds > 0
ntrees = 100000,
score_tree_interval = 5, #used for early stopping
stopping_rounds = 3, #used for early stopping
stopping_metric = "AUC", #used for early stopping
stopping_tolerance = 0.0005, #used for early stopping
seed = 1)
#Let's compare the performance of the two RFs
rf_perf10 <- h2o.performance(model = rf_fit10,
newdata = test)
rf_perf20 <- h2o.performance(model = rf_fit20,
newdata = test)
# Print model performance
h2o.auc(rf_perf10)
## [1] 0.7925252
h2o.auc(rf_perf20)
## [1] 0.7900871
# Cross-validate with 5 folds
rf_fit30 <- h2o.randomForest(x = x,
y = y1,
training_frame = train,
model_id = "rf_fit30",
seed = 1,
nfolds = 5)
rf_perf30 <- h2o.performance(model = rf_fit30,
newdata = test)
# Print model performance
h2o.auc(rf_perf30)
## [1] 0.7925252
# To evaluate the cross-validated AUC, do the following:
h2o.auc(rf_fit30, xval = TRUE)
## [1] 0.7838314
h2o.confusionMatrix(rf_perf10)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.35:
## N0 Yes Error Rate
## N0 701 90 0.113780 =90/791
## Yes 101 130 0.437229 =101/231
## Totals 802 220 0.186888 =191/1022
h2o.confusionMatrix(rf_perf20)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.328571428571429:
## N0 Yes Error Rate
## N0 683 108 0.136536 =108/791
## Yes 94 137 0.406926 =94/231
## Totals 777 245 0.197652 =202/1022
h2o.confusionMatrix(rf_perf30)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.35:
## N0 Yes Error Rate
## N0 701 90 0.113780 =90/791
## Yes 101 130 0.437229 =101/231
## Totals 802 220 0.186888 =191/1022
gbm_fit10 <- h2o.gbm(x = x,
y = y1,
training_frame = train,
model_id = "gbm_fit10",
seed = 1)
gbm_fit20 <- h2o.gbm(x = x,
y = y1,
training_frame = train,
model_id = "gbm_fit20",
validation_frame = valid, #only used if stopping_rounds > 0
ntrees = 500,
score_tree_interval = 5, #used for early stopping
stopping_rounds = 3, #used for early stopping
stopping_metric = "AUC", #used for early stopping
stopping_tolerance = 0.0005, #used for early stopping
seed = 1)
gbm_fit30 <- h2o.gbm(x = x,
y = y1,
training_frame = train,
model_id = "gbm_fit30",
validation_frame = valid, #only used if stopping_rounds > 0
ntrees = 50000,
score_tree_interval = 5, #used for early stopping
stopping_rounds = 3, #used for early stopping
stopping_metric = "AUC", #used for early stopping
stopping_tolerance = 0.0005, #used for early stopping
seed = 1)
gbm_perf10 <- h2o.performance(model = gbm_fit10,
newdata = test)
gbm_perf20 <- h2o.performance(model = gbm_fit20,
newdata = test)
gbm_perf30 <- h2o.performance(model = gbm_fit30,
newdata = test)
h2o.auc(gbm_perf10)
## [1] 0.7963589
h2o.auc(gbm_perf20)
## [1] 0.7972182
h2o.auc(gbm_perf30)
## [1] 0.7972182
h2o.scoreHistory(gbm_fit20)
## Scoring History:
## timestamp duration number_of_trees training_rmse
## 1 2018-01-28 21:08:02 0.000 sec 0 0.41096
## 2 2018-01-28 21:08:02 0.015 sec 5 0.37158
## 3 2018-01-28 21:08:02 0.046 sec 10 0.35093
## 4 2018-01-28 21:08:02 0.078 sec 15 0.33775
## 5 2018-01-28 21:08:02 0.093 sec 20 0.32816
## 6 2018-01-28 21:08:02 0.124 sec 25 0.32067
## 7 2018-01-28 21:08:02 0.156 sec 30 0.31465
## 8 2018-01-28 21:08:02 0.171 sec 35 0.31121
## 9 2018-01-28 21:08:02 0.203 sec 40 0.30703
## 10 2018-01-28 21:08:02 0.234 sec 45 0.30324
## 11 2018-01-28 21:08:02 0.265 sec 50 0.29859
## 12 2018-01-28 21:08:02 0.296 sec 55 0.29613
## 13 2018-01-28 21:08:02 0.312 sec 60 0.29363
## training_logloss training_auc training_lift
## 1 0.52076 0.50000 1.00000
## 2 0.43782 0.85058 4.36813
## 3 0.39786 0.86818 4.45332
## 4 0.37192 0.88191 4.64694
## 5 0.35322 0.89190 4.64694
## 6 0.33779 0.90331 4.64694
## 7 0.32618 0.91141 4.64694
## 8 0.31933 0.91535 4.64694
## 9 0.31161 0.92052 4.64694
## 10 0.30461 0.92575 4.64694
## 11 0.29689 0.93082 4.64694
## 12 0.29253 0.93309 4.64694
## 13 0.28826 0.93563 4.64694
## training_classification_error validation_rmse validation_logloss
## 1 0.78480 0.42711 0.55150
## 2 0.18284 0.39843 0.48816
## 3 0.17178 0.38800 0.46590
## 4 0.16427 0.38517 0.45884
## 5 0.15508 0.38292 0.45369
## 6 0.15362 0.38299 0.45288
## 7 0.13525 0.38296 0.45279
## 8 0.13066 0.38296 0.45271
## 9 0.12252 0.38309 0.45297
## 10 0.12106 0.38425 0.45511
## 11 0.12711 0.38380 0.45496
## 12 0.11396 0.38363 0.45491
## 13 0.12336 0.38409 0.45626
## validation_auc validation_lift validation_classification_error
## 1 0.50000 1.00000 0.76104
## 2 0.78273 4.18487 0.29819
## 3 0.78540 4.18487 0.26606
## 4 0.78459 3.76639 0.28213
## 5 0.78665 4.18487 0.26205
## 6 0.78797 4.18487 0.26908
## 7 0.78821 4.18487 0.23996
## 8 0.78910 4.18487 0.25803
## 9 0.79017 3.76639 0.23795
## 10 0.78935 3.34790 0.23594
## 11 0.78948 3.76639 0.27610
## 12 0.79032 3.34790 0.24096
## 13 0.78976 3.76639 0.27108
h2o.scoreHistory(gbm_fit30)
## Scoring History:
## timestamp duration number_of_trees training_rmse
## 1 2018-01-28 21:08:03 0.000 sec 0 0.41096
## 2 2018-01-28 21:08:03 0.015 sec 5 0.37158
## 3 2018-01-28 21:08:03 0.049 sec 10 0.35093
## 4 2018-01-28 21:08:03 0.075 sec 15 0.33775
## 5 2018-01-28 21:08:03 0.090 sec 20 0.32816
## 6 2018-01-28 21:08:03 0.122 sec 25 0.32067
## 7 2018-01-28 21:08:03 0.153 sec 30 0.31465
## 8 2018-01-28 21:08:03 0.168 sec 35 0.31121
## 9 2018-01-28 21:08:03 0.200 sec 40 0.30703
## 10 2018-01-28 21:08:03 0.231 sec 45 0.30324
## 11 2018-01-28 21:08:03 0.257 sec 50 0.29859
## 12 2018-01-28 21:08:03 0.274 sec 55 0.29613
## 13 2018-01-28 21:08:03 0.290 sec 60 0.29363
## training_logloss training_auc training_lift
## 1 0.52076 0.50000 1.00000
## 2 0.43782 0.85058 4.36813
## 3 0.39786 0.86818 4.45332
## 4 0.37192 0.88191 4.64694
## 5 0.35322 0.89190 4.64694
## 6 0.33779 0.90331 4.64694
## 7 0.32618 0.91141 4.64694
## 8 0.31933 0.91535 4.64694
## 9 0.31161 0.92052 4.64694
## 10 0.30461 0.92575 4.64694
## 11 0.29689 0.93082 4.64694
## 12 0.29253 0.93309 4.64694
## 13 0.28826 0.93563 4.64694
## training_classification_error validation_rmse validation_logloss
## 1 0.78480 0.42711 0.55150
## 2 0.18284 0.39843 0.48816
## 3 0.17178 0.38800 0.46590
## 4 0.16427 0.38517 0.45884
## 5 0.15508 0.38292 0.45369
## 6 0.15362 0.38299 0.45288
## 7 0.13525 0.38296 0.45279
## 8 0.13066 0.38296 0.45271
## 9 0.12252 0.38309 0.45297
## 10 0.12106 0.38425 0.45511
## 11 0.12711 0.38380 0.45496
## 12 0.11396 0.38363 0.45491
## 13 0.12336 0.38409 0.45626
## validation_auc validation_lift validation_classification_error
## 1 0.50000 1.00000 0.76104
## 2 0.78273 4.18487 0.29819
## 3 0.78540 4.18487 0.26606
## 4 0.78459 3.76639 0.28213
## 5 0.78665 4.18487 0.26205
## 6 0.78797 4.18487 0.26908
## 7 0.78821 4.18487 0.23996
## 8 0.78910 4.18487 0.25803
## 9 0.79017 3.76639 0.23795
## 10 0.78935 3.34790 0.23594
## 11 0.78948 3.76639 0.27610
## 12 0.79032 3.34790 0.24096
## 13 0.78976 3.76639 0.27108
plot(gbm_fit20,
timestep = "number_of_trees",
metric = "AUC")
plot(gbm_fit30,
timestep = "number_of_trees",
metric = "logloss")
gbm_fit40 <- h2o.gbm(x = x,
y = y1,
training_frame = train,
model_id = "gbm_fit40",
ntrees=5,
seed = 1)
gbm_perf40 <- h2o.performance(model = gbm_fit40,
newdata = test)
h2o.auc(gbm_perf40)
## [1] 0.7773026
h2o.confusionMatrix(gbm_perf10)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.303536886664258:
## N0 Yes Error Rate
## N0 688 103 0.130215 =103/791
## Yes 97 134 0.419913 =97/231
## Totals 785 237 0.195695 =200/1022
h2o.confusionMatrix(gbm_perf20)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.279072940794731:
## N0 Yes Error Rate
## N0 671 120 0.151707 =120/791
## Yes 92 139 0.398268 =92/231
## Totals 763 259 0.207436 =212/1022
h2o.confusionMatrix(gbm_perf30)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.279072940794731:
## N0 Yes Error Rate
## N0 671 120 0.151707 =120/791
## Yes 92 139 0.398268 =92/231
## Totals 763 259 0.207436 =212/1022
h2o.confusionMatrix(gbm_perf40)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.210171705078586:
## N0 Yes Error Rate
## N0 600 191 0.241466 =191/791
## Yes 72 159 0.311688 =72/231
## Totals 672 350 0.257339 =263/1022
dl_fit10 <- h2o.deeplearning(x = x,
y = y1,
training_frame = train,
model_id = "dl_fit10",
seed = 1)
dl_fit20 <- h2o.deeplearning(x = x,
y = y1,
training_frame = train,
model_id = "dl_fit20",
#validation_frame = valid, #only used if stopping_rounds > 0
epochs = 20,
hidden= c(10,10),
stopping_rounds = 0, # disable early stopping
seed = 1)
dl_fit30 <- h2o.deeplearning(x = x,
y = y1,
training_frame = train,
model_id = "dl_fit30",
validation_frame = valid, #in DL, early stopping is on by default
epochs = 200,
hidden = c(10,10),
score_interval = 1, #used for early stopping
stopping_rounds = 3, #used for early stopping
stopping_metric = "AUC", #used for early stopping
stopping_tolerance = 0.005, #used for early stopping
seed = 1)
dl_perf10 <- h2o.performance(model = dl_fit10,
newdata = test)
dl_perf20 <- h2o.performance(model = dl_fit20,
newdata = test)
dl_perf30 <- h2o.performance(model = dl_fit30,
newdata = test)
h2o.auc(dl_perf10)
## [1] 0.7993088
h2o.auc(dl_perf20)
## [1] 0.7977928
h2o.auc(dl_perf30)
## [1] 0.7907575
plot(dl_fit30,
timestep = "epochs",
metric = "AUC")
dl_fit40 <- h2o.deeplearning(x = x,
y = y1,
training_frame = train,
model_id = "dl_fit40",
validation_frame = valid, #in DL, early stopping is on by default
epochs = 200,
hidden = c(32,64,32,128,32,64,32),
score_interval = 1, #used for early stopping
stopping_rounds = 3, #used for early stopping
stopping_metric = "AUC", #used for early stopping
stopping_tolerance = 0.0005, #used for early stopping
seed = 1)
dl_perf40 <- h2o.performance(model = dl_fit40,
newdata = test)
h2o.auc(dl_perf40)
## [1] 0.7963151
h2o.confusionMatrix(dl_perf10)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.270812603089717:
## N0 Yes Error Rate
## N0 653 138 0.174463 =138/791
## Yes 85 146 0.367965 =85/231
## Totals 738 284 0.218200 =223/1022
h2o.confusionMatrix(dl_perf20)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.356675378728843:
## N0 Yes Error Rate
## N0 702 89 0.112516 =89/791
## Yes 105 126 0.454545 =105/231
## Totals 807 215 0.189824 =194/1022
h2o.confusionMatrix(dl_perf30)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.403758161696188:
## N0 Yes Error Rate
## N0 693 98 0.123894 =98/791
## Yes 98 133 0.424242 =98/231
## Totals 791 231 0.191781 =196/1022
h2o.confusionMatrix(dl_perf40)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.377268149879096:
## N0 Yes Error Rate
## N0 649 142 0.179520 =142/791
## Yes 80 151 0.346320 =80/231
## Totals 729 293 0.217221 =222/1022
gbm_params10 <- list(learn_rate = c(0.01, 0.1),
max_depth = c(3, 5, 9),
sample_rate = c(0.8, 1.0),
col_sample_rate = c(0.2, 0.5, 1.0))
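# This is a full Cartesian grid: 2 learn rates x 3 depths x 2 sample rates x
# 3 column sample rates = 36 models, matching the model count reported in the
# grid summary below.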
gbm_grid10 <- h2o.grid("gbm", x = x, y = y1,
grid_id = "gbm_grid10",
training_frame = train,
validation_frame = valid,
ntrees = 100,
seed = 1,
hyper_params = gbm_params10)
gbm_gridperf10 <- h2o.getGrid(grid_id = "gbm_grid10",
sort_by = "auc",
decreasing = TRUE)
print(gbm_gridperf10)
## H2O Grid Details
## ================
##
## Grid ID: gbm_grid10
## Used hyper parameters:
## - col_sample_rate
## - learn_rate
## - max_depth
## - sample_rate
## Number of models: 36
## Number of failed models: 0
##
## Hyper-Parameter Search Summary: ordered by decreasing auc
## col_sample_rate learn_rate max_depth sample_rate model_ids
## 1 0.2 0.01 9 0.8 gbm_grid10_model_12
## 2 0.2 0.01 9 1.0 gbm_grid10_model_30
## 3 0.2 0.1 5 1.0 gbm_grid10_model_27
## 4 0.5 0.01 9 1.0 gbm_grid10_model_31
## 5 1.0 0.1 9 0.8 gbm_grid10_model_17
## auc
## 1 0.7956198310458749
## 2 0.793009024190151
## 3 0.7909164985255316
## 4 0.7895445777255494
## 5 0.7890456974346466
##
## ---
## col_sample_rate learn_rate max_depth sample_rate model_ids
## 31 0.5 0.01 3 1.0 gbm_grid10_model_19
## 32 1.0 0.1 9 1.0 gbm_grid10_model_35
## 33 1.0 0.01 3 1.0 gbm_grid10_model_20
## 34 0.5 0.1 9 0.8 gbm_grid10_model_16
## 35 0.2 0.01 3 1.0 gbm_grid10_model_18
## 36 1.0 0.01 9 1.0 gbm_grid10_model_32
## auc
## 31 0.7789295137580098
## 32 0.7788020221281124
## 33 0.7786052415689231
## 34 0.7776019378727745
## 35 0.776806500964502
## 36 0.7628655683909447
gbm_params20 <- list(learn_rate = seq(0.001, 0.1, 0.001),
max_depth = seq(2, 10, 1),
sample_rate = seq(0.3, 1.0, 0.05),
col_sample_rate = seq(0.1, 1.0, 0.05))
search_criteria20 <- list(strategy = "RandomDiscrete",
max_models = 36)
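# "RandomDiscrete" samples hyper-parameter combinations at random rather than
# exhausting the Cartesian space (here 100 x 9 x 15 x 19 = 256,500
# combinations); max_models caps the search at 36 models, the same budget as
# the Cartesian grid above.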
# Train and validate a grid of GBMs
gbm_grid20 <- h2o.grid("gbm", x = x, y = y1,
grid_id = "gbm_grid20",
training_frame = train,
validation_frame = valid,
ntrees = 100,
seed = 1,
hyper_params = gbm_params20,
search_criteria = search_criteria20)
gbm_gridperf20 <- h2o.getGrid(grid_id = "gbm_grid20",
sort_by = "auc",
decreasing = TRUE)
print(gbm_gridperf20)
## H2O Grid Details
## ================
##
## Grid ID: gbm_grid20
## Used hyper parameters:
## - col_sample_rate
## - learn_rate
## - max_depth
## - sample_rate
## Number of models: 36
## Number of failed models: 0
##
## Hyper-Parameter Search Summary: ordered by decreasing auc
## col_sample_rate learn_rate max_depth sample_rate model_ids
## 1 0.4 0.092 8 0.65 gbm_grid20_model_15
## 2 0.25 0.027 7 0.95 gbm_grid20_model_4
## 3 1.0 0.043 6 0.55 gbm_grid20_model_28
## 4 0.25 0.008 7 0.9 gbm_grid20_model_2
## 5 0.45 0.02 10 1.0 gbm_grid20_model_10
## auc
## 1 0.7939125518281192
## 2 0.7936658832398394
## 3 0.7928926187889405
## 4 0.7923909669408661
## 5 0.7922080441675351
##
## ---
## col_sample_rate learn_rate max_depth sample_rate model_ids
## 31 0.1 0.026 2 0.8 gbm_grid20_model_26
## 32 0.75 0.064 7 0.45 gbm_grid20_model_34
## 33 0.95 0.019 8 1.0 gbm_grid20_model_12
## 34 0.35 0.005 3 1.0 gbm_grid20_model_23
## 35 0.65 0.088 9 0.5 gbm_grid20_model_6
## 36 0.8 0.098 6 0.4 gbm_grid20_model_32
## auc
## 31 0.777762688188732
## 32 0.7777322010598434
## 33 0.776640207534201
## 34 0.7765432030331921
## 35 0.7752405711625019
## 36 0.7729429502671782
# Refine the search: narrow the hyper-parameter ranges around the best models found above
gbm_params30 <- list(learn_rate = seq(0.02, 0.05, 0.0001),
max_depth = seq(2, 10, 1),
sample_rate = seq(0.6, 0.9, 0.005),
col_sample_rate = seq(0.5, 0.8, 0.005))
search_criteria30 <- list(strategy = "RandomDiscrete",
max_models = 36)
# Train and validate a grid of GBMs
gbm_grid30 <- h2o.grid("gbm", x = x, y = y1,
grid_id = "gbm_grid30",
training_frame = train,
validation_frame = valid,
ntrees = 1000,
seed = 1,
hyper_params = gbm_params30,
search_criteria = search_criteria30)
gbm_gridperf30 <- h2o.getGrid(grid_id = "gbm_grid30",
sort_by = "auc",
decreasing = TRUE)
print(gbm_gridperf30)
## H2O Grid Details
## ================
##
## Grid ID: gbm_grid30
## Used hyper parameters:
## - col_sample_rate
## - learn_rate
## - max_depth
## - sample_rate
## Number of models: 36
## Number of failed models: 0
##
## Hyper-Parameter Search Summary: ordered by decreasing auc
## col_sample_rate learn_rate max_depth sample_rate model_ids
## 1 0.785 0.0265 3 0.79 gbm_grid30_model_33
## 2 0.665 0.0331 8 0.85 gbm_grid30_model_9
## 3 0.655 0.0274 7 0.75 gbm_grid30_model_19
## 4 0.71 0.0399 9 0.79 gbm_grid30_model_6
## 5 0.555 0.0375 9 0.69 gbm_grid30_model_18
## auc
## 1 0.7850103101926786
## 2 0.7845751757167246
## 3 0.7840458082969336
## 4 0.7834471519478504
## 5 0.7832060264739141
##
## ---
## col_sample_rate learn_rate max_depth sample_rate model_ids
## 31 0.54 0.042 6 0.74 gbm_grid30_model_15
## 32 0.765 0.0493 8 0.6 gbm_grid30_model_11
## 33 0.525 0.0489 4 0.885 gbm_grid30_model_21
## 34 0.54 0.0401 5 0.705 gbm_grid30_model_27
## 35 0.785 0.042 5 0.755 gbm_grid30_model_34
## 36 0.77 0.0487 5 0.815 gbm_grid30_model_3
## auc
## 31 0.7754290370501762
## 32 0.7746613157136205
## 33 0.7731452739407108
## 34 0.771820469612647
## 35 0.7707229329726614
## 36 0.7675190128821977
best_gbm_model_id <- gbm_gridperf30@model_ids[[1]]
best_gbm <- h2o.getModel(best_gbm_model_id)
# Now let's evaluate the model performance on a test set
# so we get an honest estimate of top model performance
best_gbm_perf <- h2o.performance(model = best_gbm,
newdata = test)
h2o.auc(best_gbm_perf)
## [1] 0.7976642
h2o.confusionMatrix(best_gbm_perf)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.255673344156405:
## N0 Yes Error Rate
## N0 666 125 0.158028 =125/791
## Yes 85 146 0.367965 =85/231
## Totals 751 271 0.205479 =210/1022
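# (Optional) persist the winning model for later reuse; a minimal sketch --
# the destination directory below is a placeholder:
# model_path <- h2o.saveModel(object = best_gbm, path = "./models", force = TRUE)
# best_gbm_reloaded <- h2o.loadModel(model_path)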
y1 <- y3  # switch to a different response column and repeat the workflow
glm_fit105 <- h2o.glm(x = x,
y = y1,
training_frame = train,
model_id = "glm_fit105",
family = "binomial") #similar to R's glm, h2o.glm has the family argument
# Next we will do some automatic tuning by passing in a validation frame and
# setting `lambda_search = TRUE`. Since we are training a GLM with
# regularization, we should find the right amount of regularization to avoid
# overfitting. The model parameter `lambda` controls the amount of
# regularization in a GLM; we can find the optimal value of `lambda`
# automatically by setting `lambda_search = TRUE` and passing in a validation
# frame, which is used to evaluate model performance at each value of lambda.
glm_fit205 <- h2o.glm(x = x,
y = y1,
training_frame = train,
model_id = "glm_fit205",
validation_frame = valid,
family = "binomial",
lambda_search = TRUE)
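# With lambda_search = TRUE, the lambda that scored best on the validation
# frame is kept on the model object; a minimal sketch, assuming H2O's standard
# GLM output slot:
# glm_fit205@model$lambda_best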
# Let's compare the performance of the two GLMs
glm_perf105 <- h2o.performance(model = glm_fit105,
newdata = test)
glm_perf205 <- h2o.performance(model = glm_fit205,
newdata = test)
# Print model performance
glm_perf105
## H2OBinomialMetrics: glm
##
## MSE: 0.1554299
## RMSE: 0.394246
## LogLoss: 0.4807781
## Mean Per-Class Error: 0.335744
## AUC: 0.7075338
## Gini: 0.4150676
## R^2: 0.09756708
## Residual Deviance: 982.7104
## AIC: 1074.71
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## N0 Yes Error Rate
## N0 501 295 0.370603 =295/796
## Yes 68 158 0.300885 =68/226
## Totals 569 453 0.355186 =363/1022
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.199328 0.465390 224
## 2 max f2 0.093995 0.613884 335
## 3 max f0point5 0.338541 0.430108 111
## 4 max accuracy 0.644194 0.786693 11
## 5 max precision 0.906639 1.000000 0
## 6 max recall 0.032023 1.000000 394
## 7 max specificity 0.906639 1.000000 0
## 8 max absolute_mcc 0.237015 0.277072 192
## 9 max min_per_class_accuracy 0.208790 0.646985 216
## 10 max mean_per_class_accuracy 0.199328 0.664256 224
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
glm_perf205
## H2OBinomialMetrics: glm
##
## MSE: 0.1551997
## RMSE: 0.3939539
## LogLoss: 0.4795676
## Mean Per-Class Error: 0.3349824
## AUC: 0.7110914
## Gini: 0.4221828
## R^2: 0.09890381
## Residual Deviance: 980.2362
## AIC: 1042.236
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## N0 Yes Error Rate
## N0 548 248 0.311558 =248/796
## Yes 81 145 0.358407 =81/226
## Totals 629 393 0.321918 =329/1022
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.214892 0.468498 206
## 2 max f2 0.113418 0.623522 324
## 3 max f0point5 0.335966 0.429062 111
## 4 max accuracy 0.576065 0.786693 15
## 5 max precision 0.903696 1.000000 0
## 6 max recall 0.035596 1.000000 397
## 7 max specificity 0.903696 1.000000 0
## 8 max absolute_mcc 0.254013 0.284842 172
## 9 max min_per_class_accuracy 0.204259 0.655779 217
## 10 max mean_per_class_accuracy 0.214892 0.665018 206
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
# Instead of printing the entire model-performance object, it is easier to
# print just the metric you are interested in comparing.
# Retrieve test set AUC
h2o.auc(glm_perf105)
## [1] 0.7075338
h2o.auc(glm_perf205)
## [1] 0.7110914
# Compare test AUC to the training AUC and validation AUC
h2o.auc(glm_fit105, train = TRUE)
## [1] 0.7259714
h2o.auc(glm_fit205, valid = TRUE)
## [1] 0.7131017
#glm_fit205@model$validation_metrics
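# Other single-metric accessors work the same way on a performance object,
# e.g. h2o.logloss(glm_perf205) or h2o.mse(glm_perf205).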
h2o.confusionMatrix(glm_perf105)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.19932800273972:
## N0 Yes Error Rate
## N0 501 295 0.370603 =295/796
## Yes 68 158 0.300885 =68/226
## Totals 569 453 0.355186 =363/1022
h2o.confusionMatrix(glm_perf205)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.214891693551094:
## N0 Yes Error Rate
## N0 548 248 0.311558 =248/796
## Yes 81 145 0.358407 =81/226
## Totals 629 393 0.321918 =329/1022
rf_fit105 <- h2o.randomForest(x = x,
y = y1,
training_frame = train,
model_id = "rf_fit105",
seed = 1)
rf_fit205 <- h2o.randomForest(x = x,
y = y1,
training_frame = train,
model_id = "rf_fit205",
validation_frame = valid, #only used if stopping_rounds > 0
ntrees = 100000,
score_tree_interval = 5, #used for early stopping
stopping_rounds = 3, #used for early stopping
stopping_metric = "AUC", #used for early stopping
stopping_tolerance = 0.0005, #used for early stopping
seed = 1)
# Let's compare the performance of the two RFs
rf_perf105 <- h2o.performance(model = rf_fit105,
newdata = test)
rf_perf205 <- h2o.performance(model = rf_fit205,
newdata = test)
# Print model performance
h2o.auc(rf_perf105)
## [1] 0.7107746
h2o.auc(rf_perf205)
## [1] 0.7180287
# Cross-validate with 5 folds
rf_fit305 <- h2o.randomForest(x = x,
y = y1,
training_frame = train,
model_id = "rf_fit305",
seed = 1,
nfolds = 5)
rf_perf305 <- h2o.performance(model = rf_fit305,
newdata = test)
# Print model performance
h2o.auc(rf_perf305)
## [1] 0.7107746
# To evaluate the cross-validated AUC, do the following:
h2o.auc(rf_fit305, xval = TRUE)
## [1] 0.6987228
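# Per-fold results are also stored on the cross-validated model; a minimal
# sketch, assuming H2O's standard cross-validation summary slot:
# rf_fit305@model$cross_validation_metrics_summary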
h2o.confusionMatrix(rf_perf105)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.246666666269302:
## N0 Yes Error Rate
## N0 551 245 0.307789 =245/796
## Yes 82 144 0.362832 =82/226
## Totals 633 389 0.319961 =327/1022
h2o.confusionMatrix(rf_perf205)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.216748768091202:
## N0 Yes Error Rate
## N0 509 287 0.360553 =287/796
## Yes 67 159 0.296460 =67/226
## Totals 576 446 0.346380 =354/1022
h2o.confusionMatrix(rf_perf305)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.246666666269302:
## N0 Yes Error Rate
## N0 551 245 0.307789 =245/796
## Yes 82 144 0.362832 =82/226
## Totals 633 389 0.319961 =327/1022
gbm_fit105 <- h2o.gbm(x = x,
y = y1,
training_frame = train,
model_id = "gbm_fit105",
seed = 1)
gbm_fit205 <- h2o.gbm(x = x,
y = y1,
training_frame = train,
model_id = "gbm_fit205",
validation_frame = valid, #only used if stopping_rounds > 0
ntrees = 500,
score_tree_interval = 5, #used for early stopping
stopping_rounds = 3, #used for early stopping
stopping_metric = "AUC", #used for early stopping
stopping_tolerance = 0.0005, #used for early stopping
seed = 1)
gbm_fit305 <- h2o.gbm(x = x,
y = y1,
training_frame = train,
model_id = "gbm_fit305",
validation_frame = valid, #only used if stopping_rounds > 0
ntrees = 50000,
score_tree_interval = 5, #used for early stopping
stopping_rounds = 3, #used for early stopping
stopping_metric = "AUC", #used for early stopping
stopping_tolerance = 0.0005, #used for early stopping
seed = 1)
gbm_perf105 <- h2o.performance(model = gbm_fit105,
newdata = test)
gbm_perf205 <- h2o.performance(model = gbm_fit205,
newdata = test)
gbm_perf305 <- h2o.performance(model = gbm_fit305,
newdata = test)
h2o.auc(gbm_perf105)
## [1] 0.6992985
h2o.auc(gbm_perf205)
## [1] 0.702278
h2o.auc(gbm_perf305)
## [1] 0.702278
h2o.scoreHistory(gbm_fit205)
## Scoring History:
## timestamp duration number_of_trees training_rmse
## 1 2018-01-28 21:14:30 0.000 sec 0 0.41225
## 2 2018-01-28 21:14:30 0.031 sec 5 0.38988
## 3 2018-01-28 21:14:30 0.062 sec 10 0.37549
## 4 2018-01-28 21:14:30 0.078 sec 15 0.36573
## 5 2018-01-28 21:14:30 0.109 sec 20 0.35862
## 6 2018-01-28 21:14:30 0.140 sec 25 0.35329
## 7 2018-01-28 21:14:30 0.168 sec 30 0.34800
## 8 2018-01-28 21:14:30 0.199 sec 35 0.34195
## 9 2018-01-28 21:14:30 0.215 sec 40 0.33859
## 10 2018-01-28 21:14:30 0.246 sec 45 0.33531
## training_logloss training_auc training_lift
## 1 0.52318 0.50000 1.00000
## 2 0.47292 0.79247 3.62497
## 3 0.44268 0.81789 4.22284
## 4 0.42239 0.83553 4.12686
## 5 0.40750 0.84855 4.31881
## 6 0.39658 0.85866 4.31881
## 7 0.38639 0.86996 4.31881
## 8 0.37493 0.88326 4.60673
## 9 0.36888 0.88902 4.60673
## 10 0.36309 0.89384 4.60673
## training_classification_error validation_rmse validation_logloss
## 1 0.78293 0.42367 0.54481
## 2 0.25631 0.40934 0.51132
## 3 0.26717 0.40521 0.50237
## 4 0.22313 0.40353 0.49790
## 5 0.20643 0.40284 0.49550
## 6 0.18743 0.40306 0.49584
## 7 0.18076 0.40283 0.49501
## 8 0.17074 0.40350 0.49761
## 9 0.16114 0.40454 0.50025
## 10 0.15300 0.40466 0.50008
## validation_auc validation_lift validation_classification_error
## 1 0.50000 1.00000 0.76606
## 2 0.70837 2.95939 0.33936
## 3 0.70214 3.41974 0.34739
## 4 0.70733 3.41974 0.36546
## 5 0.71179 3.41974 0.35442
## 6 0.71274 2.13734 0.36345
## 7 0.71541 2.56481 0.34538
## 8 0.71037 2.56481 0.32329
## 9 0.70734 2.99227 0.32129
## 10 0.70937 2.13734 0.32229
h2o.scoreHistory(gbm_fit305)
## Scoring History:
## timestamp duration number_of_trees training_rmse
## 1 2018-01-28 21:14:32 0.000 sec 0 0.41225
## 2 2018-01-28 21:14:32 0.016 sec 5 0.38988
## 3 2018-01-28 21:14:32 0.060 sec 10 0.37549
## 4 2018-01-28 21:14:32 0.075 sec 15 0.36573
## 5 2018-01-28 21:14:32 0.107 sec 20 0.35862
## 6 2018-01-28 21:14:32 0.138 sec 25 0.35329
## 7 2018-01-28 21:14:32 0.169 sec 30 0.34800
## 8 2018-01-28 21:14:32 0.200 sec 35 0.34195
## 9 2018-01-28 21:14:32 0.216 sec 40 0.33859
## 10 2018-01-28 21:14:32 0.247 sec 45 0.33531
## training_logloss training_auc training_lift
## 1 0.52318 0.50000 1.00000
## 2 0.47292 0.79247 3.62497
## 3 0.44268 0.81789 4.22284
## 4 0.42239 0.83553 4.12686
## 5 0.40750 0.84855 4.31881
## 6 0.39658 0.85866 4.31881
## 7 0.38639 0.86996 4.31881
## 8 0.37493 0.88326 4.60673
## 9 0.36888 0.88902 4.60673
## 10 0.36309 0.89384 4.60673
## training_classification_error validation_rmse validation_logloss
## 1 0.78293 0.42367 0.54481
## 2 0.25631 0.40934 0.51132
## 3 0.26717 0.40521 0.50237
## 4 0.22313 0.40353 0.49790
## 5 0.20643 0.40284 0.49550
## 6 0.18743 0.40306 0.49584
## 7 0.18076 0.40283 0.49501
## 8 0.17074 0.40350 0.49761
## 9 0.16114 0.40454 0.50025
## 10 0.15300 0.40466 0.50008
## validation_auc validation_lift validation_classification_error
## 1 0.50000 1.00000 0.76606
## 2 0.70837 2.95939 0.33936
## 3 0.70214 3.41974 0.34739
## 4 0.70733 3.41974 0.36546
## 5 0.71179 3.41974 0.35442
## 6 0.71274 2.13734 0.36345
## 7 0.71541 2.56481 0.34538
## 8 0.71037 2.56481 0.32329
## 9 0.70734 2.99227 0.32129
## 10 0.70937 2.13734 0.32229
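# As above, gbm_fit205 (ntrees = 500) and gbm_fit305 (ntrees = 50000) produce
# identical scoring histories and identical test AUCs: early stopping halted
# both runs at 45 trees, so ntrees acts only as an upper bound here.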
plot(gbm_fit205,
timestep = "number_of_trees",
metric = "AUC")
plot(gbm_fit305,
timestep = "number_of_trees",
metric = "logloss")
gbm_fit405 <- h2o.gbm(x = x,
y = y1,
training_frame = train,
model_id = "gbm_fit405",
ntrees=5,
seed = 1)
gbm_perf405 <- h2o.performance(model = gbm_fit405,
newdata = test)
h2o.auc(gbm_perf405)
## [1] 0.6973529
h2o.confusionMatrix(gbm_perf105)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.237339910626288:
## N0 Yes Error Rate
## N0 583 213 0.267588 =213/796
## Yes 93 133 0.411504 =93/226
## Totals 676 346 0.299413 =306/1022
h2o.confusionMatrix(gbm_perf205)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.183380894744191:
## N0 Yes Error Rate
## N0 506 290 0.364322 =290/796
## Yes 71 155 0.314159 =71/226
## Totals 577 445 0.353229 =361/1022
h2o.confusionMatrix(gbm_perf305)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.183380894744191:
## N0 Yes Error Rate
## N0 506 290 0.364322 =290/796
## Yes 71 155 0.314159 =71/226
## Totals 577 445 0.353229 =361/1022
h2o.confusionMatrix(gbm_perf405)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.233304352675186:
## N0 Yes Error Rate
## N0 593 203 0.255025 =203/796
## Yes 98 128 0.433628 =98/226
## Totals 691 331 0.294521 =301/1022
dl_fit105 <- h2o.deeplearning(x = x,
y = y1,
training_frame = train,
model_id = "dl_fit105",
seed = 1)
dl_fit205 <- h2o.deeplearning(x = x,
y = y1,
training_frame = train,
model_id = "dl_fit205",
#validation_frame = valid, #only used if stopping_rounds > 0
epochs = 20,
hidden= c(10,10),
stopping_rounds = 0, # disable early stopping
seed = 1)
dl_fit305 <- h2o.deeplearning(x = x,
y = y1,
training_frame = train,
model_id = "dl_fit305",
validation_frame = valid, #in DL, early stopping is on by default
epochs = 200,
hidden = c(10,10),
score_interval = 1, #used for early stopping
stopping_rounds = 3, #used for early stopping
stopping_metric = "AUC", #used for early stopping
stopping_tolerance = 0.005, #used for early stopping
seed = 1)
dl_perf105 <- h2o.performance(model = dl_fit105,
newdata = test)
dl_perf205 <- h2o.performance(model = dl_fit205,
newdata = test)
dl_perf305 <- h2o.performance(model = dl_fit305,
newdata = test)
h2o.auc(dl_perf105)
## [1] 0.7017638
h2o.auc(dl_perf205)
## [1] 0.702253
h2o.auc(dl_perf305)
## [1] 0.705541
plot(dl_fit305,
timestep = "epochs",
metric = "AUC")
dl_fit405 <- h2o.deeplearning(x = x,
y = y1,
training_frame = train,
model_id = "dl_fit405",
validation_frame = valid, #in DL, early stopping is on by default
epochs = 200,
hidden = c(32,64,32,128,32,64,32),
score_interval = 1, #used for early stopping
stopping_rounds = 3, #used for early stopping
stopping_metric = "AUC", #used for early stopping
stopping_tolerance = 0.0005, #used for early stopping
seed = 1)
dl_perf405 <- h2o.performance(model = dl_fit405,
newdata = test)
h2o.auc(dl_perf405)
## [1] 0.7963151
h2o.confusionMatrix(dl_perf105)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.139217562640086:
## N0 Yes Error Rate
## N0 377 419 0.526382 =419/796
## Yes 28 198 0.123894 =28/226
## Totals 405 617 0.437378 =447/1022
h2o.confusionMatrix(dl_perf205)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.24193354973481:
## N0 Yes Error Rate
## N0 496 300 0.376884 =300/796
## Yes 68 158 0.300885 =68/226
## Totals 564 458 0.360078 =368/1022
h2o.confusionMatrix(dl_perf305)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.15720004889004:
## N0 Yes Error Rate
## N0 509 287 0.360553 =287/796
## Yes 70 156 0.309735 =70/226
## Totals 579 443 0.349315 =357/1022
h2o.confusionMatrix(dl_perf405)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.212065363158288:
## N0 Yes Error Rate
## N0 616 180 0.226131 =180/796
## Yes 106 120 0.469027 =106/226
## Totals 722 300 0.279843 =286/1022
gbm_params105 <- list(learn_rate = c(0.01, 0.1),
max_depth = c(3, 5, 9),
sample_rate = c(0.8, 1.0),
col_sample_rate = c(0.2, 0.5, 1.0))
gbm_grid105 <- h2o.grid("gbm", x = x, y = y1,
grid_id = "gbm_grid105",
training_frame = train,
validation_frame = valid,
ntrees = 100,
seed = 1,
hyper_params = gbm_params105)
gbm_gridperf105 <- h2o.getGrid(grid_id = "gbm_grid105",
sort_by = "auc",
decreasing = TRUE)
print(gbm_gridperf105)
## H2O Grid Details
## ================
##
## Grid ID: gbm_grid105
## Used hyper parameters:
## - col_sample_rate
## - learn_rate
## - max_depth
## - sample_rate
## Number of models: 36
## Number of failed models: 0
##
## Hyper-Parameter Search Summary: ordered by decreasing auc
## col_sample_rate learn_rate max_depth sample_rate model_ids
## 1 0.2 0.01 5 1.0 gbm_grid105_model_24
## 2 1.0 0.01 5 0.8 gbm_grid105_model_8
## 3 1.0 0.01 3 0.8 gbm_grid105_model_2
## 4 0.5 0.01 3 0.8 gbm_grid105_model_1
## 5 0.5 0.01 3 1.0 gbm_grid105_model_19
## auc
## 1 0.716080076949471
## 2 0.7155457056232738
## 3 0.7145332125841635
## 4 0.7144572756062303
## 5 0.7143447763796623
##
## ---
## col_sample_rate learn_rate max_depth sample_rate model_ids
## 31 0.5 0.1 5 0.8 gbm_grid105_model_10
## 32 0.2 0.1 9 1.0 gbm_grid105_model_33
## 33 0.2 0.1 9 0.8 gbm_grid105_model_15
## 34 0.5 0.1 9 1.0 gbm_grid105_model_34
## 35 0.5 0.1 9 0.8 gbm_grid105_model_16
## 36 1.0 0.1 9 1.0 gbm_grid105_model_35
## auc
## 31 0.68331186473093
## 32 0.6826059320842168
## 33 0.6742416145888998
## 34 0.6728888113894218
## 35 0.6702703918910558
## 36 0.6691172748187356
gbm_params205 <- list(learn_rate = seq(0.001, 0.1, 0.001),
max_depth = seq(2, 10, 1),
sample_rate = seq(0.3, 1.0, 0.05),
col_sample_rate = seq(0.1, 1.0, 0.05))
search_criteria205 <- list(strategy = "RandomDiscrete",
max_models = 36)
# Train and validate a grid of GBMs
gbm_grid205 <- h2o.grid("gbm", x = x, y = y1,
grid_id = "gbm_grid205",
training_frame = train,
validation_frame = valid,
ntrees = 100,
seed = 1,
hyper_params = gbm_params205,
search_criteria = search_criteria205)
gbm_gridperf205 <- h2o.getGrid(grid_id = "gbm_grid205",
sort_by = "auc",
decreasing = TRUE)
print(gbm_gridperf205)
## H2O Grid Details
## ================
##
## Grid ID: gbm_grid205
## Used hyper parameters:
## - col_sample_rate
## - learn_rate
## - max_depth
## - sample_rate
## Number of models: 36
## Number of failed models: 0
##
## Hyper-Parameter Search Summary: ordered by decreasing auc
## col_sample_rate learn_rate max_depth sample_rate model_ids
## 1 0.65 0.01 6 0.5 gbm_grid205_model_7
## 2 0.4 0.052 3 0.65 gbm_grid205_model_13
## 3 0.35 0.069 9 0.3 gbm_grid205_model_3
## 4 0.3 0.05 3 0.8 gbm_grid205_model_35
## 5 0.7 0.036 5 0.45 gbm_grid205_model_22
## auc
## 1 0.7185184976853284
## 2 0.7173428807676948
## 3 0.7166116357950039
## 4 0.7165047615297644
## 5 0.7153713318220937
##
## ---
## col_sample_rate learn_rate max_depth sample_rate model_ids
## 31 0.8 0.072 10 0.75 gbm_grid205_model_6
## 32 0.25 0.05 8 0.75 gbm_grid205_model_25
## 33 0.9 0.087 6 0.5 gbm_grid205_model_9
## 34 0.65 0.049 9 0.3 gbm_grid205_model_23
## 35 0.8 0.082 7 0.8 gbm_grid205_model_15
## 36 0.1 0.079 10 0.85 gbm_grid205_model_1
## auc
## 31 0.6932792962048386
## 32 0.6931864843429202
## 33 0.6887905770647826
## 34 0.6852468514278963
## 35 0.6816496886583904
## 36 0.6714066340793907
# Refine the search: narrow the hyper-parameter ranges around the best models found above
gbm_params305 <- list(learn_rate = seq(0.02, 0.05, 0.0001),
max_depth = seq(2, 10, 1),
sample_rate = seq(0.6, 0.9, 0.005),
col_sample_rate = seq(0.5, 0.8, 0.005))
search_criteria305 <- list(strategy = "RandomDiscrete",
max_models = 36)
# Train and validate a grid of GBMs
gbm_grid305 <- h2o.grid("gbm", x = x, y = y1,
grid_id = "gbm_grid305",
training_frame = train,
validation_frame = valid,
ntrees = 1000,
seed = 1,
hyper_params = gbm_params305,
search_criteria = search_criteria305)
gbm_gridperf305 <- h2o.getGrid(grid_id = "gbm_grid305",
sort_by = "auc",
decreasing = TRUE)
print(gbm_gridperf305)
## H2O Grid Details
## ================
##
## Grid ID: gbm_grid305
## Used hyper parameters:
## - col_sample_rate
## - learn_rate
## - max_depth
## - sample_rate
## Number of models: 36
## Number of failed models: 0
##
## Hyper-Parameter Search Summary: ordered by decreasing auc
## col_sample_rate learn_rate max_depth sample_rate model_ids
## 1 0.63 0.0255 2 0.815 gbm_grid305_model_3
## 2 0.705 0.0356 2 0.68 gbm_grid305_model_19
## 3 0.705 0.039 2 0.65 gbm_grid305_model_22
## 4 0.63 0.0488 2 0.605 gbm_grid305_model_23
## 5 0.77 0.0356 2 0.6 gbm_grid305_model_20
## auc
## 1 0.7092879361454391
## 2 0.7022736093689356
## 3 0.7019136118439185
## 4 0.6977877027095438
## 5 0.6957514667086665
##
## ---
## col_sample_rate learn_rate max_depth sample_rate model_ids
## 31 0.715 0.0264 7 0.685 gbm_grid305_model_34
## 32 0.535 0.032 5 0.745 gbm_grid305_model_29
## 33 0.52 0.0443 5 0.8 gbm_grid305_model_35
## 34 0.515 0.0327 7 0.715 gbm_grid305_model_18
## 35 0.725 0.0456 6 0.9 gbm_grid305_model_26
## 36 0.7 0.0444 4 0.625 gbm_grid305_model_30
## auc
## 31 0.6719128805989459
## 32 0.6704925778635272
## 33 0.6701213304158534
## 34 0.6698035201007994
## 35 0.6688135269070025
## 36 0.6649604283970548
best_gbm_model_id <- gbm_gridperf305@model_ids[[1]]
best_gbm <- h2o.getModel(best_gbm_model_id)
# Now let's evaluate the model performance on a test set
# so we get an honest estimate of top model performance
best_gbm_perf <- h2o.performance(model = best_gbm,
newdata = test)
h2o.auc(best_gbm_perf)
## [1] 0.7976642
h2o.confusionMatrix(best_gbm_perf)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.255673344156405:
## N0 Yes Error Rate
## N0 666 125 0.158028 =125/791
## Yes 85 146 0.367965 =85/231
## Totals 751 271 0.205479 =210/1022
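# A compact side-by-side view of the test-set AUCs computed above for this
# response; a minimal sketch using only objects created earlier:
# data.frame(model = c("glm_fit205", "rf_fit205", "gbm_fit205", "dl_fit305"),
#            test_auc = c(h2o.auc(glm_perf205), h2o.auc(rf_perf205),
#                         h2o.auc(gbm_perf205), h2o.auc(dl_perf305)))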