suppressPackageStartupMessages(library(knitr))
suppressPackageStartupMessages(library(caret))
suppressPackageStartupMessages(library(gmodels))
suppressPackageStartupMessages(library(lattice))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(gridExtra))
suppressPackageStartupMessages(library(Kmisc)) # provides chunk(), used below to page through plot grids
suppressPackageStartupMessages(library(ROCR))
suppressPackageStartupMessages(library(corrplot))
madrid <- read.csv("madrid.csv")
temp <- madrid[, c(1, 3)]  # keep the date (CET) and Mean.TemperatureC columns
require(forecast)
require(tseries)
# Hold out the last 25% of days for forecast evaluation
xtraintemp <- window(temp$Mean.TemperatureC, end = round(length(temp$CET) * 0.75))
xforetemp <- window(temp$Mean.TemperatureC, start = round(length(temp$CET) * 0.75))
plot(xtraintemp,type="l",main="Madrid mean daily temperature (training set)",ylab="Degrees Celsius",xlab="Day",col="blue")
plot(xforetemp,type="l",main="Madrid mean daily temperature (forecasting set)",ylab="Degrees Celsius",xlab="Day",col="red")
plot(temp$Mean.TemperatureC,type="p",main="Madrid mean daily temperature, all data",ylab="Degrees Celsius",xlab="Day")
lines(y = xtraintemp, x = 1:5109, col = "blue")   # training portion (first 75%)
lines(y = xforetemp, x = 5109:6812, col = "red")  # held-out portion (last 25%)
xt <- xtraintemp
xf <- xforetemp
x <- temp$Mean.TemperatureC
fc_mean <- meanf(xt, h = length(xf))  # benchmark: every horizon forecast as the training-set mean
plot(fc_mean, main = "Arithmetic Mean Method", ylab = "Level", xlab = "Day")
lines(x)
rw2 <- rwf(xt, h = length(xf))  # naïve benchmark: carry the last observation forward
plot(rw2, main = "Naïve or Random Walk Method", ylab = "Temperature", xlab = "Day")
lines(x)
rwd <- rwf(xt, drift = TRUE, h = length(xf))  # random walk with drift
plot(rwd, main = "Random Walk with Drift Method", ylab = "Temperature", xlab = "Day")
lines(x)
# An ARIMA benchmark as well; the original call passed an empty seasonal
# specification (seasonal = c()), which is not a valid order, so auto.arima()
# is used to select the order instead.
ari <- auto.arima(xt)
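# A sketch of carrying the ARIMA benchmark through the same accuracy
# comparison as the other methods (illustrative; not part of the original output).
fr.ari <- forecast(ari, h = length(xf))
kable(accuracy(fr.ari, xf))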
require(knitr)
kable(accuracy(fc_mean, xf))
|              | ME       | RMSE     | MAE      | MPE  | MAPE | MASE     | ACF1      | Theil’s U |
|--------------|----------|----------|----------|------|------|----------|-----------|-----------|
| Training set | 0.000000 | 7.503106 | 6.457223 | -Inf | Inf  | 4.222349 | 0.9625232 | NA        |
| Test set     | 1.258595 | 7.828909 | 6.799465 | -Inf | Inf  | 4.446139 | 0.9659514 | NaN       |
kable(accuracy(rw2,xf))
|              | ME         | RMSE     | MAE      | MPE  | MAPE | MASE     | ACF1       | Theil’s U |
|--------------|------------|----------|----------|------|------|----------|------------|-----------|
| Training set | 0.0019596  | 2.048501 | 1.529297 | NaN  | Inf  | 1.000000 | -0.0982006 | NA        |
| Test set     | -1.3973005 | 7.852402 | 6.812793 | -Inf | Inf  | 4.454854 | 0.9659514  | NaN       |
kable(accuracy(rwd,xf))
|              | ME        | RMSE     | MAE      | MPE  | MAPE | MASE     | ACF1       | Theil’s U |
|--------------|-----------|----------|----------|------|------|----------|------------|-----------|
| Training set | 0.000000  | 2.048500 | 1.529737 | NaN  | Inf  | 1.000288 | -0.0982006 | NA        |
| Test set     | -3.067886 | 8.371454 | 7.161636 | -Inf | Inf  | 4.682961 | 0.9663510  | NaN       |
# Simple moving averages with 2-, 5-, 30- and 120-day windows
sma2 <- ma(xt, 2)
sma5 <- ma(xt, 5)
sma30 <- ma(xt, 30)
sma120 <- ma(xt, 120)
plot(x,main="Simple Moving Average SMA2",ylab="Level",xlab="Day",col="grey")
lines(sma2,col="blue")
plot(x,main="Simple Moving Average SMA5",ylab="Level",xlab="Day",col="grey")
lines(sma5,col="blue")
plot(x,main="Simple Moving Average SMA30",ylab="Level",xlab="Day",col="grey")
lines(sma30,col="blue")
plot(x,main="Simple Moving Average SMA120",ylab="Level",xlab="Day",col="grey")
lines(sma120,col="blue")
plot(forecast(sma2, h = length(xf), robust = TRUE), main = "Forecast Using Simple Moving Average SMA2", ylab = "Level", xlab = "Day", col = 4)
lines(x, col = 3)
plot(forecast(sma5, h = length(xf), robust = TRUE), main = "Forecast Using Simple Moving Average SMA5", ylab = "Level", xlab = "Day", col = 4)
lines(x, col = 3)
plot(forecast(sma30, h = length(xf), robust = TRUE), main = "Forecast Using Simple Moving Average SMA30", ylab = "Level", xlab = "Day", col = 4)
lines(x, col = 3)
plot(forecast(sma120, h = length(xf), robust = TRUE), main = "Forecast Using Simple Moving Average SMA120", ylab = "Level", xlab = "Day", col = 4)
lines(x, col = 3)
# Extract the 80% prediction-interval bounds (first column of upper/lower)
# and compare them with the actual values over the forecast horizon
f <- forecast(sma2, h = length(xf), robust = TRUE)
f.up <- f$upper[, 1]
f.low <- f$lower[, 1]
x.plot <- x[5109:6812]
plot(y = x.plot, type = "l", x = 1:length(x.plot), ylim = c(min(f.low), max(f.up)), col = "red", main = "80% Confidence Interval for SMA2 Forecast vs Actual Values")
lines(y = f.low, x = 1:length(x.plot), col = "blue")
lines(y = f.up, x = 1:length(x.plot), col = "blue")
f <- forecast(sma5, h = length(xf), robust = TRUE)
f.up <- f$upper[, 1]
f.low <- f$lower[, 1]
x.plot <- x[5109:6812]
plot(y = x.plot, type = "l", x = 1:length(x.plot), ylim = c(min(f.low), max(f.up)), col = "red", main = "80% Confidence Interval for SMA5 Forecast vs Actual Values")
lines(y = f.low, x = 1:length(x.plot), col = "blue")
lines(y = f.up, x = 1:length(x.plot), col = "blue")
f <- forecast(sma30, h = length(xf), robust = TRUE)
f.up <- f$upper[, 1]
f.low <- f$lower[, 1]
x.plot <- x[5109:6812]
plot(y = x.plot, type = "l", x = 1:length(x.plot), ylim = c(min(f.low), max(f.up)), col = "red", main = "80% Confidence Interval for SMA30 Forecast vs Actual Values")
lines(y = f.low, x = 1:length(x.plot), col = "blue")
lines(y = f.up, x = 1:length(x.plot), col = "blue")
f <- forecast(sma120, h = length(xf), robust = TRUE)
f.up <- f$upper[, 1]
f.low <- f$lower[, 1]
x.plot <- x[5109:6812]
plot(y = x.plot, type = "l", x = 1:length(x.plot), ylim = c(min(f.low), max(f.up)), col = "red", main = "80% Confidence Interval for SMA120 Forecast vs Actual Values")
lines(y = f.low, x = 1:length(x.plot), col = "blue")
lines(y = f.up, x = 1:length(x.plot), col = "blue")
# Exponential smoothing state-space model (ETS); the model form is selected automatically
fit.ets <- ets(xt)
fr.ets <- forecast(fit.ets, h = length(xf))
plot(fr.ets)
lines(x)
require(ggplot2)
temp_ma <- ts(na.omit(temp$Mean.TemperatureC), frequency = 30)  # treat ~30 days as one seasonal cycle
decomp <- stl(temp_ma, s.window = "periodic")
plot(decomp)
par(mfrow = c(2, 1))
plot(decomp$time.series[, 2])  # trend component
plot(decomp$time.series[, 3])  # remainder component
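# With tseries loaded, an augmented Dickey-Fuller test is a quick companion
# check on the stationarity of the series (a sketch only).
adf.test(temp_ma, alternative = "stationary")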
(cols_withNa <- apply(madrid, 2, function(x) sum(is.na(x))))
## CET Max.TemperatureC
## 0 2
## Mean.TemperatureC Min.TemperatureC
## 3 2
## Dew.PointC MeanDew.PointC
## 2 2
## Min.DewpointC Max.Humidity
## 2 2
## Mean.Humidity Min.Humidity
## 2 2
## Max.Sea.Level.PressurehPa Mean.Sea.Level.PressurehPa
## 0 0
## Min.Sea.Level.PressurehPa Max.VisibilityKm
## 0 940
## Mean.VisibilityKm Min.VisibilitykM
## 940 940
## Max.Wind.SpeedKm.h Mean.Wind.SpeedKm.h
## 0 0
## Max.Gust.SpeedKm.h Precipitationmm
## 3306 0
## CloudCover Events
## 1372 0
## WindDirDegrees
## 0
require(dplyr)
require(tidyr)
# Drop the columns dominated by missing values, then keep complete rows only
madrid <- madrid %>% select(-c(Max.VisibilityKm, Mean.VisibilityKm, Min.VisibilitykM, Max.Gust.SpeedKm.h, CloudCover))
madrid.f <- madrid[complete.cases(madrid),]
dat <- madrid.f
par(mfrow=c(1,1))
dat$CET <- as.Date(dat$CET)
names(dat)[1] <- "Date"
require(stringr)
Rain <- ifelse(str_detect(dat$Events, "Rain"), 1, 0)  # did it rain today?
# Flag whether it rains tomorrow: day i gets 1 when day i + 1 has rain
RainTomorrow <- numeric(length(Rain))
for (i in 1:(length(Rain) - 1)) {
  if (Rain[i + 1] == 1) {
    RainTomorrow[i] <- 1
  }
}
dat$RainTomorrow <- RainTomorrow
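# Equivalently, without a loop: tomorrow's flag is today's rain indicator
# shifted back one position (dplyr is already loaded; default = 0 matches
# the loop's treatment of the final day).
dat$RainTomorrow <- dplyr::lead(Rain, default = 0)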
require(corrplot)
matrixdat <- as.matrix(dat[, -c(1, 17)])           # drop the Date and Events columns
matrixdat[, -c(17)] <- scale(matrixdat[, -c(17)])  # scale everything except the RainTomorrow indicator
datcor <- cor(matrixdat)
corrplot(datcor)
md <- as.data.frame(matrixdat)
factor_vars <- names(which(sapply(dat, class) == "factor"))
numeric_vars <- setdiff(colnames(dat), factor_vars)
numeric_vars <- setdiff(numeric_vars, c("RainTomorrow","Date"))
numeric_vars
## [1] "Max.TemperatureC" "Mean.TemperatureC"
## [3] "Min.TemperatureC" "Dew.PointC"
## [5] "MeanDew.PointC" "Min.DewpointC"
## [7] "Max.Humidity" "Mean.Humidity"
## [9] "Min.Humidity" "Max.Sea.Level.PressurehPa"
## [11] "Mean.Sea.Level.PressurehPa" "Min.Sea.Level.PressurehPa"
## [13] "Max.Wind.SpeedKm.h" "Mean.Wind.SpeedKm.h"
## [15] "Precipitationmm" "WindDirDegrees"
numeric_vars_mat <- as.matrix(dat[, numeric_vars, drop=FALSE])
numeric_vars_cor <- cor(numeric_vars_mat)
corrplot(numeric_vars_cor)
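# The plot shows heavy collinearity among the temperature, dew-point and
# humidity variables. caret's findCorrelation() (caret is loaded above)
# flags candidates for removal -- a diagnostic sketch, not a step the
# original analysis takes; 0.9 is an arbitrary cutoff.
high_cor <- findCorrelation(numeric_vars_cor, cutoff = 0.9)
numeric_vars[high_cor]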
dat$RainTomorrow <- ifelse(dat$RainTomorrow==1,"Yes","N0")
gp <- invisible(lapply(numeric_vars, function(x) {
  ggplot(data = dat, aes_string(x = "RainTomorrow", y = x, col = "RainTomorrow")) +
    geom_boxplot() + xlab("RainTomorrow") + ylab(x) + ggtitle("") +
    theme(legend.position = "none")}))
grob_plots <- invisible(lapply(chunk(1, length(gp), 4), function(x) {
marrangeGrob(grobs=lapply(gp[x], ggplotGrob), nrow=2, ncol=2)}))
grob_plots
## $chunk1
##
## $chunk2
##
## $chunk3
##
## $chunk4
gp <- invisible(lapply(numeric_vars, function(x) {
  ggplot(data = dat, aes_string(x = x, col = "RainTomorrow")) + geom_density() +
    xlab(x) + ggtitle(paste(x, "density", sep = " "))}))
grob_plots <- invisible(lapply(chunk(1, length(gp), 4), function(x) {
marrangeGrob(grobs=lapply(gp[x], ggplotGrob), nrow=2, ncol=2)}))
grob_plots
## $chunk1
##
## $chunk2
##
## $chunk3
##
## $chunk4
dat$RainToday <- Rain
dat$RainToday <- ifelse(dat$RainToday==1,"Yes","N0")
gp <- invisible(lapply(numeric_vars, function(x) {
  ggplot(data = dat, aes_string(x = "RainToday", y = x, col = "RainToday")) +
    geom_boxplot() + xlab("RainToday") + ylab(x) + ggtitle("") +
    theme(legend.position = "none")}))
grob_plots <- invisible(lapply(chunk(1, length(gp), 4), function(x) {
marrangeGrob(grobs=lapply(gp[x], ggplotGrob), nrow=2, ncol=2)}))
grob_plots
## $chunk1
##
## $chunk2
##
## $chunk3
##
## $chunk4
gp <- invisible(lapply(numeric_vars, function(x) {
  ggplot(data = dat, aes_string(x = x, col = "RainToday")) + geom_density() +
    xlab(x) + ggtitle(paste(x, "density", sep = " "))}))
grob_plots <- invisible(lapply(chunk(1, length(gp), 4), function(x) {
marrangeGrob(grobs=lapply(gp[x], ggplotGrob), nrow=2, ncol=2)}))
grob_plots
## $chunk1
##
## $chunk2
##
## $chunk3
##
## $chunk4
# Flag whether it rains in two days: day i gets 1 when day i + 2 has rain
Rain_in2days <- numeric(length(dat$RainToday))
for (i in 1:(length(dat$RainToday) - 2)) {
  Rain_in2days[i] <- ifelse(dat$RainToday[i + 2] == "Yes", 1, 0)
}
dat$Rain_in2days <- Rain_in2days
dat$Rain_in2days <- ifelse(dat$Rain_in2days==1,"Yes","N0")
gp <- invisible(lapply(numeric_vars, function(x) {
  ggplot(data = dat, aes_string(x = "Rain_in2days", y = x, col = "Rain_in2days")) +
    geom_boxplot() + xlab("Rain in 2 days") + ylab(x) + ggtitle("") +
    theme(legend.position = "none")}))
grob_plots <- invisible(lapply(chunk(1, length(gp), 4), function(x) {
marrangeGrob(grobs=lapply(gp[x], ggplotGrob), nrow=2, ncol=2)}))
grob_plots
## $chunk1
##
## $chunk2
##
## $chunk3
##
## $chunk4
gp <- invisible(lapply(numeric_vars, function(x) {
  ggplot(data = dat, aes_string(x = x, col = "Rain_in2days")) + geom_density() +
    xlab(x) + ggtitle(paste(x, "density", sep = " "))}))
grob_plots <- invisible(lapply(chunk(1, length(gp), 4), function(x) {
marrangeGrob(grobs=lapply(gp[x], ggplotGrob), nrow=2, ncol=2)}))
grob_plots
## $chunk1
##
## $chunk2
##
## $chunk3
##
## $chunk4
require(kohonen)
somdat <- dat[numeric_vars]
somdat <- scale(as.matrix(somdat))
# keep the 'mean' variant of each weather variable, plus precipitation
somdat <- somdat[, c(2, 5, 8, 11, 14, 15)]
som_grid <- somgrid(xdim = 5, ydim=5, topo="hexagonal")
som_model <- som(somdat,
grid=som_grid,
rlen=100,
alpha=c(0.05,0.01),
keep.data = TRUE)
plot(som_model, type="changes")
plot(som_model, type="count")
plot(som_model, type="dist.neighbours")
plot(som_model, type="codes")
dat$Month <- format(as.Date(dat$Date), "%m")
dat$Year <- format(as.Date(dat$Date), "%Y")
library(h2o)
h2o.init(nthreads = -1, #Number of threads -1 means use all cores on your machine
max_mem_size = "4G") #max mem size is the maximum memory to allocate to H2O
##
## H2O is not running yet, starting it now...
##
## Note: In case of errors look at the following log files:
## C:\Users\GERHAR~1\AppData\Local\Temp\RtmpqG2RrY/h2o_Gerhard_Viljoen_started_from_r.out
## C:\Users\GERHAR~1\AppData\Local\Temp\RtmpqG2RrY/h2o_Gerhard_Viljoen_started_from_r.err
##
##
## Starting H2O JVM and connecting: . Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 3 seconds 540 milliseconds
## H2O cluster version: 3.16.0.2
## H2O cluster version age: 1 month and 29 days
## H2O cluster name: H2O_started_from_R_Gerhard_Viljoen_pbn723
## H2O cluster total nodes: 1
## H2O cluster total memory: 3.56 GB
## H2O cluster total cores: 4
## H2O cluster allowed cores: 4
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## H2O API Extensions: Algos, AutoML, Core V3, Core V4
## R Version: R version 3.4.3 (2017-11-30)
#h2o.no_progress()
dat$Events <- as.factor(dat$Events)
dat$RainToday <- as.factor(dat$RainToday)
dat$RainTomorrow <- as.factor(dat$RainTomorrow)
dat$Rain_in2days <- as.factor(dat$Rain_in2days)
dat$Year <- as.factor(dat$Year)
dat$Month <- as.factor(dat$Month)
dat <- dat %>% select(-c(Events,Precipitationmm,Date))
dat.hex <- as.h2o(dat)
#dat.hex[,-c(18:20)] <- scale(dat.hex[,-c(18:20)])
# Partition the data into training, validation and test sets
splits <- h2o.splitFrame(data = dat.hex,
ratios = c(0.7, 0.15), #partition data into 70%, 15%, 15% chunks
seed = 1) #setting a seed will guarantee reproducibility
train <- splits[[1]]
valid <- splits[[2]]
test <- splits[[3]]
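# Sanity-check the split: h2o.splitFrame uses approximate ratios, so the
# partition sizes will be close to (not exactly) 70/15/15.
sapply(list(train = train, valid = valid, test = test), h2o.nrow)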
y1 <- "RainToday"
y2 <- "RainTomorrow"
y3 <- "Rain_in2days"
x <- setdiff(names(dat.hex), c(y1,y2,y3))
print(x)
## [1] "Max.TemperatureC" "Mean.TemperatureC"
## [3] "Min.TemperatureC" "Dew.PointC"
## [5] "MeanDew.PointC" "Min.DewpointC"
## [7] "Max.Humidity" "Mean.Humidity"
## [9] "Min.Humidity" "Max.Sea.Level.PressurehPa"
## [11] "Mean.Sea.Level.PressurehPa" "Min.Sea.Level.PressurehPa"
## [13] "Max.Wind.SpeedKm.h" "Mean.Wind.SpeedKm.h"
## [15] "WindDirDegrees" "Month"
## [17] "Year"
glm_fit1 <- h2o.glm(x = x,
y = y1,
training_frame = train,
model_id = "glm_fit1",
family = "binomial") #similar to R's glm, h2o.glm has the family argument
# Next we will do some automatic tuning by passing in a validation frame and setting
# `lambda_search = TRUE`. Since we are training a GLM with regularization, we should
# try to find the right amount of regularization (to avoid overfitting). The model
# parameter, `lambda`, controls the amount of regularization in a GLM model and we can
# find the optimal value for `lambda` automatically by setting `lambda_search = TRUE`
# and passing in a validation frame (which is used to evaluate model performance using a
# particular value of lambda).
glm_fit2 <- h2o.glm(x = x,
y = y1,
training_frame = train,
model_id = "glm_fit2",
validation_frame = valid,
family = "binomial",
lambda_search = TRUE)
# Let's compare the performance of the two GLMs
glm_perf1 <- h2o.performance(model = glm_fit1,
newdata = test)
glm_perf2 <- h2o.performance(model = glm_fit2,
newdata = test)
# Print model performance
glm_perf1
## H2OBinomialMetrics: glm
##
## MSE: 0.06893514
## RMSE: 0.262555
## LogLoss: 0.2314325
## Mean Per-Class Error: 0.1571695
## AUC: 0.9437533
## Gini: 0.8875065
## R^2: 0.5792355
## Residual Deviance: 473.0479
## AIC: 561.0479
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## N0 Yes Error Rate
## N0 779 32 0.039457 =32/811
## Yes 58 153 0.274882 =58/211
## Totals 837 185 0.088063 =90/1022
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.508712 0.772727 144
## 2 max f2 0.179051 0.824698 240
## 3 max f0point5 0.617765 0.810489 118
## 4 max accuracy 0.508712 0.911937 144
## 5 max precision 0.999272 1.000000 0
## 6 max recall 0.001937 1.000000 392
## 7 max specificity 0.999272 1.000000 0
## 8 max absolute_mcc 0.508712 0.720796 144
## 9 max min_per_class_accuracy 0.211979 0.865598 226
## 10 max mean_per_class_accuracy 0.179051 0.876774 240
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
glm_perf2
## H2OBinomialMetrics: glm
##
## MSE: 0.0690964
## RMSE: 0.2628619
## LogLoss: 0.2316171
## Mean Per-Class Error: 0.1548962
## AUC: 0.9438117
## Gini: 0.8876234
## R^2: 0.5782512
## Residual Deviance: 473.4253
## AIC: 563.4253
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## N0 Yes Error Rate
## N0 775 36 0.044390 =36/811
## Yes 56 155 0.265403 =56/211
## Totals 831 191 0.090020 =92/1022
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.477549 0.771144 152
## 2 max f2 0.161318 0.826162 247
## 3 max f0point5 0.656427 0.814095 119
## 4 max accuracy 0.507839 0.910959 147
## 5 max precision 0.999323 1.000000 0
## 6 max recall 0.001606 1.000000 392
## 7 max specificity 0.999323 1.000000 0
## 8 max absolute_mcc 0.507839 0.717992 147
## 9 max min_per_class_accuracy 0.218971 0.872038 221
## 10 max mean_per_class_accuracy 0.161318 0.877294 247
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
# Instead of printing the entire model performance metrics object,
# it is probably easier to print just the metric that you are interested in comparing.
# Retrieve test set AUC
h2o.auc(glm_perf1)
## [1] 0.9437533
h2o.auc(glm_perf2)
## [1] 0.9438117
# Compare test AUC to the training AUC and validation AUC
h2o.auc(glm_fit2, train = TRUE)
## [1] 0.9443575
h2o.auc(glm_fit2, valid = TRUE)
## [1] 0.950664
#glm_fit2@model$validation_metrics
h2o.confusionMatrix(glm_perf1)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.508711960103498:
## N0 Yes Error Rate
## N0 779 32 0.039457 =32/811
## Yes 58 153 0.274882 =58/211
## Totals 837 185 0.088063 =90/1022
h2o.confusionMatrix(glm_perf2)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.477548815356572:
## N0 Yes Error Rate
## N0 775 36 0.044390 =36/811
## Yes 56 155 0.265403 =56/211
## Totals 831 191 0.090020 =92/1022
rf_fit1 <- h2o.randomForest(x = x,
y = y1,
training_frame = train,
model_id = "rf_fit1",
seed = 1)
rf_fit2 <- h2o.randomForest(x = x,
y = y1,
training_frame = train,
model_id = "rf_fit2",
validation_frame = valid, #only used if stopping_rounds > 0
ntrees = 100000,
score_tree_interval = 5, #used for early stopping
stopping_rounds = 3, #used for early stopping
stopping_metric = "AUC", #used for early stopping
stopping_tolerance = 0.0005, #used for early stopping
seed = 1)
#Let's compare the performance of the two RFs
rf_perf1 <- h2o.performance(model = rf_fit1,
newdata = test)
rf_perf2 <- h2o.performance(model = rf_fit2,
newdata = test)
# Print model performance
h2o.auc(rf_perf1)
## [1] 0.9336464
h2o.auc(rf_perf2)
## [1] 0.9369803
# Cross-validate with 5 folds
rf_fit3 <- h2o.randomForest(x = x,
y = y1,
training_frame = train,
model_id = "rf_fit3",
seed = 1,
nfolds = 5)
# To evaluate the cross-validated AUC, do the following:
h2o.auc(rf_fit3, xval = TRUE)
## [1] 0.9245813
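# Per-fold detail is also stored on the model object (slot layout as in
# h2o 3.16; shown here as a sketch).
rf_fit3@model$cross_validation_metrics_summary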
h2o.confusionMatrix(rf_perf1)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.42:
## N0 Yes Error Rate
## N0 764 47 0.057953 =47/811
## Yes 53 158 0.251185 =53/211
## Totals 817 205 0.097847 =100/1022
h2o.confusionMatrix(rf_perf2)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.4125:
## N0 Yes Error Rate
## N0 764 47 0.057953 =47/811
## Yes 55 156 0.260664 =55/211
## Totals 819 203 0.099804 =102/1022
gbm_fit1 <- h2o.gbm(x = x,
y = y1,
training_frame = train,
model_id = "gbm_fit1",
seed = 1)
gbm_fit2 <- h2o.gbm(x = x,
y = y1,
training_frame = train,
model_id = "gbm_fit2",
validation_frame = valid, #only used if stopping_rounds > 0
ntrees = 500,
score_tree_interval = 5, #used for early stopping
stopping_rounds = 3, #used for early stopping
stopping_metric = "AUC", #used for early stopping
stopping_tolerance = 0.0005, #used for early stopping
seed = 1)
gbm_fit3 <- h2o.gbm(x = x,
y = y1,
training_frame = train,
model_id = "gbm_fit3",
validation_frame = valid, #only used if stopping_rounds > 0
ntrees = 50000,
score_tree_interval = 5, #used for early stopping
stopping_rounds = 3, #used for early stopping
stopping_metric = "AUC", #used for early stopping
stopping_tolerance = 0.0005, #used for early stopping
seed = 1)
gbm_perf1 <- h2o.performance(model = gbm_fit1,
newdata = test)
gbm_perf2 <- h2o.performance(model = gbm_fit2,
newdata = test)
gbm_perf3 <- h2o.performance(model = gbm_fit3,
newdata = test)
h2o.auc(gbm_perf1)
## [1] 0.9456116
h2o.auc(gbm_perf2)
## [1] 0.946944
h2o.auc(gbm_perf3)
## [1] 0.946944
h2o.scoreHistory(gbm_fit2)
## Scoring History:
## timestamp duration number_of_trees training_rmse
## 1 2018-01-28 21:01:19 0.000 sec 0 0.41424
## 2 2018-01-28 21:01:19 0.032 sec 5 0.33477
## 3 2018-01-28 21:01:19 0.063 sec 10 0.29373
## 4 2018-01-28 21:01:19 0.107 sec 15 0.26949
## 5 2018-01-28 21:01:19 0.153 sec 20 0.25432
## training_logloss training_auc training_lift
## 1 0.52690 0.50000 1.00000
## 2 0.37441 0.94039 4.54554
## 3 0.30527 0.95214 4.54554
## 4 0.26394 0.95846 4.54554
## 5 0.23658 0.96453 4.54554
## training_classification_error validation_rmse validation_logloss
## 1 0.78000 0.42489 0.54709
## 2 0.10081 0.35010 0.39967
## 3 0.08913 0.31656 0.34011
## 4 0.08328 0.30122 0.31031
## 5 0.07765 0.29461 0.29490
## validation_auc validation_lift validation_classification_error
## 1 0.50000 1.00000 0.76406
## 2 0.91773 4.23830 0.11345
## 3 0.92434 4.23830 0.11546
## 4 0.92637 4.23830 0.10542
## 5 0.92795 4.23830 0.10643
##
## ---
## timestamp duration number_of_trees training_rmse
## 23 2018-01-28 21:01:19 0.775 sec 110 0.15847
## 24 2018-01-28 21:01:19 0.822 sec 115 0.15490
## 25 2018-01-28 21:01:19 0.853 sec 120 0.15165
## 26 2018-01-28 21:01:19 0.885 sec 125 0.14784
## 27 2018-01-28 21:01:19 0.916 sec 130 0.14553
## 28 2018-01-28 21:01:20 0.947 sec 135 0.14212
## training_logloss training_auc training_lift
## 23 0.10434 0.99553 4.54554
## 24 0.10088 0.99605 4.54554
## 25 0.09789 0.99654 4.54554
## 26 0.09458 0.99699 4.54554
## 27 0.09248 0.99721 4.54554
## 28 0.08946 0.99750 4.54554
## training_classification_error validation_rmse validation_logloss
## 23 0.02588 0.27654 0.25339
## 24 0.02379 0.27586 0.25214
## 25 0.02254 0.27565 0.25178
## 26 0.02108 0.27558 0.25205
## 27 0.01941 0.27584 0.25245
## 28 0.01774 0.27580 0.25253
## validation_auc validation_lift validation_classification_error
## 23 0.94168 4.23830 0.10843
## 24 0.94262 4.23830 0.10743
## 25 0.94275 4.23830 0.10643
## 26 0.94252 4.23830 0.10643
## 27 0.94240 4.23830 0.10743
## 28 0.94250 4.23830 0.10643
h2o.scoreHistory(gbm_fit3)
## Scoring History:
## timestamp duration number_of_trees training_rmse
## 1 2018-01-28 21:01:20 0.000 sec 0 0.41424
## 2 2018-01-28 21:01:20 0.047 sec 5 0.33477
## 3 2018-01-28 21:01:20 0.078 sec 10 0.29373
## 4 2018-01-28 21:01:20 0.125 sec 15 0.26949
## 5 2018-01-28 21:01:20 0.156 sec 20 0.25432
## training_logloss training_auc training_lift
## 1 0.52690 0.50000 1.00000
## 2 0.37441 0.94039 4.54554
## 3 0.30527 0.95214 4.54554
## 4 0.26394 0.95846 4.54554
## 5 0.23658 0.96453 4.54554
## training_classification_error validation_rmse validation_logloss
## 1 0.78000 0.42489 0.54709
## 2 0.10081 0.35010 0.39967
## 3 0.08913 0.31656 0.34011
## 4 0.08328 0.30122 0.31031
## 5 0.07765 0.29461 0.29490
## validation_auc validation_lift validation_classification_error
## 1 0.50000 1.00000 0.76406
## 2 0.91773 4.23830 0.11345
## 3 0.92434 4.23830 0.11546
## 4 0.92637 4.23830 0.10542
## 5 0.92795 4.23830 0.10643
##
## ---
## timestamp duration number_of_trees training_rmse
## 23 2018-01-28 21:01:21 0.737 sec 110 0.15847
## 24 2018-01-28 21:01:21 0.762 sec 115 0.15490
## 25 2018-01-28 21:01:21 0.794 sec 120 0.15165
## 26 2018-01-28 21:01:21 0.825 sec 125 0.14784
## 27 2018-01-28 21:01:21 0.856 sec 130 0.14553
## 28 2018-01-28 21:01:21 0.888 sec 135 0.14212
## training_logloss training_auc training_lift
## 23 0.10434 0.99553 4.54554
## 24 0.10088 0.99605 4.54554
## 25 0.09789 0.99654 4.54554
## 26 0.09458 0.99699 4.54554
## 27 0.09248 0.99721 4.54554
## 28 0.08946 0.99750 4.54554
## training_classification_error validation_rmse validation_logloss
## 23 0.02588 0.27654 0.25339
## 24 0.02379 0.27586 0.25214
## 25 0.02254 0.27565 0.25178
## 26 0.02108 0.27558 0.25205
## 27 0.01941 0.27584 0.25245
## 28 0.01774 0.27580 0.25253
## validation_auc validation_lift validation_classification_error
## 23 0.94168 4.23830 0.10843
## 24 0.94262 4.23830 0.10743
## 25 0.94275 4.23830 0.10643
## 26 0.94252 4.23830 0.10643
## 27 0.94240 4.23830 0.10743
## 28 0.94250 4.23830 0.10643
plot(gbm_fit3,
timestep = "number_of_trees",
metric = "AUC")
plot(gbm_fit3,
timestep = "number_of_trees",
metric = "logloss")
gbm_fit4 <- h2o.gbm(x = x,
y = y1,
training_frame = train,
model_id = "gbm_fit4",
ntrees = 5,
seed = 1)
gbm_perf4 <- h2o.performance(model = gbm_fit4,
newdata = test)
h2o.auc(gbm_perf4)
## [1] 0.922321
h2o.confusionMatrix(gbm_perf1)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.387829013222046:
## N0 Yes Error Rate
## N0 766 45 0.055487 =45/811
## Yes 49 162 0.232227 =49/211
## Totals 815 207 0.091977 =94/1022
h2o.confusionMatrix(gbm_perf2)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.360968775857713:
## N0 Yes Error Rate
## N0 762 49 0.060419 =49/811
## Yes 46 165 0.218009 =46/211
## Totals 808 214 0.092955 =95/1022
h2o.confusionMatrix(gbm_perf3)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.360968775857713:
## N0 Yes Error Rate
## N0 762 49 0.060419 =49/811
## Yes 46 165 0.218009 =46/211
## Totals 808 214 0.092955 =95/1022
h2o.confusionMatrix(gbm_perf4)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.250636786717922:
## N0 Yes Error Rate
## N0 741 70 0.086313 =70/811
## Yes 50 161 0.236967 =50/211
## Totals 791 231 0.117417 =120/1022
dl_fit1 <- h2o.deeplearning(x = x,
y = y1,
training_frame = train,
model_id = "dl_fit1",
seed = 1)
dl_fit2 <- h2o.deeplearning(x = x,
y = y1,
training_frame = train,
model_id = "dl_fit2",
#validation_frame = valid, #only used if stopping_rounds > 0
epochs = 20,
hidden= c(10,10),
stopping_rounds = 0, # disable early stopping
seed = 1)
dl_fit3 <- h2o.deeplearning(x = x,
y = y1,
training_frame = train,
model_id = "dl_fit3",
validation_frame = valid, #in DL, early stopping is on by default
epochs = 200,
hidden = c(10,10),
score_interval = 1, #used for early stopping
stopping_rounds = 3, #used for early stopping
stopping_metric = "AUC", #used for early stopping
stopping_tolerance = 0.005, #used for early stopping
seed = 1)
dl_perf1 <- h2o.performance(model = dl_fit1,
newdata = test)
dl_perf2 <- h2o.performance(model = dl_fit2,
newdata = test)
dl_perf3 <- h2o.performance(model = dl_fit3,
newdata = test)
h2o.auc(dl_perf1)
## [1] 0.9449045
h2o.auc(dl_perf2)
## [1] 0.9446298
h2o.auc(dl_perf3)
## [1] 0.9404866
plot(dl_fit3,
timestep = "epochs",
metric = "AUC")
dl_fit4 <- h2o.deeplearning(x = x,
y = y1,
training_frame = train,
model_id = "dl_fit4",
validation_frame = valid, #in DL, early stopping is on by default
epochs = 200,
hidden = c(32,64,32,128,32,64,32),
score_interval = 1, #used for early stopping
stopping_rounds = 3, #used for early stopping
stopping_metric = "AUC", #used for early stopping
stopping_tolerance = 0.0005, #used for early stopping
seed = 1)
dl_perf4 <- h2o.performance(model = dl_fit4,
newdata = test)
h2o.auc(dl_perf4)
## [1] 0.9396889
h2o.confusionMatrix(dl_perf1)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.487012442242802:
## N0 Yes Error Rate
## N0 766 45 0.055487 =45/811
## Yes 47 164 0.222749 =47/211
## Totals 813 209 0.090020 =92/1022
h2o.confusionMatrix(dl_perf2)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.44685584564985:
## N0 Yes Error Rate
## N0 765 46 0.056720 =46/811
## Yes 48 163 0.227488 =48/211
## Totals 813 209 0.091977 =94/1022
h2o.confusionMatrix(dl_perf3)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.464838685057443:
## N0 Yes Error Rate
## N0 774 37 0.045623 =37/811
## Yes 50 161 0.236967 =50/211
## Totals 824 198 0.085127 =87/1022
h2o.confusionMatrix(dl_perf4)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.338893274470836:
## N0 Yes Error Rate
## N0 730 81 0.099877 =81/811
## Yes 31 180 0.146919 =31/211
## Totals 761 261 0.109589 =112/1022
gbm_params11 <- list(learn_rate = c(0.01, 0.1),
max_depth = c(3, 5, 9),
sample_rate = c(0.8, 1.0),
col_sample_rate = c(0.2, 0.5, 1.0))
gbm_grid11 <- h2o.grid("gbm", x = x, y = y1,
grid_id = "gbm_grid11",
training_frame = train,
validation_frame = valid,
ntrees = 100,
seed = 1,
hyper_params = gbm_params11)
gbm_gridperf11 <- h2o.getGrid(grid_id = "gbm_grid11",
sort_by = "auc",
decreasing = TRUE)
print(gbm_gridperf11)
## H2O Grid Details
## ================
##
## Grid ID: gbm_grid11
## Used hyper parameters:
## - col_sample_rate
## - learn_rate
## - max_depth
## - sample_rate
## Number of models: 36
## Number of failed models: 0
##
## Hyper-Parameter Search Summary: ordered by decreasing auc
## col_sample_rate learn_rate max_depth sample_rate model_ids
## 1 0.5 0.1 5 1.0 gbm_grid11_model_28
## 2 1.0 0.1 5 0.8 gbm_grid11_model_11
## 3 0.2 0.1 5 1.0 gbm_grid11_model_27
## 4 1.0 0.1 3 1.0 gbm_grid11_model_23
## 5 1.0 0.1 9 0.8 gbm_grid11_model_17
## auc
## 1 0.9448122571085079
## 2 0.9439399446417088
## 3 0.943707887158554
## 4 0.9422736041602594
## 5 0.9420946682696341
##
## ---
## col_sample_rate learn_rate max_depth sample_rate model_ids
## 31 1.0 0.01 3 0.8 gbm_grid11_model_2
## 32 1.0 0.01 3 1.0 gbm_grid11_model_20
## 33 0.5 0.01 3 0.8 gbm_grid11_model_1
## 34 0.5 0.01 3 1.0 gbm_grid11_model_19
## 35 0.2 0.01 3 0.8 gbm_grid11_model_0
## 36 0.2 0.01 3 1.0 gbm_grid11_model_18
## auc
## 31 0.916940196270305
## 32 0.9167696479995527
## 33 0.9148684541616574
## 34 0.9134425587832359
## 35 0.9127324069673163
## 36 0.9125422875835267
gbm_params21 <- list(learn_rate = seq(0.001, 0.1, 0.001),
max_depth = seq(2, 10, 1),
sample_rate = seq(0.3, 1.0, 0.05),
col_sample_rate = seq(0.1, 1.0, 0.05))
search_criteria21 <- list(strategy = "RandomDiscrete",
max_models = 36)
# Train and validate a grid of GBMs
gbm_grid21 <- h2o.grid("gbm", x = x, y = y1,
grid_id = "gbm_grid21",
training_frame = train,
validation_frame = valid,
ntrees = 100,
seed = 1,
hyper_params = gbm_params21,
search_criteria = search_criteria21)
gbm_gridperf21 <- h2o.getGrid(grid_id = "gbm_grid21",
sort_by = "auc",
decreasing = TRUE)
print(gbm_gridperf21)
## H2O Grid Details
## ================
##
## Grid ID: gbm_grid21
## Used hyper parameters:
## - col_sample_rate
## - learn_rate
## - max_depth
## - sample_rate
## Number of models: 36
## Number of failed models: 0
##
## Hyper-Parameter Search Summary: ordered by decreasing auc
## col_sample_rate learn_rate max_depth sample_rate model_ids
## 1 0.65 0.059 9 0.5 gbm_grid21_model_22
## 2 0.4 0.067 7 0.75 gbm_grid21_model_34
## 3 0.5 0.052 8 0.6 gbm_grid21_model_19
## 4 0.4 0.062 6 0.9 gbm_grid21_model_2
## 5 0.7 0.052 4 0.8 gbm_grid21_model_21
## auc
## 1 0.9422344619341851
## 2 0.9416724914026896
## 3 0.941339782481058
## 4 0.9406212430452652
## 5 0.9401599239522465
##
## ---
## col_sample_rate learn_rate max_depth sample_rate model_ids
## 31 0.4 0.079 2 0.45 gbm_grid21_model_18
## 32 0.85 0.012 6 0.3 gbm_grid21_model_32
## 33 0.7 0.011 4 0.95 gbm_grid21_model_8
## 34 0.9 0.01 4 0.55 gbm_grid21_model_6
## 35 0.4 0.042 2 0.5 gbm_grid21_model_16
## 36 0.15 0.008 2 0.65 gbm_grid21_model_11
## auc
## 31 0.9301031677244388
## 32 0.9257220342774066
## 33 0.9240165515698828
## 34 0.9237201890010345
## 35 0.9235440489837
## 36 0.8947465540861689
# Narrow the hyperparameter ranges around the best settings found by the previous random search
gbm_params31 <- list(learn_rate = seq(0.02, 0.05, 0.0001),
max_depth = seq(2, 10, 1),
sample_rate = seq(0.6, 0.9, 0.005),
col_sample_rate = seq(0.5, 0.8, 0.005))
search_criteria31 <- list(strategy = "RandomDiscrete",
max_models = 36)
# Train and validate a grid of GBMs
gbm_grid31<- h2o.grid("gbm", x = x, y = y1,
grid_id = "gbm_grid31",
training_frame = train,
validation_frame = valid,
ntrees = 1000,
seed = 1,
hyper_params = gbm_params31,
search_criteria = search_criteria31)
gbm_gridperf31 <- h2o.getGrid(grid_id = "gbm_grid31",
sort_by = "auc",
decreasing = TRUE)
print(gbm_gridperf31)
## H2O Grid Details
## ================
##
## Grid ID: gbm_grid31
## Used hyper parameters:
## - col_sample_rate
## - learn_rate
## - max_depth
## - sample_rate
## Number of models: 36
## Number of failed models: 0
##
## Hyper-Parameter Search Summary: ordered by decreasing auc
## col_sample_rate learn_rate max_depth sample_rate model_ids
## 1 0.585 0.0436 3 0.785 gbm_grid31_model_5
## 2 0.64 0.0404 3 0.775 gbm_grid31_model_20
## 3 0.53 0.0499 2 0.61 gbm_grid31_model_29
## 4 0.53 0.0345 3 0.69 gbm_grid31_model_32
## 5 0.625 0.0367 2 0.78 gbm_grid31_model_0
## auc
## 1 0.9524198283333799
## 2 0.950801017697878
## 3 0.9500852741353761
## 4 0.9498755836385494
## 5 0.9495037324908435
##
## ---
## col_sample_rate learn_rate max_depth sample_rate model_ids
## 31 0.625 0.0365 10 0.685 gbm_grid31_model_13
## 32 0.615 0.036 9 0.685 gbm_grid31_model_31
## 33 0.715 0.0352 10 0.89 gbm_grid31_model_6
## 34 0.69 0.0353 9 0.64 gbm_grid31_model_10
## 35 0.5 0.0285 8 0.785 gbm_grid31_model_21
## 36 0.645 0.0372 9 0.85 gbm_grid31_model_22
## auc
## 31 0.942597925462018
## 32 0.9424497441775939
## 33 0.9421953197081108
## 34 0.9421869320882377
## 35 0.9419408952386278
## 36 0.9415187183716834
best_gbm_model_id <- gbm_gridperf31@model_ids[[1]]
best_gbm <- h2o.getModel(best_gbm_model_id)
# Now let's evaluate the model performance on a test set
# so we get an honest estimate of top model performance
best_gbm_perf <- h2o.performance(model = best_gbm,
newdata = test)
h2o.auc(best_gbm_perf)
## [1] 0.9476657
h2o.confusionMatrix(best_gbm_perf)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.496636246222322:
## N0 Yes Error Rate
## N0 779 32 0.039457 =32/811
## Yes 56 155 0.265403 =56/211
## Totals 835 187 0.086106 =88/1022
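# The tuned GBM from the narrowed search is the strongest RainToday model;
# a sketch of persisting it for later reuse (the path is illustrative).
model_path <- h2o.saveModel(object = best_gbm, path = "models", force = TRUE)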
# Switch the target to RainTomorrow and rerun the same modelling pipeline
y1 <- y2
glm_fit10 <- h2o.glm(x = x,
y = y1,
training_frame = train,
model_id = "glm_fit10",
family = "binomial") #similar to R's glm, h2o.glm has the family argument
# As before, tune the regularization automatically with lambda_search = TRUE and a validation frame.
glm_fit20 <- h2o.glm(x = x,
y = y1,
training_frame = train,
model_id = "glm_fit20",
validation_frame = valid,
family = "binomial",
lambda_search = TRUE)
# Let's compare the performance of the two GLMs
glm_perf10 <- h2o.performance(model = glm_fit10,
newdata = test)
glm_perf20 <- h2o.performance(model = glm_fit20,
newdata = test)
# Print model performance
glm_perf10
## H2OBinomialMetrics: glm
##
## MSE: 0.1359599
## RMSE: 0.3687274
## LogLoss: 0.4285203
## Mean Per-Class Error: 0.26428
## AUC: 0.7990735
## Gini: 0.5981469
## R^2: 0.2228153
## Residual Deviance: 875.8955
## AIC: 961.8955
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## N0 Yes Error Rate
## N0 640 151 0.190898 =151/791
## Yes 78 153 0.337662 =78/231
## Totals 718 304 0.224070 =229/1022
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.246202 0.571963 196
## 2 max f2 0.126803 0.675583 280
## 3 max f0point5 0.410343 0.584795 115
## 4 max accuracy 0.448265 0.817025 103
## 5 max precision 0.998373 1.000000 0
## 6 max recall 0.014665 1.000000 392
## 7 max specificity 0.998373 1.000000 0
## 8 max absolute_mcc 0.309329 0.436817 161
## 9 max min_per_class_accuracy 0.197032 0.722944 230
## 10 max mean_per_class_accuracy 0.246202 0.735720 196
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
glm_perf20
## H2OBinomialMetrics: glm
##
## MSE: 0.1349278
## RMSE: 0.3673252
## LogLoss: 0.4249275
## Mean Per-Class Error: 0.2618473
## AUC: 0.8038157
## Gini: 0.6076313
## R^2: 0.2287153
## Residual Deviance: 868.5518
## AIC: 918.5518
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## N0 Yes Error Rate
## N0 637 154 0.194690 =154/791
## Yes 76 155 0.329004 =76/231
## Totals 713 309 0.225049 =230/1022
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.241526 0.574074 190
## 2 max f2 0.118447 0.680159 283
## 3 max f0point5 0.356828 0.601704 125
## 4 max accuracy 0.358210 0.821918 124
## 5 max precision 0.992446 1.000000 0
## 6 max recall 0.024746 1.000000 388
## 7 max specificity 0.992446 1.000000 0
## 8 max absolute_mcc 0.356828 0.451257 125
## 9 max min_per_class_accuracy 0.191526 0.724399 223
## 10 max mean_per_class_accuracy 0.223121 0.739130 201
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
# Retrieve test set AUC
h2o.auc(glm_perf10)
## [1] 0.7990735
h2o.auc(glm_perf20)
## [1] 0.8038157
# Compare test AUC to the training AUC and validation AUC
h2o.auc(glm_fit10, train = TRUE)
## [1] 0.8101513
h2o.auc(glm_fit20, valid = TRUE)
## [1] 0.7799993
#glm_fit2@model$validation_metrics
h2o.confusionMatrix(glm_perf10)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.246201735705228:
## N0 Yes Error Rate
## N0 640 151 0.190898 =151/791
## Yes 78 153 0.337662 =78/231
## Totals 718 304 0.224070 =229/1022
h2o.confusionMatrix(glm_perf20)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.241526093530235:
## N0 Yes Error Rate
## N0 637 154 0.194690 =154/791
## Yes 76 155 0.329004 =76/231
## Totals 713 309 0.225049 =230/1022
rf_fit10 <- h2o.randomForest(x = x,
y = y1,
training_frame = train,
model_id = "rf_fit10",
seed = 1)
rf_fit20 <- h2o.randomForest(x = x,
y = y1,
training_frame = train,
model_id = "rf_fit20",
validation_frame = valid, #only used if stopping_rounds > 0
ntrees = 100000,
score_tree_interval = 5, #used for early stopping
stopping_rounds = 3, #used for early stopping
stopping_metric = "AUC", #used for early stopping
stopping_tolerance = 0.0005, #used for early stopping
seed = 1)
#Let's compare the performance of the two RFs
rf_perf10 <- h2o.performance(model = rf_fit10,
newdata = test)
rf_perf20 <- h2o.performance(model = rf_fit20,
newdata = test)
# Print model performance
h2o.auc(rf_perf10)
## [1] 0.7925252
h2o.auc(rf_perf20)
## [1] 0.7900871
# Cross-validate with 5 folds
rf_fit30 <- h2o.randomForest(x = x,
y = y1,
training_frame = train,
model_id = "rf_fit30",
seed = 1,
nfolds = 5)
rf_perf30 <- h2o.performance(model = rf_fit30,
newdata = test)
# Print model performance
h2o.auc(rf_perf30)
## [1] 0.7925252
# To evaluate the cross-validated AUC, do the following:
h2o.auc(rf_fit30, xval = TRUE)
## [1] 0.7838314
h2o.confusionMatrix(rf_perf10)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.35:
## N0 Yes Error Rate
## N0 701 90 0.113780 =90/791
## Yes 101 130 0.437229 =101/231
## Totals 802 220 0.186888 =191/1022
h2o.confusionMatrix(rf_perf20)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.328571428571429:
## N0 Yes Error Rate
## N0 683 108 0.136536 =108/791
## Yes 94 137 0.406926 =94/231
## Totals 777 245 0.197652 =202/1022
h2o.confusionMatrix(rf_perf30)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.35:
## N0 Yes Error Rate
## N0 701 90 0.113780 =90/791
## Yes 101 130 0.437229 =101/231
## Totals 802 220 0.186888 =191/1022
gbm_fit10 <- h2o.gbm(x = x,
y = y1,
training_frame = train,
model_id = "gbm_fit10",
seed = 1)
gbm_fit20 <- h2o.gbm(x = x,
y = y1,
training_frame = train,
model_id = "gbm_fit20",
validation_frame = valid, #only used if stopping_rounds > 0
ntrees = 500,
score_tree_interval = 5, #used for early stopping
stopping_rounds = 3, #used for early stopping
stopping_metric = "AUC", #used for early stopping
stopping_tolerance = 0.0005, #used for early stopping
seed = 1)
gbm_fit30 <- h2o.gbm(x = x,
y = y1,
training_frame = train,
model_id = "gbm_fit30",
validation_frame = valid, #only used if stopping_rounds > 0
ntrees = 50000,
score_tree_interval = 5, #used for early stopping
stopping_rounds = 3, #used for early stopping
stopping_metric = "AUC", #used for early stopping
stopping_tolerance = 0.0005, #used for early stopping
seed = 1)
gbm_perf10 <- h2o.performance(model = gbm_fit10,
newdata = test)
gbm_perf20 <- h2o.performance(model = gbm_fit20,
newdata = test)
gbm_perf30 <- h2o.performance(model = gbm_fit30,
newdata = test)
h2o.auc(gbm_perf10)
## [1] 0.7963589
h2o.auc(gbm_perf20)
## [1] 0.7972182
h2o.auc(gbm_perf30)
## [1] 0.7972182
h2o.scoreHistory(gbm_fit20)
## Scoring History:
## timestamp duration number_of_trees training_rmse
## 1 2018-01-28 21:08:02 0.000 sec 0 0.41096
## 2 2018-01-28 21:08:02 0.015 sec 5 0.37158
## 3 2018-01-28 21:08:02 0.046 sec 10 0.35093
## 4 2018-01-28 21:08:02 0.078 sec 15 0.33775
## 5 2018-01-28 21:08:02 0.093 sec 20 0.32816
## 6 2018-01-28 21:08:02 0.124 sec 25 0.32067
## 7 2018-01-28 21:08:02 0.156 sec 30 0.31465
## 8 2018-01-28 21:08:02 0.171 sec 35 0.31121
## 9 2018-01-28 21:08:02 0.203 sec 40 0.30703
## 10 2018-01-28 21:08:02 0.234 sec 45 0.30324
## 11 2018-01-28 21:08:02 0.265 sec 50 0.29859
## 12 2018-01-28 21:08:02 0.296 sec 55 0.29613
## 13 2018-01-28 21:08:02 0.312 sec 60 0.29363
## training_logloss training_auc training_lift
## 1 0.52076 0.50000 1.00000
## 2 0.43782 0.85058 4.36813
## 3 0.39786 0.86818 4.45332
## 4 0.37192 0.88191 4.64694
## 5 0.35322 0.89190 4.64694
## 6 0.33779 0.90331 4.64694
## 7 0.32618 0.91141 4.64694
## 8 0.31933 0.91535 4.64694
## 9 0.31161 0.92052 4.64694
## 10 0.30461 0.92575 4.64694
## 11 0.29689 0.93082 4.64694
## 12 0.29253 0.93309 4.64694
## 13 0.28826 0.93563 4.64694
## training_classification_error validation_rmse validation_logloss
## 1 0.78480 0.42711 0.55150
## 2 0.18284 0.39843 0.48816
## 3 0.17178 0.38800 0.46590
## 4 0.16427 0.38517 0.45884
## 5 0.15508 0.38292 0.45369
## 6 0.15362 0.38299 0.45288
## 7 0.13525 0.38296 0.45279
## 8 0.13066 0.38296 0.45271
## 9 0.12252 0.38309 0.45297
## 10 0.12106 0.38425 0.45511
## 11 0.12711 0.38380 0.45496
## 12 0.11396 0.38363 0.45491
## 13 0.12336 0.38409 0.45626
## validation_auc validation_lift validation_classification_error
## 1 0.50000 1.00000 0.76104
## 2 0.78273 4.18487 0.29819
## 3 0.78540 4.18487 0.26606
## 4 0.78459 3.76639 0.28213
## 5 0.78665 4.18487 0.26205
## 6 0.78797 4.18487 0.26908
## 7 0.78821 4.18487 0.23996
## 8 0.78910 4.18487 0.25803
## 9 0.79017 3.76639 0.23795
## 10 0.78935 3.34790 0.23594
## 11 0.78948 3.76639 0.27610
## 12 0.79032 3.34790 0.24096
## 13 0.78976 3.76639 0.27108
h2o.scoreHistory(gbm_fit30)
## Scoring History:
## timestamp duration number_of_trees training_rmse
## 1 2018-01-28 21:08:03 0.000 sec 0 0.41096
## 2 2018-01-28 21:08:03 0.015 sec 5 0.37158
## 3 2018-01-28 21:08:03 0.049 sec 10 0.35093
## 4 2018-01-28 21:08:03 0.075 sec 15 0.33775
## 5 2018-01-28 21:08:03 0.090 sec 20 0.32816
## 6 2018-01-28 21:08:03 0.122 sec 25 0.32067
## 7 2018-01-28 21:08:03 0.153 sec 30 0.31465
## 8 2018-01-28 21:08:03 0.168 sec 35 0.31121
## 9 2018-01-28 21:08:03 0.200 sec 40 0.30703
## 10 2018-01-28 21:08:03 0.231 sec 45 0.30324
## 11 2018-01-28 21:08:03 0.257 sec 50 0.29859
## 12 2018-01-28 21:08:03 0.274 sec 55 0.29613
## 13 2018-01-28 21:08:03 0.290 sec 60 0.29363
## training_logloss training_auc training_lift
## 1 0.52076 0.50000 1.00000
## 2 0.43782 0.85058 4.36813
## 3 0.39786 0.86818 4.45332
## 4 0.37192 0.88191 4.64694
## 5 0.35322 0.89190 4.64694
## 6 0.33779 0.90331 4.64694
## 7 0.32618 0.91141 4.64694
## 8 0.31933 0.91535 4.64694
## 9 0.31161 0.92052 4.64694
## 10 0.30461 0.92575 4.64694
## 11 0.29689 0.93082 4.64694
## 12 0.29253 0.93309 4.64694
## 13 0.28826 0.93563 4.64694
## training_classification_error validation_rmse validation_logloss
## 1 0.78480 0.42711 0.55150
## 2 0.18284 0.39843 0.48816
## 3 0.17178 0.38800 0.46590
## 4 0.16427 0.38517 0.45884
## 5 0.15508 0.38292 0.45369
## 6 0.15362 0.38299 0.45288
## 7 0.13525 0.38296 0.45279
## 8 0.13066 0.38296 0.45271
## 9 0.12252 0.38309 0.45297
## 10 0.12106 0.38425 0.45511
## 11 0.12711 0.38380 0.45496
## 12 0.11396 0.38363 0.45491
## 13 0.12336 0.38409 0.45626
## validation_auc validation_lift validation_classification_error
## 1 0.50000 1.00000 0.76104
## 2 0.78273 4.18487 0.29819
## 3 0.78540 4.18487 0.26606
## 4 0.78459 3.76639 0.28213
## 5 0.78665 4.18487 0.26205
## 6 0.78797 4.18487 0.26908
## 7 0.78821 4.18487 0.23996
## 8 0.78910 4.18487 0.25803
## 9 0.79017 3.76639 0.23795
## 10 0.78935 3.34790 0.23594
## 11 0.78948 3.76639 0.27610
## 12 0.79032 3.34790 0.24096
## 13 0.78976 3.76639 0.27108
plot(gbm_fit20,
timestep = "number_of_trees",
metric = "AUC")
plot(gbm_fit30,
timestep = "number_of_trees",
metric = "logloss")
gbm_fit40 <- h2o.gbm(x = x,
y = y1,
training_frame = train,
model_id = "gbm_fit40",
ntrees=5,
seed = 1)
gbm_perf40 <- h2o.performance(model = gbm_fit40,
newdata = test)
h2o.auc(gbm_perf40)
## [1] 0.7773026
h2o.confusionMatrix(gbm_perf10)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.303536886664258:
## N0 Yes Error Rate
## N0 688 103 0.130215 =103/791
## Yes 97 134 0.419913 =97/231
## Totals 785 237 0.195695 =200/1022
h2o.confusionMatrix(gbm_perf20)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.279072940794731:
## N0 Yes Error Rate
## N0 671 120 0.151707 =120/791
## Yes 92 139 0.398268 =92/231
## Totals 763 259 0.207436 =212/1022
h2o.confusionMatrix(gbm_perf30)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.279072940794731:
## N0 Yes Error Rate
## N0 671 120 0.151707 =120/791
## Yes 92 139 0.398268 =92/231
## Totals 763 259 0.207436 =212/1022
h2o.confusionMatrix(gbm_perf40)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.210171705078586:
## N0 Yes Error Rate
## N0 600 191 0.241466 =191/791
## Yes 72 159 0.311688 =72/231
## Totals 672 350 0.257339 =263/1022
dl_fit10 <- h2o.deeplearning(x = x,
y = y1,
training_frame = train,
model_id = "dl_fit10",
seed = 1)
dl_fit20 <- h2o.deeplearning(x = x,
y = y1,
training_frame = train,
model_id = "dl_fit20",
#validation_frame = valid, #only used if stopping_rounds > 0
epochs = 20,
hidden= c(10,10),
stopping_rounds = 0, # disable early stopping
seed = 1)
dl_fit30 <- h2o.deeplearning(x = x,
y = y1,
training_frame = train,
model_id = "dl_fit30",
validation_frame = valid, #in DL, early stopping is on by default
epochs = 200,
hidden = c(10,10),
score_interval = 1, #used for early stopping
stopping_rounds = 3, #used for early stopping
stopping_metric = "AUC", #used for early stopping
stopping_tolerance = 0.005, #used for early stopping
seed = 1)
dl_perf10 <- h2o.performance(model = dl_fit10,
newdata = test)
dl_perf20 <- h2o.performance(model = dl_fit20,
newdata = test)
dl_perf30 <- h2o.performance(model = dl_fit30,
newdata = test)
h2o.auc(dl_perf10)
## [1] 0.7993088
h2o.auc(dl_perf20)
## [1] 0.7977928
h2o.auc(dl_perf30)
## [1] 0.7907575
plot(dl_fit30,
timestep = "epochs",
metric = "AUC")
dl_fit40 <- h2o.deeplearning(x = x,
y = y1,
training_frame = train,
model_id = "dl_fit40",
validation_frame = valid, #in DL, early stopping is on by default
epochs = 200,
hidden = c(32,64,32,128,32,64,32),
score_interval = 1, #used for early stopping
stopping_rounds = 3, #used for early stopping
stopping_metric = "AUC", #used for early stopping
stopping_tolerance = 0.0005, #used for early stopping
seed = 1)
dl_perf40 <- h2o.performance(model = dl_fit40,
newdata = test)
h2o.auc(dl_perf40)
## [1] 0.7963151
h2o.confusionMatrix(dl_perf10)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.270812603089717:
## N0 Yes Error Rate
## N0 653 138 0.174463 =138/791
## Yes 85 146 0.367965 =85/231
## Totals 738 284 0.218200 =223/1022
h2o.confusionMatrix(dl_perf20)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.356675378728843:
## N0 Yes Error Rate
## N0 702 89 0.112516 =89/791
## Yes 105 126 0.454545 =105/231
## Totals 807 215 0.189824 =194/1022
h2o.confusionMatrix(dl_perf30)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.403758161696188:
## N0 Yes Error Rate
## N0 693 98 0.123894 =98/791
## Yes 98 133 0.424242 =98/231
## Totals 791 231 0.191781 =196/1022
h2o.confusionMatrix(dl_perf40)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.377268149879096:
## N0 Yes Error Rate
## N0 649 142 0.179520 =142/791
## Yes 80 151 0.346320 =80/231
## Totals 729 293 0.217221 =222/1022
gbm_params10 <- list(learn_rate = c(0.01, 0.1),
max_depth = c(3, 5, 9),
sample_rate = c(0.8, 1.0),
col_sample_rate = c(0.2, 0.5, 1.0))
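# This is a full Cartesian grid: 2 learn rates x 3 depths x 2 sample rates x
# 3 column sample rates = 36 models, matching the model count reported in the
# grid summary below.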
gbm_grid10 <- h2o.grid("gbm", x = x, y = y1,
grid_id = "gbm_grid10",
training_frame = train,
validation_frame = valid,
ntrees = 100,
seed = 1,
hyper_params = gbm_params10)
gbm_gridperf10 <- h2o.getGrid(grid_id = "gbm_grid10",
sort_by = "auc",
decreasing = TRUE)
print(gbm_gridperf10)
## H2O Grid Details
## ================
##
## Grid ID: gbm_grid10
## Used hyper parameters:
## - col_sample_rate
## - learn_rate
## - max_depth
## - sample_rate
## Number of models: 36
## Number of failed models: 0
##
## Hyper-Parameter Search Summary: ordered by decreasing auc
## col_sample_rate learn_rate max_depth sample_rate model_ids
## 1 0.2 0.01 9 0.8 gbm_grid10_model_12
## 2 0.2 0.01 9 1.0 gbm_grid10_model_30
## 3 0.2 0.1 5 1.0 gbm_grid10_model_27
## 4 0.5 0.01 9 1.0 gbm_grid10_model_31
## 5 1.0 0.1 9 0.8 gbm_grid10_model_17
## auc
## 1 0.7956198310458749
## 2 0.793009024190151
## 3 0.7909164985255316
## 4 0.7895445777255494
## 5 0.7890456974346466
##
## ---
## col_sample_rate learn_rate max_depth sample_rate model_ids
## 31 0.5 0.01 3 1.0 gbm_grid10_model_19
## 32 1.0 0.1 9 1.0 gbm_grid10_model_35
## 33 1.0 0.01 3 1.0 gbm_grid10_model_20
## 34 0.5 0.1 9 0.8 gbm_grid10_model_16
## 35 0.2 0.01 3 1.0 gbm_grid10_model_18
## 36 1.0 0.01 9 1.0 gbm_grid10_model_32
## auc
## 31 0.7789295137580098
## 32 0.7788020221281124
## 33 0.7786052415689231
## 34 0.7776019378727745
## 35 0.776806500964502
## 36 0.7628655683909447
gbm_params20 <- list(learn_rate = seq(0.001, 0.1, 0.001),
max_depth = seq(2, 10, 1),
sample_rate = seq(0.3, 1.0, 0.05),
col_sample_rate = seq(0.1, 1.0, 0.05))
search_criteria20 <- list(strategy = "RandomDiscrete",
max_models = 36)
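# "RandomDiscrete" samples hyper-parameter combinations at random rather than
# exhausting the Cartesian space (here 100 x 9 x 15 x 19 = 256,500
# combinations); max_models caps the search at 36 models, the same budget as
# the Cartesian grid above.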
# Train and validate a grid of GBMs
gbm_grid20 <- h2o.grid("gbm", x = x, y = y1,
grid_id = "gbm_grid20",
training_frame = train,
validation_frame = valid,
ntrees = 100,
seed = 1,
hyper_params = gbm_params20,
search_criteria = search_criteria20)
gbm_gridperf20 <- h2o.getGrid(grid_id = "gbm_grid20",
sort_by = "auc",
decreasing = TRUE)
print(gbm_gridperf20)
## H2O Grid Details
## ================
##
## Grid ID: gbm_grid20
## Used hyper parameters:
## - col_sample_rate
## - learn_rate
## - max_depth
## - sample_rate
## Number of models: 36
## Number of failed models: 0
##
## Hyper-Parameter Search Summary: ordered by decreasing auc
## col_sample_rate learn_rate max_depth sample_rate model_ids
## 1 0.4 0.092 8 0.65 gbm_grid20_model_15
## 2 0.25 0.027 7 0.95 gbm_grid20_model_4
## 3 1.0 0.043 6 0.55 gbm_grid20_model_28
## 4 0.25 0.008 7 0.9 gbm_grid20_model_2
## 5 0.45 0.02 10 1.0 gbm_grid20_model_10
## auc
## 1 0.7939125518281192
## 2 0.7936658832398394
## 3 0.7928926187889405
## 4 0.7923909669408661
## 5 0.7922080441675351
##
## ---
## col_sample_rate learn_rate max_depth sample_rate model_ids
## 31 0.1 0.026 2 0.8 gbm_grid20_model_26
## 32 0.75 0.064 7 0.45 gbm_grid20_model_34
## 33 0.95 0.019 8 1.0 gbm_grid20_model_12
## 34 0.35 0.005 3 1.0 gbm_grid20_model_23
## 35 0.65 0.088 9 0.5 gbm_grid20_model_6
## 36 0.8 0.098 6 0.4 gbm_grid20_model_32
## auc
## 31 0.777762688188732
## 32 0.7777322010598434
## 33 0.776640207534201
## 34 0.7765432030331921
## 35 0.7752405711625019
## 36 0.7729429502671782
# Refine the search: narrow the hyper-parameter ranges around the best models found above
gbm_params30 <- list(learn_rate = seq(0.02, 0.05, 0.0001),
max_depth = seq(2, 10, 1),
sample_rate = seq(0.6, 0.9, 0.005),
col_sample_rate = seq(0.5, 0.8, 0.005))
search_criteria30 <- list(strategy = "RandomDiscrete",
max_models = 36)
# Train and validate a grid of GBMs
gbm_grid30 <- h2o.grid("gbm", x = x, y = y1,
grid_id = "gbm_grid30",
training_frame = train,
validation_frame = valid,
ntrees = 1000,
seed = 1,
hyper_params = gbm_params30,
search_criteria = search_criteria30)
gbm_gridperf30 <- h2o.getGrid(grid_id = "gbm_grid30",
sort_by = "auc",
decreasing = TRUE)
print(gbm_gridperf30)
## H2O Grid Details
## ================
##
## Grid ID: gbm_grid30
## Used hyper parameters:
## - col_sample_rate
## - learn_rate
## - max_depth
## - sample_rate
## Number of models: 36
## Number of failed models: 0
##
## Hyper-Parameter Search Summary: ordered by decreasing auc
## col_sample_rate learn_rate max_depth sample_rate model_ids
## 1 0.785 0.0265 3 0.79 gbm_grid30_model_33
## 2 0.665 0.0331 8 0.85 gbm_grid30_model_9
## 3 0.655 0.0274 7 0.75 gbm_grid30_model_19
## 4 0.71 0.0399 9 0.79 gbm_grid30_model_6
## 5 0.555 0.0375 9 0.69 gbm_grid30_model_18
## auc
## 1 0.7850103101926786
## 2 0.7845751757167246
## 3 0.7840458082969336
## 4 0.7834471519478504
## 5 0.7832060264739141
##
## ---
## col_sample_rate learn_rate max_depth sample_rate model_ids
## 31 0.54 0.042 6 0.74 gbm_grid30_model_15
## 32 0.765 0.0493 8 0.6 gbm_grid30_model_11
## 33 0.525 0.0489 4 0.885 gbm_grid30_model_21
## 34 0.54 0.0401 5 0.705 gbm_grid30_model_27
## 35 0.785 0.042 5 0.755 gbm_grid30_model_34
## 36 0.77 0.0487 5 0.815 gbm_grid30_model_3
## auc
## 31 0.7754290370501762
## 32 0.7746613157136205
## 33 0.7731452739407108
## 34 0.771820469612647
## 35 0.7707229329726614
## 36 0.7675190128821977
best_gbm_model_id <- gbm_gridperf30@model_ids[[1]]
best_gbm <- h2o.getModel(best_gbm_model_id)
# Now let's evaluate the model performance on a test set
# so we get an honest estimate of top model performance
best_gbm_perf <- h2o.performance(model = best_gbm,
newdata = test)
h2o.auc(best_gbm_perf)
## [1] 0.7976642
h2o.confusionMatrix(best_gbm_perf)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.255673344156405:
## N0 Yes Error Rate
## N0 666 125 0.158028 =125/791
## Yes 85 146 0.367965 =85/231
## Totals 751 271 0.205479 =210/1022
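# (Optional) persist the winning model for later reuse; a minimal sketch --
# the destination directory below is a placeholder:
# model_path <- h2o.saveModel(object = best_gbm, path = "./models", force = TRUE)
# best_gbm_reloaded <- h2o.loadModel(model_path)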
y1 <- y3  # switch to a different response column and repeat the workflow
glm_fit105 <- h2o.glm(x = x,
y = y1,
training_frame = train,
model_id = "glm_fit105",
family = "binomial") #similar to R's glm, h2o.glm has the family argument
# Next we will do some automatic tuning by passing in a validation frame and
# setting `lambda_search = TRUE`. Since we are training a GLM with
# regularization, we should find the right amount of regularization to avoid
# overfitting. The model parameter `lambda` controls the amount of
# regularization in a GLM; we can find the optimal value of `lambda`
# automatically by setting `lambda_search = TRUE` and passing in a validation
# frame, which is used to evaluate model performance at each value of lambda.
glm_fit205 <- h2o.glm(x = x,
y = y1,
training_frame = train,
model_id = "glm_fit205",
validation_frame = valid,
family = "binomial",
lambda_search = TRUE)
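# With lambda_search = TRUE, the lambda that scored best on the validation
# frame is kept on the model object; a minimal sketch, assuming H2O's standard
# GLM output slot:
# glm_fit205@model$lambda_best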
# Let's compare the performance of the two GLMs
glm_perf105 <- h2o.performance(model = glm_fit105,
newdata = test)
glm_perf205 <- h2o.performance(model = glm_fit205,
newdata = test)
# Print model performance
glm_perf105
## H2OBinomialMetrics: glm
##
## MSE: 0.1554299
## RMSE: 0.394246
## LogLoss: 0.4807781
## Mean Per-Class Error: 0.335744
## AUC: 0.7075338
## Gini: 0.4150676
## R^2: 0.09756708
## Residual Deviance: 982.7104
## AIC: 1074.71
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## N0 Yes Error Rate
## N0 501 295 0.370603 =295/796
## Yes 68 158 0.300885 =68/226
## Totals 569 453 0.355186 =363/1022
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.199328 0.465390 224
## 2 max f2 0.093995 0.613884 335
## 3 max f0point5 0.338541 0.430108 111
## 4 max accuracy 0.644194 0.786693 11
## 5 max precision 0.906639 1.000000 0
## 6 max recall 0.032023 1.000000 394
## 7 max specificity 0.906639 1.000000 0
## 8 max absolute_mcc 0.237015 0.277072 192
## 9 max min_per_class_accuracy 0.208790 0.646985 216
## 10 max mean_per_class_accuracy 0.199328 0.664256 224
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
glm_perf205
## H2OBinomialMetrics: glm
##
## MSE: 0.1551997
## RMSE: 0.3939539
## LogLoss: 0.4795676
## Mean Per-Class Error: 0.3349824
## AUC: 0.7110914
## Gini: 0.4221828
## R^2: 0.09890381
## Residual Deviance: 980.2362
## AIC: 1042.236
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## N0 Yes Error Rate
## N0 548 248 0.311558 =248/796
## Yes 81 145 0.358407 =81/226
## Totals 629 393 0.321918 =329/1022
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.214892 0.468498 206
## 2 max f2 0.113418 0.623522 324
## 3 max f0point5 0.335966 0.429062 111
## 4 max accuracy 0.576065 0.786693 15
## 5 max precision 0.903696 1.000000 0
## 6 max recall 0.035596 1.000000 397
## 7 max specificity 0.903696 1.000000 0
## 8 max absolute_mcc 0.254013 0.284842 172
## 9 max min_per_class_accuracy 0.204259 0.655779 217
## 10 max mean_per_class_accuracy 0.214892 0.665018 206
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
# Instead of printing the entire model-performance object, it is easier to
# print just the metric you are interested in comparing.
# Retrieve test set AUC
h2o.auc(glm_perf105)
## [1] 0.7075338
h2o.auc(glm_perf205)
## [1] 0.7110914
# Compare test AUC to the training AUC and validation AUC
h2o.auc(glm_fit105, train = TRUE)
## [1] 0.7259714
h2o.auc(glm_fit205, valid = TRUE)
## [1] 0.7131017
#glm_fit205@model$validation_metrics
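# Other single-metric accessors work the same way on a performance object,
# e.g. h2o.logloss(glm_perf205) or h2o.mse(glm_perf205).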
h2o.confusionMatrix(glm_perf105)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.19932800273972:
## N0 Yes Error Rate
## N0 501 295 0.370603 =295/796
## Yes 68 158 0.300885 =68/226
## Totals 569 453 0.355186 =363/1022
h2o.confusionMatrix(glm_perf205)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.214891693551094:
## N0 Yes Error Rate
## N0 548 248 0.311558 =248/796
## Yes 81 145 0.358407 =81/226
## Totals 629 393 0.321918 =329/1022
rf_fit105 <- h2o.randomForest(x = x,
y = y1,
training_frame = train,
model_id = "rf_fit105",
seed = 1)
rf_fit205 <- h2o.randomForest(x = x,
y = y1,
training_frame = train,
model_id = "rf_fit205",
validation_frame = valid, #only used if stopping_rounds > 0
ntrees = 100000,
score_tree_interval = 5, #used for early stopping
stopping_rounds = 3, #used for early stopping
stopping_metric = "AUC", #used for early stopping
stopping_tolerance = 0.0005, #used for early stopping
seed = 1)
# Let's compare the performance of the two RFs
rf_perf105 <- h2o.performance(model = rf_fit105,
newdata = test)
rf_perf205 <- h2o.performance(model = rf_fit205,
newdata = test)
# Print model performance
h2o.auc(rf_perf105)
## [1] 0.7107746
h2o.auc(rf_perf205)
## [1] 0.7180287
# Cross-validate with 5 folds
rf_fit305 <- h2o.randomForest(x = x,
y = y1,
training_frame = train,
model_id = "rf_fit305",
seed = 1,
nfolds = 5)
rf_perf305 <- h2o.performance(model = rf_fit305,
newdata = test)
# Print model performance
h2o.auc(rf_perf305)
## [1] 0.7107746
# To evaluate the cross-validated AUC, do the following:
h2o.auc(rf_fit305, xval = TRUE)
## [1] 0.6987228
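# Per-fold results are also stored on the cross-validated model; a minimal
# sketch, assuming H2O's standard cross-validation summary slot:
# rf_fit305@model$cross_validation_metrics_summary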
h2o.confusionMatrix(rf_perf105)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.246666666269302:
## N0 Yes Error Rate
## N0 551 245 0.307789 =245/796
## Yes 82 144 0.362832 =82/226
## Totals 633 389 0.319961 =327/1022
h2o.confusionMatrix(rf_perf205)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.216748768091202:
## N0 Yes Error Rate
## N0 509 287 0.360553 =287/796
## Yes 67 159 0.296460 =67/226
## Totals 576 446 0.346380 =354/1022
h2o.confusionMatrix(rf_perf305)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.246666666269302:
## N0 Yes Error Rate
## N0 551 245 0.307789 =245/796
## Yes 82 144 0.362832 =82/226
## Totals 633 389 0.319961 =327/1022
gbm_fit105 <- h2o.gbm(x = x,
y = y1,
training_frame = train,
model_id = "gbm_fit105",
seed = 1)
gbm_fit205 <- h2o.gbm(x = x,
y = y1,
training_frame = train,
model_id = "gbm_fit205",
validation_frame = valid, #only used if stopping_rounds > 0
ntrees = 500,
score_tree_interval = 5, #used for early stopping
stopping_rounds = 3, #used for early stopping
stopping_metric = "AUC", #used for early stopping
stopping_tolerance = 0.0005, #used for early stopping
seed = 1)
gbm_fit305 <- h2o.gbm(x = x,
y = y1,
training_frame = train,
model_id = "gbm_fit305",
validation_frame = valid, #only used if stopping_rounds > 0
ntrees = 50000,
score_tree_interval = 5, #used for early stopping
stopping_rounds = 3, #used for early stopping
stopping_metric = "AUC", #used for early stopping
stopping_tolerance = 0.0005, #used for early stopping
seed = 1)
gbm_perf105 <- h2o.performance(model = gbm_fit105,
newdata = test)
gbm_perf205 <- h2o.performance(model = gbm_fit205,
newdata = test)
gbm_perf305 <- h2o.performance(model = gbm_fit305,
newdata = test)
h2o.auc(gbm_perf105)
## [1] 0.6992985
h2o.auc(gbm_perf205)
## [1] 0.702278
h2o.auc(gbm_perf305)
## [1] 0.702278
h2o.scoreHistory(gbm_fit205)
## Scoring History:
## timestamp duration number_of_trees training_rmse
## 1 2018-01-28 21:14:30 0.000 sec 0 0.41225
## 2 2018-01-28 21:14:30 0.031 sec 5 0.38988
## 3 2018-01-28 21:14:30 0.062 sec 10 0.37549
## 4 2018-01-28 21:14:30 0.078 sec 15 0.36573
## 5 2018-01-28 21:14:30 0.109 sec 20 0.35862
## 6 2018-01-28 21:14:30 0.140 sec 25 0.35329
## 7 2018-01-28 21:14:30 0.168 sec 30 0.34800
## 8 2018-01-28 21:14:30 0.199 sec 35 0.34195
## 9 2018-01-28 21:14:30 0.215 sec 40 0.33859
## 10 2018-01-28 21:14:30 0.246 sec 45 0.33531
## training_logloss training_auc training_lift
## 1 0.52318 0.50000 1.00000
## 2 0.47292 0.79247 3.62497
## 3 0.44268 0.81789 4.22284
## 4 0.42239 0.83553 4.12686
## 5 0.40750 0.84855 4.31881
## 6 0.39658 0.85866 4.31881
## 7 0.38639 0.86996 4.31881
## 8 0.37493 0.88326 4.60673
## 9 0.36888 0.88902 4.60673
## 10 0.36309 0.89384 4.60673
## training_classification_error validation_rmse validation_logloss
## 1 0.78293 0.42367 0.54481
## 2 0.25631 0.40934 0.51132
## 3 0.26717 0.40521 0.50237
## 4 0.22313 0.40353 0.49790
## 5 0.20643 0.40284 0.49550
## 6 0.18743 0.40306 0.49584
## 7 0.18076 0.40283 0.49501
## 8 0.17074 0.40350 0.49761
## 9 0.16114 0.40454 0.50025
## 10 0.15300 0.40466 0.50008
## validation_auc validation_lift validation_classification_error
## 1 0.50000 1.00000 0.76606
## 2 0.70837 2.95939 0.33936
## 3 0.70214 3.41974 0.34739
## 4 0.70733 3.41974 0.36546
## 5 0.71179 3.41974 0.35442
## 6 0.71274 2.13734 0.36345
## 7 0.71541 2.56481 0.34538
## 8 0.71037 2.56481 0.32329
## 9 0.70734 2.99227 0.32129
## 10 0.70937 2.13734 0.32229
h2o.scoreHistory(gbm_fit305)
## Scoring History:
## timestamp duration number_of_trees training_rmse
## 1 2018-01-28 21:14:32 0.000 sec 0 0.41225
## 2 2018-01-28 21:14:32 0.016 sec 5 0.38988
## 3 2018-01-28 21:14:32 0.060 sec 10 0.37549
## 4 2018-01-28 21:14:32 0.075 sec 15 0.36573
## 5 2018-01-28 21:14:32 0.107 sec 20 0.35862
## 6 2018-01-28 21:14:32 0.138 sec 25 0.35329
## 7 2018-01-28 21:14:32 0.169 sec 30 0.34800
## 8 2018-01-28 21:14:32 0.200 sec 35 0.34195
## 9 2018-01-28 21:14:32 0.216 sec 40 0.33859
## 10 2018-01-28 21:14:32 0.247 sec 45 0.33531
## training_logloss training_auc training_lift
## 1 0.52318 0.50000 1.00000
## 2 0.47292 0.79247 3.62497
## 3 0.44268 0.81789 4.22284
## 4 0.42239 0.83553 4.12686
## 5 0.40750 0.84855 4.31881
## 6 0.39658 0.85866 4.31881
## 7 0.38639 0.86996 4.31881
## 8 0.37493 0.88326 4.60673
## 9 0.36888 0.88902 4.60673
## 10 0.36309 0.89384 4.60673
## training_classification_error validation_rmse validation_logloss
## 1 0.78293 0.42367 0.54481
## 2 0.25631 0.40934 0.51132
## 3 0.26717 0.40521 0.50237
## 4 0.22313 0.40353 0.49790
## 5 0.20643 0.40284 0.49550
## 6 0.18743 0.40306 0.49584
## 7 0.18076 0.40283 0.49501
## 8 0.17074 0.40350 0.49761
## 9 0.16114 0.40454 0.50025
## 10 0.15300 0.40466 0.50008
## validation_auc validation_lift validation_classification_error
## 1 0.50000 1.00000 0.76606
## 2 0.70837 2.95939 0.33936
## 3 0.70214 3.41974 0.34739
## 4 0.70733 3.41974 0.36546
## 5 0.71179 3.41974 0.35442
## 6 0.71274 2.13734 0.36345
## 7 0.71541 2.56481 0.34538
## 8 0.71037 2.56481 0.32329
## 9 0.70734 2.99227 0.32129
## 10 0.70937 2.13734 0.32229
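# As above, gbm_fit205 (ntrees = 500) and gbm_fit305 (ntrees = 50000) produce
# identical scoring histories and identical test AUCs: early stopping halted
# both runs at 45 trees, so ntrees acts only as an upper bound here.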
plot(gbm_fit205,
timestep = "number_of_trees",
metric = "AUC")
plot(gbm_fit305,
timestep = "number_of_trees",
metric = "logloss")
gbm_fit405 <- h2o.gbm(x = x,
y = y1,
training_frame = train,
model_id = "gbm_fit405",
ntrees=5,
seed = 1)
gbm_perf405 <- h2o.performance(model = gbm_fit405,
newdata = test)
h2o.auc(gbm_perf405)
## [1] 0.6973529
h2o.confusionMatrix(gbm_perf105)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.237339910626288:
## N0 Yes Error Rate
## N0 583 213 0.267588 =213/796
## Yes 93 133 0.411504 =93/226
## Totals 676 346 0.299413 =306/1022
h2o.confusionMatrix(gbm_perf205)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.183380894744191:
## N0 Yes Error Rate
## N0 506 290 0.364322 =290/796
## Yes 71 155 0.314159 =71/226
## Totals 577 445 0.353229 =361/1022
h2o.confusionMatrix(gbm_perf305)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.183380894744191:
## N0 Yes Error Rate
## N0 506 290 0.364322 =290/796
## Yes 71 155 0.314159 =71/226
## Totals 577 445 0.353229 =361/1022
h2o.confusionMatrix(gbm_perf405)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.233304352675186:
## N0 Yes Error Rate
## N0 593 203 0.255025 =203/796
## Yes 98 128 0.433628 =98/226
## Totals 691 331 0.294521 =301/1022
dl_fit105 <- h2o.deeplearning(x = x,
y = y1,
training_frame = train,
model_id = "dl_fit105",
seed = 1)
dl_fit205 <- h2o.deeplearning(x = x,
y = y1,
training_frame = train,
model_id = "dl_fit205",
#validation_frame = valid, #only used if stopping_rounds > 0
epochs = 20,
hidden= c(10,10),
stopping_rounds = 0, # disable early stopping
seed = 1)
dl_fit305 <- h2o.deeplearning(x = x,
y = y1,
training_frame = train,
model_id = "dl_fit305",
validation_frame = valid, #in DL, early stopping is on by default
epochs = 200,
hidden = c(10,10),
score_interval = 1, #used for early stopping
stopping_rounds = 3, #used for early stopping
stopping_metric = "AUC", #used for early stopping
stopping_tolerance = 0.005, #used for early stopping
seed = 1)
dl_perf105 <- h2o.performance(model = dl_fit105,
newdata = test)
dl_perf205 <- h2o.performance(model = dl_fit205,
newdata = test)
dl_perf305 <- h2o.performance(model = dl_fit305,
newdata = test)
h2o.auc(dl_perf105)
## [1] 0.7017638
h2o.auc(dl_perf205)
## [1] 0.702253
h2o.auc(dl_perf305)
## [1] 0.705541
plot(dl_fit305,
timestep = "epochs",
metric = "AUC")
dl_fit405 <- h2o.deeplearning(x = x,
y = y1,
training_frame = train,
model_id = "dl_fit405",
validation_frame = valid, #in DL, early stopping is on by default
epochs = 200,
hidden = c(32,64,32,128,32,64,32),
score_interval = 1, #used for early stopping
stopping_rounds = 3, #used for early stopping
stopping_metric = "AUC", #used for early stopping
stopping_tolerance = 0.0005, #used for early stopping
seed = 1)
dl_perf405 <- h2o.performance(model = dl_fit405,
newdata = test)
h2o.auc(dl_perf405)
## [1] 0.7963151
h2o.confusionMatrix(dl_perf105)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.139217562640086:
## N0 Yes Error Rate
## N0 377 419 0.526382 =419/796
## Yes 28 198 0.123894 =28/226
## Totals 405 617 0.437378 =447/1022
h2o.confusionMatrix(dl_perf205)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.24193354973481:
## N0 Yes Error Rate
## N0 496 300 0.376884 =300/796
## Yes 68 158 0.300885 =68/226
## Totals 564 458 0.360078 =368/1022
h2o.confusionMatrix(dl_perf305)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.15720004889004:
## N0 Yes Error Rate
## N0 509 287 0.360553 =287/796
## Yes 70 156 0.309735 =70/226
## Totals 579 443 0.349315 =357/1022
h2o.confusionMatrix(dl_perf405)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.212065363158288:
## N0 Yes Error Rate
## N0 616 180 0.226131 =180/796
## Yes 106 120 0.469027 =106/226
## Totals 722 300 0.279843 =286/1022
gbm_params105 <- list(learn_rate = c(0.01, 0.1),
max_depth = c(3, 5, 9),
sample_rate = c(0.8, 1.0),
col_sample_rate = c(0.2, 0.5, 1.0))
gbm_grid105 <- h2o.grid("gbm", x = x, y = y1,
grid_id = "gbm_grid105",
training_frame = train,
validation_frame = valid,
ntrees = 100,
seed = 1,
hyper_params = gbm_params105)
gbm_gridperf105 <- h2o.getGrid(grid_id = "gbm_grid105",
sort_by = "auc",
decreasing = TRUE)
print(gbm_gridperf105)
## H2O Grid Details
## ================
##
## Grid ID: gbm_grid105
## Used hyper parameters:
## - col_sample_rate
## - learn_rate
## - max_depth
## - sample_rate
## Number of models: 36
## Number of failed models: 0
##
## Hyper-Parameter Search Summary: ordered by decreasing auc
## col_sample_rate learn_rate max_depth sample_rate model_ids
## 1 0.2 0.01 5 1.0 gbm_grid105_model_24
## 2 1.0 0.01 5 0.8 gbm_grid105_model_8
## 3 1.0 0.01 3 0.8 gbm_grid105_model_2
## 4 0.5 0.01 3 0.8 gbm_grid105_model_1
## 5 0.5 0.01 3 1.0 gbm_grid105_model_19
## auc
## 1 0.716080076949471
## 2 0.7155457056232738
## 3 0.7145332125841635
## 4 0.7144572756062303
## 5 0.7143447763796623
##
## ---
## col_sample_rate learn_rate max_depth sample_rate model_ids
## 31 0.5 0.1 5 0.8 gbm_grid105_model_10
## 32 0.2 0.1 9 1.0 gbm_grid105_model_33
## 33 0.2 0.1 9 0.8 gbm_grid105_model_15
## 34 0.5 0.1 9 1.0 gbm_grid105_model_34
## 35 0.5 0.1 9 0.8 gbm_grid105_model_16
## 36 1.0 0.1 9 1.0 gbm_grid105_model_35
## auc
## 31 0.68331186473093
## 32 0.6826059320842168
## 33 0.6742416145888998
## 34 0.6728888113894218
## 35 0.6702703918910558
## 36 0.6691172748187356
gbm_params205 <- list(learn_rate = seq(0.001, 0.1, 0.001),
max_depth = seq(2, 10, 1),
sample_rate = seq(0.3, 1.0, 0.05),
col_sample_rate = seq(0.1, 1.0, 0.05))
search_criteria205 <- list(strategy = "RandomDiscrete",
max_models = 36)
# Train and validate a grid of GBMs
gbm_grid205 <- h2o.grid("gbm", x = x, y = y1,
grid_id = "gbm_grid205",
training_frame = train,
validation_frame = valid,
ntrees = 100,
seed = 1,
hyper_params = gbm_params205,
search_criteria = search_criteria205)
gbm_gridperf205 <- h2o.getGrid(grid_id = "gbm_grid205",
sort_by = "auc",
decreasing = TRUE)
print(gbm_gridperf205)
## H2O Grid Details
## ================
##
## Grid ID: gbm_grid205
## Used hyper parameters:
## - col_sample_rate
## - learn_rate
## - max_depth
## - sample_rate
## Number of models: 36
## Number of failed models: 0
##
## Hyper-Parameter Search Summary: ordered by decreasing auc
## col_sample_rate learn_rate max_depth sample_rate model_ids
## 1 0.65 0.01 6 0.5 gbm_grid205_model_7
## 2 0.4 0.052 3 0.65 gbm_grid205_model_13
## 3 0.35 0.069 9 0.3 gbm_grid205_model_3
## 4 0.3 0.05 3 0.8 gbm_grid205_model_35
## 5 0.7 0.036 5 0.45 gbm_grid205_model_22
## auc
## 1 0.7185184976853284
## 2 0.7173428807676948
## 3 0.7166116357950039
## 4 0.7165047615297644
## 5 0.7153713318220937
##
## ---
## col_sample_rate learn_rate max_depth sample_rate model_ids
## 31 0.8 0.072 10 0.75 gbm_grid205_model_6
## 32 0.25 0.05 8 0.75 gbm_grid205_model_25
## 33 0.9 0.087 6 0.5 gbm_grid205_model_9
## 34 0.65 0.049 9 0.3 gbm_grid205_model_23
## 35 0.8 0.082 7 0.8 gbm_grid205_model_15
## 36 0.1 0.079 10 0.85 gbm_grid205_model_1
## auc
## 31 0.6932792962048386
## 32 0.6931864843429202
## 33 0.6887905770647826
## 34 0.6852468514278963
## 35 0.6816496886583904
## 36 0.6714066340793907
# Refine the search: narrow the hyper-parameter ranges around the best models found above
gbm_params305 <- list(learn_rate = seq(0.02, 0.05, 0.0001),
max_depth = seq(2, 10, 1),
sample_rate = seq(0.6, 0.9, 0.005),
col_sample_rate = seq(0.5, 0.8, 0.005))
search_criteria305 <- list(strategy = "RandomDiscrete",
max_models = 36)
# Train and validate a grid of GBMs
gbm_grid305 <- h2o.grid("gbm", x = x, y = y1,
grid_id = "gbm_grid305",
training_frame = train,
validation_frame = valid,
ntrees = 1000,
seed = 1,
hyper_params = gbm_params305,
search_criteria = search_criteria305)
gbm_gridperf305 <- h2o.getGrid(grid_id = "gbm_grid305",
sort_by = "auc",
decreasing = TRUE)
print(gbm_gridperf305)
## H2O Grid Details
## ================
##
## Grid ID: gbm_grid305
## Used hyper parameters:
## - col_sample_rate
## - learn_rate
## - max_depth
## - sample_rate
## Number of models: 36
## Number of failed models: 0
##
## Hyper-Parameter Search Summary: ordered by decreasing auc
## col_sample_rate learn_rate max_depth sample_rate model_ids
## 1 0.63 0.0255 2 0.815 gbm_grid305_model_3
## 2 0.705 0.0356 2 0.68 gbm_grid305_model_19
## 3 0.705 0.039 2 0.65 gbm_grid305_model_22
## 4 0.63 0.0488 2 0.605 gbm_grid305_model_23
## 5 0.77 0.0356 2 0.6 gbm_grid305_model_20
## auc
## 1 0.7092879361454391
## 2 0.7022736093689356
## 3 0.7019136118439185
## 4 0.6977877027095438
## 5 0.6957514667086665
##
## ---
## col_sample_rate learn_rate max_depth sample_rate model_ids
## 31 0.715 0.0264 7 0.685 gbm_grid305_model_34
## 32 0.535 0.032 5 0.745 gbm_grid305_model_29
## 33 0.52 0.0443 5 0.8 gbm_grid305_model_35
## 34 0.515 0.0327 7 0.715 gbm_grid305_model_18
## 35 0.725 0.0456 6 0.9 gbm_grid305_model_26
## 36 0.7 0.0444 4 0.625 gbm_grid305_model_30
## auc
## 31 0.6719128805989459
## 32 0.6704925778635272
## 33 0.6701213304158534
## 34 0.6698035201007994
## 35 0.6688135269070025
## 36 0.6649604283970548
best_gbm_model_id <- gbm_gridperf305@model_ids[[1]]
best_gbm <- h2o.getModel(best_gbm_model_id)
# Now let's evaluate the model performance on a test set
# so we get an honest estimate of top model performance
best_gbm_perf <- h2o.performance(model = best_gbm,
newdata = test)
h2o.auc(best_gbm_perf)
## [1] 0.7976642
h2o.confusionMatrix(best_gbm_perf)
## Confusion Matrix (vertical: actual; across: predicted) for max f1 @ threshold = 0.255673344156405:
## N0 Yes Error Rate
## N0 666 125 0.158028 =125/791
## Yes 85 146 0.367965 =85/231
## Totals 751 271 0.205479 =210/1022
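# A compact side-by-side view of the test-set AUCs computed above for this
# response; a minimal sketch using only objects created earlier:
# data.frame(model = c("glm_fit205", "rf_fit205", "gbm_fit205", "dl_fit305"),
#            test_auc = c(h2o.auc(glm_perf205), h2o.auc(rf_perf205),
#                         h2o.auc(gbm_perf205), h2o.auc(dl_perf305)))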