# Logistic Regression using H2O
# Load the occupancy training and test sets. String columns are read in as
# factors; TRUE is spelled out because T is an ordinary variable that can be
# reassigned, whereas TRUE is a reserved word.
occupancy_train <- read.csv("datatraining.txt", stringsAsFactors = TRUE)
occupancy_test <- read.csv("datatest.txt", stringsAsFactors = TRUE)
# Define input (x) and output (y) variables
x <- c("Temperature", "Humidity", "Light", "CO2", "HumidityRatio")
y <- "Occupancy"
# Convert the dependent variable into a factor (required by H2O so that the
# response is treated as categorical rather than numeric)
occupancy_train$Occupancy <- as.factor(occupancy_train$Occupancy)
occupancy_test$Occupancy <- as.factor(occupancy_test$Occupancy)
# H2O Library
#install.packages('h2o')
library(h2o)
##
## ----------------------------------------------------------------------
##
## Your next step is to start H2O:
## > h2o.init()
##
## For H2O package documentation, ask for help:
## > ??h2o
##
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit http://docs.h2o.ai
##
## ----------------------------------------------------------------------
##
## Attaching package: 'h2o'
## The following objects are masked from 'package:stats':
##
## cor, sd, var
## The following objects are masked from 'package:base':
##
## %*%, %in%, &&, ||, apply, as.factor, as.numeric, colnames,
## colnames<-, ifelse, is.character, is.factor, is.numeric, log,
## log10, log1p, log2, round, signif, trunc
# Start (or connect to) an H2O instance using the default settings and keep
# the value returned by h2o.init() for later reference
localH2O <- h2o.init()
##
## H2O is not running yet, starting it now...
##
## Note: In case of errors look at the following log files:
## C:\Users\manma\AppData\Local\Temp\RtmpSEWBNS/h2o_manma_started_from_r.out
## C:\Users\manma\AppData\Local\Temp\RtmpSEWBNS/h2o_manma_started_from_r.err
##
##
## Starting H2O JVM and connecting: . Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 3 seconds 335 milliseconds
## H2O cluster version: 3.16.0.2
## H2O cluster version age: 1 month and 18 days
## H2O cluster name: H2O_started_from_R_manma_bvg535
## H2O cluster total nodes: 1
## H2O cluster total memory: 1.76 GB
## H2O cluster total cores: 4
## H2O cluster allowed cores: 4
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## H2O API Extensions: Algos, AutoML, Core V3, Core V4
## R Version: R version 3.4.3 (2017-11-30)
# Upload the training data frame to H2O, naming the resulting H2O frame
# explicitly so it can be identified on the cluster
occupancy_train.hex <- as.h2o(
  destination_frame = "occupancy_train.hex",
  x = occupancy_train
)
##
## |
## | | 0%
## |
## |=================================================================| 100%
# Upload the test data frame to H2O under its own explicit frame name
occupancy_test.hex <- as.h2o(
  destination_frame = "occupancy_test.hex",
  x = occupancy_test
)
##
## |
## | | 0%
## |
## |=================================================================| 100%
# Fit a regularised logistic regression (binomial GLM) on the training frame
occupancy_train.glm <- h2o.glm(
  # Data: the H2O training frame, predictor column names and response name
  training_frame = occupancy_train.hex,
  x = x,
  y = y,
  # Binomial family, matching the factor response
  family = "binomial",
  # Elastic-net regularisation: alpha is the mixing parameter, and the
  # lambda search lets H2O pick the regularisation strength
  alpha = 0.5,
  lambda_search = TRUE,
  # 5-fold cross-validation with a fixed seed for the random numbers
  nfolds = 5,
  seed = 1234567
)
##
## |
## | | 0%
## |
## |====================== | 34%
## |
## |=============================== | 47%
## |
## |================================================= | 75%
## |
## |=================================================================| 100%
# Training AUC (area under the ROC curve; a ranking metric, not accuracy).
# Read through the public h2o.auc() accessor instead of reaching into the
# model object's internal slots, whose layout is not part of the API.
h2o.auc(occupancy_train.glm, train = TRUE)
## [1] 0.9945787
# Cross-validated AUC (area under the ROC curve; a ranking metric, not
# accuracy). Read through the public h2o.auc() accessor instead of reaching
# into the model object's internal slots.
h2o.auc(occupancy_train.glm, xval = TRUE)
## [1] 0.9945019
# Score the test frame with the fitted GLM and keep the predictions
yhat <- h2o.predict(
  object = occupancy_train.glm,
  newdata = occupancy_test.hex
)
##
## |
## | | 0%
## |
## |=================================================================| 100%
# Plot the variable importance of the fitted GLM for the top 5 features
h2o.varimp_plot(
  model = occupancy_train.glm,
  num_of_features = 5
)
