# Logistic Regression using H2O

# Load the occupancy data (training and test splits).
# stringsAsFactors = TRUE makes read.csv() return text columns as factors;
# spelled out as TRUE because `T` is an ordinary variable that can be reassigned.
occupancy_train <- read.csv("datatraining.txt", stringsAsFactors = TRUE)
occupancy_test <- read.csv("datatest.txt", stringsAsFactors = TRUE)

# Define input (x) and output (y) variables
x <- c("Temperature", "Humidity", "Light", "CO2", "HumidityRatio")
y <- "Occupancy"

# Convert the dependent variable into a factor in both data sets so that it is
# treated as a categorical response (needed for the binomial model in H2O)
occupancy_train[[y]] <- as.factor(occupancy_train[[y]])
occupancy_test[[y]] <- as.factor(occupancy_test[[y]])

# H2O Library
#install.packages('h2o')
library(h2o)
## 
## ----------------------------------------------------------------------
## 
## Your next step is to start H2O:
##     > h2o.init()
## 
## For H2O package documentation, ask for help:
##     > ??h2o
## 
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit http://docs.h2o.ai
## 
## ----------------------------------------------------------------------
## 
## Attaching package: 'h2o'
## The following objects are masked from 'package:stats':
## 
##     cor, sd, var
## The following objects are masked from 'package:base':
## 
##     %*%, %in%, &&, ||, apply, as.factor, as.numeric, colnames,
##     colnames<-, ifelse, is.character, is.factor, is.numeric, log,
##     log10, log1p, log2, round, signif, trunc
# Start (or connect to) a local H2O cluster and keep the connection handle
localH2O <- h2o::h2o.init()
## 
## H2O is not running yet, starting it now...
## 
## Note:  In case of errors look at the following log files:
##     C:\Users\manma\AppData\Local\Temp\RtmpSEWBNS/h2o_manma_started_from_r.out
##     C:\Users\manma\AppData\Local\Temp\RtmpSEWBNS/h2o_manma_started_from_r.err
## 
## 
## Starting H2O JVM and connecting: . Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         3 seconds 335 milliseconds 
##     H2O cluster version:        3.16.0.2 
##     H2O cluster version age:    1 month and 18 days  
##     H2O cluster name:           H2O_started_from_R_manma_bvg535 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   1.76 GB 
##     H2O cluster total cores:    4 
##     H2O cluster allowed cores:  4 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     H2O API Extensions:         Algos, AutoML, Core V3, Core V4 
##     R Version:                  R version 3.4.3 (2017-11-30)
# Upload the training data frame to the H2O cluster under the key
# "occupancy_train.hex"
occupancy_train.hex <- as.h2o(
  occupancy_train,
  destination_frame = "occupancy_train.hex"
)
## 
##   |
##   |                                                                 |   0%
##   |
##   |=================================================================| 100%
# Upload the test data frame to the H2O cluster under the key
# "occupancy_test.hex"
occupancy_test.hex <- as.h2o(
  occupancy_test,
  destination_frame = "occupancy_test.hex"
)
## 
##   |
##   |                                                                 |   0%
##   |
##   |=================================================================| 100%
# Train the model: a regularised logistic regression (binomial GLM) fitted on
# the training frame with 5-fold cross-validation.
occupancy_train.glm <- h2o.glm(
  # Data: predictor names, response name, and the H2O frame holding them
  x = x,
  y = y,
  training_frame = occupancy_train.hex,
  # Model: binary outcome, elastic-net penalty, lambda chosen by search
  family = "binomial",
  alpha = 0.5,
  lambda_search = TRUE,
  # Validation and reproducibility
  nfolds = 5,
  seed = 1234567
)
## 
##   |
##   |                                                                 |   0%
##   |
##   |======================                                           |  34%
##   |
##   |===============================                                  |  47%
##   |
##   |=================================================                |  75%
##   |
##   |=================================================================| 100%
# Training AUC (area under the ROC curve; not the same thing as accuracy).
# Read through the public accessor h2o.auc() rather than the model object's
# internal slot layout.
h2o.auc(occupancy_train.glm, train = TRUE)
## [1] 0.9945787
# Cross-validated AUC
h2o.auc(occupancy_train.glm, xval = TRUE)
## [1] 0.9945019
# Score the held-out test frame with the fitted GLM
yhat <- h2o.predict(
  object = occupancy_train.glm,
  newdata = occupancy_test.hex
)
## 
##   |
##   |                                                                 |   0%
##   |
##   |=================================================================| 100%
# Compute variable importance and performance: plot the importance of up to
# five features of the fitted model
h2o.varimp_plot(
  model = occupancy_train.glm,
  num_of_features = 5
)