load library

library(h2o)

----------------------------------------------------------------------

Your next step is to start H2O:
    > h2o.init()

For H2O package documentation, ask for help:
    > ??h2o

After starting H2O, you can use the Web UI at http://localhost:54321
For more information visit http://docs.h2o.ai

----------------------------------------------------------------------


Attaching package: ‘h2o’

The following objects are masked from ‘package:stats’:

    cor, sd, var

The following objects are masked from ‘package:base’:

    %*%, %in%, &&, ||, apply, as.factor, as.numeric, colnames, colnames<-, ifelse, is.character,
    is.factor, is.numeric, log, log10, log1p, log2, round, signif, trunc
h2o.init()

H2O is not running yet, starting it now...

Note:  In case of errors look at the following log files:
    C:\Users\r631758\AppData\Local\Temp\1\RtmpiOuWNy/h2o_r631758_started_from_r.out
    C:\Users\r631758\AppData\Local\Temp\1\RtmpiOuWNy/h2o_r631758_started_from_r.err

java version "1.8.0_144"
Java(TM) SE Runtime Environment (build 1.8.0_144-b01)
Java HotSpot(TM) 64-Bit Server VM (build 25.144-b01, mixed mode)

Starting H2O JVM and connecting: . Connection successful!

R is connected to the H2O cluster: 
    H2O cluster uptime:         2 seconds 19 milliseconds 
    H2O cluster version:        3.10.5.3 
    H2O cluster version age:    2 months and 26 days  
    H2O cluster name:           H2O_started_from_R_r631758_gdc101 
    H2O cluster total nodes:    1 
    H2O cluster total memory:   3.48 GB 
    H2O cluster total cores:    8 
    H2O cluster allowed cores:  8 
    H2O cluster healthy:        TRUE 
    H2O Connection ip:          localhost 
    H2O Connection port:        54321 
    H2O Connection proxy:       NA 
    H2O Internal Security:      FALSE 
    R Version:                  R version 3.4.1 (2017-06-30) 
h2o.removeAll()
[1] 0

import cover type data

D = h2o.importFile(path ="C:\\Users\\r631758\\Desktop\\r631758\\R codes\\H2O\\exercise\\covtype.full.csv", parse=TRUE)

  |                                                                                                                
  |                                                                                                          |   0%
  |                                                                                                                
  |==========================================================================================================| 100%
h2o.summary(D)
Approximated quantiles computed! If you are interested in exact quantiles, please pass the `exact_quantiles=TRUE` parameter.
 Elevation      Aspect          Slope          Horizontal_Distance_To_Hydrology Vertical_Distance_To_Hydrology
 Min.   :1859   Min.   :  0.0   Min.   : 0.0   Min.   :   0.0                   Min.   :-173.00               
 1st Qu.:2809   1st Qu.: 58.0   1st Qu.: 9.0   1st Qu.: 107.6                   1st Qu.:   7.00               
 Median :2995   Median :127.0   Median :13.0   Median : 216.7                   Median :  30.00               
 Mean   :2959   Mean   :155.7   Mean   :14.1   Mean   : 269.4                   Mean   :  46.42               
 3rd Qu.:3163   3rd Qu.:260.0   3rd Qu.:18.0   3rd Qu.: 383.1                   3rd Qu.:  69.00               
 Max.   :3858   Max.   :360.0   Max.   :66.0   Max.   :1397.0                   Max.   : 601.00               
 Horizontal_Distance_To_Roadways Hillshade_9am   Hillshade_Noon  Hillshade_3pm   Horizontal_Distance_To_Fire_Points
 Min.   :   0                    Min.   :  0.0   Min.   :  0.0   Min.   :  0.0   Min.   :   0                      
 1st Qu.:1103                    1st Qu.:198.0   1st Qu.:213.0   1st Qu.:119.0   1st Qu.:1019                      
 Median :1993                    Median :218.0   Median :226.0   Median :143.0   Median :1707                      
 Mean   :2350                    Mean   :212.1   Mean   :223.3   Mean   :142.5   Mean   :1980                      
 3rd Qu.:3324                    3rd Qu.:231.0   3rd Qu.:237.0   3rd Qu.:168.0   3rd Qu.:2547                      
 Max.   :7117                    Max.   :254.0   Max.   :254.0   Max.   :254.0   Max.   :7173                      
 Wilderness_Area Soil_Type       Cover_Type     
 area_0:260796   type_28:115247  class_2:283301 
 area_2:253364   type_22: 57752  class_1:211840 
 area_3: 36968   type_31: 52519  class_3: 35754 
 area_1: 29884   type_32: 45154  class_7: 20510 
                 type_21: 33373  class_6: 17367 
                 type_9 : 32634  class_5:  9493 
D.R<-as.data.frame(D)

split data

data=h2o.splitFrame(D,ratios=c(.7,.15), destination_frames = c("train","test","valid"))
names(data)<-c("Train","Test","Valid")

multinomial model

y="Cover_Type"
x=names(data$Train)
x=x[-which(x==y)]
start=Sys.time()
glm = h2o.glm(training_frame = data$Train, validation_frame = data$Valid, x = x, y = y,family='multinomial',solver='L_BFGS')

  |                                                                   
  |                                                             |   0%
  |                                                                   
  |=                                                            |   1%
  |                                                                   
  |=====                                                        |   9%
  |                                                                   
  |==========                                                   |  16%
  |                                                                   
  |===============                                              |  24%
  |                                                                   
  |===================                                          |  31%
  |                                                                   
  |=============================================================| 100%
glm_time<-Sys.time()-start
print(paste("Took", round(glm_time, digits=2), units(glm_time), "to build multinomail regression model."))
[1] "Took 6.48 secs to build multinomail regression model."
h2o.confusionMatrix(glm, valid=TRUE)
Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
        class_1 class_2 class_3 class_4 class_5 class_6 class_7  Error
class_1   22014    9023       4       0       0       8     595 0.3043
class_2    7635   34041     568       0      14     173      23 0.1982
class_3       0     567    4473      68       1     318       0 0.1758
class_4       0       1     235     115       0      50       0 0.7132
class_5       4    1357      38       0       0       6       0 1.0000
class_6       0     639    1434       7       1     552       0 0.7904
class_7    1409      31       0       0       0       0    1605 0.4729
Totals    31062   45659    6752     190      16    1107    2223 0.2782
                     Rate
class_1 =  9,630 / 31,644
class_2 =  8,413 / 42,454
class_3 =     954 / 5,427
class_4 =       286 / 401
class_5 =   1,405 / 1,405
class_6 =   2,081 / 2,633
class_7 =   1,440 / 3,045
Totals  = 24,209 / 87,009

disable regularization of the glm model

http://docs.h2o.ai/h2o/latest-stable/h2o-docs/data-science/algo-params/lambda.html

start=Sys.time()
glm2 = h2o.glm(training_frame = data$Train, validation_frame = data$Valid, x = x, y = y,family='multinomial',solver='L_BFGS', lambda=0)

  |                                                                               
  |                                                                         |   0%
  |                                                                               
  |=                                                                        |   1%
  |                                                                               
  |======                                                                   |   9%
  |                                                                               
  |===========                                                              |  15%
  |                                                                               
  |=================                                                        |  23%
  |                                                                               
  |======================                                                   |  31%
  |                                                                               
  |============================                                             |  38%
  |                                                                               
  |================================                                         |  44%
  |                                                                               
  |=====================================                                    |  51%
  |                                                                               
  |==========================================                               |  57%
  |                                                                               
  |==============================================                           |  64%
  |                                                                               
  |===================================================                      |  70%
  |                                                                               
  |========================================================                 |  76%
  |                                                                               
  |==============================================================           |  84%
  |                                                                               
  |==================================================================       |  91%
  |                                                                               
  |======================================================================== |  99%
  |                                                                               
  |=========================================================================| 100%
Reached maximum number of iterations 140!
glm_time<-Sys.time()-start
print(paste("Took", round(glm_time, digits=2), units(glm_time), "to build multinomail regression model."))
[1] "Took 18.53 secs to build multinomail regression model."
h2o.confusionMatrix(glm2, valid=FALSE) # get confusion matrix in the training data
Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
        class_1 class_2 class_3 class_4 class_5 class_6 class_7  Error
class_1  103845   41596      28       0      10      50    3014 0.3009
class_2   36032  158507    2313       2     152    1136     114 0.2005
class_3       0    2492   20047     424      29    2013       0 0.1983
class_4       0       6     883     827       0     231       0 0.5752
class_5      33    6330     208       0      63      32       0 0.9905
class_6       0    2754    6018      45      22    3333       0 0.7262
class_7    6036     116       0       0       0       0    8171 0.4295
Totals   145946  211801   29497    1298     276    6795   11299 0.2755
                       Rate
class_1 =  44,698 / 148,543
class_2 =  39,749 / 198,256
class_3 =    4,958 / 25,005
class_4 =     1,120 / 1,947
class_5 =     6,603 / 6,666
class_6 =    8,839 / 12,172
class_7 =    6,152 / 14,323
Totals  = 112,119 / 406,912
h2o.confusionMatrix(glm2, valid=TRUE) # get confusion matrix in the test data
Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
        class_1 class_2 class_3 class_4 class_5 class_6 class_7  Error
class_1   22046    8916       8       0       4      17     653 0.3033
class_2    7650   34005     495       0      33     243      28 0.1990
class_3       0     544    4386      80       5     412       0 0.1918
class_4       0       1     172     184       0      44       0 0.5411
class_5       3    1339      43       0      13       7       0 0.9907
class_6       0     577    1325      16       2     713       0 0.7292
class_7    1270      38       0       0       0       0    1737 0.4296
Totals    30969   45420    6429     280      57    1436    2418 0.2750
                     Rate
class_1 =  9,598 / 31,644
class_2 =  8,449 / 42,454
class_3 =   1,041 / 5,427
class_4 =       217 / 401
class_5 =   1,392 / 1,405
class_6 =   1,920 / 2,633
class_7 =   1,308 / 3,045
Totals  = 23,925 / 87,009

try binomial model

D_binomial=D[D$Cover_Type %in% c("class_1","class_2"),]
h2o.setLevels(D_binomial$Cover_Type, c("class_1","class_2"))
  Cover_Type
1    class_1
2    class_1
3    class_2
4    class_2
5    class_1
6    class_2

[495141 rows x 1 column] 
#split to train/test/validation again
data_binomial<-h2o.splitFrame(D_binomial,ratio=c(.7,.15), destination_frames = c("train_b","test_b","valid_b"))
names(data_binomial)<-c("Train","Test","Valid")

run binomial model

m_binomial = h2o.glm(training_frame = data_binomial$Train, validation_frame = data_binomial$Valid, x = x, y = y, family='binomial',lambda=0)

  |                                                                               
  |                                                                         |   0%
  |                                                                               
  |=======                                                                  |  10%
  |                                                                               
  |=========================================================================| 100%
h2o.confusionMatrix(m_binomial, valid = FALSE)
Confusion Matrix (vertical: actual; across: predicted)  for max f1 @ threshold = 0.432283992854981:
        class_1 class_2    Error           Rate
class_1   95584   52666 0.355251  =52666/148250
class_2   26968  171431 0.135928  =26968/198399
Totals   122552  224097 0.229725  =79634/346649
h2o.confusionMatrix(m_binomial, valid = TRUE)
Confusion Matrix (vertical: actual; across: predicted)  for max f1 @ threshold = 0.421594496651137:
        class_1 class_2    Error          Rate
class_1   20062   11668 0.367728  =11668/31730
class_2    5539   36856 0.130652   =5539/42395
Totals    25601   48524 0.232135  =17207/74125

ROC curve

fpr = m_binomial@model$training_metrics@metrics$thresholds_and_metric_scores$fpr
tpr = m_binomial@model$training_metrics@metrics$thresholds_and_metric_scores$tpr
fpr_val = m_binomial@model$validation_metrics@metrics$thresholds_and_metric_scores$fpr
tpr_val = m_binomial@model$validation_metrics@metrics$thresholds_and_metric_scores$tpr
plot(fpr,tpr, type='l')
title('AUC')
lines(fpr_val,tpr_val,type='l',col='red')
legend("bottomright",c("Train", "Validation"),col=c("black","red"),lty=c(1,1),lwd=c(3,3))

h2o.auc(m_binomial,valid=FALSE) # on train                   
[1] 0.8487388
h2o.auc(m_binomial,valid=TRUE)  # on test
[1] 0.8488461

threshold

https://en.wikipedia.org/wiki/F1_score

m_binomial@model$training_metrics@metrics$max_criteria_and_metric_scores
Maximum Metrics: Maximum metrics at their respective thresholds
                        metric threshold    value idx
1                       max f1  0.432284 0.811515 236
2                       max f2  0.149571 0.885211 347
3                 max f0point5  0.646413 0.815667 156
4                 max accuracy  0.511930 0.776128 206
5                max precision  0.997803 1.000000   0
6                   max recall  0.005826 1.000000 399
7              max specificity  0.997803 1.000000   0
8             max absolute_mcc  0.548828 0.543831 192
9   max min_per_class_accuracy  0.562751 0.772560 187
10 max mean_per_class_accuracy  0.562751 0.773638 187

bins

# Bin a numeric column into a categorical "<col>_cut" column on every split.
#
# Break points come from a histogram of the training split only, keeping just
# the breaks whose bin holds more than 1000 rows. The same interior breaks and
# level names are then applied to the Test and Valid splits when present.
#
# Args:
#   data: list with a `Train` element and optional `Test` / `Valid` elements,
#         each indexable as frame[, col].
#   col:  name of the numeric column to bin.
#
# Returns:
#   `data`, with a column named paste(col, "_cut", sep = "") added to every
#   split that is present.
#
# Raises:
#   An error when no histogram bin of `col` has more than 1000 rows, because
#   no level names can be derived in that case.
cut_column <- function(data, col) {
  hist_info <- h2o.hist(data$Train[, col])
  # use only the breaks with enough support
  breaks <- hist_info$breaks[which(hist_info$counts > 1000)]
  n_breaks <- length(breaks)
  if (n_breaks == 0) {
    stop("cut_column: no histogram bin of '", col, "' has more than 1000 rows",
         call. = FALSE)
  }

  # c(lower, breaks, upper) defines n_breaks + 1 intervals, so we need
  # "min", one "i_<break>" for every break except the last, and "max".
  # seq_len() replaces `2:length(breaks)-1`, which parses as
  # (2:length(breaks)) - 1 and yields one level too many for a single break.
  inner <- breaks[seq_len(n_breaks - 1L)]
  lvls <- c("min", if (length(inner) > 0) paste("i_", inner, sep = ""), "max")
  col_cut <- paste(col, "_cut", sep = "")

  # Adds the binned column to one split, always using the training breaks.
  add_cut <- function(frame) {
    # need lower/upper bound due to h2o.cut behavior (points < the first break
    # or > the last break are replaced with missing value)
    # NOTE(review): the outer bounds come from this split's own range; if a
    # split does not reach past the first/last training break the break vector
    # is no longer increasing - confirm how h2o.cut treats that.
    lower <- min(frame[, col]) - 1
    upper <- max(frame[, col]) + 1
    frame[, col_cut] <- h2o.setLevels(
      h2o.cut(x = frame[, col], breaks = c(lower, breaks, upper)),
      lvls
    )
    frame
  }

  data$Train <- add_cut(data$Train)
  # now do the same for test and validation, but using the breaks computed on
  # the training!
  if (!is.null(data$Test)) {
    data$Test <- add_cut(data$Test)
  }
  if (!is.null(data$Valid)) {
    data$Valid <- add_cut(data$Valid)
  }
  data
}

make interaction

# Append categorical interaction columns to every split held in `data`.
#
# Args:
#   data:     list with a `Train` element and optional `Test` / `Valid`
#             elements.
#   cols:     column names passed to h2o.interaction() as `factors`.
#   pairwise: forwarded unchanged to h2o.interaction().
#
# Returns:
#   `data`, where each split that is present has the result of
#   h2o.interaction() column-bound onto it.
interactions <- function(data, cols, pairwise = TRUE) {
  # Builds the interaction columns for one split under the given frame key
  # and binds them to that split.
  with_interactions <- function(frame, key) {
    crossed <- h2o.interaction(
      data = frame,
      destination_frame = key,
      factors = cols,
      pairwise = pairwise,
      max_factors = 1000,
      min_occurrence = 100
    )
    h2o.cbind(frame, crossed)
  }

  data$Train <- with_interactions(data$Train, "itrain")
  if (!is.null(data$Test)) {
    data$Test <- with_interactions(data$Test, "itest")
  }
  if (!is.null(data$Valid)) {
    data$Valid <- with_interactions(data$Valid, "ivalid")
  }
  data
}

add features to our cover type example

# Extend a three-way split with binned columns and categorical interactions.
#
# Args:
#   data: list of three frames; its elements are renamed to "Train", "Test"
#         and "Valid" (in that order) before any processing.
#
# Returns:
#   The renamed list after cut_column() has been applied to eight numeric
#   columns and interactions() has been applied twice.
add_features <- function(data) {
  names(data) <- c("Train", "Test", "Valid")

  # Numeric columns to bin; cut_column() is applied to them in this order.
  numeric_cols <- c(
    'Elevation',
    'Hillshade_Noon',
    'Hillshade_9am',
    'Hillshade_3pm',
    'Horizontal_Distance_To_Hydrology',
    'Slope',
    'Horizontal_Distance_To_Roadways',
    'Aspect'
  )
  for (col_name in numeric_cols) {
    data <- cut_column(data, col_name)
  }

  # pairwise interactions between all categorical columns
  all_factor_cols <- c(
    "Elevation_cut", "Wilderness_Area", "Soil_Type",
    "Hillshade_Noon_cut", "Hillshade_9am_cut", "Hillshade_3pm_cut",
    "Horizontal_Distance_To_Hydrology_cut", "Slope_cut",
    "Horizontal_Distance_To_Roadways_cut", "Aspect_cut"
  )
  data <- interactions(data, all_factor_cols)

  # interactions between Hillshade columns
  hillshade_cols <- c("Hillshade_Noon_cut", "Hillshade_9am_cut", "Hillshade_3pm_cut")
  data <- interactions(data, hillshade_cols, pairwise = FALSE)

  data
}

add features

data_binomial_ext <- add_features(data_binomial)


  |                                                                                                                
  |                                                                                                          |   0%
  |                                                                                                                
  |=========                                                                                                 |   9%
  |                                                                                                                
  |==========================================================================================================| 100%

  |                                                                                                                
  |                                                                                                          |   0%
  |                                                                                                                
  |=====================                                                                                     |  20%
  |                                                                                                                
  |==========================================================================================================| 100%

  |                                                                                                                
  |                                                                                                          |   0%
  |                                                                                                                
  |===================                                                                                       |  18%
  |                                                                                                                
  |==========================================================================================================| 100%

  |                                                                                                                
  |                                                                                                          |   0%
  |                                                                                                                
  |==========================================================================================================| 100%

  |                                                                                                                
  |                                                                                                          |   0%
  |                                                                                                                
  |==========================================================================================================| 100%

  |                                                                                                                
  |                                                                                                          |   0%
  |                                                                                                                
  |==========================================================================================================| 100%

data_binomial_ext$Train <- h2o.assign(data_binomial_ext$Train,"train_b_ext")
data_binomial_ext$Valid <- h2o.assign(data_binomial_ext$Valid,"valid_b_ext")
data_binomial_ext$Test <- h2o.assign(data_binomial_ext$Test,"test_b_ext")
y = "Cover_Type"
x = names(data_binomial_ext$Train)
x = x[-which(x==y)]

build model

h2o.auc(m_binomial_1_ext,valid=TRUE)
[1] 0.9003605

try adjust lambda

m_binomial_2_ext = h2o.glm(training_frame = data_binomial_ext$Train, validation_frame = data_binomial_ext$Valid, x = x, y = y, family='binomial', solver='L_BFGS', lambda=1e-4)

  |                                                                              
  |                                                                        |   0%
  |                                                                              
  |=                                                                       |   1%
  |                                                                              
  |=                                                                       |   2%
  |                                                                              
  |==                                                                      |   2%
  |                                                                              
  |==                                                                      |   3%
  |                                                                              
  |===                                                                     |   4%
  |                                                                              
  |========================================================================| 100%
h2o.confusionMatrix(m_binomial_2_ext, valid=TRUE)
Confusion Matrix (vertical: actual; across: predicted)  for max f1 @ threshold = 0.436429755974436:
        class_1 class_2    Error          Rate
class_1   22404    9326 0.293917   =9326/31730
class_2    3946   38449 0.093077   =3946/42395
Totals    26350   47775 0.179049  =13272/74125
h2o.auc(m_binomial_2_ext,valid=TRUE)
[1] 0.9032734

try adjust other parameters

m_binomial_3_ext = h2o.glm(training_frame = data_binomial_ext$Train, validation_frame = data_binomial_ext$Valid, x = x, y = y, family='binomial', lambda_search=TRUE)

  |                                                                              
  |                                                                        |   0%
  |                                                                              
  |=====                                                                   |   7%
  |                                                                              
  |============                                                            |  16%
  |                                                                              
  |==================                                                      |  25%
  |                                                                              
  |=======================                                                 |  32%
  |                                                                              
  |===========================                                             |  38%
  |                                                                              
  |================================                                        |  44%
  |                                                                              
  |====================================                                    |  50%
  |                                                                              
  |=======================================                                 |  54%
  |                                                                              
  |==========================================                              |  58%
  |                                                                              
  |============================================                            |  61%
  |                                                                              
  |=============================================                           |  63%
  |                                                                              
  |================================================                        |  66%
  |                                                                              
  |=================================================                       |  68%
  |                                                                              
  |==================================================                      |  69%
  |                                                                              
  |====================================================                    |  72%
  |                                                                              
  |=====================================================                   |  73%
  |                                                                              
  |=====================================================                   |  74%
  |                                                                              
  |=======================================================                 |  76%
  |                                                                              
  |=======================================================                 |  77%
  |                                                                              
  |========================================================                |  78%
  |                                                                              
  |=========================================================               |  79%
  |                                                                              
  |==========================================================              |  80%
  |                                                                              
  |==========================================================              |  81%
  |                                                                              
  |===========================================================             |  82%
  |                                                                              
  |============================================================            |  83%
  |                                                                              
  |============================================================            |  84%
  |                                                                              
  |=============================================================           |  85%
  |                                                                              
  |==============================================================          |  86%
  |                                                                              
  |===============================================================         |  87%
  |                                                                              
  |===============================================================         |  88%
  |                                                                              
  |================================================================        |  89%
  |                                                                              
  |=================================================================       |  90%
  |                                                                              
  |==================================================================      |  91%
  |                                                                              
  |==================================================================      |  92%
  |                                                                              
  |===================================================================     |  93%
  |                                                                              
  |====================================================================    |  94%
  |                                                                              
  |====================================================================    |  95%
  |                                                                              
  |=====================================================================   |  96%
  |                                                                              
  |======================================================================  |  97%
  |                                                                              
  |======================================================================= |  98%
  |                                                                              
  |======================================================================= |  99%
  |                                                                              
  |========================================================================| 100%
h2o.confusionMatrix(m_binomial_3_ext, valid=TRUE)
Confusion Matrix (vertical: actual; across: predicted)  for max f1 @ threshold = 0.431418064129616:
        class_1 class_2    Error          Rate
class_1   22316    9414 0.296691   =9414/31730
class_2    3897   38498 0.091921   =3897/42395
Totals    26213   47912 0.179575  =13311/74125
h2o.auc(m_binomial_3_ext,valid=TRUE)
[1] 0.9038901

multinomial model 2

h2o.confusionMatrix(m2, valid=TRUE)
Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
        class_1 class_2 class_3 class_4 class_5 class_6 class_7  Error              Rate
class_1   23526    7668       4       0      26      20     551 0.2601 =  8,269 / 31,795
class_2    5283   36639     375       3     193     218      63 0.1434 =  6,135 / 42,774
class_3       0     393    4349     128       8     396       0 0.1754 =     925 / 5,274
class_4       0       1      83     288       0      19       0 0.2634 =       103 / 391
class_5      47     798      38       0     509       6       0 0.6359 =     889 / 1,398
class_6       6     423     812      40       2    1216       0 0.5134 =   1,283 / 2,499
class_7     590      37       0       0       0       0    2443 0.2042 =     627 / 3,070
Totals    29452   45959    5661     459     738    1875    3057 0.2091 = 18,231 / 87,201

https://github.com/h2oai/h2o-tutorials/blob/master/tutorials/glm/glm.md

---
title: "Prediction of forest coverage"
output: html_notebook
---


#load library
```{r}
# Attach the H2O R client.
library(h2o)
# Connect to an H2O cluster; a local one is started when none is running yet.
h2o.init()
# Remove every object currently held by the cluster so the notebook starts clean.
h2o.removeAll()
```

#import cover type data
```{r}
# Parse the cover-type CSV into an H2OFrame that lives on the cluster.
D <- h2o.importFile(
  path = "C:\\Users\\r631758\\Desktop\\r631758\\R codes\\H2O\\exercise\\covtype.full.csv",
  parse = TRUE
)
# Print per-column summary statistics of the imported frame.
h2o.summary(D)
# Local R data.frame copy of the whole H2OFrame.
D.R <- as.data.frame(D)
```

#split data
```{r}
# Split into 70% / 15% / remainder; the resulting frames are stored under the
# keys "train", "test" and "valid".
data <- h2o.splitFrame(
  D,
  ratios = c(0.7, 0.15),
  destination_frames = c("train", "test", "valid")
)
# Name the list elements so the splits can be addressed as data$Train etc.
names(data) <- c("Train", "Test", "Valid")

```

#multinomial model
```{r}
# Response column and predictors (every other column of the training frame).
y <- "Cover_Type"
x <- names(data$Train)
# Logical filter instead of x[-which(x == y)]: the negative-index form returns
# an empty vector when y is not among the column names.
x <- x[x != y]

# Fit a multinomial GLM with the L-BFGS solver and time the build.
start <- Sys.time()
glm1 <- h2o.glm(
  training_frame = data$Train,
  validation_frame = data$Valid,
  x = x,
  y = y,
  family = "multinomial",
  solver = "L_BFGS"
)
glm_time <- Sys.time() - start
print(paste("Took", round(glm_time, digits = 2), units(glm_time), "to build multinomial regression model."))

# Confusion matrix on the validation frame.
h2o.confusionMatrix(glm1, valid = TRUE)
```

#disable regularization of the glm model
#http://docs.h2o.ai/h2o/latest-stable/h2o-docs/data-science/algo-params/lambda.html
```{r}
# Same multinomial GLM as before, but with lambda = 0 (no regularization).
start <- Sys.time()
glm2 <- h2o.glm(
  training_frame = data$Train,
  validation_frame = data$Valid,
  x = x,
  y = y,
  family = "multinomial",
  solver = "L_BFGS",
  lambda = 0
)
glm_time <- Sys.time() - start
print(paste("Took", round(glm_time, digits = 2), units(glm_time), "to build multinomial regression model."))
h2o.confusionMatrix(glm2, valid = FALSE) # confusion matrix on the training data
h2o.confusionMatrix(glm2, valid = TRUE) # confusion matrix on the validation data

```

#try binomial model
```{r}
# Keep only the rows belonging to the two classes used for the binomial model.
D_binomial <- D[D$Cover_Type %in% c("class_1", "class_2"), ]
# Restrict the response to exactly two levels. The relabelled column is assigned
# back explicitly so the result does not depend on the call mutating the frame
# as a side effect.
D_binomial$Cover_Type <- h2o.setLevels(D_binomial$Cover_Type, c("class_1", "class_2"))
# Split to train/test/validation again (argument spelled out as `ratios`
# rather than relying on partial matching of `ratio`).
data_binomial <- h2o.splitFrame(
  D_binomial,
  ratios = c(0.7, 0.15),
  destination_frames = c("train_b", "test_b", "valid_b")
)
names(data_binomial) <- c("Train", "Test", "Valid")

```

#run binomial model
```{r}
# Unregularized (lambda = 0) binomial GLM on the two-class subset.
m_binomial <- h2o.glm(
  training_frame = data_binomial$Train,
  validation_frame = data_binomial$Valid,
  x = x,
  y = y,
  family = "binomial",
  lambda = 0
)
# Confusion matrices: training frame first, then validation frame.
h2o.confusionMatrix(m_binomial, valid = FALSE)
h2o.confusionMatrix(m_binomial, valid = TRUE)
```

#ROC curve
```{r}
# False/true positive rates per threshold, taken from the training metrics ...
fpr <- m_binomial@model$training_metrics@metrics$thresholds_and_metric_scores$fpr
tpr <- m_binomial@model$training_metrics@metrics$thresholds_and_metric_scores$tpr
# ... and from the validation metrics.
fpr_val <- m_binomial@model$validation_metrics@metrics$thresholds_and_metric_scores$fpr
tpr_val <- m_binomial@model$validation_metrics@metrics$thresholds_and_metric_scores$tpr

# Training curve in black, validation curve overlaid in red.
plot(fpr, tpr, type = "l")
title("AUC")
lines(fpr_val, tpr_val, type = "l", col = "red")
legend(
  "bottomright",
  c("Train", "Validation"),
  col = c("black", "red"),
  lty = c(1, 1),
  lwd = c(3, 3)
)

# Area under the curve for the training and the validation frame.
h2o.auc(m_binomial, valid = FALSE)
h2o.auc(m_binomial, valid = TRUE)
```

#threshold
#https://en.wikipedia.org/wiki/F1_score
```{r}
# Print the `max_criteria_and_metric_scores` table stored in the training
# metrics of the binomial model (presumably the best value of each metric and
# the threshold where it is reached, e.g. max F1 -- confirm against H2O docs).
m_binomial@model$training_metrics@metrics$max_criteria_and_metric_scores
```

#bins
```{r}
# Add a binned (categorical) copy of a numeric column to every split in `data`.
#
# Args:
#   data: list of H2OFrames with element "Train" and, optionally, "Test" and
#         "Valid".
#   col:  name of the numeric column to bin.
#
# Returns:
#   `data`, where each frame that is present gained a column "<col>_cut" whose
#   levels are "min", one "i_<break>" per interior interval, and "max".
#
# The cut points are always derived from the training frame, so the same
# intervals are used for all splits.
cut_column <- function(data, col) {
  # Keep only histogram break points whose bin holds more than 1000 training
  # rows, i.e. the breaks with enough support.
  train_hist <- h2o.hist(data$Train[, col])
  breaks <- train_hist$breaks[which(train_hist$counts > 1000)]
  if (length(breaks) == 0) {
    stop(
      "No histogram bin of '", col, "' holds more than 1000 training rows.",
      call. = FALSE
    )
  }

  # n breaks plus the two outer bounds delimit n + 1 intervals: "min", one
  # "i_<break>" for every break except the last, and "max". The explicit length
  # guard replaces breaks[2:length(breaks)-1], which produced one label too many
  # when only a single break survived the support filter.
  interior <- if (length(breaks) > 1) {
    paste0("i_", breaks[-length(breaks)])
  } else {
    character(0)
  }
  lvls <- c("min", interior, "max")
  col_cut <- paste0(col, "_cut")

  # Bin one frame with the training breaks and return it with the new column.
  bin_frame <- function(frame) {
    # h2o.cut replaces points below the first or above the last break with
    # missing values, so pad the breaks with bounds just outside this frame's
    # own range.
    min_val <- min(frame[, col]) - 1
    max_val <- max(frame[, col]) + 1
    frame[, col_cut] <- h2o.setLevels(
      h2o.cut(x = frame[, col], breaks = c(min_val, breaks, max_val)),
      lvls
    )
    frame
  }

  data$Train <- bin_frame(data$Train)
  if (!is.null(data$Test)) {
    data$Test <- bin_frame(data$Test)
  }
  if (!is.null(data$Valid)) {
    data$Valid <- bin_frame(data$Valid)
  }
  data
}
```

#make interaction
```{r}
# Append categorical interaction columns to every split in `data`.
#
# Args:
#   data:     list of H2OFrames with element "Train" and, optionally, "Test"
#             and "Valid".
#   cols:     names of the factor columns to combine.
#   pairwise: passed to h2o.interaction(); TRUE by default.
#
# Returns:
#   `data`, with the interaction columns column-bound to each frame present.
interactions <- function(data, cols, pairwise = TRUE) {
  # Build the interaction frame for one split under `key` and bind it on.
  with_interactions <- function(frame, key) {
    crossed <- h2o.interaction(
      data = frame,
      destination_frame = key,
      factors = cols,
      pairwise = pairwise,
      max_factors = 1000,
      min_occurrence = 100
    )
    h2o.cbind(frame, crossed)
  }

  data$Train <- with_interactions(data$Train, "itrain")
  if (!is.null(data$Test)) {
    data$Test <- with_interactions(data$Test, "itest")
  }
  if (!is.null(data$Valid)) {
    data$Valid <- with_interactions(data$Valid, "ivalid")
  }
  data
}
```

#add features to our cover type example
```{r}
# Extend every split in `data` with binned numeric columns and interaction
# features.
#
# Args:
#   data: list of three H2OFrames, in the order train, test, validation.
#
# Returns:
#   The list (renamed to "Train", "Test", "Valid") with the "<col>_cut" columns
#   and the interaction columns added to each frame.
add_features <- function(data) {
  names(data) <- c("Train", "Test", "Valid")

  # Bin each numeric column, one after the other.
  numeric_cols <- c(
    "Elevation",
    "Hillshade_Noon",
    "Hillshade_9am",
    "Hillshade_3pm",
    "Horizontal_Distance_To_Hydrology",
    "Slope",
    "Horizontal_Distance_To_Roadways",
    "Aspect"
  )
  for (numeric_col in numeric_cols) {
    data <- cut_column(data, numeric_col)
  }

  # Pairwise interactions between all categorical columns.
  all_factor_cols <- c(
    "Elevation_cut",
    "Wilderness_Area",
    "Soil_Type",
    "Hillshade_Noon_cut",
    "Hillshade_9am_cut",
    "Hillshade_3pm_cut",
    "Horizontal_Distance_To_Hydrology_cut",
    "Slope_cut",
    "Horizontal_Distance_To_Roadways_cut",
    "Aspect_cut"
  )
  data <- interactions(data, all_factor_cols)

  # One joint (non-pairwise) interaction across the three Hillshade columns.
  hillshade_cols <- c("Hillshade_Noon_cut", "Hillshade_9am_cut", "Hillshade_3pm_cut")
  data <- interactions(data, hillshade_cols, pairwise = FALSE)

  data
}
```
#add features

```{r}
# Build the extended feature set for the binomial splits and store each
# extended frame under an explicit key.
data_binomial_ext <- add_features(data_binomial)
data_binomial_ext$Train <- h2o.assign(data_binomial_ext$Train, "train_b_ext")
data_binomial_ext$Valid <- h2o.assign(data_binomial_ext$Valid, "valid_b_ext")
data_binomial_ext$Test <- h2o.assign(data_binomial_ext$Test, "test_b_ext")

# Predictors are all columns of the extended training frame except the response.
y <- "Cover_Type"
x <- names(data_binomial_ext$Train)
# Logical filter instead of x[-which(x == y)]: the negative-index form returns
# an empty vector when y is not among the column names.
x <- x[x != y]
```

#build model
```{r}
# Binomial GLM on the extended features. The build is wrapped in try() so a
# failure does not abort the notebook.
m_binomial_1_ext <- try(h2o.glm(
  training_frame = data_binomial_ext$Train,
  validation_frame = data_binomial_ext$Valid,
  x = x,
  y = y,
  family = "binomial",
  solver = "L_BFGS"
))
# try() returns a "try-error" object on failure; only query metrics when a
# model was actually built, instead of failing later with an unrelated error.
if (inherits(m_binomial_1_ext, "try-error")) {
  message("m_binomial_1_ext could not be built; skipping its metrics.")
} else {
  # Explicit print(): values are not auto-printed inside a braced block.
  print(h2o.confusionMatrix(m_binomial_1_ext))
  print(h2o.auc(m_binomial_1_ext, valid = TRUE))
}
```

#try adjust lambda
```{r}
# Same extended binomial model, now with an explicit lambda of 1e-4.
m_binomial_2_ext <- h2o.glm(
  training_frame = data_binomial_ext$Train,
  validation_frame = data_binomial_ext$Valid,
  x = x,
  y = y,
  family = "binomial",
  solver = "L_BFGS",
  lambda = 1e-4
)
# Validation confusion matrix and AUC.
h2o.confusionMatrix(m_binomial_2_ext, valid = TRUE)
h2o.auc(m_binomial_2_ext, valid = TRUE)
```

#try adjust other parameters
```{r}
# Extended binomial model with lambda search enabled and no solver specified.
m_binomial_3_ext <- h2o.glm(
  training_frame = data_binomial_ext$Train,
  validation_frame = data_binomial_ext$Valid,
  x = x,
  y = y,
  family = "binomial",
  lambda_search = TRUE
)
# Validation confusion matrix and AUC.
h2o.confusionMatrix(m_binomial_3_ext, valid = TRUE)
h2o.auc(m_binomial_3_ext, valid = TRUE)
```

#multinomial model 2
```{r}
# Revisit the multinomial case with the new features: extend the original
# splits and store each extended frame under an explicit key.
data_ext <- add_features(data)
data_ext$Train <- h2o.assign(data_ext$Train, "train_m_ext")
data_ext$Valid <- h2o.assign(data_ext$Valid, "valid_m_ext")
data_ext$Test <- h2o.assign(data_ext$Test, "test_m_ext")

# Predictors are all columns of the extended training frame except the response.
y <- "Cover_Type"
x <- names(data_ext$Train)
# Logical filter instead of x[-which(x == y)]: the negative-index form returns
# an empty vector when y is not among the column names.
x <- x[x != y]

m2 <- h2o.glm(
  training_frame = data_ext$Train,
  validation_frame = data_ext$Valid,
  x = x,
  y = y,
  family = "multinomial",
  solver = "L_BFGS",
  lambda = 1e-4
)
# 21% err down from 28%
h2o.confusionMatrix(m2, valid = TRUE)

```

#https://github.com/h2oai/h2o-tutorials/blob/master/tutorials/glm/glm.md
