=======

Classification of Abalone

=======

# Load the abalone data set; empty strings and "NA" are both read as missing.
abalone <- read.csv("abalone.csv", header = TRUE, na.strings = c("", "NA"))
Warning message:
package ‘h2o’ was built under R version 3.4.4 
suppressWarnings(suppressMessages(library(dplyr)))
# Derive two columns in a single mutate() call:
#   sex_num - numeric code for sex: M = 0, F = 1, I = 2 (NA for anything else)
#   age     - age band from the ring count: "young" (1-5 rings),
#             "adult" (6-13 rings) or "old" (14-30 rings); NA outside 1-30
abalone <- mutate(
  abalone,
  sex_num = case_when(
    sex %in% "M" ~ 0,
    sex %in% "F" ~ 1,
    sex %in% "I" ~ 2
  ),
  age = case_when(
    rings %in% 1:5 ~ "young",
    rings %in% 6:13 ~ "adult",
    rings %in% 14:30 ~ "old"
  )
)
# Drop the raw sex and rings columns by name (sex_num and age were derived
# from them). Selecting by name rather than by position avoids silently
# removing the wrong columns if the CSV layout differs.
abalone <- abalone[!(names(abalone) %in% c("sex", "rings"))]
# Store the age band as a factor; it is the response column for the model.
abalone$age <- as.factor(abalone$age)
# Split the rows 50/50 into train and test sets; the fixed seed makes the
# random split reproducible.
set.seed(100)
n <- nrow(abalone)
# seq_len(n) is the safe form of 1:n (it is empty when n is 0).
trainIndex <- sample(seq_len(n), size = round(0.5 * n), replace = FALSE)
train <- abalone[trainIndex, ]
test <- abalone[-trainIndex, ]
# Load h2o with its startup messages and warnings suppressed, then connect to
# the H2O cluster (one is started if none is running yet).
suppressWarnings(suppressMessages(library(h2o)))
h2o.init()

H2O is not running yet, starting it now...

Note:  In case of errors look at the following log files:
    C:\Users\jkkli\AppData\Local\Temp\Rtmpu4lNtf/h2o_jkkli_started_from_r.out
    C:\Users\jkkli\AppData\Local\Temp\Rtmpu4lNtf/h2o_jkkli_started_from_r.err

java version "1.8.0_171"
Java(TM) SE Runtime Environment (build 1.8.0_171-b11)
Java HotSpot(TM) 64-Bit Server VM (build 25.171-b11, mixed mode)

Starting H2O JVM and connecting: . Connection successful!

R is connected to the H2O cluster: 
    H2O cluster uptime:         2 seconds 226 milliseconds 
    H2O cluster timezone:       Asia/Singapore 
    H2O data parsing timezone:  UTC 
    H2O cluster version:        3.18.0.8 
    H2O cluster version age:    16 days  
    H2O cluster name:           H2O_started_from_R_jkkli_jzr413 
    H2O cluster total nodes:    1 
    H2O cluster total memory:   7.09 GB 
    H2O cluster total cores:    8 
    H2O cluster allowed cores:  8 
    H2O cluster healthy:        TRUE 
    H2O Connection ip:          localhost 
    H2O Connection port:        54321 
    H2O Connection proxy:       NA 
    H2O Internal Security:      FALSE 
    H2O API Extensions:         Algos, AutoML, Core V3, Core V4 
    R Version:                  R version 3.4.2 (2017-09-28) 
# Convert the training data frame to an H2O frame
train.h2o <- as.h2o(train)

  |                                                                                                                                         
  |                                                                                                                                   |   0%
  |                                                                                                                                         
  |===================================================================================================================================| 100%
# Convert the test data frame to an H2O frame
test.h2o <- as.h2o(test)

  |                                                                                                                                         
  |                                                                                                                                   |   0%
  |                                                                                                                                         
  |===================================================================================================================================| 100%
# Column positions in the H2O frames: column 9 (the age factor) is the
# response and columns 1-8 are the predictors.
y.dep <- 9
x.indep <- 1:8
# GBM: fit on the training frame and report how long the fit takes
system.time(
  gbm.model <- h2o.gbm(
    y = y.dep,
    x = x.indep,
    training_frame = train.h2o,
    ntrees = 1000,
    max_depth = 4,
    learn_rate = 0.01,
    seed = 1122
  )
)

  |                                                                                                                                         
  |                                                                                                                                   |   0%
  |                                                                                                                                         
  |==================                                                                                                                 |  14%
  |                                                                                                                                         
  |==============================================                                                                                     |  35%
  |                                                                                                                                         
  |=======================================================================                                                            |  55%
  |                                                                                                                                         
  |====================================================================================================                               |  76%
  |                                                                                                                                         
  |===================================================================================================================================| 100%
   user  system elapsed 
   0.11    0.00    5.49 

Performance of the model

# Performance metrics for gbm.model. No new data is passed, so these are
# presumably the training-set metrics — TODO confirm against the h2o docs.
perf <- h2o.performance(gbm.model)
# Confusion matrix of actual vs. predicted age class
h2o.confusionMatrix(perf)
Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
       adult old young  Error          Rate
adult   1736   5     2 0.0040 =   7 / 1,743
old      102 164     0 0.3835 =   102 / 266
young     15   0    64 0.1899 =     15 / 79
Totals  1853 169    66 0.0594 = 124 / 2,088

Calculate MSE, Accuracy for TRAIN model

# Mean squared error reported by H2O for gbm.model
h2o.mse(gbm.model)
[1] 0.04423702
# accuracy = correctly classified rows (diagonal of the confusion matrix
# above: 1736 adult + 164 old + 64 young) / total rows (2,088)
(1736 + 164 + 64) / 2088
[1] 0.940613

Predict against Test data

# NOTE(review): this does not predict with gbm.model. It fits a second GBM
# with test.h2o as the training frame, so the metrics reported for
# gbm.model.test are not held-out results for gbm.model. To score gbm.model
# on the test split, use h2o.performance(gbm.model, newdata = test.h2o).
gbm.model.test <- h2o.gbm(y=y.dep, x=x.indep, training_frame = test.h2o, ntrees = 1000, max_depth = 4, learn_rate = 0.01, seed = 1122)

  |                                                                                                                                        
  |                                                                                                                                  |   0%
  |                                                                                                                                        
  |===                                                                                                                               |   2%
  |                                                                                                                                        
  |=============================================                                                                                     |  34%
  |                                                                                                                                        
  |=======================================================================                                                           |  55%
  |                                                                                                                                        
  |=============================================================================================                                     |  71%
  |                                                                                                                                        
  |==================================================================================================================================| 100%
# Confusion matrix for gbm.model.test (the model fitted on test.h2o)
h2o.confusionMatrix(gbm.model.test)
Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
       adult old young  Error          Rate
adult   1749   2     4 0.0034 =   6 / 1,755
old       83 141     0 0.3705 =    83 / 224
young     13   0    97 0.1182 =    13 / 110
Totals  1845 143   101 0.0488 = 102 / 2,089

Calculate MSE, Accuracy for TEST model

# Mean squared error reported by H2O for gbm.model.test
h2o.mse(gbm.model.test)
[1] 0.03828092
# accuracy = correctly classified rows (diagonal of the confusion matrix
# above: 1749 adult + 141 old + 97 young) / total rows (2,089)
(1749 + 141 + 97) / 2089
[1] 0.9511728
LS0tDQp0aXRsZTogIk5vdGVib29rIC0gQWJhbG9uZSB1c2luZyBIMjAgYWxnb3JpdGhtIg0Kb3V0cHV0OiBodG1sX25vdGVib29rDQotLS0NCg0KIyA9PT09PT09DQojIENsYXNzaWZpY2F0aW9uIG9mIEFiYWxvbmUNCiMgPT09PT09PQ0KDQoNCmBgYHtyfQ0KYWJhbG9uZSA8LSByZWFkLmNzdigiYWJhbG9uZS5jc3YiLCBoZWFkZXIgPVQsIG5hLnN0cmluZ3M9YygiIiwiTkEiKSkNCg0KDQpzdXBwcmVzc1dhcm5pbmdzKHN1cHByZXNzTWVzc2FnZXMobGlicmFyeShkcGx5cikpKQ0KDQoNCiNjcmVhdGUgYW4gc2V4IGNvbHVtbiB0aGF0IGlzIG51bWVyaWMNCmFiYWxvbmUgPC0gYWJhbG9uZSAlPiUNCiAgbXV0YXRlKHNleF9udW0gPWNhc2Vfd2hlbigNCiAgICBzZXggJWluJSAnTScgfiAwLA0KICAgIHNleCAlaW4lICJGIiAgfiAxLA0KICAgIHNleCAlaW4lICJJIiB+IDINCiAgKSkNCg0KI2NyZWF0ZSBhbiBhZ2UgY29sdW1uIHdpdGggeW91bmcgPSAwLCBhZHVsdCA9IDEsIG9sZD0yDQphYmFsb25lIDwtIGFiYWxvbmUgJT4lDQogIG11dGF0ZShhZ2U9Y2FzZV93aGVuKA0KICAgIHJpbmdzICVpbiUgMTo1IH4gInlvdW5nIiwNCiAgICByaW5ncyAlaW4lIDY6MTMgfiAiYWR1bHQiLA0KICAgIHJpbmdzICVpbiUgMTQ6MzAgfiAib2xkIg0KICApKQ0KDQoNCiMgcmVtb3ZlIHJpbmdzLCBzZXgNCmFiYWxvbmUgPC0gYWJhbG9uZVtjKC0xLCAtOSldDQphYmFsb25lJGFnZSA8LSBhcy5mYWN0b3IoYWJhbG9uZSRhZ2UpDQoNCiMgc3BsaXQgZGF0YQ0Kc2V0LnNlZWQoMTAwKQ0KDQpuID0gbnJvdyhhYmFsb25lKQ0KdHJhaW5JbmRleCA9IHNhbXBsZSgxOm4sIHNpemUgPSByb3VuZCgwLjUqbiksIHJlcGxhY2U9RkFMU0UpDQp0cmFpbiA9IGFiYWxvbmVbdHJhaW5JbmRleCAsXQ0KdGVzdCA9IGFiYWxvbmVbLXRyYWluSW5kZXggLF0NCmBgYA0KDQpgYGB7cn0NCnN1cHByZXNzV2FybmluZ3Moc3VwcHJlc3NNZXNzYWdlcyhsaWJyYXJ5KGgybykpKQ0KaDJvLmluaXQoKQ0KDQojY29udmVydCB0byBIMk8gZnJhbWUNCnRyYWluLmgybyA8LSBhcy5oMm8odHJhaW4pDQp0ZXN0LmgybyA8LSBhcy5oMm8odGVzdCkNCg0KIyMjIHZhbHVlcyBiZWxvdyBmb3IgY29sdW1ucw0KeS5kZXAgPC0gOSAjIGludGVyZXN0ZWQgaW4gYWdlICBmYWN0b3INCnguaW5kZXAgPC0gYygxOjgpICNvdGhlciBpbmRlcGVuZGVudCB2YXJpYWJsZXMNCg0KI0dCTQ0Kc3lzdGVtLnRpbWUoDQogIGdibS5tb2RlbCA8LSBoMm8uZ2JtKHk9eS5kZXAsIHg9eC5pbmRlcCwgdHJhaW5pbmdfZnJhbWUgPSB0cmFpbi5oMm8sIG50cmVlcyA9IDEwMDAsIG1heF9kZXB0aCA9IDQsIGxlYXJuX3JhdGUgPSAwLjAxLCBzZWVkID0gMTEyMikNCikgIA0KDQoNCmBgYA0KDQojIFBlcmZvcm1hbmNlIG9mIHRoZSBtb2RlbA0KYGBge3J9DQpwZXJmIDwtIGgyby5wZXJmb3JtYW5jZShnYm0ubW9kZWwpDQpoMm8uY29uZnVzaW9uTWF0cml4KHBlcmYpDQpgYGANCg0KIyBDYWxjdWxhdGUgTVNFLCBBY2N1cmFj
eSBmb3IgVFJBSU4gbW9kZWwNCmBgYHtyfQ0KaDJvLm1zZShnYm0ubW9kZWwpDQoNCiMgYWNjdXJhY3kgPSB0cC90cCt0biBzdGF0ZWQgYWJvdmUNCigxNzM2KzE1NCs2NCkvMjA4OQ0KDQpgYGANCg0KDQojIFByZWRpY3QgYWdhaW5zdCBUZXN0IGRhdGENCmBgYHtyfQ0KIyBwcmVkaWN0IGFnYWluc3QgdGVzdCBkYXRhDQpnYm0ubW9kZWwudGVzdCA8LSBoMm8uZ2JtKHk9eS5kZXAsIHg9eC5pbmRlcCwgdHJhaW5pbmdfZnJhbWUgPSB0ZXN0LmgybywgbnRyZWVzID0gMTAwMCwgbWF4X2RlcHRoID0gNCwgbGVhcm5fcmF0ZSA9IDAuMDEsIHNlZWQgPSAxMTIyKQ0KDQpoMm8uY29uZnVzaW9uTWF0cml4KGdibS5tb2RlbC50ZXN0KQ0KYGBgDQoNCiMgQ2FsY3VsYXRlIE1TRSwgQWNjdXJhY3kgZm9yIFRFU1QgbW9kZWwNCmBgYHtyfQ0KI21zZQ0KaDJvLm1zZShnYm0ubW9kZWwudGVzdCkNCg0KIyBhY2N1cmFjeSA9IHRwL3RwK3RuIHN0YXRlZCBhYm92ZQ0KKDE3NDkrMTQxKzk3KS8yMDg5DQoNCmBgYA0K