=======
Classification of Abalone
=======
# Load the abalone data set; empty strings and "NA" are both read as missing.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
abalone <- read.csv("abalone.csv", header = TRUE, na.strings = c("", "NA"))
Warning message:
package ‘h2o’ was built under R version 3.4.4
# Attach dplyr quietly (its startup messages and warnings are suppressed).
suppressWarnings(suppressMessages(library(dplyr)))

# Derive two columns:
#   sex_num - numeric encoding of sex: M = 0, F = 1, I = 2 (anything else -> NA)
#   age     - age class from the ring count: 1-5 "young", 6-13 "adult",
#             14-30 "old" (ring counts outside 1-30 -> NA)
abalone <- abalone %>%
  mutate(
    sex_num = case_when(
      sex %in% "M" ~ 0,
      sex %in% "F" ~ 1,
      sex %in% "I" ~ 2
    ),
    age = case_when(
      rings %in% 1:5 ~ "young",
      rings %in% 6:13 ~ "adult",
      rings %in% 14:30 ~ "old"
    )
  )

# Remove the raw sex and rings columns. They are dropped by name rather than by
# position so a change in the CSV column order cannot remove the wrong columns.
abalone <- abalone %>%
  select(-sex, -rings)

# age is categorical, so store it as a factor.
abalone$age <- as.factor(abalone$age)
# split data: reproducible random 50/50 split of the rows into train and test
set.seed(100)
n <- nrow(abalone)
# seq_len(n) stays empty when n is 0, whereas 1:n would yield c(1, 0).
trainIndex <- sample(seq_len(n), size = round(0.5 * n), replace = FALSE)
train <- abalone[trainIndex, ]
# Select the complement explicitly: negative indexing with an empty trainIndex
# would return zero rows instead of every row.
test <- abalone[setdiff(seq_len(n), trainIndex), ]
# Attach the h2o package without its startup chatter, then initialise H2O.
suppressMessages(suppressWarnings(library(h2o)))
h2o.init()
H2O is not running yet, starting it now...
Note: In case of errors look at the following log files:
C:\Users\jkkli\AppData\Local\Temp\Rtmpu4lNtf/h2o_jkkli_started_from_r.out
C:\Users\jkkli\AppData\Local\Temp\Rtmpu4lNtf/h2o_jkkli_started_from_r.err
java version "1.8.0_171"
Java(TM) SE Runtime Environment (build 1.8.0_171-b11)
Java HotSpot(TM) 64-Bit Server VM (build 25.171-b11, mixed mode)
Starting H2O JVM and connecting: . Connection successful!
R is connected to the H2O cluster:
H2O cluster uptime: 2 seconds 226 milliseconds
H2O cluster timezone: Asia/Singapore
H2O data parsing timezone: UTC
H2O cluster version: 3.18.0.8
H2O cluster version age: 16 days
H2O cluster name: H2O_started_from_R_jkkli_jzr413
H2O cluster total nodes: 1
H2O cluster total memory: 7.09 GB
H2O cluster total cores: 8
H2O cluster allowed cores: 8
H2O cluster healthy: TRUE
H2O Connection ip: localhost
H2O Connection port: 54321
H2O Connection proxy: NA
H2O Internal Security: FALSE
H2O API Extensions: Algos, AutoML, Core V3, Core V4
R Version: R version 3.4.2 (2017-09-28)
# Convert the training split to an H2O frame; it is passed to h2o.gbm() as
# training_frame further down.
train.h2o <- as.h2o(train)
|
| | 0%
|
|===================================================================================================================================| 100%
# Convert the test split to an H2O frame as well.
test.h2o <- as.h2o(test)
|
| | 0%
|
|===================================================================================================================================| 100%
### Column positions used as response and predictors
y.dep <- 9 # response: the age factor
x.indep <- 1:8 # predictors: the other columns

# GBM: fit on the training frame; system.time() reports how long the fit takes
system.time(
  gbm.model <- h2o.gbm(
    x = x.indep,
    y = y.dep,
    training_frame = train.h2o,
    ntrees = 1000,
    max_depth = 4,
    learn_rate = 0.01,
    seed = 1122
  )
)
|
| | 0%
|
|================== | 14%
|
|============================================== | 35%
|
|======================================================================= | 55%
|
|==================================================================================================== | 76%
|
|===================================================================================================================================| 100%
user system elapsed
0.11 0.00 5.49
Calculate MSE, Accuracy for TRAIN model
# MSE reported by H2O for gbm.model (the model fitted on train.h2o)
h2o.mse(gbm.model)
[1] 0.04423702
# Accuracy = correctly classified rows (confusion-matrix diagonal) / all rows.
# The three counts are entered by hand; presumably they are the diagonal of the
# training confusion matrix - verify against a fresh run.
sum(c(1736, 154, 64)) / 2089
[1] 0.9353758
Predict against Test data
# Evaluate the model fitted on the training split against the held-out test
# frame. The previous version called h2o.gbm() with training_frame = test.h2o,
# which fits a second model on the test data and reports that model's own
# training metrics rather than the test performance of gbm.model.
# gbm.model.test now holds the metrics object returned by h2o.performance();
# h2o.confusionMatrix() and h2o.mse() accept it directly.
# NOTE(review): the output and the hand-copied counts recorded after this line
# were produced by the earlier refit - re-run the notebook to refresh them.
gbm.model.test <- h2o.performance(gbm.model, newdata = test.h2o)
|
| | 0%
|
|=== | 2%
|
|============================================= | 34%
|
|======================================================================= | 55%
|
|============================================================================================= | 71%
|
|==================================================================================================================================| 100%
# Confusion matrix for gbm.model.test (rows: actual class, columns: predicted class)
h2o.confusionMatrix(gbm.model.test)
Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
adult old young Error Rate
adult 1749 2 4 0.0034 = 6 / 1,755
old 83 141 0 0.3705 = 83 / 224
young 13 0 97 0.1182 = 13 / 110
Totals 1845 143 101 0.0488 = 102 / 2,089
Calculate MSE, Accuracy for TEST model
# MSE reported by H2O for gbm.model.test
h2o.mse(gbm.model.test)
[1] 0.03828092
# Accuracy = correctly classified rows (confusion-matrix diagonal) / all rows.
# The counts are copied by hand from the confusion matrix printed above:
# 1749 adult + 141 old + 97 young correct out of 2,089 rows.
sum(c(1749, 141, 97)) / 2089
[1] 0.9511728
LS0tDQp0aXRsZTogIk5vdGVib29rIC0gQWJhbG9uZSB1c2luZyBIMjAgYWxnb3JpdGhtIg0Kb3V0cHV0OiBodG1sX25vdGVib29rDQotLS0NCg0KIyA9PT09PT09DQojIENsYXNzaWZpY2F0aW9uIG9mIEFiYWxvbmUNCiMgPT09PT09PQ0KDQoNCmBgYHtyfQ0KYWJhbG9uZSA8LSByZWFkLmNzdigiYWJhbG9uZS5jc3YiLCBoZWFkZXIgPVQsIG5hLnN0cmluZ3M9YygiIiwiTkEiKSkNCg0KDQpzdXBwcmVzc1dhcm5pbmdzKHN1cHByZXNzTWVzc2FnZXMobGlicmFyeShkcGx5cikpKQ0KDQoNCiNjcmVhdGUgYW4gc2V4IGNvbHVtbiB0aGF0IGlzIG51bWVyaWMNCmFiYWxvbmUgPC0gYWJhbG9uZSAlPiUNCiAgbXV0YXRlKHNleF9udW0gPWNhc2Vfd2hlbigNCiAgICBzZXggJWluJSAnTScgfiAwLA0KICAgIHNleCAlaW4lICJGIiAgfiAxLA0KICAgIHNleCAlaW4lICJJIiB+IDINCiAgKSkNCg0KI2NyZWF0ZSBhbiBhZ2UgY29sdW1uIHdpdGggeW91bmcgPSAwLCBhZHVsdCA9IDEsIG9sZD0yDQphYmFsb25lIDwtIGFiYWxvbmUgJT4lDQogIG11dGF0ZShhZ2U9Y2FzZV93aGVuKA0KICAgIHJpbmdzICVpbiUgMTo1IH4gInlvdW5nIiwNCiAgICByaW5ncyAlaW4lIDY6MTMgfiAiYWR1bHQiLA0KICAgIHJpbmdzICVpbiUgMTQ6MzAgfiAib2xkIg0KICApKQ0KDQoNCiMgcmVtb3ZlIHJpbmdzLCBzZXgNCmFiYWxvbmUgPC0gYWJhbG9uZVtjKC0xLCAtOSldDQphYmFsb25lJGFnZSA8LSBhcy5mYWN0b3IoYWJhbG9uZSRhZ2UpDQoNCiMgc3BsaXQgZGF0YQ0Kc2V0LnNlZWQoMTAwKQ0KDQpuID0gbnJvdyhhYmFsb25lKQ0KdHJhaW5JbmRleCA9IHNhbXBsZSgxOm4sIHNpemUgPSByb3VuZCgwLjUqbiksIHJlcGxhY2U9RkFMU0UpDQp0cmFpbiA9IGFiYWxvbmVbdHJhaW5JbmRleCAsXQ0KdGVzdCA9IGFiYWxvbmVbLXRyYWluSW5kZXggLF0NCmBgYA0KDQpgYGB7cn0NCnN1cHByZXNzV2FybmluZ3Moc3VwcHJlc3NNZXNzYWdlcyhsaWJyYXJ5KGgybykpKQ0KaDJvLmluaXQoKQ0KDQojY29udmVydCB0byBIMk8gZnJhbWUNCnRyYWluLmgybyA8LSBhcy5oMm8odHJhaW4pDQp0ZXN0LmgybyA8LSBhcy5oMm8odGVzdCkNCg0KIyMjIHZhbHVlcyBiZWxvdyBmb3IgY29sdW1ucw0KeS5kZXAgPC0gOSAjIGludGVyZXN0ZWQgaW4gYWdlICBmYWN0b3INCnguaW5kZXAgPC0gYygxOjgpICNvdGhlciBpbmRlcGVuZGVudCB2YXJpYWJsZXMNCg0KI0dCTQ0Kc3lzdGVtLnRpbWUoDQogIGdibS5tb2RlbCA8LSBoMm8uZ2JtKHk9eS5kZXAsIHg9eC5pbmRlcCwgdHJhaW5pbmdfZnJhbWUgPSB0cmFpbi5oMm8sIG50cmVlcyA9IDEwMDAsIG1heF9kZXB0aCA9IDQsIGxlYXJuX3JhdGUgPSAwLjAxLCBzZWVkID0gMTEyMikNCikgIA0KDQoNCmBgYA0KDQojIFBlcmZvcm1hbmNlIG9mIHRoZSBtb2RlbA0KYGBge3J9DQpwZXJmIDwtIGgyby5wZXJmb3JtYW5jZShnYm0ubW9kZWwpDQpoMm8uY29uZnVzaW9uTWF0cml4KHBlcmYpDQpgYGANCg0KIyBDYWxjdWxhdGUgTVNFLCBBY2N1cmFj
eSBmb3IgVFJBSU4gbW9kZWwNCmBgYHtyfQ0KaDJvLm1zZShnYm0ubW9kZWwpDQoNCiMgYWNjdXJhY3kgPSB0cC90cCt0biBzdGF0ZWQgYWJvdmUNCigxNzM2KzE1NCs2NCkvMjA4OQ0KDQpgYGANCg0KDQojIFByZWRpY3QgYWdhaW5zdCBUZXN0IGRhdGENCmBgYHtyfQ0KIyBwcmVkaWN0IGFnYWluc3QgdGVzdCBkYXRhDQpnYm0ubW9kZWwudGVzdCA8LSBoMm8uZ2JtKHk9eS5kZXAsIHg9eC5pbmRlcCwgdHJhaW5pbmdfZnJhbWUgPSB0ZXN0LmgybywgbnRyZWVzID0gMTAwMCwgbWF4X2RlcHRoID0gNCwgbGVhcm5fcmF0ZSA9IDAuMDEsIHNlZWQgPSAxMTIyKQ0KDQpoMm8uY29uZnVzaW9uTWF0cml4KGdibS5tb2RlbC50ZXN0KQ0KYGBgDQoNCiMgQ2FsY3VsYXRlIE1TRSwgQWNjdXJhY3kgZm9yIFRFU1QgbW9kZWwNCmBgYHtyfQ0KI21zZQ0KaDJvLm1zZShnYm0ubW9kZWwudGVzdCkNCg0KIyBhY2N1cmFjeSA9IHRwL3RwK3RuIHN0YXRlZCBhYm92ZQ0KKDE3NDkrMTQxKzk3KS8yMDg5DQoNCmBgYA0K