H2O를 활용한 Generalized Additive Models (GAM)
[참조 1] http://docs.h2o.ai/h2o/latest-stable/h2o-docs/data-science/gam.html
일반화가법모델(GAMs)은 가산성을 유지하면서 각 변수의 비선형함수를 허용하여 표준선형모델을 확장한다.
선형모델과 마찬가지로 GAMs는 질적 및 양적 반응변수 모두에 적용할 수 있다.
## Warning: package 'h2o' was built under R version 4.0.3
## Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 5 hours 35 minutes
## H2O cluster timezone: Asia/Seoul
## H2O data parsing timezone: UTC
## H2O cluster version: 3.32.0.1
## H2O cluster version age: 25 days
## H2O cluster name: H2O_started_from_R_user_uho906
## H2O cluster total nodes: 1
## H2O cluster total memory: 3.96 GB
## H2O cluster total cores: 4
## H2O cluster allowed cores: 4
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## H2O API Extensions: Amazon S3, Algos, AutoML, Core V3, TargetEncoder, Core V4
## R Version: R version 4.0.2 (2020-06-22)
# Knot locations for the first GAM column, uploaded to the H2O cluster
# as a one-column frame so its key can be passed to the model later.
knots1 <- c(
  -1.99905699,
  -0.98143075,
  0.02599159,
  1.00770987,
  1.99942290
)
frame_Knots1 <- as.h2o(x = knots1)
##
|
| | 0%
|
|======================================================================| 100%
# Knot locations for the second GAM column, uploaded as an H2O frame.
knots2 <- c(
  -1.999821861,
  -1.005257990,
  -0.006716042,
  1.002197392,
  1.999073589
)
frame_Knots2 <- as.h2o(x = knots2)
##
|
| | 0%
|
|======================================================================| 100%
# Knot locations for the third GAM column, uploaded as an H2O frame.
knots3 <- c(
  -1.999675688,
  -0.979893796,
  0.007573327,
  1.011437347,
  1.999611676
)
frame_Knots3 <- as.h2o(x = knots3)
##
|
| | 0%
|
|======================================================================| 100%
# Load the multinomial example data set (10 classes) from the public
# H2O test bucket directly into the cluster.
h2o_data <- h2o.importFile(
  path = "http://s3.amazonaws.com/h2o-public-test-data/smalldata/glm_test/multinomial_10_classes_10_cols_10000_Rows_train.csv"
)
##
|
| | 0%
|
|==== | 6%
|
|====================== | 31%
|
|============================================ | 62%
|
|========================================================= | 81%
|
|============================================================= | 87%
|
|======================================================================| 100%
# Treat C1, C2 and C11 as categorical: replace each column with its
# factor-typed version.
h2o_data[["C1"]] <- as.factor(h2o_data[["C1"]])
h2o_data[["C2"]] <- as.factor(h2o_data[["C2"]])
h2o_data[["C11"]] <- as.factor(h2o_data[["C11"]])
# Split into train (~80%) and test (~20%) sets.
# A fixed seed makes the random split, and therefore every metric reported
# for the model, reproducible across runs.
splits <- h2o.splitFrame(data = h2o_data, ratios = 0.8, seed = 1234)
train <- splits[[1]]
test <- splits[[2]]
# Set the predictor and response columns.
# The first two columns of the training frame are the predictors; index the
# name vector locally instead of slicing the H2OFrame just to read its names.
predictors <- colnames(train)[1:2]
response <- "C11"
# Number of knots for each GAM column (one entry per entry of gam_columns).
numKnots <- c(5, 5, 5)
# Fit a multinomial GAM on the training frame. C6, C7 and C8 are the GAM
# columns; the knot locations for each are read from the three knot frames
# referenced by key, in the same order as gam_columns.
gam_model <- h2o.gam(
  x = predictors,
  y = response,
  training_frame = train,
  family = "multinomial",
  gam_columns = c("C6", "C7", "C8"),
  num_knots = numKnots,
  knot_ids = c(
    h2o.keyof(frame_Knots1),
    h2o.keyof(frame_Knots2),
    h2o.keyof(frame_Knots3)
  ),
  scale = c(1, 1, 1)
)
##
|
| | 0%
|
|======================================================================| 100%
## Model Details:
## ==============
##
## H2OMultinomialModel: gam
## Model Key: GAM_model_R_1604363756625_130
## NULL
##
## H2OMultinomialMetrics: gam
## ** Reported on training data. **
##
## Training Set Metrics:
## =====================
##
## MSE: (Extract with `h2o.mse`) 0.4597694
## RMSE: (Extract with `h2o.rmse`) 0.678063
## Logloss: (Extract with `h2o.logloss`) 1.319915
## Mean Per-Class Error: 0.6290221
## Null Deviance: (Extract with `h2o.nulldeviance`) 0
## Residual Deviance: (Extract with `h2o.residual_deviance`) 0
## R^2: (Extract with `h2o.r2`) 0.9504142
## AIC: (Extract with `h2o.aic`) NaN
## Confusion Matrix: Extract with `h2o.confusionMatrix(<model>,train = TRUE)`)
## =========================================================================
## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
## 0 1 2 3 4 5 6 7 8 9 Error Rate
## 0 89 29 7 77 2 0 55 1 109 5 0.7620 = 285 / 374
## 1 24 226 119 96 1 0 154 0 36 24 0.6676 = 454 / 680
## 2 4 92 570 110 0 13 121 8 80 73 0.4678 = 501 / 1,071
## 3 29 64 100 820 12 14 55 8 195 206 0.4544 = 683 / 1,503
## 4 5 4 9 97 25 2 3 1 74 33 0.9012 = 228 / 253
## 5 0 3 73 28 1 19 5 0 10 11 0.8733 = 131 / 150
## 6 22 83 108 48 0 4 470 2 58 34 0.4331 = 359 / 829
## 7 1 0 34 34 0 1 16 14 39 34 0.9191 = 159 / 173
## 8 49 27 75 187 11 1 42 6 913 202 0.3966 = 600 / 1,513
## 9 3 13 93 222 0 1 22 7 221 820 0.4151 = 582 / 1,402
## Totals 226 541 1188 1719 52 55 943 47 1735 1442 0.5010 = 3,982 / 7,948
##
## Hit Ratio Table: Extract with `h2o.hit_ratio_table(<model>,train = TRUE)`
## =======================================================================
## Top-10 Hit Ratios:
## k hit_ratio
## 1 1 0.498993
## 2 2 0.737292
## 3 3 0.858329
## 4 4 0.926271
## 5 5 0.960619
## 6 6 0.982637
## 7 7 0.992828
## 8 8 0.996980
## 9 9 0.998868
## 10 10 1.000000
##
##
##
##
##
## NULL
##
## Variable Importances: (Extract with `h2o.varimp`)
## =================================================
##
## variable relative_importance scaled_importance percentage
## 1 C1.4 8.3912079 1.00000000 0.080368923
## 2 C2.3 7.0242097 0.83709160 0.067276150
## 3 C8_0_center__2 6.8993243 0.82220872 0.066080030
## 4 C1.2 6.8797420 0.81987504 0.065892475
## 5 C2.5 6.5956232 0.78601594 0.063171255
## 6 C7_0_center__2 5.6363710 0.67169960 0.053983773
## 7 C1.3 5.5990294 0.66724952 0.053626125
## 8 C2.0 5.4987815 0.65530273 0.052665975
## 9 C1.1 5.3344000 0.63571300 0.051091570
## 10 C1.5 5.2862160 0.62997080 0.050630075
## 11 C2.2 5.1927796 0.61883577 0.049735164
## 12 C2.4 5.0334953 0.59985348 0.048209578
## 13 C8_0_center__3 4.8832040 0.58194292 0.046770126
## 14 C7_0_center__3 4.1981007 0.50029754 0.040208375
## 15 C6_0_center__2 3.8276861 0.45615436 0.036660635
## 16 C2.1 3.5644850 0.42478806 0.034139759
## 17 C6_0_center__3 3.3829775 0.40315740 0.032401326
## 18 C8_0_center__1 2.9885253 0.35614959 0.028623359
## 19 C1.0 2.4785580 0.29537558 0.023739018
## 20 C6_0_center__1 2.0383686 0.24291719 0.019522993
## 21 C7_0_center__1 2.0306710 0.24199984 0.019449267
## 22 C6_0_center__0 0.9912683 0.11813178 0.009494124
## 23 C7_0_center__0 0.4553072 0.05426003 0.004360820
## 24 C8_0_center__0 0.1982829 0.02362984 0.001899105
# Extract the fitted coefficients of the GAM.
coefficients <- h2o.coef(object = gam_model)
# Score the held-out test frame with the fitted model.
pred <- h2o.predict(gam_model, newdata = test)
##
|
| | 0%
|
|======================================================================| 100%