Deep Learning with the H2O Package
[Reference 1] http://docs.h2o.ai/h2o/latest-stable/h2o-docs/downloading.html#install-in-r
[Reference 2] https://r-kor.org/wp-content/uploads/2018/08/h2o__v3.pdf
[Reference 3] https://github.com/DarrenCook/h2o/
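The cluster summary below is printed when H2O is started from R; a minimal sketch, assuming H2O is installed as described in Reference 1:
library(h2o) # loading the package emits the version warning below
h2o.init() # start (or connect to) a local H2O cluster; default port 54321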
## Warning: package 'h2o' was built under R version 4.0.3
## Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 6 hours 13 minutes
## H2O cluster timezone: Asia/Seoul
## H2O data parsing timezone: UTC
## H2O cluster version: 3.32.0.1
## H2O cluster version age: 19 days
## H2O cluster name: H2O_started_from_R_user_gax063
## H2O cluster total nodes: 1
## H2O cluster total memory: 3.36 GB
## H2O cluster total cores: 4
## H2O cluster allowed cores: 4
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## H2O API Extensions: Amazon S3, Algos, AutoML, Core V3, TargetEncoder, Core V4
## R Version: R version 4.0.2 (2020-06-22)
# Import a sample binary-outcome train/test set into H2O
# If the direct import fails, download the files from the URLs below,
# then read them with fread() and convert with as.h2o()
#train <- h2o.importFile("https://s3.amazonaws.com/erin-data/higgs/higgs_train_10k.csv")
#test <- h2o.importFile("https://s3.amazonaws.com/erin-data/higgs/higgs_test_5k.csv")
library(data.table) # for fread()
train <- fread("higgs_train_10k.csv")
train <- as.h2o(train)
dim(train)
head(train)
## [1] 10000 29
## response x1 x2 x3 x4 x5 x6
## 1 1 0.8692932 -0.6350818 0.225690261 0.3274701 -0.6899932 0.7542022
## 2 1 0.9075421 0.3291473 0.359411865 1.4979699 -0.3130095 1.0955306
## 3 1 0.7988347 1.4706388 -1.635974765 0.4537732 0.4256292 1.1048746
## 4 0 1.3443848 -0.8766260 0.935912728 1.9920501 0.8824544 1.7860659
## 5 1 1.1050090 0.3213556 1.522401214 0.8828076 -1.2053493 0.6814661
## 6 0 1.5958393 -0.6078107 0.007074916 1.8184496 -0.1119060 0.8475499
## x7 x8 x9 x10 x11 x12 x13
## 1 -0.2485731 -1.0920639 0.000000 1.3749921 -0.6536742 0.9303491 1.107436
## 2 -0.5575249 -1.5882298 2.173076 0.8125812 -0.2136419 1.2710146 2.214872
## 3 1.2823223 1.3816643 0.000000 0.8517372 1.5406590 -0.8196895 2.214872
## 4 -1.6467777 -0.9423825 0.000000 2.4232647 -0.6760158 0.7361587 2.214872
## 5 -1.0704639 -0.9218706 0.000000 0.8008721 1.0209740 0.9714065 2.214872
## 6 -0.5664370 1.5812393 2.173076 0.7554210 0.6431096 1.4263668 0.000000
## x14 x15 x16 x17 x18 x19 x20
## 1 1.1389043 -1.5781983 -1.0469854 0.000000 0.6579295 -0.01045457 -0.0457671694
## 2 0.4999940 -1.2614318 0.7321562 0.000000 0.3987009 -1.13893008 -0.0008191102
## 3 0.9934899 0.3560801 -0.2087775 2.548224 1.2569546 1.12884760 0.9004608393
## 4 1.2987198 -1.4307381 -0.3646582 0.000000 0.7453127 -0.67837882 -1.3603563309
## 5 0.5967613 -0.3502729 0.6311943 0.000000 0.4799989 -0.37356552 0.1130406111
## 6 0.9216608 -1.1904324 -1.6155890 0.000000 0.6511141 -0.65422696 -1.2743449211
## x21 x22 x23 x24 x25 x26 x27
## 1 3.101961 1.3537600 0.9795631 0.9780762 0.9200048 0.7216575 0.9887509
## 2 0.000000 0.3022199 0.8330482 0.9856997 0.9780984 0.7797322 0.9923558
## 3 0.000000 0.9097533 1.1083305 0.9856922 0.9513313 0.8032515 0.8659244
## 4 0.000000 0.9466525 1.0287037 0.9986561 0.7282806 0.8692002 1.0267365
## 5 0.000000 0.7558565 1.3610570 0.9866097 0.8380846 1.1332952 0.8722449
## 6 3.101961 0.8237606 0.9381914 0.9717582 0.7891763 0.4305533 0.9613569
## x28
## 1 0.8766783
## 2 0.7983426
## 3 0.7801176
## 4 0.9579040
## 5 0.8084865
## 6 0.9578179
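The held-out test set is loaded the same way (file name taken from the commented URL above):
test <- fread("higgs_test_5k.csv")
test <- as.h2o(test)
dim(test)
head(test)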
## [1] 5000 29
## response x1 x2 x3 x4 x5 x6
## 1 0 0.6238780 0.4596591 -0.3165119 1.5570095 0.6447843 0.4559291
## 2 0 1.6820366 0.7742510 1.3287545 0.2347291 -0.5578297 0.4429209
## 3 1 1.0940284 -0.8873397 0.9492294 0.4102605 -1.5586771 0.5233520
## 4 0 1.3387115 -0.8990273 -1.2553400 0.8842851 -0.7479247 0.5474447
## 5 0 0.6990948 1.4336280 -1.7225332 0.6501126 -0.1675736 0.9674639
## 6 1 0.8422078 -0.1442015 -1.7020034 0.6858755 0.5443376 0.5787744
## x7 x8 x9 x10 x11 x12 x13
## 1 -2.24982786 -0.3619488 0.000000 0.8329776 1.4114662 1.52845550 0.000000
## 2 -0.08320475 1.0811927 2.173076 1.1458478 0.1622797 -0.26818651 0.000000
## 3 0.13167514 0.3045129 0.000000 1.1191562 1.0938271 -0.39635164 2.214872
## 4 -0.79518002 0.1742347 2.173076 0.7716625 -0.3185503 1.02356052 2.214872
## 5 0.73769587 -0.8863906 2.173076 1.6679695 0.4468922 0.85100842 0.000000
## 6 -0.22183695 -0.5864736 1.086538 0.9094010 2.8131583 -0.07066655 0.000000
## x14 x15 x16 x17 x18 x19 x20
## 1 0.7228574 1.0096387 0.9995385 0.000000 1.190991 0.8140404820 -0.87980253
## 2 1.2140703 0.1949661 -1.1562682 2.548224 1.019146 -0.7591626644 0.13634692
## 3 0.6289001 0.6282445 1.4577500 0.000000 0.523082 0.5083943605 0.98813695
## 4 0.4084952 -1.1430995 0.1396988 0.000000 1.293952 -1.3413060904 1.61685240
## 5 0.5964100 0.1567357 0.3405130 0.000000 0.694684 -0.0004606903 -1.14948976
## 6 1.5498583 -0.8518199 1.5881127 0.000000 1.149855 0.5900110006 0.05976906
## x21 x22 x23 x24 x25 x26 x27
## 1 3.101961 0.5833441 0.7902463 1.0408747 0.8488581 0.2297017 0.6559401
## 2 0.000000 0.9418300 0.9419757 0.9881598 0.8643497 0.8351316 0.7678714
## 3 0.000000 0.9299892 0.9737419 0.9891410 0.7805187 0.7498209 0.8875451
## 4 0.000000 0.7920395 0.7692099 0.9840821 1.0692756 0.7008520 0.8308298
## 5 3.101961 0.7141817 0.8398433 0.9840674 0.9177661 0.3885529 0.9166403
## 6 0.000000 0.8722794 0.6843407 0.9903201 0.5101509 0.7204455 0.5554540
## x28
## 1 0.6778572
## 2 0.8330134
## 3 0.9252784
## 4 0.7167103
## 5 0.8863315
## 6 0.5357955
# Identify predictors and response
y <- "response"
x <- setdiff(names(train), y)
# For binary classification, response should be a factor
train[, y] <- as.factor(train[, y])
test[, y] <- as.factor(test[, y])
# Example: splitting a single frame into train/test sets
#parts <- h2o.splitFrame(data, 0.8)
#train <- parts[[1]]
#test <- parts[[2]]
# Fit a deep learning model: two hidden layers of 200 rectified-linear
# units each, trained for up to 1,000 epochs
model <-
  h2o.deeplearning(
    x,
    y,
    training_frame = train,
    validation_frame = test,
    distribution = "multinomial",
    activation = "Rectifier",
    hidden = c(200, 200),
    epochs = 1000
  )
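Note: because the response is a two-level factor, H2O fits a binomial model (the output below reports H2OBinomialModel), and distribution = "multinomial" yields the two-unit softmax output layer shown in the layer table; "bernoulli" is the more conventional choice for a binary target.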
## Model Details:
## ==============
##
## H2OBinomialModel: deeplearning
## Model Key: DeepLearning_model_R_1603843254237_2514
## Status of Neuron Layers: predicting response, 2-class classification, multinomial distribution, CrossEntropy loss, 46,402 weights/biases, 555.9 KB, 1,900,000 training samples, mini-batch size 1
## layer units type dropout l1 l2 mean_rate rate_rms momentum
## 1 1 28 Input 0.00 % NA NA NA NA NA
## 2 2 200 Rectifier 0.00 % 0.000000 0.000000 0.004780 0.001418 0.000000
## 3 3 200 Rectifier 0.00 % 0.000000 0.000000 0.050511 0.085752 0.000000
## 4 4 2 Softmax NA 0.000000 0.000000 0.002787 0.000936 0.000000
## mean_weight weight_rms mean_bias bias_rms
## 1 NA NA NA NA
## 2 -0.000192 0.143242 0.259596 0.090074
## 3 -0.031985 0.100436 0.910850 0.066787
## 4 0.024190 0.410155 -0.006789 0.236023
##
## H2OBinomialMetrics: deeplearning
## ** Reported on training data. **
## ** Metrics reported on full training frame **
##
## MSE: 0.1824438
## RMSE: 0.4271344
## LogLoss: 0.5681054
## Mean Per-Class Error: 0.2686236
## AUC: 0.8099719
## AUCPR: 0.8040738
## Gini: 0.6199439
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## 0 1 Error Rate
## 0 2889 1816 0.385972 =1816/4705
## 1 801 4494 0.151275 =801/5295
## Totals 3690 6310 0.261700 =2617/10000
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.438723 0.774494 235
## 2 max f2 0.100823 0.863206 353
## 3 max f0point5 0.726529 0.762878 136
## 4 max accuracy 0.505882 0.745700 213
## 5 max precision 0.979104 0.918367 10
## 6 max recall 0.000347 1.000000 399
## 7 max specificity 0.998476 0.999362 0
## 8 max absolute_mcc 0.505882 0.489839 213
## 9 max min_per_class_accuracy 0.629713 0.741977 172
## 10 max mean_per_class_accuracy 0.558922 0.742404 195
## 11 max tns 0.998476 4702.000000 0
## 12 max fns 0.998476 5284.000000 0
## 13 max fps 0.000347 4705.000000 399
## 14 max tps 0.000347 5295.000000 399
## 15 max tnr 0.998476 0.999362 0
## 16 max fnr 0.998476 0.997923 0
## 17 max fpr 0.000347 1.000000 399
## 18 max tpr 0.000347 1.000000 399
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## H2OBinomialMetrics: deeplearning
## ** Reported on validation data. **
## ** Metrics reported on full validation frame **
##
## MSE: 0.2433057
## RMSE: 0.4932603
## LogLoss: 0.7454531
## Mean Per-Class Error: 0.3910916
## AUC: 0.7032347
## AUCPR: 0.7094568
## Gini: 0.4064693
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## 0 1 Error Rate
## 0 706 1609 0.695032 =1609/2315
## 1 234 2451 0.087151 =234/2685
## Totals 940 4060 0.368600 =1843/5000
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.159826 0.726761 333
## 2 max f2 0.024414 0.858511 386
## 3 max f0point5 0.519553 0.673110 212
## 4 max accuracy 0.455971 0.654200 234
## 5 max precision 0.997457 1.000000 0
## 6 max recall 0.000414 1.000000 399
## 7 max specificity 0.997457 1.000000 0
## 8 max absolute_mcc 0.397621 0.300368 253
## 9 max min_per_class_accuracy 0.621055 0.644320 175
## 10 max mean_per_class_accuracy 0.519553 0.647130 212
## 11 max tns 0.997457 2315.000000 0
## 12 max fns 0.997457 2676.000000 0
## 13 max fps 0.000414 2315.000000 399
## 14 max tps 0.000414 2685.000000 399
## 15 max tnr 0.997457 1.000000 0
## 16 max fnr 0.997457 0.996648 0
## 17 max fpr 0.000414 1.000000 399
## 18 max tpr 0.000414 1.000000 399
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
##
##
## Scoring History:
## timestamp duration training_speed epochs iterations
## 1 2020-10-28 15:14:39 0.000 sec NA 0.00000 0
## 2 2020-10-28 15:14:47 8.765 sec 12679 obs/sec 10.00000 1
## 3 2020-10-28 15:14:57 18.004 sec 18060 obs/sec 30.00000 3
## 4 2020-10-28 15:15:05 26.849 sec 20008 obs/sec 50.00000 5
## 5 2020-10-28 15:15:14 35.622 sec 21069 obs/sec 70.00000 7
## 6 2020-10-28 15:15:23 44.321 sec 21748 obs/sec 90.00000 9
## 7 2020-10-28 15:15:32 53.046 sec 22185 obs/sec 110.00000 11
## 8 2020-10-28 15:15:41 1 min 1.891 sec 22433 obs/sec 130.00000 13
## 9 2020-10-28 15:15:49 1 min 10.545 sec 22640 obs/sec 150.00000 15
## 10 2020-10-28 15:15:58 1 min 19.525 sec 22686 obs/sec 170.00000 17
## 11 2020-10-28 15:16:07 1 min 28.222 sec 22805 obs/sec 190.00000 19
## 12 2020-10-28 15:16:07 1 min 28.509 sec 22801 obs/sec 190.00000 19
## samples training_rmse training_logloss training_r2 training_auc
## 1 0.000000 NA NA NA NA
## 2 100000.000000 0.42713 0.56811 0.26768 0.80997
## 3 300000.000000 0.31725 0.34533 0.59601 0.93552
## 4 500000.000000 0.23241 0.20083 0.78319 0.97896
## 5 700000.000000 0.17462 0.11627 0.87760 0.99311
## 6 900000.000000 0.13303 0.07090 0.92896 0.99726
## 7 1100000.000000 0.09671 0.03529 0.96246 0.99905
## 8 1300000.000000 0.07681 0.02414 0.97632 0.99955
## 9 1500000.000000 0.03114 0.00427 0.99611 0.99995
## 10 1700000.000000 0.01876 0.00219 0.99859 0.99999
## 11 1900000.000000 0.00949 0.00039 0.99964 1.00000
## 12 1900000.000000 0.42713 0.56811 0.26768 0.80997
## training_pr_auc training_lift training_classification_error validation_rmse
## 1 NA NA NA NA
## 2 0.80407 1.69972 0.26170 0.49326
## 3 0.93671 1.88857 0.13620 0.53845
## 4 0.97915 1.88857 0.06950 0.56489
## 5 0.99252 1.88857 0.03750 0.56859
## 6 0.99733 1.88857 0.02150 0.57629
## 7 0.99890 1.88857 0.01120 0.58073
## 8 0.99964 1.88857 0.00710 0.57914
## 9 0.99995 1.88857 0.00110 0.58613
## 10 0.99999 1.88857 0.00030 0.58476
## 11 1.00000 1.88857 0.00010 0.58744
## 12 0.80407 1.69972 0.26170 0.49326
## validation_logloss validation_r2 validation_auc validation_pr_auc
## 1 NA NA NA NA
## 2 0.74545 0.02142 0.70323 0.70946
## 3 1.19523 -0.16611 0.69036 0.69128
## 4 1.91623 -0.28344 0.68210 0.68912
## 5 2.60707 -0.30031 0.68051 0.67478
## 6 3.25369 -0.33573 0.68075 0.67766
## 7 3.77313 -0.35641 0.67474 0.66965
## 8 4.25531 -0.34899 0.67426 0.67210
## 9 4.57449 -0.38174 0.66923 0.66589
## 10 4.82681 -0.37531 0.67069 0.66759
## 11 5.05231 -0.38796 0.66899 0.66569
## 12 0.74545 0.02142 0.70323 0.70946
## validation_lift validation_classification_error
## 1 NA NA
## 2 1.60149 0.36860
## 3 1.56425 0.36560
## 4 1.56425 0.39640
## 5 1.67598 0.38100
## 6 1.64312 0.37440
## 7 1.60660 0.37000
## 8 1.60435 0.36800
## 9 1.47244 0.36600
## 10 1.48129 0.36400
## 11 1.43846 0.36000
## 12 1.60149 0.36860
##
## Variable Importances: (Extract with `h2o.varimp`)
## =================================================
##
## Variable Importances:
## variable relative_importance scaled_importance percentage
## 1 x26 1.000000 1.000000 0.057439
## 2 x28 0.812700 0.812700 0.046681
## 3 x6 0.710595 0.710595 0.040816
## 4 x27 0.697142 0.697142 0.040043
## 5 x23 0.668373 0.668373 0.038391
##
## ---
## variable relative_importance scaled_importance percentage
## 23 x3 0.555396 0.555396 0.031901
## 24 x24 0.532795 0.532795 0.030603
## 25 x15 0.525916 0.525916 0.030208
## 26 x19 0.524819 0.524819 0.030145
## 27 x5 0.522640 0.522640 0.030020
## 28 x21 0.501666 0.501666 0.028815
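Although 1,000 epochs were requested, the scoring history stops at epoch 190: training was halted early, consistent with H2O's built-in early stopping, which is enabled by default for deep learning. The final scoring-history row also repeats the epoch-10 metrics; with the default overwrite_with_best_model = TRUE, H2O returns the model with the best validation error rather than the last iteration, which is why the reported validation logloss is 0.745 and not the much larger later values. Predictions on the test frame are then obtained with h2o.predict(); a minimal sketch of the call that produced the output below:
pred <- h2o.predict(model, test) # predicted class plus class probabilities p0, p1
pred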
## predict p0 p1
## 1 1 0.4951252 0.5048748
## 2 1 0.8113229 0.1886771
## 3 1 0.2131663 0.7868337
## 4 1 0.7058216 0.2941784
## 5 1 0.0774948 0.9225052
## 6 1 0.1871479 0.8128521
##
## [5000 rows x 3 columns]
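To recompute these metrics on the test frame directly, H2O's performance accessors can be used; a minimal sketch:
perf <- h2o.performance(model, newdata = test) # full metrics object for the test frame
h2o.auc(perf) # AUC, ~0.70 as reported on the validation data above
# h2o.shutdown(prompt = FALSE) # stop the local cluster when finished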