Deep Learning with the H2O Package
[Reference 1] http://docs.h2o.ai/h2o/latest-stable/h2o-docs/downloading.html#install-in-r
[Reference 2] https://r-kor.org/wp-content/uploads/2018/08/h2o__v3.pdf
[Reference 3] https://github.com/DarrenCook/h2o/
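The cluster summary below is printed when H2O is started from R; a minimal sketch, assuming H2O is installed as described in Reference 1:
library(h2o) # loading the package emits the version warning below
h2o.init() # start (or connect to) a local H2O cluster; default port 54321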
## Warning: package 'h2o' was built under R version 4.0.3
## Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 6 hours 13 minutes
## H2O cluster timezone: Asia/Seoul
## H2O data parsing timezone: UTC
## H2O cluster version: 3.32.0.1
## H2O cluster version age: 19 days
## H2O cluster name: H2O_started_from_R_user_gax063
## H2O cluster total nodes: 1
## H2O cluster total memory: 3.36 GB
## H2O cluster total cores: 4
## H2O cluster allowed cores: 4
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## H2O API Extensions: Amazon S3, Algos, AutoML, Core V3, TargetEncoder, Core V4
## R Version: R version 4.0.2 (2020-06-22)
# Import a sample binary-outcome train/test set into H2O
# If the direct import fails, download the files from the URLs below,
# then read them with fread() and convert with as.h2o()
#train <- h2o.importFile("https://s3.amazonaws.com/erin-data/higgs/higgs_train_10k.csv")
#test <- h2o.importFile("https://s3.amazonaws.com/erin-data/higgs/higgs_test_5k.csv")
library(data.table) # for fread()
train <- fread("higgs_train_10k.csv")
train <- as.h2o(train)
dim(train)
head(train)
## [1] 10000 29
## response x1 x2 x3 x4 x5 x6
## 1 1 0.8692932 -0.6350818 0.225690261 0.3274701 -0.6899932 0.7542022
## 2 1 0.9075421 0.3291473 0.359411865 1.4979699 -0.3130095 1.0955306
## 3 1 0.7988347 1.4706388 -1.635974765 0.4537732 0.4256292 1.1048746
## 4 0 1.3443848 -0.8766260 0.935912728 1.9920501 0.8824544 1.7860659
## 5 1 1.1050090 0.3213556 1.522401214 0.8828076 -1.2053493 0.6814661
## 6 0 1.5958393 -0.6078107 0.007074916 1.8184496 -0.1119060 0.8475499
## x7 x8 x9 x10 x11 x12 x13
## 1 -0.2485731 -1.0920639 0.000000 1.3749921 -0.6536742 0.9303491 1.107436
## 2 -0.5575249 -1.5882298 2.173076 0.8125812 -0.2136419 1.2710146 2.214872
## 3 1.2823223 1.3816643 0.000000 0.8517372 1.5406590 -0.8196895 2.214872
## 4 -1.6467777 -0.9423825 0.000000 2.4232647 -0.6760158 0.7361587 2.214872
## 5 -1.0704639 -0.9218706 0.000000 0.8008721 1.0209740 0.9714065 2.214872
## 6 -0.5664370 1.5812393 2.173076 0.7554210 0.6431096 1.4263668 0.000000
## x14 x15 x16 x17 x18 x19 x20
## 1 1.1389043 -1.5781983 -1.0469854 0.000000 0.6579295 -0.01045457 -0.0457671694
## 2 0.4999940 -1.2614318 0.7321562 0.000000 0.3987009 -1.13893008 -0.0008191102
## 3 0.9934899 0.3560801 -0.2087775 2.548224 1.2569546 1.12884760 0.9004608393
## 4 1.2987198 -1.4307381 -0.3646582 0.000000 0.7453127 -0.67837882 -1.3603563309
## 5 0.5967613 -0.3502729 0.6311943 0.000000 0.4799989 -0.37356552 0.1130406111
## 6 0.9216608 -1.1904324 -1.6155890 0.000000 0.6511141 -0.65422696 -1.2743449211
## x21 x22 x23 x24 x25 x26 x27
## 1 3.101961 1.3537600 0.9795631 0.9780762 0.9200048 0.7216575 0.9887509
## 2 0.000000 0.3022199 0.8330482 0.9856997 0.9780984 0.7797322 0.9923558
## 3 0.000000 0.9097533 1.1083305 0.9856922 0.9513313 0.8032515 0.8659244
## 4 0.000000 0.9466525 1.0287037 0.9986561 0.7282806 0.8692002 1.0267365
## 5 0.000000 0.7558565 1.3610570 0.9866097 0.8380846 1.1332952 0.8722449
## 6 3.101961 0.8237606 0.9381914 0.9717582 0.7891763 0.4305533 0.9613569
## x28
## 1 0.8766783
## 2 0.7983426
## 3 0.7801176
## 4 0.9579040
## 5 0.8084865
## 6 0.9578179
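The held-out test set is loaded the same way (file name taken from the commented URL above):
test <- fread("higgs_test_5k.csv")
test <- as.h2o(test)
dim(test)
head(test)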
## [1] 5000 29
## response x1 x2 x3 x4 x5 x6
## 1 0 0.6238780 0.4596591 -0.3165119 1.5570095 0.6447843 0.4559291
## 2 0 1.6820366 0.7742510 1.3287545 0.2347291 -0.5578297 0.4429209
## 3 1 1.0940284 -0.8873397 0.9492294 0.4102605 -1.5586771 0.5233520
## 4 0 1.3387115 -0.8990273 -1.2553400 0.8842851 -0.7479247 0.5474447
## 5 0 0.6990948 1.4336280 -1.7225332 0.6501126 -0.1675736 0.9674639
## 6 1 0.8422078 -0.1442015 -1.7020034 0.6858755 0.5443376 0.5787744
## x7 x8 x9 x10 x11 x12 x13
## 1 -2.24982786 -0.3619488 0.000000 0.8329776 1.4114662 1.52845550 0.000000
## 2 -0.08320475 1.0811927 2.173076 1.1458478 0.1622797 -0.26818651 0.000000
## 3 0.13167514 0.3045129 0.000000 1.1191562 1.0938271 -0.39635164 2.214872
## 4 -0.79518002 0.1742347 2.173076 0.7716625 -0.3185503 1.02356052 2.214872
## 5 0.73769587 -0.8863906 2.173076 1.6679695 0.4468922 0.85100842 0.000000
## 6 -0.22183695 -0.5864736 1.086538 0.9094010 2.8131583 -0.07066655 0.000000
## x14 x15 x16 x17 x18 x19 x20
## 1 0.7228574 1.0096387 0.9995385 0.000000 1.190991 0.8140404820 -0.87980253
## 2 1.2140703 0.1949661 -1.1562682 2.548224 1.019146 -0.7591626644 0.13634692
## 3 0.6289001 0.6282445 1.4577500 0.000000 0.523082 0.5083943605 0.98813695
## 4 0.4084952 -1.1430995 0.1396988 0.000000 1.293952 -1.3413060904 1.61685240
## 5 0.5964100 0.1567357 0.3405130 0.000000 0.694684 -0.0004606903 -1.14948976
## 6 1.5498583 -0.8518199 1.5881127 0.000000 1.149855 0.5900110006 0.05976906
## x21 x22 x23 x24 x25 x26 x27
## 1 3.101961 0.5833441 0.7902463 1.0408747 0.8488581 0.2297017 0.6559401
## 2 0.000000 0.9418300 0.9419757 0.9881598 0.8643497 0.8351316 0.7678714
## 3 0.000000 0.9299892 0.9737419 0.9891410 0.7805187 0.7498209 0.8875451
## 4 0.000000 0.7920395 0.7692099 0.9840821 1.0692756 0.7008520 0.8308298
## 5 3.101961 0.7141817 0.8398433 0.9840674 0.9177661 0.3885529 0.9166403
## 6 0.000000 0.8722794 0.6843407 0.9903201 0.5101509 0.7204455 0.5554540
## x28
## 1 0.6778572
## 2 0.8330134
## 3 0.9252784
## 4 0.7167103
## 5 0.8863315
## 6 0.5357955
# Identify predictors and response
y <- "response"
x <- setdiff(names(train), y)
# For binary classification, response should be a factor
train[, y] <- as.factor(train[, y])
test[, y] <- as.factor(test[, y])
# Example: splitting a single frame into train/test sets
#parts <- h2o.splitFrame(data, 0.8)
#train <- parts[[1]]
#test <- parts[[2]]
# Fit a deep learning model: two hidden layers of 200 rectified-linear
# units each, trained for up to 1,000 epochs
model <-
  h2o.deeplearning(
    x,
    y,
    training_frame = train,
    validation_frame = test,
    distribution = "multinomial",
    activation = "Rectifier",
    hidden = c(200, 200),
    epochs = 1000
  )
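Note: because the response is a two-level factor, H2O fits a binomial model (the output below reports H2OBinomialModel), and distribution = "multinomial" yields the two-unit softmax output layer shown in the layer table; "bernoulli" is the more conventional choice for a binary target.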
## Model Details:
## ==============
##
## H2OBinomialModel: deeplearning
## Model Key: DeepLearning_model_R_1603843254237_2514
## Status of Neuron Layers: predicting response, 2-class classification, multinomial distribution, CrossEntropy loss, 46,402 weights/biases, 555.9 KB, 1,900,000 training samples, mini-batch size 1
## layer units type dropout l1 l2 mean_rate rate_rms momentum
## 1 1 28 Input 0.00 % NA NA NA NA NA
## 2 2 200 Rectifier 0.00 % 0.000000 0.000000 0.004780 0.001418 0.000000
## 3 3 200 Rectifier 0.00 % 0.000000 0.000000 0.050511 0.085752 0.000000
## 4 4 2 Softmax NA 0.000000 0.000000 0.002787 0.000936 0.000000
## mean_weight weight_rms mean_bias bias_rms
## 1 NA NA NA NA
## 2 -0.000192 0.143242 0.259596 0.090074
## 3 -0.031985 0.100436 0.910850 0.066787
## 4 0.024190 0.410155 -0.006789 0.236023
##
## H2OBinomialMetrics: deeplearning
## ** Reported on training data. **
## ** Metrics reported on full training frame **
##
## MSE: 0.1824438
## RMSE: 0.4271344
## LogLoss: 0.5681054
## Mean Per-Class Error: 0.2686236
## AUC: 0.8099719
## AUCPR: 0.8040738
## Gini: 0.6199439
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## 0 1 Error Rate
## 0 2889 1816 0.385972 =1816/4705
## 1 801 4494 0.151275 =801/5295
## Totals 3690 6310 0.261700 =2617/10000
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.438723 0.774494 235
## 2 max f2 0.100823 0.863206 353
## 3 max f0point5 0.726529 0.762878 136
## 4 max accuracy 0.505882 0.745700 213
## 5 max precision 0.979104 0.918367 10
## 6 max recall 0.000347 1.000000 399
## 7 max specificity 0.998476 0.999362 0
## 8 max absolute_mcc 0.505882 0.489839 213
## 9 max min_per_class_accuracy 0.629713 0.741977 172
## 10 max mean_per_class_accuracy 0.558922 0.742404 195
## 11 max tns 0.998476 4702.000000 0
## 12 max fns 0.998476 5284.000000 0
## 13 max fps 0.000347 4705.000000 399
## 14 max tps 0.000347 5295.000000 399
## 15 max tnr 0.998476 0.999362 0
## 16 max fnr 0.998476 0.997923 0
## 17 max fpr 0.000347 1.000000 399
## 18 max tpr 0.000347 1.000000 399
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## H2OBinomialMetrics: deeplearning
## ** Reported on validation data. **
## ** Metrics reported on full validation frame **
##
## MSE: 0.2433057
## RMSE: 0.4932603
## LogLoss: 0.7454531
## Mean Per-Class Error: 0.3910916
## AUC: 0.7032347
## AUCPR: 0.7094568
## Gini: 0.4064693
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## 0 1 Error Rate
## 0 706 1609 0.695032 =1609/2315
## 1 234 2451 0.087151 =234/2685
## Totals 940 4060 0.368600 =1843/5000
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.159826 0.726761 333
## 2 max f2 0.024414 0.858511 386
## 3 max f0point5 0.519553 0.673110 212
## 4 max accuracy 0.455971 0.654200 234
## 5 max precision 0.997457 1.000000 0
## 6 max recall 0.000414 1.000000 399
## 7 max specificity 0.997457 1.000000 0
## 8 max absolute_mcc 0.397621 0.300368 253
## 9 max min_per_class_accuracy 0.621055 0.644320 175
## 10 max mean_per_class_accuracy 0.519553 0.647130 212
## 11 max tns 0.997457 2315.000000 0
## 12 max fns 0.997457 2676.000000 0
## 13 max fps 0.000414 2315.000000 399
## 14 max tps 0.000414 2685.000000 399
## 15 max tnr 0.997457 1.000000 0
## 16 max fnr 0.997457 0.996648 0
## 17 max fpr 0.000414 1.000000 399
## 18 max tpr 0.000414 1.000000 399
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
##
##
## Scoring History:
## timestamp duration training_speed epochs iterations
## 1 2020-10-28 15:14:39 0.000 sec NA 0.00000 0
## 2 2020-10-28 15:14:47 8.765 sec 12679 obs/sec 10.00000 1
## 3 2020-10-28 15:14:57 18.004 sec 18060 obs/sec 30.00000 3
## 4 2020-10-28 15:15:05 26.849 sec 20008 obs/sec 50.00000 5
## 5 2020-10-28 15:15:14 35.622 sec 21069 obs/sec 70.00000 7
## 6 2020-10-28 15:15:23 44.321 sec 21748 obs/sec 90.00000 9
## 7 2020-10-28 15:15:32 53.046 sec 22185 obs/sec 110.00000 11
## 8 2020-10-28 15:15:41 1 min 1.891 sec 22433 obs/sec 130.00000 13
## 9 2020-10-28 15:15:49 1 min 10.545 sec 22640 obs/sec 150.00000 15
## 10 2020-10-28 15:15:58 1 min 19.525 sec 22686 obs/sec 170.00000 17
## 11 2020-10-28 15:16:07 1 min 28.222 sec 22805 obs/sec 190.00000 19
## 12 2020-10-28 15:16:07 1 min 28.509 sec 22801 obs/sec 190.00000 19
## samples training_rmse training_logloss training_r2 training_auc
## 1 0.000000 NA NA NA NA
## 2 100000.000000 0.42713 0.56811 0.26768 0.80997
## 3 300000.000000 0.31725 0.34533 0.59601 0.93552
## 4 500000.000000 0.23241 0.20083 0.78319 0.97896
## 5 700000.000000 0.17462 0.11627 0.87760 0.99311
## 6 900000.000000 0.13303 0.07090 0.92896 0.99726
## 7 1100000.000000 0.09671 0.03529 0.96246 0.99905
## 8 1300000.000000 0.07681 0.02414 0.97632 0.99955
## 9 1500000.000000 0.03114 0.00427 0.99611 0.99995
## 10 1700000.000000 0.01876 0.00219 0.99859 0.99999
## 11 1900000.000000 0.00949 0.00039 0.99964 1.00000
## 12 1900000.000000 0.42713 0.56811 0.26768 0.80997
## training_pr_auc training_lift training_classification_error validation_rmse
## 1 NA NA NA NA
## 2 0.80407 1.69972 0.26170 0.49326
## 3 0.93671 1.88857 0.13620 0.53845
## 4 0.97915 1.88857 0.06950 0.56489
## 5 0.99252 1.88857 0.03750 0.56859
## 6 0.99733 1.88857 0.02150 0.57629
## 7 0.99890 1.88857 0.01120 0.58073
## 8 0.99964 1.88857 0.00710 0.57914
## 9 0.99995 1.88857 0.00110 0.58613
## 10 0.99999 1.88857 0.00030 0.58476
## 11 1.00000 1.88857 0.00010 0.58744
## 12 0.80407 1.69972 0.26170 0.49326
## validation_logloss validation_r2 validation_auc validation_pr_auc
## 1 NA NA NA NA
## 2 0.74545 0.02142 0.70323 0.70946
## 3 1.19523 -0.16611 0.69036 0.69128
## 4 1.91623 -0.28344 0.68210 0.68912
## 5 2.60707 -0.30031 0.68051 0.67478
## 6 3.25369 -0.33573 0.68075 0.67766
## 7 3.77313 -0.35641 0.67474 0.66965
## 8 4.25531 -0.34899 0.67426 0.67210
## 9 4.57449 -0.38174 0.66923 0.66589
## 10 4.82681 -0.37531 0.67069 0.66759
## 11 5.05231 -0.38796 0.66899 0.66569
## 12 0.74545 0.02142 0.70323 0.70946
## validation_lift validation_classification_error
## 1 NA NA
## 2 1.60149 0.36860
## 3 1.56425 0.36560
## 4 1.56425 0.39640
## 5 1.67598 0.38100
## 6 1.64312 0.37440
## 7 1.60660 0.37000
## 8 1.60435 0.36800
## 9 1.47244 0.36600
## 10 1.48129 0.36400
## 11 1.43846 0.36000
## 12 1.60149 0.36860
##
## Variable Importances: (Extract with `h2o.varimp`)
## =================================================
##
## Variable Importances:
## variable relative_importance scaled_importance percentage
## 1 x26 1.000000 1.000000 0.057439
## 2 x28 0.812700 0.812700 0.046681
## 3 x6 0.710595 0.710595 0.040816
## 4 x27 0.697142 0.697142 0.040043
## 5 x23 0.668373 0.668373 0.038391
##
## ---
## variable relative_importance scaled_importance percentage
## 23 x3 0.555396 0.555396 0.031901
## 24 x24 0.532795 0.532795 0.030603
## 25 x15 0.525916 0.525916 0.030208
## 26 x19 0.524819 0.524819 0.030145
## 27 x5 0.522640 0.522640 0.030020
## 28 x21 0.501666 0.501666 0.028815
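Although 1,000 epochs were requested, the scoring history stops at epoch 190: training was halted early, consistent with H2O's built-in early stopping, which is enabled by default for deep learning. The final scoring-history row also repeats the epoch-10 metrics; with the default overwrite_with_best_model = TRUE, H2O returns the model with the best validation error rather than the last iteration, which is why the reported validation logloss is 0.745 and not the much larger later values. Predictions on the test frame are then obtained with h2o.predict(); a minimal sketch of the call that produced the output below:
pred <- h2o.predict(model, test) # predicted class plus class probabilities p0, p1
pred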
## predict p0 p1
## 1 1 0.4951252 0.5048748
## 2 1 0.8113229 0.1886771
## 3 1 0.2131663 0.7868337
## 4 1 0.7058216 0.2941784
## 5 1 0.0774948 0.9225052
## 6 1 0.1871479 0.8128521
##
## [5000 rows x 3 columns]
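To recompute these metrics on the test frame directly, H2O's performance accessors can be used; a minimal sketch:
perf <- h2o.performance(model, newdata = test) # full metrics object for the test frame
h2o.auc(perf) # AUC, ~0.70 as reported on the validation data above
# h2o.shutdown(prompt = FALSE) # stop the local cluster when finished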