library(tidyverse) # many useful data manipulation packages, such as dplyr, ggplot2, and tidyr
library(synthpop) # create synthetic data
library(h2o)
# Seed R's random number generator before the synthesis step.
set.seed(12345)
# Restore the saved workspace objects; the `viable` data frame used below is
# presumably one of them (verify against the contents of the .RData file).
load("viable.sans_Braden.RData")
# Generate a synthetic version of `viable` with synthpop's default settings.
synth.viable <- syn(data = viable)
## Warning: In your synthesis there are numeric variables with 5 or fewer levels: pri, Vasopressors, lab_count_observed, rx_count_observed, BMI_observed, min_pH_observed, min_Hgb_observed, min_Alb_observed, max_Lactate_observed, min_hgb_A1C_observed, min_O2_sat_observed, min_mbp_observed, min_Abpm_observed, min_Abps_observed, min_Abpd_observed, max_WBC_observed, max_temp_observed, min_pO2_observed, min_BE_observed, max_pCO2_observed.
## Consider changing them to factors. You can do it using parameter 'minnumlevels'.
##
## Variable(s): min_dbp_observed, max_HR_observed, min_HR_observed, min_sbp_observed numeric but with only 1 or fewer distinct values turned into factor(s) for synthesis.
##
## Variable min_dbp_observed has only one value so its method has been changed to "constant".
## Variable min_dbp_observed removed as predictor because only one value.
## Variable max_HR_observed has only one value so its method has been changed to "constant".
## Variable max_HR_observed removed as predictor because only one value.
## Variable min_HR_observed has only one value so its method has been changed to "constant".
## Variable min_HR_observed removed as predictor because only one value.
## Variable min_sbp_observed has only one value so its method has been changed to "constant".
## Variable min_sbp_observed removed as predictor because only one value.
##
## Synthesis
## -----------
## pri lab_count rx_count BMI min_pH age min_Hgb min_Alb max_Lactate min_dbp
## previous_visits min_hgb_A1C max_HR min_HR min_O2_sat min_sbp min_mbp min_Abpm min_Abps min_Abpd
## max_WBC max_temp max_PEEP min_pO2 min_BE max_pCO2 Vasopressors lab_count_observed rx_count_observed BMI_observed
## min_pH_observed min_Hgb_observed min_Alb_observed max_Lactate_observed min_dbp_observed min_hgb_A1C_observed max_HR_observed min_HR_observed min_O2_sat_observed min_sbp_observed
## min_mbp_observed min_Abpm_observed min_Abps_observed min_Abpd_observed max_WBC_observed max_temp_observed min_pO2_observed min_BE_observed max_pCO2_observed
# Pull the synthesised data set out of the synthpop result and export it as a
# CSV file without a row-name column.
synth_viable <- synth.viable[["syn"]]
write.csv(synth_viable, file = "synth_viable.csv", row.names = FALSE)
# h2o datasets
# Connect to a running local H2O cluster, or start one if none is found.
# nthreads = -1 asks H2O to use all available CPU cores.
h2o.init(nthreads = -1)
## Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 1 days 21 hours
## H2O cluster timezone: America/Denver
## H2O data parsing timezone: UTC
## H2O cluster version: 3.40.0.4
## H2O cluster version age: 4 months
## H2O cluster name: H2O_started_from_R_andywilson1_iiy192
## H2O cluster total nodes: 1
## H2O cluster total memory: 7.81 GB
## H2O cluster total cores: 10
## H2O cluster allowed cores: 10
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## R Version: R version 4.3.1 (2023-06-16)
# Turn the outcome column into a factor before upload, then copy the data
# frame into the H2O cluster as an H2OFrame.
viable[["pri"]] <- as.factor(viable[["pri"]])
viable.h2o <- as.h2o(x = viable)
##
|
| | 0%
|
|======================================================================| 100%
# Response column name, and every other column of `viable` as a predictor.
y <- "pri"
x <- setdiff(colnames(viable), y)
# Echo both so the rendered report records exactly what was modelled.
print(y)
## [1] "pri"
print(x)
## [1] "lab_count" "rx_count" "BMI"
## [4] "min_pH" "age" "min_Hgb"
## [7] "min_Alb" "max_Lactate" "min_dbp"
## [10] "previous_visits" "min_hgb_A1C" "max_HR"
## [13] "min_HR" "min_O2_sat" "min_sbp"
## [16] "min_mbp" "min_Abpm" "min_Abps"
## [19] "min_Abpd" "max_WBC" "max_temp"
## [22] "max_PEEP" "min_pO2" "min_BE"
## [25] "max_pCO2" "Vasopressors" "lab_count_observed"
## [28] "rx_count_observed" "BMI_observed" "min_pH_observed"
## [31] "min_Hgb_observed" "min_Alb_observed" "max_Lactate_observed"
## [34] "min_dbp_observed" "min_hgb_A1C_observed" "max_HR_observed"
## [37] "min_HR_observed" "min_O2_sat_observed" "min_sbp_observed"
## [40] "min_mbp_observed" "min_Abpm_observed" "min_Abps_observed"
## [43] "min_Abpd_observed" "max_WBC_observed" "max_temp_observed"
## [46] "min_pO2_observed" "min_BE_observed" "max_pCO2_observed"
# Run H2O AutoML on the uploaded frame.
#   x, y            - predictor and response column names.
#   training_frame  - the H2OFrame built from `viable`.
#   balance_classes - ask H2O to balance the class distribution in training.
#   seed            - H2O runs in its own JVM and does not see R's set.seed(),
#                     so the seed is passed explicitly to make the run
#                     repeatable.
# NOTE(review): per the H2O AutoML docs, full reproducibility additionally
# requires a `max_models` limit (instead of the default time budget) and
# excluding DeepLearning via `exclude_algos` - confirm whether that is wanted.
my_automl <- h2o.automl(
  x = x,
  y = y,
  training_frame = viable.h2o,
  balance_classes = TRUE,
  seed = 12345
)
##
|
| | 0%
## 09:08:38.616: _train param, Dropping bad and constant columns: [max_HR_observed, min_HR_observed, min_sbp_observed, min_dbp_observed]
|
| | 1%
|
|= | 1%
## 09:09:19.294: _train param, Dropping bad and constant columns: [max_HR_observed, min_HR_observed, min_sbp_observed, min_dbp_observed]
|
|== | 2%
|
|== | 3%
## 09:09:23.100: _train param, Dropping bad and constant columns: [max_HR_observed, min_HR_observed, min_sbp_observed, min_dbp_observed]
|
|=== | 4%
|
|=== | 5%
|
|==== | 5%
## 09:09:33.832: _train param, Dropping unused columns: [max_HR_observed, min_HR_observed, min_sbp_observed, min_dbp_observed]
## 09:09:34.500: _train param, Dropping bad and constant columns: [max_HR_observed, min_HR_observed, min_sbp_observed, min_dbp_observed]
|
|==== | 6%
|
|===== | 8%
## 09:10:14.332: _train param, Dropping bad and constant columns: [max_HR_observed, min_HR_observed, min_sbp_observed, min_dbp_observed]
|
|====== | 8%
|
|====== | 9%
## 09:10:37.859: _train param, Dropping bad and constant columns: [max_HR_observed, min_HR_observed, min_sbp_observed, min_dbp_observed]
|
|======= | 10%
|
|======= | 11%
## 09:10:44.498: _train param, Dropping bad and constant columns: [max_HR_observed, min_HR_observed, min_sbp_observed, min_dbp_observed]
|
|======== | 11%
|
|========= | 12%
## 09:10:51.365: _train param, Dropping bad and constant columns: [max_HR_observed, min_HR_observed, min_sbp_observed, min_dbp_observed]
|
|========= | 13%
|
|========== | 14%
## 09:10:58.963: _train param, Dropping unused columns: [max_HR_observed, min_HR_observed, min_sbp_observed, min_dbp_observed]
## 09:10:59.515: _train param, Dropping unused columns: [max_HR_observed, min_HR_observed, min_sbp_observed, min_dbp_observed]
|
|============ | 17%
## 09:11:00.358: _train param, Dropping bad and constant columns: [max_HR_observed, min_HR_observed, min_sbp_observed, min_dbp_observed]
|
|============= | 18%
## 09:12:34.148: _train param, Dropping bad and constant columns: [max_HR_observed, min_HR_observed, min_sbp_observed, min_dbp_observed]
|
|============= | 19%
|
|============== | 19%
|
|============== | 20%
## 09:12:58.606: _train param, Dropping bad and constant columns: [max_HR_observed, min_HR_observed, min_sbp_observed, min_dbp_observed]
|
|============== | 21%
|
|=============== | 22%
## 09:13:04.562: _train param, Dropping bad and constant columns: [max_HR_observed, min_HR_observed, min_sbp_observed, min_dbp_observed]
|
|================ | 23%
|
|================ | 24%
|
|================== | 26%
## 09:13:14.215: _train param, Dropping unused columns: [max_HR_observed, min_HR_observed, min_sbp_observed, min_dbp_observed]
## 09:13:15.68: _train param, Dropping unused columns: [max_HR_observed, min_HR_observed, min_sbp_observed, min_dbp_observed]
|
|=================== | 27%
|
|=================== | 28%
|
|==================== | 28%
|
|==================== | 29%
|
|===================== | 29%
|
|===================== | 30%
|
|===================== | 31%
|
|====================== | 31%
|
|====================== | 32%
|
|======================= | 32%
|
|======================= | 33%
|
|======================== | 34%
|
|======================== | 35%
|
|========================= | 35%
|
|========================= | 36%
|
|========================== | 36%
|
|========================== | 37%
|
|========================== | 38%
|
|=========================== | 38%
|
|=========================== | 39%
|
|============================ | 39%
|
|============================ | 40%
|
|============================ | 41%
|
|============================= | 41%
|
|============================= | 42%
|
|============================== | 42%
|
|============================== | 43%
|
|============================== | 44%
|
|=============================== | 44%
|
|=============================== | 45%
|
|================================ | 45%
|
|================================ | 46%
|
|================================= | 46%
|
|================================= | 47%
|
|================================= | 48%
|
|================================== | 48%
|
|================================== | 49%
|
|=================================== | 49%
|
|=================================== | 50%
|
|=================================== | 51%
|
|==================================== | 51%
|
|==================================== | 52%
|
|===================================== | 52%
|
|===================================== | 53%
|
|===================================== | 54%
|
|====================================== | 54%
|
|====================================== | 55%
|
|======================================= | 55%
|
|======================================= | 56%
|
|======================================== | 56%
|
|======================================== | 57%
|
|======================================== | 58%
|
|========================================= | 58%
|
|========================================= | 59%
|
|========================================== | 59%
|
|========================================== | 60%
|
|========================================== | 61%
|
|=========================================== | 61%
|
|=========================================== | 62%
|
|============================================ | 62%
|
|============================================ | 63%
|
|============================================ | 64%
|
|============================================= | 64%
|
|============================================= | 65%
|
|============================================== | 65%
|
|============================================== | 66%
|
|=============================================== | 66%
|
|=============================================== | 67%
|
|=============================================== | 68%
|
|================================================ | 68%
|
|================================================ | 69%
|
|================================================= | 69%
|
|================================================= | 70%
|
|================================================= | 71%
|
|================================================== | 71%
|
|================================================== | 72%
|
|=================================================== | 72%
|
|=================================================== | 73%
|
|=================================================== | 74%
|
|==================================================== | 74%
|
|==================================================== | 75%
|
|===================================================== | 75%
|
|===================================================== | 76%
|
|====================================================== | 76%
|
|====================================================== | 77%
|
|====================================================== | 78%
|
|======================================================= | 78%
|
|======================================================= | 79%
|
|======================================================== | 79%
|
|======================================================== | 80%
|
|======================================================== | 81%
|
|========================================================= | 81%
|
|========================================================= | 82%
|
|========================================================== | 82%
|
|========================================================== | 83%
|
|========================================================== | 84%
|
|=========================================================== | 84%
|
|=========================================================== | 85%
|
|============================================================ | 85%
|
|============================================================ | 86%
|
|============================================================= | 86%
|
|============================================================= | 87%
|
|============================================================= | 88%
|
|============================================================== | 88%
## 10:01:24.59: _train param, Dropping unused columns: [max_HR_observed, min_HR_observed, min_sbp_observed, min_dbp_observed]
## 10:01:26.412: _train param, Dropping unused columns: [max_HR_observed, min_HR_observed, min_sbp_observed, min_dbp_observed]
|
|============================================================== | 89%
|
|=============================================================== | 89%
|
|=============================================================== | 90%
|
|=============================================================== | 91%
|
|================================================================ | 91%
|
|================================================================ | 92%
|
|================================================================= | 92%
|
|================================================================= | 93%
|
|================================================================= | 94%
|
|================================================================== | 94%
|
|================================================================== | 95%
|
|=================================================================== | 95%
|
|=================================================================== | 96%
|
|==================================================================== | 96%
|
|==================================================================== | 97%
|
|==================================================================== | 98%
## 10:07:19.77: _train param, Dropping unused columns: [max_HR_observed, min_HR_observed, min_sbp_observed, min_dbp_observed]
## 10:07:20.775: _train param, Dropping unused columns: [max_HR_observed, min_HR_observed, min_sbp_observed, min_dbp_observed]
|
|===================================================================== | 98%
## 10:07:38.554: _train param, Dropping bad and constant columns: [max_HR_observed, min_HR_observed, min_sbp_observed, min_dbp_observed]
## 10:07:44.142: _train param, Dropping unused columns: [max_HR_observed, min_HR_observed, min_sbp_observed, min_dbp_observed]
|
|===================================================================== | 99%
## 10:07:49.818: _train param, Dropping unused columns: [max_HR_observed, min_HR_observed, min_sbp_observed, min_dbp_observed]
## 10:08:06.66: _train param, Dropping unused columns: [max_HR_observed, min_HR_observed, min_sbp_observed, min_dbp_observed]
|
|======================================================================| 99%
|
|======================================================================| 100%
## 10:08:23.556: _train param, Dropping unused columns: [max_HR_observed, min_HR_observed, min_sbp_observed, min_dbp_observed]
# Write the AutoML leader model to disk; force = TRUE overwrites any existing
# model file of the same name in the target directory.
h2o.saveModel(
  object = my_automl@leader,
  path = "/Users/andywilson1/Library/CloudStorage/Box-Box/Active/MIMIC4 v.1.0",
  force = TRUE
)
## [1] "/Users/andywilson1/Library/CloudStorage/Box-Box/Active/MIMIC4 v.1.0/StackedEnsemble_BestOfFamily_5_AutoML_2_20230829_90838"
# Keep a handle on the AutoML leader model and display its details.
my_aml <- slot(my_automl, "leader")
my_aml
## Model Details:
## ==============
##
## H2OBinomialModel: stackedensemble
## Model ID: StackedEnsemble_BestOfFamily_5_AutoML_2_20230829_90838
## Model Summary for Stacked Ensemble:
## key value
## 1 Stacking strategy cross_validation
## 2 Number of base models (used / total) 5/6
## 3 # GBM base models (used / total) 1/1
## 4 # XGBoost base models (used / total) 1/1
## 5 # DeepLearning base models (used / total) 1/1
## 6 # GLM base models (used / total) 1/1
## 7 # DRF base models (used / total) 1/2
## 8 Metalearner algorithm GLM
## 9 Metalearner fold assignment scheme Random
## 10 Metalearner nfolds 5
## 11 Metalearner fold_column NA
## 12 Custom metalearner hyperparameters None
##
##
## H2OBinomialMetrics: stackedensemble
## ** Reported on training data. **
##
## MSE: 0.03784811
## RMSE: 0.1945459
## LogLoss: 0.1434172
## Mean Per-Class Error: 0.3277215
## AUC: 0.8747589
## AUCPR: 0.3653137
## Gini: 0.7495179
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## 0 1 Error Rate
## 0 9350 234 0.024416 =234/9584
## 1 301 176 0.631027 =301/477
## Totals 9651 410 0.053176 =535/10061
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.206464 0.396843 100
## 2 max f2 0.098854 0.461561 195
## 3 max f0point5 0.234175 0.447898 82
## 4 max accuracy 0.287058 0.956366 54
## 5 max precision 0.572051 1.000000 0
## 6 max recall 0.006593 1.000000 384
## 7 max specificity 0.572051 1.000000 0
## 8 max absolute_mcc 0.232268 0.372074 83
## 9 max min_per_class_accuracy 0.069838 0.798623 235
## 10 max mean_per_class_accuracy 0.062766 0.800481 246
## 11 max tns 0.572051 9584.000000 0
## 12 max fns 0.572051 476.000000 0
## 13 max fps 0.002180 9584.000000 399
## 14 max tps 0.006593 477.000000 384
## 15 max tnr 0.572051 1.000000 0
## 16 max fnr 0.572051 0.997904 0
## 17 max fpr 0.002180 1.000000 399
## 18 max tpr 0.006593 1.000000 384
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
##
## H2OBinomialMetrics: stackedensemble
## ** Reported on cross-validation data. **
## ** 5-fold cross-validation on training data (Metrics computed for combined holdout predictions) **
##
## MSE: 0.04345336
## RMSE: 0.2084547
## LogLoss: 0.1678189
## Mean Per-Class Error: 0.3506697
## AUC: 0.7949661
## AUCPR: 0.1739178
## Gini: 0.5899321
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## 0 1 Error Rate
## 0 24709 2291 0.084852 =2291/27000
## 1 860 535 0.616487 =860/1395
## Totals 25569 2826 0.110970 =3151/28395
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.122098 0.253494 173
## 2 max f2 0.075192 0.367367 233
## 3 max f0point5 0.177291 0.235789 122
## 4 max accuracy 0.432071 0.950977 8
## 5 max precision 0.607924 1.000000 0
## 6 max recall 0.003141 1.000000 395
## 7 max specificity 0.607924 1.000000 0
## 8 max absolute_mcc 0.088601 0.223842 214
## 9 max min_per_class_accuracy 0.052760 0.720430 269
## 10 max mean_per_class_accuracy 0.048055 0.726700 278
## 11 max tns 0.607924 27000.000000 0
## 12 max fns 0.607924 1394.000000 0
## 13 max fps 0.001531 27000.000000 399
## 14 max tps 0.003141 1395.000000 395
## 15 max tnr 0.607924 1.000000 0
## 16 max fnr 0.607924 0.999283 0
## 17 max fpr 0.001531 1.000000 399
## 18 max tpr 0.003141 1.000000 395
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## Cross-Validation Metrics Summary:
## mean sd cv_1_valid cv_2_valid cv_3_valid cv_4_valid
## accuracy 0.891255 0.031951 0.891084 0.905212 0.914703 0.836268
## auc 0.794948 0.010558 0.808681 0.803620 0.786388 0.790573
## err 0.108745 0.031951 0.108916 0.094788 0.085297 0.163732
## err_count 617.600000 182.118910 623.000000 542.000000 496.000000 930.000000
## f0point5 0.223991 0.026914 0.238187 0.237000 0.235394 0.175953
## cv_5_valid
## accuracy 0.909008
## auc 0.785479
## err 0.090992
## err_count 497.000000
## f0point5 0.233422
##
## ---
## mean sd cv_1_valid cv_2_valid cv_3_valid
## precision 0.205309 0.031028 0.215412 0.216842 0.226158
## r2 0.069315 0.011807 0.083996 0.076416 0.068726
## recall 0.390476 0.104018 0.412752 0.377289 0.281356
## residual_deviance 1906.087500 79.260020 1966.611700 1857.442500 2014.034400
## rmse 0.208411 0.004117 0.212687 0.204915 0.211772
## specificity 0.917314 0.039417 0.917374 0.931680 0.948551
## cv_4_valid cv_5_valid
## precision 0.150313 0.217822
## r2 0.052944 0.064493
## recall 0.553846 0.327138
## residual_deviance 1834.611100 1857.737900
## rmse 0.203388 0.209294
## specificity 0.849815 0.939149