## Warning: package 'jsonlite' was built under R version 3.6.3
##
## ----------------------------------------------------------------------
##
## Your next step is to start H2O:
## > h2o.init()
##
## For H2O package documentation, ask for help:
## > ??h2o
##
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit http://docs.h2o.ai
##
## ----------------------------------------------------------------------
##
## Attaching package: 'h2o'
## The following objects are masked from 'package:stats':
##
## cor, sd, var
## The following objects are masked from 'package:base':
##
## %*%, %in%, &&, ||, apply, as.factor, as.numeric, colnames,
## colnames<-, ifelse, is.character, is.factor, is.numeric, log,
## log10, log1p, log2, round, signif, trunc
# Request the COVID-19 case series for country code "BR" from the API.
payload <- list(code = "BR")
response <- httr::POST(
  url = "https://api.statworx.com/covid",
  body = jsonlite::toJSON(payload, auto_unbox = TRUE),
  encode = "json"
)
# Fail fast on HTTP errors (4xx/5xx) instead of trying to parse an error
# body as if it were the data payload.
httr::stop_for_status(response)
# Decode the raw response body and turn the parsed JSON into a data frame.
content <- rawToChar(response$content)
df <- data.frame(jsonlite::fromJSON(content))
# Show the most recent rows as a quick sanity check.
tail(df)
## date day month year cases deaths country code population continentExp
## 115 2020-04-23 23 4 2020 2678 165 Brazil BR 209469333 America
## 116 2020-04-24 24 4 2020 3735 407 Brazil BR 209469333 America
## 117 2020-04-25 25 4 2020 3503 357 Brazil BR 209469333 America
## 118 2020-04-26 26 4 2020 5514 346 Brazil BR 209469333 America
## 119 2020-04-27 27 4 2020 3379 189 Brazil BR 209469333 America
## 120 2020-04-28 28 4 2020 4613 338 Brazil BR 209469333 America
## cases_cum deaths_cum
## 115 45757 2906
## 116 49492 3313
## 117 52995 3670
## 118 58509 4016
## 119 61888 4205
## 120 66501 4543
##
## H2O is not running yet, starting it now...
##
## Note: In case of errors look at the following log files:
## C:\Users\kwilliam\AppData\Local\Temp\Rtmp0aehH2\file369c7b7a5ede/h2o_kwilliam_started_from_r.out
## C:\Users\kwilliam\AppData\Local\Temp\Rtmp0aehH2\file369c66d34640/h2o_kwilliam_started_from_r.err
##
##
## Starting H2O JVM and connecting: . Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 7 seconds 848 milliseconds
## H2O cluster timezone: America/New_York
## H2O data parsing timezone: UTC
## H2O cluster version: 3.30.0.1
## H2O cluster version age: 25 days
## H2O cluster name: H2O_started_from_R_kwilliam_ddz265
## H2O cluster total nodes: 1
## H2O cluster total memory: 1.76 GB
## H2O cluster total cores: 4
## H2O cluster allowed cores: 4
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 12321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## H2O API Extensions: Amazon S3, Algos, AutoML, Core V3, TargetEncoder, Core V4
## R Version: R version 3.6.1 (2019-07-05)
# Work on a copy so the parsed API result in `df` stays untouched.
conv_data <- df
# Columns that are recoded as factors before the frame is sent to H2O.
# NOTE(review): this list includes count-like columns and `cases_cum`, which
# is used as the modelling target further down; with a factor target AutoML
# treats the task as multi-class classification -- confirm this is intended.
factor_cols <- c(
  "date", "day", "month", "year", "cases", "deaths", "country", "code",
  "population", "continentExp", "cases_cum", "deaths_cum"
)
conv_data[factor_cols] <- lapply(conv_data[factor_cols], as.factor)
# Row identifier. seq_len() yields an empty vector for a zero-row frame,
# whereas 1:nrow() would produce c(1, 0) and make the assignment fail.
conv_data$ID <- seq_len(nrow(conv_data))
# Upload the prepared data frame to the H2O cluster.
conv_data.hex <- as.h2o(conv_data)
##
|
| | 0%
|
|======================================================================| 100%
# Partition the H2O frame into three parts with nominal shares of
# 60% / 20% / 20% (the last part is whatever the two ratios leave over).
split_h2o <- h2o.splitFrame(
  data = conv_data.hex,
  ratios = c(0.6, 0.2),
  seed = 1234
)
# Register each partition in the cluster under a readable key.
train_conv_h2o <- h2o.assign(data = split_h2o[[1]], key = "train")
valid_conv_h2o <- h2o.assign(data = split_h2o[[2]], key = "valid")
test_conv_h2o <- h2o.assign(data = split_h2o[[3]], key = "test")
# Response column, and every remaining column as a predictor.
target <- "cases_cum"
predictors <- setdiff(names(train_conv_h2o), target)
# Run AutoML on the training partition; the validation partition is passed
# as the leaderboard frame.
automl_h2o_models <- h2o.automl(
  x = predictors,
  y = target,
  training_frame = train_conv_h2o,
  leaderboard_frame = valid_conv_h2o
)
##
|
| | 0%
## 02:38:41.305: AutoML: XGBoost is not available; skipping it.
|
|====== | 9%
|
|========= | 12%
|
|============ | 17%
|
|=============== | 21%
|
|================== | 25%
## 02:39:11.524: Skipping training of model GBM_5_AutoML_20200429_023841 due to exception: water.exceptions.H2OModelBuilderIllegalArgumentException: Illegal argument(s) for GBM model: GBM_5_AutoML_20200429_023841. Details: ERRR on field: _min_rows: The dataset size is too small to split for min_rows=100.0: must have at least 200.0 (weighted) rows, but have only 73.0.
##
|
|=========================== | 39%
|
|============================ | 40%
|
|============================= | 41%
|
|============================================== | 65%
|
|============================================== | 66%
|
|=============================================== | 67%
|
|==================================================== | 74%
|
|==================================================== | 75%
|
|===================================================== | 75%
|
|===================================================== | 76%
|
|====================================================== | 77%
|
|====================================================== | 78%
|
|======================================================= | 78%
|
|======================================================= | 79%
|
|======================================================== | 80%
|
|========================================================= | 81%
|
|========================================================= | 82%
|
|========================================================== | 82%
|
|========================================================== | 83%
|
|=========================================================== | 84%
|
|=========================================================== | 85%
|
|============================================================ | 86%
|
|============================================================= | 87%
|
|============================================================== | 89%
|
|=============================================================== | 90%
|
|================================================================ | 91%
|
|=================================================================== | 96%
|
|======================================================================| 100%
# Display the leaderboard of models produced by the AutoML run.
automl_h2o_models@leaderboard
## model_id mean_per_class_error
## 1 DeepLearning_grid__2_AutoML_20200429_023841_model_5 0.1785714
## 2 DeepLearning_grid__2_AutoML_20200429_023841_model_8 0.1785714
## 3 GBM_grid__1_AutoML_20200429_023841_model_9 0.1785714
## 4 DeepLearning_grid__2_AutoML_20200429_023841_model_3 0.1785714
## 5 GBM_grid__1_AutoML_20200429_023841_model_57 0.1785714
## 6 GBM_grid__1_AutoML_20200429_023841_model_32 0.1785714
## logloss rmse mse
## 1 3.568392 0.6894312 0.4753153
## 2 4.146939 0.6898032 0.4758285
## 3 1.523864 0.6612382 0.4372359
## 4 3.558740 0.6895443 0.4754714
## 5 1.698843 0.6874935 0.4726473
## 6 1.425712 0.6546192 0.4285263
##
## [84 rows x 5 columns]
# Take the leader model from the AutoML result and print its details.
automl_leader <- automl_h2o_models@leader
automl_leader
## Model Details:
## ==============
##
## H2OMultinomialModel: deeplearning
## Model ID: DeepLearning_grid__2_AutoML_20200429_023841_model_5
## Status of Neuron Layers: predicting cases_cum, 56-class classification, multinomial distribution, CrossEntropy loss, 428,056 weights/biases, 4.9 MB, 5,840 training samples, mini-batch size 1
## layer units type dropout l1 l2 mean_rate rate_rms
## 1 1 298 Input 0.00 % NA NA NA NA
## 2 2 500 RectifierDropout 20.00 % 0.000000 0.000000 0.516914 0.449797
## 3 3 500 RectifierDropout 20.00 % 0.000000 0.000000 0.413334 0.352939
## 4 4 56 Softmax NA 0.000000 0.000000 0.899176 0.214394
## momentum mean_weight weight_rms mean_bias bias_rms
## 1 NA NA NA NA NA
## 2 0.000000 -0.002748 0.055047 0.083972 0.124599
## 3 0.000000 -0.017824 0.060834 0.896688 0.116211
## 4 0.000000 -0.027620 0.238580 -0.068496 0.036251
##
##
## H2OMultinomialMetrics: deeplearning
## ** Reported on training data. **
## ** Metrics reported on full training frame **
##
## Training Set Metrics:
## =====================
##
## Extract training frame with `h2o.getFrame("automl_training_train")`
## MSE: (Extract with `h2o.mse`) 1.555199e-05
## RMSE: (Extract with `h2o.rmse`) 0.003943601
## Logloss: (Extract with `h2o.logloss`) 0.0007089488
## Mean Per-Class Error: 0
## Confusion Matrix: Extract with `h2o.confusionMatrix(<model>,train = TRUE)`)
## =========================================================================
## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
## 0 1 10278 11130 1128 12056 121 13 13717 1546 15927 17857 1891 19638 2
## 0 29 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 1 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0
## 10278 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 11130 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
## 1128 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
## 200 20727 2201 22169 234 23430 2433 25 25262 28320 291 2915 3 30425 33682
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 10278 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 11130 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 1128 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 34 3417 36599 38654 3904 40581 4256 428 43079 45757 4579 49492 52 52995
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 10278 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 11130 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 1128 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 5717 58509 61888 621 66501 6836 77 7910 8 904 9056 98 Error Rate
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0.0000 = 0 / 29
## 1 0 0 0 0 0 0 0 0 0 0 0 0 0.0000 = 0 / 3
## 10278 0 0 0 0 0 0 0 0 0 0 0 0 NA = 0 / 0
## 11130 0 0 0 0 0 0 0 0 0 0 0 0 0.0000 = 0 / 1
## 1128 0 0 0 0 0 0 0 0 0 0 0 0 0.0000 = 0 / 1
##
## ---
## 0 1 10278 11130 1128 12056 121 13 13717 1546 15927 17857 1891 19638 2
## 7910 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 8 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 904 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 9056 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 98 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## Totals 29 3 0 1 1 0 1 2 1 0 1 1 1 1 3
## 200 20727 2201 22169 234 23430 2433 25 25262 28320 291 2915 3 30425
## 7910 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 8 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 904 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 9056 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 98 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## Totals 1 1 1 1 0 1 1 2 0 0 1 0 1 1
## 33682 34 3417 36599 38654 3904 40581 4256 428 43079 45757 4579 49492 52
## 7910 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 8 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 904 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 9056 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 98 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## Totals 1 1 0 0 1 0 1 0 1 1 0 1 1 0
## 52995 5717 58509 61888 621 66501 6836 77 7910 8 904 9056 98 Error
## 7910 0 0 0 0 0 0 0 0 1 0 0 0 0 0.0000
## 8 0 0 0 0 0 0 0 0 0 0 0 0 0 NA
## 904 0 0 0 0 0 0 0 0 0 0 1 0 0 0.0000
## 9056 0 0 0 0 0 0 0 0 0 0 0 1 0 0.0000
## 98 0 0 0 0 0 0 0 0 0 0 0 0 0 NA
## Totals 1 1 1 0 1 0 1 1 1 0 1 1 0 0.0000
## Rate
## 7910 = 0 / 1
## 8 = 0 / 0
## 904 = 0 / 1
## 9056 = 0 / 1
## 98 = 0 / 0
## Totals = 0 / 73
##
## Hit Ratio Table: Extract with `h2o.hit_ratio_table(<model>,train = TRUE)`
## =======================================================================
## Top-10 Hit Ratios:
## k hit_ratio
## 1 1 1.000000
## 2 2 1.000000
## 3 3 1.000000
## 4 4 1.000000
## 5 5 1.000000
## 6 6 1.000000
## 7 7 1.000000
## 8 8 1.000000
## 9 9 1.000000
## 10 10 1.000000
##
##
##
## H2OMultinomialMetrics: deeplearning
## ** Reported on cross-validation data. **
## ** 5-fold cross-validation on training data (Metrics computed for combined holdout predictions) **
##
## Cross-Validation Set Metrics:
## =====================
##
## Extract cross-validation frame with `h2o.getFrame("automl_training_train")`
## MSE: (Extract with `h2o.mse`) 0.5748294
## RMSE: (Extract with `h2o.rmse`) 0.7581751
## Logloss: (Extract with `h2o.logloss`) 4.242266
## Mean Per-Class Error: 0.6666667
## Hit Ratio Table: Extract with `h2o.hit_ratio_table(<model>,xval = TRUE)`
## =======================================================================
## Top-10 Hit Ratios:
## k hit_ratio
## 1 1 0.424658
## 2 2 0.465753
## 3 3 0.479452
## 4 4 0.479452
## 5 5 0.506849
## 6 6 0.506849
## 7 7 0.506849
## 8 8 0.520548
## 9 9 0.534247
## 10 10 0.534247
##
##
## Cross-Validation Metrics Summary:
## mean sd cv_1_valid cv_2_valid cv_3_valid
## accuracy 0.42571428 0.043330718 0.4 0.4 0.4
## err 0.5742857 0.043330718 0.6 0.6 0.6
## err_count 8.4 0.8944272 9.0 9.0 9.0
## logloss 4.23353 0.3719279 4.7857876 4.355686 4.1968155
## max_per_class_error 1.0 0.0 1.0 1.0 1.0
## mean_per_class_accuracy 0.85 0.015971914 0.83928573 0.83928573 0.83928573
## mean_per_class_error 0.15 0.015971914 0.16071428 0.16071428 0.16071428
## mse 0.5739241 0.036900133 0.5990403 0.59517324 0.5936449
## r2 0.9979163 5.239575E-4 0.9980201 0.9985814 0.9981937
## rmse 0.7572528 0.024806777 0.7739769 0.7714747 0.77048355
## cv_4_valid cv_5_valid
## accuracy 0.5 0.42857143
## err 0.5 0.5714286
## err_count 7.0 8.0
## logloss 3.7948205 4.0345407
## max_per_class_error 1.0 1.0
## mean_per_class_accuracy 0.875 0.85714287
## mean_per_class_error 0.125 0.14285715
## mse 0.5109517 0.5708106
## r2 0.99729186 0.9974943
## rmse 0.7148088 0.7555201
# Score the test partition with the AutoML leader model.
pred_conversion <- h2o.predict(
  object = automl_leader,
  newdata = test_conv_h2o
)
##
|
| | 0%
|
|======================================================================| 100%
# Cross-tabulate predicted against actual classes on the test partition.
# The response column is `cases_cum`; the frame has no `converted` column,
# so referencing it only produced a one-way count of the predictions.
h2o.table(pred_conversion$predict, test_conv_h2o$cases_cum)
## predict Count
## 1 0 18
## 2 2 4
## 3 200 1
## 4 23430 2
## 5 40581 1
##
## [5 rows x 2 columns]
# Evaluate the leader on the held-out test partition only. Scoring the full
# frame (conv_data.hex) would include the rows the model was trained on and
# overstate its performance.
perf <- h2o.performance(model = automl_leader, newdata = test_conv_h2o)
# Confusion matrix of actual vs. predicted classes for that evaluation.
h2o.confusionMatrix(perf)
## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
## 0 1 10278 11130 1128 12056 121 13 13717 1546 15927 17857 1891 19638 2
## 0 57 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 1 1 3 0 0 0 0 0 0 0 0 0 0 0 0 0
## 10278 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 11130 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
## 1128 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
## 200 20727 2201 22169 234 23430 2433 25 25262 28320 291 2915 3 30425 33682
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 10278 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
## 11130 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 1128 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 34 3417 36599 38654 3904 40581 4256 428 43079 45757 4579 49492 52 52995
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 10278 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 11130 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 1128 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 5717 58509 61888 621 66501 6836 77 7910 8 904 9056 98 Error Rate
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0.0000 = 0 / 57
## 1 0 0 0 0 0 0 0 0 0 0 0 0 0.2500 = 1 / 4
## 10278 0 0 0 0 0 0 0 0 0 0 0 0 1.0000 = 1 / 1
## 11130 0 0 0 0 0 0 0 0 0 0 0 0 0.0000 = 0 / 1
## 1128 0 0 0 0 0 0 0 0 0 0 0 0 0.0000 = 0 / 1
##
## ---
## 0 1 10278 11130 1128 12056 121 13 13717 1546 15927 17857 1891 19638 2
## 7910 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 8 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
## 904 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 9056 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 98 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
## Totals 58 4 0 1 1 0 1 2 1 0 1 1 1 1 10
## 200 20727 2201 22169 234 23430 2433 25 25262 28320 291 2915 3 30425
## 7910 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 8 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 904 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 9056 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 98 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## Totals 2 1 1 1 0 8 1 2 0 0 2 0 1 1
## 33682 34 3417 36599 38654 3904 40581 4256 428 43079 45757 4579 49492 52
## 7910 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 8 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 904 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 9056 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 98 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## Totals 1 1 0 0 1 0 2 0 1 1 0 1 1 0
## 52995 5717 58509 61888 621 66501 6836 77 7910 8 904 9056 98 Error
## 7910 0 0 0 0 0 0 0 0 1 0 0 0 0 0.0000
## 8 0 0 0 0 0 0 0 0 0 0 0 0 0 1.0000
## 904 0 0 0 0 0 0 0 0 0 0 1 0 0 0.0000
## 9056 0 0 0 0 0 0 0 0 0 0 0 1 0 0.0000
## 98 0 0 0 0 0 0 0 0 0 0 0 0 0 1.0000
## Totals 1 1 1 0 1 0 1 1 1 0 1 1 0 0.1500
## Rate
## 7910 = 0 / 1
## 8 = 1 / 1
## 904 = 0 / 1
## 9056 = 0 / 1
## 98 = 1 / 1
## Totals = 18 / 120
# Visualise the AutoML leader model (dispatches to the plot method
# registered for the model's class).
plot(automl_leader)