Confirmed Cumulative Cases of COVID-19 in Brazil

## Warning: package 'jsonlite' was built under R version 3.6.3

## 
## ----------------------------------------------------------------------
## 
## Your next step is to start H2O:
##     > h2o.init()
## 
## For H2O package documentation, ask for help:
##     > ??h2o
## 
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit http://docs.h2o.ai
## 
## ----------------------------------------------------------------------

## 
## Attaching package: 'h2o'

## The following objects are masked from 'package:stats':
## 
##     cor, sd, var

## The following objects are masked from 'package:base':
## 
##     %*%, %in%, &&, ||, apply, as.factor, as.numeric, colnames,
##     colnames<-, ifelse, is.character, is.factor, is.numeric, log,
##     log10, log1p, log2, round, signif, trunc

# Request the COVID-19 records for country code "BR" from the API.
payload <- list(code = "BR")
response <- httr::POST(
  url = "https://api.statworx.com/covid",
  body = toJSON(payload, auto_unbox = TRUE),
  encode = "json"
)

# Stop with an informative error on an HTTP failure (4xx/5xx) instead of
# trying to parse an error body as if it were data.
httr::stop_for_status(response)

# Decode the response body as UTF-8 text, then parse the JSON into a data frame.
content <- httr::content(response, as = "text", encoding = "UTF-8")
df <- data.frame(fromJSON(content))

# Show the most recent rows as a quick sanity check of the download.
tail(df)

##           date day month year cases deaths country code population continentExp
## 115 2020-04-23  23     4 2020  2678    165  Brazil   BR  209469333      America
## 116 2020-04-24  24     4 2020  3735    407  Brazil   BR  209469333      America
## 117 2020-04-25  25     4 2020  3503    357  Brazil   BR  209469333      America
## 118 2020-04-26  26     4 2020  5514    346  Brazil   BR  209469333      America
## 119 2020-04-27  27     4 2020  3379    189  Brazil   BR  209469333      America
## 120 2020-04-28  28     4 2020  4613    338  Brazil   BR  209469333      America
##     cases_cum deaths_cum
## 115     45757       2906
## 116     49492       3313
## 117     52995       3670
## 118     58509       4016
## 119     61888       4205
## 120     66501       4543

## 
## H2O is not running yet, starting it now...
## 
## Note:  In case of errors look at the following log files:
##     C:\Users\kwilliam\AppData\Local\Temp\Rtmp0aehH2\file369c7b7a5ede/h2o_kwilliam_started_from_r.out
##     C:\Users\kwilliam\AppData\Local\Temp\Rtmp0aehH2\file369c66d34640/h2o_kwilliam_started_from_r.err
## 
## 
## Starting H2O JVM and connecting: . Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         7 seconds 848 milliseconds 
##     H2O cluster timezone:       America/New_York 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.30.0.1 
##     H2O cluster version age:    25 days  
##     H2O cluster name:           H2O_started_from_R_kwilliam_ddz265 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   1.76 GB 
##     H2O cluster total cores:    4 
##     H2O cluster allowed cores:  4 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        12321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     H2O API Extensions:         Amazon S3, Algos, AutoML, Core V3, TargetEncoder, Core V4 
##     R Version:                  R version 3.6.1 (2019-07-05)

Converting Variables into Factors

# Work on a copy so the downloaded data in `df` stays untouched.
conv_data <- df

# Columns recoded as categorical (factor) before the upload to H2O.
factor_cols <- c(
  "date", "day", "month", "year", "cases", "deaths", "country", "code",
  "population", "continentExp", "cases_cum", "deaths_cum"
)
conv_data[factor_cols] <- lapply(conv_data[factor_cols], as.factor)

# NOTE(review): `cases_cum` is the modelling target further down; as a factor
# it is handled as a categorical response (one class per distinct value)
# rather than as a number -- confirm that this is intended.

# Row identifier; seq_len() also behaves correctly for a zero-row frame,
# where 1:nrow() would yield c(1, 0).
conv_data$ID <- seq_len(nrow(conv_data))

# Upload the prepared data frame to the H2O cluster.
conv_data.hex <- as.h2o(conv_data)

## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%

Split Data into Train/Validation/Test Sets

# Partition the uploaded frame into three pieces: the two ratios below size
# the first two pieces and the remaining rows form the third. The fixed seed
# keeps the partition repeatable.
split_h2o <- h2o.splitFrame(
  data   = conv_data.hex,
  ratios = c(0.6, 0.2),
  seed   = 1234
)

# Store each piece in the cluster under a readable key.
train_conv_h2o <- h2o.assign(data = split_h2o[[1]], key = "train")  # ratio 0.6
valid_conv_h2o <- h2o.assign(data = split_h2o[[2]], key = "valid")  # ratio 0.2
test_conv_h2o  <- h2o.assign(data = split_h2o[[3]], key = "test")   # remainder

Cases Cumulative Model

# Response column for the model.
target <- "cases_cum"

# Every other column of the training frame is offered as a predictor.
predictors <- setdiff(x = names(train_conv_h2o), y = target)

Run Model

# Run H2O AutoML: models are trained on the training split and ranked on the
# validation split via the leaderboard frame.
automl_h2o_models <- h2o.automl(
  x = predictors,
  y = target,
  training_frame    = train_conv_h2o,
  leaderboard_frame = valid_conv_h2o,
  # Same seed as the data split, so repeated runs start from the same state.
  # NOTE(review): a seed alone may not make a time-limited AutoML run fully
  # reproducible; consider also capping the run with `max_models` -- confirm.
  seed = 1234
)

## 
  |                                                                            
  |                                                                      |   0%
## 02:38:41.305: AutoML: XGBoost is not available; skipping it.
  |                                                                            
  |======                                                                |   9%
  |                                                                            
  |=========                                                             |  12%
  |                                                                            
  |============                                                          |  17%
  |                                                                            
  |===============                                                       |  21%
  |                                                                            
  |==================                                                    |  25%
## 02:39:11.524: Skipping training of model GBM_5_AutoML_20200429_023841 due to exception: water.exceptions.H2OModelBuilderIllegalArgumentException: Illegal argument(s) for GBM model: GBM_5_AutoML_20200429_023841.  Details: ERRR on field: _min_rows: The dataset size is too small to split for min_rows=100.0: must have at least 200.0 (weighted) rows, but have only 73.0.
## 
  |                                                                            
  |===========================                                           |  39%
  |                                                                            
  |============================                                          |  40%
  |                                                                            
  |=============================                                         |  41%
  |                                                                            
  |==============================================                        |  65%
  |                                                                            
  |==============================================                        |  66%
  |                                                                            
  |===============================================                       |  67%
  |                                                                            
  |====================================================                  |  74%
  |                                                                            
  |====================================================                  |  75%
  |                                                                            
  |=====================================================                 |  75%
  |                                                                            
  |=====================================================                 |  76%
  |                                                                            
  |======================================================                |  77%
  |                                                                            
  |======================================================                |  78%
  |                                                                            
  |=======================================================               |  78%
  |                                                                            
  |=======================================================               |  79%
  |                                                                            
  |========================================================              |  80%
  |                                                                            
  |=========================================================             |  81%
  |                                                                            
  |=========================================================             |  82%
  |                                                                            
  |==========================================================            |  82%
  |                                                                            
  |==========================================================            |  83%
  |                                                                            
  |===========================================================           |  84%
  |                                                                            
  |===========================================================           |  85%
  |                                                                            
  |============================================================          |  86%
  |                                                                            
  |=============================================================         |  87%
  |                                                                            
  |==============================================================        |  89%
  |                                                                            
  |===============================================================       |  90%
  |                                                                            
  |================================================================      |  91%
  |                                                                            
  |===================================================================   |  96%
  |                                                                            
  |======================================================================| 100%

# Display the AutoML leaderboard (the ranked table of trained models).
print(automl_h2o_models@leaderboard)

##                                              model_id mean_per_class_error
## 1 DeepLearning_grid__2_AutoML_20200429_023841_model_5            0.1785714
## 2 DeepLearning_grid__2_AutoML_20200429_023841_model_8            0.1785714
## 3          GBM_grid__1_AutoML_20200429_023841_model_9            0.1785714
## 4 DeepLearning_grid__2_AutoML_20200429_023841_model_3            0.1785714
## 5         GBM_grid__1_AutoML_20200429_023841_model_57            0.1785714
## 6         GBM_grid__1_AutoML_20200429_023841_model_32            0.1785714
##    logloss      rmse       mse
## 1 3.568392 0.6894312 0.4753153
## 2 4.146939 0.6898032 0.4758285
## 3 1.523864 0.6612382 0.4372359
## 4 3.558740 0.6895443 0.4754714
## 5 1.698843 0.6874935 0.4726473
## 6 1.425712 0.6546192 0.4285263
## 
## [84 rows x 5 columns]

Extract Leader Model

# Keep the top-ranked model; the surrounding parentheses make the assignment
# visible so the model summary is printed as well.
(automl_leader <- automl_h2o_models@leader)

## Model Details:
## ==============
## 
## H2OMultinomialModel: deeplearning
## Model ID:  DeepLearning_grid__2_AutoML_20200429_023841_model_5 
## Status of Neuron Layers: predicting cases_cum, 56-class classification, multinomial distribution, CrossEntropy loss, 428,056 weights/biases, 4.9 MB, 5,840 training samples, mini-batch size 1
##   layer units             type dropout       l1       l2 mean_rate rate_rms
## 1     1   298            Input  0.00 %       NA       NA        NA       NA
## 2     2   500 RectifierDropout 20.00 % 0.000000 0.000000  0.516914 0.449797
## 3     3   500 RectifierDropout 20.00 % 0.000000 0.000000  0.413334 0.352939
## 4     4    56          Softmax      NA 0.000000 0.000000  0.899176 0.214394
##   momentum mean_weight weight_rms mean_bias bias_rms
## 1       NA          NA         NA        NA       NA
## 2 0.000000   -0.002748   0.055047  0.083972 0.124599
## 3 0.000000   -0.017824   0.060834  0.896688 0.116211
## 4 0.000000   -0.027620   0.238580 -0.068496 0.036251
## 
## 
## H2OMultinomialMetrics: deeplearning
## ** Reported on training data. **
## ** Metrics reported on full training frame **
## 
## Training Set Metrics: 
## =====================
## 
## Extract training frame with `h2o.getFrame("automl_training_train")`
## MSE: (Extract with `h2o.mse`) 1.555199e-05
## RMSE: (Extract with `h2o.rmse`) 0.003943601
## Logloss: (Extract with `h2o.logloss`) 0.0007089488
## Mean Per-Class Error: 0
## Confusion Matrix: Extract with `h2o.confusionMatrix(<model>,train = TRUE)`)
## =========================================================================
## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
##        0 1 10278 11130 1128 12056 121 13 13717 1546 15927 17857 1891 19638 2
## 0     29 0     0     0    0     0   0  0     0    0     0     0    0     0 0
## 1      0 3     0     0    0     0   0  0     0    0     0     0    0     0 0
## 10278  0 0     0     0    0     0   0  0     0    0     0     0    0     0 0
## 11130  0 0     0     1    0     0   0  0     0    0     0     0    0     0 0
## 1128   0 0     0     0    1     0   0  0     0    0     0     0    0     0 0
##       200 20727 2201 22169 234 23430 2433 25 25262 28320 291 2915 3 30425 33682
## 0       0     0    0     0   0     0    0  0     0     0   0    0 0     0     0
## 1       0     0    0     0   0     0    0  0     0     0   0    0 0     0     0
## 10278   0     0    0     0   0     0    0  0     0     0   0    0 0     0     0
## 11130   0     0    0     0   0     0    0  0     0     0   0    0 0     0     0
## 1128    0     0    0     0   0     0    0  0     0     0   0    0 0     0     0
##       34 3417 36599 38654 3904 40581 4256 428 43079 45757 4579 49492 52 52995
## 0      0    0     0     0    0     0    0   0     0     0    0     0  0     0
## 1      0    0     0     0    0     0    0   0     0     0    0     0  0     0
## 10278  0    0     0     0    0     0    0   0     0     0    0     0  0     0
## 11130  0    0     0     0    0     0    0   0     0     0    0     0  0     0
## 1128   0    0     0     0    0     0    0   0     0     0    0     0  0     0
##       5717 58509 61888 621 66501 6836 77 7910 8 904 9056 98  Error     Rate
## 0        0     0     0   0     0    0  0    0 0   0    0  0 0.0000 = 0 / 29
## 1        0     0     0   0     0    0  0    0 0   0    0  0 0.0000 =  0 / 3
## 10278    0     0     0   0     0    0  0    0 0   0    0  0     NA =  0 / 0
## 11130    0     0     0   0     0    0  0    0 0   0    0  0 0.0000 =  0 / 1
## 1128     0     0     0   0     0    0  0    0 0   0    0  0 0.0000 =  0 / 1
## 
## ---
##         0 1 10278 11130 1128 12056 121 13 13717 1546 15927 17857 1891 19638 2
## 7910    0 0     0     0    0     0   0  0     0    0     0     0    0     0 0
## 8       0 0     0     0    0     0   0  0     0    0     0     0    0     0 0
## 904     0 0     0     0    0     0   0  0     0    0     0     0    0     0 0
## 9056    0 0     0     0    0     0   0  0     0    0     0     0    0     0 0
## 98      0 0     0     0    0     0   0  0     0    0     0     0    0     0 0
## Totals 29 3     0     1    1     0   1  2     1    0     1     1    1     1 3
##        200 20727 2201 22169 234 23430 2433 25 25262 28320 291 2915 3 30425
## 7910     0     0    0     0   0     0    0  0     0     0   0    0 0     0
## 8        0     0    0     0   0     0    0  0     0     0   0    0 0     0
## 904      0     0    0     0   0     0    0  0     0     0   0    0 0     0
## 9056     0     0    0     0   0     0    0  0     0     0   0    0 0     0
## 98       0     0    0     0   0     0    0  0     0     0   0    0 0     0
## Totals   1     1    1     1   0     1    1  2     0     0   1    0 1     1
##        33682 34 3417 36599 38654 3904 40581 4256 428 43079 45757 4579 49492 52
## 7910       0  0    0     0     0    0     0    0   0     0     0    0     0  0
## 8          0  0    0     0     0    0     0    0   0     0     0    0     0  0
## 904        0  0    0     0     0    0     0    0   0     0     0    0     0  0
## 9056       0  0    0     0     0    0     0    0   0     0     0    0     0  0
## 98         0  0    0     0     0    0     0    0   0     0     0    0     0  0
## Totals     1  1    0     0     1    0     1    0   1     1     0    1     1  0
##        52995 5717 58509 61888 621 66501 6836 77 7910 8 904 9056 98  Error
## 7910       0    0     0     0   0     0    0  0    1 0   0    0  0 0.0000
## 8          0    0     0     0   0     0    0  0    0 0   0    0  0     NA
## 904        0    0     0     0   0     0    0  0    0 0   1    0  0 0.0000
## 9056       0    0     0     0   0     0    0  0    0 0   0    1  0 0.0000
## 98         0    0     0     0   0     0    0  0    0 0   0    0  0     NA
## Totals     1    1     1     0   1     0    1  1    1 0   1    1  0 0.0000
##            Rate
## 7910   =  0 / 1
## 8      =  0 / 0
## 904    =  0 / 1
## 9056   =  0 / 1
## 98     =  0 / 0
## Totals = 0 / 73
## 
## Hit Ratio Table: Extract with `h2o.hit_ratio_table(<model>,train = TRUE)`
## =======================================================================
## Top-10 Hit Ratios: 
##     k hit_ratio
## 1   1  1.000000
## 2   2  1.000000
## 3   3  1.000000
## 4   4  1.000000
## 5   5  1.000000
## 6   6  1.000000
## 7   7  1.000000
## 8   8  1.000000
## 9   9  1.000000
## 10 10  1.000000
## 
## 
## 
## H2OMultinomialMetrics: deeplearning
## ** Reported on cross-validation data. **
## ** 5-fold cross-validation on training data (Metrics computed for combined holdout predictions) **
## 
## Cross-Validation Set Metrics: 
## =====================
## 
## Extract cross-validation frame with `h2o.getFrame("automl_training_train")`
## MSE: (Extract with `h2o.mse`) 0.5748294
## RMSE: (Extract with `h2o.rmse`) 0.7581751
## Logloss: (Extract with `h2o.logloss`) 4.242266
## Mean Per-Class Error: 0.6666667
## Hit Ratio Table: Extract with `h2o.hit_ratio_table(<model>,xval = TRUE)`
## =======================================================================
## Top-10 Hit Ratios: 
##     k hit_ratio
## 1   1  0.424658
## 2   2  0.465753
## 3   3  0.479452
## 4   4  0.479452
## 5   5  0.506849
## 6   6  0.506849
## 7   7  0.506849
## 8   8  0.520548
## 9   9  0.534247
## 10 10  0.534247
## 
## 
## Cross-Validation Metrics Summary: 
##                               mean          sd cv_1_valid cv_2_valid cv_3_valid
## accuracy                0.42571428 0.043330718        0.4        0.4        0.4
## err                      0.5742857 0.043330718        0.6        0.6        0.6
## err_count                      8.4   0.8944272        9.0        9.0        9.0
## logloss                    4.23353   0.3719279  4.7857876   4.355686  4.1968155
## max_per_class_error            1.0         0.0        1.0        1.0        1.0
## mean_per_class_accuracy       0.85 0.015971914 0.83928573 0.83928573 0.83928573
## mean_per_class_error          0.15 0.015971914 0.16071428 0.16071428 0.16071428
## mse                      0.5739241 0.036900133  0.5990403 0.59517324  0.5936449
## r2                       0.9979163 5.239575E-4  0.9980201  0.9985814  0.9981937
## rmse                     0.7572528 0.024806777  0.7739769  0.7714747 0.77048355
##                         cv_4_valid cv_5_valid
## accuracy                       0.5 0.42857143
## err                            0.5  0.5714286
## err_count                      7.0        8.0
## logloss                  3.7948205  4.0345407
## max_per_class_error            1.0        1.0
## mean_per_class_accuracy      0.875 0.85714287
## mean_per_class_error         0.125 0.14285715
## mse                      0.5109517  0.5708106
## r2                      0.99729186  0.9974943
## rmse                     0.7148088  0.7555201

Predict on Hold-out Test Set

# Score the test split with the leader model.
pred_conversion <- h2o.predict(
  object  = automl_leader,
  newdata = test_conv_h2o
)

## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%

Confusion Matrix on Test Data Set

# Cross-tabulate predicted against actual values on the test split.
# The actual values are in the target column `cases_cum`; the data has no
# `converted` column, so the previous call only counted the predictions.
h2o.table(pred_conversion$predict, test_conv_h2o$cases_cum)

##   predict Count
## 1       0    18
## 2       2     4
## 3     200     1
## 4   23430     2
## 5   40581     1
## 
## [5 rows x 2 columns]

Compute Performance

# Evaluate the leader on the hold-out test split only. Scoring the full
# frame would include the rows the model was trained on and overstate its
# accuracy.
perf <- h2o.performance(automl_leader, newdata = test_conv_h2o)

# Confusion matrix of actual (rows) against predicted (columns) classes.
h2o.confusionMatrix(perf)

## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
##        0 1 10278 11130 1128 12056 121 13 13717 1546 15927 17857 1891 19638 2
## 0     57 0     0     0    0     0   0  0     0    0     0     0    0     0 0
## 1      1 3     0     0    0     0   0  0     0    0     0     0    0     0 0
## 10278  0 0     0     0    0     0   0  0     0    0     0     0    0     0 0
## 11130  0 0     0     1    0     0   0  0     0    0     0     0    0     0 0
## 1128   0 0     0     0    1     0   0  0     0    0     0     0    0     0 0
##       200 20727 2201 22169 234 23430 2433 25 25262 28320 291 2915 3 30425 33682
## 0       0     0    0     0   0     0    0  0     0     0   0    0 0     0     0
## 1       0     0    0     0   0     0    0  0     0     0   0    0 0     0     0
## 10278   0     0    0     0   0     1    0  0     0     0   0    0 0     0     0
## 11130   0     0    0     0   0     0    0  0     0     0   0    0 0     0     0
## 1128    0     0    0     0   0     0    0  0     0     0   0    0 0     0     0
##       34 3417 36599 38654 3904 40581 4256 428 43079 45757 4579 49492 52 52995
## 0      0    0     0     0    0     0    0   0     0     0    0     0  0     0
## 1      0    0     0     0    0     0    0   0     0     0    0     0  0     0
## 10278  0    0     0     0    0     0    0   0     0     0    0     0  0     0
## 11130  0    0     0     0    0     0    0   0     0     0    0     0  0     0
## 1128   0    0     0     0    0     0    0   0     0     0    0     0  0     0
##       5717 58509 61888 621 66501 6836 77 7910 8 904 9056 98  Error       Rate
## 0        0     0     0   0     0    0  0    0 0   0    0  0 0.0000 =   0 / 57
## 1        0     0     0   0     0    0  0    0 0   0    0  0 0.2500 =    1 / 4
## 10278    0     0     0   0     0    0  0    0 0   0    0  0 1.0000 =    1 / 1
## 11130    0     0     0   0     0    0  0    0 0   0    0  0 0.0000 =    0 / 1
## 1128     0     0     0   0     0    0  0    0 0   0    0  0 0.0000 =    0 / 1
## 
## ---
##         0 1 10278 11130 1128 12056 121 13 13717 1546 15927 17857 1891 19638  2
## 7910    0 0     0     0    0     0   0  0     0    0     0     0    0     0  0
## 8       0 0     0     0    0     0   0  0     0    0     0     0    0     0  1
## 904     0 0     0     0    0     0   0  0     0    0     0     0    0     0  0
## 9056    0 0     0     0    0     0   0  0     0    0     0     0    0     0  0
## 98      0 0     0     0    0     0   0  0     0    0     0     0    0     0  1
## Totals 58 4     0     1    1     0   1  2     1    0     1     1    1     1 10
##        200 20727 2201 22169 234 23430 2433 25 25262 28320 291 2915 3 30425
## 7910     0     0    0     0   0     0    0  0     0     0   0    0 0     0
## 8        0     0    0     0   0     0    0  0     0     0   0    0 0     0
## 904      0     0    0     0   0     0    0  0     0     0   0    0 0     0
## 9056     0     0    0     0   0     0    0  0     0     0   0    0 0     0
## 98       0     0    0     0   0     0    0  0     0     0   0    0 0     0
## Totals   2     1    1     1   0     8    1  2     0     0   2    0 1     1
##        33682 34 3417 36599 38654 3904 40581 4256 428 43079 45757 4579 49492 52
## 7910       0  0    0     0     0    0     0    0   0     0     0    0     0  0
## 8          0  0    0     0     0    0     0    0   0     0     0    0     0  0
## 904        0  0    0     0     0    0     0    0   0     0     0    0     0  0
## 9056       0  0    0     0     0    0     0    0   0     0     0    0     0  0
## 98         0  0    0     0     0    0     0    0   0     0     0    0     0  0
## Totals     1  1    0     0     1    0     2    0   1     1     0    1     1  0
##        52995 5717 58509 61888 621 66501 6836 77 7910 8 904 9056 98  Error
## 7910       0    0     0     0   0     0    0  0    1 0   0    0  0 0.0000
## 8          0    0     0     0   0     0    0  0    0 0   0    0  0 1.0000
## 904        0    0     0     0   0     0    0  0    0 0   1    0  0 0.0000
## 9056       0    0     0     0   0     0    0  0    0 0   0    1  0 0.0000
## 98         0    0     0     0   0     0    0  0    0 0   0    0  0 1.0000
## Totals     1    1     1     0   1     0    1  1    1 0   1    1  0 0.1500
##              Rate
## 7910   =    0 / 1
## 8      =    1 / 1
## 904    =    0 / 1
## 9056   =    0 / 1
## 98     =    1 / 1
## Totals = 18 / 120

AutoML Leader Plot

# Plot the leader model with the default time step and metric spelled out.
plot(automl_leader, timestep = "AUTO", metric = "AUTO")

Cumulative Cases of COVID-19 in Brazil

Kyle W. Brown

4/29/2020

Confirmed Cumulative Cases of COVID-19 in Brazil

Converting Variables into Factors

Split Data into Train/Validation/Test Sets

Cases Cumulative Model

Run Model

Extract Leader Model

Predict on Hold-out Test Set

Confusion Matrix on Test Data Set

Compute Performance

AutoML Leader Plot