library(tidyverse)
## Warning: package 'dplyr' was built under R version 4.5.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.1     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.2     ✔ tibble    3.3.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# for financial analysis
library(tidyquant)
## Warning: package 'tidyquant' was built under R version 4.5.3
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
## Warning: package 'xts' was built under R version 4.5.3
## Warning: package 'zoo' was built under R version 4.5.3
## Warning: package 'quantmod' was built under R version 4.5.3
## Warning: package 'TTR' was built under R version 4.5.3
## Warning: package 'PerformanceAnalytics' was built under R version 4.5.3
## ── Attaching core tidyquant packages ─────────────────────── tidyquant 1.0.12 ──
## ✔ PerformanceAnalytics 2.1.0      ✔ TTR                  0.24.4
## ✔ quantmod             0.4.28     ✔ xts                  0.14.2
## ── Conflicts ────────────────────────────────────────── tidyquant_conflicts() ──
## ✖ zoo::as.Date()                 masks base::as.Date()
## ✖ zoo::as.Date.numeric()         masks base::as.Date.numeric()
## ✖ dplyr::filter()                masks stats::filter()
## ✖ xts::first()                   masks dplyr::first()
## ✖ dplyr::lag()                   masks stats::lag()
## ✖ xts::last()                    masks dplyr::last()
## ✖ PerformanceAnalytics::legend() masks graphics::legend()
## ✖ quantmod::summary()            masks base::summary()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# for times series
library(timetk)
## Warning: package 'timetk' was built under R version 4.5.3
## 
## Attaching package: 'timetk'
## 
## The following object is masked from 'package:tidyquant':
## 
##     FANG
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.4.1 ──
## ✔ broom        1.0.12     ✔ rsample      1.3.2 
## ✔ dials        1.4.2      ✔ tailor       0.1.0 
## ✔ infer        1.1.0      ✔ tune         2.0.1 
## ✔ modeldata    1.5.1      ✔ workflows    1.3.0 
## ✔ parsnip      1.4.1      ✔ workflowsets 1.1.1 
## ✔ recipes      1.3.1      ✔ yardstick    1.3.2 
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ xts::first()      masks dplyr::first()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ xts::last()       masks dplyr::last()
## ✖ dials::momentum() masks TTR::momentum()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
library(h2o)
## Warning: package 'h2o' was built under R version 4.5.3
## 
## ----------------------------------------------------------------------
## 
## Your next step is to start H2O:
##     > h2o.init()
## 
## For H2O package documentation, ask for help:
##     > ??h2o
## 
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
## 
## ----------------------------------------------------------------------
## 
## 
## Attaching package: 'h2o'
## 
## The following objects are masked from 'package:lubridate':
## 
##     day, hour, month, week, year
## 
## The following objects are masked from 'package:stats':
## 
##     cor, sd, var
## 
## The following objects are masked from 'package:base':
## 
##     %*%, %in%, &&, ||, apply, as.factor, as.numeric, colnames,
##     colnames<-, ifelse, is.character, is.factor, is.numeric, log,
##     log10, log1p, log2, round, signif, trunc
start_date <- "1989-01-01"

symbols_txt <- c("CTICLAIMS", # Connecticut
                 "MEICLAIMS", # Maine
                 "MAICLAIMS", # Massachusetts
                 "NHICLAIMS", # New Hampshire
                 "RIICLAIMS", # Rhode Island
                 "VTICLAIMS") # Vermont

claims_tbl <- tq_get(symbols_txt, get = "economic.data", from = start_date) %>%
    mutate(symbol = fct_recode(symbol,
                               "Connecticut"   = "CTICLAIMS",
                               "Maine"         = "MEICLAIMS",
                               "Massachusetts" = "MAICLAIMS",
                               "New Hampshire" = "NHICLAIMS",
                               "Rhode Island"  = "RIICLAIMS",
                               "Vermont"       = "VTICLAIMS")) %>%
    rename(claims = price)

data <- claims_tbl %>% mutate(across(where(is.character), factor))
## Split data
set.seed(1234)

data_split <- initial_split(data, strata = "symbol")
train_tbl <- training(data_split)
test_tbl <- testing(data_split)
recipe_obj <- recipe(symbol ~ ., data = train_tbl) %>%
    
    # Remove zero variance variables
    step_zv(all_predictors()) 
options(h2o.connect_timeout = 60)
h2o.init(ip    = "localhost",
  port          = 54321,
  max_mem_size  = "4G",
  nthreads      = -1)
##  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         50 minutes 59 seconds 
##     H2O cluster timezone:       America/New_York 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.44.0.3 
##     H2O cluster version age:    2 years, 4 months and 4 days 
##     H2O cluster name:           H2O_started_from_R_jrchi_kkr629 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   3.63 GB 
##     H2O cluster total cores:    12 
##     H2O cluster allowed cores:  12 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     R Version:                  R version 4.5.2 (2025-10-31 ucrt)
## Warning in h2o.clusterInfo(): 
## Your H2O cluster version is (2 years, 4 months and 4 days) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html
train_for_h2o <- as.h2o(train_tbl)
##   |                                                                              |                                                                      |   0%  |                                                                              |======================================================================| 100%
split.h2o <- h2o.splitFrame(train_for_h2o, ratios= c(0.85), seed = 1234)
train_h2o <- split.h2o[[1]]
valid_h2o <- split.h2o[[2]]
test_h2o <- as.h2o(test_tbl)
##   |                                                                              |                                                                      |   0%  |                                                                              |======================================================================| 100%
y <- "symbol"
x <- setdiff(names(train_tbl), y)

models_h2o <- h2o.automl(
  x = x,
  y = y,
  training_frame = train_h2o,
  validation_frame = valid_h2o,
  leaderboard_frame = test_h2o,
  max_runtime_secs =30,
  nfolds = 5,
  seed = 3456
)
##   |                                                                              |                                                                      |   0%  |                                                                              |===                                                                   |   4%
## 19:36:28.526: User specified a validation frame with cross-validation still enabled. Please note that the models will still be validated using cross-validation only, the validation frame will be used to provide purely informative validation metrics on the trained models.
## 19:36:28.527: AutoML: XGBoost is not available; skipping it.  |                                                                              |==========                                                            |  14%  |                                                                              |================                                                      |  23%  |                                                                              |=======================                                               |  33%  |                                                                              |===============================                                       |  44%  |                                                                              |========================================                              |  56%  |                                                                              |==============================================                        |  65%  |                                                                              |=====================================================                 |  75%  |                                                                              |===========================================================           |  84%  |                                                                              |=================================================================     |  93%  |                                                                              |======================================================================| 100%
models_h2o %>% typeof()
## [1] "S4"
models_h2o %>% slotNames()
## [1] "project_name"   "leader"         "leaderboard"    "event_log"     
## [5] "modeling_steps" "training_info"
models_h2o@leaderboard
##                                                  model_id mean_per_class_error
## 1 StackedEnsemble_BestOfFamily_1_AutoML_9_20260424_193628            0.4236824
## 2                          GBM_1_AutoML_9_20260424_193628            0.5023956
## 3                          DRF_1_AutoML_9_20260424_193628            0.5318275
## 4                          GLM_1_AutoML_9_20260424_193628            0.5345654
## 5                          GBM_2_AutoML_9_20260424_193628            0.5934292
## 6                          GBM_3_AutoML_9_20260424_193628            0.5958248
##     logloss      rmse       mse
## 1 0.9097385 0.5720480 0.3272389
## 2 1.0594759 0.6250693 0.3907117
## 3 1.4088168 0.7482321 0.5598513
## 4 1.2404378 0.6694644 0.4481826
## 5 1.2453307 0.6702153 0.4491885
## 6 1.2627741 0.6728839 0.4527728
## 
## [9 rows x 5 columns]
models_h2o@leader
## Model Details:
## ==============
## 
## H2OMultinomialModel: stackedensemble
## Model ID:  StackedEnsemble_BestOfFamily_1_AutoML_9_20260424_193628 
## Model Summary for Stacked Ensemble: 
##                                    key            value
## 1                    Stacking strategy cross_validation
## 2 Number of base models (used / total)              2/2
## 3     # GBM base models (used / total)              1/1
## 4     # GLM base models (used / total)              1/1
## 5                Metalearner algorithm              GLM
## 6   Metalearner fold assignment scheme           Random
## 7                   Metalearner nfolds                5
## 8              Metalearner fold_column               NA
## 9   Custom metalearner hyperparameters             None
## 
## 
## H2OMultinomialMetrics: stackedensemble
## ** Reported on training data. **
## 
## Training Set Metrics: 
## =====================
## 
## Extract training frame with `h2o.getFrame("AutoML_9_20260424_193628_training_RTMP_sid_b889_5")`
## MSE: (Extract with `h2o.mse`) 0.2882602
## RMSE: (Extract with `h2o.rmse`) 0.5368987
## Logloss: (Extract with `h2o.logloss`) 0.8078297
## Mean Per-Class Error: 0.3385007
## AUC: (Extract with `h2o.auc`) NaN
## AUCPR: (Extract with `h2o.aucpr`) NaN
## Null Deviance: (Extract with `h2o.nulldeviance`) 26668.08
## Residual Deviance: (Extract with `h2o.residual_deviance`) 12023.74
## AIC: (Extract with `h2o.aic`) NaN
## Confusion Matrix: Extract with `h2o.confusionMatrix(<model>,train = TRUE)`)
## =========================================================================
## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
##               Connecticut Maine Massachusetts New Hampshire Rhode Island
## Connecticut          1059     1           164             3           22
## Maine                  27   539             4           210          429
## Massachusetts          67     0          1171             0            5
## New Hampshire           3   181             4           610           92
## Rhode Island          102   301            12            82          743
## Vermont                 3    86             1           312           39
## Totals               1261  1108          1356          1217         1330
##               Vermont  Error            Rate
## Connecticut         0 0.1521 =   190 / 1,249
## Maine              15 0.5596 =   685 / 1,224
## Massachusetts       0 0.0579 =    72 / 1,243
## New Hampshire     340 0.5041 =   620 / 1,230
## Rhode Island        7 0.4042 =   504 / 1,247
## Vermont           808 0.3531 =   441 / 1,249
## Totals           1170 0.3375 = 2,512 / 7,442
## 
## Hit Ratio Table: Extract with `h2o.hit_ratio_table(<model>,train = TRUE)`
## =======================================================================
## Top-6 Hit Ratios: 
##   k hit_ratio
## 1 1  0.662456
## 2 2  0.902446
## 3 3  0.970975
## 4 4  0.997447
## 5 5  0.999463
## 6 6  1.000000
## 
## 
## 
## 
## H2OMultinomialMetrics: stackedensemble
## ** Reported on validation data. **
## 
## Validation Set Metrics: 
## =====================
## 
## MSE: (Extract with `h2o.mse`) 0.3296064
## RMSE: (Extract with `h2o.rmse`) 0.5741136
## Logloss: (Extract with `h2o.logloss`) 0.9291205
## Mean Per-Class Error: 0.4375751
## AUC: (Extract with `h2o.auc`) NaN
## AUCPR: (Extract with `h2o.aucpr`) NaN
## Null Deviance: (Extract with `h2o.nulldeviance`) 4702.589
## Residual Deviance: (Extract with `h2o.residual_deviance`) 2438.012
## AIC: (Extract with `h2o.aic`) NaN
## Confusion Matrix: Extract with `h2o.confusionMatrix(<model>,valid = TRUE)`)
## =========================================================================
## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
##               Connecticut Maine Massachusetts New Hampshire Rhode Island
## Connecticut           173     0            31             0            6
## Maine                  11    77             0            38          103
## Massachusetts          30     0           182             0            4
## New Hampshire           2    36             1            86           26
## Rhode Island           22    73             0            14          102
## Vermont                 1    19             0            68           12
## Totals                239   205           214           206          253
##               Vermont  Error          Rate
## Connecticut         0 0.1762 =    37 / 210
## Maine               6 0.6723 =   158 / 235
## Massachusetts       0 0.1574 =    34 / 216
## New Hampshire      78 0.6245 =   143 / 229
## Rhode Island        1 0.5189 =   110 / 212
## Vermont           110 0.4762 =   100 / 210
## Totals            195 0.4436 = 582 / 1,312
## 
## Hit Ratio Table: Extract with `h2o.hit_ratio_table(<model>,valid = TRUE)`
## =======================================================================
## Top-6 Hit Ratios: 
##   k hit_ratio
## 1 1  0.556402
## 2 2  0.863567
## 3 3  0.955793
## 4 4  0.994665
## 5 5  0.997713
## 6 6  1.000000
## 
## 
## 
## 
## H2OMultinomialMetrics: stackedensemble
## ** Reported on cross-validation data. **
## ** 5-fold cross-validation on training data (Metrics computed for combined holdout predictions) **
## 
## Cross-Validation Set Metrics: 
## =====================
## 
## Extract cross-validation frame with `h2o.getFrame("levelone_training_StackedEnsemble_BestOfFamily_1_AutoML_9_20260424_193628")`
## MSE: (Extract with `h2o.mse`) 0.320287
## RMSE: (Extract with `h2o.rmse`) 0.565939
## Logloss: (Extract with `h2o.logloss`) 0.908603
## Mean Per-Class Error: 0.4102346
## AUC: (Extract with `h2o.auc`) NaN
## AUCPR: (Extract with `h2o.aucpr`) NaN
## Null Deviance: (Extract with `h2o.nulldeviance`) 26682.16
## Residual Deviance: (Extract with `h2o.residual_deviance`) 13523.65
## AIC: (Extract with `h2o.aic`) NaN
## Hit Ratio Table: Extract with `h2o.hit_ratio_table(<model>,xval = TRUE)`
## =======================================================================
## Top-6 Hit Ratios: 
##   k hit_ratio
## 1 1  0.590836
## 2 2  0.871809
## 3 3  0.958479
## 4 4  0.996372
## 5 5  0.999059
## 6 6  1.000000
## 
## 
## 
## 
## Cross-Validation Metrics Summary: 
##                                mean         sd  cv_1_valid  cv_2_valid
## accuracy                   0.591788   0.014097    0.606242    0.605193
## auc                              NA   0.000000          NA          NA
## err                        0.408212   0.014097    0.393758    0.394807
## err_count                607.200000  22.509998  593.000000  593.000000
## logloss                    0.909151   0.030833    0.872778    0.891336
## max_per_class_error        0.665817   0.037094    0.637097    0.682171
## mean_per_class_accuracy    0.591619   0.010668    0.601382    0.603515
## mean_per_class_error       0.408381   0.010668    0.398618    0.396485
## mse                        0.320937   0.009635    0.309163    0.315768
## null_deviance           5336.431000 218.093280 5402.035000 5385.071300
## pr_auc                           NA   0.000000          NA          NA
## r2                         0.890216   0.003839    0.892349    0.895187
## residual_deviance       2704.235400  82.169100 2628.806600 2677.574700
## rmse                       0.566462   0.008485    0.556024    0.561932
##                          cv_3_valid  cv_4_valid  cv_5_valid
## accuracy                   0.583658    0.590457    0.573391
## auc                              NA          NA          NA
## err                        0.416342    0.409543    0.426609
## err_count                642.000000  618.000000  590.000000
## logloss                    0.916418    0.910153    0.955069
## max_per_class_error        0.642241    0.643725    0.723849
## mean_per_class_accuracy    0.579084    0.590197    0.583918
## mean_per_class_error       0.420915    0.409803    0.416082
## mse                        0.322544    0.322057    0.335153
## null_deviance           5526.980500 5408.567400 4959.501000
## pr_auc                           NA          NA          NA
## r2                         0.888662    0.889876    0.885004
## residual_deviance       2826.233600 2746.840300 2641.721400
## rmse                       0.567930    0.567501    0.578924
?h2o.getModel
## starting httpd help server ... done
?h2o.saveModel
?h2o.loadModel

best_model <- h2o.getModel(models_h2o@leader@model_id)

Make predictions

predictions <- h2o.predict(best_model, newdata = test_h2o)
##   |                                                                              |                                                                      |   0%  |                                                                              |======================================================================| 100%
predictions_tbl <- predictions %>%
  as_tibble()

predictions_tbl %>%
  bind_cols(test_tbl)
## # A tibble: 2,922 × 10
##    predict  Connecticut   Maine Massachusetts New.Hampshire Rhode.Island Vermont
##    <fct>          <dbl>   <dbl>         <dbl>         <dbl>        <dbl>   <dbl>
##  1 Massach…      0.0948 8.60e-5      0.904       0.00000964     0.000688 5.39e-6
##  2 Massach…      0.181  1.44e-4      0.818       0.0000316      0.000540 1.17e-4
##  3 Connect…      0.809  3.20e-2      0.0356      0.00225        0.119    2.05e-3
##  4 Connect…      0.889  7.35e-3      0.0748      0.000344       0.0285   3.94e-4
##  5 Connect…      0.817  3.78e-2      0.0229      0.00132        0.120    7.31e-4
##  6 Connect…      0.813  3.03e-2      0.0373      0.00209        0.115    1.97e-3
##  7 Connect…      0.516  1.53e-1      0.00385     0.00784        0.316    3.58e-3
##  8 Connect…      0.861  2.91e-2      0.0172      0.00132        0.0901   9.70e-4
##  9 Rhode I…      0.344  1.92e-1      0.000979    0.0309         0.412    1.96e-2
## 10 Rhode I…      0.236  2.66e-1      0.000578    0.0591         0.413    2.55e-2
## # ℹ 2,912 more rows
## # ℹ 3 more variables: symbol <fct>, date <date>, claims <int>

Evaluate model

?h2o.performance

performance_h2o <- h2o.performance(best_model, newdata = test_h2o)
typeof(performance_h2o)
## [1] "S4"
slotNames(performance_h2o)
## [1] "algorithm" "on_train"  "on_valid"  "on_xval"   "metrics"
performance_h2o@metrics
## $model
## $model$`__meta`
## $model$`__meta`$schema_version
## [1] 3
## 
## $model$`__meta`$schema_name
## [1] "ModelKeyV3"
## 
## $model$`__meta`$schema_type
## [1] "Key<Model>"
## 
## 
## $model$name
## [1] "StackedEnsemble_BestOfFamily_1_AutoML_9_20260424_193628"
## 
## $model$type
## [1] "Key<Model>"
## 
## $model$URL
## [1] "/3/Models/StackedEnsemble_BestOfFamily_1_AutoML_9_20260424_193628"
## 
## 
## $model_checksum
## [1] "4485432773037823824"
## 
## $frame
## $frame$name
## [1] "test_tbl_sid_b889_3"
## 
## 
## $frame_checksum
## [1] "5360492745852793936"
## 
## $description
## NULL
## 
## $scoring_time
## [1] 1.777074e+12
## 
## $predictions
## NULL
## 
## $MSE
## [1] 0.3272389
## 
## $RMSE
## [1] 0.572048
## 
## $nobs
## [1] 2922
## 
## $custom_metric_name
## NULL
## 
## $custom_metric_value
## [1] 0
## 
## $r2
## [1] 0.8878038
## 
## $hit_ratio_table
## Top-6 Hit Ratios: 
##   k hit_ratio
## 1 1  0.576318
## 2 2  0.879192
## 3 3  0.960643
## 4 4  0.995893
## 5 5  0.998289
## 6 6  1.000000
## 
## $cm
## $cm$`__meta`
## $cm$`__meta`$schema_version
## [1] 3
## 
## $cm$`__meta`$schema_name
## [1] "ConfusionMatrixV3"
## 
## $cm$`__meta`$schema_type
## [1] "ConfusionMatrix"
## 
## 
## $cm$table
## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
##               Connecticut Maine Massachusetts New Hampshire Rhode Island
## Connecticut           386     0            83             3           15
## Maine                  16   165             0           110          189
## Massachusetts          39     0           446             0            2
## New Hampshire           2    81             0           200           47
## Rhode Island           54   197             5            33          195
## Vermont                 4    47             0           126           18
## Totals                501   490           534           472          466
##               Vermont  Error            Rate
## Connecticut         0 0.2074 =     101 / 487
## Maine               7 0.6612 =     322 / 487
## Massachusetts       0 0.0842 =      41 / 487
## New Hampshire     157 0.5893 =     287 / 487
## Rhode Island        3 0.5996 =     292 / 487
## Vermont           292 0.4004 =     195 / 487
## Totals            459 0.4237 = 1,238 / 2,922
## 
## 
## $logloss
## [1] 0.9097385
## 
## $mean_per_class_error
## [1] 0.4236824
## 
## $AUC
## [1] "NaN"
## 
## $pr_auc
## [1] "NaN"
## 
## $multinomial_auc_table
## NULL
## 
## $multinomial_aucpr_table
## NULL
## 
## $residual_deviance
## [1] 5316.512
## 
## $null_deviance
## [1] 10471.23
## 
## $AIC
## [1] "NaN"
## 
## $loglikelihood
## [1] 0
## 
## $null_degrees_of_freedom
## [1] 2921
## 
## $residual_degrees_of_freedom
## [1] 2855
h2o.auc(performance_h2o)
## [1] "NaN"
h2o.confusionMatrix(performance_h2o)
## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
##               Connecticut Maine Massachusetts New Hampshire Rhode Island
## Connecticut           386     0            83             3           15
## Maine                  16   165             0           110          189
## Massachusetts          39     0           446             0            2
## New Hampshire           2    81             0           200           47
## Rhode Island           54   197             5            33          195
## Vermont                 4    47             0           126           18
## Totals                501   490           534           472          466
##               Vermont  Error            Rate
## Connecticut         0 0.2074 =     101 / 487
## Maine               7 0.6612 =     322 / 487
## Massachusetts       0 0.0842 =      41 / 487
## New Hampshire     157 0.5893 =     287 / 487
## Rhode Island        3 0.5996 =     292 / 487
## Vermont           292 0.4004 =     195 / 487
## Totals            459 0.4237 = 1,238 / 2,922