library(tidyverse)
## Warning: package 'dplyr' was built under R version 4.5.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.1 ✔ readr 2.1.6
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.2 ✔ tibble 3.3.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# for financial analysis
library(tidyquant)
## Warning: package 'tidyquant' was built under R version 4.5.3
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
## Warning: package 'xts' was built under R version 4.5.3
## Warning: package 'zoo' was built under R version 4.5.3
## Warning: package 'quantmod' was built under R version 4.5.3
## Warning: package 'TTR' was built under R version 4.5.3
## Warning: package 'PerformanceAnalytics' was built under R version 4.5.3
## ── Attaching core tidyquant packages ─────────────────────── tidyquant 1.0.12 ──
## ✔ PerformanceAnalytics 2.1.0 ✔ TTR 0.24.4
## ✔ quantmod 0.4.28 ✔ xts 0.14.2
## ── Conflicts ────────────────────────────────────────── tidyquant_conflicts() ──
## ✖ zoo::as.Date() masks base::as.Date()
## ✖ zoo::as.Date.numeric() masks base::as.Date.numeric()
## ✖ dplyr::filter() masks stats::filter()
## ✖ xts::first() masks dplyr::first()
## ✖ dplyr::lag() masks stats::lag()
## ✖ xts::last() masks dplyr::last()
## ✖ PerformanceAnalytics::legend() masks graphics::legend()
## ✖ quantmod::summary() masks base::summary()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# for time series
library(timetk)
## Warning: package 'timetk' was built under R version 4.5.3
##
## Attaching package: 'timetk'
##
## The following object is masked from 'package:tidyquant':
##
## FANG
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.4.1 ──
## ✔ broom 1.0.12 ✔ rsample 1.3.2
## ✔ dials 1.4.2 ✔ tailor 0.1.0
## ✔ infer 1.1.0 ✔ tune 2.0.1
## ✔ modeldata 1.5.1 ✔ workflows 1.3.0
## ✔ parsnip 1.4.1 ✔ workflowsets 1.1.1
## ✔ recipes 1.3.1 ✔ yardstick 1.3.2
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ xts::first() masks dplyr::first()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ xts::last() masks dplyr::last()
## ✖ dials::momentum() masks TTR::momentum()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
library(h2o)
## Warning: package 'h2o' was built under R version 4.5.3
##
## ----------------------------------------------------------------------
##
## Your next step is to start H2O:
## > h2o.init()
##
## For H2O package documentation, ask for help:
## > ??h2o
##
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
##
## ----------------------------------------------------------------------
##
##
## Attaching package: 'h2o'
##
## The following objects are masked from 'package:lubridate':
##
## day, hour, month, week, year
##
## The following objects are masked from 'package:stats':
##
## cor, sd, var
##
## The following objects are masked from 'package:base':
##
## %*%, %in%, &&, ||, apply, as.factor, as.numeric, colnames,
## colnames<-, ifelse, is.character, is.factor, is.numeric, log,
## log10, log1p, log2, round, signif, trunc
# Unemployment-claims series for the six New England states, requested
# through tq_get(get = "economic.data") from `start_date` onwards.
start_date <- "1989-01-01"

# One series ID per state
symbols_txt <- c(
  "CTICLAIMS", # Connecticut
  "MEICLAIMS", # Maine
  "MAICLAIMS", # Massachusetts
  "NHICLAIMS", # New Hampshire
  "RIICLAIMS", # Rhode Island
  "VTICLAIMS"  # Vermont
)

# Download every series, call the value column `claims`, and replace each
# series ID in `symbol` with the name of the state it belongs to.
claims_tbl <- symbols_txt %>%
  tq_get(get = "economic.data", from = start_date) %>%
  rename(claims = price) %>%
  mutate(
    symbol = fct_recode(
      symbol,
      "Connecticut"   = "CTICLAIMS",
      "Maine"         = "MEICLAIMS",
      "Massachusetts" = "MAICLAIMS",
      "New Hampshire" = "NHICLAIMS",
      "Rhode Island"  = "RIICLAIMS",
      "Vermont"       = "VTICLAIMS"
    )
  )

# Modelling data: any column that is still character becomes a factor
data <- mutate(claims_tbl, across(where(is.character), factor))
## Split data
# Seed the RNG so the partition below is reproducible
set.seed(1234)

# Train/test partition stratified on the outcome column `symbol`
data_split <- data %>% initial_split(strata = symbol)
train_tbl <- data_split %>% training()
test_tbl <- data_split %>% testing()

# Preprocessing recipe: `symbol` is the outcome, every other column is a
# predictor; predictors with zero variance are dropped.
recipe_obj <- step_zv(
  recipe(symbol ~ ., data = train_tbl),
  all_predictors()
)
# NOTE(review): presumably the number of seconds to wait for the H2O
# connection — confirm against the h2o package documentation.
options(h2o.connect_timeout = 60)

# Connect to (or start) a local H2O cluster
h2o.init(
  nthreads = -1,        # -1 = use all available cores
  max_mem_size = "4G",  # memory requested for the cluster
  ip = "localhost",
  port = 54321
)
## Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 50 minutes 59 seconds
## H2O cluster timezone: America/New_York
## H2O data parsing timezone: UTC
## H2O cluster version: 3.44.0.3
## H2O cluster version age: 2 years, 4 months and 4 days
## H2O cluster name: H2O_started_from_R_jrchi_kkr629
## H2O cluster total nodes: 1
## H2O cluster total memory: 3.63 GB
## H2O cluster total cores: 12
## H2O cluster allowed cores: 12
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## R Version: R version 4.5.2 (2025-10-31 ucrt)
## Warning in h2o.clusterInfo():
## Your H2O cluster version is (2 years, 4 months and 4 days) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html
# Upload the training tibble to the H2O cluster. as.h2o() is called directly
# on the variable (not through a pipe) so the generated frame key keeps the
# variable's name.
train_for_h2o <- as.h2o(x = train_tbl)
## | | | 0% | |======================================================================| 100%

# Carve the uploaded training data into an 85% modelling part and a 15%
# hold-out part; the seed makes the split repeatable.
split.h2o <- h2o.splitFrame(data = train_for_h2o, ratios = 0.85, seed = 1234)
train_h2o <- split.h2o[[1L]]
valid_h2o <- split.h2o[[2L]]

# Upload the test tibble as well
test_h2o <- as.h2o(x = test_tbl)
## | | | 0% | |======================================================================| 100%
# Response column, and the predictors: every other column of the training data
y <- "symbol"
x <- setdiff(names(train_tbl), y)

# Run H2O AutoML for at most 30 seconds with 5-fold cross-validation.
#
# * No `validation_frame` is passed: with cross-validation enabled H2O only
#   reports informative metrics on it and does not use it for validation.
# * The leaderboard is ranked on the 15% hold-out (`valid_h2o`) rather than on
#   `test_h2o`. Choosing the leader on the test frame and then reporting its
#   performance on that same frame would give optimistically biased test
#   metrics; `test_h2o` now stays untouched until the final evaluation.
#
# NOTE(review): a wall-clock budget (`max_runtime_secs`) can train a different
# set of models from run to run even with a fixed seed; consider `max_models`
# if exact reproducibility matters.
models_h2o <- h2o.automl(
  x = x,
  y = y,
  training_frame = train_h2o,
  leaderboard_frame = valid_h2o,
  max_runtime_secs = 30,
  nfolds = 5,
  seed = 3456
)
## | | | 0% | |=== | 4%
## 19:36:28.526: User specified a validation frame with cross-validation still enabled. Please note that the models will still be validated using cross-validation only, the validation frame will be used to provide purely informative validation metrics on the trained models.
## 19:36:28.527: AutoML: XGBoost is not available; skipping it. | |========== | 14% | |================ | 23% | |======================= | 33% | |=============================== | 44% | |======================================== | 56% | |============================================== | 65% | |===================================================== | 75% | |=========================================================== | 84% | |================================================================= | 93% | |======================================================================| 100%
# The AutoML result is an S4 object; check its type and list its slots
typeof(models_h2o)
## [1] "S4"
slotNames(models_h2o)
## [1] "project_name" "leader" "leaderboard" "event_log"
## [5] "modeling_steps" "training_info"
# Leaderboard of the trained models with their scoring metrics
models_h2o@leaderboard
## model_id mean_per_class_error
## 1 StackedEnsemble_BestOfFamily_1_AutoML_9_20260424_193628 0.4236824
## 2 GBM_1_AutoML_9_20260424_193628 0.5023956
## 3 DRF_1_AutoML_9_20260424_193628 0.5318275
## 4 GLM_1_AutoML_9_20260424_193628 0.5345654
## 5 GBM_2_AutoML_9_20260424_193628 0.5934292
## 6 GBM_3_AutoML_9_20260424_193628 0.5958248
## logloss rmse mse
## 1 0.9097385 0.5720480 0.3272389
## 2 1.0594759 0.6250693 0.3907117
## 3 1.4088168 0.7482321 0.5598513
## 4 1.2404378 0.6694644 0.4481826
## 5 1.2453307 0.6702153 0.4491885
## 6 1.2627741 0.6728839 0.4527728
##
## [9 rows x 5 columns]
# Print the leader model stored in the AutoML object (model summary and metrics)
models_h2o@leader
## Model Details:
## ==============
##
## H2OMultinomialModel: stackedensemble
## Model ID: StackedEnsemble_BestOfFamily_1_AutoML_9_20260424_193628
## Model Summary for Stacked Ensemble:
## key value
## 1 Stacking strategy cross_validation
## 2 Number of base models (used / total) 2/2
## 3 # GBM base models (used / total) 1/1
## 4 # GLM base models (used / total) 1/1
## 5 Metalearner algorithm GLM
## 6 Metalearner fold assignment scheme Random
## 7 Metalearner nfolds 5
## 8 Metalearner fold_column NA
## 9 Custom metalearner hyperparameters None
##
##
## H2OMultinomialMetrics: stackedensemble
## ** Reported on training data. **
##
## Training Set Metrics:
## =====================
##
## Extract training frame with `h2o.getFrame("AutoML_9_20260424_193628_training_RTMP_sid_b889_5")`
## MSE: (Extract with `h2o.mse`) 0.2882602
## RMSE: (Extract with `h2o.rmse`) 0.5368987
## Logloss: (Extract with `h2o.logloss`) 0.8078297
## Mean Per-Class Error: 0.3385007
## AUC: (Extract with `h2o.auc`) NaN
## AUCPR: (Extract with `h2o.aucpr`) NaN
## Null Deviance: (Extract with `h2o.nulldeviance`) 26668.08
## Residual Deviance: (Extract with `h2o.residual_deviance`) 12023.74
## AIC: (Extract with `h2o.aic`) NaN
## Confusion Matrix: Extract with `h2o.confusionMatrix(<model>,train = TRUE)`)
## =========================================================================
## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
## Connecticut Maine Massachusetts New Hampshire Rhode Island
## Connecticut 1059 1 164 3 22
## Maine 27 539 4 210 429
## Massachusetts 67 0 1171 0 5
## New Hampshire 3 181 4 610 92
## Rhode Island 102 301 12 82 743
## Vermont 3 86 1 312 39
## Totals 1261 1108 1356 1217 1330
## Vermont Error Rate
## Connecticut 0 0.1521 = 190 / 1,249
## Maine 15 0.5596 = 685 / 1,224
## Massachusetts 0 0.0579 = 72 / 1,243
## New Hampshire 340 0.5041 = 620 / 1,230
## Rhode Island 7 0.4042 = 504 / 1,247
## Vermont 808 0.3531 = 441 / 1,249
## Totals 1170 0.3375 = 2,512 / 7,442
##
## Hit Ratio Table: Extract with `h2o.hit_ratio_table(<model>,train = TRUE)`
## =======================================================================
## Top-6 Hit Ratios:
## k hit_ratio
## 1 1 0.662456
## 2 2 0.902446
## 3 3 0.970975
## 4 4 0.997447
## 5 5 0.999463
## 6 6 1.000000
##
##
##
##
## H2OMultinomialMetrics: stackedensemble
## ** Reported on validation data. **
##
## Validation Set Metrics:
## =====================
##
## MSE: (Extract with `h2o.mse`) 0.3296064
## RMSE: (Extract with `h2o.rmse`) 0.5741136
## Logloss: (Extract with `h2o.logloss`) 0.9291205
## Mean Per-Class Error: 0.4375751
## AUC: (Extract with `h2o.auc`) NaN
## AUCPR: (Extract with `h2o.aucpr`) NaN
## Null Deviance: (Extract with `h2o.nulldeviance`) 4702.589
## Residual Deviance: (Extract with `h2o.residual_deviance`) 2438.012
## AIC: (Extract with `h2o.aic`) NaN
## Confusion Matrix: Extract with `h2o.confusionMatrix(<model>,valid = TRUE)`)
## =========================================================================
## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
## Connecticut Maine Massachusetts New Hampshire Rhode Island
## Connecticut 173 0 31 0 6
## Maine 11 77 0 38 103
## Massachusetts 30 0 182 0 4
## New Hampshire 2 36 1 86 26
## Rhode Island 22 73 0 14 102
## Vermont 1 19 0 68 12
## Totals 239 205 214 206 253
## Vermont Error Rate
## Connecticut 0 0.1762 = 37 / 210
## Maine 6 0.6723 = 158 / 235
## Massachusetts 0 0.1574 = 34 / 216
## New Hampshire 78 0.6245 = 143 / 229
## Rhode Island 1 0.5189 = 110 / 212
## Vermont 110 0.4762 = 100 / 210
## Totals 195 0.4436 = 582 / 1,312
##
## Hit Ratio Table: Extract with `h2o.hit_ratio_table(<model>,valid = TRUE)`
## =======================================================================
## Top-6 Hit Ratios:
## k hit_ratio
## 1 1 0.556402
## 2 2 0.863567
## 3 3 0.955793
## 4 4 0.994665
## 5 5 0.997713
## 6 6 1.000000
##
##
##
##
## H2OMultinomialMetrics: stackedensemble
## ** Reported on cross-validation data. **
## ** 5-fold cross-validation on training data (Metrics computed for combined holdout predictions) **
##
## Cross-Validation Set Metrics:
## =====================
##
## Extract cross-validation frame with `h2o.getFrame("levelone_training_StackedEnsemble_BestOfFamily_1_AutoML_9_20260424_193628")`
## MSE: (Extract with `h2o.mse`) 0.320287
## RMSE: (Extract with `h2o.rmse`) 0.565939
## Logloss: (Extract with `h2o.logloss`) 0.908603
## Mean Per-Class Error: 0.4102346
## AUC: (Extract with `h2o.auc`) NaN
## AUCPR: (Extract with `h2o.aucpr`) NaN
## Null Deviance: (Extract with `h2o.nulldeviance`) 26682.16
## Residual Deviance: (Extract with `h2o.residual_deviance`) 13523.65
## AIC: (Extract with `h2o.aic`) NaN
## Hit Ratio Table: Extract with `h2o.hit_ratio_table(<model>,xval = TRUE)`
## =======================================================================
## Top-6 Hit Ratios:
## k hit_ratio
## 1 1 0.590836
## 2 2 0.871809
## 3 3 0.958479
## 4 4 0.996372
## 5 5 0.999059
## 6 6 1.000000
##
##
##
##
## Cross-Validation Metrics Summary:
## mean sd cv_1_valid cv_2_valid
## accuracy 0.591788 0.014097 0.606242 0.605193
## auc NA 0.000000 NA NA
## err 0.408212 0.014097 0.393758 0.394807
## err_count 607.200000 22.509998 593.000000 593.000000
## logloss 0.909151 0.030833 0.872778 0.891336
## max_per_class_error 0.665817 0.037094 0.637097 0.682171
## mean_per_class_accuracy 0.591619 0.010668 0.601382 0.603515
## mean_per_class_error 0.408381 0.010668 0.398618 0.396485
## mse 0.320937 0.009635 0.309163 0.315768
## null_deviance 5336.431000 218.093280 5402.035000 5385.071300
## pr_auc NA 0.000000 NA NA
## r2 0.890216 0.003839 0.892349 0.895187
## residual_deviance 2704.235400 82.169100 2628.806600 2677.574700
## rmse 0.566462 0.008485 0.556024 0.561932
## cv_3_valid cv_4_valid cv_5_valid
## accuracy 0.583658 0.590457 0.573391
## auc NA NA NA
## err 0.416342 0.409543 0.426609
## err_count 642.000000 618.000000 590.000000
## logloss 0.916418 0.910153 0.955069
## max_per_class_error 0.642241 0.643725 0.723849
## mean_per_class_accuracy 0.579084 0.590197 0.583918
## mean_per_class_error 0.420915 0.409803 0.416082
## mse 0.322544 0.322057 0.335153
## null_deviance 5526.980500 5408.567400 4959.501000
## pr_auc NA NA NA
## r2 0.888662 0.889876 0.885004
## residual_deviance 2826.233600 2746.840300 2641.721400
## rmse 0.567930 0.567501 0.578924
# Open the help pages for retrieving, saving and loading H2O models
help("h2o.getModel")
## starting httpd help server ... done
help("h2o.saveModel")
help("h2o.loadModel")

# Retrieve the leader model from the cluster by its model ID
best_model <- h2o.getModel(model_id = models_h2o@leader@model_id)
# Make predictions
# Score the test frame with the selected model
predictions <- h2o.predict(object = best_model, newdata = test_h2o)
## | | | 0% | |======================================================================| 100%

# Pull the predictions (predicted class plus one probability column per class)
# back into R as a tibble
predictions_tbl <- as_tibble(predictions)

# Show the predictions side by side with the original test rows
bind_cols(predictions_tbl, test_tbl)
## # A tibble: 2,922 × 10
## predict Connecticut Maine Massachusetts New.Hampshire Rhode.Island Vermont
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Massach… 0.0948 8.60e-5 0.904 0.00000964 0.000688 5.39e-6
## 2 Massach… 0.181 1.44e-4 0.818 0.0000316 0.000540 1.17e-4
## 3 Connect… 0.809 3.20e-2 0.0356 0.00225 0.119 2.05e-3
## 4 Connect… 0.889 7.35e-3 0.0748 0.000344 0.0285 3.94e-4
## 5 Connect… 0.817 3.78e-2 0.0229 0.00132 0.120 7.31e-4
## 6 Connect… 0.813 3.03e-2 0.0373 0.00209 0.115 1.97e-3
## 7 Connect… 0.516 1.53e-1 0.00385 0.00784 0.316 3.58e-3
## 8 Connect… 0.861 2.91e-2 0.0172 0.00132 0.0901 9.70e-4
## 9 Rhode I… 0.344 1.92e-1 0.000979 0.0309 0.412 1.96e-2
## 10 Rhode I… 0.236 2.66e-1 0.000578 0.0591 0.413 2.55e-2
## # ℹ 2,912 more rows
## # ℹ 3 more variables: symbol <fct>, date <date>, claims <int>
# Evaluate model
# Open the help page for h2o.performance()
help("h2o.performance")

# Compute the model's metrics on the test frame
performance_h2o <- h2o.performance(model = best_model, newdata = test_h2o)

# The result is an S4 object; check its type and list its slots
typeof(performance_h2o)
## [1] "S4"
slotNames(performance_h2o)
## [1] "algorithm" "on_train" "on_valid" "on_xval" "metrics"
# Full list of metrics computed on the test frame
performance_h2o@metrics
## $model
## $model$`__meta`
## $model$`__meta`$schema_version
## [1] 3
##
## $model$`__meta`$schema_name
## [1] "ModelKeyV3"
##
## $model$`__meta`$schema_type
## [1] "Key<Model>"
##
##
## $model$name
## [1] "StackedEnsemble_BestOfFamily_1_AutoML_9_20260424_193628"
##
## $model$type
## [1] "Key<Model>"
##
## $model$URL
## [1] "/3/Models/StackedEnsemble_BestOfFamily_1_AutoML_9_20260424_193628"
##
##
## $model_checksum
## [1] "4485432773037823824"
##
## $frame
## $frame$name
## [1] "test_tbl_sid_b889_3"
##
##
## $frame_checksum
## [1] "5360492745852793936"
##
## $description
## NULL
##
## $scoring_time
## [1] 1.777074e+12
##
## $predictions
## NULL
##
## $MSE
## [1] 0.3272389
##
## $RMSE
## [1] 0.572048
##
## $nobs
## [1] 2922
##
## $custom_metric_name
## NULL
##
## $custom_metric_value
## [1] 0
##
## $r2
## [1] 0.8878038
##
## $hit_ratio_table
## Top-6 Hit Ratios:
## k hit_ratio
## 1 1 0.576318
## 2 2 0.879192
## 3 3 0.960643
## 4 4 0.995893
## 5 5 0.998289
## 6 6 1.000000
##
## $cm
## $cm$`__meta`
## $cm$`__meta`$schema_version
## [1] 3
##
## $cm$`__meta`$schema_name
## [1] "ConfusionMatrixV3"
##
## $cm$`__meta`$schema_type
## [1] "ConfusionMatrix"
##
##
## $cm$table
## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
## Connecticut Maine Massachusetts New Hampshire Rhode Island
## Connecticut 386 0 83 3 15
## Maine 16 165 0 110 189
## Massachusetts 39 0 446 0 2
## New Hampshire 2 81 0 200 47
## Rhode Island 54 197 5 33 195
## Vermont 4 47 0 126 18
## Totals 501 490 534 472 466
## Vermont Error Rate
## Connecticut 0 0.2074 = 101 / 487
## Maine 7 0.6612 = 322 / 487
## Massachusetts 0 0.0842 = 41 / 487
## New Hampshire 157 0.5893 = 287 / 487
## Rhode Island 3 0.5996 = 292 / 487
## Vermont 292 0.4004 = 195 / 487
## Totals 459 0.4237 = 1,238 / 2,922
##
##
## $logloss
## [1] 0.9097385
##
## $mean_per_class_error
## [1] 0.4236824
##
## $AUC
## [1] "NaN"
##
## $pr_auc
## [1] "NaN"
##
## $multinomial_auc_table
## NULL
##
## $multinomial_aucpr_table
## NULL
##
## $residual_deviance
## [1] 5316.512
##
## $null_deviance
## [1] 10471.23
##
## $AIC
## [1] "NaN"
##
## $loglikelihood
## [1] 0
##
## $null_degrees_of_freedom
## [1] 2921
##
## $residual_degrees_of_freedom
## [1] 2855
# AUC is not available for this multinomial metrics object (h2o.auc() only
# returns the string "NaN"), so report metrics that are defined for
# multi-class problems instead.
# NOTE(review): h2o.performance() has an `auc_type` argument for multinomial
# models — set it there if a multi-class AUC is wanted.
h2o.logloss(performance_h2o)
## [1] 0.9097385
h2o.mean_per_class_error(performance_h2o)
## [1] 0.4236824
# Confusion matrix (actual class in rows, predicted class in columns)
h2o.confusionMatrix(performance_h2o)
## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
## Connecticut Maine Massachusetts New Hampshire Rhode Island
## Connecticut 386 0 83 3 15
## Maine 16 165 0 110 189
## Massachusetts 39 0 446 0 2
## New Hampshire 2 81 0 200 47
## Rhode Island 54 197 5 33 195
## Vermont 4 47 0 126 18
## Totals 501 490 534 472 466
## Vermont Error Rate
## Connecticut 0 0.2074 = 101 / 487
## Maine 7 0.6612 = 322 / 487
## Massachusetts 0 0.0842 = 41 / 487
## New Hampshire 157 0.5893 = 287 / 487
## Rhode Island 3 0.5996 = 292 / 487
## Vermont 292 0.4004 = 195 / 487
## Totals 459 0.4237 = 1,238 / 2,922