# for Core packages
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# for financial analysis
library(tidyquant)
## Warning: package 'tidyquant' was built under R version 4.4.3
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
## Warning: package 'xts' was built under R version 4.4.3
## Warning: package 'zoo' was built under R version 4.4.3
## Warning: package 'quantmod' was built under R version 4.4.3
## Warning: package 'PerformanceAnalytics' was built under R version 4.4.3
## ── Attaching core tidyquant packages ─────────────────────── tidyquant 1.0.11 ──
## ✔ PerformanceAnalytics 2.0.8 ✔ TTR 0.24.4
## ✔ quantmod 0.4.27 ✔ xts 0.14.1
## ── Conflicts ────────────────────────────────────────── tidyquant_conflicts() ──
## ✖ zoo::as.Date() masks base::as.Date()
## ✖ zoo::as.Date.numeric() masks base::as.Date.numeric()
## ✖ dplyr::filter() masks stats::filter()
## ✖ xts::first() masks dplyr::first()
## ✖ dplyr::lag() masks stats::lag()
## ✖ xts::last() masks dplyr::last()
## ✖ PerformanceAnalytics::legend() masks graphics::legend()
## ✖ quantmod::summary() masks base::summary()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# for times series
library(timetk)
## Warning: package 'timetk' was built under R version 4.4.3
##
## Attaching package: 'timetk'
##
## The following object is masked from 'package:tidyquant':
##
## FANG
library(h2o)
## Warning: package 'h2o' was built under R version 4.4.3
##
## ----------------------------------------------------------------------
##
## Your next step is to start H2O:
## > h2o.init()
##
## For H2O package documentation, ask for help:
## > ??h2o
##
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
##
## ----------------------------------------------------------------------
##
##
## Attaching package: 'h2o'
##
## The following objects are masked from 'package:lubridate':
##
## day, hour, month, week, year
##
## The following objects are masked from 'package:stats':
##
## cor, sd, var
##
## The following objects are masked from 'package:base':
##
## %*%, %in%, &&, ||, apply, as.factor, as.numeric, colnames,
## colnames<-, ifelse, is.character, is.factor, is.numeric, log,
## log10, log1p, log2, round, signif, trunc
library(tidymodels)
## Warning: package 'tidymodels' was built under R version 4.4.3
## ── Attaching packages ────────────────────────────────────── tidymodels 1.3.0 ──
## ✔ broom 1.0.8 ✔ rsample 1.3.0
## ✔ dials 1.4.0 ✔ tune 1.3.0
## ✔ infer 1.0.7 ✔ workflows 1.2.0
## ✔ modeldata 1.4.0 ✔ workflowsets 1.1.0
## ✔ parsnip 1.3.1 ✔ yardstick 1.3.2
## ✔ recipes 1.2.1
## Warning: package 'broom' was built under R version 4.4.3
## Warning: package 'parsnip' was built under R version 4.4.3
## Warning: package 'recipes' was built under R version 4.4.3
## Warning: package 'rsample' was built under R version 4.4.3
## Warning: package 'yardstick' was built under R version 4.4.3
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ xts::first() masks dplyr::first()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ xts::last() masks dplyr::last()
## ✖ dials::momentum() masks TTR::momentum()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
library(dplyr)
library(lubridate)
Goal: Apply Matt Dancho’s tutorial to state unemployment initial claims of New England states.
The following is the replication of Matt Dancho’s tutorial on this page
# Earliest date requested from the economic data source.
start_date <- "1989-01-01"

# Series IDs for initial unemployment claims, one per New England state.
symbols_txt <- c(
  "CTICLAIMS", # Connecticut
  "MEICLAIMS", # Maine
  "MAICLAIMS", # Massachusetts
  "NHICLAIMS", # New Hampshire
  "RIICLAIMS", # Rhode Island
  "VTICLAIMS"  # Vermont
)

# Download every series, rename the value column from `price` to `claims`,
# and relabel the series IDs in `symbol` with readable state names.
claims_tbl <- tq_get(
  x    = symbols_txt,
  get  = "economic.data",
  from = start_date
) %>%
  rename(claims = price) %>%
  mutate(
    symbol = fct_recode(
      symbol,
      "Connecticut"   = "CTICLAIMS",
      "Maine"         = "MEICLAIMS",
      "Massachusetts" = "MAICLAIMS",
      "New Hampshire" = "NHICLAIMS",
      "Rhode Island"  = "RIICLAIMS",
      "Vermont"       = "VTICLAIMS"
    )
  )
# Print the claims table (columns: symbol, date, claims).
claims_tbl
## # A tibble: 11,376 × 3
## symbol date claims
## <fct> <date> <int>
## 1 Connecticut 1989-01-07 8345
## 2 Connecticut 1989-01-14 6503
## 3 Connecticut 1989-01-21 3821
## 4 Connecticut 1989-01-28 4663
## 5 Connecticut 1989-02-04 4162
## 6 Connecticut 1989-02-11 4337
## 7 Connecticut 1989-02-18 4079
## 8 Connecticut 1989-02-25 3556
## 9 Connecticut 1989-03-04 3826
## 10 Connecticut 1989-03-11 3515
## # ℹ 11,366 more rows
# Plot claims against date for the whole table, without grouping by state.
plot_time_series(claims_tbl, .date_var = date, .value = claims)
# Print the claims table again (not modified since the previous print).
claims_tbl
## # A tibble: 11,376 × 3
## symbol date claims
## <fct> <date> <int>
## 1 Connecticut 1989-01-07 8345
## 2 Connecticut 1989-01-14 6503
## 3 Connecticut 1989-01-21 3821
## 4 Connecticut 1989-01-28 4663
## 5 Connecticut 1989-02-04 4162
## 6 Connecticut 1989-02-11 4337
## 7 Connecticut 1989-02-18 4079
## 8 Connecticut 1989-02-25 3556
## 9 Connecticut 1989-03-04 3826
## 10 Connecticut 1989-03-11 3515
## # ℹ 11,366 more rows
# One static panel per state, arranged in two columns, each with free axis
# scales.
claims_by_state <- group_by(claims_tbl, symbol)
plot_time_series(
  claims_by_state,
  .date_var     = date,
  .value        = claims,
  .facet_scales = "free",
  .facet_ncol   = 2,
  .interactive  = FALSE
)
# Print the claims table again (not modified since the previous print).
claims_tbl
## # A tibble: 11,376 × 3
## symbol date claims
## <fct> <date> <int>
## 1 Connecticut 1989-01-07 8345
## 2 Connecticut 1989-01-14 6503
## 3 Connecticut 1989-01-21 3821
## 4 Connecticut 1989-01-28 4663
## 5 Connecticut 1989-02-04 4162
## 6 Connecticut 1989-02-11 4337
## 7 Connecticut 1989-02-18 4079
## 8 Connecticut 1989-02-25 3556
## 9 Connecticut 1989-03-04 3826
## 10 Connecticut 1989-03-11 3515
## # ℹ 11,366 more rows
# Yearly box plots of claims per state, restricted to observations up to and
# including 1991-08-17.
# The end date is written as a clean single-line literal; the previous string
# carried a stray embedded newline after the date.
claims_tbl %>%
  filter_by_time(.date_var = date, .end_date = "1991-08-17") %>%
  group_by(symbol) %>%
  plot_time_series_boxplot(
    .date_var   = date,
    .value      = claims,
    .period     = "1 year",
    .facet_ncol = 2
  )
## Warning: There were 30 warnings in `dplyr::mutate()`.
## The first warning was:
## ℹ In argument: `.value_smooth = auto_smooth(...)`.
## ℹ In group 1: `symbol = Connecticut`.
## Caused by warning in `simpleLoess()`:
## ! span too small. fewer data values than degrees of freedom.
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 29 remaining warnings.
# Make sure `date` is stored as a Date before it is used in the formula below.
claims_tbl <- mutate(claims_tbl, date = as.Date(date))

# Per-state regression of log(claims) on a numeric time index plus a labelled
# month term; the model summary printout is switched off.
claims_tbl %>%
  group_by(symbol) %>%
  plot_time_series_regression(
    .date_var     = date,
    .formula      = log(claims) ~ as.numeric(date) + lubridate::month(date, label = TRUE),
    .show_summary = FALSE,
    .facet_ncol   = 2
  )
# Autocorrelation diagnostics per state, with lags limited to "7 days".
claims_tbl %>%
  group_by(symbol) %>%
  plot_acf_diagnostics(
    .date_var = date,
    .value    = claims,
    .lags     = "7 days"
  )
# Autocorrelation and cross-correlation diagnostics per state over lags up to
# "3 months".
# NOTE(review): the cross-correlation variables are `date` and `claims`
# itself; confirm this is intended, since the table has no other predictors.
claims_tbl %>%
  group_by(symbol) %>%
  plot_acf_diagnostics(
    .date_var = date,
    .value    = claims,
    .lags     = "3 months",
    .ccf_vars = c(date, claims)
  )
# Seasonal diagnostics for the whole table, without grouping by state.
plot_seasonal_diagnostics(claims_tbl, .date_var = date, .value = claims)
# Number of observations available for each state.
count(claims_tbl, symbol)
## # A tibble: 6 × 2
## symbol n
## <fct> <int>
## 1 Connecticut 1896
## 2 Massachusetts 1896
## 3 Maine 1896
## 4 New Hampshire 1896
## 5 Rhode Island 1896
## 6 Vermont 1896
# Seasonal diagnostics computed separately for each state.
claims_tbl %>%
  group_by(symbol) %>%
  plot_seasonal_diagnostics(.date_var = date, .value = claims)
# STL decomposition per state, showing the observed, season, trend and
# remainder components.
stl_features <- c("observed", "season", "trend", "remainder")
claims_tbl %>%
  group_by(symbol) %>%
  plot_stl_diagnostics(
    .date_var    = date,
    .value       = claims,
    .feature_set = stl_features
  )
## frequency = 13 observations per 1 quarter
## trend = 53 observations per 1 year
## frequency = 13 observations per 1 quarter
## trend = 53 observations per 1 year
## frequency = 13 observations per 1 quarter
## trend = 53 observations per 1 year
## frequency = 13 observations per 1 quarter
## trend = 53 observations per 1 year
## frequency = 13 observations per 1 quarter
## trend = 53 observations per 1 year
## frequency = 13 observations per 1 quarter
## trend = 53 observations per 1 year
# Sum claims to quarterly totals per state, then draw static panels in two
# columns.
claims_tbl %>%
  group_by(symbol) %>%
  summarise_by_time(
    .date_var = date,
    .by       = "quarter",
    claims    = sum(claims)
  ) %>%
  plot_time_series(
    .date_var    = date,
    .value       = claims,
    .facet_ncol  = 2,
    .interactive = FALSE
  )
# Plot each state's claims for the window 1989-01-07 through 1995-03-18.
# Both bounds are written as clean single-line literals; the previous strings
# carried stray embedded newlines after the dates.
claims_tbl %>%
  group_by(symbol) %>%
  filter_by_time(
    .date_var   = date,
    .start_date = "1989-01-07",
    .end_date   = "1995-03-18"
  ) %>%
  plot_time_series(date, claims, .facet_ncol = 2)
# Expand each state's series to daily frequency, filling the inserted rows
# with a claims value of 0.
claims_tbl %>%
  group_by(symbol) %>%
  pad_by_time(.date_var = date, .by = "day", .pad_value = 0)
## # A tibble: 79,596 × 3
## # Groups: symbol [6]
## symbol date claims
## <fct> <date> <int>
## 1 Connecticut 1989-01-07 8345
## 2 Connecticut 1989-01-08 0
## 3 Connecticut 1989-01-09 0
## 4 Connecticut 1989-01-10 0
## 5 Connecticut 1989-01-11 0
## 6 Connecticut 1989-01-12 0
## 7 Connecticut 1989-01-13 0
## 8 Connecticut 1989-01-14 6503
## 9 Connecticut 1989-01-15 0
## 10 Connecticut 1989-01-16 0
## # ℹ 79,586 more rows
# Two-observation, right-aligned rolling mean over the first ten rows; partial
# windows are allowed, so the first row gets a value too.
head(claims_tbl, 10) %>%
  mutate(
    rolling_avg_2 = slidify_vec(
      .x       = claims,
      .f       = mean,
      .period  = 2,
      .align   = "right",
      .partial = TRUE
    )
  )
## # A tibble: 10 × 4
## symbol date claims rolling_avg_2
## <fct> <date> <int> <dbl>
## 1 Connecticut 1989-01-07 8345 8345
## 2 Connecticut 1989-01-14 6503 7424
## 3 Connecticut 1989-01-21 3821 5162
## 4 Connecticut 1989-01-28 4663 4242
## 5 Connecticut 1989-02-04 4162 4412.
## 6 Connecticut 1989-02-11 4337 4250.
## 7 Connecticut 1989-02-18 4079 4208
## 8 Connecticut 1989-02-25 3556 3818.
## 9 Connecticut 1989-03-04 3826 3691
## 10 Connecticut 1989-03-11 3515 3670.
# Build a reusable centered rolling mean over 30 observations; partial windows
# at the edges are permitted.
roll_avg_30 <- slidify(
  .f       = mean,
  .period  = 30,
  .align   = "center",
  .partial = TRUE
)

# Add the rolling mean per state, reshape to long format so the raw and
# smoothed series share one value column, and plot them together.
claims_tbl %>%
  select(symbol, date, claims) %>%
  group_by(symbol) %>%
  mutate(rolling_avg_30 = roll_avg_30(claims)) %>%
  tidyr::pivot_longer(
    cols      = c(claims, rolling_avg_30),
    values_to = "value",
    names_to  = "name"
  ) %>%
  plot_time_series(
    .date_var    = date,
    .value       = value, # long-format value column
    .color_var   = name,  # distinguishes "claims" from "rolling_avg_30"
    .smooth      = FALSE,
    .facet_ncol  = 2,
    .interactive = FALSE
  )
# Fix the RNG seed so the split is reproducible, then split the claims table
# into training and testing sets, stratified by state.
set.seed(1234)
data_split <- claims_tbl %>% initial_split(strata = "symbol")
train_tbl <- data_split %>% training()
test_tbl <- data_split %>% testing()
# Preprocessing recipe with `symbol` as the outcome and every other column as a
# predictor; the single step drops zero-variance predictors.
# NOTE(review): recipe_obj is not referenced again in the visible part of this
# file -- confirm whether it is still needed.
recipe_obj <- step_zv(
  recipe(symbol ~ ., data = train_tbl),
  all_predictors()
)
# Initialize h2o (connects R to an H2O cluster)
h2o.init()
## Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 8 minutes 43 seconds
## H2O cluster timezone: America/New_York
## H2O data parsing timezone: UTC
## H2O cluster version: 3.44.0.3
## H2O cluster version age: 1 year, 4 months and 19 days
## H2O cluster name: H2O_started_from_R_tch30_obs789
## H2O cluster total nodes: 1
## H2O cluster total memory: 3.90 GB
## H2O cluster total cores: 12
## H2O cluster allowed cores: 12
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## R Version: R version 4.4.2 (2024-10-31 ucrt)
## Warning in h2o.clusterInfo():
## Your H2O cluster version is (1 year, 4 months and 19 days) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html
# Upload the training tibble to H2O and split it 85% / 15%; the fixed seed
# keeps the split reproducible.
split.h2o <- h2o.splitFrame(
  data   = as.h2o(train_tbl),
  ratios = 0.85,
  seed   = 2345
)
## | | | 0% | |======================================================================| 100%
# The first piece of the H2O split is used for training, the second for
# validation.
train_h2o <- split.h2o[[1]]
valid_h2o <- split.h2o[[2]]
# Upload the held-out test tibble to H2O as well.
test_h2o <- as.h2o(test_tbl)
## | | | 0% | |======================================================================| 100%
# Response column and predictor columns (everything except the response).
y <- "symbol"
x <- names(train_tbl) %>% setdiff(y)

# Run AutoML: at most 10 models, 5-fold cross-validation, deep learning
# excluded; the leaderboard is scored on the test frame.
models_h2o <- h2o.automl(
  y                 = y,
  x                 = x,
  training_frame    = train_h2o,
  validation_frame  = valid_h2o,
  leaderboard_frame = test_h2o,
  nfolds            = 5,
  max_models        = 10,
  # max_runtime_secs = 30,
  exclude_algos     = "DeepLearning",
  seed              = 3456
)
## | | | 0%
## 22:57:25.724: User specified a validation frame with cross-validation still enabled. Please note that the models will still be validated using cross-validation only, the validation frame will be used to provide purely informative validation metrics on the trained models.
## 22:57:25.725: AutoML: XGBoost is not available; skipping it. | |=== | 4% | |===== | 8% | |========= | 12% | |=============== | 21% | |================== | 25% | |======================= | 33% | |============================================ | 62% | |=============================================== | 67% | |================================================== | 71% | |======================================================================| 100%
Examine the output of h2o.automl
# Storage type of the AutoML result object.
typeof(models_h2o)
## [1] "S4"
# Names of the slots available on the AutoML result object.
slotNames(models_h2o)
## [1] "project_name" "leader" "leaderboard" "event_log"
## [5] "modeling_steps" "training_info"
# Show the leaderboard slot of the AutoML result.
models_h2o@leaderboard
## model_id mean_per_class_error
## 1 StackedEnsemble_AllModels_1_AutoML_2_20250509_225725 0.3491561
## 2 GBM_grid_1_AutoML_2_20250509_225725_model_1 0.4310830
## 3 StackedEnsemble_BestOfFamily_1_AutoML_2_20250509_225725 0.4324895
## 4 GBM_1_AutoML_2_20250509_225725 0.5112518
## 5 XRT_1_AutoML_2_20250509_225725 0.5161744
## 6 GLM_1_AutoML_2_20250509_225725 0.5464135
## logloss rmse mse
## 1 0.7816055 0.5222620 0.2727576
## 2 0.9005072 0.5694990 0.3243291
## 3 0.8897671 0.5657805 0.3201075
## 4 1.0557578 0.6259757 0.3918456
## 5 1.2948684 0.7145706 0.5106112
## 6 1.2291510 0.6686307 0.4470670
##
## [12 rows x 5 columns]
# Show the leader slot of the AutoML result (model details and metrics).
models_h2o@leader
## Model Details:
## ==============
##
## H2OMultinomialModel: stackedensemble
## Model ID: StackedEnsemble_AllModels_1_AutoML_2_20250509_225725
## Model Summary for Stacked Ensemble:
## key value
## 1 Stacking strategy cross_validation
## 2 Number of base models (used / total) 10/10
## 3 # GBM base models (used / total) 7/7
## 4 # DRF base models (used / total) 2/2
## 5 # GLM base models (used / total) 1/1
## 6 Metalearner algorithm GLM
## 7 Metalearner fold assignment scheme Random
## 8 Metalearner nfolds 5
## 9 Metalearner fold_column NA
## 10 Custom metalearner hyperparameters None
##
##
## H2OMultinomialMetrics: stackedensemble
## ** Reported on training data. **
##
## Training Set Metrics:
## =====================
##
## Extract training frame with `h2o.getFrame("AutoML_2_20250509_225725_training_RTMP_sid_9f1b_5")`
## MSE: (Extract with `h2o.mse`) 0.3410215
## RMSE: (Extract with `h2o.rmse`) 0.5839705
## Logloss: (Extract with `h2o.logloss`) 0.9581399
## Mean Per-Class Error: 0.4816274
## AUC: (Extract with `h2o.auc`) NaN
## AUCPR: (Extract with `h2o.aucpr`) NaN
## Null Deviance: (Extract with `h2o.nulldeviance`) 26062.29
## Residual Deviance: (Extract with `h2o.residual_deviance`) 13937.1
## AIC: (Extract with `h2o.aic`) NaN
## Confusion Matrix: Extract with `h2o.confusionMatrix(<model>,train = TRUE)`)
## =========================================================================
## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
## Connecticut Maine Massachusetts New Hampshire Rhode Island
## Connecticut 929 10 224 4 41
## Maine 34 270 4 237 599
## Massachusetts 82 0 1115 1 2
## New Hampshire 5 220 3 408 116
## Rhode Island 103 601 6 77 405
## Vermont 1 121 0 442 33
## Totals 1154 1222 1352 1169 1196
## Vermont Error Rate
## Connecticut 0 0.2310 = 279 / 1,208
## Maine 63 0.7763 = 937 / 1,207
## Massachusetts 0 0.0708 = 85 / 1,200
## New Hampshire 462 0.6639 = 806 / 1,214
## Rhode Island 16 0.6647 = 803 / 1,208
## Vermont 639 0.4830 = 597 / 1,236
## Totals 1180 0.4822 = 3,507 / 7,273
##
## Hit Ratio Table: Extract with `h2o.hit_ratio_table(<model>,train = TRUE)`
## =======================================================================
## Top-6 Hit Ratios:
## k hit_ratio
## 1 1 0.517806
## 2 2 0.847656
## 3 3 0.952427
## 4 4 0.996838
## 5 5 0.999725
## 6 6 1.000000
##
##
##
##
## H2OMultinomialMetrics: stackedensemble
## ** Reported on validation data. **
##
## Validation Set Metrics:
## =====================
##
## MSE: (Extract with `h2o.mse`) 0.2743986
## RMSE: (Extract with `h2o.rmse`) 0.5238307
## Logloss: (Extract with `h2o.logloss`) 0.7980875
## Mean Per-Class Error: 0.3416047
## AUC: (Extract with `h2o.auc`) NaN
## AUCPR: (Extract with `h2o.aucpr`) NaN
## Null Deviance: (Extract with `h2o.nulldeviance`) 4513.042
## Residual Deviance: (Extract with `h2o.residual_deviance`) 2009.584
## AIC: (Extract with `h2o.aic`) NaN
## Confusion Matrix: Extract with `h2o.confusionMatrix(<model>,valid = TRUE)`)
## =========================================================================
## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
## Connecticut Maine Massachusetts New Hampshire Rhode Island
## Connecticut 173 1 37 1 2
## Maine 5 101 0 31 69
## Massachusetts 16 0 206 0 0
## New Hampshire 1 28 1 106 18
## Rhode Island 25 48 3 8 125
## Vermont 2 15 0 43 5
## Totals 222 193 247 189 219
## Vermont Error Rate
## Connecticut 0 0.1916 = 41 / 214
## Maine 9 0.5302 = 114 / 215
## Massachusetts 0 0.0721 = 16 / 222
## New Hampshire 54 0.4904 = 102 / 208
## Rhode Island 5 0.4159 = 89 / 214
## Vermont 121 0.3495 = 65 / 186
## Totals 189 0.3392 = 427 / 1,259
##
## Hit Ratio Table: Extract with `h2o.hit_ratio_table(<model>,valid = TRUE)`
## =======================================================================
## Top-6 Hit Ratios:
## k hit_ratio
## 1 1 0.660842
## 2 2 0.909452
## 3 3 0.968229
## 4 4 0.996029
## 5 5 0.998411
## 6 6 1.000000
##
##
##
##
## H2OMultinomialMetrics: stackedensemble
## ** Reported on cross-validation data. **
## ** 5-fold cross-validation on training data (Metrics computed for combined holdout predictions) **
##
## Cross-Validation Set Metrics:
## =====================
##
## Extract cross-validation frame with `h2o.getFrame("levelone_training_StackedEnsemble_AllModels_1_AutoML_2_20250509_225725")`
## MSE: (Extract with `h2o.mse`) 0.2765501
## RMSE: (Extract with `h2o.rmse`) 0.5258803
## Logloss: (Extract with `h2o.logloss`) 0.7939465
## Mean Per-Class Error: 0.3485276
## AUC: (Extract with `h2o.auc`) NaN
## AUCPR: (Extract with `h2o.aucpr`) NaN
## Null Deviance: (Extract with `h2o.nulldeviance`) 26074.9
## Residual Deviance: (Extract with `h2o.residual_deviance`) 11548.75
## AIC: (Extract with `h2o.aic`) NaN
## Hit Ratio Table: Extract with `h2o.hit_ratio_table(<model>,xval = TRUE)`
## =======================================================================
## Top-6 Hit Ratios:
## k hit_ratio
## 1 1 0.651038
## 2 2 0.900591
## 3 3 0.972226
## 4 4 0.996975
## 5 5 0.999313
## 6 6 1.000000
##
##
##
##
## Cross-Validation Metrics Summary:
## mean sd cv_1_valid cv_2_valid
## accuracy 0.648349 0.011009 0.650643 0.635061
## auc NA 0.000000 NA NA
## err 0.351651 0.011009 0.349357 0.364939
## err_count 511.200000 18.116291 516.000000 535.000000
## logloss 0.793320 0.017679 0.766899 0.812596
## max_per_class_error 0.524061 0.021964 0.516260 0.552941
## mean_per_class_accuracy 0.648859 0.010306 0.646281 0.634626
## mean_per_class_error 0.351141 0.010306 0.353719 0.365374
## mse 0.276457 0.006588 0.267557 0.283722
## null_deviance 5214.980000 217.809830 5298.145500 5255.961000
## pr_auc NA 0.000000 NA NA
## r2 0.905513 0.001989 0.907583 0.906564
## residual_deviance 2307.248500 90.852190 2265.419200 2382.530500
## rmse 0.525762 0.006270 0.517259 0.532656
## cv_3_valid cv_4_valid cv_5_valid
## accuracy 0.656478 0.660569 0.638992
## auc NA NA NA
## err 0.343522 0.339431 0.361008
## err_count 517.000000 501.000000 487.000000
## logloss 0.794649 0.787228 0.805226
## max_per_class_error 0.502165 0.507812 0.541126
## mean_per_class_accuracy 0.655293 0.661805 0.646288
## mean_per_class_error 0.344707 0.338195 0.353712
## mse 0.275157 0.273706 0.282143
## null_deviance 5394.546000 5289.844000 4836.404000
## pr_auc NA NA NA
## r2 0.904773 0.906199 0.902444
## residual_deviance 2391.894500 2323.898700 2172.500000
## rmse 0.524554 0.523169 0.531171
# Take the leader straight from the AutoML result instead of looking it up by
# a hard-coded model ID. Model IDs embed the AutoML run counter and a
# timestamp, so an ID copied from an earlier run does not match the current
# run and does not exist in a fresh H2O session.
model <- models_h2o@leader

# Save the model to disk, overwriting any existing copy at that path.
saved_model_path <- h2o.saveModel(model, path = "h2o models/", force = TRUE)

# Reload the model from the path returned by the save call.
best_model <- h2o.loadModel(saved_model_path)

# Score the held-out test frame with the reloaded model.
predictions <- h2o.predict(best_model, newdata = test_h2o)
## | | | 0% | |======================================================================| 100%
# Bring the H2O predictions back into R as a tibble and show them next to the
# test rows they were generated from.
predictions_tbl <- as_tibble(predictions)
bind_cols(predictions_tbl, test_tbl)
## # A tibble: 2,844 × 10
## predict Connecticut Maine Massachusetts New.Hampshire Rhode.Island Vermont
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Massach… 0.102 2.79e-4 0.896 0.0000135 0.00170 2.91e-5
## 2 Connect… 0.757 1.10e-3 0.229 0.0000747 0.0117 4.75e-4
## 3 Connect… 0.671 1.87e-1 0.00129 0.00107 0.139 3.91e-4
## 4 Connect… 0.586 1.34e-1 0.00483 0.00576 0.264 5.91e-3
## 5 Connect… 0.938 2.01e-2 0.00458 0.000147 0.0372 1.67e-4
## 6 Connect… 0.836 1.02e-1 0.000394 0.00121 0.0605 6.47e-4
## 7 Connect… 0.971 1.18e-2 0.00292 0.0000721 0.0142 5.26e-5
## 8 Connect… 0.474 2.98e-1 0.00132 0.00861 0.215 1.89e-3
## 9 Connect… 0.838 6.38e-2 0.000757 0.000868 0.0957 6.92e-4
## 10 Rhode I… 0.318 2.46e-1 0.000357 0.0157 0.416 4.29e-3
## # ℹ 2,834 more rows
## # ℹ 3 more variables: symbol <fct>, date <date>, claims <int>
# Evaluate the reloaded model on the test frame, then check the storage type of
# the resulting performance object.
performance_h2o <- h2o.performance(model = best_model, newdata = test_h2o)
performance_h2o %>% typeof()
## [1] "S4"
# List the slots available on the performance object.
slotNames(performance_h2o)
## [1] "algorithm" "on_train" "on_valid" "on_xval" "metrics"
# Print the full metrics list stored on the performance object.
performance_h2o@metrics
## $model
## $model$`__meta`
## $model$`__meta`$schema_version
## [1] 3
##
## $model$`__meta`$schema_name
## [1] "ModelKeyV3"
##
## $model$`__meta`$schema_type
## [1] "Key<Model>"
##
##
## $model$name
## [1] "StackedEnsemble_AllModels_1_AutoML_1_20250509_224845"
##
## $model$type
## [1] "Key<Model>"
##
## $model$URL
## [1] "/3/Models/StackedEnsemble_AllModels_1_AutoML_1_20250509_224845"
##
##
## $model_checksum
## [1] "-8918001974282627472"
##
## $frame
## $frame$name
## [1] "test_tbl_sid_9f1b_3"
##
##
## $frame_checksum
## [1] "4049197626323489370"
##
## $description
## NULL
##
## $scoring_time
## [1] 1.746846e+12
##
## $predictions
## NULL
##
## $MSE
## [1] 0.2727576
##
## $RMSE
## [1] 0.522262
##
## $nobs
## [1] 2844
##
## $custom_metric_name
## NULL
##
## $custom_metric_value
## [1] 0
##
## $r2
## [1] 0.9064831
##
## $hit_ratio_table
## Top-6 Hit Ratios:
## k hit_ratio
## 1 1 0.650844
## 2 2 0.903657
## 3 3 0.970113
## 4 4 0.996484
## 5 5 0.998945
## 6 6 1.000000
##
## $cm
## $cm$`__meta`
## $cm$`__meta`$schema_version
## [1] 3
##
## $cm$`__meta`$schema_name
## [1] "ConfusionMatrixV3"
##
## $cm$`__meta`$schema_type
## [1] "ConfusionMatrix"
##
##
## $cm$table
## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
## Connecticut Maine Massachusetts New Hampshire Rhode Island
## Connecticut 385 1 74 1 13
## Maine 8 237 1 70 134
## Massachusetts 27 0 445 0 2
## New Hampshire 1 62 0 246 44
## Rhode Island 37 150 3 27 250
## Vermont 3 49 0 110 24
## Totals 461 499 523 454 467
## Vermont Error Rate
## Connecticut 0 0.1878 = 89 / 474
## Maine 24 0.5000 = 237 / 474
## Massachusetts 0 0.0612 = 29 / 474
## New Hampshire 121 0.4810 = 228 / 474
## Rhode Island 7 0.4726 = 224 / 474
## Vermont 288 0.3924 = 186 / 474
## Totals 440 0.3492 = 993 / 2,844
##
##
## $logloss
## [1] 0.7816055
##
## $mean_per_class_error
## [1] 0.3491561
##
## $AUC
## [1] "NaN"
##
## $pr_auc
## [1] "NaN"
##
## $multinomial_auc_table
## NULL
##
## $multinomial_aucpr_table
## NULL
##
## $residual_deviance
## [1] 4445.772
##
## $null_deviance
## [1] 10191.78
##
## $AIC
## [1] "NaN"
##
## $loglikelihood
## [1] 0
##
## $null_degrees_of_freedom
## [1] 2843
##
## $residual_degrees_of_freedom
## [1] 2608
# Extract the AUC from the performance object (reported as "NaN" for this
# multinomial model).
h2o.auc(performance_h2o)
## [1] "NaN"
# Confusion matrix on the test frame (rows: actual state, columns: predicted).
h2o.confusionMatrix(performance_h2o)
## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
## Connecticut Maine Massachusetts New Hampshire Rhode Island
## Connecticut 385 1 74 1 13
## Maine 8 237 1 70 134
## Massachusetts 27 0 445 0 2
## New Hampshire 1 62 0 246 44
## Rhode Island 37 150 3 27 250
## Vermont 3 49 0 110 24
## Totals 461 499 523 454 467
## Vermont Error Rate
## Connecticut 0 0.1878 = 89 / 474
## Maine 24 0.5000 = 237 / 474
## Massachusetts 0 0.0612 = 29 / 474
## New Hampshire 121 0.4810 = 228 / 474
## Rhode Island 7 0.4726 = 224 / 474
## Vermont 288 0.3924 = 186 / 474
## Totals 440 0.3492 = 993 / 2,844
# Look up the thresholds-and-metric-scores entry of the metrics list (NULL for
# this model).
performance_h2o@metrics$thresholds_and_metric_scores
## NULL
I would say the h2o method was far quicker and quite a bit easier to use, as it required fewer tweaks. As for the accuracy of the models, I found that they were a bit off in a few areas compared to the previous plots.