# for Core packages
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.2     ✔ tibble    3.3.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# for financial analysis
library(tidyquant)
## Warning: package 'tidyquant' was built under R version 4.5.3
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
## Warning: package 'xts' was built under R version 4.5.3
## Warning: package 'zoo' was built under R version 4.5.3
## Warning: package 'quantmod' was built under R version 4.5.3
## Warning: package 'TTR' was built under R version 4.5.3
## Warning: package 'PerformanceAnalytics' was built under R version 4.5.3
## ── Attaching core tidyquant packages ─────────────────────── tidyquant 1.0.12 ──
## ✔ PerformanceAnalytics 2.1.0      ✔ TTR                  0.24.4
## ✔ quantmod             0.4.28     ✔ xts                  0.14.2
## ── Conflicts ────────────────────────────────────────── tidyquant_conflicts() ──
## ✖ zoo::as.Date()                 masks base::as.Date()
## ✖ zoo::as.Date.numeric()         masks base::as.Date.numeric()
## ✖ dplyr::filter()                masks stats::filter()
## ✖ xts::first()                   masks dplyr::first()
## ✖ dplyr::lag()                   masks stats::lag()
## ✖ xts::last()                    masks dplyr::last()
## ✖ PerformanceAnalytics::legend() masks graphics::legend()
## ✖ quantmod::summary()            masks base::summary()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# for time series
library(timetk)
## Warning: package 'timetk' was built under R version 4.5.3
## 
## Attaching package: 'timetk'
## 
## The following object is masked from 'package:tidyquant':
## 
##     FANG

Goal: Apply Matt Dancho’s tutorial to weekly initial unemployment claims for the six New England states.

The following is a replication of Matt Dancho’s tutorial, applied to the state claims data loaded below.

# Earliest observation date requested from tq_get().
start_date <- "1989-01-01"

# Series IDs keyed by the state name used as the display label.
state_series <- c(
  "Connecticut"   = "CTICLAIMS",
  "Maine"         = "MEICLAIMS",
  "Massachusetts" = "MAICLAIMS",
  "New Hampshire" = "NHICLAIMS",
  "Rhode Island"  = "RIICLAIMS",
  "Vermont"       = "VTICLAIMS"
)
symbols_txt <- unname(state_series)

# Download every series as economic data, relabel the series IDs with state
# names, and call the value column `claims`.
claims_tbl <- tq_get(symbols_txt, get = "economic.data", from = start_date) %>%
  mutate(symbol = fct_recode(symbol, !!!state_series)) %>%
  rename(claims = price)

Plotting time series

# Preview the long-format table: one row per state (symbol) and date.
claims_tbl
## # A tibble: 11,676 × 3
##    symbol      date       claims
##    <fct>       <date>      <int>
##  1 Connecticut 1989-01-07   8345
##  2 Connecticut 1989-01-14   6503
##  3 Connecticut 1989-01-21   3821
##  4 Connecticut 1989-01-28   4663
##  5 Connecticut 1989-02-04   4162
##  6 Connecticut 1989-02-11   4337
##  7 Connecticut 1989-02-18   4079
##  8 Connecticut 1989-02-25   3556
##  9 Connecticut 1989-03-04   3826
## 10 Connecticut 1989-03-11   3515
## # ℹ 11,666 more rows
# claims_tbl stacks six state series in one table. Plotting it ungrouped draws
# a single line through all states at every date, so group by state and give
# each series its own facet.
claims_tbl %>%
  group_by(symbol) %>%
  plot_time_series(.date_var = date, .value = claims, .facet_ncol = 2)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## ℹ The deprecated feature was likely used in the timetk package.
##   Please report the issue at
##   <https://github.com/business-science/timetk/issues>.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Ignoring unknown labels:
## • colour : "Legend"
# Log-scale claims per state, two facets per row, each with its own y axis,
# coloured by calendar year.
claims_tbl %>%
  group_by(symbol) %>%
  plot_time_series(
    .date_var     = date,
    .value        = log(claims),
    .facet_ncol   = 2,
    .facet_scales = "free_y",
    # Namespaced because h2o, attached later in this document, masks
    # lubridate's year(); the bare call breaks if this chunk is re-run after
    # library(h2o).
    .color_var    = lubridate::year(date)
  )
# Static (ggplot) version coloured by month. The table holds six state series,
# so group by state and facet; otherwise all states are drawn on top of each
# other as if they were one series.
claims_tbl %>%
  group_by(symbol) %>%
  plot_time_series(
    date, claims,
    # Namespaced because h2o, attached later in this document, masks
    # lubridate's month().
    .color_var  = lubridate::month(date, label = TRUE),
    .facet_ncol = 2,

    # Returns static ggplot
    .interactive = FALSE,

    # Customize
    .title     = "New England Initial Unemployment Claims",
    .x_lab     = "Date",
    .y_lab     = "Initial Claims",
    .color_lab = "Month"
  )

Box plots

# Number of observations available for each state.
count(claims_tbl, symbol)
## # A tibble: 6 × 2
##   symbol            n
##   <fct>         <int>
## 1 Connecticut    1946
## 2 Massachusetts  1946
## 3 Maine          1946
## 4 New Hampshire  1946
## 5 Rhode Island   1946
## 6 Vermont        1946
# Yearly box plots of claims for each state, using only the rows kept by
# filter_by_time() with `.end_date = "1995"`.
claims_tbl %>%
  filter_by_time(date, .end_date = "1995") %>%
  group_by(symbol) %>%
  plot_time_series_boxplot(date, claims, .period = "1 year", .facet_ncol = 2)
## Ignoring unknown labels:
## • colour : "Legend"

Regression plots

# Per-state regression of log claims on a linear time trend plus month effects.
claims_tbl %>%
  group_by(symbol) %>%
  plot_time_series_regression(
    .date_var     = date,
    .facet_ncol   = 2,
    # month() is namespaced because h2o, attached later in this document,
    # masks lubridate's month(); the formula is evaluated when the chunk runs,
    # so a bare month() breaks if this is re-run after library(h2o).
    .formula      = log(claims) ~ as.numeric(date) +
      lubridate::month(date, label = TRUE),
    .show_summary = FALSE
  )

Plotting Seasonality and Correlation

Correlation Plots

# ACF diagnostics for each state, with lags specified as "2 years".
claims_tbl %>%
  group_by(symbol) %>%
  plot_acf_diagnostics(.date_var = date, .value = claims, .lags = "2 years")

Seasonality

# Single-series seasonal diagnostics. The raw table has six observations per
# date (one per state); feeding it in ungrouped pools states with different
# claim levels into the same boxes. Collapse to one regional total per date so
# the diagnostics describe one series.
claims_tbl %>%
  summarise(claims = sum(claims), .by = date) %>%
  plot_seasonal_diagnostics(date, claims)

# Number of observations per state.
claims_tbl %>% count(symbol)
## # A tibble: 6 × 2
##   symbol            n
##   <fct>         <int>
## 1 Connecticut    1946
## 2 Massachusetts  1946
## 3 Maine          1946
## 4 New Hampshire  1946
## 5 Rhode Island   1946
## 6 Vermont        1946
# Seasonal diagnostics computed separately for each state.
claims_tbl %>%
  group_by(symbol) %>%
  plot_seasonal_diagnostics(.date_var = date, .value = claims)

STL Diagnostics

# STL decomposition per state, showing the four listed components.
stl_features <- c("observed", "season", "trend", "remainder")

claims_tbl %>%
  group_by(symbol) %>%
  plot_stl_diagnostics(
    .date_var    = date,
    .value       = claims,
    .feature_set = stl_features
  )
## frequency = 13 observations per 1 quarter
## trend = 53 observations per 1 year
## frequency = 13 observations per 1 quarter
## trend = 53 observations per 1 year
## frequency = 13 observations per 1 quarter
## trend = 53 observations per 1 year
## frequency = 13 observations per 1 quarter
## trend = 53 observations per 1 year
## frequency = 13 observations per 1 quarter
## trend = 53 observations per 1 year
## frequency = 13 observations per 1 quarter
## trend = 53 observations per 1 year

Time Series Data Wrangling

Summarize by Time

# Total claims per state and quarter, drawn as a static faceted plot.
claims_tbl %>%
  group_by(symbol) %>%
  summarise_by_time(date, .by = "quarter", claims = sum(claims)) %>%
  plot_time_series(
    .date_var    = date,
    .value       = claims,
    .facet_ncol  = 2,
    .interactive = FALSE
  )
## Ignoring unknown labels:
## • colour : "Legend"

# Average claims per state and month, drawn as a static faceted plot.
claims_tbl %>%
  group_by(symbol) %>%
  summarise_by_time(date, .by = "month", claims = mean(claims)) %>%
  plot_time_series(
    .date_var    = date,
    .value       = claims,
    .facet_ncol  = 2,
    .interactive = FALSE
  )
## Ignoring unknown labels:
## • colour : "Legend"

Filter By Time

# Restrict each state's series to the window given by `.start_date = "2008-01"`
# and `.end_date = "2010"`, then plot it.
claims_tbl %>%
  group_by(symbol) %>%
  filter_by_time(date, .start_date = "2008-01", .end_date = "2010") %>%
  plot_time_series(.date_var = date, .value = claims, .facet_ncol = 2)
## Ignoring unknown labels:
## • colour : "Legend"

Padding Data

# Pad each state's series to a weekly grid, filling any inserted rows with 0.
# The printed result has the same 11,676 rows as the input.
claims_tbl %>%
  group_by(symbol) %>%
  pad_by_time(.date_var = date, .by = "week", .pad_value = 0)
## # A tibble: 11,676 × 3
## # Groups:   symbol [6]
##    symbol      date       claims
##    <fct>       <date>      <int>
##  1 Connecticut 1989-01-07   8345
##  2 Connecticut 1989-01-14   6503
##  3 Connecticut 1989-01-21   3821
##  4 Connecticut 1989-01-28   4663
##  5 Connecticut 1989-02-04   4162
##  6 Connecticut 1989-02-11   4337
##  7 Connecticut 1989-02-18   4079
##  8 Connecticut 1989-02-25   3556
##  9 Connecticut 1989-03-04   3826
## 10 Connecticut 1989-03-11   3515
## # ℹ 11,666 more rows

Sliding (Rolling) Calculations

# Right-aligned 4-observation rolling mean on the first ten rows.
# `.partial = TRUE` yields values for the leading rows that have fewer than
# four observations behind them.
claims_tbl %>%
  slice_head(n = 10) %>%
  mutate(
    rolling_avg_4 = slidify_vec(
      .x       = claims,
      .f       = mean,
      .period  = 4,
      .align   = "right",
      .partial = TRUE
    )
  )
## # A tibble: 10 × 4
##    symbol      date       claims rolling_avg_4
##    <fct>       <date>      <int>         <dbl>
##  1 Connecticut 1989-01-07   8345         8345 
##  2 Connecticut 1989-01-14   6503         7424 
##  3 Connecticut 1989-01-21   3821         6223 
##  4 Connecticut 1989-01-28   4663         5833 
##  5 Connecticut 1989-02-04   4162         4787.
##  6 Connecticut 1989-02-11   4337         4246.
##  7 Connecticut 1989-02-18   4079         4310.
##  8 Connecticut 1989-02-25   3556         4034.
##  9 Connecticut 1989-03-04   3826         3950.
## 10 Connecticut 1989-03-11   3515         3744
# Rolling regression: slidify() wraps the lm() call so it is refit over each
# right-aligned window of 26 rows. `.unlist = FALSE` keeps the fitted model
# objects in a list column instead of flattening them.
lm_roll <- slidify(
  ~ lm(..1 ~ ..2 + ..3),
  .period = 26,
  .align  = "right",
  .unlist = FALSE
)

claims_tbl %>%
  select(symbol, date, claims) %>%
  group_by(symbol) %>%
  mutate(
    lag_claims   = lag(claims),
    numeric_date = as.numeric(date)
  ) %>%
  # The first row of each state has no previous value to regress on.
  drop_na(lag_claims) %>%
  # Fit claims ~ lag_claims + numeric_date over each window, per state.
  mutate(rolling_lm = lm_roll(claims, lag_claims, numeric_date)) %>%
  # Rows before the first complete window hold NA instead of a model.
  filter(!is.na(rolling_lm))
## # A tibble: 11,520 × 6
## # Groups:   symbol [6]
##    symbol      date       claims lag_claims numeric_date rolling_lm
##    <fct>       <date>      <int>      <int>        <dbl> <list>    
##  1 Connecticut 1989-07-08   7010       5232         7128 <lm>      
##  2 Connecticut 1989-07-15   5630       7010         7135 <lm>      
##  3 Connecticut 1989-07-22   4590       5630         7142 <lm>      
##  4 Connecticut 1989-07-29   4929       4590         7149 <lm>      
##  5 Connecticut 1989-08-05   7029       4929         7156 <lm>      
##  6 Connecticut 1989-08-12   3704       7029         7163 <lm>      
##  7 Connecticut 1989-08-19   4082       3704         7170 <lm>      
##  8 Connecticut 1989-08-26   3373       4082         7177 <lm>      
##  9 Connecticut 1989-09-02   2902       3373         7184 <lm>      
## 10 Connecticut 1989-09-09   2856       2902         7191 <lm>      
## # ℹ 11,510 more rows
## Model

library(h2o)
## Warning: package 'h2o' was built under R version 4.5.3
## 
## ----------------------------------------------------------------------
## 
## Your next step is to start H2O:
##     > h2o.init()
## 
## For H2O package documentation, ask for help:
##     > ??h2o
## 
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
## 
## ----------------------------------------------------------------------
## 
## Attaching package: 'h2o'
## The following objects are masked from 'package:lubridate':
## 
##     day, hour, month, week, year
## The following objects are masked from 'package:stats':
## 
##     cor, sd, var
## The following objects are masked from 'package:base':
## 
##     %*%, %in%, &&, ||, apply, as.factor, as.numeric, colnames,
##     colnames<-, ifelse, is.character, is.factor, is.numeric, log,
##     log10, log1p, log2, round, signif, trunc
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.4.1 ──
## ✔ broom        1.0.12     ✔ rsample      1.3.2 
## ✔ dials        1.4.2      ✔ tailor       0.1.0 
## ✔ infer        1.1.0      ✔ tune         2.0.1 
## ✔ modeldata    1.5.1      ✔ workflows    1.3.0 
## ✔ parsnip      1.4.1      ✔ workflowsets 1.1.1 
## ✔ recipes      1.3.1      ✔ yardstick    1.3.2
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ xts::first()      masks dplyr::first()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ xts::last()       masks dplyr::last()
## ✖ dials::momentum() masks TTR::momentum()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
# Connect R to an H2O cluster; in the logged run below it attached to a
# cluster on localhost:54321.
h2o.init()
##  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         3 minutes 29 seconds 
##     H2O cluster timezone:       America/New_York 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.44.0.3 
##     H2O cluster version age:    2 years, 4 months and 7 days 
##     H2O cluster name:           H2O_started_from_R_javony_gwl568 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   1.40 GB 
##     H2O cluster total cores:    8 
##     H2O cluster allowed cores:  8 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     R Version:                  R version 4.5.2 (2025-10-31 ucrt)
## Warning in h2o.clusterInfo(): 
## Your H2O cluster version is (2 years, 4 months and 7 days) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html
# Modelling table: binary target `claims_class` ("High" when claims exceed the
# median over all rows, otherwise "Low") plus year and month derived from the
# date, which is then dropped.
claims_model_tbl <- claims_tbl %>%
  mutate(
    claims_class = as.factor(
      if_else(claims > median(claims, na.rm = TRUE), "High", "Low")
    ),
    symbol = as.factor(symbol),
    year   = lubridate::year(date),
    month  = lubridate::month(date)
  ) %>%
  select(-date)

set.seed(2345)

# 80/20 train/test split, stratified on the target.
split <- initial_split(claims_model_tbl, strata = claims_class, prop = 0.80)

train_tbl <- training(split)
test_tbl  <- testing(split)

# Second split of the training rows inside H2O, using ratio 0.85.
split.h2o <- h2o.splitFrame(as.h2o(train_tbl), seed = 2345, ratios = 0.85)
##   |                                                                              |                                                                      |   0%  |                                                                              |======================================================================| 100%
# First piece of the H2O split is used for training, the second for validation.
train_h2o <- split.h2o[[1L]]
valid_h2o <- split.h2o[[2L]]

# Upload the held-out test rows to H2O.
test_h2o <- as.h2o(test_tbl)
##   |                                                                              |                                                                      |   0%  |                                                                              |======================================================================| 100%
# Target column.
y <- "claims_class"

# Predictors: every training column except the target and `claims`.
# `claims_class` is defined as a threshold on `claims`, so keeping `claims` as
# a predictor leaks the label: any model can reproduce it from a single cut
# point, and every reported metric becomes meaningless.
x <- setdiff(names(train_tbl), c(y, "claims"))

# AutoML run: at most five models, DeepLearning excluded, 3-fold
# cross-validation, leaderboard scored on the held-out test frame.
models_h2o <- h2o.automl(
  x = x,
  y = y,
  training_frame    = train_h2o,
  validation_frame  = valid_h2o,
  leaderboard_frame = test_h2o,
  max_models        = 5,
  exclude_algos     = "DeepLearning",
  nfolds            = 3,
  seed              = 3456
)
##   |                                                                              |                                                                      |   0%  |                                                                              |===                                                                   |   4%
## 08:57:44.883: User specified a validation frame with cross-validation still enabled. Please note that the models will still be validated using cross-validation only, the validation frame will be used to provide purely informative validation metrics on the trained models.
## 08:57:44.887: AutoML: XGBoost is not available; skipping it.  |                                                                              |=====                                                                 |   8%  |                                                                              |=========                                                             |  12%  |                                                                              |==========                                                            |  15%  |                                                                              |===========                                                           |  16%  |                                                                              |============                                                          |  17%  |                                                                              |=============                                                         |  18%  |                                                                              |==============                                                        |  20%  |                                                                              |====================                                                  |  29%  |                                                                              |======================================================================| 100%
## Examine the output of h2o.automl

# Storage type of the AutoML result object.
typeof(models_h2o)
## [1] "S4"
# Slots available on the AutoML result object.
slotNames(models_h2o)
## [1] "project_name"   "leader"         "leaderboard"    "event_log"     
## [5] "modeling_steps" "training_info"
# Leaderboard of the trained models with their metrics.
models_h2o@leaderboard
##                                                 model_id auc      logloss aucpr
## 1 StackedEnsemble_BestOfFamily_1_AutoML_2_20260428_85744   1 1.666746e-04     1
## 2                          DRF_1_AutoML_2_20260428_85744   1 3.225557e-03     1
## 3                          GBM_1_AutoML_2_20260428_85744   1 2.416923e-05     1
## 4                          GBM_3_AutoML_2_20260428_85744   1 3.563682e-12     1
## 5    StackedEnsemble_AllModels_1_AutoML_2_20260428_85744   1 1.033902e-06     1
## 6                          GBM_2_AutoML_2_20260428_85744   1 1.392626e-10     1
##   mean_per_class_error         rmse          mse
## 1                    0 1.813247e-03 3.287864e-06
## 2                    0 2.087159e-02 4.356234e-04
## 3                    0 2.065652e-04 4.266918e-08
## 4                    0 6.019183e-11 3.623057e-21
## 5                    0 1.829544e-06 3.347232e-12
## 6                    0 4.540186e-09 2.061329e-17
## 
## [7 rows x 7 columns]
# Details and metrics of the leading model.
models_h2o@leader
## Model Details:
## ==============
## 
## H2OBinomialModel: stackedensemble
## Model ID:  StackedEnsemble_BestOfFamily_1_AutoML_2_20260428_85744 
## Model Summary for Stacked Ensemble: 
##                                     key            value
## 1                     Stacking strategy cross_validation
## 2  Number of base models (used / total)              2/3
## 3      # GBM base models (used / total)              1/1
## 4      # DRF base models (used / total)              1/1
## 5      # GLM base models (used / total)              0/1
## 6                 Metalearner algorithm              GLM
## 7    Metalearner fold assignment scheme           Random
## 8                    Metalearner nfolds                3
## 9               Metalearner fold_column               NA
## 10   Custom metalearner hyperparameters             None
## 
## 
## H2OBinomialMetrics: stackedensemble
## ** Reported on training data. **
## 
## MSE:  2.916358e-07
## RMSE:  0.0005400331
## LogLoss:  5.164982e-05
## Mean Per-Class Error:  0
## AUC:  1
## AUCPR:  1
## Gini:  1
## 
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
##        High  Low    Error     Rate
## High   3961    0 0.000000  =0/3961
## Low       0 3997 0.000000  =0/3997
## Totals 3961 3997 0.000000  =0/7958
## 
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold       value idx
## 1                       max f1  0.991413    1.000000  95
## 2                       max f2  0.991413    1.000000  95
## 3                 max f0point5  0.991413    1.000000  95
## 4                 max accuracy  0.991413    1.000000  95
## 5                max precision  1.000000    1.000000   0
## 6                   max recall  0.991413    1.000000  95
## 7              max specificity  1.000000    1.000000   0
## 8             max absolute_mcc  0.991413    1.000000  95
## 9   max min_per_class_accuracy  0.991413    1.000000  95
## 10 max mean_per_class_accuracy  0.991413    1.000000  95
## 11                     max tns  1.000000 3961.000000   0
## 12                     max fns  1.000000  102.000000   0
## 13                     max fps  0.000000 3961.000000 399
## 14                     max tps  0.991413 3997.000000  95
## 15                     max tnr  1.000000    1.000000   0
## 16                     max fnr  1.000000    0.025519   0
## 17                     max fpr  0.000000    1.000000 399
## 18                     max tpr  0.991413    1.000000  95
## 
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## H2OBinomialMetrics: stackedensemble
## ** Reported on validation data. **
## 
## MSE:  1.401988e-05
## RMSE:  0.003744313
## LogLoss:  0.000268815
## Mean Per-Class Error:  0
## AUC:  1
## AUCPR:  1
## Gini:  1
## 
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
##        High Low    Error     Rate
## High    709   0 0.000000   =0/709
## Low       0 673 0.000000   =0/673
## Totals  709 673 0.000000  =0/1382
## 
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold      value idx
## 1                       max f1  0.990220   1.000000  25
## 2                       max f2  0.990220   1.000000  25
## 3                 max f0point5  0.990220   1.000000  25
## 4                 max accuracy  0.990220   1.000000  25
## 5                max precision  1.000000   1.000000   0
## 6                   max recall  0.990220   1.000000  25
## 7              max specificity  1.000000   1.000000   0
## 8             max absolute_mcc  0.990220   1.000000  25
## 9   max min_per_class_accuracy  0.990220   1.000000  25
## 10 max mean_per_class_accuracy  0.990220   1.000000  25
## 11                     max tns  1.000000 709.000000   0
## 12                     max fns  1.000000  26.000000   0
## 13                     max fps  0.000000 709.000000 399
## 14                     max tps  0.990220 673.000000  25
## 15                     max tnr  1.000000   1.000000   0
## 16                     max fnr  1.000000   0.038633   0
## 17                     max fpr  0.000000   1.000000 399
## 18                     max tpr  0.990220   1.000000  25
## 
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## H2OBinomialMetrics: stackedensemble
## ** Reported on cross-validation data. **
## ** 3-fold cross-validation on training data (Metrics computed for combined holdout predictions) **
## 
## MSE:  0.0003900759
## RMSE:  0.01975034
## LogLoss:  0.001823021
## Mean Per-Class Error:  0.0002501876
## AUC:  0.9999996
## AUCPR:  0.9999996
## Gini:  0.9999992
## 
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
##        High  Low    Error     Rate
## High   3961    0 0.000000  =0/3961
## Low       2 3995 0.000500  =2/3997
## Totals 3963 3995 0.000251  =2/7958
## 
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold       value idx
## 1                       max f1  0.396117    0.999750 144
## 2                       max f2  0.244490    0.999800 150
## 3                 max f0point5  0.396117    0.999900 144
## 4                 max accuracy  0.396117    0.999749 144
## 5                max precision  1.000000    1.000000   0
## 6                   max recall  0.244490    1.000000 150
## 7              max specificity  1.000000    1.000000   0
## 8             max absolute_mcc  0.396117    0.999497 144
## 9   max min_per_class_accuracy  0.396117    0.999500 144
## 10 max mean_per_class_accuracy  0.396117    0.999750 144
## 11                     max tns  1.000000 3961.000000   0
## 12                     max fns  1.000000  197.000000   0
## 13                     max fps  0.000000 3961.000000 399
## 14                     max tps  0.244490 3997.000000 150
## 15                     max tnr  1.000000    1.000000   0
## 16                     max fnr  1.000000    0.049287   0
## 17                     max fpr  0.000000    1.000000 399
## 18                     max tpr  0.244490    1.000000 150
## 
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## Cross-Validation Metrics Summary: 
##               mean       sd cv_1_valid cv_2_valid cv_3_valid
## accuracy  0.999873 0.000221   1.000000   1.000000   0.999618
## auc       1.000000 0.000000   1.000000   1.000000   0.999999
## err       0.000127 0.000221   0.000000   0.000000   0.000382
## err_count 0.333333 0.577350   0.000000   0.000000   1.000000
## f0point5  0.999800 0.000347   1.000000   1.000000   0.999400
## 
## ---
##                       mean       sd cv_1_valid cv_2_valid cv_3_valid
## precision         0.999750 0.000433   1.000000   1.000000   0.999250
## r2                0.998436 0.000456   0.998800   0.998583   0.997925
## recall            1.000000 0.000000   1.000000   1.000000   1.000000
## residual_deviance 9.671735 1.603306   9.527632   8.145347  11.342228
## rmse              0.019638 0.002818   0.017317   0.018824   0.022774
## specificity       0.999741 0.000449   1.000000   1.000000   0.999222
## Save and Load

# Help lookups only make sense in an interactive session; run unguarded, they
# start the help server every time the document is rendered.
if (interactive()) print(help("h2o.getModel"))
## starting httpd help server ... done
# Help lookups only make sense in an interactive session; skip them when the
# document is rendered.
if (interactive()) {
  print(help("h2o.saveModel"))
  print(help("h2o.loadModel"))
}

# Make sure the output folder exists; no warning if it is already there.
dir.create("h2o_models", showWarnings = FALSE)

# Save the leading model, overwriting an existing file of the same name.
# The call's value (the saved path) is printed.
h2o.saveModel(
  object = models_h2o@leader,
  path = "h2o_models/",
  force = TRUE
)
## [1] "C:\\Users\\javony\\Desktop\\PSU_DATA3100\\h2o_models\\StackedEnsemble_BestOfFamily_1_AutoML_2_20260428_85744"
# Keep a handle on the leading AutoML model for prediction and evaluation.
best_model <- slot(models_h2o, "leader")
## Make predictions

# Score the held-out test frame with the leading model.
predictions <- h2o.predict(object = best_model, newdata = test_h2o)
##   |                                                                              |                                                                      |   0%  |                                                                              |======================================================================| 100%
# Bring the H2O predictions back into R as a tibble.
predictions_tbl <- as_tibble(predictions)

# Show predictions side by side with the test rows they were made for.
bind_cols(predictions_tbl, test_tbl)
## # A tibble: 2,336 × 8
##    predict  High      Low symbol      claims claims_class  year month
##    <fct>   <dbl>    <dbl> <fct>        <int> <fct>        <dbl> <dbl>
##  1 High    1.000 1.19e- 9 Connecticut   4162 High          1989     2
##  2 High    1.000 1.58e- 9 Connecticut   2886 High          1989     3
##  3 High    1.000 1.47e- 9 Connecticut   2694 High          1989     4
##  4 High    1.000 1.43e- 9 Connecticut   3224 High          1989     5
##  5 High    1.000 1.38e- 9 Connecticut   2663 High          1989     6
##  6 High    1.000 1.26e- 9 Connecticut   5630 High          1989     7
##  7 High    1.000 1.21e- 9 Connecticut   7029 High          1989     8
##  8 High    1.000 1.20e- 9 Connecticut   3025 High          1989     9
##  9 High    1.000 1.29e- 9 Connecticut   3454 High          1989    10
## 10 High    1.000 9.69e-10 Connecticut   4392 High          1990     5
## # ℹ 2,326 more rows
## Evaluate model

# Help lookups only make sense in an interactive session; skip them when the
# document is rendered.
if (interactive()) print(help("h2o.performance"))

# Performance of the leading model on the held-out test frame.
performance_h2o <- h2o.performance(best_model, newdata = test_h2o)

# Storage type of the performance object.
typeof(performance_h2o)
## [1] "S4"
# Slots available on the performance object.
methods::slotNames(performance_h2o)
## [1] "algorithm" "on_train"  "on_valid"  "on_xval"   "metrics"
# Full list of test-set metrics (MSE, AUC, confusion matrix, thresholds, ...).
performance_h2o@metrics
## $model
## $model$`__meta`
## $model$`__meta`$schema_version
## [1] 3
## 
## $model$`__meta`$schema_name
## [1] "ModelKeyV3"
## 
## $model$`__meta`$schema_type
## [1] "Key<Model>"
## 
## 
## $model$name
## [1] "StackedEnsemble_BestOfFamily_1_AutoML_2_20260428_85744"
## 
## $model$type
## [1] "Key<Model>"
## 
## $model$URL
## [1] "/3/Models/StackedEnsemble_BestOfFamily_1_AutoML_2_20260428_85744"
## 
## 
## $model_checksum
## [1] "-1123968970341071515"
## 
## $frame
## $frame$name
## [1] "test_tbl_sid_8b64_3"
## 
## 
## $frame_checksum
## [1] 5.300821e+15
## 
## $description
## NULL
## 
## $scoring_time
## [1] 1.777381e+12
## 
## $predictions
## NULL
## 
## $MSE
## [1] 3.287864e-06
## 
## $RMSE
## [1] 0.001813247
## 
## $nobs
## [1] 2336
## 
## $custom_metric_name
## NULL
## 
## $custom_metric_value
## [1] 0
## 
## $r2
## [1] 0.9999868
## 
## $logloss
## [1] 0.0001666746
## 
## $AUC
## [1] 1
## 
## $pr_auc
## [1] 1
## 
## $Gini
## [1] 1
## 
## $mean_per_class_error
## [1] 0
## 
## $domain
## [1] "High" "Low" 
## 
## $cm
## $cm$`__meta`
## $cm$`__meta`$schema_version
## [1] 3
## 
## $cm$`__meta`$schema_name
## [1] "ConfusionMatrixV3"
## 
## $cm$`__meta`$schema_type
## [1] "ConfusionMatrix"
## 
## 
## $cm$table
## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
##        High  Low  Error        Rate
## High   1168    0 0.0000 = 0 / 1,168
## Low       0 1168 0.0000 = 0 / 1,168
## Totals 1168 1168 0.0000 = 0 / 2,336
## 
## 
## $thresholds_and_metric_scores
## Metrics for Thresholds: Binomial metrics as a function of classification thresholds
##   threshold       f1       f2 f0point5 accuracy precision   recall specificity
## 1  1.000000 0.979467 0.967547 0.991684 0.979880  1.000000 0.959760    1.000000
## 2  0.999991 0.979913 0.968243 0.991867 0.980308  1.000000 0.960616    1.000000
## 3  0.999991 0.980358 0.968939 0.992049 0.980736  1.000000 0.961473    1.000000
## 4  0.999969 0.980803 0.969634 0.992232 0.981164  1.000000 0.962329    1.000000
## 5  0.999966 0.981247 0.970329 0.992414 0.981592  1.000000 0.963185    1.000000
##   absolute_mcc min_per_class_accuracy mean_per_class_accuracy  tns fns fps  tps
## 1     0.960538               0.959760                0.979880 1168  47   0 1121
## 2     0.961362               0.960616                0.980308 1168  46   0 1122
## 3     0.962187               0.961473                0.980736 1168  45   0 1123
## 4     0.963012               0.962329                0.981164 1168  44   0 1124
## 5     0.963838               0.963185                0.981592 1168  43   0 1125
##        tnr      fnr      fpr      tpr idx
## 1 1.000000 0.040240 0.000000 0.959760   0
## 2 1.000000 0.039384 0.000000 0.960616   1
## 3 1.000000 0.038527 0.000000 0.961473   2
## 4 1.000000 0.037671 0.000000 0.962329   3
## 5 1.000000 0.036815 0.000000 0.963185   4
## 
## ---
##     threshold       f1       f2 f0point5 accuracy precision   recall
## 395  0.000000 0.672811 0.837156 0.562404 0.513699  0.506944 1.000000
## 396  0.000000 0.671072 0.836077 0.560461 0.509846  0.504972 1.000000
## 397  0.000000 0.669149 0.834882 0.558317 0.505565  0.502798 1.000000
## 398  0.000000 0.668575 0.834524 0.557678 0.504281  0.502150 1.000000
## 399  0.000000 0.668001 0.834167 0.557039 0.502997  0.501503 1.000000
## 400  0.000000 0.666667 0.833333 0.555556 0.500000  0.500000 1.000000
##     specificity absolute_mcc min_per_class_accuracy mean_per_class_accuracy tns
## 395    0.027397     0.117851               0.027397                0.513699  32
## 396    0.019692     0.099719               0.019692                0.509846  23
## 397    0.011130     0.074808               0.011130                0.505565  13
## 398    0.008562     0.065568               0.008562                0.504281  10
## 399    0.005993     0.054823               0.005993                0.502997   7
## 400    0.000000     0.000000               0.000000                0.500000   0
##     fns  fps  tps      tnr      fnr      fpr      tpr idx
## 395   0 1136 1168 0.027397 0.000000 0.972603 1.000000 394
## 396   0 1145 1168 0.019692 0.000000 0.980308 1.000000 395
## 397   0 1155 1168 0.011130 0.000000 0.988870 1.000000 396
## 398   0 1158 1168 0.008562 0.000000 0.991438 1.000000 397
## 399   0 1161 1168 0.005993 0.000000 0.994007 1.000000 398
## 400   0 1168 1168 0.000000 0.000000 1.000000 1.000000 399
## 
## $max_criteria_and_metric_scores
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold       value idx
## 1                       max f1  0.980389    1.000000  47
## 2                       max f2  0.980389    1.000000  47
## 3                 max f0point5  0.980389    1.000000  47
## 4                 max accuracy  0.980389    1.000000  47
## 5                max precision  1.000000    1.000000   0
## 6                   max recall  0.980389    1.000000  47
## 7              max specificity  1.000000    1.000000   0
## 8             max absolute_mcc  0.980389    1.000000  47
## 9   max min_per_class_accuracy  0.980389    1.000000  47
## 10 max mean_per_class_accuracy  0.980389    1.000000  47
## 11                     max tns  1.000000 1168.000000   0
## 12                     max fns  1.000000   47.000000   0
## 13                     max fps  0.000000 1168.000000 399
## 14                     max tps  0.980389 1168.000000  47
## 15                     max tnr  1.000000    1.000000   0
## 16                     max fnr  1.000000    0.040240   0
## 17                     max fpr  0.000000    1.000000 399
## 18                     max tpr  0.980389    1.000000  47
## 
## $gains_lift_table
## Gains/Lift Table: Avg response rate: 50.00 %, avg score: 50.01 %
##    group cumulative_data_fraction lower_threshold     lift cumulative_lift
## 1      1               0.01027397        1.000000 2.000000        2.000000
## 2      2               0.02011986        1.000000 2.000000        2.000000
## 3      3               0.03039384        1.000000 2.000000        2.000000
## 4      4               0.04023973        1.000000 2.000000        2.000000
## 5      5               0.05008562        1.000000 2.000000        2.000000
## 6      6               0.10017123        1.000000 2.000000        2.000000
## 7      7               0.15025685        1.000000 2.000000        2.000000
## 8      8               0.20034247        1.000000 2.000000        2.000000
## 9      9               0.30008562        1.000000 2.000000        2.000000
## 10    10               0.40025685        1.000000 2.000000        2.000000
## 11    11               0.50000000        0.521434 2.000000        2.000000
## 12    12               0.60017123        0.000000 0.000000        1.666191
## 13    13               0.69991438        0.000000 0.000000        1.428746
## 14    14               0.80008562        0.000000 0.000000        1.249866
## 15    15               0.89982877        0.000000 0.000000        1.111323
## 16    16               1.00000000        0.000000 0.000000        1.000000
##    response_rate    score cumulative_response_rate cumulative_score
## 1       1.000000 1.000000                 1.000000         1.000000
## 2       1.000000 1.000000                 1.000000         1.000000
## 3       1.000000 1.000000                 1.000000         1.000000
## 4       1.000000 1.000000                 1.000000         1.000000
## 5       1.000000 1.000000                 1.000000         1.000000
## 6       1.000000 1.000000                 1.000000         1.000000
## 7       1.000000 1.000000                 1.000000         1.000000
## 8       1.000000 1.000000                 1.000000         1.000000
## 9       1.000000 1.000000                 1.000000         1.000000
## 10      1.000000 1.000000                 1.000000         1.000000
## 11      1.000000 0.999430                 1.000000         0.999886
## 12      0.000000 0.001079                 0.833096         0.833181
## 13      0.000000 0.000000                 0.714373         0.714446
## 14      0.000000 0.000000                 0.624933         0.624997
## 15      0.000000 0.000000                 0.555661         0.555718
## 16      0.000000 0.000000                 0.500000         0.500051
##    capture_rate cumulative_capture_rate        gain cumulative_gain
## 1      0.020548                0.020548  100.000000      100.000000
## 2      0.019692                0.040240  100.000000      100.000000
## 3      0.020548                0.060788  100.000000      100.000000
## 4      0.019692                0.080479  100.000000      100.000000
## 5      0.019692                0.100171  100.000000      100.000000
## 6      0.100171                0.200342  100.000000      100.000000
## 7      0.100171                0.300514  100.000000      100.000000
## 8      0.100171                0.400685  100.000000      100.000000
## 9      0.199486                0.600171  100.000000      100.000000
## 10     0.200342                0.800514  100.000000      100.000000
## 11     0.199486                1.000000  100.000000      100.000000
## 12     0.000000                1.000000 -100.000000       66.619116
## 13     0.000000                1.000000 -100.000000       42.874618
## 14     0.000000                1.000000 -100.000000       24.986624
## 15     0.000000                1.000000 -100.000000       11.132255
## 16     0.000000                1.000000 -100.000000        0.000000
##    kolmogorov_smirnov
## 1            0.020548
## 2            0.040240
## 3            0.060788
## 4            0.080479
## 5            0.100171
## 6            0.200342
## 7            0.300514
## 8            0.400685
## 9            0.600171
## 10           0.800514
## 11           1.000000
## 12           0.799658
## 13           0.600171
## 14           0.399829
## 15           0.200342
## 16           0.000000
## 
## $residual_deviance
## [1] 0.7787036
## 
## $null_deviance
## [1] 3238.431
## 
## $AIC
## [1] 6.778704
## 
## $loglikelihood
## [1] 0
## 
## $null_degrees_of_freedom
## [1] 2335
## 
## $residual_degrees_of_freedom
## [1] 2333
# Report the AUC held in the H2O performance object.
# NOTE(review): the captured result below is exactly 1; a perfect score is
# worth double-checking for target leakage or evaluation on training rows.
performance_h2o |>
  h2o.auc()
## [1] 1
# Confusion matrix for the same performance object. No threshold or metric is
# supplied here; the captured output reports the max-F1 threshold being used.
performance_h2o |>
  h2o.confusionMatrix()
## Confusion Matrix (vertical: actual; across: predicted)  for max f1 @ threshold = 0.980388689091854:
##        High  Low    Error     Rate
## High   1168    0 0.000000  =0/1168
## Low       0 1168 0.000000  =0/1168
## Totals 1168 1168 0.000000  =0/2336
# Per-threshold binomial metrics table (f1, accuracy, precision, recall, ...)
# for the performance object; with no metric argument every column is printed.
performance_h2o |>
  h2o.metric()
## Metrics for Thresholds: Binomial metrics as a function of classification thresholds
##   threshold       f1       f2 f0point5 accuracy precision   recall specificity
## 1  1.000000 0.979467 0.967547 0.991684 0.979880  1.000000 0.959760    1.000000
## 2  0.999991 0.979913 0.968243 0.991867 0.980308  1.000000 0.960616    1.000000
## 3  0.999991 0.980358 0.968939 0.992049 0.980736  1.000000 0.961473    1.000000
## 4  0.999969 0.980803 0.969634 0.992232 0.981164  1.000000 0.962329    1.000000
## 5  0.999966 0.981247 0.970329 0.992414 0.981592  1.000000 0.963185    1.000000
##   absolute_mcc min_per_class_accuracy mean_per_class_accuracy  tns fns fps  tps
## 1     0.960538               0.959760                0.979880 1168  47   0 1121
## 2     0.961362               0.960616                0.980308 1168  46   0 1122
## 3     0.962187               0.961473                0.980736 1168  45   0 1123
## 4     0.963012               0.962329                0.981164 1168  44   0 1124
## 5     0.963838               0.963185                0.981592 1168  43   0 1125
##        tnr      fnr      fpr      tpr idx
## 1 1.000000 0.040240 0.000000 0.959760   0
## 2 1.000000 0.039384 0.000000 0.960616   1
## 3 1.000000 0.038527 0.000000 0.961473   2
## 4 1.000000 0.037671 0.000000 0.962329   3
## 5 1.000000 0.036815 0.000000 0.963185   4
## 
## ---
##     threshold       f1       f2 f0point5 accuracy precision   recall
## 395  0.000000 0.672811 0.837156 0.562404 0.513699  0.506944 1.000000
## 396  0.000000 0.671072 0.836077 0.560461 0.509846  0.504972 1.000000
## 397  0.000000 0.669149 0.834882 0.558317 0.505565  0.502798 1.000000
## 398  0.000000 0.668575 0.834524 0.557678 0.504281  0.502150 1.000000
## 399  0.000000 0.668001 0.834167 0.557039 0.502997  0.501503 1.000000
## 400  0.000000 0.666667 0.833333 0.555556 0.500000  0.500000 1.000000
##     specificity absolute_mcc min_per_class_accuracy mean_per_class_accuracy tns
## 395    0.027397     0.117851               0.027397                0.513699  32
## 396    0.019692     0.099719               0.019692                0.509846  23
## 397    0.011130     0.074808               0.011130                0.505565  13
## 398    0.008562     0.065568               0.008562                0.504281  10
## 399    0.005993     0.054823               0.005993                0.502997   7
## 400    0.000000     0.000000               0.000000                0.500000   0
##     fns  fps  tps      tnr      fnr      fpr      tpr idx
## 395   0 1136 1168 0.027397 0.000000 0.972603 1.000000 394
## 396   0 1145 1168 0.019692 0.000000 0.980308 1.000000 395
## 397   0 1155 1168 0.011130 0.000000 0.988870 1.000000 396
## 398   0 1158 1168 0.008562 0.000000 0.991438 1.000000 397
## 399   0 1161 1168 0.005993 0.000000 0.994007 1.000000 398
## 400   0 1168 1168 0.000000 0.000000 1.000000 1.000000 399