# for Core packages
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# for financial analysis
library(tidyquant)
## Loading required package: PerformanceAnalytics
## Loading required package: xts
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## 
## ######################### Warning from 'xts' package ##########################
## #                                                                             #
## # The dplyr lag() function breaks how base R's lag() function is supposed to  #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or       #
## # source() into this session won't work correctly.                            #
## #                                                                             #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop           #
## # dplyr from breaking base R's lag() function.                                #
## #                                                                             #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning.  #
## #                                                                             #
## ###############################################################################
## 
## Attaching package: 'xts'
## 
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## 
## 
## Attaching package: 'PerformanceAnalytics'
## 
## The following object is masked from 'package:graphics':
## 
##     legend
## 
## Loading required package: quantmod
## Loading required package: TTR
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
# for times series
library(timetk)

Goal: Apply Matt Dancho’s tutorial to weekly initial unemployment claims for the six New England states.

The following is a replication of Matt Dancho’s tutorial.

# Earliest observation date requested from the data source.
start_date <- "1989-01-01"

# Series IDs follow the pattern "<state abbreviation>ICLAIMS":
# Connecticut, Maine, Massachusetts, New Hampshire, Rhode Island, Vermont.
symbols_txt <- paste0(c("CT", "ME", "MA", "NH", "RI", "VT"), "ICLAIMS")

# Download all six series into one long table, then tidy the column names
# and replace the series IDs with readable state names.
claims_tbl <- symbols_txt %>%
  tq_get(get = "economic.data", from = start_date) %>%
  # The downloaded value column is called `price`; here it holds claim counts.
  rename(claims = price) %>%
  mutate(
    symbol = fct_recode(
      symbol,
      "Connecticut"   = "CTICLAIMS",
      "Maine"         = "MEICLAIMS",
      "Massachusetts" = "MAICLAIMS",
      "New Hampshire" = "NHICLAIMS",
      "Rhode Island"  = "RIICLAIMS",
      "Vermont"       = "VTICLAIMS"
    )
  )

Plotting time series

# Show the first rows of the long-format claims table.
print(claims_tbl)
## # A tibble: 11,238 × 3
##    symbol      date       claims
##    <fct>       <date>      <int>
##  1 Connecticut 1989-01-07   8345
##  2 Connecticut 1989-01-14   6503
##  3 Connecticut 1989-01-21   3821
##  4 Connecticut 1989-01-28   4663
##  5 Connecticut 1989-02-04   4162
##  6 Connecticut 1989-02-11   4337
##  7 Connecticut 1989-02-18   4079
##  8 Connecticut 1989-02-25   3556
##  9 Connecticut 1989-03-04   3826
## 10 Connecticut 1989-03-11   3515
## # ℹ 11,228 more rows
# Plot every row as a single series (the table is not grouped by state here).
plot_time_series(claims_tbl, .date_var = date, .value = claims)
# Number of observations per state.
count(claims_tbl, symbol)
## # A tibble: 6 × 2
##   symbol            n
##   <fct>         <int>
## 1 Connecticut    1873
## 2 Massachusetts  1873
## 3 Maine          1873
## 4 New Hampshire  1873
## 5 Rhode Island   1873
## 6 Vermont        1873
# Static (non-interactive) plot with one facet per state, two facets per row.
plot_time_series(
  group_by(claims_tbl, symbol),
  .date_var     = date,
  .value        = claims,
  .facet_ncol   = 2,
  .facet_scales = "free_x",
  .interactive  = FALSE
)

Box plots

# Yearly box plots of claims, one facet per state.
# filter_by_time() is given only the date column, so no explicit start or end
# bound is applied at this step.
claims_tbl %>%
  filter_by_time(.date_var = date) %>%
  group_by(symbol) %>%
  plot_time_series_boxplot(
    .date_var   = date,
    .value      = claims,
    .period     = "1 year",
    .facet_ncol = 2
  )

Regression plots

# Per-state regression of log(claims) on a linear time trend plus a
# month-of-year effect; the fitted line is drawn over the data and the model
# summary is not printed.
plot_time_series_regression(
  group_by(claims_tbl, symbol),
  .date_var     = date,
  .formula      = log(claims) ~ as.numeric(date) + month(date, label = TRUE),
  .facet_ncol   = 2,
  .show_summary = FALSE
)

Plotting Seasonality and Correlation

Correlation Plots

# ACF/PACF diagnostics per state, restricted to lags within 7 days.
# NOTE(review): the observations are one week apart, so a 7-day window
# presumably covers only a single lag step - confirm this is intended.
plot_acf_diagnostics(
  group_by(claims_tbl, symbol),
  .date_var = date,
  .value    = claims,
  .lags     = "7 days"
)
# ACF/PACF diagnostics per state over the default lag range.
# The date column is named `date` (lower case); `Date` is not a column of
# claims_tbl. `.ccf_vars` is left out because the table holds no additional
# numeric series to cross-correlate with `claims` (the only other columns are
# the grouping factor and the date itself).
claims_tbl %>%
    group_by(symbol) %>%
    plot_acf_diagnostics(
        .date_var = date,
        .value    = claims)

Seasonality

# Seasonal diagnostic plots of claims, computed separately for each state.
plot_seasonal_diagnostics(
  group_by(claims_tbl, symbol),
  .date_var = date,
  .value    = claims
)

STL Diagnostics

# STL decomposition per state, showing only the observed, trend and
# remainder panels.
plot_stl_diagnostics(
  group_by(claims_tbl, symbol),
  .date_var    = date,
  .value       = claims,
  .feature_set = c("observed", "trend", "remainder")
)
## frequency = 13 observations per 1 quarter
## trend = 53 observations per 1 year
## frequency = 13 observations per 1 quarter
## trend = 53 observations per 1 year
## frequency = 13 observations per 1 quarter
## trend = 53 observations per 1 year
## frequency = 13 observations per 1 quarter
## trend = 53 observations per 1 year
## frequency = 13 observations per 1 quarter
## trend = 53 observations per 1 year
## frequency = 13 observations per 1 quarter
## trend = 53 observations per 1 year

Time Series Data Wrangling

Summarize by Time

# Baseline view of the raw series per state before any time-based aggregation.
plot_time_series(
  group_by(claims_tbl, symbol),
  .date_var    = date,
  .value       = claims,
  .facet_ncol  = 2,
  .interactive = FALSE
)

Summarize by quarter

# Total claims per state and quarter, plotted one facet per state.
claims_tbl %>%
  group_by(symbol) %>%
  summarise_by_time(
    .date_var    = date,
    .by          = "quarter",
    total_claims = sum(claims)
  ) %>%
  plot_time_series(
    .date_var    = date,
    .value       = total_claims,
    .facet_ncol  = 2,
    .interactive = FALSE
  )

# Average claims per state and month, plotted one facet per state.
claims_tbl %>%
  group_by(symbol) %>%
  summarise_by_time(
    .date_var  = date,
    .by        = "month",
    avg_claims = mean(claims)
  ) %>%
  plot_time_series(
    .date_var    = date,
    .value       = avg_claims,
    .facet_ncol  = 2,
    .interactive = FALSE
  )

Filter By Time

# Keep only observations from 2013 through 2015 for each state, then plot.
claims_tbl %>%
  group_by(symbol) %>%
  filter_by_time(.date_var = date, .start_date = "2013", .end_date = "2015") %>%
  plot_time_series(.date_var = date, .value = claims, .facet_ncol = 2)

Padding Data

# Fill any gaps in each state's series, letting the interval be detected
# automatically and using 0 for inserted rows.
pad_by_time(
  group_by(claims_tbl, symbol),
  .date_var  = date,
  .by        = "auto",
  .pad_value = 0
)
## pad applied on the interval: week
## # A tibble: 11,238 × 3
## # Groups:   symbol [6]
##    symbol      date       claims
##    <fct>       <date>      <int>
##  1 Connecticut 1989-01-07   8345
##  2 Connecticut 1989-01-14   6503
##  3 Connecticut 1989-01-21   3821
##  4 Connecticut 1989-01-28   4663
##  5 Connecticut 1989-02-04   4162
##  6 Connecticut 1989-02-11   4337
##  7 Connecticut 1989-02-18   4079
##  8 Connecticut 1989-02-25   3556
##  9 Connecticut 1989-03-04   3826
## 10 Connecticut 1989-03-11   3515
## # ℹ 11,228 more rows

Sliding (Rolling) Calculations

# Two-observation rolling mean on the first ten rows. The window is
# left-aligned (current row plus the next one), and partial windows are kept.
claims_tbl %>%
  head(10) %>%
  mutate(
    rolling_avg_2 = slidify_vec(
      .x       = claims,
      .f       = mean,
      .period  = 2,
      .align   = "left",
      .partial = TRUE
    )
  )
## # A tibble: 10 × 4
##    symbol      date       claims rolling_avg_2
##    <fct>       <date>      <int>         <dbl>
##  1 Connecticut 1989-01-07   8345         7424 
##  2 Connecticut 1989-01-14   6503         5162 
##  3 Connecticut 1989-01-21   3821         4242 
##  4 Connecticut 1989-01-28   4663         4412.
##  5 Connecticut 1989-02-04   4162         4250.
##  6 Connecticut 1989-02-11   4337         4208 
##  7 Connecticut 1989-02-18   4079         3818.
##  8 Connecticut 1989-02-25   3556         3691 
##  9 Connecticut 1989-03-04   3826         3670.
## 10 Connecticut 1989-03-11   3515         3515
# Reusable rolling mean: 30-observation window, centred on each row,
# with partial windows allowed.
roll_avg_30 <- slidify(
  .f       = mean,
  .period  = 30,
  .align   = "center",
  .partial = TRUE
)

# Add the rolling mean within each state, stack the raw and smoothed values
# into name/value pairs, and plot them coloured by series.
claims_tbl %>%
  select(symbol, date, claims) %>%
  group_by(symbol) %>%
  mutate(rolling_avg_30 = roll_avg_30(claims)) %>%
  tidyr::pivot_longer(cols = c(claims, rolling_avg_30)) %>%
  plot_time_series(
    .date_var   = date,
    .value      = value,
    .color_var  = name,
    .facet_ncol = 2,
    .smooth     = FALSE
  )

Apply 11

Build a Classification Model using H2O

Set up

Import data

Import the cleaned data from Module 7.

library(h2o)
## 
## ----------------------------------------------------------------------
## 
## Your next step is to start H2O:
##     > h2o.init()
## 
## For H2O package documentation, ask for help:
##     > ??h2o
## 
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
## 
## ----------------------------------------------------------------------
## 
## Attaching package: 'h2o'
## The following objects are masked from 'package:lubridate':
## 
##     day, hour, month, week, year
## The following objects are masked from 'package:stats':
## 
##     cor, sd, var
## The following objects are masked from 'package:base':
## 
##     &&, %*%, %in%, ||, apply, as.factor, as.numeric, colnames,
##     colnames<-, ifelse, is.character, is.factor, is.numeric, log,
##     log10, log1p, log2, round, signif, trunc
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom        1.0.5      ✔ rsample      1.2.1 
## ✔ dials        1.2.1      ✔ tune         1.2.1 
## ✔ infer        1.0.7      ✔ workflows    1.1.4 
## ✔ modeldata    1.4.0      ✔ workflowsets 1.1.0 
## ✔ parsnip      1.2.1      ✔ yardstick    1.3.1 
## ✔ recipes      1.0.10
## Warning: package 'modeldata' was built under R version 4.3.3
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ xts::first()      masks dplyr::first()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ xts::last()       masks dplyr::last()
## ✖ dials::momentum() masks TTR::momentum()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Use tidymodels_prefer() to resolve common conflicts.
# h2o requires all variables to be either numeric or factors, so any
# character column is converted to a factor. The result is only printed here;
# it is not assigned, so claims_tbl itself is left unchanged.
mutate(claims_tbl, across(where(is.character), factor))
## # A tibble: 11,238 × 3
##    symbol      date       claims
##    <fct>       <date>      <int>
##  1 Connecticut 1989-01-07   8345
##  2 Connecticut 1989-01-14   6503
##  3 Connecticut 1989-01-21   3821
##  4 Connecticut 1989-01-28   4663
##  5 Connecticut 1989-02-04   4162
##  6 Connecticut 1989-02-11   4337
##  7 Connecticut 1989-02-18   4079
##  8 Connecticut 1989-02-25   3556
##  9 Connecticut 1989-03-04   3826
## 10 Connecticut 1989-03-11   3515
## # ℹ 11,228 more rows

Split data

# Fix the RNG so the train/test partition is reproducible.
set.seed(1234)

# Split into training and testing sets, stratifying on `claims`.
# NOTE(review): the model trained later predicts `symbol`; confirm that
# stratifying on `claims` rather than `symbol` is intended.
data_split <- claims_tbl %>% initial_split(strata = "claims")
train_tbl  <- data_split %>% training()
test_tbl   <- data_split %>% testing()

Recipes

# Preprocessing recipe for the classification task. The outcome is `symbol`
# (the state label), matching the response passed to h2o.automl() later in the
# script; all remaining columns are predictors.
recipe_obj <- recipe(symbol ~ ., data = train_tbl) %>%

    # Remove zero variance variables
    step_zv(all_predictors())

Model

# Connect to the H2O cluster (or start one) before building any H2O frames.
h2o::h2o.init()
##  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         1 hours 37 minutes 
##     H2O cluster timezone:       America/New_York 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.44.0.3 
##     H2O cluster version age:    11 months and 12 days 
##     H2O cluster name:           H2O_started_from_R_johnnymckinnon_nqx583 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   2.76 GB 
##     H2O cluster total cores:    12 
##     H2O cluster allowed cores:  12 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     R Version:                  R version 4.3.2 (2023-10-31)
## Warning in h2o.clusterInfo(): 
## Your H2O cluster version is (11 months and 12 days) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html
# Upload the training table to H2O and split it 85/15 with a fixed seed.
# The object name is kept as-is because later lines index into it.
split.h20 <- train_tbl %>%
  as.h2o() %>%
  h2o.splitFrame(ratios = 0.85, seed = 5639)
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
# First piece of the split is used for training, the second for validation.
train_h2o <- split.h20[[1L]]
valid_h2o <- split.h20[[2L]]
# The held-out test table is uploaded to H2O as its own frame.
test_h2o  <- test_tbl %>% as.h2o()
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
# Response is the state label; every other column of the training table is a
# predictor.
y <- "symbol"
x <- names(train_tbl)[names(train_tbl) != y]

# Run AutoML: at most 10 models, no deep learning, 5-fold cross-validation,
# fixed seed. The test frame is used to rank models on the leaderboard.
# (A runtime cap, max_runtime_secs = 30, is deliberately left disabled.)
models_h2o <- h2o.automl(
  x                 = x,
  y                 = y,
  training_frame    = train_h2o,
  validation_frame  = valid_h2o,
  leaderboard_frame = test_h2o,
  max_models        = 10,
  exclude_algos     = "DeepLearning",
  nfolds            = 5,
  seed              = 3456
)
## 
  |                                                                            
  |                                                                      |   0%
## 16:23:21.577: User specified a validation frame with cross-validation still enabled. Please note that the models will still be validated using cross-validation only, the validation frame will be used to provide purely informative validation metrics on the trained models.
  |                                                                            
  |==                                                                    |   3%
  |                                                                            
  |===                                                                   |   4%
  |                                                                            
  |===                                                                   |   5%
  |                                                                            
  |====                                                                  |   5%
  |                                                                            
  |=====                                                                 |   7%
  |                                                                            
  |=====                                                                 |   8%
  |                                                                            
  |=======                                                               |  10%
  |                                                                            
  |=========                                                             |  13%
  |                                                                            
  |===========                                                           |  15%
  |                                                                            
  |============                                                          |  17%
  |                                                                            
  |=============                                                         |  18%
  |                                                                            
  |==============                                                        |  21%
  |                                                                            
  |====================                                                  |  28%
  |                                                                            
  |======================                                                |  31%
  |                                                                            
  |=======================                                               |  33%
  |                                                                            
  |======================================================================| 100%

Examine the output of h2o.automl

# Internal storage type of the AutoML result object.
typeof(models_h2o)
## [1] "S4"
# List the slots available on the AutoML result object.
slotNames(models_h2o)
## [1] "project_name"   "leader"         "leaderboard"    "event_log"     
## [5] "modeling_steps" "training_info"
# Ranked table of the models produced by the AutoML run.
slot(models_h2o, "leaderboard")
##                                                  model_id mean_per_class_error
## 1    StackedEnsemble_AllModels_1_AutoML_9_20241203_162321            0.3685360
## 2 StackedEnsemble_BestOfFamily_1_AutoML_9_20241203_162321            0.4152764
## 3                          XRT_1_AutoML_9_20241203_162321            0.5167572
## 4                          GBM_1_AutoML_9_20241203_162321            0.5177677
## 5                          DRF_1_AutoML_9_20241203_162321            0.5292755
## 6                          GLM_1_AutoML_9_20241203_162321            0.5388845
##    logloss      rmse       mse
## 1 0.852542 0.5438285 0.2957494
## 2 0.911274 0.5664689 0.3208871
## 3 1.319180 0.7219818 0.5212578
## 4 1.068785 0.6276309 0.3939205
## 5 1.340430 0.7281312 0.5301751
## 6 1.244583 0.6699670 0.4488558
## 
## [12 rows x 5 columns]
# Details of the top-ranked model from the AutoML run.
slot(models_h2o, "leader")
## Model Details:
## ==============
## 
## H2OMultinomialModel: stackedensemble
## Model ID:  StackedEnsemble_AllModels_1_AutoML_9_20241203_162321 
## Model Summary for Stacked Ensemble: 
##                                     key            value
## 1                     Stacking strategy cross_validation
## 2  Number of base models (used / total)            10/10
## 3      # GBM base models (used / total)              4/4
## 4  # XGBoost base models (used / total)              3/3
## 5      # DRF base models (used / total)              2/2
## 6      # GLM base models (used / total)              1/1
## 7                 Metalearner algorithm              GLM
## 8    Metalearner fold assignment scheme           Random
## 9                    Metalearner nfolds                5
## 10              Metalearner fold_column               NA
## 11   Custom metalearner hyperparameters             None
## 
## 
## H2OMultinomialMetrics: stackedensemble
## ** Reported on training data. **
## 
## Training Set Metrics: 
## =====================
## 
## Extract training frame with `h2o.getFrame("AutoML_9_20241203_162321_training_RTMP_sid_ae6d_5")`
## MSE: (Extract with `h2o.mse`) 0.3098416
## RMSE: (Extract with `h2o.rmse`) 0.5566342
## Logloss: (Extract with `h2o.logloss`) 0.8600982
## Mean Per-Class Error: 0.4415219
## AUC: (Extract with `h2o.auc`) NaN
## AUCPR: (Extract with `h2o.aucpr`) NaN
## Null Deviance: (Extract with `h2o.nulldeviance`) 25755.11
## Residual Deviance: (Extract with `h2o.residual_deviance`) 12364.77
## AIC: (Extract with `h2o.aic`) NaN
## Confusion Matrix: Extract with `h2o.confusionMatrix(<model>,train = TRUE)`)
## =========================================================================
## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
##               Connecticut Maine Massachusetts New Hampshire Rhode Island
## Connecticut          1019     4           179             1           27
## Maine                  19   332             1           246          592
## Massachusetts          85     0          1067             0            7
## New Hampshire           2   157             1           325           67
## Rhode Island          104   535             6            64          455
## Vermont                 3    35             0           291           53
## Totals               1232  1063          1254           927         1201
##               Vermont  Error            Rate
## Connecticut         0 0.1715 =   211 / 1,230
## Maine              40 0.7301 =   898 / 1,230
## Massachusetts       0 0.0794 =    92 / 1,159
## New Hampshire     645 0.7285 =   872 / 1,197
## Rhode Island       24 0.6170 =   733 / 1,188
## Vermont           802 0.3226 =   382 / 1,184
## Totals           1511 0.4435 = 3,188 / 7,188
## 
## Hit Ratio Table: Extract with `h2o.hit_ratio_table(<model>,train = TRUE)`
## =======================================================================
## Top-6 Hit Ratios: 
##   k hit_ratio
## 1 1  0.556483
## 2 2  0.893433
## 3 3  0.966889
## 4 4  0.998052
## 5 5  0.999583
## 6 6  1.000000
## 
## 
## 
## 
## H2OMultinomialMetrics: stackedensemble
## ** Reported on validation data. **
## 
## Validation Set Metrics: 
## =====================
## 
## MSE: (Extract with `h2o.mse`) 0.305676
## RMSE: (Extract with `h2o.rmse`) 0.5528798
## Logloss: (Extract with `h2o.logloss`) 0.8784158
## Mean Per-Class Error: 0.3921174
## AUC: (Extract with `h2o.auc`) NaN
## AUCPR: (Extract with `h2o.aucpr`) NaN
## Null Deviance: (Extract with `h2o.nulldeviance`) 4438.605
## Residual Deviance: (Extract with `h2o.residual_deviance`) 2174.957
## AIC: (Extract with `h2o.aic`) NaN
## Confusion Matrix: Extract with `h2o.confusionMatrix(<model>,valid = TRUE)`)
## =========================================================================
## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
##               Connecticut Maine Massachusetts New Hampshire Rhode Island
## Connecticut           152     0            34             0            4
## Maine                   2    77             1            39           73
## Massachusetts          32     0           193             1            3
## New Hampshire           3    26             0            97           22
## Rhode Island           17    51             2            20          102
## Vermont                 1    10             0            40           11
## Totals                207   164           230           197          215
##               Vermont  Error          Rate
## Connecticut         0 0.2000 =    38 / 190
## Maine              18 0.6333 =   133 / 210
## Massachusetts       0 0.1572 =    36 / 229
## New Hampshire      75 0.5650 =   126 / 223
## Rhode Island        2 0.4742 =    92 / 194
## Vermont           130 0.3229 =    62 / 192
## Totals            225 0.3934 = 487 / 1,238
## 
## Hit Ratio Table: Extract with `h2o.hit_ratio_table(<model>,valid = TRUE)`
## =======================================================================
## Top-6 Hit Ratios: 
##   k hit_ratio
## 1 1  0.606624
## 2 2  0.882068
## 3 3  0.970113
## 4 4  0.997577
## 5 5  0.999192
## 6 6  1.000000
## 
## 
## 
## 
## H2OMultinomialMetrics: stackedensemble
## ** Reported on cross-validation data. **
## ** 5-fold cross-validation on training data (Metrics computed for combined holdout predictions) **
## 
## Cross-Validation Set Metrics: 
## =====================
## 
## Extract cross-validation frame with `h2o.getFrame("levelone_training_StackedEnsemble_AllModels_1_AutoML_9_20241203_162321")`
## MSE: (Extract with `h2o.mse`) 0.2925926
## RMSE: (Extract with `h2o.rmse`) 0.5409183
## Logloss: (Extract with `h2o.logloss`) 0.8423963
## Mean Per-Class Error: 0.3708954
## AUC: (Extract with `h2o.auc`) NaN
## AUCPR: (Extract with `h2o.aucpr`) NaN
## Null Deviance: (Extract with `h2o.nulldeviance`) 25767.24
## Residual Deviance: (Extract with `h2o.residual_deviance`) 12110.29
## AIC: (Extract with `h2o.aic`) NaN
## Hit Ratio Table: Extract with `h2o.hit_ratio_table(<model>,xval = TRUE)`
## =======================================================================
## Top-6 Hit Ratios: 
##   k hit_ratio
## 1 1  0.628130
## 2 2  0.891903
## 3 3  0.965220
## 4 4  0.995409
## 5 5  0.998748
## 6 6  1.000000
## 
## 
## 
## 
## Cross-Validation Metrics Summary: 
##                                mean         sd  cv_1_valid  cv_2_valid
## accuracy                   0.627391   0.013936    0.605498    0.624741
## auc                              NA   0.000000          NA          NA
## err                        0.372609   0.013936    0.394502    0.375259
## err_count                536.000000  36.090164  574.000000  543.000000
## logloss                    0.841912   0.013921    0.859396    0.831485
## max_per_class_error        0.567701   0.039323    0.571970    0.604255
## mean_per_class_accuracy    0.628757   0.011565    0.614289    0.623152
## mean_per_class_error       0.371243   0.011565    0.385711    0.376848
## mse                        0.292296   0.005311    0.301223    0.290064
## null_deviance           5153.448700 214.011960 5216.419400 5188.217000
## pr_auc                           NA   0.000000          NA          NA
## r2                         0.900432   0.002286    0.899640    0.898534
## residual_deviance       2419.913000  85.605640 2500.841800 2406.318000
## rmse                       0.540626   0.004891    0.548838    0.538576
##                          cv_3_valid  cv_4_valid  cv_5_valid
## accuracy                   0.632997    0.630539    0.643178
## auc                              NA          NA          NA
## err                        0.367003    0.369461    0.356822
## err_count                545.000000  542.000000  476.000000
## logloss                    0.832230    0.831742    0.854704
## max_per_class_error        0.590909    0.569170    0.502203
## mean_per_class_accuracy    0.625481    0.638095    0.642768
## mean_per_class_error       0.374519    0.361905    0.357232
## mse                        0.287202    0.292172    0.290817
## null_deviance           5323.343000 5257.700700 4781.564500
## pr_auc                           NA          NA          NA
## r2                         0.903365    0.898304    0.902316
## residual_deviance       2471.723400 2440.331800 2280.350300
## rmse                       0.535912    0.540529    0.539275

Save and Load

# Open the help pages for retrieving, saving and loading H2O models.
help("h2o.getModel")
help("h2o.saveModel")
help("h2o.loadModel")

# Commented-out save step that would write the model file loaded below.
# h2o.getModel("StackedEnsemble_AllModels_1_AutoML_1_20241203_144731") %>%
#     h2o.saveModel("h2o_models/")

# Load the previously saved model. The path is tied to one specific AutoML
# run, so if it cannot be loaded (for example, the file does not exist on this
# machine) fall back to the leader of the current AutoML run instead of
# stopping the script.
best_model <- tryCatch(
    h2o.loadModel("h2o_models/StackedEnsemble_AllModels_1_AutoML_1_20241203_144731"),
    error = function(e) {
        message(
            "Could not load saved model (", conditionMessage(e),
            "); using models_h2o@leader instead."
        )
        models_h2o@leader
    }
)

Make predictions

# Score the held-out test frame with the selected model.
predictions <- best_model %>% h2o.predict(newdata = test_h2o)
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
# Pull the H2O predictions back into R as a tibble.
# as_tibble() replaces the deprecated as.tibble(), so no deprecation warning
# is raised.
predictions_tbl <- predictions %>%
    as_tibble()
# Put the predictions side by side with the original test rows.
bind_cols(predictions_tbl, test_tbl)
## # A tibble: 2,812 × 10
##    predict  Connecticut   Maine Massachusetts New.Hampshire Rhode.Island Vermont
##    <fct>          <dbl>   <dbl>         <dbl>         <dbl>        <dbl>   <dbl>
##  1 Massach…       0.474 8.37e-4      0.520        0.000200       0.00452 8.53e-5
##  2 Connect…       0.866 1.09e-2      0.0682       0.0000946      0.0547  1.47e-4
##  3 Connect…       0.851 1.65e-2      0.00961      0.00201        0.119   1.70e-3
##  4 Connect…       0.600 5.46e-2      0.00121      0.0159         0.323   5.34e-3
##  5 Connect…       0.448 1.66e-1      0.000147     0.0206         0.358   6.93e-3
##  6 Connect…       0.908 1.33e-3      0.0814       0.0000165      0.00884 7.40e-5
##  7 Connect…       0.675 6.43e-2      0.000387     0.00815        0.249   3.37e-3
##  8 Rhode I…       0.408 1.40e-1      0.000327     0.0136         0.433   4.15e-3
##  9 Connect…       0.422 1.63e-1      0.000226     0.0213         0.384   9.56e-3
## 10 Connect…       0.590 1.10e-1      0.00101      0.0166         0.276   6.50e-3
## # ℹ 2,802 more rows
## # ℹ 3 more variables: symbol <fct>, date <date>, claims <int>

Evaluate model

# Compute performance metrics for the selected model on the test frame,
# then check the storage type of the returned object.
performance_h2o <- best_model %>% h2o.performance(newdata = test_h2o)
performance_h2o %>% typeof()
## [1] "S4"
# List the slots available on the performance object.
performance_h2o %>% slotNames()
## [1] "algorithm" "on_train"  "on_valid"  "on_xval"   "metrics"
# Full list of metrics stored on the performance object.
slot(performance_h2o, "metrics")
## $model
## $model$`__meta`
## $model$`__meta`$schema_version
## [1] 3
## 
## $model$`__meta`$schema_name
## [1] "ModelKeyV3"
## 
## $model$`__meta`$schema_type
## [1] "Key<Model>"
## 
## 
## $model$name
## [1] "StackedEnsemble_AllModels_1_AutoML_1_20241203_144731"
## 
## $model$type
## [1] "Key<Model>"
## 
## $model$URL
## [1] "/3/Models/StackedEnsemble_AllModels_1_AutoML_1_20241203_144731"
## 
## 
## $model_checksum
## [1] "-4233719964055368032"
## 
## $frame
## $frame$name
## [1] "test_tbl_sid_ae6d_3"
## 
## 
## $frame_checksum
## [1] "-741132111419434106"
## 
## $description
## NULL
## 
## $scoring_time
## [1] 1.733261e+12
## 
## $predictions
## NULL
## 
## $MSE
## [1] 0.2957494
## 
## $RMSE
## [1] 0.5438285
## 
## $nobs
## [1] 2812
## 
## $custom_metric_name
## NULL
## 
## $custom_metric_value
## [1] 0
## 
## $r2
## [1] 0.8990639
## 
## $hit_ratio_table
## Top-6 Hit Ratios: 
##   k hit_ratio
## 1 1  0.636558
## 2 2  0.883001
## 3 3  0.964794
## 4 4  0.997511
## 5 5  0.999289
## 6 6  1.000000
## 
## $cm
## $cm$`__meta`
## $cm$`__meta`$schema_version
## [1] 3
## 
## $cm$`__meta`$schema_name
## [1] "ConfusionMatrixV3"
## 
## $cm$`__meta`$schema_type
## [1] "ConfusionMatrix"
## 
## 
## $cm$table
## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
##               Connecticut Maine Massachusetts New Hampshire Rhode Island
## Connecticut           361     6            68             1           17
## Maine                   9   172             1            81          132
## Massachusetts          49     0           429             1            6
## New Hampshire           0    66             2           191           42
## Rhode Island           29   131             5            40          276
## Vermont                 1    21             0            92           22
## Totals                449   396           505           406          495
##               Vermont  Error            Rate
## Connecticut         0 0.2031 =      92 / 453
## Maine              38 0.6028 =     261 / 433
## Massachusetts       0 0.1155 =      56 / 485
## New Hampshire     152 0.5784 =     262 / 453
## Rhode Island       10 0.4379 =     215 / 491
## Vermont           361 0.2736 =     136 / 497
## Totals            561 0.3634 = 1,022 / 2,812
## 
## 
## $logloss
## [1] 0.852542
## 
## $mean_per_class_error
## [1] 0.368536
## 
## $AUC
## [1] "NaN"
## 
## $pr_auc
## [1] "NaN"
## 
## $multinomial_auc_table
## NULL
## 
## $multinomial_aucpr_table
## NULL
## 
## $residual_deviance
## [1] 4794.696
## 
## $null_deviance
## [1] 10082.92
## 
## $AIC
## [1] "NaN"
## 
## $loglikelihood
## [1] 0
## 
## $null_degrees_of_freedom
## [1] 2811
## 
## $residual_degrees_of_freedom
## [1] 2557
# AUC from the test-set performance object, so this evaluation uses the same
# data as the confusion matrix that follows (passing the model itself would
# report the model's own stored metrics rather than the test-set ones).
# For this multinomial model the stored AUC is "NaN".
h2o.auc(performance_h2o)
## [1] "NaN"
# Confusion matrix of actual vs. predicted state on the test set.
performance_h2o %>% h2o.confusionMatrix()
## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
##               Connecticut Maine Massachusetts New Hampshire Rhode Island
## Connecticut           361     6            68             1           17
## Maine                   9   172             1            81          132
## Massachusetts          49     0           429             1            6
## New Hampshire           0    66             2           191           42
## Rhode Island           29   131             5            40          276
## Vermont                 1    21             0            92           22
## Totals                449   396           505           406          495
##               Vermont  Error            Rate
## Connecticut         0 0.2031 =      92 / 453
## Maine              38 0.6028 =     261 / 433
## Massachusetts       0 0.1155 =      56 / 485
## New Hampshire     152 0.5784 =     262 / 453
## Rhode Island       10 0.4379 =     215 / 491
## Vermont           361 0.2736 =     136 / 497
## Totals            561 0.3634 = 1,022 / 2,812