# for Core packages
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# for financial analysis
library(tidyquant)
## Loading required package: PerformanceAnalytics
## Loading required package: xts
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## 
## Attaching package: 'xts'
## 
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## 
## 
## Attaching package: 'PerformanceAnalytics'
## 
## The following object is masked from 'package:graphics':
## 
##     legend
## 
## Loading required package: quantmod
## Loading required package: TTR
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
# for times series
library(timetk)

Goal: Apply Matt Dancho’s tutorial to state unemployment initial claims of New England states.

The following is the replication of Matt Dancho’s tutorial on this page

start_date <- "1989-01-01"

symbols_txt <- c("CTICLAIMS", # Connecticut
                 "MEICLAIMS", # Maine
                 "MAICLAIMS", # Massachusetts
                 "NHICLAIMS", # New Hampshire
                 "RIICLAIMS", # Rhode Island
                 "VTICLAIMS") # Vermont

claims_tbl <- tq_get(symbols_txt, get = "economic.data", from = start_date) %>%
    mutate(symbol = fct_recode(symbol,
                               "Connecticut"   = "CTICLAIMS",
                               "Maine"         = "MEICLAIMS",
                               "Massachusetts" = "MAICLAIMS",
                               "New Hampshire" = "NHICLAIMS",
                               "Rhode Island"  = "RIICLAIMS",
                               "Vermont"       = "VTICLAIMS")) %>%
    rename(claims = price)

Plotting time series

claims_tbl
## # A tibble: 11,244 × 3
##    symbol      date       claims
##    <fct>       <date>      <int>
##  1 Connecticut 1989-01-07   8345
##  2 Connecticut 1989-01-14   6503
##  3 Connecticut 1989-01-21   3821
##  4 Connecticut 1989-01-28   4663
##  5 Connecticut 1989-02-04   4162
##  6 Connecticut 1989-02-11   4337
##  7 Connecticut 1989-02-18   4079
##  8 Connecticut 1989-02-25   3556
##  9 Connecticut 1989-03-04   3826
## 10 Connecticut 1989-03-11   3515
## # ℹ 11,234 more rows
claims_tbl %>%
    plot_time_series(.date_var = date, .value = claims)
claims_tbl %>% count(symbol)
## # A tibble: 6 × 2
##   symbol            n
##   <fct>         <int>
## 1 Connecticut    1874
## 2 Massachusetts  1874
## 3 Maine          1874
## 4 New Hampshire  1874
## 5 Rhode Island   1874
## 6 Vermont        1874
claims_tbl %>%
    group_by(symbol) %>%
    plot_time_series(
        .date_var     = date, 
        .value        = claims,
        .facet_ncol   = 2,
        .facet_scales = "free",
        .interactive  = FALSE)

Box plots

claims_tbl %>% count(symbol)
## # A tibble: 6 × 2
##   symbol            n
##   <fct>         <int>
## 1 Connecticut    1874
## 2 Massachusetts  1874
## 3 Maine          1874
## 4 New Hampshire  1874
## 5 Rhode Island   1874
## 6 Vermont        1874
claims_tbl %>%
    filter_by_time(.date_var = date, .end_date = "1995") %>%
    group_by(symbol) %>%
    plot_time_series_boxplot(.date_var   = date, 
                             .value      = log(claims), 
                             .period     = "1 year", 
                             .facet_ncol = 2)

Regression plots

claims_tbl %>%
    group_by(symbol) %>%
    plot_time_series_regression(
        .date_var = date, 
        .facet_ncol = 3, 
        .formula = log(claims) ~ as.numeric(date) + month(date, label = TRUE), 
        .show_summary = FALSE)

Plotting Seasonality and Correlation

Correlation Plots

claims_tbl %>%
    group_by(symbol) %>%
    plot_acf_diagnostics(date, claims, 
                         .lags = "1 years") 

Seasonality

claims_tbl %>%
    plot_seasonal_diagnostics(date, log(claims))
claims_tbl %>% count(symbol)
## # A tibble: 6 × 2
##   symbol            n
##   <fct>         <int>
## 1 Connecticut    1874
## 2 Massachusetts  1874
## 3 Maine          1874
## 4 New Hampshire  1874
## 5 Rhode Island   1874
## 6 Vermont        1874
claims_tbl %>%
    group_by(symbol) %>%
    plot_seasonal_diagnostics(date, log(claims))

STL Diagnostics

claims_tbl %>%
    group_by(symbol) %>%
    plot_stl_diagnostics(
        date, claims, 
        .feature_set = c("observed", "season", "trend", "remainder"))
## frequency = 13 observations per 1 quarter
## trend = 53 observations per 1 year
## frequency = 13 observations per 1 quarter
## trend = 53 observations per 1 year
## frequency = 13 observations per 1 quarter
## trend = 53 observations per 1 year
## frequency = 13 observations per 1 quarter
## trend = 53 observations per 1 year
## frequency = 13 observations per 1 quarter
## trend = 53 observations per 1 year
## frequency = 13 observations per 1 quarter
## trend = 53 observations per 1 year

Time Series Data Wrangling

Summarize by Time

claims_tbl %>%
    group_by(symbol) %>%
    plot_time_series(date, log(claims), .facet_ncol = 2, .interactive = FALSE)

claims_tbl %>%
    group_by(symbol) %>%
    summarise_by_time(.date_var = date, claims = mean(log(claims)), .by = "year") %>%
    plot_time_series(date, claims, .facet_ncol = 2, .interactive = FALSE)

Filter By Time

claims_tbl %>%
    group_by(symbol) %>%
    filter_by_time(.date_var = date, 
                   .start_date = "2022-01", 
                   .end_date = "2022") %>%
    plot_time_series(date, claims, .facet_ncol = 2)

Padding Data

claims_tbl %>%
    group_by(symbol) %>%
    pad_by_time(date, .by = "week", .pad_value = 0)
## # A tibble: 11,244 × 3
## # Groups:   symbol [6]
##    symbol      date       claims
##    <fct>       <date>      <int>
##  1 Connecticut 1989-01-07   8345
##  2 Connecticut 1989-01-14   6503
##  3 Connecticut 1989-01-21   3821
##  4 Connecticut 1989-01-28   4663
##  5 Connecticut 1989-02-04   4162
##  6 Connecticut 1989-02-11   4337
##  7 Connecticut 1989-02-18   4079
##  8 Connecticut 1989-02-25   3556
##  9 Connecticut 1989-03-04   3826
## 10 Connecticut 1989-03-11   3515
## # ℹ 11,234 more rows

Sliding (Rolling) Calculations

claims_tbl %>%
    head(10) %>%
    mutate(rolling_avg_2 = slidify_vec(log(claims), mean, 
                                       .period = 2, 
                                       .align = "right", 
                                       .partial = TRUE))
## # A tibble: 10 × 4
##    symbol      date       claims rolling_avg_2
##    <fct>       <date>      <int>         <dbl>
##  1 Connecticut 1989-01-07   8345          9.03
##  2 Connecticut 1989-01-14   6503          8.90
##  3 Connecticut 1989-01-21   3821          8.51
##  4 Connecticut 1989-01-28   4663          8.35
##  5 Connecticut 1989-02-04   4162          8.39
##  6 Connecticut 1989-02-11   4337          8.35
##  7 Connecticut 1989-02-18   4079          8.34
##  8 Connecticut 1989-02-25   3556          8.24
##  9 Connecticut 1989-03-04   3826          8.21
## 10 Connecticut 1989-03-11   3515          8.21
library(h2o)
## 
## ----------------------------------------------------------------------
## 
## Your next step is to start H2O:
##     > h2o.init()
## 
## For H2O package documentation, ask for help:
##     > ??h2o
## 
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
## 
## ----------------------------------------------------------------------
## 
## Attaching package: 'h2o'
## The following objects are masked from 'package:lubridate':
## 
##     day, hour, month, week, year
## The following objects are masked from 'package:stats':
## 
##     cor, sd, var
## The following objects are masked from 'package:base':
## 
##     &&, %*%, %in%, ||, apply, as.factor, as.numeric, colnames,
##     colnames<-, ifelse, is.character, is.factor, is.numeric, log,
##     log10, log1p, log2, round, signif, trunc
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom        1.0.6     ✔ rsample      1.2.1
## ✔ dials        1.3.0     ✔ tune         1.2.1
## ✔ infer        1.0.7     ✔ workflows    1.1.4
## ✔ modeldata    1.4.0     ✔ workflowsets 1.1.0
## ✔ parsnip      1.2.1     ✔ yardstick    1.3.1
## ✔ recipes      1.1.0
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ xts::first()      masks dplyr::first()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ xts::last()       masks dplyr::last()
## ✖ dials::momentum() masks TTR::momentum()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Use suppressPackageStartupMessages() to eliminate package startup messages
claims_tbl %>%
    
    # h2o requires all variables to be either numeric or factors
    mutate(across(where(is.character), factor))
## # A tibble: 11,244 × 3
##    symbol      date       claims
##    <fct>       <date>      <int>
##  1 Connecticut 1989-01-07   8345
##  2 Connecticut 1989-01-14   6503
##  3 Connecticut 1989-01-21   3821
##  4 Connecticut 1989-01-28   4663
##  5 Connecticut 1989-02-04   4162
##  6 Connecticut 1989-02-11   4337
##  7 Connecticut 1989-02-18   4079
##  8 Connecticut 1989-02-25   3556
##  9 Connecticut 1989-03-04   3826
## 10 Connecticut 1989-03-11   3515
## # ℹ 11,234 more rows

Split Data

set.seed(1234)

data_split <- initial_split(claims_tbl, strata = "claims")
train_tbl <- training(data_split)
test_tbl <- testing(data_split)

Recipes

recipe_obj <- recipe(claims ~ ., data = train_tbl) %>%
    
    # Remove zero variance variables
    step_zv(all_predictors()) 

Model

# Initialize h2o
h2o.init()
##  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         17 hours 4 minutes 
##     H2O cluster timezone:       America/New_York 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.44.0.3 
##     H2O cluster version age:    11 months and 21 days 
##     H2O cluster name:           H2O_started_from_R_alexnelson_hps198 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   1.46 GB 
##     H2O cluster total cores:    8 
##     H2O cluster allowed cores:  8 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     R Version:                  R version 4.2.1 (2022-06-23)
## Warning in h2o.clusterInfo(): 
## Your H2O cluster version is (11 months and 21 days) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html
split.h20 <- h2o.splitFrame(as.h2o(train_tbl), ratios = c(0.85), seed = 5639)
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
train_h2o <- split.h20[[1]]
valid_h2o <- split.h20[[2]]
test_h2o  <- as.h2o(test_tbl)
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
y <- "symbol"
x <- setdiff(names(train_tbl), y)

models_h2o <- h2o.automl(
    x = x,
    y = y, 
    training_frame    = train_h2o,
    validation_frame  = valid_h2o, 
    leaderboard_frame = test_h2o, 
    max_runtime_secs  = 30, 
    max_models        = 10, 
    exclude_algos     = "DeepLearning",
    nfolds            = 5, 
    seed              = 3456   
)
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |==                                                                    |   3%
## 12:18:28.224: User specified a validation frame with cross-validation still enabled. Please note that the models will still be validated using cross-validation only, the validation frame will be used to provide purely informative validation metrics on the trained models.
  |                                                                            
  |=======                                                               |  11%
  |                                                                            
  |============                                                          |  17%
  |                                                                            
  |=================                                                     |  24%
  |                                                                            
  |======================                                                |  31%
  |                                                                            
  |===========================                                           |  38%
  |                                                                            
  |===============================                                       |  45%
  |                                                                            
  |====================================                                  |  52%
  |                                                                            
  |=========================================                             |  59%
  |                                                                            
  |==============================================                        |  66%
  |                                                                            
  |===================================================                   |  73%
  |                                                                            
  |========================================================              |  79%
  |                                                                            
  |============================================================          |  86%
  |                                                                            
  |=================================================================     |  93%
  |                                                                            
  |======================================================================| 100%
models_h2o %>% typeof()
## [1] "S4"
models_h2o %>% slotNames()
## [1] "project_name"   "leader"         "leaderboard"    "event_log"     
## [5] "modeling_steps" "training_info"
models_h2o@leaderboard
##                             model_id mean_per_class_error  logloss      rmse
## 1 XGBoost_1_AutoML_6_20241212_121828            0.5438247 1.161853 0.6476953
##         mse
## 1 0.4195092
## 
## [1 row x 5 columns]
models_h2o@leader
## Model Details:
## ==============
## 
## H2OMultinomialModel: xgboost
## Model ID:  XGBoost_1_AutoML_6_20241212_121828 
## Model Summary: 
##   number_of_trees
## 1              71
## 
## 
## H2OMultinomialMetrics: xgboost
## ** Reported on training data. **
## 
## Training Set Metrics: 
## =====================
## 
## Extract training frame with `h2o.getFrame("AutoML_6_20241212_121828_training_RTMP_sid_a059_5")`
## MSE: (Extract with `h2o.mse`) 0.2778627
## RMSE: (Extract with `h2o.rmse`) 0.5271268
## Logloss: (Extract with `h2o.logloss`) 0.7652044
## Mean Per-Class Error: 0.2698747
## AUC: (Extract with `h2o.auc`) NaN
## AUCPR: (Extract with `h2o.aucpr`) NaN
## R^2: (Extract with `h2o.r2`) 0.9060138
## Confusion Matrix: Extract with `h2o.confusionMatrix(<model>,train = TRUE)`)
## =========================================================================
## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
##               Connecticut Maine Massachusetts New Hampshire Rhode Island
## Connecticut          1018     4           179             0           28
## Maine                  61   770            10           103          222
## Massachusetts          59     1          1090             0           10
## New Hampshire          19   127             5           663           85
## Rhode Island          123   206            20            53          764
## Vermont                 8    62             0           156           40
## Totals               1288  1170          1304           975         1149
##               Vermont  Error            Rate
## Connecticut         0 0.1717 =   211 / 1,229
## Maine              63 0.3735 =   459 / 1,229
## Massachusetts       0 0.0603 =    70 / 1,160
## New Hampshire     276 0.4357 =   512 / 1,175
## Rhode Island       24 0.3580 =   426 / 1,190
## Vermont           943 0.2200 =   266 / 1,209
## Totals           1306 0.2703 = 1,944 / 7,192
## 
## Hit Ratio Table: Extract with `h2o.hit_ratio_table(<model>,train = TRUE)`
## =======================================================================
## Top-6 Hit Ratios: 
##   k hit_ratio
## 1 1  0.729700
## 2 2  0.924639
## 3 3  0.979978
## 4 4  0.994855
## 5 5  0.999027
## 6 6  1.000000
## 
## 
## 
## 
## H2OMultinomialMetrics: xgboost
## ** Reported on validation data. **
## 
## Validation Set Metrics: 
## =====================
## 
## Extract validation frame with `h2o.getFrame("RTMP_sid_a059_7")`
## MSE: (Extract with `h2o.mse`) 0.4182965
## RMSE: (Extract with `h2o.rmse`) 0.6467584
## Logloss: (Extract with `h2o.logloss`) 1.149908
## Mean Per-Class Error: 0.5700614
## AUC: (Extract with `h2o.auc`) NaN
## AUCPR: (Extract with `h2o.aucpr`) NaN
## R^2: (Extract with `h2o.r2`) 0.8491219
## Confusion Matrix: Extract with `h2o.confusionMatrix(<model>,valid = TRUE)`)
## =========================================================================
## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
##               Connecticut Maine Massachusetts New Hampshire Rhode Island
## Connecticut           126     2            44             0           18
## Maine                  20    51             3            37           77
## Massachusetts          48     1           178             0            6
## New Hampshire           4    38             0            40           26
## Rhode Island           27    96             3            19           43
## Vermont                 2    12             0            72           12
## Totals                227   200           228           168          182
##               Vermont  Error          Rate
## Connecticut         0 0.3368 =    64 / 190
## Maine              22 0.7571 =   159 / 210
## Massachusetts       0 0.2361 =    55 / 233
## New Hampshire     109 0.8157 =   177 / 217
## Rhode Island        5 0.7772 =   150 / 193
## Vermont            99 0.4975 =    98 / 197
## Totals            235 0.5669 = 703 / 1,240
## 
## Hit Ratio Table: Extract with `h2o.hit_ratio_table(<model>,valid = TRUE)`
## =======================================================================
## Top-6 Hit Ratios: 
##   k hit_ratio
## 1 1  0.433065
## 2 2  0.774193
## 3 3  0.906452
## 4 4  0.987903
## 5 5  0.996774
## 6 6  1.000000
## 
## 
## 
## 
## H2OMultinomialMetrics: xgboost
## ** Reported on cross-validation data. **
## ** 5-fold cross-validation on training data (Metrics computed for combined holdout predictions) **
## 
## Cross-Validation Set Metrics: 
## =====================
## 
## Extract cross-validation frame with `h2o.getFrame("AutoML_6_20241212_121828_training_RTMP_sid_a059_5")`
## MSE: (Extract with `h2o.mse`) 0.3951766
## RMSE: (Extract with `h2o.rmse`) 0.6286307
## Logloss: (Extract with `h2o.logloss`) 1.090599
## Mean Per-Class Error: 0.498862
## AUC: (Extract with `h2o.auc`) NaN
## AUCPR: (Extract with `h2o.aucpr`) NaN
## R^2: (Extract with `h2o.r2`) 0.8663328
## Hit Ratio Table: Extract with `h2o.hit_ratio_table(<model>,xval = TRUE)`
## =======================================================================
## Top-6 Hit Ratios: 
##   k hit_ratio
## 1 1  0.500417
## 2 2  0.794077
## 3 3  0.919216
## 4 4  0.990823
## 5 5  0.997219
## 6 6  1.000000
## 
## 
## 
## 
## Cross-Validation Metrics Summary: 
##                               mean        sd cv_1_valid cv_2_valid cv_3_valid
## accuracy                  0.500415  0.012931   0.512856   0.506602   0.498609
## auc                             NA  0.000000         NA         NA         NA
## err                       0.499585  0.012931   0.487144   0.493398   0.501391
## err_count               718.600000 18.420097 701.000000 710.000000 721.000000
## logloss                   1.090601  0.011888   1.083716   1.084928   1.083350
## max_per_class_error       0.748112  0.021781   0.760684   0.727660   0.744681
## mean_per_class_accuracy   0.501129  0.013093   0.513660   0.507247   0.499673
## mean_per_class_error      0.498871  0.013093   0.486340   0.492753   0.500327
## mse                       0.395178  0.005633   0.391113   0.391439   0.394302
## pr_auc                          NA  0.000000         NA         NA         NA
## r2                        0.866332  0.001933   0.867991   0.867637   0.866435
## rmse                      0.628619  0.004463   0.625390   0.625651   0.627935
##                         cv_4_valid cv_5_valid
## accuracy                  0.504868   0.479138
## auc                             NA         NA
## err                       0.495132   0.520862
## err_count               712.000000 749.000000
## logloss                   1.089619   1.111390
## max_per_class_error       0.728814   0.778723
## mean_per_class_accuracy   0.505597   0.479469
## mean_per_class_error      0.494403   0.520531
## mse                       0.394132   0.404902
## pr_auc                          NA         NA
## r2                        0.866496   0.863100
## rmse                      0.627799   0.636319