# for Core packages
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

# for financial analysis
library(tidyquant)

## Loading required package: PerformanceAnalytics
## Loading required package: xts
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## 
## Attaching package: 'xts'
## 
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## 
## 
## Attaching package: 'PerformanceAnalytics'
## 
## The following object is masked from 'package:graphics':
## 
##     legend
## 
## Loading required package: quantmod
## Loading required package: TTR
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

# for times series
library(timetk)

Goal: Apply Matt Dancho’s tutorial to state unemployment initial claims of New England states.

The following is the replication of Matt Dancho’s tutorial on this page

start_date <- "1989-01-01"

symbols_txt <- c("CTICLAIMS", # Connecticut
                 "MEICLAIMS", # Maine
                 "MAICLAIMS", # Massachusetts
                 "NHICLAIMS", # New Hampshire
                 "RIICLAIMS", # Rhode Island
                 "VTICLAIMS") # Vermont

claims_tbl <- tq_get(symbols_txt, get = "economic.data", from = start_date) %>%
    mutate(symbol = fct_recode(symbol,
                               "Connecticut"   = "CTICLAIMS",
                               "Maine"         = "MEICLAIMS",
                               "Massachusetts" = "MAICLAIMS",
                               "New Hampshire" = "NHICLAIMS",
                               "Rhode Island"  = "RIICLAIMS",
                               "Vermont"       = "VTICLAIMS")) %>%
    rename(claims = price)

Plotting time series

claims_tbl

## # A tibble: 11,244 × 3
##    symbol      date       claims
##    <fct>       <date>      <int>
##  1 Connecticut 1989-01-07   8345
##  2 Connecticut 1989-01-14   6503
##  3 Connecticut 1989-01-21   3821
##  4 Connecticut 1989-01-28   4663
##  5 Connecticut 1989-02-04   4162
##  6 Connecticut 1989-02-11   4337
##  7 Connecticut 1989-02-18   4079
##  8 Connecticut 1989-02-25   3556
##  9 Connecticut 1989-03-04   3826
## 10 Connecticut 1989-03-11   3515
## # ℹ 11,234 more rows

claims_tbl %>%
    plot_time_series(.date_var = date, .value = claims)

claims_tbl %>% count(symbol)

## # A tibble: 6 × 2
##   symbol            n
##   <fct>         <int>
## 1 Connecticut    1874
## 2 Massachusetts  1874
## 3 Maine          1874
## 4 New Hampshire  1874
## 5 Rhode Island   1874
## 6 Vermont        1874

claims_tbl %>%
    group_by(symbol) %>%
    plot_time_series(
        .date_var     = date, 
        .value        = claims,
        .facet_ncol   = 2,
        .facet_scales = "free",
        .interactive  = FALSE)

Box plots

claims_tbl %>% count(symbol)

## # A tibble: 6 × 2
##   symbol            n
##   <fct>         <int>
## 1 Connecticut    1874
## 2 Massachusetts  1874
## 3 Maine          1874
## 4 New Hampshire  1874
## 5 Rhode Island   1874
## 6 Vermont        1874

claims_tbl %>%
    filter_by_time(.date_var = date, .end_date = "1995") %>%
    group_by(symbol) %>%
    plot_time_series_boxplot(.date_var   = date, 
                             .value      = log(claims), 
                             .period     = "1 year", 
                             .facet_ncol = 2)

Regression plots

claims_tbl %>%
    group_by(symbol) %>%
    plot_time_series_regression(
        .date_var = date, 
        .facet_ncol = 3, 
        .formula = log(claims) ~ as.numeric(date) + month(date, label = TRUE), 
        .show_summary = FALSE)

Plotting Seasonality and Correlation

Correlation Plots

claims_tbl %>%
    group_by(symbol) %>%
    plot_acf_diagnostics(date, claims, 
                         .lags = "1 years")

Seasonality

claims_tbl %>%
    plot_seasonal_diagnostics(date, log(claims))

claims_tbl %>% count(symbol)

## # A tibble: 6 × 2
##   symbol            n
##   <fct>         <int>
## 1 Connecticut    1874
## 2 Massachusetts  1874
## 3 Maine          1874
## 4 New Hampshire  1874
## 5 Rhode Island   1874
## 6 Vermont        1874

claims_tbl %>%
    group_by(symbol) %>%
    plot_seasonal_diagnostics(date, log(claims))

STL Diagnostics

claims_tbl %>%
    group_by(symbol) %>%
    plot_stl_diagnostics(
        date, claims, 
        .feature_set = c("observed", "season", "trend", "remainder"))

## frequency = 13 observations per 1 quarter

## trend = 53 observations per 1 year

## frequency = 13 observations per 1 quarter

## trend = 53 observations per 1 year

## frequency = 13 observations per 1 quarter

## trend = 53 observations per 1 year

## frequency = 13 observations per 1 quarter

## trend = 53 observations per 1 year

## frequency = 13 observations per 1 quarter

## trend = 53 observations per 1 year

## frequency = 13 observations per 1 quarter

## trend = 53 observations per 1 year

Time Series Data Wrangling

Summarize by Time

claims_tbl %>%
    group_by(symbol) %>%
    plot_time_series(date, log(claims), .facet_ncol = 2, .interactive = FALSE)

claims_tbl %>%
    group_by(symbol) %>%
    summarise_by_time(.date_var = date, claims = mean(log(claims)), .by = "year") %>%
    plot_time_series(date, claims, .facet_ncol = 2, .interactive = FALSE)

Filter By Time

claims_tbl %>%
    group_by(symbol) %>%
    filter_by_time(.date_var = date, 
                   .start_date = "2022-01", 
                   .end_date = "2022") %>%
    plot_time_series(date, claims, .facet_ncol = 2)

Padding Data

claims_tbl %>%
    group_by(symbol) %>%
    pad_by_time(date, .by = "week", .pad_value = 0)

## # A tibble: 11,244 × 3
## # Groups:   symbol [6]
##    symbol      date       claims
##    <fct>       <date>      <int>
##  1 Connecticut 1989-01-07   8345
##  2 Connecticut 1989-01-14   6503
##  3 Connecticut 1989-01-21   3821
##  4 Connecticut 1989-01-28   4663
##  5 Connecticut 1989-02-04   4162
##  6 Connecticut 1989-02-11   4337
##  7 Connecticut 1989-02-18   4079
##  8 Connecticut 1989-02-25   3556
##  9 Connecticut 1989-03-04   3826
## 10 Connecticut 1989-03-11   3515
## # ℹ 11,234 more rows

Sliding (Rolling) Calculations

claims_tbl %>%
    head(10) %>%
    mutate(rolling_avg_2 = slidify_vec(log(claims), mean, 
                                       .period = 2, 
                                       .align = "right", 
                                       .partial = TRUE))

## # A tibble: 10 × 4
##    symbol      date       claims rolling_avg_2
##    <fct>       <date>      <int>         <dbl>
##  1 Connecticut 1989-01-07   8345          9.03
##  2 Connecticut 1989-01-14   6503          8.90
##  3 Connecticut 1989-01-21   3821          8.51
##  4 Connecticut 1989-01-28   4663          8.35
##  5 Connecticut 1989-02-04   4162          8.39
##  6 Connecticut 1989-02-11   4337          8.35
##  7 Connecticut 1989-02-18   4079          8.34
##  8 Connecticut 1989-02-25   3556          8.24
##  9 Connecticut 1989-03-04   3826          8.21
## 10 Connecticut 1989-03-11   3515          8.21

library(h2o)

## 
## ----------------------------------------------------------------------
## 
## Your next step is to start H2O:
##     > h2o.init()
## 
## For H2O package documentation, ask for help:
##     > ??h2o
## 
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
## 
## ----------------------------------------------------------------------

## 
## Attaching package: 'h2o'

## The following objects are masked from 'package:lubridate':
## 
##     day, hour, month, week, year

## The following objects are masked from 'package:stats':
## 
##     cor, sd, var

## The following objects are masked from 'package:base':
## 
##     &&, %*%, %in%, ||, apply, as.factor, as.numeric, colnames,
##     colnames<-, ifelse, is.character, is.factor, is.numeric, log,
##     log10, log1p, log2, round, signif, trunc

library(tidymodels)

## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──

## ✔ broom        1.0.6     ✔ rsample      1.2.1
## ✔ dials        1.3.0     ✔ tune         1.2.1
## ✔ infer        1.0.7     ✔ workflows    1.1.4
## ✔ modeldata    1.4.0     ✔ workflowsets 1.1.0
## ✔ parsnip      1.2.1     ✔ yardstick    1.3.1
## ✔ recipes      1.1.0

## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ xts::first()      masks dplyr::first()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ xts::last()       masks dplyr::last()
## ✖ dials::momentum() masks TTR::momentum()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Use suppressPackageStartupMessages() to eliminate package startup messages

claims_tbl %>%
    
    # h2o requires all variables to be either numeric or factors
    mutate(across(where(is.character), factor))

## # A tibble: 11,244 × 3
##    symbol      date       claims
##    <fct>       <date>      <int>
##  1 Connecticut 1989-01-07   8345
##  2 Connecticut 1989-01-14   6503
##  3 Connecticut 1989-01-21   3821
##  4 Connecticut 1989-01-28   4663
##  5 Connecticut 1989-02-04   4162
##  6 Connecticut 1989-02-11   4337
##  7 Connecticut 1989-02-18   4079
##  8 Connecticut 1989-02-25   3556
##  9 Connecticut 1989-03-04   3826
## 10 Connecticut 1989-03-11   3515
## # ℹ 11,234 more rows

Split Data

set.seed(1234)

data_split <- initial_split(claims_tbl, strata = "claims")
train_tbl <- training(data_split)
test_tbl <- testing(data_split)

Recipes

recipe_obj <- recipe(claims ~ ., data = train_tbl) %>%
    
    # Remove zero variance variables
    step_zv(all_predictors())

Model

# Initialize h2o
h2o.init()

##  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         17 hours 4 minutes 
##     H2O cluster timezone:       America/New_York 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.44.0.3 
##     H2O cluster version age:    11 months and 21 days 
##     H2O cluster name:           H2O_started_from_R_alexnelson_hps198 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   1.46 GB 
##     H2O cluster total cores:    8 
##     H2O cluster allowed cores:  8 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     R Version:                  R version 4.2.1 (2022-06-23)

## Warning in h2o.clusterInfo(): 
## Your H2O cluster version is (11 months and 21 days) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html

split.h20 <- h2o.splitFrame(as.h2o(train_tbl), ratios = c(0.85), seed = 5639)

## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%

train_h2o <- split.h20[[1]]
valid_h2o <- split.h20[[2]]
test_h2o  <- as.h2o(test_tbl)

## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%

y <- "symbol"
x <- setdiff(names(train_tbl), y)

models_h2o <- h2o.automl(
    x = x,
    y = y, 
    training_frame    = train_h2o,
    validation_frame  = valid_h2o, 
    leaderboard_frame = test_h2o, 
    max_runtime_secs  = 30, 
    max_models        = 10, 
    exclude_algos     = "DeepLearning",
    nfolds            = 5, 
    seed              = 3456   
)

## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |==                                                                    |   3%
## 12:18:28.224: User specified a validation frame with cross-validation still enabled. Please note that the models will still be validated using cross-validation only, the validation frame will be used to provide purely informative validation metrics on the trained models.
  |                                                                            
  |=======                                                               |  11%
  |                                                                            
  |============                                                          |  17%
  |                                                                            
  |=================                                                     |  24%
  |                                                                            
  |======================                                                |  31%
  |                                                                            
  |===========================                                           |  38%
  |                                                                            
  |===============================                                       |  45%
  |                                                                            
  |====================================                                  |  52%
  |                                                                            
  |=========================================                             |  59%
  |                                                                            
  |==============================================                        |  66%
  |                                                                            
  |===================================================                   |  73%
  |                                                                            
  |========================================================              |  79%
  |                                                                            
  |============================================================          |  86%
  |                                                                            
  |=================================================================     |  93%
  |                                                                            
  |======================================================================| 100%

models_h2o %>% typeof()

## [1] "S4"

models_h2o %>% slotNames()

## [1] "project_name"   "leader"         "leaderboard"    "event_log"     
## [5] "modeling_steps" "training_info"

models_h2o@leaderboard

##                             model_id mean_per_class_error  logloss      rmse
## 1 XGBoost_1_AutoML_6_20241212_121828            0.5438247 1.161853 0.6476953
##         mse
## 1 0.4195092
## 
## [1 row x 5 columns]

models_h2o@leader

## Model Details:
## ==============
## 
## H2OMultinomialModel: xgboost
## Model ID:  XGBoost_1_AutoML_6_20241212_121828 
## Model Summary: 
##   number_of_trees
## 1              71
## 
## 
## H2OMultinomialMetrics: xgboost
## ** Reported on training data. **
## 
## Training Set Metrics: 
## =====================
## 
## Extract training frame with `h2o.getFrame("AutoML_6_20241212_121828_training_RTMP_sid_a059_5")`
## MSE: (Extract with `h2o.mse`) 0.2778627
## RMSE: (Extract with `h2o.rmse`) 0.5271268
## Logloss: (Extract with `h2o.logloss`) 0.7652044
## Mean Per-Class Error: 0.2698747
## AUC: (Extract with `h2o.auc`) NaN
## AUCPR: (Extract with `h2o.aucpr`) NaN
## R^2: (Extract with `h2o.r2`) 0.9060138
## Confusion Matrix: Extract with `h2o.confusionMatrix(<model>,train = TRUE)`)
## =========================================================================
## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
##               Connecticut Maine Massachusetts New Hampshire Rhode Island
## Connecticut          1018     4           179             0           28
## Maine                  61   770            10           103          222
## Massachusetts          59     1          1090             0           10
## New Hampshire          19   127             5           663           85
## Rhode Island          123   206            20            53          764
## Vermont                 8    62             0           156           40
## Totals               1288  1170          1304           975         1149
##               Vermont  Error            Rate
## Connecticut         0 0.1717 =   211 / 1,229
## Maine              63 0.3735 =   459 / 1,229
## Massachusetts       0 0.0603 =    70 / 1,160
## New Hampshire     276 0.4357 =   512 / 1,175
## Rhode Island       24 0.3580 =   426 / 1,190
## Vermont           943 0.2200 =   266 / 1,209
## Totals           1306 0.2703 = 1,944 / 7,192
## 
## Hit Ratio Table: Extract with `h2o.hit_ratio_table(<model>,train = TRUE)`
## =======================================================================
## Top-6 Hit Ratios: 
##   k hit_ratio
## 1 1  0.729700
## 2 2  0.924639
## 3 3  0.979978
## 4 4  0.994855
## 5 5  0.999027
## 6 6  1.000000
## 
## 
## 
## 
## H2OMultinomialMetrics: xgboost
## ** Reported on validation data. **
## 
## Validation Set Metrics: 
## =====================
## 
## Extract validation frame with `h2o.getFrame("RTMP_sid_a059_7")`
## MSE: (Extract with `h2o.mse`) 0.4182965
## RMSE: (Extract with `h2o.rmse`) 0.6467584
## Logloss: (Extract with `h2o.logloss`) 1.149908
## Mean Per-Class Error: 0.5700614
## AUC: (Extract with `h2o.auc`) NaN
## AUCPR: (Extract with `h2o.aucpr`) NaN
## R^2: (Extract with `h2o.r2`) 0.8491219
## Confusion Matrix: Extract with `h2o.confusionMatrix(<model>,valid = TRUE)`)
## =========================================================================
## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
##               Connecticut Maine Massachusetts New Hampshire Rhode Island
## Connecticut           126     2            44             0           18
## Maine                  20    51             3            37           77
## Massachusetts          48     1           178             0            6
## New Hampshire           4    38             0            40           26
## Rhode Island           27    96             3            19           43
## Vermont                 2    12             0            72           12
## Totals                227   200           228           168          182
##               Vermont  Error          Rate
## Connecticut         0 0.3368 =    64 / 190
## Maine              22 0.7571 =   159 / 210
## Massachusetts       0 0.2361 =    55 / 233
## New Hampshire     109 0.8157 =   177 / 217
## Rhode Island        5 0.7772 =   150 / 193
## Vermont            99 0.4975 =    98 / 197
## Totals            235 0.5669 = 703 / 1,240
## 
## Hit Ratio Table: Extract with `h2o.hit_ratio_table(<model>,valid = TRUE)`
## =======================================================================
## Top-6 Hit Ratios: 
##   k hit_ratio
## 1 1  0.433065
## 2 2  0.774193
## 3 3  0.906452
## 4 4  0.987903
## 5 5  0.996774
## 6 6  1.000000
## 
## 
## 
## 
## H2OMultinomialMetrics: xgboost
## ** Reported on cross-validation data. **
## ** 5-fold cross-validation on training data (Metrics computed for combined holdout predictions) **
## 
## Cross-Validation Set Metrics: 
## =====================
## 
## Extract cross-validation frame with `h2o.getFrame("AutoML_6_20241212_121828_training_RTMP_sid_a059_5")`
## MSE: (Extract with `h2o.mse`) 0.3951766
## RMSE: (Extract with `h2o.rmse`) 0.6286307
## Logloss: (Extract with `h2o.logloss`) 1.090599
## Mean Per-Class Error: 0.498862
## AUC: (Extract with `h2o.auc`) NaN
## AUCPR: (Extract with `h2o.aucpr`) NaN
## R^2: (Extract with `h2o.r2`) 0.8663328
## Hit Ratio Table: Extract with `h2o.hit_ratio_table(<model>,xval = TRUE)`
## =======================================================================
## Top-6 Hit Ratios: 
##   k hit_ratio
## 1 1  0.500417
## 2 2  0.794077
## 3 3  0.919216
## 4 4  0.990823
## 5 5  0.997219
## 6 6  1.000000
## 
## 
## 
## 
## Cross-Validation Metrics Summary: 
##                               mean        sd cv_1_valid cv_2_valid cv_3_valid
## accuracy                  0.500415  0.012931   0.512856   0.506602   0.498609
## auc                             NA  0.000000         NA         NA         NA
## err                       0.499585  0.012931   0.487144   0.493398   0.501391
## err_count               718.600000 18.420097 701.000000 710.000000 721.000000
## logloss                   1.090601  0.011888   1.083716   1.084928   1.083350
## max_per_class_error       0.748112  0.021781   0.760684   0.727660   0.744681
## mean_per_class_accuracy   0.501129  0.013093   0.513660   0.507247   0.499673
## mean_per_class_error      0.498871  0.013093   0.486340   0.492753   0.500327
## mse                       0.395178  0.005633   0.391113   0.391439   0.394302
## pr_auc                          NA  0.000000         NA         NA         NA
## r2                        0.866332  0.001933   0.867991   0.867637   0.866435
## rmse                      0.628619  0.004463   0.625390   0.625651   0.627935
##                         cv_4_valid cv_5_valid
## accuracy                  0.504868   0.479138
## auc                             NA         NA
## err                       0.495132   0.520862
## err_count               712.000000 749.000000
## logloss                   1.089619   1.111390
## max_per_class_error       0.728814   0.778723
## mean_per_class_accuracy   0.505597   0.479469
## mean_per_class_error      0.494403   0.520531
## mse                       0.394132   0.404902
## pr_auc                          NA         NA
## r2                        0.866496   0.863100
## rmse                      0.627799   0.636319

Apply 11

Alex Nelson

2024-12-12

Plotting time series

Box plots

Regression plots

Plotting Seasonality and Correlation

Correlation Plots

Seasonality

STL Diagnostics

Time Series Data Wrangling

Summarize by Time

Filter By Time

Padding Data

Sliding (Rolling) Calculations

Split Data

Recipes

Model