# for Core packages
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

# for financial analysis
library(tidyquant)

## Loading required package: PerformanceAnalytics
## Loading required package: xts
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## 
## ######################### Warning from 'xts' package ##########################
## #                                                                             #
## # The dplyr lag() function breaks how base R's lag() function is supposed to  #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or       #
## # source() into this session won't work correctly.                            #
## #                                                                             #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop           #
## # dplyr from breaking base R's lag() function.                                #
## #                                                                             #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning.  #
## #                                                                             #
## ###############################################################################
## 
## Attaching package: 'xts'
## 
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## 
## 
## Attaching package: 'PerformanceAnalytics'
## 
## The following object is masked from 'package:graphics':
## 
##     legend
## 
## Loading required package: quantmod
## Loading required package: TTR
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

# for times series
library(timetk)

Goal: Apply Matt Dancho’s tutorial to state unemployment initial claims of New England states.

The following is the replication of Matt Dancho’s tutorial on this page

start_date <- "1989-01-01"

symbols_txt <- c("CTICLAIMS", # Connecticut
                 "MEICLAIMS", # Maine
                 "MAICLAIMS", # Massachusetts
                 "NHICLAIMS", # New Hampshire
                 "RIICLAIMS", # Rhode Island
                 "VTICLAIMS") # Vermont

claims_tbl <- tq_get(symbols_txt, get = "economic.data", from = start_date) %>%
    mutate(symbol = fct_recode(symbol,
                               "Connecticut"   = "CTICLAIMS",
                               "Maine"         = "MEICLAIMS",
                               "Massachusetts" = "MAICLAIMS",
                               "New Hampshire" = "NHICLAIMS",
                               "Rhode Island"  = "RIICLAIMS",
                               "Vermont"       = "VTICLAIMS")) %>%
    rename(claims = price)

Plotting time series

claims_tbl

## # A tibble: 11,238 × 3
##    symbol      date       claims
##    <fct>       <date>      <int>
##  1 Connecticut 1989-01-07   8345
##  2 Connecticut 1989-01-14   6503
##  3 Connecticut 1989-01-21   3821
##  4 Connecticut 1989-01-28   4663
##  5 Connecticut 1989-02-04   4162
##  6 Connecticut 1989-02-11   4337
##  7 Connecticut 1989-02-18   4079
##  8 Connecticut 1989-02-25   3556
##  9 Connecticut 1989-03-04   3826
## 10 Connecticut 1989-03-11   3515
## # ℹ 11,228 more rows

claims_tbl %>%
    plot_time_series(.date_var = date, .value = claims)

claims_tbl %>% count(symbol)

## # A tibble: 6 × 2
##   symbol            n
##   <fct>         <int>
## 1 Connecticut    1873
## 2 Massachusetts  1873
## 3 Maine          1873
## 4 New Hampshire  1873
## 5 Rhode Island   1873
## 6 Vermont        1873

claims_tbl %>%
    group_by(symbol) %>%
    plot_time_series(
        .date_var     = date, 
        .value        = claims,
        .facet_ncol   = 2,
        .facet_scales = "free_x",
        .interactive  = FALSE)

Box plots

claims_tbl %>%
    filter_by_time(.date_var = date) %>%
    group_by(symbol) %>%
    plot_time_series_boxplot(
         date,claims,
        .period      = "1 year",
        .facet_ncol  = 2)

Regression plots

claims_tbl %>%
    group_by(symbol) %>%
    plot_time_series_regression(
        .date_var     = date,
        .formula      = log(claims) ~ as.numeric(date) + month(date, label = TRUE),
        .facet_ncol   = 2,
        .show_summary = FALSE
    )

Plotting Seasonality and Correlation

Correlation Plots

claims_tbl %>%
    group_by(symbol) %>%
    plot_acf_diagnostics(
        date, claims,
        .lags = "7 days")

claims_tbl %>%
    group_by(symbol) %>%
    plot_acf_diagnostics(
        Date, claims, 
        .ccf_vars    = c(date, claims))

Seasonality

claims_tbl %>%
    group_by(symbol) %>%
    plot_seasonal_diagnostics(date, claims)

STL Diagnostics

claims_tbl %>%
    group_by(symbol) %>%
    plot_stl_diagnostics(
        date, claims,
        .feature_set = c("observed", "trend", "remainder"))

## frequency = 13 observations per 1 quarter

## trend = 53 observations per 1 year

## frequency = 13 observations per 1 quarter

## trend = 53 observations per 1 year

## frequency = 13 observations per 1 quarter

## trend = 53 observations per 1 year

## frequency = 13 observations per 1 quarter

## trend = 53 observations per 1 year

## frequency = 13 observations per 1 quarter

## trend = 53 observations per 1 year

## frequency = 13 observations per 1 quarter

## trend = 53 observations per 1 year

Time Series Data Wrangling

Summarize by Time

claims_tbl %>%
  group_by(symbol) %>%
  plot_time_series(date, claims, .facet_ncol = 2, .interactive = FALSE)

Summarize by quarter

claims_tbl %>%
  group_by(symbol) %>%
  summarise_by_time(.date_var = date, volume = sum(claims), .by = "quarter") %>%
  plot_time_series(date, volume, .facet_ncol = 2, .interactive = FALSE)

claims_tbl %>%
  group_by(symbol) %>%
  summarise_by_time(.date_var = date, adjusted = mean(claims), .by = "month") %>%
  plot_time_series(date, adjusted, .facet_ncol = 2, .interactive = FALSE)

Filter By Time

claims_tbl %>%
  group_by(symbol) %>%
  filter_by_time(.date_var   = date, 
                 .start_date = "2013",
                 .end_date   = "2015") %>%
  plot_time_series(date, claims, .facet_ncol = 2)

Padding Data

claims_tbl %>%
  group_by(symbol) %>%
  pad_by_time(date, .by = "auto", .pad_value = 0)

## pad applied on the interval: week

## # A tibble: 11,238 × 3
## # Groups:   symbol [6]
##    symbol      date       claims
##    <fct>       <date>      <int>
##  1 Connecticut 1989-01-07   8345
##  2 Connecticut 1989-01-14   6503
##  3 Connecticut 1989-01-21   3821
##  4 Connecticut 1989-01-28   4663
##  5 Connecticut 1989-02-04   4162
##  6 Connecticut 1989-02-11   4337
##  7 Connecticut 1989-02-18   4079
##  8 Connecticut 1989-02-25   3556
##  9 Connecticut 1989-03-04   3826
## 10 Connecticut 1989-03-11   3515
## # ℹ 11,228 more rows

Sliding (Rolling) Calculations

claims_tbl %>%
    head(10) %>%
    mutate(rolling_avg_2 = slidify_vec(claims, mean, 
                                       .period  = 2,
                                       .align   = "left",
                                       .partial = TRUE))

## # A tibble: 10 × 4
##    symbol      date       claims rolling_avg_2
##    <fct>       <date>      <int>         <dbl>
##  1 Connecticut 1989-01-07   8345         7424 
##  2 Connecticut 1989-01-14   6503         5162 
##  3 Connecticut 1989-01-21   3821         4242 
##  4 Connecticut 1989-01-28   4663         4412.
##  5 Connecticut 1989-02-04   4162         4250.
##  6 Connecticut 1989-02-11   4337         4208 
##  7 Connecticut 1989-02-18   4079         3818.
##  8 Connecticut 1989-02-25   3556         3691 
##  9 Connecticut 1989-03-04   3826         3670.
## 10 Connecticut 1989-03-11   3515         3515

# Make the rolling function
roll_avg_30 <- slidify(.f = mean, .period = 30, .align = "center", .partial = TRUE)

# Apply the rolling function
claims_tbl %>%
  select(symbol, date, claims) %>%
  group_by(symbol) %>%
    
  # Apply Sliding Function
  mutate(rolling_avg_30 = roll_avg_30(claims)) %>%
  tidyr::pivot_longer(cols = c(claims, rolling_avg_30)) %>%
  plot_time_series(date, value, .color_var = name,
                   .facet_ncol = 2, .smooth = FALSE)

Apply 11

Build a Classification Model using H2O

Set up

Import data

Import the cleaned data from Module 7.

library(h2o)

## 
## ----------------------------------------------------------------------
## 
## Your next step is to start H2O:
##     > h2o.init()
## 
## For H2O package documentation, ask for help:
##     > ??h2o
## 
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
## 
## ----------------------------------------------------------------------

## 
## Attaching package: 'h2o'

## The following objects are masked from 'package:lubridate':
## 
##     day, hour, month, week, year

## The following objects are masked from 'package:stats':
## 
##     cor, sd, var

## The following objects are masked from 'package:base':
## 
##     &&, %*%, %in%, ||, apply, as.factor, as.numeric, colnames,
##     colnames<-, ifelse, is.character, is.factor, is.numeric, log,
##     log10, log1p, log2, round, signif, trunc

library(tidymodels)

## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──

## ✔ broom        1.0.5      ✔ rsample      1.2.1 
## ✔ dials        1.2.1      ✔ tune         1.2.1 
## ✔ infer        1.0.7      ✔ workflows    1.1.4 
## ✔ modeldata    1.4.0      ✔ workflowsets 1.1.0 
## ✔ parsnip      1.2.1      ✔ yardstick    1.3.1 
## ✔ recipes      1.0.10

## Warning: package 'modeldata' was built under R version 4.3.3

## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ xts::first()      masks dplyr::first()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ xts::last()       masks dplyr::last()
## ✖ dials::momentum() masks TTR::momentum()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Use tidymodels_prefer() to resolve common conflicts.

claims_tbl %>%
    
    # h2o requires all variables to be either numeric or factors
    mutate(across(where(is.character), factor))

## # A tibble: 11,238 × 3
##    symbol      date       claims
##    <fct>       <date>      <int>
##  1 Connecticut 1989-01-07   8345
##  2 Connecticut 1989-01-14   6503
##  3 Connecticut 1989-01-21   3821
##  4 Connecticut 1989-01-28   4663
##  5 Connecticut 1989-02-04   4162
##  6 Connecticut 1989-02-11   4337
##  7 Connecticut 1989-02-18   4079
##  8 Connecticut 1989-02-25   3556
##  9 Connecticut 1989-03-04   3826
## 10 Connecticut 1989-03-11   3515
## # ℹ 11,228 more rows

Split data

set.seed(1234)

data_split <- initial_split(claims_tbl, strata = "claims")
train_tbl <- training(data_split)
test_tbl <- testing(data_split)

Recipes

recipe_obj <- recipe(claims ~ ., data = train_tbl) %>%
    
    # Remove zero variance variables
    step_zv(all_predictors())

Model

# Initialize h2o
h2o.init()

##  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         1 hours 37 minutes 
##     H2O cluster timezone:       America/New_York 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.44.0.3 
##     H2O cluster version age:    11 months and 12 days 
##     H2O cluster name:           H2O_started_from_R_johnnymckinnon_nqx583 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   2.76 GB 
##     H2O cluster total cores:    12 
##     H2O cluster allowed cores:  12 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     R Version:                  R version 4.3.2 (2023-10-31)

## Warning in h2o.clusterInfo(): 
## Your H2O cluster version is (11 months and 12 days) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html

split.h20 <- h2o.splitFrame(as.h2o(train_tbl), ratios = c(0.85), seed = 5639)

## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%

train_h2o <- split.h20[[1]]
valid_h2o <- split.h20[[2]]
test_h2o  <- as.h2o(test_tbl)

## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%

y <- "symbol"
x <- setdiff(names(train_tbl), y)

models_h2o <- h2o.automl(
    x = x,
    y = y, 
    training_frame    = train_h2o,
    validation_frame  = valid_h2o, 
    leaderboard_frame = test_h2o, 
    # max_runtime_secs  = 30, 
    max_models        = 10, 
    exclude_algos     = "DeepLearning",
    nfolds            = 5, 
    seed              = 3456   
)

## 
  |                                                                            
  |                                                                      |   0%
## 16:23:21.577: User specified a validation frame with cross-validation still enabled. Please note that the models will still be validated using cross-validation only, the validation frame will be used to provide purely informative validation metrics on the trained models.
  |                                                                            
  |==                                                                    |   3%
  |                                                                            
  |===                                                                   |   4%
  |                                                                            
  |===                                                                   |   5%
  |                                                                            
  |====                                                                  |   5%
  |                                                                            
  |=====                                                                 |   7%
  |                                                                            
  |=====                                                                 |   8%
  |                                                                            
  |=======                                                               |  10%
  |                                                                            
  |=========                                                             |  13%
  |                                                                            
  |===========                                                           |  15%
  |                                                                            
  |============                                                          |  17%
  |                                                                            
  |=============                                                         |  18%
  |                                                                            
  |==============                                                        |  21%
  |                                                                            
  |====================                                                  |  28%
  |                                                                            
  |======================                                                |  31%
  |                                                                            
  |=======================                                               |  33%
  |                                                                            
  |======================================================================| 100%

Examine the output of h2o.automl

models_h2o %>% typeof()

## [1] "S4"

models_h2o %>% slotNames()

## [1] "project_name"   "leader"         "leaderboard"    "event_log"     
## [5] "modeling_steps" "training_info"

models_h2o@leaderboard

##                                                  model_id mean_per_class_error
## 1    StackedEnsemble_AllModels_1_AutoML_9_20241203_162321            0.3685360
## 2 StackedEnsemble_BestOfFamily_1_AutoML_9_20241203_162321            0.4152764
## 3                          XRT_1_AutoML_9_20241203_162321            0.5167572
## 4                          GBM_1_AutoML_9_20241203_162321            0.5177677
## 5                          DRF_1_AutoML_9_20241203_162321            0.5292755
## 6                          GLM_1_AutoML_9_20241203_162321            0.5388845
##    logloss      rmse       mse
## 1 0.852542 0.5438285 0.2957494
## 2 0.911274 0.5664689 0.3208871
## 3 1.319180 0.7219818 0.5212578
## 4 1.068785 0.6276309 0.3939205
## 5 1.340430 0.7281312 0.5301751
## 6 1.244583 0.6699670 0.4488558
## 
## [12 rows x 5 columns]

models_h2o@leader

## Model Details:
## ==============
## 
## H2OMultinomialModel: stackedensemble
## Model ID:  StackedEnsemble_AllModels_1_AutoML_9_20241203_162321 
## Model Summary for Stacked Ensemble: 
##                                     key            value
## 1                     Stacking strategy cross_validation
## 2  Number of base models (used / total)            10/10
## 3      # GBM base models (used / total)              4/4
## 4  # XGBoost base models (used / total)              3/3
## 5      # DRF base models (used / total)              2/2
## 6      # GLM base models (used / total)              1/1
## 7                 Metalearner algorithm              GLM
## 8    Metalearner fold assignment scheme           Random
## 9                    Metalearner nfolds                5
## 10              Metalearner fold_column               NA
## 11   Custom metalearner hyperparameters             None
## 
## 
## H2OMultinomialMetrics: stackedensemble
## ** Reported on training data. **
## 
## Training Set Metrics: 
## =====================
## 
## Extract training frame with `h2o.getFrame("AutoML_9_20241203_162321_training_RTMP_sid_ae6d_5")`
## MSE: (Extract with `h2o.mse`) 0.3098416
## RMSE: (Extract with `h2o.rmse`) 0.5566342
## Logloss: (Extract with `h2o.logloss`) 0.8600982
## Mean Per-Class Error: 0.4415219
## AUC: (Extract with `h2o.auc`) NaN
## AUCPR: (Extract with `h2o.aucpr`) NaN
## Null Deviance: (Extract with `h2o.nulldeviance`) 25755.11
## Residual Deviance: (Extract with `h2o.residual_deviance`) 12364.77
## AIC: (Extract with `h2o.aic`) NaN
## Confusion Matrix: Extract with `h2o.confusionMatrix(<model>,train = TRUE)`)
## =========================================================================
## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
##               Connecticut Maine Massachusetts New Hampshire Rhode Island
## Connecticut          1019     4           179             1           27
## Maine                  19   332             1           246          592
## Massachusetts          85     0          1067             0            7
## New Hampshire           2   157             1           325           67
## Rhode Island          104   535             6            64          455
## Vermont                 3    35             0           291           53
## Totals               1232  1063          1254           927         1201
##               Vermont  Error            Rate
## Connecticut         0 0.1715 =   211 / 1,230
## Maine              40 0.7301 =   898 / 1,230
## Massachusetts       0 0.0794 =    92 / 1,159
## New Hampshire     645 0.7285 =   872 / 1,197
## Rhode Island       24 0.6170 =   733 / 1,188
## Vermont           802 0.3226 =   382 / 1,184
## Totals           1511 0.4435 = 3,188 / 7,188
## 
## Hit Ratio Table: Extract with `h2o.hit_ratio_table(<model>,train = TRUE)`
## =======================================================================
## Top-6 Hit Ratios: 
##   k hit_ratio
## 1 1  0.556483
## 2 2  0.893433
## 3 3  0.966889
## 4 4  0.998052
## 5 5  0.999583
## 6 6  1.000000
## 
## 
## 
## 
## H2OMultinomialMetrics: stackedensemble
## ** Reported on validation data. **
## 
## Validation Set Metrics: 
## =====================
## 
## MSE: (Extract with `h2o.mse`) 0.305676
## RMSE: (Extract with `h2o.rmse`) 0.5528798
## Logloss: (Extract with `h2o.logloss`) 0.8784158
## Mean Per-Class Error: 0.3921174
## AUC: (Extract with `h2o.auc`) NaN
## AUCPR: (Extract with `h2o.aucpr`) NaN
## Null Deviance: (Extract with `h2o.nulldeviance`) 4438.605
## Residual Deviance: (Extract with `h2o.residual_deviance`) 2174.957
## AIC: (Extract with `h2o.aic`) NaN
## Confusion Matrix: Extract with `h2o.confusionMatrix(<model>,valid = TRUE)`)
## =========================================================================
## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
##               Connecticut Maine Massachusetts New Hampshire Rhode Island
## Connecticut           152     0            34             0            4
## Maine                   2    77             1            39           73
## Massachusetts          32     0           193             1            3
## New Hampshire           3    26             0            97           22
## Rhode Island           17    51             2            20          102
## Vermont                 1    10             0            40           11
## Totals                207   164           230           197          215
##               Vermont  Error          Rate
## Connecticut         0 0.2000 =    38 / 190
## Maine              18 0.6333 =   133 / 210
## Massachusetts       0 0.1572 =    36 / 229
## New Hampshire      75 0.5650 =   126 / 223
## Rhode Island        2 0.4742 =    92 / 194
## Vermont           130 0.3229 =    62 / 192
## Totals            225 0.3934 = 487 / 1,238
## 
## Hit Ratio Table: Extract with `h2o.hit_ratio_table(<model>,valid = TRUE)`
## =======================================================================
## Top-6 Hit Ratios: 
##   k hit_ratio
## 1 1  0.606624
## 2 2  0.882068
## 3 3  0.970113
## 4 4  0.997577
## 5 5  0.999192
## 6 6  1.000000
## 
## 
## 
## 
## H2OMultinomialMetrics: stackedensemble
## ** Reported on cross-validation data. **
## ** 5-fold cross-validation on training data (Metrics computed for combined holdout predictions) **
## 
## Cross-Validation Set Metrics: 
## =====================
## 
## Extract cross-validation frame with `h2o.getFrame("levelone_training_StackedEnsemble_AllModels_1_AutoML_9_20241203_162321")`
## MSE: (Extract with `h2o.mse`) 0.2925926
## RMSE: (Extract with `h2o.rmse`) 0.5409183
## Logloss: (Extract with `h2o.logloss`) 0.8423963
## Mean Per-Class Error: 0.3708954
## AUC: (Extract with `h2o.auc`) NaN
## AUCPR: (Extract with `h2o.aucpr`) NaN
## Null Deviance: (Extract with `h2o.nulldeviance`) 25767.24
## Residual Deviance: (Extract with `h2o.residual_deviance`) 12110.29
## AIC: (Extract with `h2o.aic`) NaN
## Hit Ratio Table: Extract with `h2o.hit_ratio_table(<model>,xval = TRUE)`
## =======================================================================
## Top-6 Hit Ratios: 
##   k hit_ratio
## 1 1  0.628130
## 2 2  0.891903
## 3 3  0.965220
## 4 4  0.995409
## 5 5  0.998748
## 6 6  1.000000
## 
## 
## 
## 
## Cross-Validation Metrics Summary: 
##                                mean         sd  cv_1_valid  cv_2_valid
## accuracy                   0.627391   0.013936    0.605498    0.624741
## auc                              NA   0.000000          NA          NA
## err                        0.372609   0.013936    0.394502    0.375259
## err_count                536.000000  36.090164  574.000000  543.000000
## logloss                    0.841912   0.013921    0.859396    0.831485
## max_per_class_error        0.567701   0.039323    0.571970    0.604255
## mean_per_class_accuracy    0.628757   0.011565    0.614289    0.623152
## mean_per_class_error       0.371243   0.011565    0.385711    0.376848
## mse                        0.292296   0.005311    0.301223    0.290064
## null_deviance           5153.448700 214.011960 5216.419400 5188.217000
## pr_auc                           NA   0.000000          NA          NA
## r2                         0.900432   0.002286    0.899640    0.898534
## residual_deviance       2419.913000  85.605640 2500.841800 2406.318000
## rmse                       0.540626   0.004891    0.548838    0.538576
##                          cv_3_valid  cv_4_valid  cv_5_valid
## accuracy                   0.632997    0.630539    0.643178
## auc                              NA          NA          NA
## err                        0.367003    0.369461    0.356822
## err_count                545.000000  542.000000  476.000000
## logloss                    0.832230    0.831742    0.854704
## max_per_class_error        0.590909    0.569170    0.502203
## mean_per_class_accuracy    0.625481    0.638095    0.642768
## mean_per_class_error       0.374519    0.361905    0.357232
## mse                        0.287202    0.292172    0.290817
## null_deviance           5323.343000 5257.700700 4781.564500
## pr_auc                           NA          NA          NA
## r2                         0.903365    0.898304    0.902316
## residual_deviance       2471.723400 2440.331800 2280.350300
## rmse                       0.535912    0.540529    0.539275

Save and Load

?h2o.getModel
?h2o.saveModel
?h2o.loadModel

#h2o.getModel("StackedEnsemble_AllModels_1_AutoML_1_20241203_144731") %>%
    #h2o.saveModel("h2o_models/")

best_model <- h2o.loadModel("h2o_models/StackedEnsemble_AllModels_1_AutoML_1_20241203_144731")

Make predictions

predictions <- h2o.predict(best_model, newdata = test_h2o)

## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%

predictions_tbl <- predictions %>%
    as.tibble()

## Warning: `as.tibble()` was deprecated in tibble 2.0.0.
## ℹ Please use `as_tibble()` instead.
## ℹ The signature and semantics have changed, see `?as_tibble`.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

predictions_tbl %>%
    bind_cols(test_tbl)

## # A tibble: 2,812 × 10
##    predict  Connecticut   Maine Massachusetts New.Hampshire Rhode.Island Vermont
##    <fct>          <dbl>   <dbl>         <dbl>         <dbl>        <dbl>   <dbl>
##  1 Massach…       0.474 8.37e-4      0.520        0.000200       0.00452 8.53e-5
##  2 Connect…       0.866 1.09e-2      0.0682       0.0000946      0.0547  1.47e-4
##  3 Connect…       0.851 1.65e-2      0.00961      0.00201        0.119   1.70e-3
##  4 Connect…       0.600 5.46e-2      0.00121      0.0159         0.323   5.34e-3
##  5 Connect…       0.448 1.66e-1      0.000147     0.0206         0.358   6.93e-3
##  6 Connect…       0.908 1.33e-3      0.0814       0.0000165      0.00884 7.40e-5
##  7 Connect…       0.675 6.43e-2      0.000387     0.00815        0.249   3.37e-3
##  8 Rhode I…       0.408 1.40e-1      0.000327     0.0136         0.433   4.15e-3
##  9 Connect…       0.422 1.63e-1      0.000226     0.0213         0.384   9.56e-3
## 10 Connect…       0.590 1.10e-1      0.00101      0.0166         0.276   6.50e-3
## # ℹ 2,802 more rows
## # ℹ 3 more variables: symbol <fct>, date <date>, claims <int>

Evaluate model

performance_h2o  <- h2o.performance(best_model, newdata = test_h2o)
typeof(performance_h2o)

## [1] "S4"

slotNames(performance_h2o)

## [1] "algorithm" "on_train"  "on_valid"  "on_xval"   "metrics"

performance_h2o@metrics

## $model
## $model$`__meta`
## $model$`__meta`$schema_version
## [1] 3
## 
## $model$`__meta`$schema_name
## [1] "ModelKeyV3"
## 
## $model$`__meta`$schema_type
## [1] "Key<Model>"
## 
## 
## $model$name
## [1] "StackedEnsemble_AllModels_1_AutoML_1_20241203_144731"
## 
## $model$type
## [1] "Key<Model>"
## 
## $model$URL
## [1] "/3/Models/StackedEnsemble_AllModels_1_AutoML_1_20241203_144731"
## 
## 
## $model_checksum
## [1] "-4233719964055368032"
## 
## $frame
## $frame$name
## [1] "test_tbl_sid_ae6d_3"
## 
## 
## $frame_checksum
## [1] "-741132111419434106"
## 
## $description
## NULL
## 
## $scoring_time
## [1] 1.733261e+12
## 
## $predictions
## NULL
## 
## $MSE
## [1] 0.2957494
## 
## $RMSE
## [1] 0.5438285
## 
## $nobs
## [1] 2812
## 
## $custom_metric_name
## NULL
## 
## $custom_metric_value
## [1] 0
## 
## $r2
## [1] 0.8990639
## 
## $hit_ratio_table
## Top-6 Hit Ratios: 
##   k hit_ratio
## 1 1  0.636558
## 2 2  0.883001
## 3 3  0.964794
## 4 4  0.997511
## 5 5  0.999289
## 6 6  1.000000
## 
## $cm
## $cm$`__meta`
## $cm$`__meta`$schema_version
## [1] 3
## 
## $cm$`__meta`$schema_name
## [1] "ConfusionMatrixV3"
## 
## $cm$`__meta`$schema_type
## [1] "ConfusionMatrix"
## 
## 
## $cm$table
## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
##               Connecticut Maine Massachusetts New Hampshire Rhode Island
## Connecticut           361     6            68             1           17
## Maine                   9   172             1            81          132
## Massachusetts          49     0           429             1            6
## New Hampshire           0    66             2           191           42
## Rhode Island           29   131             5            40          276
## Vermont                 1    21             0            92           22
## Totals                449   396           505           406          495
##               Vermont  Error            Rate
## Connecticut         0 0.2031 =      92 / 453
## Maine              38 0.6028 =     261 / 433
## Massachusetts       0 0.1155 =      56 / 485
## New Hampshire     152 0.5784 =     262 / 453
## Rhode Island       10 0.4379 =     215 / 491
## Vermont           361 0.2736 =     136 / 497
## Totals            561 0.3634 = 1,022 / 2,812
## 
## 
## $logloss
## [1] 0.852542
## 
## $mean_per_class_error
## [1] 0.368536
## 
## $AUC
## [1] "NaN"
## 
## $pr_auc
## [1] "NaN"
## 
## $multinomial_auc_table
## NULL
## 
## $multinomial_aucpr_table
## NULL
## 
## $residual_deviance
## [1] 4794.696
## 
## $null_deviance
## [1] 10082.92
## 
## $AIC
## [1] "NaN"
## 
## $loglikelihood
## [1] 0
## 
## $null_degrees_of_freedom
## [1] 2811
## 
## $residual_degrees_of_freedom
## [1] 2557

h2o.auc(best_model)

## [1] "NaN"

h2o.confusionMatrix(performance_h2o)

## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
##               Connecticut Maine Massachusetts New Hampshire Rhode Island
## Connecticut           361     6            68             1           17
## Maine                   9   172             1            81          132
## Massachusetts          49     0           429             1            6
## New Hampshire           0    66             2           191           42
## Rhode Island           29   131             5            40          276
## Vermont                 1    21             0            92           22
## Totals                449   396           505           406          495
##               Vermont  Error            Rate
## Connecticut         0 0.2031 =      92 / 453
## Maine              38 0.6028 =     261 / 433
## Massachusetts       0 0.1155 =      56 / 485
## New Hampshire     152 0.5784 =     262 / 453
## Rhode Island       10 0.4379 =     215 / 491
## Vermont           361 0.2736 =     136 / 497
## Totals            561 0.3634 = 1,022 / 2,812

Apply 11

Johnny McKinnon

2024-12-03

Plotting time series

Box plots

Regression plots

Plotting Seasonality and Correlation

Correlation Plots

Seasonality

STL Diagnostics

Time Series Data Wrangling

Summarize by Time

Filter By Time

Padding Data

Sliding (Rolling) Calculations

Apply 11

Build a Classification Model using H2O

Set up

Import data

Split data

Recipes

Model

Save and Load

Make predictions

Evaluate model