Apply 11: Classification Model

# for Core packages
library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.4.1

## Warning: package 'purrr' was built under R version 4.4.2

## Warning: package 'dplyr' was built under R version 4.4.1

## Warning: package 'lubridate' was built under R version 4.4.2

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

# for financial analysis
library(tidyquant)

## Loading required package: PerformanceAnalytics
## Loading required package: xts
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## 
## ######################### Warning from 'xts' package ##########################
## #                                                                             #
## # The dplyr lag() function breaks how base R's lag() function is supposed to  #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or       #
## # source() into this session won't work correctly.                            #
## #                                                                             #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop           #
## # dplyr from breaking base R's lag() function.                                #
## #                                                                             #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning.  #
## #                                                                             #
## ###############################################################################
## 
## Attaching package: 'xts'
## 
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## 
## 
## Attaching package: 'PerformanceAnalytics'
## 
## The following object is masked from 'package:graphics':
## 
##     legend
## 
## Loading required package: quantmod
## Loading required package: TTR
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

# for times series
library(timetk)

## Warning: package 'timetk' was built under R version 4.4.2

library(ggcorrplot)

## Warning: package 'ggcorrplot' was built under R version 4.4.2

library(h2o)

## Warning: package 'h2o' was built under R version 4.4.2

## 
## ----------------------------------------------------------------------
## 
## Your next step is to start H2O:
##     > h2o.init()
## 
## For H2O package documentation, ask for help:
##     > ??h2o
## 
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
## 
## ----------------------------------------------------------------------
## 
## 
## Attaching package: 'h2o'
## 
## The following objects are masked from 'package:lubridate':
## 
##     day, hour, month, week, year
## 
## The following objects are masked from 'package:stats':
## 
##     cor, sd, var
## 
## The following objects are masked from 'package:base':
## 
##     %*%, %in%, &&, ||, apply, as.factor, as.numeric, colnames,
##     colnames<-, ifelse, is.character, is.factor, is.numeric, log,
##     log10, log1p, log2, round, signif, trunc

library(slider)

## Warning: package 'slider' was built under R version 4.4.2

library(dplyr)
library(lubridate)
library(purrr)

Goal: Apply Matt Dancho’s tutorial to state unemployment initial claims of New England states.

The following is the replication of Matt Dancho’s tutorial on this page

start_date <- "1989-01-01"

symbols_txt <- c("CTICLAIMS", # Connecticut
                 "MEICLAIMS", # Maine
                 "MAICLAIMS", # Massachusetts
                 "NHICLAIMS", # New Hampshire
                 "RIICLAIMS", # Rhode Island
                 "VTICLAIMS") # Vermont

claims_tbl <- tq_get(symbols_txt, get = "economic.data", from = start_date) %>%
    mutate(symbol = fct_recode(symbol,
                               "Connecticut"   = "CTICLAIMS",
                               "Maine"         = "MEICLAIMS",
                               "Massachusetts" = "MAICLAIMS",
                               "New Hampshire" = "NHICLAIMS",
                               "Rhode Island"  = "RIICLAIMS",
                               "Vermont"       = "VTICLAIMS")) %>%
    rename(claims = price)

# Feature engineering using STL decomposition

# Ensure `date` column is of Date class
claims_tbl <- claims_tbl %>%
  mutate(date = as.Date(date))

# Perform feature engineering with STL decomposition
claims_features <- claims_tbl %>%
  group_by(symbol) %>%
  filter(n() >= 24) %>%  # Ensure at least 24 months of data per group
  mutate(
    claims_lag_12 = slider::slide_dbl(claims, mean, .before = 12, .complete = TRUE),
    month = lubridate::month(date),
    year = lubridate::year(date)
  ) %>%
  # Add STL decomposition
  group_by(symbol) %>%
  group_modify(~ {
    stl_result <- stats::stl(ts(.x$claims, frequency = 12), s.window = "periodic")$time.series
    bind_cols(.x, as_tibble(stl_result)) # Add trend, seasonal, and remainder as columns
  }) %>%
  ungroup()

# Correlation analysis
corr_matrix <- claims_features %>%
  select(-date) %>%
  select_if(is.numeric) %>%
  cor(use = "pairwise.complete.obs")

ggcorrplot(corr_matrix, lab = TRUE)

# Prepare data for modeling
claims_model_data <- claims_features %>%
  drop_na() %>%
  mutate(label = if_else(claims > lag(claims, 12), "increase", "decrease")) %>%
  select(-c(date, symbol))

claims_model_data <- claims_model_data %>% sample_frac(0.5) %>%
  mutate(trend = as.numeric(trend))

# Initialize H2O
h2o.init(startH2O = TRUE, max_mem_size = "8G", nthreads = -2)

## 
## H2O is not running yet, starting it now...
## 
## Note:  In case of errors look at the following log files:
##     C:\Users\Surplus\AppData\Local\Temp\Rtmpq24G29\file64443b293b9a/h2o_Surplus_started_from_r.out
##     C:\Users\Surplus\AppData\Local\Temp\Rtmpq24G29\file6444210c7654/h2o_Surplus_started_from_r.err
## 
## 
## Starting H2O JVM and connecting: . Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         6 seconds 219 milliseconds 
##     H2O cluster timezone:       America/New_York 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.44.0.3 
##     H2O cluster version age:    11 months and 28 days 
##     H2O cluster name:           H2O_started_from_R_Surplus_zuj217 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   7.10 GB 
##     H2O cluster total cores:    4 
##     H2O cluster allowed cores:  2 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     R Version:                  R version 4.4.0 (2024-04-24 ucrt)

## Warning in h2o.clusterInfo(): 
## Your H2O cluster version is (11 months and 28 days) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html

## 
## Note:  As started, H2O is limited to the CRAN default of 2 CPUs.
##        Shut down and restart H2O as shown below to use all your CPUs.
##            > h2o.shutdown()
##            > h2o.init(nthreads = -1)

# Split data
h2o_data <- as.h2o(claims_model_data)

##   |                                                                              |                                                                      |   0%  |                                                                              |======================================================================| 100%

# Ensure dataset is not excessively large
h2o.summary(h2o_data)

## Warning in h2o.summary(h2o_data): Approximated quantiles computed! If you are
## interested in exact quantiles, please pass the `exact_quantiles=TRUE`
## parameter.

##  claims             claims_lag_12     month            year          
##  Min.   :   154.0   Min.   :  225.5   Min.   : 1.000   Min.   :1989  
##  1st Qu.:   879.1   1st Qu.:  920.4   1st Qu.: 4.000   1st Qu.:1997  
##  Median :  1604.2   Median : 1692.5   Median : 7.000   Median :2007  
##  Mean   :  3165.0   Mean   : 3153.4   Mean   : 6.609   Mean   :2006  
##  3rd Qu.:  4141.9   3rd Qu.: 4317.7   3rd Qu.:10.000   3rd Qu.:2015  
##  Max.   :181423.0   Max.   :77435.4   Max.   :12.000   Max.   :2024  
##  seasonal           trend             remainder           label
##  Min.   :-412.285   Min.   :  224.2   Min.   :-37513.64        
##  1st Qu.: -60.467   1st Qu.:  863.3   1st Qu.:  -411.84        
##  Median : -15.870   Median : 1662.1   Median :  -121.98        
##  Mean   :   1.033   Mean   : 3149.6   Mean   :    14.41        
##  3rd Qu.:  51.521   3rd Qu.: 4378.1   3rd Qu.:    22.95        
##  Max.   : 578.753   Max.   :80108.1   Max.   :107415.26

splits <- h2o.splitFrame(h2o_data, ratios = 0.8, seed = 12)
train <- splits[[1]]
test <- splits[[2]]

# Define response and predictor variables
response <- "trend"  # Replace with your response column name
predictors <- setdiff(names(h2o_data), response)

# Build and evaluate model
y <- "label"
x <- setdiff(names(h2o_data), y)

model <- h2o.gbm(
  x = predictors,
  y = response,
  training_frame = train,
  validation_frame = test,
  ntrees = 50,
  max_depth = 5,
  learn_rate = 0.1,
  seed = 123
)

## Warning in .h2o.processResponseWarnings(res): Dropping bad and constant columns: [label].

##   |                                                                              |                                                                      |   0%  |                                                                              |==================================                                    |  48%  |                                                                              |======================================================================| 100%

# Evaluate model
perf <- h2o.performance(model, test)
print(h2o.auc(perf))

## NULL

# Shutdown H2O
h2o.shutdown(prompt = FALSE)

I found that h2o had a lot more errors and it took much longer to correct them. Correcting one error would create another error somewhere else. I felt the way it was done in Apply 10 was much easier.

Apply 11: Classification Model

Sara Donahue

2024-12-18