# for Core packages
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.4.1
## Warning: package 'purrr' was built under R version 4.4.2
## Warning: package 'dplyr' was built under R version 4.4.1
## Warning: package 'lubridate' was built under R version 4.4.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# for financial analysis
library(tidyquant)
## Loading required package: PerformanceAnalytics
## Loading required package: xts
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## 
## ######################### Warning from 'xts' package ##########################
## #                                                                             #
## # The dplyr lag() function breaks how base R's lag() function is supposed to  #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or       #
## # source() into this session won't work correctly.                            #
## #                                                                             #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop           #
## # dplyr from breaking base R's lag() function.                                #
## #                                                                             #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning.  #
## #                                                                             #
## ###############################################################################
## 
## Attaching package: 'xts'
## 
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## 
## 
## Attaching package: 'PerformanceAnalytics'
## 
## The following object is masked from 'package:graphics':
## 
##     legend
## 
## Loading required package: quantmod
## Loading required package: TTR
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
# for times series
library(timetk)
## Warning: package 'timetk' was built under R version 4.4.2
library(ggcorrplot)
## Warning: package 'ggcorrplot' was built under R version 4.4.2

Goal: Apply Matt Dancho’s tutorial to state unemployment initial claims of New England states.

The following is a replication of Matt Dancho’s tutorial on this page.

# Earliest observation date requested for every series.
start_date <- "1989-01-01"

# Series IDs for initial unemployment claims, one per New England state.
symbols_txt <- c(
    "CTICLAIMS", # Connecticut
    "MEICLAIMS", # Maine
    "MAICLAIMS", # Massachusetts
    "NHICLAIMS", # New Hampshire
    "RIICLAIMS", # Rhode Island
    "VTICLAIMS"  # Vermont
)

# Download all series in long format (symbol, date, price), rename the value
# column to `claims`, and replace the series IDs with readable state names.
claims_tbl <- symbols_txt %>%
    tq_get(get = "economic.data", from = start_date) %>%
    rename(claims = price) %>%
    mutate(
        symbol = fct_recode(
            symbol,
            "Connecticut"   = "CTICLAIMS",
            "Maine"         = "MEICLAIMS",
            "Massachusetts" = "MAICLAIMS",
            "New Hampshire" = "NHICLAIMS",
            "Rhode Island"  = "RIICLAIMS",
            "Vermont"       = "VTICLAIMS"
        )
    )

Plotting time series

# Claims over time: one line per state, each state in its own panel with an
# independent y-axis so that smaller states remain readable.
ggplot(claims_tbl, aes(date, claims, color = symbol)) +
    geom_line() +
    facet_wrap(vars(symbol), ncol = 2, scales = "free_y") +
    labs(
        title = "Unemployment Claims Over Time",
        x     = "Date",
        y     = "Claims"
    ) +
    theme_minimal()

Box plots

# Distribution of claims by calendar month, one panel per state.
# `label = TRUE` yields an ordered month factor (Jan, Feb, ...) for the x-axis.
claims_tbl %>%
    mutate(month = lubridate::month(date, label = TRUE)) %>%
    ggplot(aes(month, claims, fill = symbol)) +
    geom_boxplot() +
    facet_wrap(vars(symbol), ncol = 2, scales = "free_y") +
    labs(
        title = "Boxplot of Unemployment Claims by Month",
        x     = "Month",
        y     = "Claims"
    ) +
    theme_minimal()

Regression plots

# Claims against date with a straight-line (lm) trend per state; the
# confidence band is switched off with `se = FALSE`.
ggplot(claims_tbl, aes(date, claims, color = symbol)) +
    geom_point(alpha = 0.5) +
    geom_smooth(method = "lm", se = FALSE) +
    facet_wrap(vars(symbol), ncol = 2, scales = "free_y") +
    labs(
        title = "Regression Plots of Unemployment Claims",
        x     = "Date",
        y     = "Claims"
    ) +
    theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

Plotting Seasonality and Correlation

Correlation Plots

# Calculate and plot correlations between the states' claims series.
#
# Reshape to wide format (one column per state, one row per date), drop the
# date column, and compute pairwise correlations. `pivot_wider()` replaces the
# superseded `spread()`; `names_sort = TRUE` keeps the columns in factor-level
# order, matching what `spread()` produced.
corr_matrix <- claims_tbl %>%
    pivot_wider(
        names_from  = symbol,
        values_from = claims,
        names_sort  = TRUE
    ) %>%
    select(-date) %>%
    # Use every pair of dates where both states have a value.
    cor(use = "pairwise.complete.obs")

# Plot the correlation heatmap
ggcorrplot(corr_matrix,
           method = "circle",     # Shape of the correlation points
           type = "lower",        # Show only lower triangle
           lab = TRUE,            # Add correlation coefficients
           lab_size = 3,
           title = "Correlation Heatmap of Unemployment Claims",
           ggtheme = theme_minimal())

Seasonality

# Seasonal diagnostics of claims, computed separately for each state
# (the grouping variable drives the per-state breakdown).
plot_seasonal_diagnostics(
    group_by(claims_tbl, symbol),
    .date_var = date,
    .value    = claims
)

STL Diagnostics

# STL decomposition diagnostics (observed / season / trend / remainder),
# computed separately for each state.
#
# This section previously called plot_seasonal_diagnostics() again, which
# merely repeated the plot from the "Seasonality" section; the STL view is
# produced by plot_stl_diagnostics().
claims_tbl %>%
    group_by(symbol) %>%
    plot_stl_diagnostics(
        .date_var = date,
        .value = claims
    )

Time Series Data Wrangling

Summarize by Time

# Yearly total of claims. The table is not grouped here, so each yearly sum
# pools all states together.
claims_tbl %>%
    summarise_by_time(
        .date_var    = date,
        .by          = "year",
        total_claims = sum(claims)
    ) %>%
    ggplot(aes(date, total_claims)) +
    geom_line() +
    labs(
        title = "Total Claims by Year",
        x     = "Year",
        y     = "Total Claims"
    ) +
    theme_minimal()

Filter By Time

# Keep observations from 2010-01-01 onward (no end date is given) and plot
# all states on a single panel.
claims_tbl %>%
    filter_by_time(
        .date_var   = date,
        .start_date = "2010-01-01"
    ) %>%
    ggplot(aes(date, claims, color = symbol)) +
    geom_line() +
    labs(
        title = "Unemployment Claims (2010-Present)",
        x     = "Date",
        y     = "Claims"
    ) +
    theme_minimal()

Padding Data

# Pad each state's series to a regular weekly grid so that any missing weeks
# become explicit rows, then plot the padded series.
claims_tbl %>%
    group_by(symbol) %>%
    pad_by_time(
        .date_var = date,
        .by       = "week"
    ) %>%
    ggplot(aes(date, claims, color = symbol)) +
    geom_line() +
    labs(
        title = "Padded Unemployment Claims Data",
        x     = "Date",
        y     = "Claims"
    ) +
    theme_minimal()

Sliding (Rolling) Calculations

# Demonstrate slidify_vec() on the first ten rows: a right-aligned, two-period
# rolling mean of log(claims). `.partial = TRUE` lets the first row be computed
# from an incomplete window instead of returning NA.
claims_tbl %>%
    head(10) %>%
    mutate(
        rolling_avg_2 = slidify_vec(
            .x       = log(claims),
            .f       = mean,
            .period  = 2,
            .align   = "right",
            .partial = TRUE
        )
    )
## # A tibble: 10 × 4
##    symbol      date       claims rolling_avg_2
##    <fct>       <date>      <int>         <dbl>
##  1 Connecticut 1989-01-07   8345          9.03
##  2 Connecticut 1989-01-14   6503          8.90
##  3 Connecticut 1989-01-21   3821          8.51
##  4 Connecticut 1989-01-28   4663          8.35
##  5 Connecticut 1989-02-04   4162          8.39
##  6 Connecticut 1989-02-11   4337          8.35
##  7 Connecticut 1989-02-18   4079          8.34
##  8 Connecticut 1989-02-25   3556          8.24
##  9 Connecticut 1989-03-04   3826          8.21
## 10 Connecticut 1989-03-11   3515          8.21
# Rolling average calculation
library(slider)
## Warning: package 'slider' was built under R version 4.4.2
# Rolling mean of claims per state over the current observation plus the 11
# preceding ones. The series is weekly, so this is a 12-week window (the
# title previously mislabelled it as 12-month). `.complete = TRUE` returns NA
# until a full window is available, so the first 11 rows of each state are
# dropped by geom_line().
claims_tbl %>%
    group_by(symbol) %>%
    mutate(rolling_avg = slide_dbl(claims, mean, .before = 11, .complete = TRUE)) %>%
    ggplot(aes(x = date, y = rolling_avg, color = symbol)) +
    geom_line() +
    facet_wrap(~ symbol, scales = "free_y", ncol = 2) +
    labs(title = "12-Week Rolling Average of Claims", y = "Rolling Average", x = "Date") +
    theme_minimal()
## Warning: Removed 66 rows containing missing values or values outside the scale range
## (`geom_line()`).

# Rolling regression helper: fits lm(first argument ~ second argument) over a
# right-aligned window of 90 observations. `.unlist = FALSE` keeps each fitted
# model as a list element rather than flattening it.
lm_roll <- slidify(
  ~ lm(..1 ~ ..2),
  .period = 90,
  .align  = "right",
  .unlist = FALSE
)

# Per state: convert the date to a number, fit the rolling regression of
# claims on that number, and drop the leading rows that have no fitted model.
reg_results <- claims_tbl %>%
  select(symbol, date, claims) %>%
  group_by(symbol) %>%
  mutate(
    numeric_date = as.numeric(date),
    rolling_lm   = lm_roll(claims, numeric_date)
  ) %>%
  filter(!is.na(rolling_lm))

# Inspect the first fitted rolling model as a tidy coefficient table
broom::tidy(reg_results$rolling_lm[[1]])
## # A tibble: 2 × 5
##   term         estimate std.error statistic p.value
##   <chr>           <dbl>     <dbl>     <dbl>   <dbl>
## 1 (Intercept) -11225.    6974.        -1.61  0.111 
## 2 ..2              2.19     0.961      2.28  0.0248
# Tidy every rolling model and expand the coefficient tables into rows
reg_coeff <- reg_results %>%
  mutate(rolling_lm = map(rolling_lm, broom::tidy)) %>%
  unnest(rolling_lm)

# Plot the rolling slope over time, one panel per state.
# "..2" is the term name lm() assigns to the second argument of lm_roll's
# formula, i.e. the numeric date.
reg_coeff %>%
  filter(term == "..2") %>%
  ggplot(aes(date, estimate)) +
  geom_line() +
  facet_wrap(~symbol)