# for Core packages
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# for financial analysis
library(tidyquant)
## Loading required package: PerformanceAnalytics
## Loading required package: xts
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## 
## ######################### Warning from 'xts' package ##########################
## #                                                                             #
## # The dplyr lag() function breaks how base R's lag() function is supposed to  #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or       #
## # source() into this session won't work correctly.                            #
## #                                                                             #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop           #
## # dplyr from breaking base R's lag() function.                                #
## #                                                                             #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning.  #
## #                                                                             #
## ###############################################################################
## 
## Attaching package: 'xts'
## 
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## 
## 
## Attaching package: 'PerformanceAnalytics'
## 
## The following object is masked from 'package:graphics':
## 
##     legend
## 
## Loading required package: quantmod
## Loading required package: TTR
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
# for time series
library(timetk)

library(umap)

Goal: Apply Matt Dancho’s tutorial to the weekly initial unemployment claims of the six New England states.

The following is a replication of Matt Dancho’s tutorial on this page

# Earliest observation date requested from the data provider.
start_date <- "1989-01-01"

# Initial-claims series IDs, one per New England state.
symbols_txt <- c(
  "CTICLAIMS", # Connecticut
  "MEICLAIMS", # Maine
  "MAICLAIMS", # Massachusetts
  "NHICLAIMS", # New Hampshire
  "RIICLAIMS", # Rhode Island
  "VTICLAIMS"  # Vermont
)

# Download every series as economic data, call the value column `claims`,
# and replace the series IDs in `symbol` with readable state names.
claims_tbl <- tq_get(symbols_txt, get = "economic.data", from = start_date) %>%
  rename(claims = price) %>%
  mutate(
    symbol = fct_recode(
      symbol,
      "Connecticut"   = "CTICLAIMS",
      "Maine"         = "MEICLAIMS",
      "Massachusetts" = "MAICLAIMS",
      "New Hampshire" = "NHICLAIMS",
      "Rhode Island"  = "RIICLAIMS",
      "Vermont"       = "VTICLAIMS"
    )
  )

Plotting time series

# Print the tibble to inspect its columns (symbol, date, claims).
claims_tbl
## # A tibble: 11,040 × 3
##    symbol      date       claims
##    <fct>       <date>      <int>
##  1 Connecticut 1989-01-07   8345
##  2 Connecticut 1989-01-14   6503
##  3 Connecticut 1989-01-21   3821
##  4 Connecticut 1989-01-28   4663
##  5 Connecticut 1989-02-04   4162
##  6 Connecticut 1989-02-11   4337
##  7 Connecticut 1989-02-18   4079
##  8 Connecticut 1989-02-25   3556
##  9 Connecticut 1989-03-04   3826
## 10 Connecticut 1989-03-11   3515
## # ℹ 11,030 more rows
# Overlay all states on a single chart. `claims_tbl` stacks one series per
# state, so the lines are coloured by `symbol`; without that, every state's
# observations would be joined into a single zig-zag line.
claims_tbl %>%
  plot_time_series(.date_var = date, .value = claims, .color_var = symbol)
# Tally how often each distinct claims value occurs.
count(claims_tbl, claims)
## # A tibble: 5,328 × 2
##    claims     n
##     <int> <int>
##  1    152     1
##  2    154     1
##  3    184     2
##  4    189     1
##  5    200     1
##  6    201     1
##  7    203     1
##  8    205     1
##  9    206     1
## 10    211     2
## # ℹ 5,318 more rows
# Static small-multiples view: one panel per state, two panels per row,
# with free scales on every panel.
group_by(claims_tbl, symbol) %>%
  plot_time_series(
    date, claims,
    .interactive  = FALSE,
    .facet_scales = "free",
    .facet_ncol   = 2
  )

# Frequency table of the distinct claims values.
count(claims_tbl, claims)
## # A tibble: 5,328 × 2
##    claims     n
##     <int> <int>
##  1    152     1
##  2    154     1
##  3    184     2
##  4    189     1
##  5    200     1
##  6    201     1
##  7    203     1
##  8    205     1
##  9    206     1
## 10    211     2
## # ℹ 5,318 more rows
# Per-state panels of log-transformed claims, coloured by week of the year.
group_by(claims_tbl, symbol) %>%
  plot_time_series(
    date, log(claims),
    .color_var    = week(date),
    .facet_scales = "free",
    .facet_ncol   = 2
  )
# Static, labelled plot coloured by calendar month.
# The data are grouped by state first so each state gets its own panel;
# otherwise a month-coloured line would join observations from different
# states.
claims_tbl %>%
  group_by(symbol) %>%
  plot_time_series(date, claims,
                   .color_var  = month(date, label = TRUE),
                   .facet_ncol = 2,

                   # Returns static ggplot
                   .interactive = FALSE,
                   .title = "State Unemployment",
                   .x_lab = "Timeline",
                   .y_lab = "Claims",
                   .color_lab = "Month")

Box plots

# Count the occurrences of each distinct claims value.
count(claims_tbl, claims)
## # A tibble: 5,328 × 2
##    claims     n
##     <int> <int>
##  1    152     1
##  2    154     1
##  3    184     2
##  4    189     1
##  5    200     1
##  6    201     1
##  7    203     1
##  8    205     1
##  9    206     1
## 10    211     2
## # ℹ 5,318 more rows
# Yearly box plots of claims, one panel per state (two panels per row).
# Grouping by `symbol` is what creates the facets that `.facet_ncol` lays out;
# without it all states are pooled into a single set of boxes.
claims_tbl %>%
    group_by(symbol) %>%
    plot_time_series_boxplot(.date_var = date,
                             .value = claims,
                             .period = "1 year",
                             .facet_ncol = 2)

Regression plots

# Fit and plot, separately for each state, a linear model of log claims on a
# numeric time trend plus a month-of-year term; `.show_summary = TRUE` also
# prints each group's model summary.
group_by(claims_tbl, symbol) %>%
  plot_time_series_regression(
    date,
    .formula  = log(claims) ~ as.numeric(date) + month(date, label = TRUE),
    .show_summary = TRUE,
    .facet_ncol   = 2
  )
## 
## Summary for Group: Connecticut---
## Call:
## stats::lm(formula = .formula, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.8490 -0.2149 -0.0304  0.1805  3.1830 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   8.574e+00  2.944e-02 291.273  < 2e-16 ***
## as.numeric(date)             -1.565e-05  2.119e-06  -7.383 2.34e-13 ***
## month(date, label = TRUE).L  -2.918e-01  2.727e-02 -10.699  < 2e-16 ***
## month(date, label = TRUE).Q   4.386e-01  2.720e-02  16.128  < 2e-16 ***
## month(date, label = TRUE).C   5.819e-02  2.714e-02   2.144 0.032148 *  
## month(date, label = TRUE)^4   4.908e-01  2.723e-02  18.025  < 2e-16 ***
## month(date, label = TRUE)^5  -2.213e-02  2.746e-02  -0.806 0.420301    
## month(date, label = TRUE)^6  -2.775e-02  2.754e-02  -1.008 0.313777    
## month(date, label = TRUE)^7  -1.231e-01  2.732e-02  -4.504 7.09e-06 ***
## month(date, label = TRUE)^8   5.651e-02  2.723e-02   2.075 0.038121 *  
## month(date, label = TRUE)^9   1.934e-01  2.731e-02   7.082 2.02e-12 ***
## month(date, label = TRUE)^10 -9.414e-02  2.730e-02  -3.448 0.000577 ***
## month(date, label = TRUE)^11  8.424e-03  2.731e-02   0.309 0.757723    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.338 on 1827 degrees of freedom
## Multiple R-squared:  0.3189, Adjusted R-squared:  0.3144 
## F-statistic: 71.28 on 12 and 1827 DF,  p-value: < 2.2e-16
## 
## ----
## 
## Summary for Group: Massachusetts---
## Call:
## stats::lm(formula = .formula, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.5882 -0.2224 -0.0517  0.1774  3.3823 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   9.195e+00  3.603e-02 255.166  < 2e-16 ***
## as.numeric(date)             -2.093e-05  2.594e-06  -8.068 1.28e-15 ***
## month(date, label = TRUE).L  -7.800e-02  3.338e-02  -2.336 0.019582 *  
## month(date, label = TRUE).Q   5.944e-01  3.329e-02  17.855  < 2e-16 ***
## month(date, label = TRUE).C   1.926e-01  3.322e-02   5.797 7.94e-09 ***
## month(date, label = TRUE)^4   2.867e-01  3.333e-02   8.601  < 2e-16 ***
## month(date, label = TRUE)^5  -5.649e-02  3.361e-02  -1.681 0.092981 .  
## month(date, label = TRUE)^6  -1.125e-01  3.371e-02  -3.336 0.000867 ***
## month(date, label = TRUE)^7  -6.434e-03  3.345e-02  -0.192 0.847492    
## month(date, label = TRUE)^8   3.316e-02  3.334e-02   0.995 0.319976    
## month(date, label = TRUE)^9   9.952e-02  3.343e-02   2.977 0.002948 ** 
## month(date, label = TRUE)^10 -8.996e-02  3.342e-02  -2.692 0.007178 ** 
## month(date, label = TRUE)^11  1.247e-02  3.343e-02   0.373 0.709246    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4137 on 1827 degrees of freedom
## Multiple R-squared:  0.2243, Adjusted R-squared:  0.2192 
## F-statistic: 44.02 on 12 and 1827 DF,  p-value: < 2.2e-16
## 
## ----
## 
## Summary for Group: Maine---
## Call:
## stats::lm(formula = .formula, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.8386 -0.2423 -0.0621  0.1923  3.4344 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   8.381e+00  3.389e-02 247.329  < 2e-16 ***
## as.numeric(date)             -8.527e-05  2.440e-06 -34.950  < 2e-16 ***
## month(date, label = TRUE).L  -2.159e-01  3.139e-02  -6.878 8.32e-12 ***
## month(date, label = TRUE).Q   8.101e-01  3.131e-02  25.874  < 2e-16 ***
## month(date, label = TRUE).C   2.198e-01  3.124e-02   7.034 2.82e-12 ***
## month(date, label = TRUE)^4   2.225e-01  3.135e-02   7.099 1.79e-12 ***
## month(date, label = TRUE)^5  -2.080e-01  3.161e-02  -6.580 6.14e-11 ***
## month(date, label = TRUE)^6  -1.795e-02  3.170e-02  -0.566   0.5714    
## month(date, label = TRUE)^7  -1.318e-01  3.145e-02  -4.190 2.92e-05 ***
## month(date, label = TRUE)^8   5.464e-02  3.135e-02   1.743   0.0815 .  
## month(date, label = TRUE)^9   1.287e-01  3.144e-02   4.095 4.41e-05 ***
## month(date, label = TRUE)^10 -7.169e-02  3.143e-02  -2.281   0.0227 *  
## month(date, label = TRUE)^11 -5.374e-02  3.143e-02  -1.710   0.0875 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.389 on 1827 degrees of freedom
## Multiple R-squared:  0.5371, Adjusted R-squared:  0.534 
## F-statistic: 176.6 on 12 and 1827 DF,  p-value: < 2.2e-16
## 
## ----
## 
## Summary for Group: New Hampshire---
## Call:
## stats::lm(formula = .formula, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.3400 -0.3594 -0.0449  0.2876  3.7339 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   7.119e+00  4.776e-02 149.075  < 2e-16 ***
## as.numeric(date)             -2.398e-05  3.438e-06  -6.974 4.28e-12 ***
## month(date, label = TRUE).L  -2.112e-01  4.424e-02  -4.774 1.95e-06 ***
## month(date, label = TRUE).Q   4.402e-01  4.412e-02   9.978  < 2e-16 ***
## month(date, label = TRUE).C   2.268e-01  4.403e-02   5.151 2.88e-07 ***
## month(date, label = TRUE)^4   3.052e-01  4.418e-02   6.908 6.75e-12 ***
## month(date, label = TRUE)^5  -4.862e-02  4.454e-02  -1.092  0.27520    
## month(date, label = TRUE)^6  -1.718e-03  4.468e-02  -0.038  0.96934    
## month(date, label = TRUE)^7  -1.389e-01  4.433e-02  -3.134  0.00175 ** 
## month(date, label = TRUE)^8   7.738e-02  4.418e-02   1.751  0.08004 .  
## month(date, label = TRUE)^9   1.852e-01  4.430e-02   4.181 3.04e-05 ***
## month(date, label = TRUE)^10 -1.329e-01  4.429e-02  -3.002  0.00272 ** 
## month(date, label = TRUE)^11  5.651e-03  4.430e-02   0.128  0.89850    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5483 on 1827 degrees of freedom
## Multiple R-squared:  0.1355, Adjusted R-squared:  0.1298 
## F-statistic: 23.85 on 12 and 1827 DF,  p-value: < 2.2e-16
## 
## ----
## 
## Summary for Group: Rhode Island---
## Call:
## stats::lm(formula = .formula, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.0430 -0.2675 -0.0644  0.1815  3.3940 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   8.058e+00  3.824e-02 210.720  < 2e-16 ***
## as.numeric(date)             -4.804e-05  2.753e-06 -17.448  < 2e-16 ***
## month(date, label = TRUE).L  -3.402e-01  3.543e-02  -9.603  < 2e-16 ***
## month(date, label = TRUE).Q   5.653e-01  3.533e-02  16.001  < 2e-16 ***
## month(date, label = TRUE).C   1.819e-01  3.525e-02   5.161 2.73e-07 ***
## month(date, label = TRUE)^4   4.432e-01  3.537e-02  12.527  < 2e-16 ***
## month(date, label = TRUE)^5   5.777e-02  3.567e-02   1.620  0.10549    
## month(date, label = TRUE)^6  -1.192e-01  3.578e-02  -3.332  0.00088 ***
## month(date, label = TRUE)^7  -2.691e-02  3.550e-02  -0.758  0.44846    
## month(date, label = TRUE)^8   5.238e-02  3.538e-02   1.481  0.13886    
## month(date, label = TRUE)^9   1.900e-01  3.548e-02   5.355 9.66e-08 ***
## month(date, label = TRUE)^10 -1.883e-01  3.547e-02  -5.309 1.24e-07 ***
## month(date, label = TRUE)^11  6.618e-03  3.547e-02   0.187  0.85201    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.439 on 1827 degrees of freedom
## Multiple R-squared:  0.3327, Adjusted R-squared:  0.3284 
## F-statistic: 75.92 on 12 and 1827 DF,  p-value: < 2.2e-16
## 
## ----
## 
## Summary for Group: Vermont---
## Call:
## stats::lm(formula = .formula, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.2085 -0.2319 -0.0374  0.1961  3.1346 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   7.032e+00  3.413e-02 206.022  < 2e-16 ***
## as.numeric(date)             -3.406e-05  2.457e-06 -13.859  < 2e-16 ***
## month(date, label = TRUE).L  -8.164e-02  3.162e-02  -2.582   0.0099 ** 
## month(date, label = TRUE).Q   7.176e-01  3.153e-02  22.757  < 2e-16 ***
## month(date, label = TRUE).C   5.306e-01  3.147e-02  16.863  < 2e-16 ***
## month(date, label = TRUE)^4   7.925e-02  3.157e-02   2.510   0.0122 *  
## month(date, label = TRUE)^5  -3.352e-01  3.184e-02 -10.529  < 2e-16 ***
## month(date, label = TRUE)^6   6.819e-03  3.193e-02   0.214   0.8309    
## month(date, label = TRUE)^7  -1.470e-01  3.168e-02  -4.638 3.76e-06 ***
## month(date, label = TRUE)^8  -1.085e-02  3.158e-02  -0.344   0.7311    
## month(date, label = TRUE)^9   1.978e-01  3.166e-02   6.245 5.24e-10 ***
## month(date, label = TRUE)^10 -1.728e-01  3.166e-02  -5.460 5.42e-08 ***
## month(date, label = TRUE)^11  7.008e-02  3.166e-02   2.213   0.0270 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3919 on 1827 degrees of freedom
## Multiple R-squared:  0.3974, Adjusted R-squared:  0.3935 
## F-statistic: 100.4 on 12 and 1827 DF,  p-value: < 2.2e-16
## 
## ----

Plotting Seasonality and Correlation

Correlation Plots

# Autocorrelation diagnostics per state, with lags spanning one year.
group_by(claims_tbl, symbol) %>%
  plot_acf_diagnostics(.date_var = date, .value = claims, .lags = "1 year")

Seasonality

# Seasonal diagnostics of claims for each state.
group_by(claims_tbl, symbol) %>%
  plot_seasonal_diagnostics(.date_var = date, .value = claims)

STL Diagnostics

# Interactive STL decomposition per state. Frequency and trend windows are
# selected automatically, and the observed, season, trend and remainder
# components are shown.
group_by(claims_tbl, symbol) %>%
  plot_stl_diagnostics(
    .date_var    = date,
    .value       = claims,
    .feature_set = c("observed", "season", "trend", "remainder"),
    .trend       = "auto",
    .frequency   = "auto",
    .interactive = TRUE
  )
## frequency = 13 observations per 1 quarter
## trend = 53 observations per 1 year
## frequency = 13 observations per 1 quarter
## trend = 53 observations per 1 year
## frequency = 13 observations per 1 quarter
## trend = 53 observations per 1 year
## frequency = 13 observations per 1 quarter
## trend = 53 observations per 1 year
## frequency = 13 observations per 1 quarter
## trend = 53 observations per 1 year
## frequency = 13 observations per 1 quarter
## trend = 53 observations per 1 year

Time Series Data Wrangling

Summarize by Time

# Interactive plot of the raw series, one panel per state, two per row.
group_by(claims_tbl, symbol) %>%
  plot_time_series(
    .date_var    = date,
    .value       = claims,
    .interactive = TRUE,
    .facet_ncol  = 2
  )
# Collapse each state's series to one value per year (the mean of that year's
# claims) and plot the result. The summary column is named `mean_claims` to
# say what it holds.
claims_tbl %>%
  group_by(symbol) %>%
  summarise_by_time(.date_var = date, mean_claims = mean(claims), .by = "year") %>%
  plot_time_series(date, mean_claims, .facet_ncol = 2, .interactive = TRUE)

Filter By Time

# Restrict each state's series to the 2000-2022 window, then plot it in a
# two-column facet grid.
group_by(claims_tbl, symbol) %>%
  filter_by_time(date, .start_date = "2000", .end_date = "2022") %>%
  plot_time_series(.date_var = date, .value = claims, .facet_ncol = 2)

Padding Data

# Expand each state's series to daily frequency; the inserted days get a
# claims value of 0.
group_by(claims_tbl, symbol) %>%
  pad_by_time(.date_var = date, .pad_value = 0, .by = "day")
## # A tibble: 77,244 × 3
## # Groups:   symbol [6]
##    symbol      date       claims
##    <fct>       <date>      <int>
##  1 Connecticut 1989-01-07   8345
##  2 Connecticut 1989-01-08      0
##  3 Connecticut 1989-01-09      0
##  4 Connecticut 1989-01-10      0
##  5 Connecticut 1989-01-11      0
##  6 Connecticut 1989-01-12      0
##  7 Connecticut 1989-01-13      0
##  8 Connecticut 1989-01-14   6503
##  9 Connecticut 1989-01-15      0
## 10 Connecticut 1989-01-16      0
## # ℹ 77,234 more rows

Sliding (Rolling) Calculations

# Two-observation trailing mean over the first ten rows. With
# `.partial = TRUE` the first row is averaged over the single value available
# instead of being returned as NA.
head(claims_tbl, 10) %>%
  mutate(
    rolling_avg_2 = slidify_vec(
      .x       = claims,
      .f       = mean,
      .partial = TRUE,
      .align   = "right",
      .period  = 2
    )
  )
## # A tibble: 10 × 4
##    symbol      date       claims rolling_avg_2
##    <fct>       <date>      <int>         <dbl>
##  1 Connecticut 1989-01-07   8345         8345 
##  2 Connecticut 1989-01-14   6503         7424 
##  3 Connecticut 1989-01-21   3821         5162 
##  4 Connecticut 1989-01-28   4663         4242 
##  5 Connecticut 1989-02-04   4162         4412.
##  6 Connecticut 1989-02-11   4337         4250.
##  7 Connecticut 1989-02-18   4079         4208 
##  8 Connecticut 1989-02-25   3556         3818.
##  9 Connecticut 1989-03-04   3826         3691 
## 10 Connecticut 1989-03-11   3515         3670.
# Rolling regressions are easy to implement using `.unlist = FALSE`, which
# keeps each window's fitted model as a list element.
# The model is a linear time trend: the first argument (response) is regressed
# on the second (numeric time index) over a right-aligned 90-observation
# window.
lm_roll <- slidify(~ lm(..1 ~ ..2),
                   .period = 90,
                   .unlist = FALSE,
                   .align  = "right")



claims_tbl %>%
  select(symbol, date, claims) %>%
  group_by(symbol) %>%
  # Numeric time index taken from the date (days since 1970-01-01). It must
  # come from `date`, not `symbol`: the factor code of `symbol` is constant
  # within each group and carries no time information.
  mutate(numeric_date = as.numeric(date)) %>%
  # Apply the rolling regression within each state. Only `numeric_date` is
  # used as regressor; adding `date` as well would be perfectly collinear.
  mutate(rolling_lm = lm_roll(claims, numeric_date)) %>%
  # Drop the leading rows of each group that have no complete window.
  filter(!is.na(rolling_lm))
## # A tibble: 10,506 × 5
## # Groups:   symbol [6]
##    symbol      date       claims numeric_date rolling_lm
##    <fct>       <date>      <int>        <dbl> <list>    
##  1 Connecticut 1990-09-22   3927            1 <lm>      
##  2 Connecticut 1990-09-29   4471            1 <lm>      
##  3 Connecticut 1990-10-06   4430            1 <lm>      
##  4 Connecticut 1990-10-13   4494            1 <lm>      
##  5 Connecticut 1990-10-20   4894            1 <lm>      
##  6 Connecticut 1990-10-27   4653            1 <lm>      
##  7 Connecticut 1990-11-03   4719            1 <lm>      
##  8 Connecticut 1990-11-10   5347            1 <lm>      
##  9 Connecticut 1990-11-17   4824            1 <lm>      
## 10 Connecticut 1990-11-24   5367            1 <lm>      
## # ℹ 10,496 more rows