Apply 10

# for Core packages
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

# for financial analysis
library(tidyquant)

## Loading required package: PerformanceAnalytics
## Loading required package: xts
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## 
## ######################### Warning from 'xts' package ##########################
## #                                                                             #
## # The dplyr lag() function breaks how base R's lag() function is supposed to  #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or       #
## # source() into this session won't work correctly.                            #
## #                                                                             #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop           #
## # dplyr from breaking base R's lag() function.                                #
## #                                                                             #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning.  #
## #                                                                             #
## ###############################################################################
## 
## Attaching package: 'xts'
## 
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## 
## 
## Attaching package: 'PerformanceAnalytics'
## 
## The following object is masked from 'package:graphics':
## 
##     legend
## 
## Loading required package: quantmod
## Loading required package: TTR
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

# for time series
library(timetk)

library(umap)

Goal: Apply Matt Dancho’s tutorial to state unemployment initial claims of New England states.

The following is a replication of Matt Dancho’s tutorial, applied to the claims data.

# Earliest observation date to request.
start_date <- "1989-01-01"

# Series IDs of the six New England initial-claims series, keyed by the state
# name that replaces each ID in the `symbol` column.
state_series <- c(
  "Connecticut"   = "CTICLAIMS",
  "Maine"         = "MEICLAIMS",
  "Massachusetts" = "MAICLAIMS",
  "New Hampshire" = "NHICLAIMS",
  "Rhode Island"  = "RIICLAIMS",
  "Vermont"       = "VTICLAIMS"
)
symbols_txt <- unname(state_series)

# Download every series in long format, swap the series IDs for state names,
# and call the value column `claims` instead of tq_get()'s `price`.
claims_tbl <- tq_get(symbols_txt, get = "economic.data", from = start_date) %>%
  mutate(symbol = fct_recode(symbol, !!!state_series)) %>%
  rename(claims = price)

Plotting time series

# Show the first rows of the long-format table (symbol, date, claims).
print(claims_tbl)

## # A tibble: 11,040 × 3
##    symbol      date       claims
##    <fct>       <date>      <int>
##  1 Connecticut 1989-01-07   8345
##  2 Connecticut 1989-01-14   6503
##  3 Connecticut 1989-01-21   3821
##  4 Connecticut 1989-01-28   4663
##  5 Connecticut 1989-02-04   4162
##  6 Connecticut 1989-02-11   4337
##  7 Connecticut 1989-02-18   4079
##  8 Connecticut 1989-02-25   3556
##  9 Connecticut 1989-03-04   3826
## 10 Connecticut 1989-03-11   3515
## # ℹ 11,030 more rows

# Overlay all six states in a single panel. `claims_tbl` is in long format,
# so without `.color_var = symbol` the observations of every state would be
# joined into one zig-zagging line.
claims_tbl %>%
  plot_time_series(
    .date_var  = date,
    .value     = claims,
    .color_var = symbol
  )

# Number of weekly observations per state. Counting `symbol` describes the
# series; counting the `claims` values themselves only tabulates how often
# each individual number happens to occur.
claims_tbl %>% count(symbol)

## # A tibble: 6 × 2
##   symbol            n
##   <fct>         <int>
## 1 Connecticut    1840
## 2 Massachusetts  1840
## 3 Maine          1840
## 4 New Hampshire  1840
## 5 Rhode Island   1840
## 6 Vermont        1840

# One static panel per state, laid out in two columns, each panel with its
# own axis scales.
claims_tbl %>%
  group_by(symbol) %>%
  plot_time_series(
    date, claims,
    .interactive  = FALSE,
    .facet_scales = "free",
    .facet_ncol   = 2
  )

# Number of weekly observations per state. Counting `symbol` describes the
# series; counting the `claims` values themselves only tabulates how often
# each individual number happens to occur.
claims_tbl %>% count(symbol)

## # A tibble: 6 × 2
##   symbol            n
##   <fct>         <int>
## 1 Connecticut    1840
## 2 Massachusetts  1840
## 3 Maine          1840
## 4 New Hampshire  1840
## 5 Rhode Island   1840
## 6 Vermont        1840

# Log-transformed claims per state, with the line coloured by week of the
# year; panels are arranged in two columns with free scales.
claims_tbl %>%
  group_by(symbol) %>%
  plot_time_series(
    date, log(claims),
    .color_var    = week(date),
    .facet_scales = "free",
    .facet_ncol   = 2
  )

# Static, fully labelled plot coloured by calendar month. The data are
# grouped by state first: left ungrouped, the six series are pooled and each
# month's line jumps between states.
claims_tbl %>%
  group_by(symbol) %>%
  plot_time_series(date, claims,
                   .color_var = month(date, label = TRUE),
                   .facet_ncol = 2,

                   # Returns static ggplot
                   .interactive = FALSE,
                   .title = "State Unemployment",
                   .x_lab = "Timeline",
                   .y_lab = "Claims",
                   .color_lab = "Month")

Box plots

# Number of weekly observations per state. Counting `symbol` describes the
# series; counting the `claims` values themselves only tabulates how often
# each individual number happens to occur.
claims_tbl %>% count(symbol)

## # A tibble: 6 × 2
##   symbol            n
##   <fct>         <int>
## 1 Connecticut    1840
## 2 Massachusetts  1840
## 3 Maine          1840
## 4 New Hampshire  1840
## 5 Rhode Island   1840
## 6 Vermont        1840

# Yearly box plots of weekly claims, one panel per state. Grouping by
# `symbol` is what gives `.facet_ncol` something to facet; without it every
# box pools the observations of all six states.
claims_tbl %>%
    group_by(symbol) %>%
    plot_time_series_boxplot(.date_var = date,
                             .value = claims,
                             .period = "1 year",
                             .facet_ncol = 2)

Regression plots

# Per-state linear model of log claims on a numeric time trend plus a month
# term; `.show_summary = TRUE` prints each group's lm() summary.
claims_tbl %>%
  group_by(symbol) %>%
  plot_time_series_regression(
    date,
    .formula      = log(claims) ~ as.numeric(date) + month(date, label = TRUE),
    .show_summary = TRUE,
    .facet_ncol   = 2
  )

## 
## Summary for Group: Connecticut---
## Call:
## stats::lm(formula = .formula, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.8490 -0.2149 -0.0304  0.1805  3.1830 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   8.574e+00  2.944e-02 291.273  < 2e-16 ***
## as.numeric(date)             -1.565e-05  2.119e-06  -7.383 2.34e-13 ***
## month(date, label = TRUE).L  -2.918e-01  2.727e-02 -10.699  < 2e-16 ***
## month(date, label = TRUE).Q   4.386e-01  2.720e-02  16.128  < 2e-16 ***
## month(date, label = TRUE).C   5.819e-02  2.714e-02   2.144 0.032148 *  
## month(date, label = TRUE)^4   4.908e-01  2.723e-02  18.025  < 2e-16 ***
## month(date, label = TRUE)^5  -2.213e-02  2.746e-02  -0.806 0.420301    
## month(date, label = TRUE)^6  -2.775e-02  2.754e-02  -1.008 0.313777    
## month(date, label = TRUE)^7  -1.231e-01  2.732e-02  -4.504 7.09e-06 ***
## month(date, label = TRUE)^8   5.651e-02  2.723e-02   2.075 0.038121 *  
## month(date, label = TRUE)^9   1.934e-01  2.731e-02   7.082 2.02e-12 ***
## month(date, label = TRUE)^10 -9.414e-02  2.730e-02  -3.448 0.000577 ***
## month(date, label = TRUE)^11  8.424e-03  2.731e-02   0.309 0.757723    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.338 on 1827 degrees of freedom
## Multiple R-squared:  0.3189, Adjusted R-squared:  0.3144 
## F-statistic: 71.28 on 12 and 1827 DF,  p-value: < 2.2e-16
## 
## ----
## 
## Summary for Group: Massachusetts---
## Call:
## stats::lm(formula = .formula, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.5882 -0.2224 -0.0517  0.1774  3.3823 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   9.195e+00  3.603e-02 255.166  < 2e-16 ***
## as.numeric(date)             -2.093e-05  2.594e-06  -8.068 1.28e-15 ***
## month(date, label = TRUE).L  -7.800e-02  3.338e-02  -2.336 0.019582 *  
## month(date, label = TRUE).Q   5.944e-01  3.329e-02  17.855  < 2e-16 ***
## month(date, label = TRUE).C   1.926e-01  3.322e-02   5.797 7.94e-09 ***
## month(date, label = TRUE)^4   2.867e-01  3.333e-02   8.601  < 2e-16 ***
## month(date, label = TRUE)^5  -5.649e-02  3.361e-02  -1.681 0.092981 .  
## month(date, label = TRUE)^6  -1.125e-01  3.371e-02  -3.336 0.000867 ***
## month(date, label = TRUE)^7  -6.434e-03  3.345e-02  -0.192 0.847492    
## month(date, label = TRUE)^8   3.316e-02  3.334e-02   0.995 0.319976    
## month(date, label = TRUE)^9   9.952e-02  3.343e-02   2.977 0.002948 ** 
## month(date, label = TRUE)^10 -8.996e-02  3.342e-02  -2.692 0.007178 ** 
## month(date, label = TRUE)^11  1.247e-02  3.343e-02   0.373 0.709246    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4137 on 1827 degrees of freedom
## Multiple R-squared:  0.2243, Adjusted R-squared:  0.2192 
## F-statistic: 44.02 on 12 and 1827 DF,  p-value: < 2.2e-16
## 
## ----
## 
## Summary for Group: Maine---
## Call:
## stats::lm(formula = .formula, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.8386 -0.2423 -0.0621  0.1923  3.4344 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   8.381e+00  3.389e-02 247.329  < 2e-16 ***
## as.numeric(date)             -8.527e-05  2.440e-06 -34.950  < 2e-16 ***
## month(date, label = TRUE).L  -2.159e-01  3.139e-02  -6.878 8.32e-12 ***
## month(date, label = TRUE).Q   8.101e-01  3.131e-02  25.874  < 2e-16 ***
## month(date, label = TRUE).C   2.198e-01  3.124e-02   7.034 2.82e-12 ***
## month(date, label = TRUE)^4   2.225e-01  3.135e-02   7.099 1.79e-12 ***
## month(date, label = TRUE)^5  -2.080e-01  3.161e-02  -6.580 6.14e-11 ***
## month(date, label = TRUE)^6  -1.795e-02  3.170e-02  -0.566   0.5714    
## month(date, label = TRUE)^7  -1.318e-01  3.145e-02  -4.190 2.92e-05 ***
## month(date, label = TRUE)^8   5.464e-02  3.135e-02   1.743   0.0815 .  
## month(date, label = TRUE)^9   1.287e-01  3.144e-02   4.095 4.41e-05 ***
## month(date, label = TRUE)^10 -7.169e-02  3.143e-02  -2.281   0.0227 *  
## month(date, label = TRUE)^11 -5.374e-02  3.143e-02  -1.710   0.0875 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.389 on 1827 degrees of freedom
## Multiple R-squared:  0.5371, Adjusted R-squared:  0.534 
## F-statistic: 176.6 on 12 and 1827 DF,  p-value: < 2.2e-16
## 
## ----
## 
## Summary for Group: New Hampshire---
## Call:
## stats::lm(formula = .formula, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.3400 -0.3594 -0.0449  0.2876  3.7339 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   7.119e+00  4.776e-02 149.075  < 2e-16 ***
## as.numeric(date)             -2.398e-05  3.438e-06  -6.974 4.28e-12 ***
## month(date, label = TRUE).L  -2.112e-01  4.424e-02  -4.774 1.95e-06 ***
## month(date, label = TRUE).Q   4.402e-01  4.412e-02   9.978  < 2e-16 ***
## month(date, label = TRUE).C   2.268e-01  4.403e-02   5.151 2.88e-07 ***
## month(date, label = TRUE)^4   3.052e-01  4.418e-02   6.908 6.75e-12 ***
## month(date, label = TRUE)^5  -4.862e-02  4.454e-02  -1.092  0.27520    
## month(date, label = TRUE)^6  -1.718e-03  4.468e-02  -0.038  0.96934    
## month(date, label = TRUE)^7  -1.389e-01  4.433e-02  -3.134  0.00175 ** 
## month(date, label = TRUE)^8   7.738e-02  4.418e-02   1.751  0.08004 .  
## month(date, label = TRUE)^9   1.852e-01  4.430e-02   4.181 3.04e-05 ***
## month(date, label = TRUE)^10 -1.329e-01  4.429e-02  -3.002  0.00272 ** 
## month(date, label = TRUE)^11  5.651e-03  4.430e-02   0.128  0.89850    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5483 on 1827 degrees of freedom
## Multiple R-squared:  0.1355, Adjusted R-squared:  0.1298 
## F-statistic: 23.85 on 12 and 1827 DF,  p-value: < 2.2e-16
## 
## ----
## 
## Summary for Group: Rhode Island---
## Call:
## stats::lm(formula = .formula, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.0430 -0.2675 -0.0644  0.1815  3.3940 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   8.058e+00  3.824e-02 210.720  < 2e-16 ***
## as.numeric(date)             -4.804e-05  2.753e-06 -17.448  < 2e-16 ***
## month(date, label = TRUE).L  -3.402e-01  3.543e-02  -9.603  < 2e-16 ***
## month(date, label = TRUE).Q   5.653e-01  3.533e-02  16.001  < 2e-16 ***
## month(date, label = TRUE).C   1.819e-01  3.525e-02   5.161 2.73e-07 ***
## month(date, label = TRUE)^4   4.432e-01  3.537e-02  12.527  < 2e-16 ***
## month(date, label = TRUE)^5   5.777e-02  3.567e-02   1.620  0.10549    
## month(date, label = TRUE)^6  -1.192e-01  3.578e-02  -3.332  0.00088 ***
## month(date, label = TRUE)^7  -2.691e-02  3.550e-02  -0.758  0.44846    
## month(date, label = TRUE)^8   5.238e-02  3.538e-02   1.481  0.13886    
## month(date, label = TRUE)^9   1.900e-01  3.548e-02   5.355 9.66e-08 ***
## month(date, label = TRUE)^10 -1.883e-01  3.547e-02  -5.309 1.24e-07 ***
## month(date, label = TRUE)^11  6.618e-03  3.547e-02   0.187  0.85201    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.439 on 1827 degrees of freedom
## Multiple R-squared:  0.3327, Adjusted R-squared:  0.3284 
## F-statistic: 75.92 on 12 and 1827 DF,  p-value: < 2.2e-16
## 
## ----
## 
## Summary for Group: Vermont---
## Call:
## stats::lm(formula = .formula, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.2085 -0.2319 -0.0374  0.1961  3.1346 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   7.032e+00  3.413e-02 206.022  < 2e-16 ***
## as.numeric(date)             -3.406e-05  2.457e-06 -13.859  < 2e-16 ***
## month(date, label = TRUE).L  -8.164e-02  3.162e-02  -2.582   0.0099 ** 
## month(date, label = TRUE).Q   7.176e-01  3.153e-02  22.757  < 2e-16 ***
## month(date, label = TRUE).C   5.306e-01  3.147e-02  16.863  < 2e-16 ***
## month(date, label = TRUE)^4   7.925e-02  3.157e-02   2.510   0.0122 *  
## month(date, label = TRUE)^5  -3.352e-01  3.184e-02 -10.529  < 2e-16 ***
## month(date, label = TRUE)^6   6.819e-03  3.193e-02   0.214   0.8309    
## month(date, label = TRUE)^7  -1.470e-01  3.168e-02  -4.638 3.76e-06 ***
## month(date, label = TRUE)^8  -1.085e-02  3.158e-02  -0.344   0.7311    
## month(date, label = TRUE)^9   1.978e-01  3.166e-02   6.245 5.24e-10 ***
## month(date, label = TRUE)^10 -1.728e-01  3.166e-02  -5.460 5.42e-08 ***
## month(date, label = TRUE)^11  7.008e-02  3.166e-02   2.213   0.0270 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3919 on 1827 degrees of freedom
## Multiple R-squared:  0.3974, Adjusted R-squared:  0.3935 
## F-statistic: 100.4 on 12 and 1827 DF,  p-value: < 2.2e-16
## 
## ----

Plotting Seasonality and Correlation

Correlation Plots

# Autocorrelation diagnostics for each state, with lags spanning one year.
claims_tbl %>%
  group_by(symbol) %>%
  plot_acf_diagnostics(.date_var = date, .value = claims, .lags = "1 year")

Seasonality

# Seasonal diagnostics of claims for each state.
claims_tbl %>%
  group_by(symbol) %>%
  plot_seasonal_diagnostics(.date_var = date, .value = claims)

STL Diagnostics

# STL decomposition per state, showing the observed, season, trend and
# remainder components; frequency and trend windows are chosen automatically.
claims_tbl %>%
    group_by(symbol) %>%
    plot_stl_diagnostics(
        .date_var    = date,
        .value       = claims,
        .feature_set = c("observed", "season", "trend", "remainder"),
        .frequency   = "auto",
        .trend       = "auto",
        .interactive = TRUE
    )

## frequency = 13 observations per 1 quarter

## trend = 53 observations per 1 year

## frequency = 13 observations per 1 quarter

## trend = 53 observations per 1 year

## frequency = 13 observations per 1 quarter

## trend = 53 observations per 1 year

## frequency = 13 observations per 1 quarter

## trend = 53 observations per 1 year

## frequency = 13 observations per 1 quarter

## trend = 53 observations per 1 year

## frequency = 13 observations per 1 quarter

## trend = 53 observations per 1 year

Time Series Data Wrangling

Summarize by Time

# Raw weekly claims per state, shown for comparison with the yearly summary.
claims_tbl %>%
  group_by(symbol) %>%
  plot_time_series(
    .date_var    = date,
    .value       = claims,
    .interactive = TRUE,
    .facet_ncol  = 2
  )

# Collapse each state's weekly series to one mean value per year and plot it.
# The summary column is named for what it holds; the tutorial's `adjusted`
# referred to adjusted stock prices and does not apply to claims.
claims_tbl %>%
  group_by(symbol) %>%
  summarise_by_time(.date_var = date, mean_claims = mean(claims), .by = "year") %>%
  plot_time_series(date, mean_claims, .facet_ncol = 2, .interactive = TRUE)

Filter By Time

# Restrict every state's series to the "2000" through "2022" window, then plot.
claims_tbl %>%
  group_by(symbol) %>%
  filter_by_time(date, .start_date = "2000", .end_date = "2022") %>%
  plot_time_series(.date_var = date, .value = claims, .facet_ncol = 2)

Padding Data

# Expand each state's series to daily rows; the days that were inserted get
# a claims value of 0.
claims_tbl %>%
  group_by(symbol) %>%
  pad_by_time(.date_var = date, .by = "day", .pad_value = 0)

## # A tibble: 77,244 × 3
## # Groups:   symbol [6]
##    symbol      date       claims
##    <fct>       <date>      <int>
##  1 Connecticut 1989-01-07   8345
##  2 Connecticut 1989-01-08      0
##  3 Connecticut 1989-01-09      0
##  4 Connecticut 1989-01-10      0
##  5 Connecticut 1989-01-11      0
##  6 Connecticut 1989-01-12      0
##  7 Connecticut 1989-01-13      0
##  8 Connecticut 1989-01-14   6503
##  9 Connecticut 1989-01-15      0
## 10 Connecticut 1989-01-16      0
## # ℹ 77,234 more rows

Sliding (Rolling) Calculations

# Two-observation trailing mean over the first ten rows. With
# `.partial = TRUE` the first row is averaged over the single value available
# instead of being returned as NA.
claims_tbl %>%
  slice_head(n = 10) %>%
  mutate(
    rolling_avg_2 = slidify_vec(
      .x       = claims,
      .f       = mean,
      .period  = 2,
      .align   = "right",
      .partial = TRUE
    )
  )

## # A tibble: 10 × 4
##    symbol      date       claims rolling_avg_2
##    <fct>       <date>      <int>         <dbl>
##  1 Connecticut 1989-01-07   8345         8345 
##  2 Connecticut 1989-01-14   6503         7424 
##  3 Connecticut 1989-01-21   3821         5162 
##  4 Connecticut 1989-01-28   4663         4242 
##  5 Connecticut 1989-02-04   4162         4412.
##  6 Connecticut 1989-02-11   4337         4250.
##  7 Connecticut 1989-02-18   4079         4208 
##  8 Connecticut 1989-02-25   3556         3818.
##  9 Connecticut 1989-03-04   3826         3691 
## 10 Connecticut 1989-03-11   3515         3670.

# Rolling regressions are easy to implement using `.unlist = FALSE`, which
# makes the slidified function return a list column (one fitted `lm` per
# window) instead of an atomic vector.
# `lm_roll(y, x)` fits y ~ x over a right-aligned window of 90 rows.
lm_roll <- slidify(~ lm(..1 ~ ..2),
                   .period = 90,
                   .unlist = FALSE,
                   .align  = "right")



claims_tbl %>%
  select(symbol, date, claims) %>%
  group_by(symbol) %>%
  # Time trend as days since 1970-01-01. It must come from `date`:
  # as.numeric(symbol) is the factor code, which is constant within a state
  # and therefore cannot be estimated alongside the intercept.
  mutate(numeric_date = as.numeric(date)) %>%
  # Apply rolling regression of claims on the time trend
  mutate(rolling_lm = lm_roll(claims, numeric_date)) %>%
  # The first 89 rows of each state have no complete window
  filter(!is.na(rolling_lm))

## # A tibble: 10,506 × 5
## # Groups:   symbol [6]
##    symbol      date       claims numeric_date rolling_lm
##    <fct>       <date>      <int>        <dbl> <list>    
##  1 Connecticut 1990-09-22   3927         7569 <lm>      
##  2 Connecticut 1990-09-29   4471         7576 <lm>      
##  3 Connecticut 1990-10-06   4430         7583 <lm>      
##  4 Connecticut 1990-10-13   4494         7590 <lm>      
##  5 Connecticut 1990-10-20   4894         7597 <lm>      
##  6 Connecticut 1990-10-27   4653         7604 <lm>      
##  7 Connecticut 1990-11-03   4719         7611 <lm>      
##  8 Connecticut 1990-11-10   5347         7618 <lm>      
##  9 Connecticut 1990-11-17   4824         7625 <lm>      
## 10 Connecticut 1990-11-24   5367         7632 <lm>      
## # ℹ 10,496 more rows

Apply 10

Spencer Murrin

Plotting time series

Box plots

Regression plots

Plotting Seasonality and Correlation

Correlation Plots

Seasonality

STL Diagnostics

Time Series Data Wrangling

Summarize by Time

Filter By Time

Padding Data

Sliding (Rolling) Calculations