Apply 11

# for core packages
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

# for financial analysis
library(tidyquant)

## Warning: package 'tidyquant' was built under R version 4.3.3

## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo 
## ── Attaching core tidyquant packages ──────────────────────── tidyquant 1.0.9 ──
## ✔ PerformanceAnalytics 2.0.4      ✔ TTR                  0.24.4
## ✔ quantmod             0.4.25     ✔ xts                  0.13.2── Conflicts ────────────────────────────────────────── tidyquant_conflicts() ──
## ✖ zoo::as.Date()                 masks base::as.Date()
## ✖ zoo::as.Date.numeric()         masks base::as.Date.numeric()
## ✖ dplyr::filter()                masks stats::filter()
## ✖ xts::first()                   masks dplyr::first()
## ✖ dplyr::lag()                   masks stats::lag()
## ✖ xts::last()                    masks dplyr::last()
## ✖ PerformanceAnalytics::legend() masks graphics::legend()
## ✖ quantmod::summary()            masks base::summary()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

# for time series
library(timetk)

## 
## Attaching package: 'timetk'
## 
## The following object is masked from 'package:tidyquant':
## 
##     FANG

Goal: Apply Matt Dancho’s tutorial to state unemployment initial claims of New England states.

The following is the replication of Matt Dancho’s tutorial on this page

# Earliest date requested from the data source.
start_date <- "1989-01-01"

# Display name -> initial-claims series ID, one entry per New England state.
state_series <- c(
  "Connecticut"   = "CTICLAIMS",
  "Maine"         = "MEICLAIMS",
  "Massachusetts" = "MAICLAIMS",
  "New Hampshire" = "NHICLAIMS",
  "Rhode Island"  = "RIICLAIMS",
  "Vermont"       = "VTICLAIMS"
)

# Bare series IDs (names dropped) in the form passed to tq_get().
symbols_txt <- unname(state_series)

# Download all six series, relabel each series ID with its state name, and
# rename the value column from `price` to `claims`.
claims_tbl <- tq_get(symbols_txt, get = "economic.data", from = start_date) %>%
  mutate(symbol = fct_recode(symbol, !!!state_series)) %>%
  rename(claims = price)

Plotting time series

# Print the table to inspect its columns (symbol, date, claims) and first rows.
claims_tbl

## # A tibble: 11,220 × 3
##    symbol      date       claims
##    <fct>       <date>      <int>
##  1 Connecticut 1989-01-07   8345
##  2 Connecticut 1989-01-14   6503
##  3 Connecticut 1989-01-21   3821
##  4 Connecticut 1989-01-28   4663
##  5 Connecticut 1989-02-04   4162
##  6 Connecticut 1989-02-11   4337
##  7 Connecticut 1989-02-18   4079
##  8 Connecticut 1989-02-25   3556
##  9 Connecticut 1989-03-04   3826
## 10 Connecticut 1989-03-11   3515
## # ℹ 11,210 more rows

# Plot claims against date. No grouping is applied, so the rows of every state
# are handed to the plot together as one series.
claims_tbl %>% plot_time_series(date, claims)

# Number of observations per state.
count(claims_tbl, symbol)

## # A tibble: 6 × 2
##   symbol            n
##   <fct>         <int>
## 1 Connecticut    1870
## 2 Massachusetts  1870
## 3 Maine          1870
## 4 New Hampshire  1870
## 5 Rhode Island   1870
## 6 Vermont        1870

# One facet per state, arranged in two columns with independent axes;
# `.interactive = FALSE` requests a static plot instead of an interactive one.
claims_tbl %>%
  group_by(symbol) %>%
  plot_time_series(
    date, claims,
    .interactive  = FALSE,
    .facet_ncol   = 2,
    .facet_scales = "free"
  )

Visualizing Transformations & Sub-Groups

# Number of observations per state.
count(claims_tbl, symbol)

## # A tibble: 6 × 2
##   symbol            n
##   <fct>         <int>
## 1 Connecticut    1870
## 2 Massachusetts  1870
## 3 Maine          1870
## 4 New Hampshire  1870
## 5 Rhode Island   1870
## 6 Vermont        1870

# Per-state facets of log-transformed claims, colored by calendar year, with
# the smoother line switched off.
claims_tbl %>%
  group_by(symbol) %>%
  plot_time_series(
    date, log(claims),
    .color_var    = year(date),
    .smooth       = FALSE,
    .facet_ncol   = 2,
    .facet_scales = "free"
  )

Static ggplot2 Visualizations & Customizations

# Static, customized plot of claims against date, colored by month.
# The observations are spaced one week apart, so the title describes the
# series as weekly; the month only drives the line color.
claims_tbl %>%
    plot_time_series(date, claims, 
                     .color_var = month(date, label = TRUE),
                     
                     # Returns static ggplot
                     .interactive = FALSE, 
                     
                     # No smoother line
                     .smooth = FALSE,
                     
                     # Customize
                     .title = "Weekly Initial Unemployment Claims in New England States", 
                     .x_lab = "Date", 
                     .y_lab = "Unemployment Claims", 
                     .color_lab = "Month")

Box plots

# Number of observations per state.
count(claims_tbl, symbol)

## # A tibble: 6 × 2
##   symbol            n
##   <fct>         <int>
## 1 Connecticut    1870
## 2 Massachusetts  1870
## 3 Maine          1870
## 4 New Hampshire  1870
## 5 Rhode Island   1870
## 6 Vermont        1870

# Yearly box plots of claims per state for observations up to the end of 1990.
# With a start date of 1989 and a one-year period, each state has only two
# boxes. That is too few points for the default smoother (it raised
# "span too small" warnings from loess), so the smoother is disabled.
claims_tbl %>%
    filter_by_time(.date_var = date, .end_date = "1990") %>%
    group_by(symbol) %>%
    plot_time_series_boxplot(.date_var = date, 
                             .value = claims, 
                             .period = "1 year", 
                             .facet_ncol = 2,
                             .smooth = FALSE)

## Warning: There were 30 warnings in `dplyr::mutate()`.
## The first warning was:
## ℹ In argument: `.value_smooth = auto_smooth(...)`.
## ℹ In group 1: `symbol = Connecticut`.
## Caused by warning in `simpleLoess()`:
## ! span too small.   fewer data values than degrees of freedom.
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 29 remaining warnings.

Regression plots

# Per-state linear regression of log(claims) on a numeric time trend plus the
# month of the year; `.show_summary = TRUE` also prints each model summary.
claims_tbl %>%
  group_by(symbol) %>%
  plot_time_series_regression(
    date,
    .formula = log(claims) ~ as.numeric(date) + month(date, label = TRUE),
    .show_summary = TRUE
  )

## 
## Summary for Group: Connecticut---
## Call:
## stats::lm(formula = .formula, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.8412 -0.2155 -0.0309  0.1774  3.1912 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   8.589e+00  2.888e-02 297.344  < 2e-16 ***
## as.numeric(date)             -1.694e-05  2.062e-06  -8.212 4.02e-16 ***
## month(date, label = TRUE).L  -2.949e-01  2.711e-02 -10.879  < 2e-16 ***
## month(date, label = TRUE).Q   4.443e-01  2.696e-02  16.481  < 2e-16 ***
## month(date, label = TRUE).C   6.112e-02  2.689e-02   2.273 0.023128 *  
## month(date, label = TRUE)^4   4.911e-01  2.699e-02  18.199  < 2e-16 ***
## month(date, label = TRUE)^5  -2.063e-02  2.720e-02  -0.758 0.448327    
## month(date, label = TRUE)^6  -2.853e-02  2.727e-02  -1.046 0.295512    
## month(date, label = TRUE)^7  -1.251e-01  2.703e-02  -4.628 3.95e-06 ***
## month(date, label = TRUE)^8   5.630e-02  2.690e-02   2.093 0.036503 *  
## month(date, label = TRUE)^9   1.911e-01  2.693e-02   7.095 1.83e-12 ***
## month(date, label = TRUE)^10 -9.609e-02  2.687e-02  -3.576 0.000358 ***
## month(date, label = TRUE)^11  1.266e-02  2.684e-02   0.472 0.637302    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3369 on 1857 degrees of freedom
## Multiple R-squared:  0.3257, Adjusted R-squared:  0.3213 
## F-statistic: 74.75 on 12 and 1857 DF,  p-value: < 2.2e-16
## 
## ----
## 
## Summary for Group: Massachusetts---
## Call:
## stats::lm(formula = .formula, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.5930 -0.2217 -0.0511  0.1769  3.3826 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   9.195e+00  3.527e-02 260.722  < 2e-16 ***
## as.numeric(date)             -2.100e-05  2.518e-06  -8.341  < 2e-16 ***
## month(date, label = TRUE).L  -7.636e-02  3.310e-02  -2.307 0.021160 *  
## month(date, label = TRUE).Q   5.939e-01  3.292e-02  18.043  < 2e-16 ***
## month(date, label = TRUE).C   1.881e-01  3.283e-02   5.731 1.16e-08 ***
## month(date, label = TRUE)^4   2.877e-01  3.295e-02   8.730  < 2e-16 ***
## month(date, label = TRUE)^5  -5.053e-02  3.321e-02  -1.522 0.128257    
## month(date, label = TRUE)^6  -1.118e-01  3.330e-02  -3.359 0.000798 ***
## month(date, label = TRUE)^7  -9.170e-03  3.300e-02  -0.278 0.781156    
## month(date, label = TRUE)^8   3.272e-02  3.285e-02   0.996 0.319355    
## month(date, label = TRUE)^9   9.814e-02  3.289e-02   2.984 0.002880 ** 
## month(date, label = TRUE)^10 -9.029e-02  3.281e-02  -2.752 0.005988 ** 
## month(date, label = TRUE)^11  1.475e-02  3.277e-02   0.450 0.652701    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4114 on 1857 degrees of freedom
## Multiple R-squared:  0.2272, Adjusted R-squared:  0.2222 
## F-statistic: 45.49 on 12 and 1857 DF,  p-value: < 2.2e-16
## 
## ----
## 
## Summary for Group: Maine---
## Call:
## stats::lm(formula = .formula, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.8236 -0.2487 -0.0647  0.1923  3.4536 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   8.409e+00  3.331e-02 252.451  < 2e-16 ***
## as.numeric(date)             -8.776e-05  2.378e-06 -36.901  < 2e-16 ***
## month(date, label = TRUE).L  -2.185e-01  3.126e-02  -6.992 3.78e-12 ***
## month(date, label = TRUE).Q   8.238e-01  3.109e-02  26.501  < 2e-16 ***
## month(date, label = TRUE).C   2.224e-01  3.101e-02   7.175 1.04e-12 ***
## month(date, label = TRUE)^4   2.187e-01  3.112e-02   7.027 2.96e-12 ***
## month(date, label = TRUE)^5  -2.066e-01  3.136e-02  -6.586 5.87e-11 ***
## month(date, label = TRUE)^6  -1.892e-02  3.145e-02  -0.602   0.5474    
## month(date, label = TRUE)^7  -1.336e-01  3.117e-02  -4.285 1.92e-05 ***
## month(date, label = TRUE)^8   5.614e-02  3.103e-02   1.809   0.0705 .  
## month(date, label = TRUE)^9   1.262e-01  3.106e-02   4.063 5.04e-05 ***
## month(date, label = TRUE)^10 -7.236e-02  3.099e-02  -2.335   0.0196 *  
## month(date, label = TRUE)^11 -5.242e-02  3.095e-02  -1.694   0.0905 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3885 on 1857 degrees of freedom
## Multiple R-squared:  0.5574, Adjusted R-squared:  0.5545 
## F-statistic: 194.9 on 12 and 1857 DF,  p-value: < 2.2e-16
## 
## ----
## 
## Summary for Group: New Hampshire---
## Call:
## stats::lm(formula = .formula, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.3077 -0.3652 -0.0452  0.2823  3.7632 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   7.164e+00  4.705e-02 152.255  < 2e-16 ***
## as.numeric(date)             -2.797e-05  3.359e-06  -8.326  < 2e-16 ***
## month(date, label = TRUE).L  -2.173e-01  4.416e-02  -4.922 9.31e-07 ***
## month(date, label = TRUE).Q   4.608e-01  4.391e-02  10.493  < 2e-16 ***
## month(date, label = TRUE).C   2.326e-01  4.380e-02   5.311 1.22e-07 ***
## month(date, label = TRUE)^4   3.008e-01  4.396e-02   6.843 1.05e-11 ***
## month(date, label = TRUE)^5  -4.534e-02  4.431e-02  -1.023  0.30624    
## month(date, label = TRUE)^6  -2.115e-03  4.442e-02  -0.048  0.96203    
## month(date, label = TRUE)^7  -1.429e-01  4.403e-02  -3.245  0.00119 ** 
## month(date, label = TRUE)^8   7.758e-02  4.383e-02   1.770  0.07685 .  
## month(date, label = TRUE)^9   1.832e-01  4.388e-02   4.176 3.11e-05 ***
## month(date, label = TRUE)^10 -1.315e-01  4.378e-02  -3.004  0.00270 ** 
## month(date, label = TRUE)^11  6.228e-03  4.372e-02   0.142  0.88673    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5488 on 1857 degrees of freedom
## Multiple R-squared:  0.1487, Adjusted R-squared:  0.1432 
## F-statistic: 27.03 on 12 and 1857 DF,  p-value: < 2.2e-16
## 
## ----
## 
## Summary for Group: Rhode Island---
## Call:
## stats::lm(formula = .formula, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.0324 -0.2652 -0.0656  0.1801  3.4025 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   8.077e+00  3.756e-02 215.065  < 2e-16 ***
## as.numeric(date)             -4.976e-05  2.682e-06 -18.558  < 2e-16 ***
## month(date, label = TRUE).L  -3.434e-01  3.525e-02  -9.743  < 2e-16 ***
## month(date, label = TRUE).Q   5.739e-01  3.505e-02  16.372  < 2e-16 ***
## month(date, label = TRUE).C   1.852e-01  3.496e-02   5.297 1.31e-07 ***
## month(date, label = TRUE)^4   4.421e-01  3.509e-02  12.599  < 2e-16 ***
## month(date, label = TRUE)^5   5.909e-02  3.536e-02   1.671 0.094934 .  
## month(date, label = TRUE)^6  -1.204e-01  3.546e-02  -3.397 0.000696 ***
## month(date, label = TRUE)^7  -2.913e-02  3.515e-02  -0.829 0.407237    
## month(date, label = TRUE)^8   5.367e-02  3.498e-02   1.534 0.125128    
## month(date, label = TRUE)^9   1.875e-01  3.502e-02   5.354 9.65e-08 ***
## month(date, label = TRUE)^10 -1.930e-01  3.494e-02  -5.522 3.82e-08 ***
## month(date, label = TRUE)^11  9.841e-03  3.490e-02   0.282 0.777962    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4381 on 1857 degrees of freedom
## Multiple R-squared:  0.3466, Adjusted R-squared:  0.3424 
## F-statistic:  82.1 on 12 and 1857 DF,  p-value: < 2.2e-16
## 
## ----
## 
## Summary for Group: Vermont---
## Call:
## stats::lm(formula = .formula, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.17595 -0.23458 -0.04234  0.20191  3.15309 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   7.074e+00  3.389e-02 208.704  < 2e-16 ***
## as.numeric(date)             -3.778e-05  2.420e-06 -15.614  < 2e-16 ***
## month(date, label = TRUE).L  -8.930e-02  3.181e-02  -2.808  0.00504 ** 
## month(date, label = TRUE).Q   7.344e-01  3.163e-02  23.218  < 2e-16 ***
## month(date, label = TRUE).C   5.382e-01  3.155e-02  17.061  < 2e-16 ***
## month(date, label = TRUE)^4   8.014e-02  3.167e-02   2.531  0.01146 *  
## month(date, label = TRUE)^5  -3.326e-01  3.191e-02 -10.423  < 2e-16 ***
## month(date, label = TRUE)^6   2.187e-03  3.200e-02   0.068  0.94550    
## month(date, label = TRUE)^7  -1.502e-01  3.172e-02  -4.736 2.35e-06 ***
## month(date, label = TRUE)^8  -9.043e-03  3.157e-02  -0.286  0.77456    
## month(date, label = TRUE)^9   1.936e-01  3.160e-02   6.126 1.10e-09 ***
## month(date, label = TRUE)^10 -1.710e-01  3.153e-02  -5.422 6.67e-08 ***
## month(date, label = TRUE)^11  7.101e-02  3.149e-02   2.255  0.02425 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3953 on 1857 degrees of freedom
## Multiple R-squared:  0.4123, Adjusted R-squared:  0.4085 
## F-statistic: 108.6 on 12 and 1857 DF,  p-value: < 2.2e-16
## 
## ----

Plotting Seasonality and Correlation

Correlation Plots

# Autocorrelation diagnostics per state, with lags covering one year.
claims_tbl %>%
  group_by(symbol) %>%
  plot_acf_diagnostics(.date_var = date, .value = claims, .lags = "1 year")

Seasonality

# Seasonal diagnostics over the whole table; no grouping is applied, so all
# states are pooled together.
claims_tbl %>% plot_seasonal_diagnostics(.date_var = date, .value = claims)

# How often each distinct claims value occurs.
count(claims_tbl, claims)

## # A tibble: 5,373 × 2
##    claims     n
##     <int> <int>
##  1    152     1
##  2    154     1
##  3    181     1
##  4    184     2
##  5    187     1
##  6    188     1
##  7    189     1
##  8    200     3
##  9    201     1
## 10    203     1
## # ℹ 5,363 more rows

# The states are split across two graphs for better visualization.
#
# `%in%` is used for the state filter on purpose: `symbol == c(a, b, c)`
# recycles the three names down the rows, so each row is compared with only
# one of them and most rows of the wanted states are silently dropped.

# Quarterly seasonality: Connecticut, Massachusetts, Maine.
claims_tbl %>%
    filter(symbol %in% c("Connecticut", "Massachusetts", "Maine")) %>%
    group_by(symbol) %>%
    plot_seasonal_diagnostics(date, claims, 
                              .feature_set = "quarter")

# Quarterly seasonality: New Hampshire, Rhode Island, Vermont.
claims_tbl %>%
    filter(symbol %in% c("New Hampshire", "Rhode Island", "Vermont")) %>%
    group_by(symbol) %>%
    plot_seasonal_diagnostics(date, claims, 
                              .feature_set = "quarter")

STL Diagnostics

# Split again for better visualization: Connecticut, Massachusetts, Maine.
# `%in%` keeps every row of the three states; `==` against a length-3 vector
# would recycle the names down the rows and keep only a fraction of each
# series, which distorts the frequency and trend detected by the STL step.
claims_tbl %>%
    filter(symbol %in% c("Connecticut", "Massachusetts", "Maine")) %>%
    group_by(symbol) %>%
    plot_stl_diagnostics(date, claims, 
                         .feature_set = c("observed", "season", "trend", "seasadj"))

## frequency = 17 observations per 1 year

## trend = 87 observations per 5 years

## frequency = 17 observations per 1 year

## trend = 87 observations per 5 years

## frequency = 17 observations per 1 year

## trend = 87 observations per 5 years

# STL diagnostics for New Hampshire, Rhode Island and Vermont.
# `%in%` keeps every row of the three states; `==` against a length-3 vector
# would recycle the names down the rows and keep only a fraction of each
# series, which distorts the frequency and trend detected by the STL step.
claims_tbl %>%
    filter(symbol %in% c("New Hampshire", "Rhode Island", "Vermont")) %>%
    group_by(symbol) %>%
    plot_stl_diagnostics(date, claims, 
                         .feature_set = c("observed", "season", "trend", "seasadj"))

## frequency = 17 observations per 1 year
## trend = 87 observations per 5 years

## frequency = 17 observations per 1 year

## trend = 87 observations per 5 years

## frequency = 17 observations per 1 year

## trend = 87 observations per 5 years

Time Series Data Wrangling

Summarize by Time

# Average claims per state within each quarter, then plot the quarterly means
# as a static, two-column faceted chart.
claims_tbl %>%
  group_by(symbol) %>%
  summarise_by_time(
    .date_var = date,
    .by       = "quarter",
    volume    = mean(claims)
  ) %>%
  plot_time_series(
    .date_var    = date,
    .value       = volume,
    .interactive = FALSE,
    .facet_ncol  = 2
  )

Filter By Time

# Restrict each state's series to the window given by the start and end dates,
# then plot the result in two facet columns.
claims_tbl %>%
  group_by(symbol) %>%
  filter_by_time(date, .start_date = "2000-01", .end_date = "2003-12") %>%
  plot_time_series(.date_var = date, .value = claims, .facet_ncol = 2)

Padding Data

# Expand each state's series to one row per day; inserted rows take their
# claims value from the preceding observation (fill direction "down").
claims_tbl %>%
  group_by(symbol) %>%
  pad_by_time(.date_var = date, .fill_na_direction = "down", .by = "day")

## # A tibble: 78,504 × 3
## # Groups:   symbol [6]
##    symbol      date       claims
##    <fct>       <date>      <int>
##  1 Connecticut 1989-01-07   8345
##  2 Connecticut 1989-01-08   8345
##  3 Connecticut 1989-01-09   8345
##  4 Connecticut 1989-01-10   8345
##  5 Connecticut 1989-01-11   8345
##  6 Connecticut 1989-01-12   8345
##  7 Connecticut 1989-01-13   8345
##  8 Connecticut 1989-01-14   6503
##  9 Connecticut 1989-01-15   6503
## 10 Connecticut 1989-01-16   6503
## # ℹ 78,494 more rows

Sliding (Rolling) Calculations

# Centered 5-observation rolling mean over the first ten rows of the table.
# `.partial = TRUE` lets the windows at either end use fewer observations
# instead of returning NA.
claims_tbl %>%
  head(10) %>%
  mutate(
    rolling_avg_2 = slidify_vec(
      .x       = claims,
      .f       = mean,
      .period  = 5,
      .align   = "center",
      .partial = TRUE
    )
  )

## # A tibble: 10 × 4
##    symbol      date       claims rolling_avg_2
##    <fct>       <date>      <int>         <dbl>
##  1 Connecticut 1989-01-07   8345         6223 
##  2 Connecticut 1989-01-14   6503         5833 
##  3 Connecticut 1989-01-21   3821         5499.
##  4 Connecticut 1989-01-28   4663         4697.
##  5 Connecticut 1989-02-04   4162         4212.
##  6 Connecticut 1989-02-11   4337         4159.
##  7 Connecticut 1989-02-18   4079         3992 
##  8 Connecticut 1989-02-25   3556         3863.
##  9 Connecticut 1989-03-04   3826         3744 
## 10 Connecticut 1989-03-11   3515         3632.

# Rolling regression helper: regresses its second argument on its first over
# a right-aligned window of three observations. `.unlist = FALSE` keeps each
# fitted model as a list element; `.partial = TRUE` also fits the shorter
# windows at the start of a series.
lm_roll <- slidify(
  .f       = ~ lm(..2 ~ ..1),
  .period  = 3,
  .align   = "right",
  .unlist  = FALSE,
  .partial = TRUE
)

# Fit the rolling regression of claims on the numeric date within each state,
# then drop rows whose model entry is NA.
claims_tbl %>%
  group_by(symbol) %>%
  mutate(
    numeric_date = as.numeric(date),
    rolling_lm   = lm_roll(numeric_date, claims)
  ) %>%
  filter(!is.na(rolling_lm))

## # A tibble: 11,220 × 5
## # Groups:   symbol [6]
##    symbol      date       claims numeric_date rolling_lm
##    <fct>       <date>      <int>        <dbl> <list>    
##  1 Connecticut 1989-01-07   8345         6946 <lm>      
##  2 Connecticut 1989-01-14   6503         6953 <lm>      
##  3 Connecticut 1989-01-21   3821         6960 <lm>      
##  4 Connecticut 1989-01-28   4663         6967 <lm>      
##  5 Connecticut 1989-02-04   4162         6974 <lm>      
##  6 Connecticut 1989-02-11   4337         6981 <lm>      
##  7 Connecticut 1989-02-18   4079         6988 <lm>      
##  8 Connecticut 1989-02-25   3556         6995 <lm>      
##  9 Connecticut 1989-03-04   3826         7002 <lm>      
## 10 Connecticut 1989-03-11   3515         7009 <lm>      
## # ℹ 11,210 more rows