# for Core packages
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# for financial analysis
library(tidyquant)
## Loading required package: PerformanceAnalytics
## Loading required package: xts
## Loading required package: zoo
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
##
## ######################### Warning from 'xts' package ##########################
## # #
## # The dplyr lag() function breaks how base R's lag() function is supposed to #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or #
## # source() into this session won't work correctly. #
## # #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop #
## # dplyr from breaking base R's lag() function. #
## # #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning. #
## # #
## ###############################################################################
##
## Attaching package: 'xts'
##
## The following objects are masked from 'package:dplyr':
##
## first, last
##
##
## Attaching package: 'PerformanceAnalytics'
##
## The following object is masked from 'package:graphics':
##
## legend
##
## Loading required package: quantmod
## Loading required package: TTR
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
# for times series
library(timetk)
library(umap)
Goal: Apply Matt Dancho’s tutorial to the weekly initial unemployment claims of the New England states.
The following is a replication of Matt Dancho’s tutorial on this page.
# Earliest observation date to request.
start_date <- "1989-01-01"

# Series IDs for initial unemployment claims, one per New England state.
symbols_txt <- c(
  "CTICLAIMS", # Connecticut
  "MEICLAIMS", # Maine
  "MAICLAIMS", # Massachusetts
  "NHICLAIMS", # New Hampshire
  "RIICLAIMS", # Rhode Island
  "VTICLAIMS"  # Vermont
)

# Download every series in long format, give the value column a descriptive
# name, and replace the series IDs with readable state names.
claims_tbl <- tq_get(symbols_txt, get = "economic.data", from = start_date) %>%
  rename(claims = price) %>%
  mutate(
    symbol = fct_recode(
      symbol,
      "Connecticut"   = "CTICLAIMS",
      "Maine"         = "MEICLAIMS",
      "Massachusetts" = "MAICLAIMS",
      "New Hampshire" = "NHICLAIMS",
      "Rhode Island"  = "RIICLAIMS",
      "Vermont"       = "VTICLAIMS"
    )
  )

# Print a preview of the combined table.
claims_tbl
## # A tibble: 11,040 × 3
## symbol date claims
## <fct> <date> <int>
## 1 Connecticut 1989-01-07 8345
## 2 Connecticut 1989-01-14 6503
## 3 Connecticut 1989-01-21 3821
## 4 Connecticut 1989-01-28 4663
## 5 Connecticut 1989-02-04 4162
## 6 Connecticut 1989-02-11 4337
## 7 Connecticut 1989-02-18 4079
## 8 Connecticut 1989-02-25 3556
## 9 Connecticut 1989-03-04 3826
## 10 Connecticut 1989-03-11 3515
## # ℹ 11,030 more rows
# Overlay all six states on a single chart. `claims_tbl` stacks six series
# that share the same dates, so each state needs its own colour; otherwise
# observations from different states are drawn as if they were one series.
claims_tbl %>%
  plot_time_series(
    .date_var  = date,
    .value     = claims,
    .color_var = symbol
  )
# List the series held in the table and the number of observations in each.
# Count the identifier (`symbol`), not the measurement (`claims`): counting
# the measurement only tabulates thousands of near-unique claim values.
claims_tbl %>% count(symbol)
## # A tibble: 6 × 2
## symbol n
## <fct> <int>
## 1 Connecticut 1840
## 2 Massachusetts 1840
## 3 Maine 1840
## 4 New Hampshire 1840
## 5 Rhode Island 1840
## 6 Vermont 1840
# One static panel per state, two panels per row, each with free axis scales.
claims_tbl %>%
  group_by(symbol) %>%
  plot_time_series(
    date, claims,
    .interactive  = FALSE,
    .facet_scales = "free",
    .facet_ncol   = 2
  )
# Observations per state. Count the identifier (`symbol`), not the
# measurement (`claims`), whose values are almost all distinct.
claims_tbl %>% count(symbol)
## # A tibble: 6 × 2
## symbol n
## <fct> <int>
## 1 Connecticut 1840
## 2 Massachusetts 1840
## 3 Maine 1840
## 4 New Hampshire 1840
## 5 Rhode Island 1840
## 6 Vermont 1840
# Faceted view of log claims per state, coloured by the week number of each
# observation.
claims_tbl %>%
  group_by(symbol) %>%
  plot_time_series(
    date, log(claims),
    .color_var    = week(date),
    .facet_scales = "free",
    .facet_ncol   = 2
  )
# Static, fully labelled plot coloured by calendar month. The table holds six
# state series on the same dates, so group by state to give each series its
# own panel instead of drawing all of them on top of one another.
claims_tbl %>%
  group_by(symbol) %>%
  plot_time_series(
    date, claims,
    .color_var   = month(date, label = TRUE),
    .facet_ncol  = 2,
    # Returns static ggplot
    .interactive = FALSE,
    .title       = "State Unemployment",
    .x_lab       = "Timeline",
    .y_lab       = "Claims",
    .color_lab   = "Month"
  )
# Observations per state. Count the identifier (`symbol`), not the
# measurement (`claims`), whose values are almost all distinct.
claims_tbl %>% count(symbol)
## # A tibble: 6 × 2
## symbol n
## <fct> <int>
## 1 Connecticut 1840
## 2 Massachusetts 1840
## 3 Maine 1840
## 4 New Hampshire 1840
## 5 Rhode Island 1840
## 6 Vermont 1840
# Yearly box plots of weekly claims, one panel per state. The grouping is
# what produces the panels: without it `.facet_ncol` has nothing to arrange
# and all six states are pooled into the same boxes.
claims_tbl %>%
  group_by(symbol) %>%
  plot_time_series_boxplot(
    .date_var   = date,
    .value      = claims,
    .period     = "1 year",
    .facet_ncol = 2
  )
# Per-state linear model of log claims on a numeric time trend plus a
# month-of-year factor; `.show_summary = TRUE` prints each model summary.
claims_tbl %>%
  group_by(symbol) %>%
  plot_time_series_regression(
    date,
    .formula = log(claims) ~ as.numeric(date) + month(date, label = TRUE),
    .show_summary = TRUE,
    .facet_ncol = 2
  )
##
## Summary for Group: Connecticut---
## Call:
## stats::lm(formula = .formula, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.8490 -0.2149 -0.0304 0.1805 3.1830
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.574e+00 2.944e-02 291.273 < 2e-16 ***
## as.numeric(date) -1.565e-05 2.119e-06 -7.383 2.34e-13 ***
## month(date, label = TRUE).L -2.918e-01 2.727e-02 -10.699 < 2e-16 ***
## month(date, label = TRUE).Q 4.386e-01 2.720e-02 16.128 < 2e-16 ***
## month(date, label = TRUE).C 5.819e-02 2.714e-02 2.144 0.032148 *
## month(date, label = TRUE)^4 4.908e-01 2.723e-02 18.025 < 2e-16 ***
## month(date, label = TRUE)^5 -2.213e-02 2.746e-02 -0.806 0.420301
## month(date, label = TRUE)^6 -2.775e-02 2.754e-02 -1.008 0.313777
## month(date, label = TRUE)^7 -1.231e-01 2.732e-02 -4.504 7.09e-06 ***
## month(date, label = TRUE)^8 5.651e-02 2.723e-02 2.075 0.038121 *
## month(date, label = TRUE)^9 1.934e-01 2.731e-02 7.082 2.02e-12 ***
## month(date, label = TRUE)^10 -9.414e-02 2.730e-02 -3.448 0.000577 ***
## month(date, label = TRUE)^11 8.424e-03 2.731e-02 0.309 0.757723
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.338 on 1827 degrees of freedom
## Multiple R-squared: 0.3189, Adjusted R-squared: 0.3144
## F-statistic: 71.28 on 12 and 1827 DF, p-value: < 2.2e-16
##
## ----
##
## Summary for Group: Massachusetts---
## Call:
## stats::lm(formula = .formula, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.5882 -0.2224 -0.0517 0.1774 3.3823
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.195e+00 3.603e-02 255.166 < 2e-16 ***
## as.numeric(date) -2.093e-05 2.594e-06 -8.068 1.28e-15 ***
## month(date, label = TRUE).L -7.800e-02 3.338e-02 -2.336 0.019582 *
## month(date, label = TRUE).Q 5.944e-01 3.329e-02 17.855 < 2e-16 ***
## month(date, label = TRUE).C 1.926e-01 3.322e-02 5.797 7.94e-09 ***
## month(date, label = TRUE)^4 2.867e-01 3.333e-02 8.601 < 2e-16 ***
## month(date, label = TRUE)^5 -5.649e-02 3.361e-02 -1.681 0.092981 .
## month(date, label = TRUE)^6 -1.125e-01 3.371e-02 -3.336 0.000867 ***
## month(date, label = TRUE)^7 -6.434e-03 3.345e-02 -0.192 0.847492
## month(date, label = TRUE)^8 3.316e-02 3.334e-02 0.995 0.319976
## month(date, label = TRUE)^9 9.952e-02 3.343e-02 2.977 0.002948 **
## month(date, label = TRUE)^10 -8.996e-02 3.342e-02 -2.692 0.007178 **
## month(date, label = TRUE)^11 1.247e-02 3.343e-02 0.373 0.709246
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4137 on 1827 degrees of freedom
## Multiple R-squared: 0.2243, Adjusted R-squared: 0.2192
## F-statistic: 44.02 on 12 and 1827 DF, p-value: < 2.2e-16
##
## ----
##
## Summary for Group: Maine---
## Call:
## stats::lm(formula = .formula, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.8386 -0.2423 -0.0621 0.1923 3.4344
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.381e+00 3.389e-02 247.329 < 2e-16 ***
## as.numeric(date) -8.527e-05 2.440e-06 -34.950 < 2e-16 ***
## month(date, label = TRUE).L -2.159e-01 3.139e-02 -6.878 8.32e-12 ***
## month(date, label = TRUE).Q 8.101e-01 3.131e-02 25.874 < 2e-16 ***
## month(date, label = TRUE).C 2.198e-01 3.124e-02 7.034 2.82e-12 ***
## month(date, label = TRUE)^4 2.225e-01 3.135e-02 7.099 1.79e-12 ***
## month(date, label = TRUE)^5 -2.080e-01 3.161e-02 -6.580 6.14e-11 ***
## month(date, label = TRUE)^6 -1.795e-02 3.170e-02 -0.566 0.5714
## month(date, label = TRUE)^7 -1.318e-01 3.145e-02 -4.190 2.92e-05 ***
## month(date, label = TRUE)^8 5.464e-02 3.135e-02 1.743 0.0815 .
## month(date, label = TRUE)^9 1.287e-01 3.144e-02 4.095 4.41e-05 ***
## month(date, label = TRUE)^10 -7.169e-02 3.143e-02 -2.281 0.0227 *
## month(date, label = TRUE)^11 -5.374e-02 3.143e-02 -1.710 0.0875 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.389 on 1827 degrees of freedom
## Multiple R-squared: 0.5371, Adjusted R-squared: 0.534
## F-statistic: 176.6 on 12 and 1827 DF, p-value: < 2.2e-16
##
## ----
##
## Summary for Group: New Hampshire---
## Call:
## stats::lm(formula = .formula, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.3400 -0.3594 -0.0449 0.2876 3.7339
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.119e+00 4.776e-02 149.075 < 2e-16 ***
## as.numeric(date) -2.398e-05 3.438e-06 -6.974 4.28e-12 ***
## month(date, label = TRUE).L -2.112e-01 4.424e-02 -4.774 1.95e-06 ***
## month(date, label = TRUE).Q 4.402e-01 4.412e-02 9.978 < 2e-16 ***
## month(date, label = TRUE).C 2.268e-01 4.403e-02 5.151 2.88e-07 ***
## month(date, label = TRUE)^4 3.052e-01 4.418e-02 6.908 6.75e-12 ***
## month(date, label = TRUE)^5 -4.862e-02 4.454e-02 -1.092 0.27520
## month(date, label = TRUE)^6 -1.718e-03 4.468e-02 -0.038 0.96934
## month(date, label = TRUE)^7 -1.389e-01 4.433e-02 -3.134 0.00175 **
## month(date, label = TRUE)^8 7.738e-02 4.418e-02 1.751 0.08004 .
## month(date, label = TRUE)^9 1.852e-01 4.430e-02 4.181 3.04e-05 ***
## month(date, label = TRUE)^10 -1.329e-01 4.429e-02 -3.002 0.00272 **
## month(date, label = TRUE)^11 5.651e-03 4.430e-02 0.128 0.89850
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5483 on 1827 degrees of freedom
## Multiple R-squared: 0.1355, Adjusted R-squared: 0.1298
## F-statistic: 23.85 on 12 and 1827 DF, p-value: < 2.2e-16
##
## ----
##
## Summary for Group: Rhode Island---
## Call:
## stats::lm(formula = .formula, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.0430 -0.2675 -0.0644 0.1815 3.3940
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.058e+00 3.824e-02 210.720 < 2e-16 ***
## as.numeric(date) -4.804e-05 2.753e-06 -17.448 < 2e-16 ***
## month(date, label = TRUE).L -3.402e-01 3.543e-02 -9.603 < 2e-16 ***
## month(date, label = TRUE).Q 5.653e-01 3.533e-02 16.001 < 2e-16 ***
## month(date, label = TRUE).C 1.819e-01 3.525e-02 5.161 2.73e-07 ***
## month(date, label = TRUE)^4 4.432e-01 3.537e-02 12.527 < 2e-16 ***
## month(date, label = TRUE)^5 5.777e-02 3.567e-02 1.620 0.10549
## month(date, label = TRUE)^6 -1.192e-01 3.578e-02 -3.332 0.00088 ***
## month(date, label = TRUE)^7 -2.691e-02 3.550e-02 -0.758 0.44846
## month(date, label = TRUE)^8 5.238e-02 3.538e-02 1.481 0.13886
## month(date, label = TRUE)^9 1.900e-01 3.548e-02 5.355 9.66e-08 ***
## month(date, label = TRUE)^10 -1.883e-01 3.547e-02 -5.309 1.24e-07 ***
## month(date, label = TRUE)^11 6.618e-03 3.547e-02 0.187 0.85201
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.439 on 1827 degrees of freedom
## Multiple R-squared: 0.3327, Adjusted R-squared: 0.3284
## F-statistic: 75.92 on 12 and 1827 DF, p-value: < 2.2e-16
##
## ----
##
## Summary for Group: Vermont---
## Call:
## stats::lm(formula = .formula, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.2085 -0.2319 -0.0374 0.1961 3.1346
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.032e+00 3.413e-02 206.022 < 2e-16 ***
## as.numeric(date) -3.406e-05 2.457e-06 -13.859 < 2e-16 ***
## month(date, label = TRUE).L -8.164e-02 3.162e-02 -2.582 0.0099 **
## month(date, label = TRUE).Q 7.176e-01 3.153e-02 22.757 < 2e-16 ***
## month(date, label = TRUE).C 5.306e-01 3.147e-02 16.863 < 2e-16 ***
## month(date, label = TRUE)^4 7.925e-02 3.157e-02 2.510 0.0122 *
## month(date, label = TRUE)^5 -3.352e-01 3.184e-02 -10.529 < 2e-16 ***
## month(date, label = TRUE)^6 6.819e-03 3.193e-02 0.214 0.8309
## month(date, label = TRUE)^7 -1.470e-01 3.168e-02 -4.638 3.76e-06 ***
## month(date, label = TRUE)^8 -1.085e-02 3.158e-02 -0.344 0.7311
## month(date, label = TRUE)^9 1.978e-01 3.166e-02 6.245 5.24e-10 ***
## month(date, label = TRUE)^10 -1.728e-01 3.166e-02 -5.460 5.42e-08 ***
## month(date, label = TRUE)^11 7.008e-02 3.166e-02 2.213 0.0270 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3919 on 1827 degrees of freedom
## Multiple R-squared: 0.3974, Adjusted R-squared: 0.3935
## F-statistic: 100.4 on 12 and 1827 DF, p-value: < 2.2e-16
##
## ----
# Autocorrelation diagnostics for each state, with lags spanning one year.
claims_tbl %>%
  group_by(symbol) %>%
  plot_acf_diagnostics(.date_var = date, .value = claims, .lags = "1 year")
# Seasonal diagnostics of claims for each state.
claims_tbl %>%
  group_by(symbol) %>%
  plot_seasonal_diagnostics(.date_var = date, .value = claims)
# STL decomposition per state, showing the observed series together with its
# seasonal, trend and remainder components. Frequency and trend windows are
# chosen automatically and reported in the messages printed below.
claims_tbl %>%
  group_by(symbol) %>%
  plot_stl_diagnostics(
    .date_var    = date,
    .value       = claims,
    .feature_set = c("observed", "season", "trend", "remainder"),
    .frequency   = "auto",
    .trend       = "auto",
    .interactive = TRUE
  )
## frequency = 13 observations per 1 quarter
## trend = 53 observations per 1 year
## frequency = 13 observations per 1 quarter
## trend = 53 observations per 1 year
## frequency = 13 observations per 1 quarter
## trend = 53 observations per 1 year
## frequency = 13 observations per 1 quarter
## trend = 53 observations per 1 year
## frequency = 13 observations per 1 quarter
## trend = 53 observations per 1 year
## frequency = 13 observations per 1 quarter
## trend = 53 observations per 1 year
# Interactive plot of the raw series, one panel per state in two columns.
claims_tbl %>%
  group_by(symbol) %>%
  plot_time_series(
    .date_var    = date,
    .value       = claims,
    .interactive = TRUE,
    .facet_ncol  = 2
  )
# Collapse each state's series to one row per year holding the mean of that
# year's claims, then plot the yearly means. The summary column is named for
# what it contains; `adjusted` was a leftover from the stock-price example
# this code was adapted from.
claims_tbl %>%
  group_by(symbol) %>%
  summarise_by_time(
    .date_var   = date,
    .by         = "year",
    mean_claims = mean(claims)
  ) %>%
  plot_time_series(date, mean_claims, .facet_ncol = 2, .interactive = TRUE)
# Restrict each state's series to the window given by the "2000" and "2022"
# bounds, then plot the filtered data in two columns of panels.
claims_tbl %>%
  group_by(symbol) %>%
  filter_by_time(date, .start_date = "2000", .end_date = "2022") %>%
  plot_time_series(.date_var = date, .value = claims, .facet_ncol = 2)
# Expand each state's series to daily frequency; the days added between the
# existing observations get a claims value of 0.
claims_tbl %>%
  group_by(symbol) %>%
  pad_by_time(.date_var = date, .by = "day", .pad_value = 0)
## # A tibble: 77,244 × 3
## # Groups: symbol [6]
## symbol date claims
## <fct> <date> <int>
## 1 Connecticut 1989-01-07 8345
## 2 Connecticut 1989-01-08 0
## 3 Connecticut 1989-01-09 0
## 4 Connecticut 1989-01-10 0
## 5 Connecticut 1989-01-11 0
## 6 Connecticut 1989-01-12 0
## 7 Connecticut 1989-01-13 0
## 8 Connecticut 1989-01-14 6503
## 9 Connecticut 1989-01-15 0
## 10 Connecticut 1989-01-16 0
## # ℹ 77,234 more rows
# Two-observation trailing mean on the first ten rows. With `.partial = TRUE`
# the first row, which has no predecessor, is averaged over the single value
# available rather than returned as NA.
claims_tbl %>%
  head(10) %>%
  mutate(
    rolling_avg_2 = slidify_vec(
      .x       = claims,
      .f       = mean,
      .period  = 2,
      .align   = "right",
      .partial = TRUE
    )
  )
## # A tibble: 10 × 4
## symbol date claims rolling_avg_2
## <fct> <date> <int> <dbl>
## 1 Connecticut 1989-01-07 8345 8345
## 2 Connecticut 1989-01-14 6503 7424
## 3 Connecticut 1989-01-21 3821 5162
## 4 Connecticut 1989-01-28 4663 4242
## 5 Connecticut 1989-02-04 4162 4412.
## 6 Connecticut 1989-02-11 4337 4250.
## 7 Connecticut 1989-02-18 4079 4208
## 8 Connecticut 1989-02-25 3556 3818.
## 9 Connecticut 1989-03-04 3826 3691
## 10 Connecticut 1989-03-11 3515 3670.
# Rolling regressions are easy to implement using `.unlist = FALSE`, which
# keeps each window's fitted model as a list element. The helper regresses
# its first argument on its second and third over a right-aligned window of
# 90 observations.
lm_roll <- slidify(
  .f      = ~ lm(..1 ~ ..2 + ..3),
  .align  = "right",
  .period = 90,
  .unlist = FALSE
)
# Fit a 90-observation rolling regression within each state and keep only the
# rows for which a full window (and therefore a fitted model) exists.
claims_tbl %>%
  select(symbol, date, claims) %>%
  group_by(symbol) %>%
  # Numeric form of the date (days since 1970-01-01). Converting `symbol`
  # here would store the factor's level code, a constant within each state,
  # rather than anything date-related.
  mutate(numeric_date = as.numeric(date)) %>%
  # Apply rolling regression
  # NOTE(review): `date` and `numeric_date` carry the same information, so
  # lm() reports NA for the redundant coefficient; consider dropping one
  # regressor or substituting a real covariate.
  mutate(rolling_lm = lm_roll(claims, date, numeric_date)) %>%
  filter(!is.na(rolling_lm))
## # A tibble: 10,506 × 5
## # Groups: symbol [6]
## symbol date claims numeric_date rolling_lm
## <fct> <date> <int> <dbl> <list>
## 1 Connecticut 1990-09-22 3927 7569 <lm>
## 2 Connecticut 1990-09-29 4471 7576 <lm>
## 3 Connecticut 1990-10-06 4430 7583 <lm>
## 4 Connecticut 1990-10-13 4494 7590 <lm>
## 5 Connecticut 1990-10-20 4894 7597 <lm>
## 6 Connecticut 1990-10-27 4653 7604 <lm>
## 7 Connecticut 1990-11-03 4719 7611 <lm>
## 8 Connecticut 1990-11-10 5347 7618 <lm>
## 9 Connecticut 1990-11-17 4824 7625 <lm>
## 10 Connecticut 1990-11-24 5367 7632 <lm>
## # ℹ 10,496 more rows