Apply 10 - Apply 11

# for Core packages
library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.4.1

## Warning: package 'purrr' was built under R version 4.4.2

## Warning: package 'dplyr' was built under R version 4.4.1

## Warning: package 'lubridate' was built under R version 4.4.2

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

# for financial analysis
library(tidyquant)

## Loading required package: PerformanceAnalytics
## Loading required package: xts
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## 
## ######################### Warning from 'xts' package ##########################
## #                                                                             #
## # The dplyr lag() function breaks how base R's lag() function is supposed to  #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or       #
## # source() into this session won't work correctly.                            #
## #                                                                             #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop           #
## # dplyr from breaking base R's lag() function.                                #
## #                                                                             #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning.  #
## #                                                                             #
## ###############################################################################
## 
## Attaching package: 'xts'
## 
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## 
## 
## Attaching package: 'PerformanceAnalytics'
## 
## The following object is masked from 'package:graphics':
## 
##     legend
## 
## Loading required package: quantmod
## Loading required package: TTR
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

# for time series
library(timetk)

## Warning: package 'timetk' was built under R version 4.4.2

library(ggcorrplot)

## Warning: package 'ggcorrplot' was built under R version 4.4.2

Goal: Apply Matt Dancho’s tutorial to initial unemployment claims for the New England states.

The following is a replication of Matt Dancho’s tutorial on this page.

# First date requested from the data source
start_date <- "1989-01-01"

# Series identifiers for state initial claims, one per New England state
symbols_txt <- c(
  "CTICLAIMS", # Connecticut
  "MEICLAIMS", # Maine
  "MAICLAIMS", # Massachusetts
  "NHICLAIMS", # New Hampshire
  "RIICLAIMS", # Rhode Island
  "VTICLAIMS"  # Vermont
)

# Download all series in long format, name the value column `claims`, and
# replace each series code in `symbol` with the state's name.
claims_tbl <- tq_get(
  symbols_txt,
  get  = "economic.data",
  from = start_date
) %>%
  rename(claims = price) %>%
  mutate(
    symbol = fct_recode(
      symbol,
      "Connecticut"   = "CTICLAIMS",
      "Maine"         = "MEICLAIMS",
      "Massachusetts" = "MAICLAIMS",
      "New Hampshire" = "NHICLAIMS",
      "Rhode Island"  = "RIICLAIMS",
      "Vermont"       = "VTICLAIMS"
    )
  )

Plotting time series

# Claims through time: one panel per state, each with its own y-axis
ggplot(claims_tbl, aes(date, claims, colour = symbol)) +
  geom_line() +
  facet_wrap(vars(symbol), ncol = 2, scales = "free_y") +
  labs(
    title = "Unemployment Claims Over Time",
    x = "Date",
    y = "Claims"
  ) +
  theme_minimal()

Box plots

# Distribution of claims per calendar month, one panel per state
claims_tbl %>%
  mutate(month = lubridate::month(date, label = TRUE)) %>%
  ggplot(aes(month, claims, fill = symbol)) +
  geom_boxplot() +
  facet_wrap(vars(symbol), ncol = 2, scales = "free_y") +
  labs(
    title = "Boxplot of Unemployment Claims by Month",
    x = "Month",
    y = "Claims"
  ) +
  theme_minimal()

Regression plots

# Claims against date with a straight-line fit per state (no confidence band)
ggplot(claims_tbl, aes(date, claims, colour = symbol)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", se = FALSE) +
  facet_wrap(vars(symbol), ncol = 2, scales = "free_y") +
  labs(
    title = "Regression Plots of Unemployment Claims",
    x = "Date",
    y = "Claims"
  ) +
  theme_minimal()

## `geom_smooth()` using formula = 'y ~ x'

Plotting Seasonality and Correlation

Correlation Plots

# Build the state-by-state correlation matrix of claims.
# The long table is reshaped to one column per state (rows keyed by date),
# the date column is dropped, and pairwise-complete correlations are computed
# so that a missing value in one state does not discard the whole row.
# pivot_wider() replaces the superseded spread(); names_sort = TRUE keeps the
# state columns in factor-level order, matching what spread() produced.
corr_matrix <- claims_tbl %>%
  pivot_wider(
    names_from  = symbol,
    values_from = claims,
    names_sort  = TRUE
  ) %>%
  select(-date) %>%
  cor(use = "pairwise.complete.obs")

# Plot the correlation heatmap
ggcorrplot(
  corr_matrix,
  method   = "circle", # draw each correlation as a circle
  type     = "lower",  # show only the lower triangle
  lab      = TRUE,     # print the correlation coefficients
  lab_size = 3,
  title    = "Correlation Heatmap of Unemployment Claims",
  ggtheme  = theme_minimal()
)

Seasonality

# Seasonal diagnostics of claims, computed separately for each state
claims_by_state <- group_by(claims_tbl, symbol)
plot_seasonal_diagnostics(claims_by_state, .date_var = date, .value = claims)

STL Diagnostics

# STL decomposition diagnostics of claims, computed separately for each state.
# This section previously repeated plot_seasonal_diagnostics() from the
# Seasonality section; plot_stl_diagnostics() is the timetk function that
# draws the STL components for the series.
claims_tbl %>%
  group_by(symbol) %>%
  plot_stl_diagnostics(
    .date_var = date,
    .value = claims
  )

Time Series Data Wrangling

Summarize by Time

# Yearly totals: claims_tbl is not grouped here, so each year's sum covers
# every state in the table.
claims_tbl %>%
  summarise_by_time(
    .date_var = date,
    .by = "year",
    total_claims = sum(claims)
  ) %>%
  ggplot(aes(date, total_claims)) +
  geom_line() +
  labs(
    title = "Total Claims by Year",
    x = "Year",
    y = "Total Claims"
  ) +
  theme_minimal()

Filter By Time

# Keep observations dated 2010-01-01 or later and plot them by state
claims_tbl %>%
  filter_by_time(
    .date_var = date,
    .start_date = "2010-01-01"
  ) %>%
  ggplot(aes(date, claims, colour = symbol)) +
  geom_line() +
  labs(
    title = "Unemployment Claims (2010-Present)",
    x = "Date",
    y = "Claims"
  ) +
  theme_minimal()

Padding Data

# Fill in any weeks absent from each state's series, then plot the result
claims_tbl %>%
  group_by(symbol) %>%
  pad_by_time(
    .date_var = date,
    .by = "week"
  ) %>%
  ggplot(aes(date, claims, colour = symbol)) +
  geom_line() +
  labs(
    title = "Padded Unemployment Claims Data",
    x = "Date",
    y = "Claims"
  ) +
  theme_minimal()

Sliding (Rolling) Calculations

# Demonstrate slidify_vec() on the first ten rows: a right-aligned, 2-period
# rolling mean of log(claims). `.partial = TRUE` lets the first row be
# computed from an incomplete window.
claims_tbl %>%
  head(10) %>%
  mutate(
    rolling_avg_2 = slidify_vec(
      .x       = log(claims),
      .f       = mean,
      .period  = 2,
      .align   = "right",
      .partial = TRUE
    )
  )

## # A tibble: 10 × 4
##    symbol      date       claims rolling_avg_2
##    <fct>       <date>      <int>         <dbl>
##  1 Connecticut 1989-01-07   8345          9.03
##  2 Connecticut 1989-01-14   6503          8.90
##  3 Connecticut 1989-01-21   3821          8.51
##  4 Connecticut 1989-01-28   4663          8.35
##  5 Connecticut 1989-02-04   4162          8.39
##  6 Connecticut 1989-02-11   4337          8.35
##  7 Connecticut 1989-02-18   4079          8.34
##  8 Connecticut 1989-02-25   3556          8.24
##  9 Connecticut 1989-03-04   3826          8.21
## 10 Connecticut 1989-03-11   3515          8.21

# Rolling average calculation
library(slider)

## Warning: package 'slider' was built under R version 4.4.2

# Rolling mean of claims within each state.
# `.before = 11` makes each window the current observation plus the 11 before
# it. The series is weekly, so this is a 12-week window (the title previously
# said "12-Month"). `.complete = TRUE` returns NA until a full window exists,
# so the first 11 rows of every state are NA and geom_line() drops them with
# a warning.
claims_tbl %>%
  group_by(symbol) %>%
  mutate(
    rolling_avg = slide_dbl(claims, mean, .before = 11, .complete = TRUE)
  ) %>%
  ggplot(aes(x = date, y = rolling_avg, color = symbol)) +
  geom_line() +
  facet_wrap(~ symbol, scales = "free_y", ncol = 2) +
  labs(
    title = "12-Week Rolling Average of Claims",
    y = "Rolling Average",
    x = "Date"
  ) +
  theme_minimal()

## Warning: Removed 66 rows containing missing values or values outside the scale range
## (`geom_line()`).

# Rolling regression helper: fits lm(first argument ~ second argument) over a
# right-aligned window of 90 observations and returns the fitted models as a
# list (`.unlist = FALSE`) rather than a flattened vector.
lm_roll <- slidify(
  .f      = ~ lm(..1 ~ ..2),
  .period = 90,
  .align  = "right",
  .unlist = FALSE
)

# Within each state, regress claims on the date expressed as a number over
# every rolling window, then keep only rows whose window produced a model.
reg_results <- claims_tbl %>%
  select(symbol, date, claims) %>%
  group_by(symbol) %>%
  mutate(
    numeric_date = as.numeric(date),
    rolling_lm   = lm_roll(claims, numeric_date)
  ) %>%
  filter(!is.na(rolling_lm))

# Inspect the coefficient table of the first fitted rolling model
broom::tidy(reg_results$rolling_lm[[1]])

## # A tibble: 2 × 5
##   term         estimate std.error statistic p.value
##   <chr>           <dbl>     <dbl>     <dbl>   <dbl>
## 1 (Intercept) -11225.    6974.        -1.61  0.111 
## 2 ..2              2.19     0.961      2.28  0.0248

# Turn every fitted model into its coefficient table and expand those tables
# into rows, giving one row per model term.
reg_coeff <- reg_results %>%
  mutate(rolling_lm = map(rolling_lm, broom::tidy)) %>%
  unnest(rolling_lm)

# Plot the slope estimate over time for each state. The term is labelled
# "..2" because the model formula refers to the date by its argument position.
reg_coeff %>%
  filter(term == "..2") %>%
  ggplot(aes(date, estimate)) +
  geom_line() +
  facet_wrap(vars(symbol))

Apply 10 - Apply 11

Sara Donahue

2024-12-19

Plotting time series

Box plots

Regression plots

Plotting Seasonality and Correlation

Correlation Plots

Seasonality

STL Diagnostics

Time Series Data Wrangling

Summarize by Time

Filter By Time

Padding Data

Sliding (Rolling) Calculations