# Time Series Machine Learning
library(tidymodels)
library(modeltime)
library(vip)
# EDA
library(DataExplorer)
# Core
library(tidyverse)
library(timetk)
library(lubridate)
library(readxl)
moodys_tbl <- read_excel("../../../Vermont/docs from Tom/Moodys 0625.xlsx", sheet = "Vermont (Quarterly)", skip = 0, n_max = 98) %>%
filter(!is.na(`VERMONT - QUARTERLY`))
moodys_us_tbl <- read_excel("../../../Vermont/docs from Tom/Moodys 0625.xlsx", sheet = "US (Quarterly)", skip = 0) %>%
filter(!is.na(`UNITED STATES - QUARTERLY`))
home_inventory <- read_csv("../00_data/Housing_Inventory_VT_FRED.csv")
# Zillow
# ZORI (Smoothed): All Homes Plus Multifamily Time Series ($): County
# Downloaded on 10/16/2025
zillow <- read_csv("../00_data/County_zori_uc_sfrcondomfr_sm_month.csv")
zillow_clean_tbl <- zillow %>%
filter(State == "VT") %>%
select(-c(RegionID,SizeRank,RegionType,StateName,State,Metro,StateCodeFIPS,MunicipalCodeFIPS)) %>%
pivot_longer(-RegionName, names_to = "date", values_to = "rent") %>%
mutate(date = lubridate::ymd(date)) %>%
group_by(RegionName) %>%
summarise_by_time(date, .by = "quarter", rent = mean(rent)) %>%
ungroup() %>%
pivot_wider(names_from = RegionName, values_from = rent, names_prefix = "rent") %>%
janitor::clean_names() %>%
# Only chittenden_county has enough data
select(date, rent_chittenden_county) %>%
drop_na()Forecasting Unemployment Rate in New Hampshire
Objective
This analysis aims to forecast the quarterly home prices in Vermont next five years.
Data
Dependent variable
- FHFA All-Transactions HPI from Federal Housing Finance Agency (via FRED - St. Louis Fed).
Independent variable
- Mortgage Interest Rate from FRED (30-Year Fixed Rate Mortgage Average)
- Inventory/Supply from VAR, Redfin/Zillow (Active Listing Count, Months of Supply)
- Population/Migration from U.S. Census Bureau, BEA (Net Migration Data)
- Employment/Income from FRED (Vermont Unemployment Rate, Real Median Household Income). * National Home Price Index from S&P CoreLogic Case-Shiller U.S. National Home Price Index
- Second home/Tourism from state rooms and meals tax revenue
- Policy impact from state-level housing legislation
1.0 Clean data
# Extract var description
var_desc_tbl <- moodys_tbl %>% select(1:2) %>% set_names(c("Var_abb","Var_desc"))
var_desc_us_tbl <- moodys_us_tbl %>%
select(1:2) %>%
set_names(c("Var_abb","Var_desc")) %>%
filter(Var_desc != "0")
# Reshape Moodys data
moodys_clean_tbl <- moodys_tbl %>%
mutate(across(where(is.numeric), as.character)) %>%
# Add national data
bind_rows(
moodys_us_tbl %>%
filter(`UNITED STATES - QUARTERLY` == "FHOFHOPIQ") %>%
rename(`VERMONT - QUARTERLY`=`UNITED STATES - QUARTERLY`) %>%
mutate(across(everything(), as.character)) %>%
mutate(`VERMONT - QUARTERLY` = `VERMONT - QUARTERLY` %>% paste0("_us")) %>%
distinct()
) %>%
select(-2) %>%
pivot_longer(-`VERMONT - QUARTERLY`, names_to = "date") %>%
pivot_wider(names_from = `VERMONT - QUARTERLY`, values_from = value) %>%
# Replace "ND" with ""
mutate(across(everything(), ~str_replace(.,"ND",""))) %>%
# Convert them to numeric
mutate(across(-date, as.numeric)) %>%
mutate(date = date %>% yq()) %>%
# Remove forecast
filter(date <= as.Date("2025-04-01")) %>%
# Add rent from Zillow
left_join(
zillow_clean_tbl
) %>%
# Add FRED data
left_join(
home_inventory %>%
setNames(c("date", "home_inv")) %>%
summarise_by_time(date, .by = "quarter", home_inv = sum(home_inv))
)
moodys_clean_tbl %>% skimr::skim()| Name | Piped data |
| Number of rows | 202 |
| Number of columns | 101 |
| _______________________ | |
| Column type frequency: | |
| Date | 1 |
| numeric | 100 |
| ________________________ | |
| Group variables | None |
Variable type: Date
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| date | 0 | 1 | 1975-01-01 | 2025-04-01 | 2000-02-15 | 202 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| RETQ | 0 | 1.00 | 269.74 | 45.11 | 160.49 | 247.84 | 289.72 | 306.26 | 317.30 | ▁▂▂▃▇ |
| RERMQ | 0 | 1.00 | 0.93 | 0.12 | 0.73 | 0.83 | 0.90 | 1.00 | 1.31 | ▇▇▆▃▁ |
| RE23Q | 0 | 1.00 | 13.79 | 2.48 | 6.47 | 11.98 | 14.54 | 15.37 | 18.67 | ▁▂▅▇▂ |
| REMFQ | 0 | 1.00 | 37.68 | 6.23 | 26.47 | 30.94 | 38.65 | 43.87 | 46.93 | ▅▃▃▅▇ |
| RETUQ | 0 | 1.00 | 7.67 | 1.13 | 5.19 | 6.72 | 8.05 | 8.57 | 8.97 | ▃▂▁▃▇ |
| RE51Q | 0 | 1.00 | 5.27 | 0.81 | 3.80 | 4.61 | 4.98 | 5.90 | 6.97 | ▃▇▃▃▃ |
| RETRQ | 0 | 1.00 | 3.90 | 0.56 | 2.72 | 3.52 | 3.92 | 4.45 | 4.68 | ▃▂▆▆▇ |
| RETTQ | 0 | 1.00 | 42.75 | 7.16 | 25.60 | 40.74 | 44.90 | 47.36 | 50.93 | ▂▂▁▅▇ |
| RE42Q | 0 | 1.00 | 8.83 | 1.35 | 5.27 | 8.61 | 9.27 | 9.70 | 10.47 | ▂▂▁▇▇ |
| RERTQ | 0 | 1.00 | 33.92 | 5.86 | 20.24 | 31.68 | 35.77 | 37.99 | 40.57 | ▂▂▂▅▇ |
| REFIQ | 0 | 1.00 | 11.80 | 1.57 | 7.53 | 11.68 | 12.17 | 12.87 | 13.77 | ▂▂▁▇▇ |
| REPSQ | 0 | 1.00 | 19.27 | 7.85 | 5.92 | 13.63 | 20.32 | 26.18 | 33.75 | ▇▆▇▆▅ |
| REEHQ | 0 | 1.00 | 46.51 | 14.11 | 23.48 | 33.48 | 45.65 | 60.37 | 66.83 | ▆▃▃▃▇ |
| RELHQ | 0 | 1.00 | 29.27 | 6.17 | 14.86 | 25.76 | 31.97 | 33.23 | 37.70 | ▃▂▂▇▆ |
| RE81Q | 0 | 1.00 | 8.36 | 2.10 | 3.90 | 6.70 | 9.38 | 10.03 | 10.50 | ▂▁▁▂▇ |
| RE62Q | 0 | 1.00 | 34.83 | 12.96 | 13.38 | 22.70 | 33.75 | 48.19 | 52.73 | ▅▃▃▃▇ |
| RE5411Q | 0 | 1.00 | 1.79 | 0.39 | 0.82 | 1.69 | 1.93 | 2.04 | 2.19 | ▁▂▁▅▇ |
| RE5416Q | 0 | 1.00 | 1.40 | 0.74 | 0.39 | 0.80 | 1.22 | 2.11 | 2.82 | ▇▇▂▆▃ |
| REGVQ | 0 | 1.00 | 46.44 | 9.06 | 26.69 | 39.95 | 49.18 | 54.54 | 56.87 | ▂▂▂▃▇ |
| RE311Q | 0 | 1.00 | 3.76 | 1.13 | 2.00 | 2.83 | 3.77 | 5.00 | 5.51 | ▆▃▇▁▇ |
| RE312Q | 0 | 1.00 | 0.42 | 0.35 | 0.09 | 0.19 | 0.26 | 0.50 | 1.31 | ▇▂▁▁▂ |
| RE313Q | 0 | 1.00 | 0.16 | 0.07 | 0.05 | 0.10 | 0.17 | 0.22 | 0.25 | ▃▇▂▆▇ |
| RE314Q | 0 | 1.00 | 0.17 | 0.08 | 0.04 | 0.08 | 0.19 | 0.21 | 0.34 | ▇▂▇▅▂ |
| RE315Q | 0 | 1.00 | 0.71 | 0.33 | 0.25 | 0.38 | 0.72 | 1.06 | 1.15 | ▇▃▂▃▇ |
| RE316Q | 0 | 1.00 | 0.05 | 0.01 | 0.02 | 0.04 | 0.05 | 0.05 | 0.08 | ▂▇▇▃▁ |
| RE321Q | 0 | 1.00 | 2.40 | 0.58 | 1.30 | 1.80 | 2.54 | 2.93 | 3.29 | ▅▃▃▆▇ |
| RE322Q | 0 | 1.00 | 1.17 | 0.40 | 0.55 | 0.74 | 1.30 | 1.55 | 1.69 | ▆▃▁▃▇ |
| RE323Q | 0 | 1.00 | 2.13 | 0.92 | 0.78 | 1.08 | 2.39 | 3.01 | 3.26 | ▆▁▂▂▇ |
| RE324Q | 0 | 1.00 | 0.10 | 0.03 | 0.05 | 0.07 | 0.11 | 0.14 | 0.22 | ▆▅▇▁▁ |
| RE325Q | 0 | 1.00 | 1.19 | 0.23 | 0.73 | 0.97 | 1.22 | 1.40 | 1.59 | ▂▇▃▇▃ |
| RE326Q | 0 | 1.00 | 1.34 | 0.20 | 1.00 | 1.20 | 1.31 | 1.37 | 1.83 | ▂▇▃▁▂ |
| RE327Q | 0 | 1.00 | 1.82 | 0.22 | 1.17 | 1.59 | 1.85 | 2.01 | 2.22 | ▁▆▆▇▇ |
| RE331Q | 0 | 1.00 | 0.52 | 0.30 | 0.13 | 0.17 | 0.62 | 0.81 | 0.93 | ▇▂▁▆▆ |
| RE332Q | 0 | 1.00 | 2.10 | 0.41 | 1.34 | 1.77 | 1.91 | 2.48 | 2.94 | ▁▇▁▃▂ |
| RE333Q | 0 | 1.00 | 3.92 | 1.13 | 2.37 | 2.86 | 3.98 | 4.90 | 5.74 | ▇▂▃▃▅ |
| RE334Q | 0 | 1.00 | 8.25 | 2.62 | 3.85 | 6.25 | 8.79 | 10.82 | 11.79 | ▆▃▃▆▇ |
| RE335Q | 0 | 1.00 | 1.35 | 0.19 | 0.95 | 1.21 | 1.34 | 1.51 | 1.64 | ▃▅▇▆▇ |
| RE336Q | 0 | 1.00 | 2.45 | 0.49 | 1.58 | 1.99 | 2.50 | 2.96 | 3.23 | ▅▃▆▃▇ |
| RE337Q | 0 | 1.00 | 2.27 | 0.68 | 1.21 | 1.46 | 2.48 | 2.83 | 3.44 | ▆▁▃▇▂ |
| RE339Q | 0 | 1.00 | 1.43 | 0.49 | 0.63 | 0.84 | 1.52 | 1.81 | 2.37 | ▇▁▇▆▂ |
| REMFDQ | 0 | 1.00 | 26.49 | 5.76 | 16.17 | 21.46 | 27.68 | 32.20 | 34.58 | ▅▃▃▅▇ |
| REMFNQ | 0 | 1.00 | 11.20 | 1.03 | 8.95 | 10.65 | 11.31 | 11.65 | 13.34 | ▂▂▇▃▂ |
| REOFFQ | 0 | 1.00 | 43.84 | 10.66 | 21.97 | 37.30 | 47.68 | 52.52 | 58.42 | ▅▂▆▇▇ |
| FLBF | 4 | 0.98 | 319.43 | 40.17 | 216.41 | 295.82 | 335.48 | 352.66 | 362.01 | ▁▂▂▃▇ |
| FLBE | 4 | 0.98 | 305.83 | 40.94 | 197.94 | 286.99 | 325.09 | 338.49 | 348.11 | ▁▂▁▃▇ |
| FLBU | 4 | 0.98 | 13.60 | 4.07 | 6.15 | 10.94 | 13.21 | 15.48 | 36.05 | ▆▇▂▁▁ |
| FLBR | 4 | 0.98 | 4.37 | 1.51 | 1.76 | 3.27 | 4.18 | 5.33 | 10.48 | ▆▇▅▁▁ |
| FYPQ | 0 | 1.00 | 19040.73 | 12251.11 | 2480.56 | 8312.88 | 17426.49 | 28328.92 | 47824.65 | ▇▅▅▃▂ |
| FYP\(Q | 0| 1.00| 22906.27| 8425.01| 9447.61| 15611.76| 23728.81| 29984.86| 38880.39|▇▇▆▇▅ | |FYPCONQ | 0| 1.00| 1520.52| 974.61| 158.40| 666.89| 1418.56| 2082.69| 3699.97|▇▇▅▅▂ | |FYPADJQ | 0| 1.00| 215.69| 185.45| -11.40| 42.76| 219.43| 337.47| 643.53|▇▃▅▂▂ | |FYPDIRQ | 0| 1.00| 3907.21| 2471.00| 471.37| 1800.54| 3646.09| 5385.76| 9848.49|▇▇▃▃▂ | |FYPTRPQ | 0| 1.00| 3488.90| 2880.63| 364.86| 973.78| 2388.90| 5590.63| 12808.74|▇▂▃▁▁ | |FYPEWSQ | 0| 1.00| 9055.01| 5343.68| 1375.32| 4467.31| 8798.67| 12886.42| 21201.92|▇▆▇▃▂ | |FYPESSQ | 0| 1.00| 2187.63| 1380.17| 200.23| 908.12| 2010.88| 3422.96| 4872.93|▇▆▃▇▃ | |FYPEPPQ | 0| 1.00| 1706.81| 1088.69| 234.40| 770.59| 1713.90| 2486.15| 4446.49|▇▂▆▂▁ | |FYPEPPFRQ | 0| 1.00| 99.54| 55.58| 16.34| 61.46| 85.60| 125.48| 301.64|▇▇▂▁▁ | |FYPEPPNFQ | 0| 1.00| 1607.28| 1059.14| 202.62| 676.73| 1629.98| 2348.08| 4179.26|▇▂▆▂▂ | |FYPNWQ | 0| 1.00| 9985.72| 6934.26| 1105.25| 3833.95| 8600.34| 15465.58| 26622.74|▇▅▃▂▂ | |FYPDPIQ | 0| 1.00| 16999.34| 10933.09| 2204.35| 7314.80| 15205.04| 25685.07| 42443.52|▇▅▅▃▂ | |FAHEMF | 104| 0.49| 18.80| 3.60| 14.11| 16.35| 18.34| 21.07| 26.56|▇▅▅▁▂ | |FPOPQ | 0| 1.00| 588.98| 51.51| 478.10| 543.10| 609.09| 631.61| 650.77|▂▂▂▃▇ | |FPOP0004Q | 0| 1.00| 34.49| 4.13| 26.83| 31.09| 33.93| 38.09| 41.81|▃▇▇▅▇ | |FPOP0519Q | 0| 1.00| 122.05| 8.94| 106.50| 116.18| 120.94| 130.76| 137.72|▆▅▇▅▇ | |FPOP2024Q | 0| 1.00| 44.47| 3.82| 36.19| 43.10| 44.37| 46.56| 52.91|▂▂▇▃▂ | |FPOP2544Q | 0| 1.00| 160.43| 17.67| 118.81| 147.38| 157.41| 177.23| 188.74|▁▃▇▃▆ | |FPOP4564Q | 0| 1.00| 142.18| 39.77| 89.43| 95.72| 150.96| 180.06| 194.33|▇▂▂▃▇ | |FPOP65GQ | 0| 1.00| 85.36| 26.94| 52.86| 64.04| 77.81| 100.73| 151.62|▇▆▂▂▂ | |FHHOLDQ | 0| 1.00| 231.26| 36.18| 154.88| 199.46| 240.40| 263.74| 276.33|▂▃▂▃▇ | |FNMQ | 0| 1.00| 0.49| 0.41| -0.23| 0.16| 0.47| 0.79| 1.49|▆▇▇▆▂ | |FBRQ | 0| 1.00| 11.62| 2.63| 7.77| 9.60| 10.72| 14.59| 15.93|▅▇▂▂▆ | |FDRQ | 0| 1.00| 8.79| 0.69| 8.01| 8.33| 8.64| 8.92| 11.28|▇▅▁▁▁ | |FRTFSQ | 0| 1.00| 7.70| 4.33| 1.29| 4.29| 7.52| 10.86| 17.63|▇▇▇▅▂ | |FCCALLQ | 12| 0.94| 4012.25| 2586.46| 615.47| 1714.02| 3862.89| 5815.80| 9167.27|▇▃▅▂▃ | |FCCREVQ | 12| 0.94| 1109.50| 734.13| 2.44| 353.24| 1382.10| 1719.32| 2443.47|▆▂▂▇▂ | |FBKPY | 23| 0.89| 819.83| 599.16| 99.00| 244.00| 671.00| 1397.50| 2544.00|▇▅▂▃▁ | |FHPNR | 20| 0.90| 2479.17| 949.88| 914.30| 1911.96| 2238.31| 2904.93| 6100.63|▅▇▂▂▁ | |FHPN1 | 20| 0.90| 1807.34| 815.28| 509.22| 1151.45| 1711.76| 2260.61| 4252.64|▇▇▅▂▁ | |FHPNM | 20| 0.90| 671.83| 386.30| 27.75| 351.52| 590.26| 893.02| 2137.42|▇▇▅▂▁ | |FHVACRQ | 44| 0.78| 4.56| 1.55| 1.97| 3.51| 4.36| 5.22| 11.25|▆▇▂▁▁ | |FHX1 | 24| 0.88| 9.61| 2.58| 3.19| 7.89| 9.70| 11.39| 15.59|▂▅▇▇▂ | |FHX1MED | 0| 1.00| 137.19| 99.80| 20.63| 61.35| 93.26| 201.97| 433.89|▇▃▃▁▁ | |FMBADC | 17| 0.92| 3.42| 2.33| 0.00| 2.29| 2.99| 4.15| 23.47|▇▂▁▁▁ | |FMBAFSC | 27| 0.87| 0.23| 0.19| 0.00| 0.11| 0.17| 0.31| 0.95|▇▃▁▁▁ | |FHXAFF | 0| 1.00| 164.68| 29.31| 101.29| 144.69| 168.77| 187.43| 219.47|▃▅▇▇▅ | |FDEBTC | 12| 0.94| 21.44| 2.26| 16.92| 19.95| 21.37| 23.38| 24.96|▅▃▇▆▇ | |FCCDAF | 122| 0.40| 3.30| 0.47| 2.14| 3.00| 3.32| 3.54| 4.34|▁▅▇▅▂ | |FCCDAB | 122| 0.40| 1.53| 0.30| 0.88| 1.35| 1.49| 1.67| 2.29|▂▇▇▂▂ | |FCCDBC | 122| 0.40| 2.85| 0.98| 1.40| 2.07| 2.50| 3.60| 5.21|▆▇▂▃▂ | |FCCDTO | 122| 0.40| 2.47| 0.99| 0.85| 1.82| 2.32| 3.26| 4.27|▇▆▇▃▆ | |FMOSDQ | 60| 0.70| 2.76| 1.46| 0.59| 1.65| 2.54| 3.64| 7.54|▇▇▅▂▁ | |FMOPDQ | 60| 0.70| 1.31| 0.54| 0.49| 0.87| 1.30| 1.64| 2.93|▇▇▆▂▁ | |FMORDQ | 60| 0.70| 1.45| 1.18| 0.03| 0.48| 1.15| 2.07| 5.82|▇▅▂▁▁ | |FYHHMEDQ | 0| 1.00| 40942.35| 19334.26| 10558.78| 25502.96| 41611.15| 52835.58| 89594.79|▆▆▇▂▂ | |FHOFHOPIQ | 0| 1.00| 320.18| 191.25| 56.84| 175.72| 252.84| 446.18| 876.63|▇▃▆▁▁ | |FGDPQ | 8| 0.96| 20.45| 11.75| 3.15| 11.02| 19.33| 29.46| 47.90|▇▆▆▅▂ | |FGDP\)Q | 8 | 0.96 | 24.09 | 8.17 | 9.42 | 18.03 | 25.55 | 31.60 | 36.54 | ▅▅▃▅▇ |
| FHOFHOPIQ_us | 0 | 1.00 | 268.23 | 155.96 | 59.87 | 146.38 | 231.84 | 357.56 | 704.49 | ▇▅▅▁▂ |
| rent_chittenden_county | 167 | 0.17 | 2078.29 | 311.02 | 1693.07 | 1814.40 | 1930.64 | 2375.34 | 2588.68 | ▇▂▂▂▅ |
| home_inv | 166 | 0.18 | 9553.69 | 6435.66 | 2342.00 | 3986.75 | 6318.50 | 15311.75 | 23434.00 | ▇▁▂▃▁ |
Calculate year-over-year percent change of home price to remove stationarity and seasonality.
moodys_clean_tbl %>%
select(date, FHOFHOPIQ) %>%
tk_augment_lags(
.value = FHOFHOPIQ,
.lags = 4
) %>%
mutate(
yoy_pct_change = (FHOFHOPIQ - FHOFHOPIQ_lag4) / FHOFHOPIQ_lag4 * 100
) %>%
filter(date >= as.Date("1984-01-01")) %>%
plot_time_series(date, yoy_pct_change, .smooth = FALSE)Choose independent variables mortgage delinquency, foreclosure in VT Consider mortgage rate in US
moodys_select_tbl <- moodys_clean_tbl %>%
select(date, FHOFHOPIQ, FHOFHOPIQ_us, FYHHMEDQ, FYPEWSQ, FMBAFSC, FMBADC, home_inv, rent_chittenden_county, FLBR, FNMQ) %>%
setNames(c("date", "home_price", "home_price_us", "income_median", "income_wage", "mort_forecl", "mort_delinq","home_inv", "rent", "unemp_rate", "net_migr"))
moodys_trans_tbl <- moodys_select_tbl %>%
# Transform home_price to YOY to remove stationarity
tk_augment_lags(c(matches("(home|income|rent|migr)")), .lags = 4) %>%
mutate(
home_price_yoy = (home_price - home_price_lag4) / home_price_lag4 * 100,
home_price_us_yoy = (home_price_us - home_price_us_lag4) / home_price_us_lag4 * 100,
income_median_yoy = (income_median - income_median_lag4) / income_median_lag4 * 100,
income_wage_yoy = (income_wage - income_wage_lag4) / income_wage_lag4 * 100,
home_inv_yoy = (home_inv - home_inv_lag4) / home_inv_lag4 * 100,
rent_yoy = (rent - rent_lag4) / rent_lag4 * 100,
net_migr_yoy = (net_migr - net_migr_lag4) / net_migr_lag4 * 100
) %>%
# Difference mortgage variables to remove stationarity
# mutate(mort_f_d = mort_forecl + mort_delinq) %>%
tk_augment_differences(mort_forecl, .lags = 4) %>%
tk_augment_differences(mort_delinq, .lags = 4) %>%
tk_augment_differences(unemp_rate, .lags = 4) %>%
select(date, contains("yoy"), contains("diff")) %>%
# Remove volatile early periods
filter(date >= as.Date("1984-01-01"))
moodys_trans_tbl# A tibble: 166 × 11
date home_price_yoy home_price_us_yoy income_median_yoy income_wage_yoy
<date> <dbl> <dbl> <dbl> <dbl>
1 1984-01-01 4.36 4.32 12.5 10.0
2 1984-04-01 9.17 4.40 16.1 8.63
3 1984-07-01 6.65 4.51 18.4 9.05
4 1984-10-01 5.56 4.64 19.4 9.76
5 1985-01-01 9.12 4.39 20.0 8.87
6 1985-04-01 10.0 4.54 19.3 9.21
7 1985-07-01 5.23 5.55 14.5 10.8
8 1985-10-01 7.93 5.90 6.09 9.59
9 1986-01-01 11.3 6.28 -1.31 10.6
10 1986-04-01 9.23 6.89 -7.40 9.44
# ℹ 156 more rows
# ℹ 6 more variables: home_inv_yoy <dbl>, rent_yoy <dbl>, net_migr_yoy <dbl>,
# mort_forecl_lag4_diff1 <dbl>, mort_delinq_lag4_diff1 <dbl>,
# unemp_rate_lag4_diff1 <dbl>
# moodys_trans_tbl %>%
# select(date, contains("home")) %>%
# pivot_longer(-date) %>%
# plot_time_series(date, value, name, .smooth = TRUE, .smooth_span = 0.2, .smooth_alpha = 1, .smooth_size = 0.5)2.0 Explore data
Explore data to recommend data transformation.
2.1 Regularity of times series
moodys_trans_tbl %>%
tk_summary_diagnostics(.date_var = date) %>%
glimpse()Rows: 1
Columns: 12
$ n.obs <int> 166
$ start <date> 1984-01-01
$ end <date> 2025-04-01
$ units <chr> "days"
$ scale <chr> "quarter"
$ tzone <chr> "UTC"
$ diff.minimum <dbl> 7776000
$ diff.q1 <dbl> 7862400
$ diff.median <dbl> 7862400
$ diff.mean <dbl> 7889105
$ diff.q3 <dbl> 7948800
$ diff.maximum <dbl> 7948800
2.2 Variance reduction
Consider an alternative to log transformation as it produces NaNs due to negative values.
moodys_trans_tbl %>%
select(date, home_price_yoy) %>%
mutate(home_price_yoy_log = log1p(home_price_yoy)) %>%
pivot_longer(-date) %>%
plot_time_series(date, value, .color_var = name, .smooth = FALSE)# unemployment_tbl %>% plot_time_series(date, log(rate), .smooth_span = 0.2)2.3 Seasonality
In theory, there shouldn’t be seasonality in yoy %.
moodys_trans_tbl %>%
select(date, home_price_yoy) %>% plot_seasonal_diagnostics(date, home_price_yoy)2.4 Anomalies
When analyzing and forecasting unemployment, you should remove the extreme data points from the COVID-19 pandemic. These were one-time anomalies that are not likely to repeat and can distort your model. After removing them, use a method like linear interpolation to impute or replace the missing values, creating a smoother and more reliable dataset. This will allow your forecasting model to be based on normal economic trends, leading to a more accurate and robust long-term forecast.
moodys_trans_tbl %>%
select(date, home_price_yoy) %>%
plot_anomaly_diagnostics(
.date_var = date,
.value = home_price_yoy,
.alpha = 0.02,
.max_anomalies = 0.01
)moodys_trans_tbl %>%
select(date, home_price_yoy) %>%
tk_anomaly_diagnostics(
.date_var = date,
.value = home_price_yoy,
.alpha = 0.02,
.max_anomalies = 0.01
)# A tibble: 166 × 11
date observed season trend remainder seasadj remainder_l1 remainder_l2
<date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1984-01-01 4.36 0.333 9.77 -5.74 4.03 -22.8 23.3
2 1984-04-01 9.17 -0.0893 9.59 -0.328 9.26 -22.8 23.3
3 1984-07-01 6.65 -0.354 9.40 -2.40 7.00 -22.8 23.3
4 1984-10-01 5.56 0.110 9.22 -3.77 5.45 -22.8 23.3
5 1985-01-01 9.12 0.333 9.04 -0.249 8.79 -22.8 23.3
6 1985-04-01 10.0 -0.0893 8.83 1.29 10.1 -22.8 23.3
7 1985-07-01 5.23 -0.354 8.62 -3.04 5.58 -22.8 23.3
8 1985-10-01 7.93 0.110 8.41 -0.588 7.82 -22.8 23.3
9 1986-01-01 11.3 0.333 8.20 2.74 10.9 -22.8 23.3
10 1986-04-01 9.23 -0.0893 7.98 1.35 9.32 -22.8 23.3
# ℹ 156 more rows
# ℹ 3 more variables: anomaly <chr>, recomposed_l1 <dbl>, recomposed_l2 <dbl>
2.5 ACF Diagnostics
moodys_trans_tbl %>%
plot_acf_diagnostics(date, home_price_yoy,
.ccf_vars = c(income_median_yoy, contains("diff")),
# .show_ccf_vars_only = TRUE,
.facet_ncol = 1,
.lags = 36,
.title = "HP corr with its own lags at 4,8,12, m_income at 11,12,13 and foreclosure at 7,8,9 after differencing")2.6 Recommendation
- seasonality seems minimal
- corr with its own lags of 4,8,12; sign of remaining seasonality?
- corr with foreclosure at 7,8,9 after differencing
- corr with median income at 11 through 17
3.0 Identify features
Identify possible features for modeling.
data preparation
# Prep data
moodys_trans_tbl <- unemployment_tbl %>%
# log transform for variance reduction
mutate(rate_trans = log(rate)) %>%
# standardize for easy comparison and some ML algorithms
mutate(rate_trans = standardize_vec(rate_trans)) %>%
# Cleaning for the Pandemic
mutate(home_price_yoy = ts_clean_vec(rate_trans, period = 12)) %>%
select(-rate, -rate_trans)
moodys_trans_tbl %>%
pivot_longer(contains("trans")) %>%
plot_time_series(date, value, name, .smooth_span = 0.2)
# Save Key Params
std_mean_rate <- 1.34702775921281
std_sd_rate <- 0.3527697665249413.1 Time-based features
* Time Series Signature
moodys_trans_signature_tbl <- moodys_trans_tbl %>%
tk_augment_timeseries_signature() %>%
select(-diff, -ends_with("iso"), -ends_with(".xts"),
-contains("month"), -contains("week"), -contains("day"),
-contains("hour"), -contains("minute"), -contains("second"), -contains("am.pm"))
moodys_trans_signature_tbl %>% glimpse()Rows: 166
Columns: 15
$ date <date> 1984-01-01, 1984-04-01, 1984-07-01, 1984-10-01…
$ home_price_yoy <dbl> 4.363144, 9.169452, 6.646419, 5.563444, 9.11999…
$ home_price_us_yoy <dbl> 4.319844, 4.399065, 4.510958, 4.643825, 4.39238…
$ income_median_yoy <dbl> 12.502231, 16.102115, 18.401050, 19.449061, 19.…
$ income_wage_yoy <dbl> 10.026114, 8.632716, 9.053278, 9.763479, 8.8677…
$ home_inv_yoy <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ rent_yoy <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ net_migr_yoy <dbl> -1.142027e+02, -1.286330e+02, -1.501123e+02, -3…
$ mort_forecl_lag4_diff1 <dbl> 0.64437, 0.00000, 0.00000, 0.00000, -0.64437, 0…
$ mort_delinq_lag4_diff1 <dbl> 4.31668, 3.89201, -2.11274, -2.46383, -6.79729,…
$ unemp_rate_lag4_diff1 <dbl> -1.88298, -1.80066, -1.53282, -1.18839, -0.7479…
$ index.num <dbl> 441763200, 449625600, 457488000, 465436800, 473…
$ year <int> 1984, 1984, 1984, 1984, 1985, 1985, 1985, 1985,…
$ half <int> 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2,…
$ quarter <int> 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4,…
* Trend-Based Features
nonlinear natural spline model seems the best.
# ** Linear Trend
moodys_trans_signature_tbl %>%
plot_time_series_regression(
date,
.formula = home_price_yoy ~ index.num,
.show_summary = TRUE
)
Call:
stats::lm(formula = .formula, data = df)
Residuals:
Min 1Q Median 3Q Max
-8.083 -4.432 -1.661 3.609 16.251
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 5.372e+00 1.306e+00 4.115 6.12e-05 ***
index.num -2.830e-10 1.129e-09 -0.251 0.802
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 5.5 on 164 degrees of freedom
Multiple R-squared: 0.0003828, Adjusted R-squared: -0.005712
F-statistic: 0.0628 on 1 and 164 DF, p-value: 0.8024
# ** Nonlinear Trend - Basis Splines
moodys_trans_signature_tbl %>%
plot_time_series_regression(
date,
.formula = home_price_yoy ~ splines::bs(index.num, df = 4),
.show_summary = TRUE
)
Call:
stats::lm(formula = .formula, data = df)
Residuals:
Min 1Q Median 3Q Max
-10.613 -3.009 -1.081 2.344 13.004
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 13.657 1.653 8.260 5.04e-14 ***
splines::bs(index.num, df = 4)1 -17.600 3.408 -5.165 7.03e-07 ***
splines::bs(index.num, df = 4)2 2.694 2.703 0.997 0.320
splines::bs(index.num, df = 4)3 -21.121 3.035 -6.959 8.22e-11 ***
splines::bs(index.num, df = 4)4 2.809 2.191 1.282 0.202
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 4.587 on 161 degrees of freedom
Multiple R-squared: 0.3176, Adjusted R-squared: 0.3007
F-statistic: 18.74 on 4 and 161 DF, p-value: 1.155e-12
moodys_trans_signature_tbl %>%
plot_time_series_regression(
date,
.formula = home_price_yoy ~ splines::ns(index.num,
knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9))),
.show_summary = TRUE
)
Call:
stats::lm(formula = .formula, data = df)
Residuals:
Min 1Q Median 3Q Max
-10.224 -2.486 -0.454 1.557 10.774
Coefficients:
Estimate
(Intercept) 14.587
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))1 3.092
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))2 -25.667
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))3 1.602
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))4 -21.892
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))5 8.985
Std. Error
(Intercept) 1.142
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))1 1.499
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))2 2.046
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))3 1.727
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))4 3.018
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))5 1.588
t value
(Intercept) 12.771
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))1 2.063
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))2 -12.544
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))3 0.928
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))4 -7.254
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))5 5.658
Pr(>|t|)
(Intercept) < 2e-16
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))1 0.0407
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))2 < 2e-16
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))3 0.3550
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))4 1.64e-11
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))5 6.90e-08
(Intercept) ***
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))1 *
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))2 ***
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))3
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))4 ***
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))5 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 3.671 on 160 degrees of freedom
Multiple R-squared: 0.5655, Adjusted R-squared: 0.5519
F-statistic: 41.65 on 5 and 160 DF, p-value: < 2.2e-16
* Seasonal Features
No seasonality detected
# ** Monthly Seasonality
moodys_trans_signature_tbl %>%
plot_time_series_regression(
date,
.formula = home_price_yoy ~ quarter,
.show_summary = TRUE
)
Call:
stats::lm(formula = .formula, data = df)
Residuals:
Min 1Q Median 3Q Max
-8.061 -4.518 -1.794 3.409 16.073
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 5.17011 1.04172 4.963 1.72e-06 ***
quarter -0.04305 0.38192 -0.113 0.91
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 5.501 on 164 degrees of freedom
Multiple R-squared: 7.747e-05, Adjusted R-squared: -0.00602
F-statistic: 0.01271 on 1 and 164 DF, p-value: 0.9104
3.2 INTERACTIONS
How do I know possible interactions?
model_formula_interactions <- as.formula(
home_price_yoy ~ splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))
+ .
+ (as.factor(quarter) * month.lbl)
)
unemployment_trans_signature_tbl %>% glimpse()
unemployment_trans_signature_tbl %>%
plot_time_series_regression(
date,
.formula = model_formula_interactions,
.show_summary = TRUE
)3.3 FOURIER SERIES
How do I pick periods?
- For daily data with over a year of observations, include the following fourier series: 7 (weekly), 30 (monthly), 90 (quarterly), and 365 (yearly).
- Look for the duration of up and down wavy cycles from the ACF chart.
Fourier series may not be a good fit for the unemployment rate, which doesn’t display a wavy pattern, a frequent and regular patterns of ups and downs.
# - tk_augment_fourier
# Data Prep
unemployment_trans_signature_tbl %>%
plot_acf_diagnostics(date, home_price_yoy)
unemployment_trans_fourier_tbl <- unemployment_trans_signature_tbl %>%
tk_augment_fourier(date, .periods = c(6, 13, 25), .K = 2)
unemployment_trans_fourier_tbl %>% glimpse()
# Model
model_formula_fourier <- as.formula(
home_price_yoy ~ splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))
+ .
+ (as.factor(quarter) * month.lbl)
)
# Visualize
unemployment_trans_fourier_tbl %>%
# filter_by_time(.start_date = "2018-09-13") %>%
plot_time_series_regression(
date,
.formula = model_formula_fourier,
.show_summary = TRUE
)3.4 LAGS
How do I get
- .lags = (5*12):600: 5 year forecast for monthly data (12 months)
- .lags = c(87,135): significant spikes in ACF and/or PACF
Look for spikes in ACF/PACF beyond the forecasting time horizon to avoid forecasting external regressors.
# - tk_augment_lags()
# Data Prep
2*4 # 2 year forecast for quarterly data[1] 8
moodys_trans_signature_tbl %>%
plot_acf_diagnostics(
date, .value = home_price_yoy, .lags = (2*4):100
)moodys_trans_lags_tbl <- moodys_trans_signature_tbl %>%
tk_augment_lags(home_price_yoy, .lags = c(4,8,9))
moodys_trans_lags_tbl %>% glimpse()Rows: 166
Columns: 18
$ date <date> 1984-01-01, 1984-04-01, 1984-07-01, 1984-10-01…
$ home_price_yoy <dbl> 4.363144, 9.169452, 6.646419, 5.563444, 9.11999…
$ home_price_us_yoy <dbl> 4.319844, 4.399065, 4.510958, 4.643825, 4.39238…
$ income_median_yoy <dbl> 12.502231, 16.102115, 18.401050, 19.449061, 19.…
$ income_wage_yoy <dbl> 10.026114, 8.632716, 9.053278, 9.763479, 8.8677…
$ home_inv_yoy <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ rent_yoy <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ net_migr_yoy <dbl> -1.142027e+02, -1.286330e+02, -1.501123e+02, -3…
$ mort_forecl_lag4_diff1 <dbl> 0.64437, 0.00000, 0.00000, 0.00000, -0.64437, 0…
$ mort_delinq_lag4_diff1 <dbl> 4.31668, 3.89201, -2.11274, -2.46383, -6.79729,…
$ unemp_rate_lag4_diff1 <dbl> -1.88298, -1.80066, -1.53282, -1.18839, -0.7479…
$ index.num <dbl> 441763200, 449625600, 457488000, 465436800, 473…
$ year <int> 1984, 1984, 1984, 1984, 1985, 1985, 1985, 1985,…
$ half <int> 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2,…
$ quarter <int> 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4,…
$ home_price_yoy_lag4 <dbl> NA, NA, NA, NA, 4.363144, 9.169452, 6.646419, 5…
$ home_price_yoy_lag8 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, 4.363144, 9.169…
$ home_price_yoy_lag9 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, 4.363144, 9…
# Model
model_formula <- as.formula(
home_price_yoy ~ splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))
+ .
- home_price_us_yoy
- income_median_yoy
- income_wage_yoy
- mort_forecl_lag4_diff1
- home_inv_yoy
- rent_yoy
)
# Visualize
moodys_trans_lags_tbl %>%
plot_time_series_regression(
date,
.formula = model_formula,
.show_summary = TRUE
)
Call:
stats::lm(formula = .formula, data = df)
Residuals:
Min 1Q Median 3Q Max
-2.0896 -0.6676 0.0221 0.7188 1.8849
Coefficients: (4 not defined because of singularities)
Estimate
(Intercept) -3.853e+05
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))1 6.261e+03
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))2 2.094e+03
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))3 9.818e+02
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))4 NA
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))5 NA
date NA
net_migr_yoy -2.461e-02
mort_delinq_lag4_diff1 8.756e-01
unemp_rate_lag4_diff1 -3.786e-01
index.num NA
year 1.903e+02
half 9.797e-01
quarter 4.726e+01
home_price_yoy_lag4 -2.428e-01
home_price_yoy_lag8 -8.642e-01
home_price_yoy_lag9 2.474e-01
Std. Error
(Intercept) 3.844e+05
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))1 4.062e+03
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))2 2.233e+03
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))3 9.696e+02
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))4 NA
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))5 NA
date NA
net_migr_yoy 5.177e-03
mort_delinq_lag4_diff1 7.436e-01
unemp_rate_lag4_diff1 2.230e-01
index.num NA
year 1.898e+02
half 1.099e+00
quarter 4.733e+01
home_price_yoy_lag4 1.016e-01
home_price_yoy_lag8 1.383e-01
home_price_yoy_lag9 1.564e-01
t value
(Intercept) -1.002
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))1 1.541
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))2 0.938
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))3 1.013
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))4 NA
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))5 NA
date NA
net_migr_yoy -4.753
mort_delinq_lag4_diff1 1.178
unemp_rate_lag4_diff1 -1.698
index.num NA
year 1.002
half 0.891
quarter 0.999
home_price_yoy_lag4 -2.390
home_price_yoy_lag8 -6.249
home_price_yoy_lag9 1.582
Pr(>|t|)
(Intercept) 0.329441
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))1 0.140636
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))2 0.360840
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))3 0.324715
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))4 NA
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))5 NA
date NA
net_migr_yoy 0.000159
mort_delinq_lag4_diff1 0.254286
unemp_rate_lag4_diff1 0.106801
index.num NA
year 0.329415
half 0.384668
quarter 0.331254
home_price_yoy_lag4 0.028019
home_price_yoy_lag8 6.79e-06
home_price_yoy_lag9 0.130983
(Intercept)
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))1
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))2
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))3
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))4
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))5
date
net_migr_yoy ***
mort_delinq_lag4_diff1
unemp_rate_lag4_diff1
index.num
year
half
quarter
home_price_yoy_lag4 *
home_price_yoy_lag8 ***
home_price_yoy_lag9
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1.297 on 18 degrees of freedom
(135 observations deleted due to missingness)
Multiple R-squared: 0.967, Adjusted R-squared: 0.945
F-statistic: 43.95 on 12 and 18 DF, p-value: 7.993e-11
3.5 SPECIAL EVENTS
Created the dummy for the Pandemic starting from the month when the rate shot up to the month that gave us the largest R-squared.
moodys_trans_lags_pandemic_tbl <- moodys_trans_lags_tbl %>%
mutate(event = ifelse(between(date, as.Date("2020-04-01"),as.Date("2023-09-01")), 1, 0))
moodys_trans_lags_pandemic_tbl %>% glimpse()Rows: 166
Columns: 19
$ date <date> 1984-01-01, 1984-04-01, 1984-07-01, 1984-10-01…
$ home_price_yoy <dbl> 4.363144, 9.169452, 6.646419, 5.563444, 9.11999…
$ home_price_us_yoy <dbl> 4.319844, 4.399065, 4.510958, 4.643825, 4.39238…
$ income_median_yoy <dbl> 12.502231, 16.102115, 18.401050, 19.449061, 19.…
$ income_wage_yoy <dbl> 10.026114, 8.632716, 9.053278, 9.763479, 8.8677…
$ home_inv_yoy <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ rent_yoy <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ net_migr_yoy <dbl> -1.142027e+02, -1.286330e+02, -1.501123e+02, -3…
$ mort_forecl_lag4_diff1 <dbl> 0.64437, 0.00000, 0.00000, 0.00000, -0.64437, 0…
$ mort_delinq_lag4_diff1 <dbl> 4.31668, 3.89201, -2.11274, -2.46383, -6.79729,…
$ unemp_rate_lag4_diff1 <dbl> -1.88298, -1.80066, -1.53282, -1.18839, -0.7479…
$ index.num <dbl> 441763200, 449625600, 457488000, 465436800, 473…
$ year <int> 1984, 1984, 1984, 1984, 1985, 1985, 1985, 1985,…
$ half <int> 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2,…
$ quarter <int> 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4,…
$ home_price_yoy_lag4 <dbl> NA, NA, NA, NA, 4.363144, 9.169452, 6.646419, 5…
$ home_price_yoy_lag8 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, 4.363144, 9.169…
$ home_price_yoy_lag9 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, 4.363144, 9…
$ event <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
g <- moodys_trans_lags_pandemic_tbl %>%
plot_time_series(date, home_price_yoy, .interactive = FALSE) +
geom_point(color = "red", data = . %>% filter(event == 1))
plotly::ggplotly(g)# Model
model_formula <- as.formula(
home_price_yoy ~ splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))
+ .
- home_price_us_yoy
- income_median_yoy
- mort_forecl_lag4_diff1
)
# Visualize
moodys_trans_lags_pandemic_tbl %>%
plot_time_series_regression(
date,
.formula = model_formula,
.show_summary = TRUE
)
Call:
stats::lm(formula = .formula, data = df)
Residuals:
Min 1Q Median 3Q Max
-1.2219 -0.5268 0.1283 0.4601 1.5931
Coefficients: (4 not defined because of singularities)
Estimate
(Intercept) -1.463e+05
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))1 2.594e+03
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))2 7.689e+02
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))3 3.671e+02
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))4 NA
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))5 NA
date NA
income_wage_yoy 7.353e-01
home_inv_yoy -5.036e-02
rent_yoy 3.925e-02
net_migr_yoy -2.217e-02
mort_delinq_lag4_diff1 9.268e-01
unemp_rate_lag4_diff1 3.366e-01
index.num NA
year 7.227e+01
half 1.308e+00
quarter 1.779e+01
home_price_yoy_lag4 -1.448e-01
home_price_yoy_lag8 -7.829e-01
home_price_yoy_lag9 3.954e-01
event 5.655e-01
Std. Error
(Intercept) 3.204e+05
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))1 3.472e+03
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))2 1.857e+03
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))3 8.094e+02
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))4 NA
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))5 NA
date NA
income_wage_yoy 3.010e-01
home_inv_yoy 1.980e-02
rent_yoy 2.358e-01
net_migr_yoy 5.456e-03
mort_delinq_lag4_diff1 6.367e-01
unemp_rate_lag4_diff1 3.714e-01
index.num NA
year 1.582e+02
half 8.673e-01
quarter 3.946e+01
home_price_yoy_lag4 1.171e-01
home_price_yoy_lag8 1.239e-01
home_price_yoy_lag9 1.321e-01
event 1.465e+00
t value
(Intercept) -0.457
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))1 0.747
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))2 0.414
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))3 0.454
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))4 NA
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))5 NA
date NA
income_wage_yoy 2.443
home_inv_yoy -2.543
rent_yoy 0.166
net_migr_yoy -4.062
mort_delinq_lag4_diff1 1.456
unemp_rate_lag4_diff1 0.906
index.num NA
year 0.457
half 1.508
quarter 0.451
home_price_yoy_lag4 -1.237
home_price_yoy_lag8 -6.318
home_price_yoy_lag9 2.993
event 0.386
Pr(>|t|)
(Intercept) 0.65488
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))1 0.46737
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))2 0.68511
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))3 0.65710
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))4 NA
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))5 NA
date NA
income_wage_yoy 0.02843
home_inv_yoy 0.02342
rent_yoy 0.87017
net_migr_yoy 0.00117
mort_delinq_lag4_diff1 0.16753
unemp_rate_lag4_diff1 0.38020
index.num NA
year 0.65485
half 0.15372
quarter 0.65893
home_price_yoy_lag4 0.23645
home_price_yoy_lag8 1.9e-05
home_price_yoy_lag9 0.00968
event 0.70529
(Intercept)
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))1
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))2
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))3
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))4
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))5
date
income_wage_yoy *
home_inv_yoy *
rent_yoy
net_migr_yoy **
mort_delinq_lag4_diff1
unemp_rate_lag4_diff1
index.num
year
half
quarter
home_price_yoy_lag4
home_price_yoy_lag8 ***
home_price_yoy_lag9 **
event
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1.012 on 14 degrees of freedom
(135 observations deleted due to missingness)
Multiple R-squared: 0.9844, Adjusted R-squared: 0.9666
F-statistic: 55.2 on 16 and 14 DF, p-value: 7.022e-10
3.6 EXTERNAL LAGGED REGRESSORS
- Examine cross correlation (CC) between home_price_yoy and diff mort variables.
- Look for spikes in CC beyond the forecasting time horizon to avoid forecasting external regressors.
- Beyond 2 (years) * 4 (quarters) forecasting horizon, * delinquency: no spikes beyond 8 quarters * foreclosure: spikes at 7,8 * us home price: spikes at 8-18 * median income: 28 * wage: 1-9
# Identify corr predictors with 4-5 quarter lag
# income_median_yoy(5), home_inv_yoy(5), mort_forecl_lag4_diff1
moodys_trans_lags_tbl %>%
plot_acf_diagnostics(
date, home_price_yoy,
.ccf_vars = -c(date, home_price_yoy, home_price_yoy_lag4, home_price_yoy_lag8,
home_price_yoy_lag9,
index.num, year, half, quarter),
.show_ccf_vars_only = TRUE,
.facet_ncol = 3,
.lags = 10
)moodys_trans_ext_lags_tbl <- moodys_trans_lags_tbl %>%
tk_augment_lags(home_price_us_yoy, .lags = c(4)) %>%
tk_augment_lags(income_wage_yoy, .lags = c(4)) %>%
tk_augment_lags(rent_yoy, .lags = c(4)) %>%
tk_augment_lags(income_median_yoy, .lags = c(5)) %>%
tk_augment_lags(home_inv_yoy, .lags = c(5)) %>%
tk_augment_lags(mort_forecl_lag4_diff1, .lags = c(7)) %>%
select(-home_price_us_yoy, -income_median_yoy,
-income_wage_yoy, -mort_forecl_lag4_diff1,
-home_inv_yoy, rent_yoy)
moodys_trans_ext_lags_tbl %>% glimpse()Rows: 166
Columns: 19
$ date <date> 1984-01-01, 1984-04-01, 1984-07-01, 1984-…
$ home_price_yoy <dbl> 4.363144, 9.169452, 6.646419, 5.563444, 9.…
$ rent_yoy <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ net_migr_yoy <dbl> -1.142027e+02, -1.286330e+02, -1.501123e+0…
$ mort_delinq_lag4_diff1 <dbl> 4.31668, 3.89201, -2.11274, -2.46383, -6.7…
$ unemp_rate_lag4_diff1 <dbl> -1.88298, -1.80066, -1.53282, -1.18839, -0…
$ index.num <dbl> 441763200, 449625600, 457488000, 465436800…
$ year <int> 1984, 1984, 1984, 1984, 1985, 1985, 1985, …
$ half <int> 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, …
$ quarter <int> 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, …
$ home_price_yoy_lag4 <dbl> NA, NA, NA, NA, 4.363144, 9.169452, 6.6464…
$ home_price_yoy_lag8 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, 4.363144, …
$ home_price_yoy_lag9 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, 4.3631…
$ home_price_us_yoy_lag4 <dbl> NA, NA, NA, NA, 4.319844, 4.399065, 4.5109…
$ income_wage_yoy_lag4 <dbl> NA, NA, NA, NA, 10.026114, 8.632716, 9.053…
$ rent_yoy_lag4 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ income_median_yoy_lag5 <dbl> NA, NA, NA, NA, NA, 12.502231, 16.102115, …
$ home_inv_yoy_lag5 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ mort_forecl_lag4_diff1_lag7 <dbl> NA, NA, NA, NA, NA, NA, NA, 0.64437, 0.000…
# Model
model_formula <- as.formula(
home_price_yoy ~ splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.50, 0.7, 0.9)))
+ .
)
# Visualize
moodys_trans_ext_lags_tbl %>%
plot_time_series_regression(
.date_var = date,
.formula = model_formula,
.show_summary = TRUE
)
Call:
stats::lm(formula = .formula, data = df)
Residuals:
Min 1Q Median 3Q Max
-0.57919 -0.15150 -0.01948 0.21653 0.66630
Coefficients: (4 not defined because of singularities)
Estimate
(Intercept) -2.938e+05
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))1 9.665e+03
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))2 1.477e+03
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))3 7.810e+02
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))4 NA
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))5 NA
date NA
rent_yoy -1.020e+00
net_migr_yoy -5.920e-03
mort_delinq_lag4_diff1 -1.764e+00
unemp_rate_lag4_diff1 2.527e-01
index.num NA
year 1.451e+02
half 4.331e-01
quarter 3.606e+01
home_price_yoy_lag4 2.689e-01
home_price_yoy_lag8 -6.048e-01
home_price_yoy_lag9 9.665e-02
home_price_us_yoy_lag4 -5.038e-01
income_wage_yoy_lag4 3.326e-01
rent_yoy_lag4 6.260e-01
income_median_yoy_lag5 4.449e-01
home_inv_yoy_lag5 1.658e-02
mort_forecl_lag4_diff1_lag7 -1.274e+01
Std. Error
(Intercept) 2.268e+05
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))1 3.148e+03
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))2 1.305e+03
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))3 5.715e+02
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))4 NA
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))5 NA
date NA
rent_yoy 2.774e-01
net_migr_yoy 6.135e-03
mort_delinq_lag4_diff1 1.016e+00
unemp_rate_lag4_diff1 2.155e-01
index.num NA
year 1.120e+02
half 6.629e-01
quarter 2.791e+01
home_price_yoy_lag4 4.106e-01
home_price_yoy_lag8 1.364e-01
home_price_yoy_lag9 1.549e-01
home_price_us_yoy_lag4 3.508e-01
income_wage_yoy_lag4 1.065e-01
rent_yoy_lag4 2.719e-01
income_median_yoy_lag5 1.383e-01
home_inv_yoy_lag5 3.825e-02
mort_forecl_lag4_diff1_lag7 2.786e+00
t value
(Intercept) -1.295
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))1 3.070
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))2 1.131
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))3 1.367
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))4 NA
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))5 NA
date NA
rent_yoy -3.676
net_migr_yoy -0.965
mort_delinq_lag4_diff1 -1.736
unemp_rate_lag4_diff1 1.172
index.num NA
year 1.295
half 0.653
quarter 1.292
home_price_yoy_lag4 0.655
home_price_yoy_lag8 -4.433
home_price_yoy_lag9 0.624
home_price_us_yoy_lag4 -1.436
income_wage_yoy_lag4 3.123
rent_yoy_lag4 2.302
income_median_yoy_lag5 3.217
home_inv_yoy_lag5 0.433
mort_forecl_lag4_diff1_lag7 -4.572
Pr(>|t|)
(Intercept) 0.23627
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))1 0.01807
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))2 0.29512
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))3 0.21401
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))4 NA
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))5 NA
date NA
rent_yoy 0.00791
net_migr_yoy 0.36668
mort_delinq_lag4_diff1 0.12609
unemp_rate_lag4_diff1 0.27945
index.num NA
year 0.23625
half 0.53438
quarter 0.23733
home_price_yoy_lag4 0.53341
home_price_yoy_lag8 0.00303
home_price_yoy_lag9 0.55243
home_price_us_yoy_lag4 0.19412
income_wage_yoy_lag4 0.01678
rent_yoy_lag4 0.05479
income_median_yoy_lag5 0.01472
home_inv_yoy_lag5 0.67776
mort_forecl_lag4_diff1_lag7 0.00257
(Intercept)
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))1 *
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))2
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))3
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))4
splines::ns(index.num, knots = quantile(index.num, probs = c(0.25, 0.5, 0.7, 0.9)))5
date
rent_yoy **
net_migr_yoy
mort_delinq_lag4_diff1
unemp_rate_lag4_diff1
index.num
year
half
quarter
home_price_yoy_lag4
home_price_yoy_lag8 **
home_price_yoy_lag9
home_price_us_yoy_lag4
income_wage_yoy_lag4 *
rent_yoy_lag4 .
income_median_yoy_lag5 *
home_inv_yoy_lag5
mort_forecl_lag4_diff1_lag7 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.625 on 7 degrees of freedom
(139 observations deleted due to missingness)
Multiple R-squared: 0.9965, Adjusted R-squared: 0.9871
F-statistic: 105.3 on 19 and 7 DF, p-value: 8.434e-07
# home price vs inventory (~50% explained)
model_formula <- as.formula(
home_price_yoy ~ mort_forecl_lag4_diff1_lag7 +
income_median_yoy_lag5 + home_inv_yoy_lag5
)
moodys_trans_ext_lags_tbl %>%
plot_time_series_regression(date,
model_formula,
.show_summary = T)
Call:
stats::lm(formula = .formula, data = df)
Residuals:
Min 1Q Median 3Q Max
-4.6000 -2.8468 -0.0186 2.1743 5.5048
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2.2347 1.5875 1.408 0.172612
mort_forecl_lag4_diff1_lag7 -2.2416 7.8551 -0.285 0.777914
income_median_yoy_lag5 0.6245 0.2033 3.072 0.005391 **
home_inv_yoy_lag5 -0.1781 0.0384 -4.637 0.000115 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 3.304 on 23 degrees of freedom
(139 observations deleted due to missingness)
Multiple R-squared: 0.68, Adjusted R-squared: 0.6382
F-statistic: 16.29 on 3 and 23 DF, p-value: 6.771e-06
moodys_trans_ext_lags_tbl %>%
select(date, home_price_yoy, mort_forecl_lag4_diff1_lag7, income_median_yoy_lag5, home_inv_yoy_lag5, rent_yoy) %>%
# standardize for easy comparison and some ML algorithms
mutate(across(where(is.numeric), standardize_vec)) %>%
drop_na() %>%
pivot_longer(-date) %>%
plot_time_series(date, value, name, .smooth = FALSE)3.7 RECOMMENDATION
- Time_based feastures
- trend-based feature (0.55): yes (splines::ns)
- seasonal feature (): no
- Lags (0.69): yes
- Special Event (0.69): no
- External Lags (0.73): yes
4.0 Prep Data
4.1 Create full dataset
- Extend to Future Window
- Add any lags to full dataset
- Add any external regressors to full dataset
How do I determine? rolling_periods <- c(30, 60, 90)
horizon <- 4
lag_period <- c(4) # first spike in ACF/PACF beyond the forecasting time horizon
rolling_periods <- c(4, 8, 12)
data_prepared_full_tbl <- moodys_trans_tbl %>%
# Add future window
bind_rows(
future_frame(.data = ., .date_var = date, .length_out = horizon)
) %>%
# Add external lags
tk_augment_lags(mort_forecl_lag4_diff1, .lags = c(7)) %>%
# tk_augment_lags(home_price_us_yoy, .lags = c(4)) %>%
# tk_augment_lags(income_wage_yoy, .lags = c(4)) %>%
# tk_augment_lags(rent_yoy, .lags = c(4)) %>%
tk_augment_lags(income_median_yoy, .lags = c(5)) %>%
tk_augment_lags(home_inv_yoy, .lags = c(5)) %>%
select(-home_price_us_yoy, -income_wage_yoy, -rent_yoy, -income_median_yoy,
-mort_forecl_lag4_diff1, -home_inv_yoy, -unemp_rate_lag4_diff1,
-mort_delinq_lag4_diff1, -net_migr_yoy) %>%
# Add Autocorrelated Lags
# tk_augment_lags(home_price_yoy, .lags = lag_period) %>%
# Add rolling features
# tk_augment_slidify(
# .value = home_price_yoy_lag4, # time horizon
# .f = mean,
# .period = rolling_periods,
# .align = "center",
# .partial = TRUE
# ) %>%
# Format Columns
rename_with(.cols = contains("lag"), .fn = ~ str_c("lag_", .))
data_prepared_full_tbl %>%
pivot_longer(-date) %>%
plot_time_series(date, value, name, .smooth = FALSE)data_prepared_full_tbl %>% tail(5*12 + 1)# A tibble: 61 × 5
date home_price_yoy lag_mort_forecl_lag4_diff1…¹ lag_income_median_yo…²
<date> <dbl> <dbl> <dbl>
1 2011-04-01 -1.35 0.121 -5.39
2 2011-07-01 -1.36 -0.0105 -6.25
3 2011-10-01 -0.432 0.0899 -4.72
4 2012-01-01 -0.178 -0.0103 -0.668
5 2012-04-01 0.657 0.205 3.72
6 2012-07-01 0.716 0.0944 7.81
7 2012-10-01 -0.505 -0.0984 9.12
8 2013-01-01 0.319 0.0302 6.68
9 2013-04-01 -0.225 -0.205 3.71
10 2013-07-01 -0.0640 -0.0644 0.811
# ℹ 51 more rows
# ℹ abbreviated names: ¹lag_mort_forecl_lag4_diff1_lag7,
# ²lag_income_median_yoy_lag5
# ℹ 1 more variable: lag_home_inv_yoy_lag5 <dbl>
4.2 Separate into model and forecast data
data_prepared_full_tbl %>% tail(61)# A tibble: 61 × 5
date home_price_yoy lag_mort_forecl_lag4_diff1…¹ lag_income_median_yo…²
<date> <dbl> <dbl> <dbl>
1 2011-04-01 -1.35 0.121 -5.39
2 2011-07-01 -1.36 -0.0105 -6.25
3 2011-10-01 -0.432 0.0899 -4.72
4 2012-01-01 -0.178 -0.0103 -0.668
5 2012-04-01 0.657 0.205 3.72
6 2012-07-01 0.716 0.0944 7.81
7 2012-10-01 -0.505 -0.0984 9.12
8 2013-01-01 0.319 0.0302 6.68
9 2013-04-01 -0.225 -0.205 3.71
10 2013-07-01 -0.0640 -0.0644 0.811
# ℹ 51 more rows
# ℹ abbreviated names: ¹lag_mort_forecl_lag4_diff1_lag7,
# ²lag_income_median_yoy_lag5
# ℹ 1 more variable: lag_home_inv_yoy_lag5 <dbl>
data_prepared_tbl <- data_prepared_full_tbl %>%
filter(!is.na(home_price_yoy))
data_prepared_tbl# A tibble: 166 × 5
date home_price_yoy lag_mort_forecl_lag4_diff1…¹ lag_income_median_yo…²
<date> <dbl> <dbl> <dbl>
1 1984-01-01 4.36 NA NA
2 1984-04-01 9.17 NA NA
3 1984-07-01 6.65 NA NA
4 1984-10-01 5.56 NA NA
5 1985-01-01 9.12 NA NA
6 1985-04-01 10.0 NA 12.5
7 1985-07-01 5.23 NA 16.1
8 1985-10-01 7.93 0.644 18.4
9 1986-01-01 11.3 0 19.4
10 1986-04-01 9.23 0 20.0
# ℹ 156 more rows
# ℹ abbreviated names: ¹lag_mort_forecl_lag4_diff1_lag7,
# ²lag_income_median_yoy_lag5
# ℹ 1 more variable: lag_home_inv_yoy_lag5 <dbl>
forecast_tbl <- data_prepared_full_tbl %>%
filter(is.na(home_price_yoy))
forecast_tbl# A tibble: 4 × 5
date home_price_yoy lag_mort_forecl_lag4_diff1_…¹ lag_income_median_yo…²
<date> <dbl> <dbl> <dbl>
1 2025-07-01 NA 0.0151 5.83
2 2025-10-01 NA 0.00864 4.69
3 2026-01-01 NA -0.00741 3.88
4 2026-04-01 NA 0.0253 4.03
# ℹ abbreviated names: ¹lag_mort_forecl_lag4_diff1_lag7,
# ²lag_income_median_yoy_lag5
# ℹ 1 more variable: lag_home_inv_yoy_lag5 <dbl>
4.3 Split model data into training/test
data_prepared_tbl# A tibble: 166 × 5
date home_price_yoy lag_mort_forecl_lag4_diff1…¹ lag_income_median_yo…²
<date> <dbl> <dbl> <dbl>
1 1984-01-01 4.36 NA NA
2 1984-04-01 9.17 NA NA
3 1984-07-01 6.65 NA NA
4 1984-10-01 5.56 NA NA
5 1985-01-01 9.12 NA NA
6 1985-04-01 10.0 NA 12.5
7 1985-07-01 5.23 NA 16.1
8 1985-10-01 7.93 0.644 18.4
9 1986-01-01 11.3 0 19.4
10 1986-04-01 9.23 0 20.0
# ℹ 156 more rows
# ℹ abbreviated names: ¹lag_mort_forecl_lag4_diff1_lag7,
# ²lag_income_median_yoy_lag5
# ℹ 1 more variable: lag_home_inv_yoy_lag5 <dbl>
splits <- time_series_split(data_prepared_tbl, assess = horizon, cumulative = TRUE)
splits %>%
tk_time_series_cv_plan() %>%
plot_time_series_cv_plan(date, home_price_yoy)5.0 Specify recipes
- Time Series Signature - Adds bulk time-based features
- Spline Transformation to index.num
- External lags
# recipe_spec_base <- recipe(home_price_yoy ~ ., data = training(splits)) %>%
#
# # Time Series Signature
# step_timeseries_signature(date) %>%
# step_rm(matches("(iso)|(xts)|(month)|(week)|(day)|(hour)|(minute)|(second)|(am.pm)")) %>%
#
# # Standardization
# step_normalize(matches("(index.num)|(year)")) %>%
#
# # Dummy Encoding (One Hot Encoding)
# step_dummy(all_nominal(), one_hot = TRUE)
recipe_spec_base <- recipe(home_price_yoy ~ ., data = training(splits)) %>%
step_normalize(-date)
recipe_spec_base %>% prep() %>% juice() %>% glimpse()Rows: 162
Columns: 5
$ date <date> 1984-01-01, 1984-04-01, 1984-07-01, 1…
$ lag_mort_forecl_lag4_diff1_lag7 <dbl> NA, NA, NA, NA, NA, NA, NA, 4.97304322…
$ lag_income_median_yoy_lag5 <dbl> NA, NA, NA, NA, NA, 1.3252670, 1.87522…
$ lag_home_inv_yoy_lag5 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ home_price_yoy <dbl> -0.11763205, 0.74996427, 0.29452655, 0…
6.0 Specify models
6.1 GLMNet
# - Strengths: Very good for trend
# - Weaknesses: Not as good for complex patterns (i.e. seasonality)
model_spec_glmnet <- linear_reg(
penalty = 0.3,
mixture = 1
) %>%
set_engine("glmnet")
recipe_spec_2_lag <- recipe_spec_base %>%
step_rm(date) %>% # Remove linear features
step_naomit(contains("lag"))
recipe_spec_2_lag %>% prep() %>% juice() %>% glimpse()Rows: 23
Columns: 4
$ lag_mort_forecl_lag4_diff1_lag7 <dbl> -0.02515893, -0.21705115, 0.33362982, …
$ lag_income_median_yoy_lag5 <dbl> -0.65323697, -0.26081530, 0.06713234, …
$ lag_home_inv_yoy_lag5 <dbl> 0.5522146, 0.5640290, 0.3671761, 0.442…
$ home_price_yoy <dbl> -0.54514262, -0.51076880, -0.29986892,…
workflow_fit_glmnet <- workflow() %>%
add_recipe(recipe_spec_2_lag) %>%
add_model(model_spec_glmnet) %>%
fit(training(splits))
workflow_fit_glmnet %>%
# Pull the fitted model object from the workflow
extract_fit_parsnip() %>%
# Get the coefficient estimates from the glmnet fit (using s=0.1, the penalty value)
broom::tidy() %>%
# Filter to show only predictors where the estimate is not zero
# The 'term' column contains the predictor names.
filter(estimate != 0) %>%
# Arrange by the absolute value of the estimate to see the most important predictors
arrange(desc(abs(estimate)))# A tibble: 3 × 3
term estimate penalty
<chr> <dbl> <dbl>
1 (Intercept) 0.801 0.3
2 lag_home_inv_yoy_lag5 -0.429 0.3
3 lag_income_median_yoy_lag5 0.371 0.3
6.2 XGBOOST
# - Strengths: Best for seasonality & complex patterns
# - Weaknesses:
# - Cannot predict beyond the maximum/minimum target (e.g. increasing trend)
# - Solution: Model trend separately (if needed).
# - Can combine with ARIMA, Linear Regression, Mars, or Prophet
# - prophet_boost & arima_boost: Do this
# Implementation
model_spec_boost <- boost_tree(
mode = "regression",
mtry = 3,
trees = 500,
min_n = 2,
tree_depth = 12,
learn_rate = 0.3,
loss_reduction = 0
) %>%
set_engine("xgboost")
# spline
# recipe_spec_1 <- recipe_spec_base %>%
# step_rm(date) %>%
# step_ns(ends_with("index.num"), deg_free = 4) %>%
# step_rm(starts_with("lag_"))
#
# recipe_spec_1 %>% prep() %>% juice() %>% glimpse()
#
# set.seed(123)
# wflw_fit_xgboost_spline <- workflow() %>%
# add_recipe(recipe_spec_1) %>%
# add_model(model_spec_boost) %>%
# fit(training(splits))
#
# # Extract the fitted model and display Variable Importance
# wflw_fit_xgboost_spline %>%
# # Pull the fitted model object
# pull_workflow_fit() %>%
# # Use the vip package to calculate and plot variable importance
# vip(num_features = 10, # Show the top 10 most important features
# geom = "point") # Use a point plot for a clean display
# Lag
recipe_spec_2 <- recipe_spec_base %>%
step_rm(date) %>%
step_naomit(starts_with("lag_"))
set.seed(123)
wflw_fit_xgboost_lag <- workflow() %>%
add_recipe(recipe_spec_2_lag) %>%
add_model(model_spec_boost) %>%
fit(training(splits))
# Extract the fitted model and display Variable Importance
wflw_fit_xgboost_lag %>%
# Pull the fitted model object
pull_workflow_fit() %>%
# Use the vip package to calculate and plot variable importance
vip(num_features = 10, # Show the top 10 most important features
geom = "point") # Use a point plot for a clean display
6.3 Spline
model_spec_lm <- linear_reg() %>%
set_engine("lm")
recipe_spec_1 <- recipe_spec_base %>%
step_rm(date) %>%
step_ns(ends_with("index.num"), deg_free = 4) %>%
step_rm(contains("roll"))
recipe_spec_1 %>% prep() %>% juice() %>% glimpse()Rows: 162
Columns: 4
$ lag_mort_forecl_lag4_diff1_lag7 <dbl> NA, NA, NA, NA, NA, NA, NA, 4.97304322…
$ lag_income_median_yoy_lag5 <dbl> NA, NA, NA, NA, NA, 1.3252670, 1.87522…
$ lag_home_inv_yoy_lag5 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ home_price_yoy <dbl> -0.11763205, 0.74996427, 0.29452655, 0…
workflow_fit_lm_1_spline <- workflow() %>%
add_model(model_spec_lm) %>%
add_recipe(recipe_spec_1) %>%
fit(training(splits))
workflow_fit_lm_1_spline %>%
pull_workflow_fit() %>%
pluck("fit") %>%
summary()
Call:
stats::lm(formula = ..y ~ ., data = data)
Residuals:
Min 1Q Median 3Q Max
-0.88175 -0.43174 0.01341 0.35097 1.03442
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.7310 0.1333 5.482 2.74e-05 ***
lag_mort_forecl_lag4_diff1_lag7 0.1787 0.2196 0.814 0.425815
lag_income_median_yoy_lag5 0.9475 0.2749 3.446 0.002705 **
lag_home_inv_yoy_lag5 -0.7847 0.1828 -4.293 0.000393 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.594 on 19 degrees of freedom
(139 observations deleted due to missingness)
Multiple R-squared: 0.7259, Adjusted R-squared: 0.6826
F-statistic: 16.77 on 3 and 19 DF, p-value: 1.433e-05
7.0 Compare model performance
7.1 Modeltime table
Organize
model_tbl <- modeltime_table(
workflow_fit_glmnet,
# wflw_fit_xgboost_spline,
wflw_fit_xgboost_lag,
workflow_fit_lm_1_spline
) %>%
# update_model_description(2, "XGBOOST - Spline Recipe") %>%
update_model_description(2, "XGBOOST - Lag Recipe")
model_tbl# Modeltime Table
# A tibble: 3 × 3
.model_id .model .model_desc
<int> <list> <chr>
1 1 <workflow> GLMNET
2 2 <workflow> XGBOOST - Lag Recipe
3 3 <workflow> LM
7.2 Calibration
# - Calculates residual model errors on test set
# - Gives us a true prediction error estimate when we model with confidence intervals
calibration_tbl <- model_tbl %>%
modeltime_calibrate(new_data = testing(splits))
calibration_tbl %>%
slice(1) %>%
unnest(.calibration_data)# A tibble: 4 × 8
.model_id .model .model_desc .type date .actual .prediction
<int> <list> <chr> <chr> <date> <dbl> <dbl>
1 1 <workflow> GLMNET Test 2024-07-01 0.235 0.890
2 1 <workflow> GLMNET Test 2024-10-01 0.761 0.817
3 1 <workflow> GLMNET Test 2025-01-01 0.297 0.359
4 1 <workflow> GLMNET Test 2025-04-01 0.151 -0.0954
# ℹ 1 more variable: .residuals <dbl>
7.3 Test accuracy
# - Calculates common accuracy measures
# - MAE, MAPE, MASE, SMAPE, RMSE, R-SQUARED
calibration_tbl %>%
modeltime_accuracy()# A tibble: 3 × 9
.model_id .model_desc .type mae mape mase smape rmse rsq
<int> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 GLMNET Test 0.255 118. 0.674 85.6 0.353 0.319
2 2 XGBOOST - Lag Recipe Test 0.326 127. 0.862 78.9 0.360 0.851
3 3 LM Test 0.623 305. 1.65 124. 0.746 0.247
# Table Modeltime Accuracy
calibration_tbl %>%
modeltime_accuracy(
metric_set = default_forecast_accuracy_metric_set()
) %>%
table_modeltime_accuracy(
.interactive = TRUE,
bordered = TRUE,
resizable = TRUE
)# Metric Sets
#?default_forecast_accuracy_metric_set
metric_set(mae, rmse, iic)A metric set, consisting of:
- `mae()`, a numeric metric | direction: minimize
- `rmse()`, a numeric metric | direction: minimize
- `iic()`, a numeric metric | direction: maximize
calibration_tbl %>%
modeltime_accuracy(
metric_set = metric_set(mae, rmse, iic)
)# A tibble: 3 × 6
.model_id .model_desc .type mae rmse iic
<int> <chr> <chr> <dbl> <dbl> <dbl>
1 1 GLMNET Test 0.255 0.353 0.540
2 2 XGBOOST - Lag Recipe Test 0.326 0.360 -0.519
3 3 LM Test 0.623 0.746 0.491
7.4 Test forecast
# - Visualize the out-of-sample forecast
calibration_tbl %>%
modeltime_forecast(
new_data = testing(splits),
actual_data = data_prepared_tbl,
conf_interval = 0.80
) %>%
plot_modeltime_forecast(
.legend_max_width = 60,
.legend_show = TRUE,
.conf_interval_show = TRUE,
.conf_interval_alpha = 0.20,
.conf_interval_fill = "lightblue",
.title = "NH Unemployment Rate Forecast"
)8.0 Refit
refit_tbl <- calibration_tbl %>%
modeltime_refit(data = data_prepared_tbl)9.0 Forecast
# * Final Forecast ----
# - 'new_data' vs 'h'
# - 'actual_data'
# - Preprocessing
refit_tbl %>%
modeltime_forecast(
# h = "8 weeks",
new_data = forecast_tbl,
actual_data = data_prepared_tbl,
conf_interval = 0.80
) %>%
plot_modeltime_forecast(
.legend_max_width = 25,
.conf_interval_fill = "lightblue",
.interactive = TRUE,
.legend_show = TRUE
)