Chase Farha
Predictive Analytics
Outcome Variable:
Predictor Variables:
I import data from January 2020 to August 2025 for each variable.
# Load required packages
library(fredr)
library(tidyverse)
library(tsibble)
library(lubridate)
library(fpp3)
library(zoo)
library(knitr)
# ---Import monthly Nonfarm Payrolls (PAYNSA) ---
nfp <- fredr(series_id = "PAYNSA") %>%
select(date, nfp = value) %>%
mutate(date = yearmonth(date)) %>%
filter(date >= yearmonth("2020 Jan")) %>%
as_tsibble(index = date)
#---Import predictor variable data from outside sources, already compiled into 1 csv
predictors <- read_csv("/Users/cfarha/Library/CloudStorage/OneDrive-Personal/Documents/BC/Predictive Analytics/private_employment.csv") %>%
mutate(
date = make_date(year, month, 1),
date = yearmonth(date)
) %>%
as_tsibble(index = date)
#---Import Lightcast Job Postings
lightcast <- read_csv("/Users/cfarha/Library/CloudStorage/OneDrive-Personal/Documents/BC/Predictive Analytics/lightcast.csv") %>%
mutate(date = make_date(year, month, day)) %>%
as_tsibble(index = date) %>%
slice_head(n = nrow(.) - 2) %>% # Drop October data
fill_gaps() %>% # Fill missing daily gaps
mutate(lightcast = zoo::na.approx(lightcast, na.rm = FALSE)) %>% # Interpolate
index_by(month = yearmonth(date)) %>% # Aggregate by month
summarise(lightcast = mean(lightcast, na.rm = TRUE)) %>%
rename(date = month) %>% # Rename back to 'date'
as_tsibble(index = date)
# ---Combine all series into one tsibble ---
data <- nfp %>%
right_join(predictors, by = "date") %>%
right_join(lightcast, by = "date") %>%
select(-month, -year)
I split the train dataset from January 2020 to December 2023 (about 70% of the data) to split at a full-year boundary, preserve seasonal cycles, and reflect realistic forecasting conditions.
train <- data %>% filter(date <= yearmonth("2023 Dec"))
test <- data %>% filter(date >= yearmonth("2024 Jan"))
library(forecast)
# Compute Box-Cox lambda on training set
vars_to_bc <- c("nfp","adp","small_bus","warn","indeed","lightcast")
#Compute Guerrero lambda for each variable
lambda_table <- sapply(vars_to_bc, function(v) {
train %>%
select(all_of(v)) %>%
features(features = guerrero) %>%
pull(lambda_guerrero)
})
lambda_table <- data.frame(
variable = names(lambda_table),
lambda_guerrero = as.numeric(lambda_table)
)
# Display with knitr::kable
knitr::kable(lambda_table, digits = 3,
caption = "Guerrero Box-Cox Lambda for Selected Variables")
# Convert to named vector for easy lookup
lambda_list <- setNames(lambda_table$lambda_guerrero, lambda_table$variable)
# List of variables to transform
vars_to_bc <- lambda_table$variable
# Full training pipeline
train <- train %>%
# Step 1: Box–Cox transform using Guerrero lambdas dynamically
mutate(across(all_of(vars_to_bc),
~{
shift <- ifelse(min(.x, na.rm = TRUE) <= 0, abs(min(.x, na.rm = TRUE)) + 1, 0)
BoxCox(.x + shift, lambda_list[cur_column()])
},
.names = "{.col}_bc"))
# Store Transformations and shifts
bc_info <- data.frame(
variable = vars_to_bc,
lambda = lambda_list,
shift = sapply(vars_to_bc, function(v) {
shift <- ifelse(min(train[[v]], na.rm = TRUE) <= 0, abs(min(train[[v]], na.rm = TRUE)) + 1, 0)
shift
})
)
# Apply transformations to full set
data <- data %>%
mutate(across(all_of(vars_to_bc),
~ BoxCox(.x + bc_info$shift[bc_info$variable == cur_column()],
bc_info$lambda[bc_info$variable == cur_column()]),
.names = "{.col}_bc"))
#Transform/difference data as appropriate
data <- data %>%
mutate(
dnfp = nfp_bc - lag(nfp_bc),
dadp = adp_bc - lag(adp_bc),
dsmall = small_bus_bc - lag(small_bus_bc),
mfg_net = (ism_mfg - 50)*2, #Makes variable net percentage reporting increase
serv_net = (ism_serv - 50)*2,
dwarn = warn_bc - lag(warn_bc),
dindeed = indeed_bc - lag(indeed_bc),
dlightcast = lightcast_bc - lag(lightcast_bc)
)
#Replace train and test splits with differenced data
train <- data %>% filter(date <= yearmonth("2023 Dec"))
#Start test data at Dec 2023 to have the differenced data for Jan 2024
test <- data %>% filter(date >= yearmonth("2023 Jan"))
library(feasts)
# Stationarity Tests on Training Data
# List of differenced variable names
dvars <- c("dnfp", "dadp", "dsmall", "mfg_net",
"serv_net", "dwarn", "dindeed", "dlightcast")
# Loop over variables and run unit root tests
stationarity_tests <- map_dfr(dvars, function(var) {
train %>%
features(!!sym(var), list(
kpss = unitroot_kpss,
ndiffs = unitroot_ndiffs,
pp = unitroot_pp
)) %>%
mutate(variable = var)
})
# Reorder columns
stationarity_tests <- stationarity_tests %>%
select(variable, everything())
# Display nicely
kable(stationarity_tests, caption = "Stationarity Tests for Differenced Variables")
#---Plots
#--- Time Series Plot of Transformed Change in Payrolls ---
#Change in payrolls display
train |>
gg_tsdisplay(dnfp, plot_type='partial')
#get rid of scientific notation in tables
options(scipen = 999)
#Create Covid dummy for outliers
train <- train %>%
mutate(
covid = ifelse(date >= as.Date("2020-03-01") & date <= as.Date("2020-04-30"), 1, 0), #Covid decline
recovery = ifelse(date >= as.Date("2020-05-01") & date <= as.Date("2021-12-31"), 1, 0), #Strong recovery
month = month(date, label = TRUE, abbr = TRUE)
) %>%
as_tsibble(index = date)
test <- test %>%
mutate(
covid = ifelse(date >= as.Date("2020-03-01") & date <= as.Date("2020-04-30"), 1, 0), #Covid decline
recovery = ifelse(date >= as.Date("2020-05-01") & date <= as.Date("2021-12-31"), 1, 0), #Strong recovery
month = month(date, label = TRUE, abbr = TRUE)
) %>%
as_tsibble(index = date)
#Fit ETS and ARIMA models
fit <- train %>% model(
ets_auto = ETS(nfp_bc),
ets_aada = ETS(nfp_bc ~ error("A") + trend("Ad") + season("A")),
arima_covid = ARIMA(nfp_bc ~ month + covid + recovery, stepwise = FALSE, approximation = FALSE),
arima_auto = ARIMA(nfp_bc ~ month, stepwise = FALSE, approximation = FALSE)
)
#Create train df without January 2020 for differenced data
train_diff <- train %>%
filter(date != yearmonth("2020 Jan"))
#Fit time series models
ts_fit <- train_diff %>% model(
season_lag_covid = TSLM(dnfp ~ season()+lag(dnfp)+covid+recovery),
adp = TSLM(dnfp ~ season()+lag(dnfp)+covid+recovery+dadp),
small_bus = TSLM(dnfp ~ season()+lag(dnfp)+covid+recovery+dadp+dsmall),
ism = TSLM(dnfp ~ season()+lag(dnfp)+covid+recovery+dadp+dsmall+mfg_net+serv_net),
warn = TSLM(dnfp ~ season()+lag(dnfp)+covid+recovery+dadp+dsmall+mfg_net+serv_net+dwarn),
indeed = TSLM(dnfp ~ season()+lag(dnfp)+covid+recovery+dadp+dsmall+mfg_net+serv_net+dwarn+dindeed),
lightcast = TSLM(dnfp ~ season()+lag(dnfp)+covid+recovery+dadp+dsmall+mfg_net+serv_net+dwarn+dindeed+dlightcast)
)
#Report parameters for auto ARIMA and ETS models
report(fit[,"ets_auto"])
report(fit[,"arima_auto"])
report(fit[,"arima_covid"])
#Examine AICc for time series models
aic_table <- ts_fit %>%
glance() %>%
select(.model, AICc) %>%
arrange(AICc)
knitr::kable(aic_table, digits = 2, caption = "AICc Comparison of TSLM Models")
#Test model accuracy on test data
# Filter test to start AFTER training ends (Jan 2024)
test_for_accuracy <- test %>%
filter(date >= yearmonth("2024 Jan"))
# Forecast on the test period
fc <- fit %>% forecast(new_data = test_for_accuracy)
print(fabletools::accuracy(fc, data))
# Same for ts_fit
ts_fc <- ts_fit %>% forecast(new_data = test_for_accuracy)
print(fabletools::accuracy(ts_fc, data))
#Report Results for full model and ADP model
# Report ADP model
report(ts_fit[, "adp"])
# Report Lightcast model
report(ts_fit[, "lightcast"])
# Step 1: Get original untransformed historical data
nfp_hist <- fredr(series_id = "PAYNSA") %>%
select(date, nfp = value) %>%
mutate(date = yearmonth(date)) %>%
filter(date >= yearmonth("2020 Jan")) %>%
as_tsibble(index = date)
# Step 2: Split for plotting
nfp_test <- nfp_hist %>% filter(date >= yearmonth("2024 Feb"))
nfp_jan2024 <- nfp_hist %>% filter(date == yearmonth("2024 Jan"))
# Step 3: Filter test data to start AFTER training ends
test_forecast <- test %>%
filter(date >= yearmonth("2024 Jan"))
# Step 4: Extract forecasts
# ETS forecast
fc_ets <- fit[,"ets_aada"] %>%
forecast(new_data = test_forecast) %>%
as_tibble() %>%
filter(date >= yearmonth("2024 Feb")) %>% # start Feb 2024
select(date, fc_bc = .mean) %>%
mutate(.model = "ets_aada")
# ARIMA forecast
fc_arima <- fit[,"arima_auto"] %>%
forecast(new_data = test_forecast) %>%
as_tibble() %>%
filter(date >= yearmonth("2024 Feb")) %>% # start Feb 2024
select(date, fc_bc = .mean) %>%
mutate(.model = "arima_auto")
# ADP forecast (in differenced BC space)
fc_adp <- ts_fc %>%
filter(.model == "adp", date >= yearmonth("2024 Feb")) %>%
as_tibble() %>%
select(date, fc_dnfp = .mean) %>%
mutate(.model = "adp")
# === Back-transform forecasts ===
# Extract lambda and shift for nfp from bc_info
lambda_nfp <- bc_info$lambda[bc_info$variable == "nfp"]
shift_nfp <- bc_info$shift[bc_info$variable == "nfp"]
# Get the last actual NFP value (January 2024)
last_nfp <- nfp_hist %>%
filter(date == yearmonth("2024 Jan")) %>%
pull(nfp)
# Define backtransform function for differenced forecasts
backtransform_forecast <- function(fc_diff, last_actual, lambda, shift) {
# fc_diff: differenced forecasts in Box-Cox space
# last_actual: last actual value in original scale
# lambda: Box-Cox lambda
# shift: shift applied before Box-Cox
# Convert last actual to Box-Cox space
last_bc <- BoxCox(last_actual + shift, lambda)
# Cumulatively add differences to get levels in Box-Cox space
fc_bc_levels <- cumsum(c(last_bc, fc_diff))[-1]
# Back-transform to original scale
fc_original <- InvBoxCox(fc_bc_levels, lambda) - shift
return(fc_original)
}
# ETS & ARIMA are in Box–Cox (level)
fc_ets$nfp_forecast <- InvBoxCox(fc_ets$fc_bc, lambda_nfp) - shift_nfp
fc_arima$nfp_forecast <- InvBoxCox(fc_arima$fc_bc, lambda_nfp) - shift_nfp
# ADP is differenced Box–Cox, needs cumulative transformation
fc_adp$nfp_forecast <- backtransform_forecast(
fc_adp$fc_dnfp,
last_nfp,
lambda_nfp,
shift_nfp
)
# === Combine base forecasts ===
fc_combined <- bind_rows(fc_ets, fc_arima, fc_adp) %>%
select(date, .model, nfp_forecast)
# === Create ensemble forecasts ===
fc_wide <- fc_combined %>%
pivot_wider(names_from = .model, values_from = nfp_forecast)
fc_ens <- fc_wide %>%
mutate(
ensemble_all = (arima_auto + ets_aada + adp) / 3,
ensemble_ets_adp = (ets_aada + adp) / 2
) %>%
pivot_longer(
cols = c(arima_auto, ets_aada, adp, ensemble_all, ensemble_ets_adp),
names_to = ".model",
values_to = "nfp_forecast"
)
# === Final dataset for plotting ===
fc_all <- fc_ens
# --- Plot 1: ETS + ARIMA + ADP + Ensemble of all three ---
plot_all <- ggplot() +
# Historical data
geom_line(data = nfp_hist %>% filter(date <= yearmonth("2023 Dec")),
aes(x = date, y = nfp),
color = "black", linewidth = 1) +
geom_line(data = nfp_hist %>% filter(date >= yearmonth("2023 Dec") & date <= yearmonth("2024 Jan")),
aes(x = date, y = nfp),
color = "black", linewidth = 1, linetype = "dotted") +
geom_point(data = nfp_jan2024,
aes(x = date, y = nfp),
color = "black", size = 3) +
geom_line(data = nfp_test,
aes(x = date, y = nfp),
color = "black", linewidth = 1, linetype = "dashed") +
# Forecasts
geom_line(
data = fc_all %>%
filter(.model %in% c("ets_aada", "arima_auto", "adp", "ensemble_all")),
aes(x = date, y = nfp_forecast, color = .model),
linewidth = 1.2
) +
labs(
title = "Forecast Comparison: ETS, ARIMA, ADP, and Ensemble (All Three)",
subtitle = "Solid black = training | Dotted = Jan 2024 (actual) | Dashed = test | Colored = forecasts",
y = "Nonfarm Payrolls (thousands)",
x = "Date",
color = "Model"
) +
scale_color_manual(
values = c(
"ets_aada" = "#E67E22",
"arima_auto" = "#9B59B6",
"adp" = "#3498DB",
"ensemble_all" = "#E74C3C" # red-orange for ensemble
),
labels = c(
"ets_aada" = "ETS(A,Ad,A)",
"arima_auto" = "ARIMA (auto)",
"adp" = "ADP TSLM",
"ensemble_all" = "Ensemble (ARIMA + ETS + ADP)"
)
) +
theme_minimal() +
theme(legend.position = "bottom")
# --- Plot 2: ETS + ADP + Ensemble of those two ---
plot_ets_adp <- ggplot() +
# Historical data
geom_line(data = nfp_hist %>% filter(date <= yearmonth("2023 Dec")),
aes(x = date, y = nfp),
color = "black", linewidth = 1) +
geom_line(data = nfp_hist %>% filter(date >= yearmonth("2023 Dec") & date <= yearmonth("2024 Jan")),
aes(x = date, y = nfp),
color = "black", linewidth = 1, linetype = "dotted") +
geom_point(data = nfp_jan2024,
aes(x = date, y = nfp),
color = "black", size = 3) +
geom_line(data = nfp_test,
aes(x = date, y = nfp),
color = "black", linewidth = 1, linetype = "dashed") +
# Forecasts
geom_line(
data = fc_all %>%
filter(.model %in% c("ets_aada", "adp", "ensemble_ets_adp")),
aes(x = date, y = nfp_forecast, color = .model),
linewidth = 1.2
) +
labs(
title = "Forecast Comparison: ETS, ADP, and Ensemble (ETS + ADP)",
subtitle = "Solid black = training | Dotted = Jan 2024 (actual) | Dashed = test | Colored = forecasts",
y = "Nonfarm Payrolls (thousands)",
x = "Date",
color = "Model"
) +
scale_color_manual(
values = c(
"ets_aada" = "#E67E22",
"adp" = "#3498DB",
"ensemble_ets_adp" = "#27AE60" # dark green for ensemble
),
labels = c(
"ets_aada" = "ETS(A,Ad,A)",
"adp" = "ADP TSLM",
"ensemble_ets_adp" = "Ensemble (ETS + ADP)"
)
) +
theme_minimal() +
theme(legend.position = "bottom")
# Display both plots
plot_all
plot_ets_adp
library(dplyr)
library(knitr)
#Test model accuracy on test data
# Filter test to start AFTER training ends (Jan 2024)
test_for_accuracy <- test %>%
filter(date >= yearmonth("2024 Jan"))
# Forecast on the test period
fc <- fit %>% forecast(new_data = test_for_accuracy)
ts_fc <- ts_fit %>% forecast(new_data = test_for_accuracy)
# === Compute Ensemble Accuracy in ORIGINAL SCALE ===
# Extract lambda and shift for nfp
lambda_nfp <- bc_info$lambda[bc_info$variable == "nfp"]
shift_nfp <- bc_info$shift[bc_info$variable == "nfp"]
# Extract forecasts as tibbles (only Feb 2024+) and back-transform
fc_tbl <- fc %>% as_tibble() %>%
filter(date >= yearmonth("2024 Feb")) %>%
select(date, .model, .mean) %>%
mutate(.mean = InvBoxCox(.mean, lambda_nfp) - shift_nfp) # Back-transform
ts_tbl <- ts_fc %>% as_tibble() %>%
filter(date >= yearmonth("2024 Feb")) %>%
select(date, .model, .mean) %>%
mutate(.mean = InvBoxCox(.mean, lambda_nfp) - shift_nfp) # Back-transform
# Get actual values in ORIGINAL SCALE from nfp_hist
actuals_original <- nfp_hist %>%
filter(date >= yearmonth("2024 Feb")) %>%
select(date, nfp)
# Ensemble 1: ARIMA + ETS + ADP
ensemble_all <- bind_rows(
fc_tbl %>% filter(.model %in% c("arima_auto", "ets_aada")),
ts_tbl %>% filter(.model == "adp")
) %>%
group_by(date) %>%
summarise(.mean = mean(.mean, na.rm = TRUE), .groups = "drop") %>%
left_join(actuals_original, by = "date") %>%
filter(!is.na(nfp)) %>%
summarise(
.model = "ensemble_all",
RMSE = sqrt(mean((nfp - .mean)^2, na.rm = TRUE)),
MAE = mean(abs(nfp - .mean), na.rm = TRUE),
MAPE = mean(abs((nfp - .mean) / nfp), na.rm = TRUE) * 100
)
# Ensemble 2: ETS + ADP
ensemble_ets_adp <- bind_rows(
fc_tbl %>% filter(.model == "ets_aada"),
ts_tbl %>% filter(.model == "adp")
) %>%
group_by(date) %>%
summarise(.mean = mean(.mean, na.rm = TRUE), .groups = "drop") %>%
left_join(actuals_original, by = "date") %>%
filter(!is.na(nfp)) %>%
summarise(
.model = "ensemble_ets_adp",
RMSE = sqrt(mean((nfp - .mean)^2, na.rm = TRUE)),
MAE = mean(abs(nfp - .mean), na.rm = TRUE),
MAPE = mean(abs((nfp - .mean) / nfp), na.rm = TRUE) * 100
)
# Get accuracy for individual models in original scale
fc_accuracy_original <- fc_tbl %>%
filter(.model %in% c("arima_auto", "ets_aada")) %>%
left_join(actuals_original, by = "date") %>%
group_by(.model) %>%
summarise(
RMSE = sqrt(mean((nfp - .mean)^2, na.rm = TRUE)),
MAE = mean(abs(nfp - .mean), na.rm = TRUE),
MAPE = mean(abs((nfp - .mean) / nfp), na.rm = TRUE) * 100
)
ts_accuracy_original <- ts_tbl %>%
filter(.model == "adp") %>%
left_join(actuals_original, by = "date") %>%
group_by(.model) %>%
summarise(
RMSE = sqrt(mean((nfp - .mean)^2, na.rm = TRUE)),
MAE = mean(abs(nfp - .mean), na.rm = TRUE),
MAPE = mean(abs((nfp - .mean) / nfp), na.rm = TRUE) * 100
)
# === Table 1: ARIMA, ETS, ADP, Ensemble of All Three ===
metrics_plot1 <- bind_rows(
fc_accuracy_original %>% filter(.model %in% c("arima_auto", "ets_aada")),
ts_accuracy_original %>% filter(.model == "adp"),
ensemble_all
) %>%
arrange(RMSE)
kable(metrics_plot1,
caption = "Model Accuracy (ARIMA, ETS, ADP, Ensemble of All Three)",
digits = 2,
format.args = list(big.mark = ","))
# === Table 2: ETS, ADP, Ensemble of ETS & ADP ===
metrics_plot2 <- bind_rows(
fc_accuracy_original %>% filter(.model == "ets_aada"),
ts_accuracy_original %>% filter(.model == "adp"),
ensemble_ets_adp
) %>%
arrange(RMSE)
kable(metrics_plot2,
caption = "Model Accuracy (ETS, ADP, Ensemble of ETS & ADP)",
digits = 2,
format.args = list(big.mark = ","))
# --- Compute the difference (Sep 2025 - Aug 2025) from fc_combined ---
sep_2025_diff <- fc_all %>%
filter(date %in% yearmonth(c("2025 Aug", "2025 Sep"))) %>%
select(date, .model, nfp_forecast) %>%
tidyr::pivot_wider(names_from = date, values_from = nfp_forecast) %>%
mutate(diff = `2025 Sep` - `2025 Aug`)
# --- Create the table ---
sep_forecast_table <- data.frame(
Source = c("ETS(A,Ad,A)", "ADP TSLM", "Ensemble (ETS + ADP)",
"ADP estimate", "Revelio", "ADP and Revelio Avg.", "Carlyle"),
`Sep_2025_Forecast` = c(
sep_2025_diff %>% filter(.model == "ets_aada") %>% pull(diff),
sep_2025_diff %>% filter(.model == "adp") %>% pull(diff),
sep_2025_diff %>% filter(.model == "ensemble_ets_adp") %>% pull(diff),
-32, # ADP estimate
60.1, # Revelio value
14.05, #ADP and Revelio Avg.
17 # Carlyle value
)
)
# --- Create kable for Sep 2025 differences ---
sep_forecast_kable <- sep_forecast_table %>%
kable(
caption = "Sep 2025 Employment Growth Nowcasts",
col.names = c("Source", "Thousands"),
digits = 0
)
sep_forecast_kable