Predictive Analytics Midterm

Chase Farha

Predictive Analytics

Step 1: Import Monthly Data from FRED and CSV Files

Outcome Variable: Total nonfarm payroll employment (FRED series PAYNSA, not seasonally adjusted, in thousands).

Predictor Variables: ADP private employment, small business employment, ISM manufacturing and services indices, WARN notices, Indeed job postings, and Lightcast job postings.

I import data from January 2020 through August 2025 for each variable.

# Load required packages
library(fredr)
library(tidyverse)
library(tsibble)
library(lubridate)
library(fpp3)
library(zoo)
library(knitr)

# ---Import monthly Nonfarm Payrolls (PAYNSA) ---
nfp <- fredr(series_id = "PAYNSA") %>%
  select(date, nfp = value) %>%
  mutate(date = yearmonth(date)) %>%
  filter(date >= yearmonth("2020 Jan")) %>%
  as_tsibble(index = date)

#---Import predictor variable data from outside sources, already compiled into one CSV
predictors <- read_csv("/Users/cfarha/Library/CloudStorage/OneDrive-Personal/Documents/BC/Predictive Analytics/private_employment.csv") %>%
  mutate(
    date = make_date(year, month, 1),
    date = yearmonth(date)
  ) %>%
  as_tsibble(index = date)

#---Import Lightcast Job Postings
lightcast <- read_csv("/Users/cfarha/Library/CloudStorage/OneDrive-Personal/Documents/BC/Predictive Analytics/lightcast.csv") %>%
  mutate(date = make_date(year, month, day)) %>%
  as_tsibble(index = date) %>%
  slice_head(n = nrow(.) - 2) %>%                       # Drop October data
  fill_gaps() %>%                                   # Fill missing daily gaps
  mutate(lightcast = zoo::na.approx(lightcast, na.rm = FALSE)) %>%  # Interpolate
  index_by(month = yearmonth(date)) %>%              # Aggregate by month
  summarise(lightcast = mean(lightcast, na.rm = TRUE)) %>%
  rename(date = month) %>%                           # Rename back to 'date'
  as_tsibble(index = date)
           
# ---Combine all series into one tsibble ---
data <- nfp %>%
  right_join(predictors, by = "date") %>%
  right_join(lightcast, by = "date") %>%
  select(-month, -year)
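
As a quick optional check (not one of the required steps), the combined tsibble can be summarised to confirm monthly coverage and to see whether the outcome is missing for any recent months:

# Optional check: monthly coverage and any months where the outcome is still missing
data %>%
  as_tibble() %>%
  summarise(
    n_months = n(),
    first_month = min(date),
    last_month = max(date),
    missing_nfp = sum(is.na(nfp))
  )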

Step 2: Split Train (70%) and Test (30%)

I use January 2020 through December 2023 as the training set (about 70% of the data); splitting at a full-year boundary preserves complete seasonal cycles and reflects realistic forecasting conditions.

train <- data %>% filter(date <= yearmonth("2023 Dec"))
test  <- data %>% filter(date >= yearmonth("2024 Jan"))
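
A quick optional check confirms that the split is roughly 70/30:

# Optional: confirm the split proportions
nrow(train) / nrow(data)
nrow(test) / nrow(data)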

Step 3: Preprocess Data on the Training Set

library(forecast)

# Compute Box-Cox lambda on training set
vars_to_bc <- c("nfp","adp","small_bus","warn","indeed","lightcast")

#Compute Guerrero lambda for each variable
lambda_table <- sapply(vars_to_bc, function(v) {
  train %>%
    features(!!sym(v), features = guerrero) %>%
    pull(lambda_guerrero)
})

lambda_table <- data.frame(
  variable = names(lambda_table),
  lambda_guerrero = as.numeric(lambda_table)
)

# Display with knitr::kable
knitr::kable(lambda_table, digits = 3,
             caption = "Guerrero Box-Cox Lambda for Selected Variables")

# Convert to named vector for easy lookup
lambda_list <- setNames(lambda_table$lambda_guerrero, lambda_table$variable)

# List of variables to transform
vars_to_bc <- lambda_table$variable

# Full training pipeline
train <- train %>%
  # Step 1: Box–Cox transform using Guerrero lambdas dynamically
  mutate(across(all_of(vars_to_bc), 
                ~{
                  shift <- ifelse(min(.x, na.rm = TRUE) <= 0, abs(min(.x, na.rm = TRUE)) + 1, 0)
                  BoxCox(.x + shift, lambda_list[cur_column()])
                },
                .names = "{.col}_bc"))
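
For reference, the transformation applied above is the standard Box–Cox transform with a pre-applied shift $s$ (the shift computed in the code, used only when a series contains non-positive values):

$$
w_t =
\begin{cases}
\ln(y_t + s), & \lambda = 0 \\[4pt]
\dfrac{(y_t + s)^{\lambda} - 1}{\lambda}, & \lambda \neq 0
\end{cases}
$$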

# Store Transformations and shifts
bc_info <- data.frame(
  variable = vars_to_bc,
  lambda   = lambda_list,
  shift    = sapply(vars_to_bc, function(v) {
    shift <- ifelse(min(train[[v]], na.rm = TRUE) <= 0, abs(min(train[[v]], na.rm = TRUE)) + 1, 0)
    shift
  })
)

# Apply transformations to full set
data <- data %>%
  mutate(across(all_of(vars_to_bc),
                ~ BoxCox(.x + bc_info$shift[bc_info$variable == cur_column()],
                         bc_info$lambda[bc_info$variable == cur_column()]),
                .names = "{.col}_bc"))
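
Because the shifts were estimated on the training window only, a quick optional check (a sketch using the objects defined above) confirms that every shifted series stays strictly positive over the full sample, so the transform remains well defined on the test period:

# Optional check: each shifted series should be strictly positive over the full sample
sapply(bc_info$variable, function(v) {
  min(data[[v]] + bc_info$shift[bc_info$variable == v], na.rm = TRUE) > 0
})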

Step 4: Difference the Transformed Data

#Transform/difference data as appropriate
data <- data %>%
  mutate(
    dnfp = nfp_bc - lag(nfp_bc),
    dadp = adp_bc - lag(adp_bc),
    dsmall = small_bus_bc - lag(small_bus_bc),
    mfg_net = (ism_mfg - 50)*2, #Convert ISM index to net percent of respondents reporting an increase
    serv_net = (ism_serv - 50)*2,
    dwarn = warn_bc - lag(warn_bc),
    dindeed = indeed_bc - lag(indeed_bc),
    dlightcast = lightcast_bc - lag(lightcast_bc)
  )

#Replace train and test splits with the transformed and differenced data
train <- data %>% filter(date <= yearmonth("2023 Dec"))
#Start the test set at Dec 2023 so the differenced value for Jan 2024 is available
test  <- data %>% filter(date >= yearmonth("2023 Dec"))

Step 5: Run Stationarity Tests and Plot ACF

library(feasts)

# Stationarity Tests on Training Data
# List of differenced variable names
dvars <- c("dnfp", "dadp", "dsmall", "mfg_net", 
          "serv_net", "dwarn", "dindeed", "dlightcast")

# Loop over variables and run unit root tests
stationarity_tests <- map_dfr(dvars, function(var) {
  train %>%
    features(!!sym(var), list(
      kpss = unitroot_kpss,
      ndiffs = unitroot_ndiffs,
      pp = unitroot_pp
    )) %>%
    mutate(variable = var)
})

# Reorder columns
stationarity_tests <- stationarity_tests %>%
  select(variable, everything())

# Display nicely
kable(stationarity_tests, caption = "Stationarity Tests for Differenced Variables")

#---Plots---
#Time series display of the transformed change in payrolls (series, ACF, and PACF)
train |>
  gg_tsdisplay(dnfp, plot_type = 'partial')
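
The same display can be produced for any of the differenced predictors; for example, the differenced ADP series:

#Same diagnostic display for the differenced ADP series
train |>
  gg_tsdisplay(dadp, plot_type = 'partial')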

Step 6: Run Models

#get rid of scientific notation in tables
options(scipen = 999)

#Create Covid dummy for outliers
train <- train %>%
  mutate(
    covid = ifelse(date >= as.Date("2020-03-01") & date <= as.Date("2020-04-30"), 1, 0), #Covid decline
    recovery = ifelse(date >= as.Date("2020-05-01") & date <= as.Date("2021-12-31"), 1, 0), #Strong recovery
    month = month(date, label = TRUE, abbr = TRUE)
  ) %>%
  as_tsibble(index = date)

test <- test %>%
  mutate(
    covid = ifelse(date >= as.Date("2020-03-01") & date <= as.Date("2020-04-30"), 1, 0), #Covid decline
    recovery = ifelse(date >= as.Date("2020-05-01") & date <= as.Date("2021-12-31"), 1, 0), #Strong recovery
    month = month(date, label = TRUE, abbr = TRUE)
  ) %>%
  as_tsibble(index = date)

#Fit ETS and ARIMA models
fit <- train %>% model(
  ets_auto  = ETS(nfp_bc),
  ets_aada = ETS(nfp_bc ~ error("A") + trend("Ad") + season("A")),
  arima_covid = ARIMA(nfp_bc ~ month + covid + recovery, stepwise = FALSE, approximation = FALSE),
  arima_auto = ARIMA(nfp_bc ~ month, stepwise = FALSE, approximation = FALSE)
)

#Create train df without January 2020 for differenced data
train_diff <- train %>%
  filter(date != yearmonth("2020 Jan"))

#Fit time series models 
ts_fit <- train_diff %>% model(
  season_lag_covid = TSLM(dnfp ~ season()+lag(dnfp)+covid+recovery),
  adp = TSLM(dnfp ~ season()+lag(dnfp)+covid+recovery+dadp),
  small_bus = TSLM(dnfp ~ season()+lag(dnfp)+covid+recovery+dadp+dsmall),
  ism = TSLM(dnfp ~ season()+lag(dnfp)+covid+recovery+dadp+dsmall+mfg_net+serv_net),
  warn = TSLM(dnfp ~ season()+lag(dnfp)+covid+recovery+dadp+dsmall+mfg_net+serv_net+dwarn),
  indeed = TSLM(dnfp ~ season()+lag(dnfp)+covid+recovery+dadp+dsmall+mfg_net+serv_net+dwarn+dindeed),
  lightcast = TSLM(dnfp ~ season()+lag(dnfp)+covid+recovery+dadp+dsmall+mfg_net+serv_net+dwarn+dindeed+dlightcast)
)

#Report parameters for the ETS and ARIMA models
report(fit[,"ets_auto"])
report(fit[,"arima_auto"])
report(fit[,"arima_covid"])
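
As an optional diagnostic (not part of the original model search), the residuals of the ARIMA model with COVID dummies can be inspected with feasts; a sketch:

#Optional residual diagnostics for the ARIMA model with COVID dummies
fit %>% select(arima_covid) %>% gg_tsresiduals()
fit %>% select(arima_covid) %>% augment() %>%
  features(.innov, ljung_box, lag = 12)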

#Examine AICc for time series models
aic_table <- ts_fit %>%
  glance() %>%
  select(.model, AICc) %>%
  arrange(AICc)

knitr::kable(aic_table, digits = 2, caption = "AICc Comparison of TSLM Models")
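
Because the TSLM specifications are nested, it can also help to view adjusted R-squared alongside AICc; a sketch using the same glance() output:

#Optional: adjusted R-squared and AICc for the nested TSLM models
ts_fit %>%
  glance() %>%
  select(.model, adj_r_squared, AICc) %>%
  arrange(AICc) %>%
  knitr::kable(digits = 3, caption = "Adjusted R-squared and AICc for TSLM Models")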


#Test model accuracy on test data
# Filter test to start AFTER training ends (Jan 2024)
test_for_accuracy <- test %>%
  filter(date >= yearmonth("2024 Jan"))

# Forecast on the test period
fc <- fit %>% forecast(new_data = test_for_accuracy)
print(fabletools::accuracy(fc, data))

# Same for ts_fit
ts_fc <- ts_fit %>% forecast(new_data = test_for_accuracy)
print(fabletools::accuracy(ts_fc, data))

#Report Results for full model and ADP model
# Report ADP model
report(ts_fit[, "adp"])
# Report Lightcast model
report(ts_fit[, "lightcast"])

Step 7: Back-Transform Data, Create Ensemble Models, and Plot Results

# Step 1: Get original untransformed historical data
nfp_hist <- fredr(series_id = "PAYNSA") %>%
  select(date, nfp = value) %>%
  mutate(date = yearmonth(date)) %>%
  filter(date >= yearmonth("2020 Jan")) %>%
  as_tsibble(index = date)

# Step 2: Split for plotting
nfp_test <- nfp_hist %>% filter(date >= yearmonth("2024 Feb"))
nfp_jan2024 <- nfp_hist %>% filter(date == yearmonth("2024 Jan"))

# Step 3: Filter test data to start AFTER training ends
test_forecast <- test %>%
  filter(date >= yearmonth("2024 Jan"))

# Step 4: Extract forecasts
# ETS forecast
fc_ets <- fit[,"ets_aada"] %>%
  forecast(new_data = test_forecast) %>%
  as_tibble() %>%
  filter(date >= yearmonth("2024 Feb")) %>%  # start Feb 2024
  select(date, fc_bc = .mean) %>%
  mutate(.model = "ets_aada")

# ARIMA forecast
fc_arima <- fit[,"arima_auto"] %>%
  forecast(new_data = test_forecast) %>%
  as_tibble() %>%
  filter(date >= yearmonth("2024 Feb")) %>%  # start Feb 2024
  select(date, fc_bc = .mean) %>%
  mutate(.model = "arima_auto")

# ADP forecast (in differenced BC space)
fc_adp <- ts_fc %>%
  filter(.model == "adp", date >= yearmonth("2024 Feb")) %>%
  as_tibble() %>%
  select(date, fc_dnfp = .mean) %>%
  mutate(.model = "adp")

# === Back-transform forecasts ===
# Extract lambda and shift for nfp from bc_info
lambda_nfp <- bc_info$lambda[bc_info$variable == "nfp"]
shift_nfp <- bc_info$shift[bc_info$variable == "nfp"]

# Get the last actual NFP value (January 2024)
last_nfp <- nfp_hist %>% 
  filter(date == yearmonth("2024 Jan")) %>% 
  pull(nfp)

# Define backtransform function for differenced forecasts
backtransform_forecast <- function(fc_diff, last_actual, lambda, shift) {
  # fc_diff: differenced forecasts in Box-Cox space
  # last_actual: last actual value in original scale
  # lambda: Box-Cox lambda
  # shift: shift applied before Box-Cox
  
  # Convert last actual to Box-Cox space
  last_bc <- BoxCox(last_actual + shift, lambda)
  
  # Cumulatively add differences to get levels in Box-Cox space
  fc_bc_levels <- cumsum(c(last_bc, fc_diff))[-1]
  
  # Back-transform to original scale
  fc_original <- InvBoxCox(fc_bc_levels, lambda) - shift
  
  return(fc_original)
}
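
A quick self-check of this helper with made-up numbers (purely illustrative): transforming a short series, differencing it in Box–Cox space, and passing the differences back through backtransform_forecast should reproduce the later levels exactly.

# Illustrative self-check with made-up levels (thousands); should return x[2:4]
x <- c(150000, 150200, 150500, 150900)
x_bc <- BoxCox(x + shift_nfp, lambda_nfp)                        # to Box-Cox space
backtransform_forecast(diff(x_bc), x[1], lambda_nfp, shift_nfp)  # back to levels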

# ETS & ARIMA are in Box–Cox (level)
fc_ets$nfp_forecast <- InvBoxCox(fc_ets$fc_bc, lambda_nfp) - shift_nfp
fc_arima$nfp_forecast <- InvBoxCox(fc_arima$fc_bc, lambda_nfp) - shift_nfp

# ADP is differenced Box–Cox, needs cumulative transformation
fc_adp$nfp_forecast <- backtransform_forecast(
  fc_adp$fc_dnfp,
  last_nfp,
  lambda_nfp,
  shift_nfp
)

# === Combine base forecasts ===
fc_combined <- bind_rows(fc_ets, fc_arima, fc_adp) %>%
  select(date, .model, nfp_forecast)

# === Create ensemble forecasts ===
fc_wide <- fc_combined %>%
  pivot_wider(names_from = .model, values_from = nfp_forecast)

fc_ens <- fc_wide %>%
  mutate(
    ensemble_all = (arima_auto + ets_aada + adp) / 3,
    ensemble_ets_adp = (ets_aada + adp) / 2
  ) %>%
  pivot_longer(
    cols = c(arima_auto, ets_aada, adp, ensemble_all, ensemble_ets_adp),
    names_to = ".model",
    values_to = "nfp_forecast"
  )

# === Final dataset for plotting ===
fc_all <- fc_ens

# --- Plot 1: ETS + ARIMA + ADP + Ensemble of all three ---
plot_all <- ggplot() +
  # Historical data
  geom_line(data = nfp_hist %>% filter(date <= yearmonth("2023 Dec")),
            aes(x = date, y = nfp),
            color = "black", linewidth = 1) +
  geom_line(data = nfp_hist %>% filter(date >= yearmonth("2023 Dec") & date <= yearmonth("2024 Jan")),
            aes(x = date, y = nfp),
            color = "black", linewidth = 1, linetype = "dotted") +
  geom_point(data = nfp_jan2024,
             aes(x = date, y = nfp),
             color = "black", size = 3) +
  geom_line(data = nfp_test,
            aes(x = date, y = nfp),
            color = "black", linewidth = 1, linetype = "dashed") +
  # Forecasts
  geom_line(
    data = fc_all %>%
      filter(.model %in% c("ets_aada", "arima_auto", "adp", "ensemble_all")),
    aes(x = date, y = nfp_forecast, color = .model),
    linewidth = 1.2
  ) +
  labs(
    title = "Forecast Comparison: ETS, ARIMA, ADP, and Ensemble (All Three)",
    subtitle = "Solid black = training | Dotted = Jan 2024 (actual) | Dashed = test | Colored = forecasts",
    y = "Nonfarm Payrolls (thousands)",
    x = "Date",
    color = "Model"
  ) +
  scale_color_manual(
    values = c(
      "ets_aada" = "#E67E22",
      "arima_auto" = "#9B59B6",
      "adp" = "#3498DB",
      "ensemble_all" = "#E74C3C"  # red-orange for ensemble
    ),
    labels = c(
      "ets_aada" = "ETS(A,Ad,A)",
      "arima_auto" = "ARIMA (auto)",
      "adp" = "ADP TSLM",
      "ensemble_all" = "Ensemble (ARIMA + ETS + ADP)"
    )
  ) +
  theme_minimal() +
  theme(legend.position = "bottom")

# --- Plot 2: ETS + ADP + Ensemble of those two ---
plot_ets_adp <- ggplot() +
  # Historical data
  geom_line(data = nfp_hist %>% filter(date <= yearmonth("2023 Dec")),
            aes(x = date, y = nfp),
            color = "black", linewidth = 1) +
  geom_line(data = nfp_hist %>% filter(date >= yearmonth("2023 Dec") & date <= yearmonth("2024 Jan")),
            aes(x = date, y = nfp),
            color = "black", linewidth = 1, linetype = "dotted") +
  geom_point(data = nfp_jan2024,
             aes(x = date, y = nfp),
             color = "black", size = 3) +
  geom_line(data = nfp_test,
            aes(x = date, y = nfp),
            color = "black", linewidth = 1, linetype = "dashed") +
  # Forecasts
  geom_line(
    data = fc_all %>%
      filter(.model %in% c("ets_aada", "adp", "ensemble_ets_adp")),
    aes(x = date, y = nfp_forecast, color = .model),
    linewidth = 1.2
  ) +
  labs(
    title = "Forecast Comparison: ETS, ADP, and Ensemble (ETS + ADP)",
    subtitle = "Solid black = training | Dotted = Jan 2024 (actual) | Dashed = test | Colored = forecasts",
    y = "Nonfarm Payrolls (thousands)",
    x = "Date",
    color = "Model"
  ) +
  scale_color_manual(
    values = c(
      "ets_aada" = "#E67E22",
      "adp" = "#3498DB",
      "ensemble_ets_adp" = "#27AE60"  # dark green for ensemble
    ),
    labels = c(
      "ets_aada" = "ETS(A,Ad,A)",
      "adp" = "ADP TSLM",
      "ensemble_ets_adp" = "Ensemble (ETS + ADP)"
    )
  ) +
  theme_minimal() +
  theme(legend.position = "bottom")

# Display both plots
plot_all

plot_ets_adp
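
If the plots need to be saved outside the rendered document, ggsave can export them (optional; the file names here are illustrative):

# Optional: export the comparison plots (file names are illustrative)
ggsave("forecast_comparison_all.png", plot_all, width = 10, height = 6, dpi = 300)
ggsave("forecast_comparison_ets_adp.png", plot_ets_adp, width = 10, height = 6, dpi = 300)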

Step 8: Produce Error Tables

library(dplyr)
library(knitr)

#Test model accuracy on test data
# Filter test to start AFTER training ends (Jan 2024)
test_for_accuracy <- test %>%
  filter(date >= yearmonth("2024 Jan"))

# Forecast on the test period
fc <- fit %>% forecast(new_data = test_for_accuracy)
ts_fc <- ts_fit %>% forecast(new_data = test_for_accuracy)

# === Compute Ensemble Accuracy in ORIGINAL SCALE ===

# Extract lambda and shift for nfp
lambda_nfp <- bc_info$lambda[bc_info$variable == "nfp"]
shift_nfp <- bc_info$shift[bc_info$variable == "nfp"]

# Extract forecasts as tibbles (only Feb 2024+) and back-transform
fc_tbl <- fc %>% as_tibble() %>% 
  filter(date >= yearmonth("2024 Feb")) %>%
  select(date, .model, .mean) %>%
  mutate(.mean = InvBoxCox(.mean, lambda_nfp) - shift_nfp)  # Back-transform

ts_tbl <- ts_fc %>% as_tibble() %>% 
  filter(date >= yearmonth("2024 Feb")) %>%
  select(date, .model, .mean) %>%
  group_by(.model) %>%
  # Differenced Box-Cox forecasts must be cumulated from the last actual level before inverting
  mutate(.mean = backtransform_forecast(.mean, last_nfp, lambda_nfp, shift_nfp)) %>%
  ungroup()

# Get actual values in ORIGINAL SCALE from nfp_hist
actuals_original <- nfp_hist %>% 
  filter(date >= yearmonth("2024 Feb")) %>%
  select(date, nfp)

# Ensemble 1: ARIMA + ETS + ADP
ensemble_all <- bind_rows(
  fc_tbl %>% filter(.model %in% c("arima_auto", "ets_aada")),
  ts_tbl %>% filter(.model == "adp")
) %>%
  group_by(date) %>%
  summarise(.mean = mean(.mean, na.rm = TRUE), .groups = "drop") %>%
  left_join(actuals_original, by = "date") %>%
  filter(!is.na(nfp)) %>%
  summarise(
    .model = "ensemble_all",
    RMSE = sqrt(mean((nfp - .mean)^2, na.rm = TRUE)),
    MAE = mean(abs(nfp - .mean), na.rm = TRUE),
    MAPE = mean(abs((nfp - .mean) / nfp), na.rm = TRUE) * 100
  )

# Ensemble 2: ETS + ADP
ensemble_ets_adp <- bind_rows(
  fc_tbl %>% filter(.model == "ets_aada"),
  ts_tbl %>% filter(.model == "adp")
) %>%
  group_by(date) %>%
  summarise(.mean = mean(.mean, na.rm = TRUE), .groups = "drop") %>%
  left_join(actuals_original, by = "date") %>%
  filter(!is.na(nfp)) %>%
  summarise(
    .model = "ensemble_ets_adp",
    RMSE = sqrt(mean((nfp - .mean)^2, na.rm = TRUE)),
    MAE = mean(abs(nfp - .mean), na.rm = TRUE),
    MAPE = mean(abs((nfp - .mean) / nfp), na.rm = TRUE) * 100
  )

# Get accuracy for individual models in original scale
fc_accuracy_original <- fc_tbl %>%
  filter(.model %in% c("arima_auto", "ets_aada")) %>%
  left_join(actuals_original, by = "date") %>%
  group_by(.model) %>%
  summarise(
    RMSE = sqrt(mean((nfp - .mean)^2, na.rm = TRUE)),
    MAE = mean(abs(nfp - .mean), na.rm = TRUE),
    MAPE = mean(abs((nfp - .mean) / nfp), na.rm = TRUE) * 100
  )

ts_accuracy_original <- ts_tbl %>%
  filter(.model == "adp") %>%
  left_join(actuals_original, by = "date") %>%
  group_by(.model) %>%
  summarise(
    RMSE = sqrt(mean((nfp - .mean)^2, na.rm = TRUE)),
    MAE = mean(abs(nfp - .mean), na.rm = TRUE),
    MAPE = mean(abs((nfp - .mean) / nfp), na.rm = TRUE) * 100
  )

# === Table 1: ARIMA, ETS, ADP, Ensemble of All Three ===
metrics_plot1 <- bind_rows(
  fc_accuracy_original %>% filter(.model %in% c("arima_auto", "ets_aada")),
  ts_accuracy_original %>% filter(.model == "adp"),
  ensemble_all
) %>%
  arrange(RMSE)

kable(metrics_plot1,
      caption = "Model Accuracy (ARIMA, ETS, ADP, Ensemble of All Three)",
      digits = 2,
      format.args = list(big.mark = ","))

# === Table 2: ETS, ADP, Ensemble of ETS & ADP ===
metrics_plot2 <- bind_rows(
  fc_accuracy_original %>% filter(.model == "ets_aada"),
  ts_accuracy_original %>% filter(.model == "adp"),
  ensemble_ets_adp
) %>%
  arrange(RMSE)

kable(metrics_plot2,
      caption = "Model Accuracy (ETS, ADP, Ensemble of ETS & ADP)",
      digits = 2,
      format.args = list(big.mark = ","))

Step 9: Table of September 2025 Employment Growth Nowcast

# --- Compute the month-over-month change (Sep 2025 - Aug 2025) from fc_all ---
sep_2025_diff <- fc_all %>%
  filter(date %in% yearmonth(c("2025 Aug", "2025 Sep"))) %>%
  select(date, .model, nfp_forecast) %>%
  tidyr::pivot_wider(names_from = date, values_from = nfp_forecast) %>%
  mutate(diff = `2025 Sep` - `2025 Aug`)

# --- Create the table ---
sep_forecast_table <- data.frame(
  Source = c("ETS(A,Ad,A)", "ADP TSLM", "Ensemble (ETS + ADP)", 
             "ADP estimate", "Revelio", "ADP and Revelio Avg.", "Carlyle"),
  `Sep_2025_Forecast` = c(
    sep_2025_diff %>% filter(.model == "ets_aada") %>% pull(diff),
    sep_2025_diff %>% filter(.model == "adp") %>% pull(diff),
    sep_2025_diff %>% filter(.model == "ensemble_ets_adp") %>% pull(diff),
    -32,  # ADP estimate
    60.1,  # Revelio value
    14.05,  #ADP and Revelio Avg.
    17   # Carlyle value
  )
)

# --- Create kable for Sep 2025 differences ---
sep_forecast_kable <- sep_forecast_table %>%
  kable(
    caption = "Sep 2025 Employment Growth Nowcasts",
    col.names = c("Source", "Thousands"),
    digits = 0
  )

sep_forecast_kable