Load Data

sticker_data <- read.csv("/Users/emilybroderick/Desktop/R/PAF/Final/train.csv")
sticker_test <- read.csv("/Users/emilybroderick/Desktop/R/PAF/Final/test.csv")

Clean and Convert to Tsibble

# Clean raw data
sticker_data <- sticker_data %>%
  mutate(date = as.Date(date))

# Compute year cutoffs without unevenly separating groups
global_cutoff <- sticker_data %>%
  summarise(cut = sort(unique(date))[floor(0.8 * n_distinct(date))]) %>%
  pull(cut)

train_raw <- sticker_data %>% filter(date <= global_cutoff)
test_raw  <- sticker_data %>% filter(date >  global_cutoff)

# Convert to tsibble
train <- train_raw %>%
  dplyr::select(country, store, product, date, num_sold) %>%
  as_tsibble(key = c(country, store, product), index = date) 

test <- test_raw %>%
  dplyr::select(country, store, product, date, num_sold) %>%
  as_tsibble(key = c(country, store, product), index = date)

Aggregate Data

# Create hierarchical training set
train_agg <- train %>%
  aggregate_key((country / store) * product, num_sold = sum(num_sold), .auto = FALSE)

# Filter aggregated training set
train_top_level <- train_agg %>%
  filter(is_aggregated(country),
         is_aggregated(store),
         is_aggregated(product)) %>%
  dplyr::select(date, num_sold) %>% # Remove keys entirely
  as_tsibble(index = date)

# Create hierarchical test set
test_agg <- test %>%
  aggregate_key((country / store) * product, num_sold = sum(num_sold), .auto = FALSE)

# Filter aggregated test set
test_top_level <- test_agg %>%
  filter(is_aggregated(country),
         is_aggregated(store),
         is_aggregated(product)) %>%
  dplyr::select(date, num_sold) %>% # Remove keys entirely
  as_tsibble(index = date)

# Create hierarchy for full dataset (kaggle forecast)
sticker_agg <- sticker_data %>%
  rename(num_sold = num_sold) %>%
  as_tsibble(key = c(country, store, product), index = date) %>%
  aggregate_key((country / store) * product, num_sold = sum(num_sold), .auto = FALSE)

# Filter aggregated full dataset (plotting)
sticker_top_level <- sticker_agg %>%
  filter(is_aggregated(country),
         is_aggregated(store),
         is_aggregated(product)) %>%
  dplyr::select(date, num_sold) %>% # Remove keys entirely
  as_tsibble(index = date)

Create Features

# Dynamic Regression Feature Creation

all_dates <- train_top_level %>% 
  distinct(date) %>% 
  arrange(date) %>% 
  pull(date)

# Create features for aggregated data
test_top_level <- test_top_level %>%
  mutate(month = month(date),
         wday = wday(date, label = TRUE, week_start = 1),
         is_weekend = wday %in% c("Sat", "Sun"),
         business_day = !(is_weekend)
  ) %>%
  arrange(date) %>%
  mutate(lag1 = lag(num_sold, 1),
         lag7 = lag(num_sold, 7),
         lag28 = lag(num_sold, 28),
         roll7_mean = slider::slide_dbl(num_sold, mean, .before = 6, .complete = TRUE)
  )

train_top_level <- train_top_level %>%
  mutate(month = month(date),
         wday = wday(date, label = TRUE, week_start = 1),
         is_weekend = wday %in% c("Sat", "Sun"),
         business_day = !(is_weekend)
  ) %>%
  arrange(date) %>%
  mutate(lag1 = lag(num_sold, 1),
         lag7 = lag(num_sold, 7),
         lag28 = lag(num_sold, 28),
         roll7_mean = slider::slide_dbl(num_sold, mean, .before = 6, .complete = TRUE)
  )

Initial Visualizations

Time Plot

# Create a time plot to get an idea of what the data look like
time_plot <- autoplot(train_top_level, num_sold) +
  labs(title = "Sticker Sales",
       y = "Number Sold")

time_plot

Seasonal Plot

# Create a seasonal plot of the years by month
seasonal_plot_yr <- train_top_level %>% 
  gg_season(num_sold, period = "year") +
  labs(y="Number Sold", 
       title="Seasonal Plot: Sticker Sales")

seasonal_plot_yr

ACF Graph

# Check ACF features for stationarity
acf_feat <- train_top_level %>% 
  features(num_sold, feat_acf)

acf_feat %>%
  gt() %>%
  tab_header(title = "Time Series ACF Features")

acf <- acf(train_top_level$num_sold, main="ACF of Sticker Sales")

PACF Graph

# Check PACF features
pacf_feat <- train_top_level %>% 
  features(num_sold, feat_pacf)

pacf_feat %>%
  gt() %>%
  tab_header(title = "Time Series PACF Features")

pacf <- pacf(train_top_level$num_sold, main = "PACF of Sticker Sales")

Decompose the Training Set

Additive STL Decomposition

add_dcmp <- train_top_level %>%
  model(stl = STL(num_sold)) 
components(add_dcmp) %>%
  autoplot() + # Plot the decomp
  labs(title = "Additive STL Decomposition")

Multiplicative STL Decomposition

# Mult decomp
mult_dcmp <- train_top_level %>%
  model(stl = STL(log(num_sold))) # Log transform
components(mult_dcmp) %>%
  autoplot() + # Plot the decomp
  labs(title = "Multiplicative STL Decomposition")

Which is Best?

# Compare accuracy
decomp <- train_top_level %>% model(add = STL(num_sold),
                                       mult = STL(log(num_sold)))
acc_decomp <- accuracy(decomp)

# Filter out important accuracy measures with important models
acc_summary_tbl <- acc_decomp %>%
  dplyr::select(Model = .model, RMSE, MAE, MAPE, ACF1)

# Make the table presentable
acc_summary_tbl %>%
  gt() %>%
  tab_header(title = "Additive vs. Multiplicative STL Decomposition")

Aggregated Modeling

ETS (M,Ad,M)

The ETS model was chosen based on lowest AICc.

# Fit the model
ets_fit <- train_top_level %>%
  model(ETS(log(num_sold)))

# View selected model and info
report(ets_fit)

# Forecast model
ets_fc <- forecast(ets_fit, new_data = test_top_level)

# Plot forecast
ets_plot <- ets_fc %>%
  autoplot(sticker_top_level %>% 
             filter(!is.na(num_sold)), level = c(80, 95)) +
  geom_line(aes(y = .mean), data = ets_fc, linetype = 1) +
  labs(title = "Sticker Sales with ETS () Forecast",
       y = "Number Sold")

ets_plot

ARIMA (3,0,2)(1,1,0)

# Fit model
arst_fit <- train_top_level %>%
  model(ARIMA(log(num_sold)))

# View selected model and info
report(arst_fit)

# Forecast model
arst_fc <- forecast(arst_fit, new_data = test_top_level)

# Plot forecast
arst_plot <- arst_fc %>%
  autoplot(sticker_top_level %>% 
             filter(!is.na(num_sold)), level = c(80, 95)) +
  geom_line(aes(y = .mean), data = arst_fc, linetype = 1) +
  labs(title = "Sticker Sales with ARIMA () Forecast",
       y = "Number Sold")

arst_plot

Dynamic Regression

# Build models
dhr_fit <- model(train_top_level,
                 `K = 1` = ARIMA(log(num_sold) ~ month + is_weekend + lag1 + lag7 + 
                                   roll7_mean + fourier(K=1)),
                 `K = 2` = ARIMA(log(num_sold) ~ month + is_weekend + lag1 + lag7 + 
                                   roll7_mean + fourier(K=2)),
                 `K = 3` = ARIMA(log(num_sold) ~ month + is_weekend + lag1 + lag7 + 
                                   roll7_mean + fourier(K=3))
)

# Compare model AICc and BIC
glance(dhr_fit)

# Forecast model
dhr_fc <- forecast(dhr_fit, new_data = test_top_level)

# Plot model
dhr_plot <- dhr_fc %>%
  autoplot(sticker_top_level %>% 
             filter(!is.na(num_sold)), level = c(80, 95)) +
  geom_line(aes(y = .mean), data = dhr_fc, linetype = 1) +
  labs(title = "Sticker Sales with DHR Forecast",
       y = "Number Sold")

dhr_plot

Neural Network Model - Failed

nn_fit <- train_top_level %>%
  model(NNETAR = NNETAR(log(num_sold)))

nn_fc <- forecast(nn_fit, new_data = test_top_level)

nn_plot <- nn_fc %>%
  autoplot(sticker_top_level %>% 
             filter(!is.na(num_sold)), level = c(80, 95)) +
  geom_line(aes(y = .mean), data = nn_fc, linetype = 1) +
  labs(title = "Sticker Sales with Neural Network Forecast",
       y = "Number Sold")

nn_plot

Model Accuracy Comparison

All Models

# Build models all together
fits <- train_top_level %>%
  model(
    ETS = ETS(log(num_sold)),
    ARIMA = ARIMA(log(num_sold)),
    `DHR (K = 3)` = ARIMA(log(num_sold) ~ month + is_weekend + lag1 + lag7 + 
                                   roll7_mean + fourier(K=3)),
    Ensemble = ((ARIMA(log(num_sold) ~ month + is_weekend + lag1 + lag7 + 
                                   roll7_mean + fourier(K=3))) + 
                  ARIMA(log(num_sold)) +
                  ETS(log(num_sold))) / 3
  )

# Forecast models
fc <- forecast(fits, new_data = test_top_level)

Compare Accuracy

# Get accuracy stats
compare <- accuracy(fc, test_top_level) 

# Filter out important accuracy measures with important models
summary_tbl <- compare %>%
  dplyr::select(Model = .model, RMSE, MAE, MAPE, ACF1)

summary_tbl %>%
  gt() %>%
  tab_header(title = "Model Comparison")

Hierarchical Modeling

Bottom-Up ETS

# Load sample submission
sample_sub <- read.csv("/Users/emilybroderick/Desktop/R/PAF/Final/sample_submission.csv",
                       stringsAsFactors = FALSE) 

# Prepare test data
sticker_test <- sticker_test %>%
  mutate(date = as.Date(date))

# test bottom-level tsibble
test_bottom <- sticker_test %>%
  as_tsibble(key = c(country, store, product), index = date)

# Create num_sold variable
test_bottom$num_sold <- NA

# Aggregate to hierarchical structure
test_hts <- test_bottom %>%
  aggregate_key((country / store) * product, num_sold = NA)

Build Model and Forecast

# Build model on full train.csv
ets_fit <- sticker_agg %>%
  model(ets = ETS(log(num_sold))) %>%
  reconcile(ETS = bottom_up(ets))

# Forecast for test horizon
bu_fc <- ets_fit %>%
  forecast(new_data = test_hts)

Submit to Kaggle

# Convert forecasts to a clean tibble
fc_tbl <- bu_fc %>%
  as_tibble() %>%
  filter(
    .model == "ETS",
    !is_aggregated(country),
    !is_aggregated(store),
    !is_aggregated(product)
  ) %>%
  mutate(
    num_sold = exp(.mean),
    num_sold = pmax(0, num_sold),
    num_sold = round(num_sold)
  ) %>%
  dplyr::select(country, store, product, date, num_sold)

# Add ID back 
fc_tbl <- sticker_test %>%
  mutate(date = as.Date(date)) %>%
  left_join(
    fc_tbl,
    by = c("country", "store", "product", "date")
  ) %>%
  dplyr::select(id, num_sold) %>%
  arrange(id) %>% # ensure correct order

# Write CSV
write.csv(fc_tbl, "submission.csv", row.names = FALSE)

Kaggle Score

Score: 1.45 x 10^172

Username: broderem