sticker_data <- read.csv("/Users/emilybroderick/Desktop/R/PAF/Final/train.csv")
sticker_test <- read.csv("/Users/emilybroderick/Desktop/R/PAF/Final/test.csv")
# Clean raw data
sticker_data <- sticker_data %>%
mutate(date = as.Date(date))
# Compute year cutoffs without unevenly separating groups
global_cutoff <- sticker_data %>%
summarise(cut = sort(unique(date))[floor(0.8 * n_distinct(date))]) %>%
pull(cut)
train_raw <- sticker_data %>% filter(date <= global_cutoff)
test_raw <- sticker_data %>% filter(date > global_cutoff)
# Convert to tsibble
train <- train_raw %>%
dplyr::select(country, store, product, date, num_sold) %>%
as_tsibble(key = c(country, store, product), index = date)
test <- test_raw %>%
dplyr::select(country, store, product, date, num_sold) %>%
as_tsibble(key = c(country, store, product), index = date)
# Create hierarchical training set
train_agg <- train %>%
aggregate_key((country / store) * product, num_sold = sum(num_sold), .auto = FALSE)
# Filter aggregated training set
train_top_level <- train_agg %>%
filter(is_aggregated(country),
is_aggregated(store),
is_aggregated(product)) %>%
dplyr::select(date, num_sold) %>% # Remove keys entirely
as_tsibble(index = date)
# Create hierarchical test set
test_agg <- test %>%
aggregate_key((country / store) * product, num_sold = sum(num_sold), .auto = FALSE)
# Filter aggregated test set
test_top_level <- test_agg %>%
filter(is_aggregated(country),
is_aggregated(store),
is_aggregated(product)) %>%
dplyr::select(date, num_sold) %>% # Remove keys entirely
as_tsibble(index = date)
# Create hierarchy for full dataset (kaggle forecast)
sticker_agg <- sticker_data %>%
rename(num_sold = num_sold) %>%
as_tsibble(key = c(country, store, product), index = date) %>%
aggregate_key((country / store) * product, num_sold = sum(num_sold), .auto = FALSE)
# Filter aggregated full dataset (plotting)
sticker_top_level <- sticker_agg %>%
filter(is_aggregated(country),
is_aggregated(store),
is_aggregated(product)) %>%
dplyr::select(date, num_sold) %>% # Remove keys entirely
as_tsibble(index = date)
# Dynamic Regression Feature Creation
all_dates <- train_top_level %>%
distinct(date) %>%
arrange(date) %>%
pull(date)
# Create features for aggregated data
test_top_level <- test_top_level %>%
mutate(month = month(date),
wday = wday(date, label = TRUE, week_start = 1),
is_weekend = wday %in% c("Sat", "Sun"),
business_day = !(is_weekend)
) %>%
arrange(date) %>%
mutate(lag1 = lag(num_sold, 1),
lag7 = lag(num_sold, 7),
lag28 = lag(num_sold, 28),
roll7_mean = slider::slide_dbl(num_sold, mean, .before = 6, .complete = TRUE)
)
train_top_level <- train_top_level %>%
mutate(month = month(date),
wday = wday(date, label = TRUE, week_start = 1),
is_weekend = wday %in% c("Sat", "Sun"),
business_day = !(is_weekend)
) %>%
arrange(date) %>%
mutate(lag1 = lag(num_sold, 1),
lag7 = lag(num_sold, 7),
lag28 = lag(num_sold, 28),
roll7_mean = slider::slide_dbl(num_sold, mean, .before = 6, .complete = TRUE)
)
# Create a time plot to get an idea of what the data look like
time_plot <- autoplot(train_top_level, num_sold) +
labs(title = "Sticker Sales",
y = "Number Sold")
time_plot
# Create a seasonal plot of the years by month
seasonal_plot_yr <- train_top_level %>%
gg_season(num_sold, period = "year") +
labs(y="Number Sold",
title="Seasonal Plot: Sticker Sales")
seasonal_plot_yr
# Check ACF features for stationarity
acf_feat <- train_top_level %>%
features(num_sold, feat_acf)
acf_feat %>%
gt() %>%
tab_header(title = "Time Series ACF Features")
acf <- acf(train_top_level$num_sold, main="ACF of Sticker Sales")
# Check PACF features
pacf_feat <- train_top_level %>%
features(num_sold, feat_pacf)
pacf_feat %>%
gt() %>%
tab_header(title = "Time Series PACF Features")
pacf <- pacf(train_top_level$num_sold, main = "PACF of Sticker Sales")
add_dcmp <- train_top_level %>%
model(stl = STL(num_sold))
components(add_dcmp) %>%
autoplot() + # Plot the decomp
labs(title = "Additive STL Decomposition")
# Mult decomp
mult_dcmp <- train_top_level %>%
model(stl = STL(log(num_sold))) # Log transform
components(mult_dcmp) %>%
autoplot() + # Plot the decomp
labs(title = "Multiplicative STL Decomposition")
# Compare accuracy
decomp <- train_top_level %>% model(add = STL(num_sold),
mult = STL(log(num_sold)))
acc_decomp <- accuracy(decomp)
# Filter out important accuracy measures with important models
acc_summary_tbl <- acc_decomp %>%
dplyr::select(Model = .model, RMSE, MAE, MAPE, ACF1)
# Make the table presentable
acc_summary_tbl %>%
gt() %>%
tab_header(title = "Additive vs. Multiplicative STL Decomposition")
The ETS model was chosen based on lowest AICc.
# Fit the model
ets_fit <- train_top_level %>%
model(ETS(log(num_sold)))
# View selected model and info
report(ets_fit)
# Forecast model
ets_fc <- forecast(ets_fit, new_data = test_top_level)
# Plot forecast
ets_plot <- ets_fc %>%
autoplot(sticker_top_level %>%
filter(!is.na(num_sold)), level = c(80, 95)) +
geom_line(aes(y = .mean), data = ets_fc, linetype = 1) +
labs(title = "Sticker Sales with ETS () Forecast",
y = "Number Sold")
ets_plot
# Fit model
arst_fit <- train_top_level %>%
model(ARIMA(log(num_sold)))
# View selected model and info
report(arst_fit)
# Forecast model
arst_fc <- forecast(arst_fit, new_data = test_top_level)
# Plot forecast
arst_plot <- arst_fc %>%
autoplot(sticker_top_level %>%
filter(!is.na(num_sold)), level = c(80, 95)) +
geom_line(aes(y = .mean), data = arst_fc, linetype = 1) +
labs(title = "Sticker Sales with ARIMA () Forecast",
y = "Number Sold")
arst_plot
# Build models
dhr_fit <- model(train_top_level,
`K = 1` = ARIMA(log(num_sold) ~ month + is_weekend + lag1 + lag7 +
roll7_mean + fourier(K=1)),
`K = 2` = ARIMA(log(num_sold) ~ month + is_weekend + lag1 + lag7 +
roll7_mean + fourier(K=2)),
`K = 3` = ARIMA(log(num_sold) ~ month + is_weekend + lag1 + lag7 +
roll7_mean + fourier(K=3))
)
# Compare model AICc and BIC
glance(dhr_fit)
# Forecast model
dhr_fc <- forecast(dhr_fit, new_data = test_top_level)
# Plot model
dhr_plot <- dhr_fc %>%
autoplot(sticker_top_level %>%
filter(!is.na(num_sold)), level = c(80, 95)) +
geom_line(aes(y = .mean), data = dhr_fc, linetype = 1) +
labs(title = "Sticker Sales with DHR Forecast",
y = "Number Sold")
dhr_plot
nn_fit <- train_top_level %>%
model(NNETAR = NNETAR(log(num_sold)))
nn_fc <- forecast(nn_fit, new_data = test_top_level)
nn_plot <- nn_fc %>%
autoplot(sticker_top_level %>%
filter(!is.na(num_sold)), level = c(80, 95)) +
geom_line(aes(y = .mean), data = nn_fc, linetype = 1) +
labs(title = "Sticker Sales with Neural Network Forecast",
y = "Number Sold")
nn_plot
# Build models all together
fits <- train_top_level %>%
model(
ETS = ETS(log(num_sold)),
ARIMA = ARIMA(log(num_sold)),
`DHR (K = 3)` = ARIMA(log(num_sold) ~ month + is_weekend + lag1 + lag7 +
roll7_mean + fourier(K=3)),
Ensemble = ((ARIMA(log(num_sold) ~ month + is_weekend + lag1 + lag7 +
roll7_mean + fourier(K=3))) +
ARIMA(log(num_sold)) +
ETS(log(num_sold))) / 3
)
# Forecast models
fc <- forecast(fits, new_data = test_top_level)
# Get accuracy stats
compare <- accuracy(fc, test_top_level)
# Filter out important accuracy measures with important models
summary_tbl <- compare %>%
dplyr::select(Model = .model, RMSE, MAE, MAPE, ACF1)
summary_tbl %>%
gt() %>%
tab_header(title = "Model Comparison")
# Load sample submission
sample_sub <- read.csv("/Users/emilybroderick/Desktop/R/PAF/Final/sample_submission.csv",
stringsAsFactors = FALSE)
# Prepare test data
sticker_test <- sticker_test %>%
mutate(date = as.Date(date))
# test bottom-level tsibble
test_bottom <- sticker_test %>%
as_tsibble(key = c(country, store, product), index = date)
# Create num_sold variable
test_bottom$num_sold <- NA
# Aggregate to hierarchical structure
test_hts <- test_bottom %>%
aggregate_key((country / store) * product, num_sold = NA)
# Build model on full train.csv
ets_fit <- sticker_agg %>%
model(ets = ETS(log(num_sold))) %>%
reconcile(ETS = bottom_up(ets))
# Forecast for test horizon
bu_fc <- ets_fit %>%
forecast(new_data = test_hts)
# Convert forecasts to a clean tibble
fc_tbl <- bu_fc %>%
as_tibble() %>%
filter(
.model == "ETS",
!is_aggregated(country),
!is_aggregated(store),
!is_aggregated(product)
) %>%
mutate(
num_sold = exp(.mean),
num_sold = pmax(0, num_sold),
num_sold = round(num_sold)
) %>%
dplyr::select(country, store, product, date, num_sold)
# Add ID back
fc_tbl <- sticker_test %>%
mutate(date = as.Date(date)) %>%
left_join(
fc_tbl,
by = c("country", "store", "product", "date")
) %>%
dplyr::select(id, num_sold) %>%
arrange(id) %>% # ensure correct order
# Write CSV
write.csv(fc_tbl, "submission.csv", row.names = FALSE)
Score: 1.45 x 10^172
Username: broderem