Forecasting Sales Data (Baseline)

Ho Huu Binh

26/2/2022

Load Package

# Time series decomposition
library("feasts") # Using STL
library('tidyverse')
library('tsibble')
library('fable')
library('stR') # Using Seasonal Trend regression model 
library('bsts')


# Date + forecast
library('lubridate') # date and time
library('prophet') # time series analysis
library('timetk') # time series analysis
library('timeDate') # time series analysis

# general data manipulation
library('dplyr') # data manipulation
library('readr') # input/output
library('vroom') # input/output
library('tibble') # data wrangling
library('tidyr') # data wrangling

Load data

rm(list = ls())

# Preventing clash from Dplyr and MASS package for select function

select <- dplyr::select

train_df <- vroom("D:\\Dataset\\sales_train_validation.csv", delim = ",", col_types = cols())
price_df <- vroom("D:\\Dataset\\sell_prices.csv", delim = ",",
                  col_types = cols())
calendar_df <- vroom("D:\\Dataset\\calendar.csv", col_types = cols())

test_df <- vroom("D:\\Dataset\\sales_train_evaluation.csv", delim = ",",
                 col_types = cols())

# Retrieving special holidays 
holidays <- c(USThanksgivingDay(2011:2016), USChristmasDay(2011:2016))@Data
hoidays <- as.Date(holidays)

Function to transform dataframe from wide to long format

wide_to_long <- function(data){
  start_date <- date("2011-01-29")
  
  data %>%
    select(id, starts_with("d_")) %>%
    pivot_longer(starts_with("d_"), names_to = "dates",
                 values_to = "sales") %>% # group days columns with values equal to number of sales
    mutate(dates = as.integer(str_remove(dates, "d_"))) %>% # remove the prefix "d_"
    mutate(dates = start_date + dates - 1) %>% # operation to transform into date format
    mutate(dates = ymd(dates)) %>% # transform to date class
    mutate(id = str_remove(id, "_validation")) # remove suffix "_validation" for each type of product
}

Forecasting with Category and State Level

start_date = date("2011-01-29")

temp <- train_df %>%
  group_by(cat_id, state_id) %>% 
  summarise_at(vars(starts_with("d_")), sum) %>% 
  ungroup() %>% 
  select(ends_with("id"), starts_with("d_")) %>%  
  pivot_longer(starts_with("d_"), names_to = "dates", values_to = "sales") %>%
  mutate(dates = as.integer(str_remove(dates, "d_"))) %>% 
  mutate(dates = start_date + dates - 1) %>%
  mutate(dates = ymd(dates)) %>%
  mutate(dummy = if_else(dates %in% holidays, 1, 0)) %>%
  mutate(dow = wday(dates, label = TRUE))
  # filter(!str_detect(as.character(dates), '..-12-25'))
  # replace_na(list(sales =  0))

temp_ts <- temp %>%
  as_tsibble(key = c(cat_id, state_id), index = dates)

FOOD Category in California

1 week forecast

food.CA <- temp_ts %>%
  filter(cat_id == "FOODS" & state_id == "CA") %>%
  select(dates, sales, dummy, dow) 

# Forecasting task 
# Forecasting with Decomposition
fit_food_CA_week <- food.CA %>% filter(dates < "2016-04-18") %>% model(stlf = decomposition_model(STL(
  sales  ~ season(window = 'periodic') + trend(window = 15),
                                robust = TRUE),
                                ARIMA(season_adjust ~ PDQ(0,0,0)),
                                SNAIVE(season_year)),
  additive_ets = ETS(sales ~ error("A") + trend("A") +
                                                season("A")),
  multiplicative_ets = ETS(sales ~ error("M") + trend("A") +
                                                season("M")),
  arima = ARIMA(sales)) %>%
  mutate(combination_ad = (additive_ets + stlf + arima) / 3,
         combination_mul = (multiplicative_ets + stlf + arima) / 3)

# report(fit_food_CA)

# Forecasting 1 week
food_CA_fc_week <- fit_food_CA_week %>%
  forecast(h = "1 week")

# Plotting the forecast
food_CA_fc_week %>%
  autoplot(food.CA %>% filter(dates <= "2016-04-24" & dates >= "2016-03-24"),
           level = NULL) + 
  labs(y = "Sale Volumes", 
       title = "Food Sale in California")

# Checking the accuracy 
fc_accuracy_week <- accuracy(food_CA_fc_week, food.CA,  measures = list(
    point_accuracy_measures,
    interval_accuracy_measures,
    distribution_accuracy_measures))

fc_accuracy_week %>% 
  group_by(.model) %>%
  summarise(RMSE = mean(RMSE),
    MAE = mean(MAE),
    MASE = mean(MASE),
    MAPE = mean(MAPE),
    Winkler = mean(winkler),
    CRPS = mean(CRPS)
  ) %>%
  arrange(MAPE)
## # A tibble: 6 x 7
##   .model              RMSE   MAE  MASE  MAPE Winkler  CRPS
##   <chr>              <dbl> <dbl> <dbl> <dbl>   <dbl> <dbl>
## 1 multiplicative_ets  246.  162. 0.162  1.40   5300.  332.
## 2 combination_mul     316.  253. 0.254  2.12   4659.  308.
## 3 additive_ets        332.  239. 0.240  2.14   4594.  309.
## 4 combination_ad      334.  258. 0.259  2.16   4445.  302.
## 5 arima               354.  310. 0.311  2.59   4388.  304.
## 6 stlf                417.  346. 0.347  2.95   4774.  338.

1 month forecast

# Forecasting task 
# Forecasting with Decomposition
fit_food_CA_month <- food.CA %>% filter(dates < "2016-03-28") %>% model(stlf = decomposition_model(STL(
  sales  ~ season(window = 'periodic') + trend(window = 15),
                                robust = TRUE),
                                ARIMA(season_adjust ~ PDQ(0,0,0)),
                                SNAIVE(season_year)),
  additive_ets = ETS(sales ~ error("A") + trend("A") +
                                                season("A")),
  multiplicative_ets = ETS(sales ~ error("M") + trend("A") +
                                                season("M")),
  arima = ARIMA(sales)) %>%
  mutate(combination_ad = (additive_ets + stlf + arima) / 3,
         combination_mul = (multiplicative_ets + stlf + arima) / 3)

# report(fit_food_CA)

# Forecasting 1 week
food_CA_fc_month <- fit_food_CA_month %>%
  forecast(h = "28 days")

# Plotting the forecast
food_CA_fc_month %>%
  autoplot(food.CA %>% filter(dates <= "2016-04-24" & dates >= "2016-03-27"),
           level = NULL) + 
  labs(y = "Sale Volumes", 
       title = "Food Sale in California")

# Checking the accuracy 
fc_accuracy_month <- accuracy(food_CA_fc_month, food.CA,  measures = list(
    point_accuracy_measures,
    interval_accuracy_measures,
    distribution_accuracy_measures))

fc_accuracy_month %>% 
  group_by(.model) %>%
  summarise(RMSE = mean(RMSE),
    MAE = mean(MAE),
    MASE = mean(MASE),
    MAPE = mean(MAPE),
    Winkler = mean(winkler),
    CRPS = mean(CRPS)
  ) %>%
  arrange(MAPE)
## # A tibble: 6 x 7
##   .model              RMSE   MAE  MASE  MAPE Winkler  CRPS
##   <chr>              <dbl> <dbl> <dbl> <dbl>   <dbl> <dbl>
## 1 combination_mul     744.  590. 0.592  5.21   5813.  499.
## 2 combination_ad      793.  633. 0.635  5.63   5599.  506.
## 3 stlf                843.  683. 0.685  6.04   5712.  532.
## 4 arima               899.  726. 0.728  6.05   4859.  530.
## 5 multiplicative_ets  874.  735. 0.737  6.74   7551.  616.
## 6 additive_ets       1027.  877. 0.879  8.21   6587.  635.

1 year forecast

# Forecasting task 
# Forecasting with Decomposition
fit_food_CA_year <- food.CA %>% filter(dates < "2015-04-26") %>% model(stlf = decomposition_model(STL(
  sales  ~ season(window = 'periodic') + trend(window = 13),
                                robust = TRUE),
                                ARIMA(season_adjust ~ dummy + dow + PDQ(0,0,0)),
                                SNAIVE(season_year)),
  additive_ets = ETS(sales ~ error("A") + trend("A") +
                                                season("A")),
  multiplicative_ets = ETS(sales ~ error("M") + trend("A") +
                                                season("M")),
  arima = ARIMA(sales)) %>%
  mutate(combination_ad = (additive_ets + stlf + arima) / 3,
         combination_mul = (multiplicative_ets + stlf + arima) / 3)
## Warning: Provided exogenous regressors are rank deficient, removing regressors:
## `dummy`
# report(fit_food_CA)

# Forecasting 1 year

fc_xreg <- food.CA %>%
  filter(dates <= "2016-04-24" & dates >= "2015-04-25") %>%
  select(dummy, dow)


food_CA_fc_year <- fit_food_CA_year %>%
  forecast(h = "1 year", new_data = fc_xreg)
## Warning: Input forecast horizon `h` will be ignored as `new_data` has been
## provided.
# Plotting the forecast
food_CA_fc_year %>%
  autoplot(food.CA %>% filter(dates <= "2016-04-24" & dates >= "2015-04-25"),
           level = NULL) + 
  labs(y = "Sale Volumes", 
       title = "Food Sale in California")

# Checking the accuracy 
fc_accuracy_year <- accuracy(food_CA_fc_year, food.CA,  measures = list(
    point_accuracy_measures,
    interval_accuracy_measures,
    distribution_accuracy_measures))

fc_accuracy_year %>% 
  group_by(.model) %>%
  summarise(RMSE = mean(RMSE),
    MAE = mean(MAE),
    MASE = mean(MASE),
    MAPE = mean(MAPE),
    Winkler = mean(winkler),
    CRPS = mean(CRPS)
  ) %>%
  arrange(MAPE)
## # A tibble: 6 x 7
##   .model              RMSE   MAE  MASE  MAPE Winkler  CRPS
##   <chr>              <dbl> <dbl> <dbl> <dbl>   <dbl> <dbl>
## 1 stlf               2663. 2053.  2.06  516.  13990. 1490.
## 2 additive_ets       2543. 1977.  1.98  526.  26925. 1991.
## 3 combination_ad     2485. 1900.  1.90  530.  15261. 1463.
## 4 combination_mul    2348. 1758.  1.76  544.  14036. 1362.
## 5 arima              2284. 1699.  1.70  548.  16946. 1269.
## 6 multiplicative_ets 2171. 1604.  1.61  569.  21902. 1662.
fit_food_CA_year %>% select(combination_ad) %>% gg_tsresiduals()
## Warning: Removed 7 row(s) containing missing values (geom_path).
## Warning: Removed 7 rows containing missing values (geom_point).
## Warning: Removed 7 rows containing non-finite values (stat_bin).

# food.CA.temp <- food.CA %>%
#   select(dates, sales)
# 
# dcmp <- food.CA.temp %>%
#   model(stl = STL(sales ~ trend(window = 365) +
#                    season(window = "periodic"), robust = TRUE))
# 
# 
# components(dcmp)