The dataset contains the weekly sales of Walmart across its 45 stores in the US, with 143 observations per store, totaling 143 × 45 = 6,435 data points. As the sales are recorded at regularly spaced intervals, this is time-series data. We will attempt to forecast Walmart’s weekly sales half a year into the future for each of the 45 stores.
library(tidyverse)
library(dplyr)
library(lubridate)
library(padr)
library(zoo)
library(forecast)
library(TTR)
library(MLmetrics)
library(tidymodels)
library(TSstudio)
library(ggplot2)
library(magrittr)
library(tidyquant)
library(DescTools)
library(recipes)
walmart <- read_csv("walmart-sales-dataset-of-45stores.csv")
str(walmart)
## spc_tbl_ [6,435 × 8] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Store : num [1:6435] 1 1 1 1 1 1 1 1 1 1 ...
## $ Date : chr [1:6435] "05-02-2010" "12-02-2010" "19-02-2010" "26-02-2010" ...
## $ Weekly_Sales: num [1:6435] 1643691 1641957 1611968 1409728 1554807 ...
## $ Holiday_Flag: num [1:6435] 0 1 0 0 0 0 0 0 0 0 ...
## $ Temperature : num [1:6435] 42.3 38.5 39.9 46.6 46.5 ...
## $ Fuel_Price : num [1:6435] 2.57 2.55 2.51 2.56 2.62 ...
## $ CPI : num [1:6435] 211 211 211 211 211 ...
## $ Unemployment: num [1:6435] 8.11 8.11 8.11 8.11 8.11 ...
## - attr(*, "spec")=
## .. cols(
## .. Store = col_double(),
## .. Date = col_character(),
## .. Weekly_Sales = col_double(),
## .. Holiday_Flag = col_double(),
## .. Temperature = col_double(),
## .. Fuel_Price = col_double(),
## .. CPI = col_double(),
## .. Unemployment = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
We convert the columns of interest to the appropriate types: Date is parsed from character with dmy(), and Store is converted to a factor.
walmart <- walmart %>%
mutate(Date = dmy(Date), Store = as.factor(Store)) %>%
arrange(Date)
head(walmart)
## # A tibble: 6 × 8
## Store Date Weekly_Sales Holiday_Flag Temperature Fuel_Price CPI
## <fct> <date> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 2010-02-05 1643691. 0 42.3 2.57 211.
## 2 2 2010-02-05 2136989. 0 40.2 2.57 211.
## 3 3 2010-02-05 461622. 0 45.7 2.57 214.
## 4 4 2010-02-05 2135144. 0 43.8 2.60 126.
## 5 5 2010-02-05 317173. 0 39.7 2.57 212.
## 6 6 2010-02-05 1652635. 0 40.4 2.57 213.
## # ℹ 1 more variable: Unemployment <dbl>
#Check the min and max date in our data
range(walmart$Date)
## [1] "2010-02-05" "2012-10-26"
#Create sequence of weekly dates spanning the range of the dataset
min_date <- min(walmart$Date)
max_date <- max(walmart$Date)
complete_time <- seq(from = min_date, to = max_date, by = "1 week")
df <- tibble(time = complete_time)
dim(df)[1]
## [1] 143
#Check if there are any time skips
walmart %>%
group_by(Store) %>%
summarise(freq = n()) %>%
mutate(is_same = freq == dim(df)[1])
## # A tibble: 45 × 3
## Store freq is_same
## <fct> <int> <lgl>
## 1 1 143 TRUE
## 2 2 143 TRUE
## 3 3 143 TRUE
## 4 4 143 TRUE
## 5 5 143 TRUE
## 6 6 143 TRUE
## 7 7 143 TRUE
## 8 8 143 TRUE
## 9 9 143 TRUE
## 10 10 143 TRUE
## # ℹ 35 more rows
#Check for any missing values
any(is.na(walmart))
## [1] FALSE
colSums(is.na(walmart))
## Store Date Weekly_Sales Holiday_Flag Temperature Fuel_Price
## 0 0 0 0 0 0
## CPI Unemployment
## 0 0
As we would like to forecast for each of the Walmart stores, we use group_by on the stores and nest the columns of interest, i.e. the Date and Weekly_Sales columns.
walmart_nested <- walmart %>%
select(c(Store, Date, Weekly_Sales)) %>%
group_by(Store) %>%
nest()
walmart_nested
## # A tibble: 45 × 2
## # Groups: Store [45]
## Store data
## <fct> <list>
## 1 1 <tibble [143 × 2]>
## 2 2 <tibble [143 × 2]>
## 3 3 <tibble [143 × 2]>
## 4 4 <tibble [143 × 2]>
## 5 5 <tibble [143 × 2]>
## 6 6 <tibble [143 × 2]>
## 7 7 <tibble [143 × 2]>
## 8 8 <tibble [143 × 2]>
## 9 9 <tibble [143 × 2]>
## 10 10 <tibble [143 × 2]>
## # ℹ 35 more rows
We decompose the series on two levels:
- ts object for single seasonality
- msts object for multiple seasonality

As the data is weekly, we want to see whether there are any strong seasonalities at the monthly (4 weeks), quarterly (12 weeks), half-yearly (26 weeks) and annual (52 weeks) frequencies.
walmart_ts_decompose <- function(df){
ts(df$Weekly_Sales, frequency=52) %>% decompose() %>% autoplot()
}
walmart_msts_decompose <- function(df){
msts(df$Weekly_Sales, seasonal.periods = c(4, 12, 26, 52)) %>% mstl() %>% autoplot()
}
walmart_nested <- walmart_nested %>%
mutate(ts_decompose = map(data, walmart_ts_decompose))
print(walmart_nested$ts_decompose[[1]])
walmart_nested <- walmart_nested %>%
mutate(msts_decompose = map(data, walmart_msts_decompose))
print(walmart_nested$msts_decompose[[1]])
Note that the Walmart stores are all different, and each has its own trend and seasonality pattern.
#Let's see a few random examples
print(walmart_nested$msts_decompose[[1]])
print(walmart_nested$msts_decompose[[10]])
print(walmart_nested$msts_decompose[[37]])
print(walmart_nested$msts_decompose[[45]])
Insights:
- The stores have different trends: some are trending up, some down, and some fluctuate.
- The same applies to seasonality. The most marked variability is in the quarterly seasonality, which can look very different from store to store.
- Monthly seasonality is multiplicative, which makes sense from a business perspective, i.e. payday means more consumption.
- The strongest seasonality is the annual one, which means our sales forecast could probably be done under a single-seasonality assumption.
We have 143 data points for each of the 45 stores. As we will forecast 6 months into the future, we take 26 weeks as the size of our test data, with the remaining 117 weeks as the training data. For simplicity we keep the train and test splits as list-columns in the nested table.
#define the train function
train_split <- function(df, train_size = 117) {
train_data <- head(df, train_size)
return(train_data)
}
#apply the train function
walmart_nested <- walmart_nested %>%
mutate(train = map(data, train_split))
#define the test function
test_split <- function(df, test_size = 26) {
test_data <- tail(df, test_size)
return(test_data)
}
#apply the test function
walmart_nested<-walmart_nested %>%
mutate(test = map(data, test_split))
#view the nested tables
walmart_nested
## # A tibble: 45 × 6
## # Groups: Store [45]
## Store data ts_decompose msts_decompose train test
## <fct> <list> <list> <list> <list> <list>
## 1 1 <tibble [143 × 2]> <gg> <gg> <tibble> <tibble>
## 2 2 <tibble [143 × 2]> <gg> <gg> <tibble> <tibble>
## 3 3 <tibble [143 × 2]> <gg> <gg> <tibble> <tibble>
## 4 4 <tibble [143 × 2]> <gg> <gg> <tibble> <tibble>
## 5 5 <tibble [143 × 2]> <gg> <gg> <tibble> <tibble>
## 6 6 <tibble [143 × 2]> <gg> <gg> <tibble> <tibble>
## 7 7 <tibble [143 × 2]> <gg> <gg> <tibble> <tibble>
## 8 8 <tibble [143 × 2]> <gg> <gg> <tibble> <tibble>
## 9 9 <tibble [143 × 2]> <gg> <gg> <tibble> <tibble>
## 10 10 <tibble [143 × 2]> <gg> <gg> <tibble> <tibble>
## # ℹ 35 more rows
The proportions of train and test data are therefore roughly 82% and 18% respectively, as shown below.
#Choose a random store (store #7) and glimpse its train and test splits
glimpse(walmart_nested$train[[7]])
## Rows: 117
## Columns: 2
## $ Date <date> 2010-02-05, 2010-02-12, 2010-02-19, 2010-02-26, 2010-03-…
## $ Weekly_Sales <dbl> 496725.4, 524104.9, 506760.5, 496083.2, 491419.5, 480452.…
glimpse(walmart_nested$test[[7]])
## Rows: 26
## Columns: 2
## $ Date <date> 2012-05-04, 2012-05-11, 2012-05-18, 2012-05-25, 2012-06-…
## $ Weekly_Sales <dbl> 465198.9, 460397.4, 468428.3, 532739.8, 598495.0, 642963.…
#training proportion
print(paste("The train proportion is:", round(117 / (117 + 26) * 100, 1), "%"))
## [1] "The train proportion is: 81.8 %"
#test proportion
print(paste("The test proportion is:", round(26 / (117 + 26) * 100, 1), "%"))
## [1] "The test proportion is: 18.2 %"
We will create both the ts and msts objects for each of the Walmart stores. As we have two functions, we wrap them into a list and map them to each store. We also need to replicate each unique store number twice, once for each time series object.
Afterwards, we use left_join to combine the ts_function list with the original walmart_nested table. For this to work we join by the Store number, so the column name must be consistent between ts_function and walmart_nested.
The resulting nested table will have 90 rows (45 stores × 2 time series objects), effectively a “pivot longer” over the time series objects.
#Create list of time series object functions
ts_function <- list(
ts_obj = function(x) ts(x$Weekly_Sales, frequency=52),
msts_obj = function(x) msts(x$Weekly_Sales, seasonal.periods = c(4,12,26,52))
)
#As there are two objects, replicate each unique Store number twice, i.e. one for each time series object
ts_function<- ts_function %>%
rep(length(unique(walmart$Store))) %>%
enframe("ts_func_name", "ts_func") %>%
mutate(Store = sort(rep(unique(walmart$Store), length(unique(.$ts_func_name)))))
ts_function
## # A tibble: 90 × 3
## ts_func_name ts_func Store
## <chr> <list> <fct>
## 1 ts_obj <fn> 1
## 2 msts_obj <fn> 1
## 3 ts_obj <fn> 2
## 4 msts_obj <fn> 2
## 5 ts_obj <fn> 3
## 6 msts_obj <fn> 3
## 7 ts_obj <fn> 4
## 8 msts_obj <fn> 4
## 9 ts_obj <fn> 5
## 10 msts_obj <fn> 5
## # ℹ 80 more rows
#left join (as our list has more rows than the walmart_nested df)
walmart_nested<- left_join(walmart_nested, ts_function)
head(walmart_nested)
## # A tibble: 6 × 8
## # Groups: Store [3]
## Store data ts_decompose msts_decompose train test ts_func_name
## <fct> <list> <list> <list> <list> <list> <chr>
## 1 1 <tibble> <gg> <gg> <tibble> <tibble> ts_obj
## 2 1 <tibble> <gg> <gg> <tibble> <tibble> msts_obj
## 3 2 <tibble> <gg> <gg> <tibble> <tibble> ts_obj
## 4 2 <tibble> <gg> <gg> <tibble> <tibble> msts_obj
## 5 3 <tibble> <gg> <gg> <tibble> <tibble> ts_obj
## 6 3 <tibble> <gg> <gg> <tibble> <tibble> msts_obj
## # ℹ 1 more variable: ts_func <list>
dim(walmart_nested)
## [1] 90 8
As expected, we now have 90 rows and 8 columns in walmart_nested.
As our data has multiple seasonalities, we will be choosing the following models:
Triple exponential smoothing (Holt-Winters): this method is able to handle data with both trend and seasonal components.
TBATS (Trigonometric seasonality, Box-Cox transform, ARMA errors, Trend and Seasonal components): an extension of exponential smoothing methods. This method can handle multiple seasonalities at various frequencies using pairs of sine and cosine functions (also called Fourier terms). The Fourier terms are controlled by the K parameter: the higher K is, the better the model can estimate complex seasonalities. However, we must still strike a balance so that the model does not overfit and can generalize to unseen data points. One advantage of TBATS is that it can handle trends and irregular data frequencies without us having to explicitly specify the seasonality range. On data with multiple seasonalities, TBATS is often the best performer.
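As a hedged illustration of what these Fourier terms look like (not part of the modelling pipeline; TBATS constructs its terms internally), the forecast package's fourier() function generates the sine/cosine pairs for a chosen K per seasonal period:
# Fourier terms for Store 1's weekly sales with monthly and annual periods;
# K sets the number of sine/cosine pairs per period (higher K = more complex seasonality)
store1_msts <- msts(walmart %>% filter(Store == "1") %>% pull(Weekly_Sales),
                    seasonal.periods = c(4, 52))
head(fourier(store1_msts, K = c(2, 6)))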
STL (Seasonal and Trend decomposition using Loess): STL is a decomposition method that splits a time series into seasonal, trend and residual components. Given that our data has some outliers, STL is suitable because we can specify a robust decomposition so that outliers mostly affect the residual component. However, STL is mainly suited to data with additive seasonality.
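A minimal sketch of a robust STL decomposition for a single store (under the single annual-seasonality assumption):
# robust = TRUE pushes outliers (e.g. holiday spikes) into the remainder component
# instead of distorting the trend and seasonal components
store1_ts <- ts(walmart %>% filter(Store == "1") %>% pull(Weekly_Sales), frequency = 52)
plot(stl(store1_ts, s.window = "periodic", robust = TRUE))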
We will fit each of the models above to each Walmart store's training data, under both the single- and multiple-seasonality assumptions discussed earlier.
The approach for processing this is exactly the same as for creating the time series objects above.
#The input to the model functions should be the converted ts objects
model_function <- list(
stlm = function(x) stlm(x),
tbats = function(x) tbats(x),
HoltWinters = function(x) HoltWinters(x)
)
model_function<- model_function %>%
rep(length(unique(walmart$Store))) %>%
enframe("model_name", "model_func") %>%
mutate(Store = sort(rep(unique(walmart$Store), length(unique(.$model_name)))))
model_function
## # A tibble: 135 × 3
## model_name model_func Store
## <chr> <list> <fct>
## 1 stlm <fn> 1
## 2 tbats <fn> 1
## 3 HoltWinters <fn> 1
## 4 stlm <fn> 2
## 5 tbats <fn> 2
## 6 HoltWinters <fn> 2
## 7 stlm <fn> 3
## 8 tbats <fn> 3
## 9 HoltWinters <fn> 3
## 10 stlm <fn> 4
## # ℹ 125 more rows
Similarly, as we have 3 models, we need to use left_join to merge this table with the walmart_nested table. We expect the resulting table to have 90 × 3 = 270 rows.
walmart_nested<- left_join(walmart_nested, model_function)
head(walmart_nested)
## # A tibble: 6 × 10
## # Groups: Store [1]
## Store data ts_decompose msts_decompose train test ts_func_name
## <fct> <list> <list> <list> <list> <list> <chr>
## 1 1 <tibble> <gg> <gg> <tibble> <tibble> ts_obj
## 2 1 <tibble> <gg> <gg> <tibble> <tibble> ts_obj
## 3 1 <tibble> <gg> <gg> <tibble> <tibble> ts_obj
## 4 1 <tibble> <gg> <gg> <tibble> <tibble> msts_obj
## 5 1 <tibble> <gg> <gg> <tibble> <tibble> msts_obj
## 6 1 <tibble> <gg> <gg> <tibble> <tibble> msts_obj
## # ℹ 3 more variables: ts_func <list>, model_name <chr>, model_func <list>
dim(walmart_nested)
## [1] 270 10
To start fitting the models, we invoke nested fitting:
# Create the params column, which consists of the train data in list form
walmart_nested <- walmart_nested %>%
mutate(params = map(train, ~list(x = .x)))
# Apply ts_func to data and overwrite the data column. The data contained in this column are now time series objects that our models can work with. We then reassign these time series objects to the params column
walmart_nested <- walmart_nested %>%
mutate(
data = invoke_map(ts_func, params),
params = map(data, ~list(x = .x))
)
# Apply model functions to params variable and create the fitted column
walmart_nested <- walmart_nested %>%
mutate(
fitted = invoke_map(model_func, params)
)
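As a side note, invoke_map() has been retired in recent versions of purrr; an equivalent formulation of the fitting step using map2() and base do.call() would be the following sketch:
# Same fitting step without invoke_map(): call each model function
# on its matching params list (a single-element list named x)
walmart_nested %>%
  mutate(fitted = map2(model_func, params, ~ do.call(.x, .y)))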
# We can access the individual model fittings as follows
head(walmart_nested$fitted[[15]])
## $fitted
## Time Series:
## Start = c(2, 1)
## End = c(3, 13)
## Frequency = 52
## xhat level trend season
## 2.000000 431847.1 386854.2 288.2944 44704.535
## 2.019231 425575.2 389035.4 288.2944 36251.475
## 2.038462 428863.3 390728.9 288.2944 37846.133
## 2.057692 394383.0 392129.3 288.2944 1965.394
## 2.076923 435215.1 393220.3 288.2944 41706.520
## 2.096154 403370.1 394039.2 288.2944 9042.606
## 2.115385 391045.1 394720.0 288.2944 -3963.217
## 2.134615 379996.3 395310.4 288.2944 -15602.393
## 2.153846 374166.4 395793.8 288.2944 -21915.713
## 2.173077 383912.1 396192.7 288.2944 -12568.876
## 2.192308 366124.7 396527.3 288.2944 -30690.931
## 2.211538 391396.7 396851.4 288.2944 -5742.913
## 2.230769 366525.1 397271.1 288.2944 -31034.373
## 2.250000 411957.5 397809.3 288.2944 13859.881
## 2.269231 385272.3 398405.4 288.2944 -13421.452
## 2.288462 363690.3 398989.0 288.2944 -35586.992
## 2.307692 368815.4 399536.4 288.2944 -31009.243
## 2.326923 394533.5 399976.5 288.2944 -5731.310
## 2.346154 391972.2 400257.5 288.2944 -8573.681
## 2.365385 403818.5 400451.2 288.2944 3078.995
## 2.384615 386497.6 400627.4 288.2944 -14418.049
## 2.403846 370209.2 400638.4 288.2944 -30717.451
## 2.423077 395867.5 400572.9 288.2944 -4993.672
## 2.442308 374198.5 400656.5 288.2944 -26746.297
## 2.461538 361425.9 400733.6 288.2944 -39595.944
## 2.480769 346252.6 400792.4 288.2944 -54828.118
## 2.500000 415500.9 400833.4 288.2944 14379.184
## 2.519231 351312.2 399555.2 288.2944 -48531.230
## 2.538462 371778.9 408092.3 288.2944 -36601.719
## 2.557692 384714.9 410633.5 288.2944 -26206.860
## 2.576923 384021.6 405714.6 288.2944 -21981.296
## 2.596154 367409.1 403723.1 288.2944 -36602.325
## 2.615385 380962.7 406832.0 288.2944 -26157.632
## 2.634615 371938.9 405606.7 288.2944 -33956.126
## 2.653846 373755.0 403996.3 288.2944 -30529.627
## 2.673077 409643.2 402786.9 288.2944 6567.981
## 2.692308 358817.8 401287.0 288.2944 -42757.502
## 2.711538 364496.7 404261.5 288.2944 -40053.036
## 2.730769 372573.2 413200.2 288.2944 -40915.356
## 2.750000 452069.7 418304.1 288.2944 33477.355
## 2.769231 417759.5 420685.0 288.2944 -3213.877
## 2.788462 400902.1 418136.6 288.2944 -17522.767
## 2.807692 593241.8 417839.3 288.2944 175114.164
## 2.826923 493914.4 407820.6 288.2944 85805.551
## 2.846154 479021.1 402034.5 288.2944 76698.343
## 2.865385 506187.7 399414.2 288.2944 106485.240
## 2.884615 614951.9 400996.6 288.2944 213667.022
## 2.903846 373686.4 383197.5 288.2944 -9799.332
## 2.923077 379677.2 393949.1 288.2944 -14560.204
## 2.942308 387395.1 399488.2 288.2944 -12381.383
## 2.961538 351561.3 394112.6 288.2944 -42839.614
## 2.980769 369835.7 398447.3 288.2944 -28899.873
## 3.000000 437962.1 392969.2 288.2944 44704.535
## 3.019231 426107.4 389567.6 288.2944 36251.475
## 3.038462 441381.9 403247.5 288.2944 37846.133
## 3.057692 415498.4 413244.7 288.2944 1965.394
## 3.076923 456500.5 414505.7 288.2944 41706.520
## 3.096154 427885.9 418555.0 288.2944 9042.606
## 3.115385 420071.5 423746.5 288.2944 -3963.217
## 3.134615 406366.2 421680.3 288.2944 -15602.393
## 3.153846 402481.4 424108.9 288.2944 -21915.713
## 3.173077 413537.7 425818.3 288.2944 -12568.876
## 3.192308 421160.2 451562.8 288.2944 -30690.931
## 3.211538 446291.4 451746.0 288.2944 -5742.913
## 3.230769 418033.1 448779.2 288.2944 -31034.373
##
## $x
## Time Series:
## Start = c(1, 1)
## End = c(3, 13)
## Frequency = 52
## [1] 461622.2 420729.0 421642.2 407204.9 415202.0 384200.7 375328.6 359949.3
## [9] 423294.4 415870.3 354993.3 339976.7 361248.4 399323.9 384357.9 343763.2
## [17] 350089.2 396968.8 355017.1 364076.8 357346.5 381151.7 349214.2 352728.8
## [25] 352864.5 347955.0 402635.8 339597.4 351728.2 362134.1 366474.0 352261.0
## [33] 363064.6 355626.9 358784.1 395107.3 345584.4 348896.0 348591.7 423175.6
## [41] 386635.0 372545.3 565567.8 476420.8 467642.0 498159.4 605990.4 382677.8
## [49] 378241.3 381061.1 350876.7 364866.2 438516.5 430526.2 432782.1 397211.2
## [57] 437084.5 404753.3 392109.5 380683.7 374556.1 384075.3 366250.7 391860.0
## [65] 367405.4 413042.1 386312.7 364603.1 369350.6 394507.8 391638.8 403423.3
## [73] 385520.7 368962.7 395146.2 373454.3 360617.4 345381.3 409981.2 380376.8
## [81] 379716.9 366367.5 375988.7 377347.5 375629.5 365248.9 368477.9 403342.4
## [89] 368282.6 394976.4 389540.6 459443.2 407764.2 398839.0 556925.2 472511.3
## [97] 468772.8 510747.6 551221.2 410553.9 398178.2 367438.6 365818.6 349518.1
## [105] 424960.7 473292.5 475591.1 418925.5 469752.6 445162.0 411775.8 413907.2
## [113] 407488.8 503232.1 420789.7 434822.1 394616.1
##
## $alpha
## alpha
## 0.2838105
##
## $beta
## beta
## 0
##
## $gamma
## gamma
## 0
##
## $coefficients
## a b s1 s2 s3 s4
## 442421.4874 288.2944 13859.8806 -13421.4525 -35586.9916 -31009.2428
## s5 s6 s7 s8 s9 s10
## -5731.3095 -8573.6814 3078.9951 -14418.0486 -30717.4513 -4993.6716
## s11 s12 s13 s14 s15 s16
## -26746.2969 -39595.9437 -54828.1177 14379.1843 -48531.2299 -36601.7187
## s17 s18 s19 s20 s21 s22
## -26206.8603 -21981.2957 -36602.3253 -26157.6315 -33956.1258 -30529.6273
## s23 s24 s25 s26 s27 s28
## 6567.9813 -42757.5024 -40053.0357 -40915.3564 33477.3552 -3213.8767
## s29 s30 s31 s32 s33 s34
## -17522.7665 175114.1638 85805.5514 76698.3431 106485.2401 213667.0224
## s35 s36 s37 s38 s39 s40
## -9799.3317 -14560.2042 -12381.3828 -42839.6138 -28899.8731 44704.5349
## s41 s42 s43 s44 s45 s46
## 36251.4749 37846.1325 1965.3941 41706.5200 9042.6058 -3963.2168
## s47 s48 s49 s50 s51 s52
## -15602.3928 -21915.7126 -12568.8757 -30690.9306 -5742.9129 -31034.3727
To make the forecast, we use the map function again to apply the forecast function to the models in the fitted column. The forecast horizon is 26 weeks (6 months forward).
To calculate MAE, we take the test dataset for each store, extract the Weekly_Sales column and compare it with the mean column of the forecast object generated above. The result is stored in the MAE_test column and is expressed in the same unit as the variable we are forecasting; in this case, the MAE is the average absolute error in weekly sales for each Walmart store, in US dollars.
Lastly, we remove the columns that are no longer necessary for the model evaluation and improvement stage.
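As a quick illustration of the metric (a toy example with made-up numbers, not store data):
# MAE is the mean of the absolute differences between actual and predicted values
actual    <- c(100, 120, 90)
predicted <- c(110, 115, 100)
mae_vec(truth = actual, estimate = predicted)  # (10 + 5 + 10) / 3 = 8.33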
walmart_nested<- walmart_nested %>%
mutate(MAE_test = map(fitted, ~forecast(.x, h = 26)) %>%
map2_dbl(test, ~mae_vec(truth = .y$Weekly_Sales, estimate = .x$mean))) %>%
select(-c("ts_func", "model_func", "params", "data")) %>%
arrange(Store, MAE_test)
head(walmart_nested)
## # A tibble: 6 × 9
## # Groups: Store [1]
## Store ts_decompose msts_decompose train test ts_func_name model_name
## <fct> <list> <list> <list> <list> <chr> <chr>
## 1 1 <gg> <gg> <tibble> <tibble> ts_obj stlm
## 2 1 <gg> <gg> <tibble> <tibble> ts_obj HoltWinters
## 3 1 <gg> <gg> <tibble> <tibble> msts_obj HoltWinters
## 4 1 <gg> <gg> <tibble> <tibble> ts_obj tbats
## 5 1 <gg> <gg> <tibble> <tibble> msts_obj stlm
## 6 1 <gg> <gg> <tibble> <tibble> msts_obj tbats
## # ℹ 2 more variables: fitted <list>, MAE_test <dbl>
As mentioned before, MAE is expressed in the same unit as the forecast variable. Below we inspect the average weekly sales over the entire dataset for each of the Walmart stores.
walmart_avgsales <- walmart %>%
group_by(Store) %>%
summarise(avg_sales = mean(Weekly_Sales))
walmart_avgsales
## # A tibble: 45 × 2
## Store avg_sales
## <fct> <dbl>
## 1 1 1555264.
## 2 2 1925751.
## 3 3 402704.
## 4 4 2094713.
## 5 5 318012.
## 6 6 1564728.
## 7 7 570617.
## 8 8 908750.
## 9 9 543981.
## 10 10 1899425.
## # ℹ 35 more rows
For example, for Store 1 the average weekly sales are USD 1,555,264.4. We compare this against the MAE of the best performing model, the stlm model assuming single seasonality, which is USD 38,483.7. The MAE expressed as a percentage of Store 1’s average sales is therefore 2.5%.
print(paste("The MAE expressed in percentage for Store 1 is:", round(38483.7/1555264.4,3)*100, "%"))
## [1] "The MAE expressed in percentage for Store 1 is: 2.5 %"
We would like to apply the Ljung-Box test to our fitted models to check whether any autocorrelation is present. There are two residual types we can test, ‘response’ and ‘innovation’: the former is used to check for autocorrelation in the response variable (Weekly_Sales in our case), and the latter checks for autocorrelation in the residuals of our model.
The hypothesis statement is as follows:
H0: no autocorrelation in the response / innovation (p-value > 0.05)
H1: autocorrelation in the response / innovation (p-value < 0.05)
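For a single fitted model, the test can also be run directly, as in the sketch below; note that Box.test() computes the Box-Pierce statistic by default, so we request the Ljung-Box variant explicitly here.
# Ljung-Box test on the innovation residuals of one fitted model (illustrative only)
res <- residuals(walmart_nested$fitted[[1]], type = "innovation")
Box.test(res, type = "Ljung-Box")$p.value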
#Create the function for box test
box_test_response <- function(df) {
test <- Box.test(residuals(df, type = "response"))
p_value <- test$p.value
return(p_value)
}
box_test_residuals <- function(df) {
test <- Box.test(residuals(df, type = "innovation"))
p_value <- test$p.value
return(p_value)
}
# Map the function to the fitted column in the nested table
walmart_nested <- walmart_nested %>%
mutate(
box_test_response = map(fitted, ~box_test_response(.)),
box_test_innovation = map(fitted, ~box_test_residuals(.))
)
#Access one of the results
walmart_nested$box_test_innovation[[1]]
## [1] 0.8574428
To make interpretation easier, we visualize the p-values of the Ljung-Box test.
# unnest the nested table
walmart_nested_unnested <- walmart_nested %>%
group_by(Store) %>%
filter(MAE_test == min(MAE_test)) %>%
distinct(MAE_test, .keep_all = TRUE) %>%
unnest(c(box_test_innovation, box_test_response)) %>%
select(Store, MAE_test, box_test_innovation, box_test_response)
#create threshold for null and alternative hypothesis
walmart_nested_unnested <- walmart_nested_unnested %>%
mutate(BelowThreshold = box_test_response < 0.05)
# Response variable plot
ggplot(walmart_nested_unnested, aes(x = Store, y = box_test_response, color = BelowThreshold)) +
geom_point() +
scale_color_manual(values = c("TRUE" = "red", "FALSE" = "blue")) +
labs(x = "Store", y = "P-Value", color = "Below 0.05",
title = "Ljung-Box Test P-Values by Store for Response Variable") +
theme_minimal()
# Residuals plot
ggplot(walmart_nested_unnested, aes(x = Store, y = box_test_innovation, color = BelowThreshold)) +
geom_point() +
scale_color_manual(values = c("TRUE" = "red", "FALSE" = "blue")) +
labs(x = "Store", y = "P-Value", color = "Below 0.05",
title = "Ljung-Box Test P-Values by Store for Residuals") +
theme_minimal()
43 out of the 45 store forecasts meet our assumption of no autocorrelation in the residuals and the response variable. The 2 stores that violate the assumption are Stores 4 and 37. This means there are systematic patterns or dependencies in the residuals that are not captured by the models we applied. We would ideally need to go back and apply a different model, perform transformations (e.g. square root), or eliminate outliers for these two stores; a sketch of the transformation idea follows the table below.
walmart_nested_unnested %>%
filter(BelowThreshold == TRUE)
## # A tibble: 2 × 5
## # Groups: Store [2]
## Store MAE_test box_test_innovation box_test_response BelowThreshold
## <fct> <dbl> <dbl> <dbl> <lgl>
## 1 4 39333. 0.0335 0.0335 TRUE
## 2 37 10924. 0.0442 0.0442 TRUE
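As a follow-up, here is a hedged sketch of the transformation idea for one of the offending stores: take the square root of Store 4’s training sales, refit an STLM model, and back-transform the forecasts. This is illustrative only, not a full re-modelling exercise.
# Square-root transform of Store 4's training window, then refit an STLM model
store4_train <- walmart %>% filter(Store == "4") %>% head(117)
store4_ts    <- ts(sqrt(store4_train$Weekly_Sales), frequency = 52)
store4_fc    <- forecast(stlm(store4_ts), h = 26)
# back-transform the point forecasts to the original sales scale
store4_sales_fc <- store4_fc$mean^2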
We assess the normality of the residuals visually by plotting histograms of the residuals. We know from above that the no-autocorrelation assumption is violated by the forecasts for Stores 4 and 37. Looking at the plots below, we can see that their residuals are not normally distributed, i.e. the bin around the central mean of 0 is not the tallest in the histogram. Some of the other stores’ histograms are approximately normally distributed, but many are leptokurtic and skewed. A numerical check with the Shapiro-Wilk test is sketched after the histograms.
normality_residuals <- function(df) {
hist(residuals(df), breaks = 30,
col = "blue", border = "red", xlab = "Normal distribution")
}
# Map the function to the fitted column in the nested table
walmart_nested %>%
mutate(
histograms = map(fitted, ~normality_residuals(.))
)
## # A tibble: 270 × 12
## # Groups: Store [45]
## Store ts_decompose msts_decompose train test ts_func_name model_name
## <fct> <list> <list> <list> <list> <chr> <chr>
## 1 1 <gg> <gg> <tibble> <tibble> ts_obj stlm
## 2 1 <gg> <gg> <tibble> <tibble> ts_obj HoltWinters
## 3 1 <gg> <gg> <tibble> <tibble> msts_obj HoltWinters
## 4 1 <gg> <gg> <tibble> <tibble> ts_obj tbats
## 5 1 <gg> <gg> <tibble> <tibble> msts_obj stlm
## 6 1 <gg> <gg> <tibble> <tibble> msts_obj tbats
## 7 2 <gg> <gg> <tibble> <tibble> ts_obj stlm
## 8 2 <gg> <gg> <tibble> <tibble> ts_obj tbats
## 9 2 <gg> <gg> <tibble> <tibble> ts_obj HoltWinters
## 10 2 <gg> <gg> <tibble> <tibble> msts_obj HoltWinters
## # ℹ 260 more rows
## # ℹ 5 more variables: fitted <list>, MAE_test <dbl>, box_test_response <list>,
## # box_test_innovation <list>, histograms <list>
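As a numerical complement to the histograms above, a Shapiro-Wilk test can be applied to a model’s residuals (a sketch on a single fitted model):
# Shapiro-Wilk normality test: a p-value below 0.05 suggests the residuals
# deviate from a normal distribution
shapiro.test(as.numeric(residuals(walmart_nested$fitted[[1]])))$p.value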
The best model for each store’s forecast is given below. Note that if the MAE is identical, two models can tie as the best model for a given store.
best_model <- walmart_nested %>%
group_by(Store) %>%
filter(MAE_test == min(MAE_test)) %>%
ungroup() %>%
left_join(walmart_avgsales, by = "Store") %>%
mutate(MAE_percentage = round((MAE_test / avg_sales) * 100, 2)) %>%
select(-ts_decompose, -msts_decompose, -train, -test)
best_model
## # A tibble: 60 × 9
## Store ts_func_name model_name fitted MAE_test box_test_response
## <fct> <chr> <chr> <list> <dbl> <list>
## 1 1 ts_obj stlm <stlm> 38484. <dbl [1]>
## 2 2 ts_obj stlm <stlm> 53361. <dbl [1]>
## 3 3 ts_obj HoltWinters <HltWntrs> 11551. <dbl [1]>
## 4 3 msts_obj HoltWinters <HltWntrs> 11551. <dbl [1]>
## 5 4 ts_obj stlm <stlm> 39333. <dbl [1]>
## 6 5 ts_obj stlm <stlm> 12464. <dbl [1]>
## 7 6 ts_obj stlm <stlm> 44996. <dbl [1]>
## 8 7 ts_obj tbats <tbats> 24600. <dbl [1]>
## 9 8 ts_obj HoltWinters <HltWntrs> 27576. <dbl [1]>
## 10 8 msts_obj HoltWinters <HltWntrs> 27576. <dbl [1]>
## # ℹ 50 more rows
## # ℹ 3 more variables: box_test_innovation <list>, avg_sales <dbl>,
## # MAE_percentage <dbl>
#Proportion of multiple seasonalities vs single seasonality in best model
prop.table(table(best_model$ts_func_name))
##
## msts_obj ts_obj
## 0.2666667 0.7333333
#Proportion of best model
prop.table(table(best_model$model_name))
##
## HoltWinters stlm tbats
## 0.5000000 0.3666667 0.1333333
#Average equally weighted MAE in our dataset (in %)
round(mean(best_model$MAE_test) / mean(best_model$avg_sales)*100,2)
## [1] 3.51
#Visualize
ggplot(data = best_model, aes(x = Store, y = MAE_percentage)) +
geom_point() +
labs(x = "Store", y = "MAE Percentage", title = "MAE(% of avg sales) of each Walmart Store") +
theme_minimal()
#MAE distribution
ggplot(data = best_model, aes(x = MAE_percentage)) +
geom_histogram(binwidth = 0.25, fill = "red", color = "black") +
labs(x = "MAE Percentage", y = "Frequency", title = "Distribution of MAE Percentage") +
theme_minimal()
library(timetk)
validation_forecast <- walmart_nested %>%
select(Store, train, everything(), -test) %>%
mutate(forecast = map(fitted, ~forecast(.x, h = 26)) %>%
map2(train, ~tibble(Date = timetk::tk_make_future_timeseries(.y$Date, 26),
Weekly_Sales = as.vector(.x$mean))))
validation_forecast<- validation_forecast %>%
group_by(Store) %>%
select(Store, actual = train, forecast) %>%
gather(key, value, -Store) %>%
unnest(value)
# Display the validation forecast plot
store_ids <- unique(validation_forecast$Store)
for (store_id in store_ids) {
store_data <- validation_forecast[validation_forecast$Store == store_id, ]
p <- ggplot(store_data, aes(x = Date, y = Weekly_Sales, colour = key)) +
geom_line() +
labs(title = paste("Store", store_id)) +
theme(legend.position = "none", axis.text.x = element_text(angle = 45, hjust = 1))
print(p)
}
The model that performs best for most of our store forecasts is the Holt-Winters triple exponential smoothing model; in the Holt-Winters fit we inspected earlier (Store 3), both the beta and gamma parameters were estimated as 0. As we have seen from the decomposition plots, the annual seasonality looks strongest, and evidently most of the best models are those assuming a single seasonality.
The following needs to be applied in future projects: