The dataset contains the weekly sales of Walmart across its 45 stores in the US, with 143 observations per store, totaling 143 × 45 = 6,435 data points. As the sales are recorded at regularly spaced intervals, this is time-series data. We will attempt to forecast Walmart’s weekly sales half a year into the future for each of the 45 stores.
library(tidyverse)
library(dplyr)
library(lubridate)
library(padr)
library(zoo)
library(forecast)
library(TTR)
library(MLmetrics)
library(tidymodels)
library(TSstudio)
library(ggplot2)
library(magrittr)
library(tidyquant)
library(DescTools)
library(recipes)
walmart <- read_csv("walmart-sales-dataset-of-45stores.csv")
str(walmart)
## spc_tbl_ [6,435 × 8] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Store : num [1:6435] 1 1 1 1 1 1 1 1 1 1 ...
## $ Date : chr [1:6435] "05-02-2010" "12-02-2010" "19-02-2010" "26-02-2010" ...
## $ Weekly_Sales: num [1:6435] 1643691 1641957 1611968 1409728 1554807 ...
## $ Holiday_Flag: num [1:6435] 0 1 0 0 0 0 0 0 0 0 ...
## $ Temperature : num [1:6435] 42.3 38.5 39.9 46.6 46.5 ...
## $ Fuel_Price : num [1:6435] 2.57 2.55 2.51 2.56 2.62 ...
## $ CPI : num [1:6435] 211 211 211 211 211 ...
## $ Unemployment: num [1:6435] 8.11 8.11 8.11 8.11 8.11 ...
## - attr(*, "spec")=
## .. cols(
## .. Store = col_double(),
## .. Date = col_character(),
## .. Weekly_Sales = col_double(),
## .. Holiday_Flag = col_double(),
## .. Temperature = col_double(),
## .. Fuel_Price = col_double(),
## .. CPI = col_double(),
## .. Unemployment = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
We convert the columns of interest to the appropriate types: Date is parsed from character with dmy(), and Store is converted to a factor.
walmart <- walmart %>%
mutate(Date = dmy(Date), Store = as.factor(Store)) %>%
arrange(Date)
head(walmart)
## # A tibble: 6 × 8
## Store Date Weekly_Sales Holiday_Flag Temperature Fuel_Price CPI
## <fct> <date> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 2010-02-05 1643691. 0 42.3 2.57 211.
## 2 2 2010-02-05 2136989. 0 40.2 2.57 211.
## 3 3 2010-02-05 461622. 0 45.7 2.57 214.
## 4 4 2010-02-05 2135144. 0 43.8 2.60 126.
## 5 5 2010-02-05 317173. 0 39.7 2.57 212.
## 6 6 2010-02-05 1652635. 0 40.4 2.57 213.
## # ℹ 1 more variable: Unemployment <dbl>
#Check the min and max date in our data
range(walmart$Date)
## [1] "2010-02-05" "2012-10-26"
#Create sequence of weekly dates spanning the range of the dataset
min_date <- min(walmart$Date)
max_date <- max(walmart$Date)
complete_time <- seq(from = min_date, to = max_date, by = "1 week")
df <- tibble(time = complete_time)
dim(df)[1]
## [1] 143
#Check if there are any time skips
walmart %>%
group_by(Store) %>%
summarise(freq = n()) %>%
mutate(is_same = freq == dim(df)[1])
## # A tibble: 45 × 3
## Store freq is_same
## <fct> <int> <lgl>
## 1 1 143 TRUE
## 2 2 143 TRUE
## 3 3 143 TRUE
## 4 4 143 TRUE
## 5 5 143 TRUE
## 6 6 143 TRUE
## 7 7 143 TRUE
## 8 8 143 TRUE
## 9 9 143 TRUE
## 10 10 143 TRUE
## # ℹ 35 more rows
#Check for any missing values
any(is.na(walmart))
## [1] FALSE
colSums(is.na(walmart))
## Store Date Weekly_Sales Holiday_Flag Temperature Fuel_Price
## 0 0 0 0 0 0
## CPI Unemployment
## 0 0
As we would like to forecast for each of the Walmart stores, we use group_by on the stores and nest the columns of interest, i.e. the Date and Weekly_Sales columns.
walmart_nested <- walmart %>%
select(c(Store, Date, Weekly_Sales)) %>%
group_by(Store) %>%
nest()
walmart_nested
## # A tibble: 45 × 2
## # Groups: Store [45]
## Store data
## <fct> <list>
## 1 1 <tibble [143 × 2]>
## 2 2 <tibble [143 × 2]>
## 3 3 <tibble [143 × 2]>
## 4 4 <tibble [143 × 2]>
## 5 5 <tibble [143 × 2]>
## 6 6 <tibble [143 × 2]>
## 7 7 <tibble [143 × 2]>
## 8 8 <tibble [143 × 2]>
## 9 9 <tibble [143 × 2]>
## 10 10 <tibble [143 × 2]>
## # ℹ 35 more rows
We decompose the series on two levels:
- ts object for single seasonality
- msts object for multiple seasonality

As the data is weekly, we want to see whether there are any strong seasonalities at the monthly (4 weeks), quarterly (12 weeks), half-yearly (26 weeks) and annual (52 weeks) frequencies.
walmart_ts_decompose <- function(df){
ts(df$Weekly_Sales, frequency=52) %>% decompose() %>% autoplot()
}
walmart_msts_decompose <- function(df){
msts(df$Weekly_Sales, seasonal.periods = c(4, 12, 26, 52)) %>% mstl() %>% autoplot()
}
walmart_nested <- walmart_nested %>%
mutate(ts_decompose = map(data, walmart_ts_decompose))
print(walmart_nested$ts_decompose[[1]])
walmart_nested <- walmart_nested %>%
mutate(msts_decompose = map(data, walmart_msts_decompose))
print(walmart_nested$msts_decompose[[1]])
Note that the Walmart stores are all different, and each has its own trend and seasonality pattern.
#Let's see a few random examples
print(walmart_nested$msts_decompose[[1]])
print(walmart_nested$msts_decompose[[10]])
print(walmart_nested$msts_decompose[[37]])
print(walmart_nested$msts_decompose[[45]])
Insights:
- The stores have different trends: some are trending up, some down, and some fluctuate.
- The same applies to seasonality. The most marked variability is in the quarterly seasonality, which can look very different from store to store.
- Monthly seasonality is multiplicative, which makes sense from a business perspective, i.e. payday means more consumption.
- The strongest seasonality is the annual one, which means our sales forecast could probably be done under a single-seasonality assumption.
We have 143 data points for each of the 45 stores. As we will forecast 6 months into the future, we take 26 weeks as the size of our test data, with the remaining 117 weeks as the training data. For simplicity we keep the train and test splits as list-columns in the nested table.
#define the train function
train_split <- function(df, train_size = 117) {
train_data <- head(df, train_size)
return(train_data)
}
#apply the train function
walmart_nested <- walmart_nested %>%
mutate(train = map(data, train_split))
#define the test function
test_split <- function(df, test_size = 26) {
test_data <- tail(df, test_size)
return(test_data)
}
#apply the test function
walmart_nested<-walmart_nested %>%
mutate(test = map(data, test_split))
#view the nested tables
walmart_nested
## # A tibble: 45 × 6
## # Groups: Store [45]
## Store data ts_decompose msts_decompose train test
## <fct> <list> <list> <list> <list> <list>
## 1 1 <tibble [143 × 2]> <gg> <gg> <tibble> <tibble>
## 2 2 <tibble [143 × 2]> <gg> <gg> <tibble> <tibble>
## 3 3 <tibble [143 × 2]> <gg> <gg> <tibble> <tibble>
## 4 4 <tibble [143 × 2]> <gg> <gg> <tibble> <tibble>
## 5 5 <tibble [143 × 2]> <gg> <gg> <tibble> <tibble>
## 6 6 <tibble [143 × 2]> <gg> <gg> <tibble> <tibble>
## 7 7 <tibble [143 × 2]> <gg> <gg> <tibble> <tibble>
## 8 8 <tibble [143 × 2]> <gg> <gg> <tibble> <tibble>
## 9 9 <tibble [143 × 2]> <gg> <gg> <tibble> <tibble>
## 10 10 <tibble [143 × 2]> <gg> <gg> <tibble> <tibble>
## # ℹ 35 more rows
The proportions of train and test data are therefore roughly 82% and 18% respectively, as shown below.
#Choose a random store (store #7) and glimpse its train and test splits
glimpse(walmart_nested$train[[7]])
## Rows: 117
## Columns: 2
## $ Date <date> 2010-02-05, 2010-02-12, 2010-02-19, 2010-02-26, 2010-03-…
## $ Weekly_Sales <dbl> 496725.4, 524104.9, 506760.5, 496083.2, 491419.5, 480452.…
glimpse(walmart_nested$test[[7]])
## Rows: 26
## Columns: 2
## $ Date <date> 2012-05-04, 2012-05-11, 2012-05-18, 2012-05-25, 2012-06-…
## $ Weekly_Sales <dbl> 465198.9, 460397.4, 468428.3, 532739.8, 598495.0, 642963.…
#training proportion
print(paste("The train proportion is:", round(117 / (117 + 26) * 100, 1), "%"))
## [1] "The train proportion is: 81.8 %"
#test proportion
print(paste("The test proportion is:", round(26 / (117 + 26) * 100, 1), "%"))
## [1] "The test proportion is: 18.2 %"
We will create both the ts and msts objects for each of the Walmart stores. As we have two functions, we wrap them into a list and map them to each store. We also need to replicate each unique store number twice, once for each time series object.
Afterwards, we use left_join to combine the ts_function list with the original walmart_nested table. For this to work we join by the Store number, so the column name must be consistent between ts_function and walmart_nested.
The resulting nested table will have 90 rows (45 stores × 2 time series objects), effectively a “pivot longer” over the time series objects.
#Create list of time series object functions
ts_function <- list(
ts_obj = function(x) ts(x$Weekly_Sales, frequency=52),
msts_obj = function(x) msts(x$Weekly_Sales, seasonal.periods = c(4,12,26,52))
)
#As there are two objects, replicate each unique Store number twice, i.e. one for each time series object
ts_function<- ts_function %>%
rep(length(unique(walmart$Store))) %>%
enframe("ts_func_name", "ts_func") %>%
mutate(Store = sort(rep(unique(walmart$Store), length(unique(.$ts_func_name)))))
ts_function
## # A tibble: 90 × 3
## ts_func_name ts_func Store
## <chr> <list> <fct>
## 1 ts_obj <fn> 1
## 2 msts_obj <fn> 1
## 3 ts_obj <fn> 2
## 4 msts_obj <fn> 2
## 5 ts_obj <fn> 3
## 6 msts_obj <fn> 3
## 7 ts_obj <fn> 4
## 8 msts_obj <fn> 4
## 9 ts_obj <fn> 5
## 10 msts_obj <fn> 5
## # ℹ 80 more rows
#left join (as our list has more rows than the walmart_nested df)
walmart_nested<- left_join(walmart_nested, ts_function)
head(walmart_nested)
## # A tibble: 6 × 8
## # Groups: Store [3]
## Store data ts_decompose msts_decompose train test ts_func_name
## <fct> <list> <list> <list> <list> <list> <chr>
## 1 1 <tibble> <gg> <gg> <tibble> <tibble> ts_obj
## 2 1 <tibble> <gg> <gg> <tibble> <tibble> msts_obj
## 3 2 <tibble> <gg> <gg> <tibble> <tibble> ts_obj
## 4 2 <tibble> <gg> <gg> <tibble> <tibble> msts_obj
## 5 3 <tibble> <gg> <gg> <tibble> <tibble> ts_obj
## 6 3 <tibble> <gg> <gg> <tibble> <tibble> msts_obj
## # ℹ 1 more variable: ts_func <list>
dim(walmart_nested)
## [1] 90 8
As expected, we now have 90 rows and 8 columns in walmart_nested.
As our data has multiple seasonalities, we will be choosing the following models:
Triple exponential smoothing (Holt-Winters): this method is able to handle data with both trend and seasonal components.
TBATS (Trigonometric seasonality, Box-Cox transform, ARMA errors, Trend and Seasonal components): an extension of exponential smoothing methods. This method can handle multiple seasonalities at various frequencies using pairs of sine and cosine functions (also called Fourier terms). The Fourier terms are controlled by the K parameter: the higher K is, the better the model can estimate complex seasonalities. However, we must still strike a balance so that the model does not overfit and can generalize to unseen data points. One advantage of TBATS is that it can handle trends and irregular data frequencies without us having to explicitly specify the seasonality range. On data with multiple seasonalities, TBATS is often the best performer.
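As a hedged illustration of what these Fourier terms look like (not part of the modelling pipeline; TBATS constructs its terms internally), the forecast package's fourier() function generates the sine/cosine pairs for a chosen K per seasonal period:
# Fourier terms for Store 1's weekly sales with monthly and annual periods;
# K sets the number of sine/cosine pairs per period (higher K = more complex seasonality)
store1_msts <- msts(walmart %>% filter(Store == "1") %>% pull(Weekly_Sales),
                    seasonal.periods = c(4, 52))
head(fourier(store1_msts, K = c(2, 6)))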
STL (Seasonal and Trend decomposition using Loess): STL is a decomposition method that splits a time series into seasonal, trend and residual components. Given that our data has some outliers, STL is suitable because we can specify a robust decomposition so that outliers mostly affect the residual component. However, STL is mainly suited to data with additive seasonality.
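A minimal sketch of a robust STL decomposition for a single store (under the single annual-seasonality assumption):
# robust = TRUE pushes outliers (e.g. holiday spikes) into the remainder component
# instead of distorting the trend and seasonal components
store1_ts <- ts(walmart %>% filter(Store == "1") %>% pull(Weekly_Sales), frequency = 52)
plot(stl(store1_ts, s.window = "periodic", robust = TRUE))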
We will fit each of the models above to each Walmart store's training data, under both the single- and multiple-seasonality assumptions discussed earlier.
The approach for processing this is exactly the same as for creating the time series objects above.
#The input to the model functions should be the converted ts objects
model_function <- list(
stlm = function(x) stlm(x),
tbats = function(x) tbats(x),
HoltWinters = function(x) HoltWinters(x)
)
model_function<- model_function %>%
rep(length(unique(walmart$Store))) %>%
enframe("model_name", "model_func") %>%
mutate(Store = sort(rep(unique(walmart$Store), length(unique(.$model_name)))))
model_function
## # A tibble: 135 × 3
## model_name model_func Store
## <chr> <list> <fct>
## 1 stlm <fn> 1
## 2 tbats <fn> 1
## 3 HoltWinters <fn> 1
## 4 stlm <fn> 2
## 5 tbats <fn> 2
## 6 HoltWinters <fn> 2
## 7 stlm <fn> 3
## 8 tbats <fn> 3
## 9 HoltWinters <fn> 3
## 10 stlm <fn> 4
## # ℹ 125 more rows
Similarly, as we have 3 models, we need to use left_join to merge this table with the walmart_nested table. We expect the resulting table to have 90 × 3 = 270 rows.
walmart_nested<- left_join(walmart_nested, model_function)
head(walmart_nested)
## # A tibble: 6 × 10
## # Groups: Store [1]
## Store data ts_decompose msts_decompose train test ts_func_name
## <fct> <list> <list> <list> <list> <list> <chr>
## 1 1 <tibble> <gg> <gg> <tibble> <tibble> ts_obj
## 2 1 <tibble> <gg> <gg> <tibble> <tibble> ts_obj
## 3 1 <tibble> <gg> <gg> <tibble> <tibble> ts_obj
## 4 1 <tibble> <gg> <gg> <tibble> <tibble> msts_obj
## 5 1 <tibble> <gg> <gg> <tibble> <tibble> msts_obj
## 6 1 <tibble> <gg> <gg> <tibble> <tibble> msts_obj
## # ℹ 3 more variables: ts_func <list>, model_name <chr>, model_func <list>
dim(walmart_nested)
## [1] 270 10
To start fitting the models, we invoke nested fitting:
# Create the params column, which consists of the train data in list form
walmart_nested <- walmart_nested %>%
mutate(params = map(train, ~list(x = .x)))
# Apply ts_func to data and overwrite the data column. The data contained in this column are now time series objects that our models can work with. We then reassign these time series objects to the params column
walmart_nested <- walmart_nested %>%
mutate(
data = invoke_map(ts_func, params),
params = map(data, ~list(x = .x))
)
# Apply model functions to params variable and create the fitted column
walmart_nested <- walmart_nested %>%
mutate(
fitted = invoke_map(model_func, params)
)
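As a side note, invoke_map() has been retired in recent versions of purrr; an equivalent formulation of the fitting step using map2() and base do.call() would be the following sketch:
# Same fitting step without invoke_map(): call each model function
# on its matching params list (a single-element list named x)
walmart_nested %>%
  mutate(fitted = map2(model_func, params, ~ do.call(.x, .y)))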
# We can access the individual model fittings as follows
head(walmart_nested$fitted[[15]])
## $fitted
## Time Series:
## Start = c(2, 1)
## End = c(3, 13)
## Frequency = 52
## xhat level trend season
## 2.000000 431847.1 386854.2 288.2944 44704.535
## 2.019231 425575.2 389035.4 288.2944 36251.475
## 2.038462 428863.3 390728.9 288.2944 37846.133
## 2.057692 394383.0 392129.3 288.2944 1965.394
## 2.076923 435215.1 393220.3 288.2944 41706.520
## 2.096154 403370.1 394039.2 288.2944 9042.606
## 2.115385 391045.1 394720.0 288.2944 -3963.217
## 2.134615 379996.3 395310.4 288.2944 -15602.393
## 2.153846 374166.4 395793.8 288.2944 -21915.713
## 2.173077 383912.1 396192.7 288.2944 -12568.876
## 2.192308 366124.7 396527.3 288.2944 -30690.931
## 2.211538 391396.7 396851.4 288.2944 -5742.913
## 2.230769 366525.1 397271.1 288.2944 -31034.373
## 2.250000 411957.5 397809.3 288.2944 13859.881
## 2.269231 385272.3 398405.4 288.2944 -13421.452
## 2.288462 363690.3 398989.0 288.2944 -35586.992
## 2.307692 368815.4 399536.4 288.2944 -31009.243
## 2.326923 394533.5 399976.5 288.2944 -5731.310
## 2.346154 391972.2 400257.5 288.2944 -8573.681
## 2.365385 403818.5 400451.2 288.2944 3078.995
## 2.384615 386497.6 400627.4 288.2944 -14418.049
## 2.403846 370209.2 400638.4 288.2944 -30717.451
## 2.423077 395867.5 400572.9 288.2944 -4993.672
## 2.442308 374198.5 400656.5 288.2944 -26746.297
## 2.461538 361425.9 400733.6 288.2944 -39595.944
## 2.480769 346252.6 400792.4 288.2944 -54828.118
## 2.500000 415500.9 400833.4 288.2944 14379.184
## 2.519231 351312.2 399555.2 288.2944 -48531.230
## 2.538462 371778.9 408092.3 288.2944 -36601.719
## 2.557692 384714.9 410633.5 288.2944 -26206.860
## 2.576923 384021.6 405714.6 288.2944 -21981.296
## 2.596154 367409.1 403723.1 288.2944 -36602.325
## 2.615385 380962.7 406832.0 288.2944 -26157.632
## 2.634615 371938.9 405606.7 288.2944 -33956.126
## 2.653846 373755.0 403996.3 288.2944 -30529.627
## 2.673077 409643.2 402786.9 288.2944 6567.981
## 2.692308 358817.8 401287.0 288.2944 -42757.502
## 2.711538 364496.7 404261.5 288.2944 -40053.036
## 2.730769 372573.2 413200.2 288.2944 -40915.356
## 2.750000 452069.7 418304.1 288.2944 33477.355
## 2.769231 417759.5 420685.0 288.2944 -3213.877
## 2.788462 400902.1 418136.6 288.2944 -17522.767
## 2.807692 593241.8 417839.3 288.2944 175114.164
## 2.826923 493914.4 407820.6 288.2944 85805.551
## 2.846154 479021.1 402034.5 288.2944 76698.343
## 2.865385 506187.7 399414.2 288.2944 106485.240
## 2.884615 614951.9 400996.6 288.2944 213667.022
## 2.903846 373686.4 383197.5 288.2944 -9799.332
## 2.923077 379677.2 393949.1 288.2944 -14560.204
## 2.942308 387395.1 399488.2 288.2944 -12381.383
## 2.961538 351561.3 394112.6 288.2944 -42839.614
## 2.980769 369835.7 398447.3 288.2944 -28899.873
## 3.000000 437962.1 392969.2 288.2944 44704.535
## 3.019231 426107.4 389567.6 288.2944 36251.475
## 3.038462 441381.9 403247.5 288.2944 37846.133
## 3.057692 415498.4 413244.7 288.2944 1965.394
## 3.076923 456500.5 414505.7 288.2944 41706.520
## 3.096154 427885.9 418555.0 288.2944 9042.606
## 3.115385 420071.5 423746.5 288.2944 -3963.217
## 3.134615 406366.2 421680.3 288.2944 -15602.393
## 3.153846 402481.4 424108.9 288.2944 -21915.713
## 3.173077 413537.7 425818.3 288.2944 -12568.876
## 3.192308 421160.2 451562.8 288.2944 -30690.931
## 3.211538 446291.4 451746.0 288.2944 -5742.913
## 3.230769 418033.1 448779.2 288.2944 -31034.373
##
## $x
## Time Series:
## Start = c(1, 1)
## End = c(3, 13)
## Frequency = 52
## [1] 461622.2 420729.0 421642.2 407204.9 415202.0 384200.7 375328.6 359949.3
## [9] 423294.4 415870.3 354993.3 339976.7 361248.4 399323.9 384357.9 343763.2
## [17] 350089.2 396968.8 355017.1 364076.8 357346.5 381151.7 349214.2 352728.8
## [25] 352864.5 347955.0 402635.8 339597.4 351728.2 362134.1 366474.0 352261.0
## [33] 363064.6 355626.9 358784.1 395107.3 345584.4 348896.0 348591.7 423175.6
## [41] 386635.0 372545.3 565567.8 476420.8 467642.0 498159.4 605990.4 382677.8
## [49] 378241.3 381061.1 350876.7 364866.2 438516.5 430526.2 432782.1 397211.2
## [57] 437084.5 404753.3 392109.5 380683.7 374556.1 384075.3 366250.7 391860.0
## [65] 367405.4 413042.1 386312.7 364603.1 369350.6 394507.8 391638.8 403423.3
## [73] 385520.7 368962.7 395146.2 373454.3 360617.4 345381.3 409981.2 380376.8
## [81] 379716.9 366367.5 375988.7 377347.5 375629.5 365248.9 368477.9 403342.4
## [89] 368282.6 394976.4 389540.6 459443.2 407764.2 398839.0 556925.2 472511.3
## [97] 468772.8 510747.6 551221.2 410553.9 398178.2 367438.6 365818.6 349518.1
## [105] 424960.7 473292.5 475591.1 418925.5 469752.6 445162.0 411775.8 413907.2
## [113] 407488.8 503232.1 420789.7 434822.1 394616.1
##
## $alpha
## alpha
## 0.2838105
##
## $beta
## beta
## 0
##
## $gamma
## gamma
## 0
##
## $coefficients
## a b s1 s2 s3 s4
## 442421.4874 288.2944 13859.8806 -13421.4525 -35586.9916 -31009.2428
## s5 s6 s7 s8 s9 s10
## -5731.3095 -8573.6814 3078.9951 -14418.0486 -30717.4513 -4993.6716
## s11 s12 s13 s14 s15 s16
## -26746.2969 -39595.9437 -54828.1177 14379.1843 -48531.2299 -36601.7187
## s17 s18 s19 s20 s21 s22
## -26206.8603 -21981.2957 -36602.3253 -26157.6315 -33956.1258 -30529.6273
## s23 s24 s25 s26 s27 s28
## 6567.9813 -42757.5024 -40053.0357 -40915.3564 33477.3552 -3213.8767
## s29 s30 s31 s32 s33 s34
## -17522.7665 175114.1638 85805.5514 76698.3431 106485.2401 213667.0224
## s35 s36 s37 s38 s39 s40
## -9799.3317 -14560.2042 -12381.3828 -42839.6138 -28899.8731 44704.5349
## s41 s42 s43 s44 s45 s46
## 36251.4749 37846.1325 1965.3941 41706.5200 9042.6058 -3963.2168
## s47 s48 s49 s50 s51 s52
## -15602.3928 -21915.7126 -12568.8757 -30690.9306 -5742.9129 -31034.3727
To make the forecast, we use the map function again to apply the forecast function to the models in the fitted column. The forecast horizon is 26 weeks (6 months forward).
To calculate MAE, we take the test dataset for each store, extract the Weekly_Sales column and compare it with the mean column of the forecast object generated above. The result is stored in the MAE_test column and is expressed in the same unit as the variable we are forecasting; in this case, the MAE is the average absolute error in weekly sales for each Walmart store, in US dollars.
Lastly, we remove the columns that are no longer necessary for the model evaluation and improvement stage.
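As a quick illustration of the metric (a toy example with made-up numbers, not store data):
# MAE is the mean of the absolute differences between actual and predicted values
actual    <- c(100, 120, 90)
predicted <- c(110, 115, 100)
mae_vec(truth = actual, estimate = predicted)  # (10 + 5 + 10) / 3 = 8.33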
walmart_nested<- walmart_nested %>%
mutate(MAE_test = map(fitted, ~forecast(.x, h = 26)) %>%
map2_dbl(test, ~mae_vec(truth = .y$Weekly_Sales, estimate = .x$mean))) %>%
select(-c("ts_func", "model_func", "params", "data")) %>%
arrange(Store, MAE_test)
head(walmart_nested)
## # A tibble: 6 × 9
## # Groups: Store [1]
## Store ts_decompose msts_decompose train test ts_func_name model_name
## <fct> <list> <list> <list> <list> <chr> <chr>
## 1 1 <gg> <gg> <tibble> <tibble> ts_obj stlm
## 2 1 <gg> <gg> <tibble> <tibble> ts_obj HoltWinters
## 3 1 <gg> <gg> <tibble> <tibble> msts_obj HoltWinters
## 4 1 <gg> <gg> <tibble> <tibble> ts_obj tbats
## 5 1 <gg> <gg> <tibble> <tibble> msts_obj stlm
## 6 1 <gg> <gg> <tibble> <tibble> msts_obj tbats
## # ℹ 2 more variables: fitted <list>, MAE_test <dbl>
As mentioned before, MAE is expressed in the same unit as the forecast variable. Below we inspect the average weekly sales over the entire dataset for each of the Walmart stores.
walmart_avgsales <- walmart %>%
group_by(Store) %>%
summarise(avg_sales = mean(Weekly_Sales))
walmart_avgsales
## # A tibble: 45 × 2
## Store avg_sales
## <fct> <dbl>
## 1 1 1555264.
## 2 2 1925751.
## 3 3 402704.
## 4 4 2094713.
## 5 5 318012.
## 6 6 1564728.
## 7 7 570617.
## 8 8 908750.
## 9 9 543981.
## 10 10 1899425.
## # ℹ 35 more rows
For example, for Store 1 the average weekly sales are USD 1,555,264.4. We compare this against the MAE of the best performing model, the stlm model assuming single seasonality, which is USD 38,483.7. The MAE expressed as a percentage of Store 1’s average sales is therefore 2.5%.
print(paste("The MAE expressed in percentage for Store 1 is:", round(38483.7/1555264.4,3)*100, "%"))
## [1] "The MAE expressed in percentage for Store 1 is: 2.5 %"
We would like to apply the Ljung-Box test to our fitted models to check whether any autocorrelation is present. There are two residual types we can test, ‘response’ and ‘innovation’: the former is used to check for autocorrelation in the response variable (Weekly_Sales in our case), and the latter checks for autocorrelation in the residuals of our model.
The hypothesis statement is as follows:
H0: no autocorrelation in the response / innovation (p-value > 0.05)
H1: autocorrelation in the response / innovation (p-value < 0.05)
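For a single fitted model, the test can also be run directly, as in the sketch below; note that Box.test() computes the Box-Pierce statistic by default, so we request the Ljung-Box variant explicitly here.
# Ljung-Box test on the innovation residuals of one fitted model (illustrative only)
res <- residuals(walmart_nested$fitted[[1]], type = "innovation")
Box.test(res, type = "Ljung-Box")$p.value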
#Create the function for box test
box_test_response <- function(df) {
test <- Box.test(residuals(df, type = "response"))
p_value <- test$p.value
return(p_value)
}
box_test_residuals <- function(df) {
test <- Box.test(residuals(df, type = "innovation"))
p_value <- test$p.value
return(p_value)
}
# Map the function to the fitted column in the nested table
walmart_nested <- walmart_nested %>%
mutate(
box_test_response = map(fitted, ~box_test_response(.)),
box_test_innovation = map(fitted, ~box_test_residuals(.))
)
#Access one of the results
walmart_nested$box_test_innovation[[1]]
## [1] 0.8574428
To make interpretation easier, we visualize the p-values of the Ljung-Box test.
# unnest the nested table
walmart_nested_unnested <- walmart_nested %>%
group_by(Store) %>%
filter(MAE_test == min(MAE_test)) %>%
distinct(MAE_test, .keep_all = TRUE) %>%
unnest(c(box_test_innovation, box_test_response)) %>%
select(Store, MAE_test, box_test_innovation, box_test_response)
#create threshold for null and alternative hypothesis
walmart_nested_unnested <- walmart_nested_unnested %>%
mutate(BelowThreshold = box_test_response < 0.05)
# Response variable plot
ggplot(walmart_nested_unnested, aes(x = Store, y = box_test_response, color = BelowThreshold)) +
geom_point() +
scale_color_manual(values = c("TRUE" = "red", "FALSE" = "blue")) +
labs(x = "Store", y = "P-Value", color = "Below 0.05",
title = "Ljung-Box Test P-Values by Store for Response Variable") +
theme_minimal()
# Residuals plot
ggplot(walmart_nested_unnested, aes(x = Store, y = box_test_innovation, color = BelowThreshold)) +
geom_point() +
scale_color_manual(values = c("TRUE" = "red", "FALSE" = "blue")) +
labs(x = "Store", y = "P-Value", color = "Below 0.05",
title = "Ljung-Box Test P-Values by Store for Residuals") +
theme_minimal()
43 out of the 45 store forecasts meet our assumption of no autocorrelation in the residuals and the response variable. The 2 stores that violate the assumption are Stores 4 and 37. This means there are systematic patterns or dependencies in the residuals that are not captured by the models we applied. We would ideally need to go back and apply a different model, perform transformations (e.g. square root), or eliminate outliers for these two stores; a sketch of the transformation idea follows the table below.
walmart_nested_unnested %>%
filter(BelowThreshold == TRUE)
## # A tibble: 2 × 5
## # Groups: Store [2]
## Store MAE_test box_test_innovation box_test_response BelowThreshold
## <fct> <dbl> <dbl> <dbl> <lgl>
## 1 4 39333. 0.0335 0.0335 TRUE
## 2 37 10924. 0.0442 0.0442 TRUE
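As a follow-up, here is a hedged sketch of the transformation idea for one of the offending stores: take the square root of Store 4’s training sales, refit an STLM model, and back-transform the forecasts. This is illustrative only, not a full re-modelling exercise.
# Square-root transform of Store 4's training window, then refit an STLM model
store4_train <- walmart %>% filter(Store == "4") %>% head(117)
store4_ts    <- ts(sqrt(store4_train$Weekly_Sales), frequency = 52)
store4_fc    <- forecast(stlm(store4_ts), h = 26)
# back-transform the point forecasts to the original sales scale
store4_sales_fc <- store4_fc$mean^2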
We assess the normality of the residuals visually by plotting histograms of the residuals. We know from above that the no-autocorrelation assumption is violated by the forecasts for Stores 4 and 37. Looking at the plots below, we can see that their residuals are not normally distributed, i.e. the bin around the central mean of 0 is not the tallest in the histogram. Some of the other stores’ histograms are approximately normally distributed, but many are leptokurtic and skewed. A numerical check with the Shapiro-Wilk test is sketched after the histograms.
normality_residuals <- function(df) {
hist(residuals(df), breaks = 30,
col = "blue", border = "red", xlab = "Normal distribution")
}
# Map the function to the fitted column in the nested table
walmart_nested %>%
mutate(
histograms = map(fitted, ~normality_residuals(.))
)
## # A tibble: 270 × 12
## # Groups: Store [45]
## Store ts_decompose msts_decompose train test ts_func_name model_name
## <fct> <list> <list> <list> <list> <chr> <chr>
## 1 1 <gg> <gg> <tibble> <tibble> ts_obj stlm
## 2 1 <gg> <gg> <tibble> <tibble> ts_obj HoltWinters
## 3 1 <gg> <gg> <tibble> <tibble> msts_obj HoltWinters
## 4 1 <gg> <gg> <tibble> <tibble> ts_obj tbats
## 5 1 <gg> <gg> <tibble> <tibble> msts_obj stlm
## 6 1 <gg> <gg> <tibble> <tibble> msts_obj tbats
## 7 2 <gg> <gg> <tibble> <tibble> ts_obj stlm
## 8 2 <gg> <gg> <tibble> <tibble> ts_obj tbats
## 9 2 <gg> <gg> <tibble> <tibble> ts_obj HoltWinters
## 10 2 <gg> <gg> <tibble> <tibble> msts_obj HoltWinters
## # ℹ 260 more rows
## # ℹ 5 more variables: fitted <list>, MAE_test <dbl>, box_test_response <list>,
## # box_test_innovation <list>, histograms <list>
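As a numerical complement to the histograms above, a Shapiro-Wilk test can be applied to a model’s residuals (a sketch on a single fitted model):
# Shapiro-Wilk normality test: a p-value below 0.05 suggests the residuals
# deviate from a normal distribution
shapiro.test(as.numeric(residuals(walmart_nested$fitted[[1]])))$p.value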
The best model for each store’s forecast is given below. Note that if the MAE is identical, two models can tie as the best model for a given store.
best_model <- walmart_nested %>%
group_by(Store) %>%
filter(MAE_test == min(MAE_test)) %>%
ungroup() %>%
left_join(walmart_avgsales, by = "Store") %>%
mutate(MAE_percentage = round((MAE_test / avg_sales) * 100, 2)) %>%
select(-ts_decompose, -msts_decompose, -train, -test)
best_model
## # A tibble: 60 × 9
## Store ts_func_name model_name fitted MAE_test box_test_response
## <fct> <chr> <chr> <list> <dbl> <list>
## 1 1 ts_obj stlm <stlm> 38484. <dbl [1]>
## 2 2 ts_obj stlm <stlm> 53361. <dbl [1]>
## 3 3 ts_obj HoltWinters <HltWntrs> 11551. <dbl [1]>
## 4 3 msts_obj HoltWinters <HltWntrs> 11551. <dbl [1]>
## 5 4 ts_obj stlm <stlm> 39333. <dbl [1]>
## 6 5 ts_obj stlm <stlm> 12464. <dbl [1]>
## 7 6 ts_obj stlm <stlm> 44996. <dbl [1]>
## 8 7 ts_obj tbats <tbats> 24600. <dbl [1]>
## 9 8 ts_obj HoltWinters <HltWntrs> 27576. <dbl [1]>
## 10 8 msts_obj HoltWinters <HltWntrs> 27576. <dbl [1]>
## # ℹ 50 more rows
## # ℹ 3 more variables: box_test_innovation <list>, avg_sales <dbl>,
## # MAE_percentage <dbl>
#Proportion of multiple seasonalities vs single seasonality in best model
prop.table(table(best_model$ts_func_name))
##
## msts_obj ts_obj
## 0.2666667 0.7333333
#Proportion of best model
prop.table(table(best_model$model_name))
##
## HoltWinters stlm tbats
## 0.5000000 0.3666667 0.1333333
#Average equally weighted MAE in our dataset (in %)
round(mean(best_model$MAE_test) / mean(best_model$avg_sales)*100,2)
## [1] 3.51
#Visualize
ggplot(data = best_model, aes(x = Store, y = MAE_percentage)) +
geom_point() +
labs(x = "Store", y = "MAE Percentage", title = "MAE(% of avg sales) of each Walmart Store") +
theme_minimal()
#MAE distribution
ggplot(data = best_model, aes(x = MAE_percentage)) +
geom_histogram(binwidth = 0.25, fill = "red", color = "black") +
labs(x = "MAE Percentage", y = "Frequency", title = "Distribution of MAE Percentage") +
theme_minimal()
library(timetk)
validation_forecast <- walmart_nested %>%
select(Store, train, everything(), -test) %>%
mutate(forecast = map(fitted, ~forecast(.x, h = 26)) %>%
map2(train, ~tibble(Date = timetk::tk_make_future_timeseries(.y$Date, 26),
Weekly_Sales = as.vector(.x$mean))))
validation_forecast<- validation_forecast %>%
group_by(Store) %>%
select(Store, actual = train, forecast) %>%
gather(key, value, -Store) %>%
unnest(value)
# Display the validation forecast plot
store_ids <- unique(validation_forecast$Store)
for (store_id in store_ids) {
store_data <- validation_forecast[validation_forecast$Store == store_id, ]
p <- ggplot(store_data, aes(x = Date, y = Weekly_Sales, colour = key)) +
geom_line() +
labs(title = paste("Store", store_id)) +
theme(legend.position = "none", axis.text.x = element_text(angle = 45, hjust = 1))
print(p)
}
The model that performs best for most of our store forecasts is the Holt-Winters triple exponential smoothing model; in the Holt-Winters fit we inspected earlier (Store 3), both the beta and gamma parameters were estimated as 0. As we have seen from the decomposition plots, the annual seasonality looks strongest, and evidently most of the best models are those assuming a single seasonality.
The following needs to be applied in future projects: