[1] Setup

knitr::opts_chunk$set(echo = TRUE, warning = F, comment = "", fig.width = 10)

library(tidyverse)
library(lubridate)
library(TSstudio)
library(zoo)

[2] Data Cleaning

# load data
load("./product.RData")

head(product)
        week sales price discount leaflet stores
1 2011-01-03  2573  0.72     0.17       1     26
2 2011-01-10  3099  0.56     0.29       1     26
3 2011-01-17  1186  0.73     0.15       0     26
4 2011-01-24   750  0.89     0.01       0     26
5 2011-01-31   922  0.89     0.01       0     26
6 2011-02-07   885  0.89     0.01       0     26
# check missing weeks
week_check <- seq.Date(min(product$week), max(product$week), by = "week")
setdiff(week_check, product$week)
numeric(0)
# check numeber of weeks
product$nweek <- week(product$week) 
table(product$nweek)

 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 
 7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  6 
27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 
 6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6 
53 
 1 
# remove week 53 for seasonality
product <- product %>% filter(nweek < 53)
table(product$nweek)

 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 
 7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  6 
27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 
 6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6 
# remove first 3 weeks (because promotional)
product <- product[-(1:3),]

# adjust anomaly in discount values 
product$discount[1:3] <- 0
# summary
knitr::kable(summary(product))
week sales price discount leaflet stores nweek
Min. :2011-01-24 Min. : 438.0 Min. :0.3600 Min. :0.00000 Min. :0.00000 Min. :19.00 Min. : 1.00
1st Qu.:2012-08-28 1st Qu.: 795.2 1st Qu.:0.9000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:28.00 1st Qu.:13.00
Median :2014-04-10 Median : 966.5 Median :0.9400 Median :0.00000 Median :0.00000 Median :30.00 Median :25.00
Mean :2014-04-08 Mean :1301.6 Mean :0.9175 Mean :0.02904 Mean :0.06587 Mean :30.38 Mean :25.71
3rd Qu.:2015-11-14 3rd Qu.:1387.0 3rd Qu.:0.9800 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:33.00 3rd Qu.:38.75
Max. :2017-06-19 Max. :7123.0 Max. :1.0000 Max. :0.26000 Max. :1.00000 Max. :34.00 Max. :52.00

[3] Time Series Analysis

# original series
series <- ts(product$sales, start = c(2011,4), frequency = 52)
series
Time Series:
Start = c(2011, 4) 
End = c(2017, 25) 
Frequency = 52 
  [1]  750  922  885  711  783  808  833  859  895 1049 1169 1246 1073 1214 1168
 [16] 1439 1285 1515 1686 1458 1378 6057 5845 2069 2316 1700 1269 1327 1420 1677
 [31] 2174 1706 1370 1487 1185 3702 3195  876  773  868  901  894  795  733  876
 [46]  833  738  815 1717 1407 1013  910  824  822  915 1664 1860 1029  896  950
 [61]  816 1125 1198  883  866 1038 1098 1142 1152 1056 1281 1282 1239 2251 2049
 [76] 1917 2172 1787 1802 1790 1731 1406 2022 3447 3408 1384  944  947  910  859
 [91]  836  802  842  888  686  777  818 2256 2600 1022  844  867  859  730  850
[106]  981  930  939  809  438 2238 4136 1004  991  908 1285 1018 1090 3282 3318
[121] 1966 1116 1231 1609 2323 1485 6754 7123 2082 2242 2377 2335 1336 1237 1106
[136] 1388 1164  934 1008  968 3115 2615 1088  993  811  879  814  782 1603 1841
[151]  993  968  951  970  852  860 1935 2020 1737  927  877  730  863  958  698
[166]  891  972  870  982  904 3427 3969 1443 1232 1149 1500 1373 1333 1222 1304
[181] 1416 1425 4895 4892 1465 1133  862  965 1067 1103  828  830  794  872  716
[196]  795 1674 1656  721  675  626  643  727  689  788  677  752  747  647  824
[211]  640  785  639  751  776  843 2506 3102  896 1178  901  812  886  902  822
[226]  797 1375 1463 1105 1088 1430 1363 1636 1611 1085 1285 5457 4421 2129 1247
[241]  905  983  789  849  693  710 1401 1527  895  758  648  643  695  685  682
[256]  740  767  817  618  559  648  689  659  572  647  772  670  694  792 1124
[271]  845  890  861  765  796  659  645  856  736  805  901 1082 1157 1124 1019
[286] 1061 4297 4583 2027 1222 1040  945 1048 1060  855  740  720  701 1097 1220
[301]  845  596  512  586  616  567  624  612  585  720  649  541  601  651  622
[316]  588  912 1228  784  712  787  780  701  914  597  741 2796 2924 1560  997
[331] 1291 1248 1248 1360
ts_plot(series, line.mode = "lines+markers", Ygrid = T, slider = T, Xtitle = "Week",
        Ytitle = "Sales", title = "Product Time Series")
# create sales baseline
product$baseline <- product$sales
product$baseline[product$discount > 0] <- NA
product$baseline <- na.approx(product$baseline)

# create price baseline
product$baseprice <- product$price
product$baseprice[product$discount > 0] <- NA
product$baseprice <- na.approx(product$baseprice)

baseline <- ts(product$baseline, start = c(2011,4), frequency = 52)

ts_plot(baseline, line.mode = "lines+markers", Ygrid = T, slider = T, Xtitle = "Week",
        Ytitle = "Sales", title = "Baseline Time Series")
# baseline decomposition
ts_decompose(baseline)
# baseline seasonality focus
ts_seasonal(baseline, type = "normal")
ts_seasonal(baseline, type = "cycle")
# baseline autocorrelation
ts_cor(baseline, lag.max = 104, seasonal_lags = 52)
# baseline difference
baselineDiff <- diff(baseline)

ts_plot(baselineDiff, line.mode = "lines+markers", Ygrid = T, slider = T, Xtitle = "Week",
        Ytitle = "Sales", title = "Baseline Differencing")
ts_cor(baselineDiff, lag.max = 104, seasonal_lags = 52)

[4] Baseline Forecasting

# external regressors
regressor <- list(
  train = data.frame(
    baseprice = product$baseprice,
    stores = product$stores
  ),
  forecast = data.frame(
    baseprice = rep(0.98, 52),
    stores = rep(34, 52)
  )
)

# models
methods <- list(
  arima1 = list(
    method = "auto.arima",
    notes = "Arima",
    method_arg = list()
  ),
  arima2 = list(
    method = "auto.arima",
    notes = "Arima with lambda",
    method_arg = list(lambda = "auto")
  ),
  tslm1 = list(
    method = "tslm",
    notes = "TSLM",
    method_arg = list(formula = input ~ trend + season)
  ),
  tslm2 = list(
    method = "tslm",
    notes = "TSLM with regressors",
    method_arg = list(formula = input ~ trend + season + baseprice + stores)
  ),
  nnetar = list(
    method = "nnetar",
    notes = "Default nnetar",
    method_arg = list()
  )
)

# training and testing
models <- train_model(
  input = baseline,
  methods = methods,
  train_method = list(
    partitions = 4,
    sample.out = 52,
    space = 52
  ),
  horizon = 52,
  error = "MAPE",
  xreg = regressor
)
Registered S3 method overwritten by 'quantmod':
  method            from
  as.zoo.data.frame zoo 
Registered S3 methods overwritten by 'forecast':
  method             from    
  fitted.fracdiff    fracdiff
  residuals.fracdiff fracdiff
# A tibble: 5 x 7
  model_id model   notes     avg_mape avg_rmse `avg_coverage_8… `avg_coverage_9…
  <chr>    <chr>   <chr>        <dbl>    <dbl>            <dbl>            <dbl>
1 tslm2    tslm    TSLM wit…    0.159     236.            0.812            0.909
2 tslm1    tslm    TSLM         0.159     224.            0.875            0.933
3 arima1   auto.a… Arima        0.168     229.            0.856            0.942
4 arima2   auto.a… Arima wi…    0.172     239.           NA               NA    
5 nnetar   nnetar  Default …    0.201     258.           NA               NA    
plot_model(models)
# forecasting (regular)
plot_forecast(models$forecast$tslm2$forecast)

[5] Promotional Forecasting

# external regressors
regressor_promo <- list(
  train = data.frame(
    price = product$price,
    stores = product$stores,
    discount = product$discount,
    leaflet = as.factor(product$leaflet)
  ),
  forecast = data.frame(
    price = c(rep(0.72,3),rep(0.98,4),rep(0.68,3),rep(0.98,10),rep(0.88,3),rep(0.98,50)),
    stores = rep(34, 73),
    discount = c(rep(0.26,3),rep(0,4),rep(0.30,3),rep(0,10),rep(0.1,3),rep(0,50)),
    leaflet = c(rep("1",3),rep("0",4),rep("1",3),rep("0",10),rep("0",3),rep("0",50))
  )
)

# models
methods <- list(
  tslm1 = list(
    method = "tslm",
    notes = "TSLM with price",
    method_arg = list(formula = input ~ trend + season + price)
  ),
  tslm2 = list(
    method = "tslm",
    notes = "TSLM with stores",
    method_arg = list(formula = input ~ trend + season + stores)
  ),
  tslm3 = list(
    method = "tslm",
    notes = "TSLM with discount",
    method_arg = list(formula = input ~ trend + season + discount)
  ),
  tslm4 = list(
    method = "tslm",
    notes = "TSLM with leaflet",
    method_arg = list(formula = input ~ trend + season + leaflet)
  ),
  tslm5 = list(
    method = "tslm",
    notes = "TSLM with price & stores",
    method_arg = list(formula = input ~ trend + season + price + stores)
  ),
  tslm6 = list(
    method = "tslm",
    notes = "TSLM with price & discount",
    method_arg = list(formula = input ~ trend + season + price + discount)
  ),
  tslm7 = list(
    method = "tslm",
    notes = "TSLM with price & leaflet",
    method_arg = list(formula = input ~ trend + season + price + leaflet)
  ),
  tslm8 = list(
    method = "tslm",
    notes = "TSLM with stores & discount",
    method_arg = list(formula = input ~ trend + season + stores + discount)
  ),
  tslm9 = list(
    method = "tslm",
    notes = "TSLM with stores & leaflet",
    method_arg = list(formula = input ~ trend + season + stores + leaflet)
  ),
  tslm10 = list(
    method = "tslm",
    notes = "TSLM with discount & leaflet",
    method_arg = list(formula = input ~ trend + season + discount + leaflet)
  ),
  tslm11 = list(
    method = "tslm",
    notes = "TSLM with price & stores & discount",
    method_arg = list(formula = input ~ trend + season + price + stores + discount)
  ),
  tslm12 = list(
    method = "tslm",
    notes = "TSLM with price & stores & leaflet",
    method_arg = list(formula = input ~ trend + season + price + stores + leaflet)
  ),
  tslm13 = list(
    method = "tslm",
    notes = "TSLM with price & leaflet & discount",
    method_arg = list(formula = input ~ trend + season + price + discount + leaflet)
  ),
  tslm14 = list(
    method = "tslm",
    notes = "TSLM with leaflet & stores & discount",
    method_arg = list(formula = input ~ trend + season + stores + discount + leaflet)
  ),
  tslm15 = list(
    method = "tslm",
    notes = "TSLM with all regressors",
    method_arg = list(formula = input ~ trend + season + price + stores + discount + leaflet)
  )
)

# training and testing
models <- train_model(
  input = series,
  methods = methods,
  train_method = list(
    partitions = 4,
    sample.out = 52,
    space = 52
  ),
  horizon = 73,
  error = "MAPE",
  xreg = regressor_promo
)
# A tibble: 15 x 7
   model_id model notes      avg_mape avg_rmse `avg_coverage_8… `avg_coverage_9…
   <chr>    <chr> <chr>         <dbl>    <dbl>            <dbl>            <dbl>
 1 tslm13   tslm  TSLM with…    0.256     500.            0.827            0.933
 2 tslm7    tslm  TSLM with…    0.262     511.            0.865            0.942
 3 tslm12   tslm  TSLM with…    0.262     507.            0.851            0.942
 4 tslm10   tslm  TSLM with…    0.275     507.            0.827            0.928
 5 tslm15   tslm  TSLM with…    0.277     501.            0.837            0.938
 6 tslm14   tslm  TSLM with…    0.301     513.            0.822            0.933
 7 tslm6    tslm  TSLM with…    0.307     591.            0.827            0.928
 8 tslm11   tslm  TSLM with…    0.320     592.            0.832            0.923
 9 tslm3    tslm  TSLM with…    0.324     598.            0.837            0.918
10 tslm4    tslm  TSLM with…    0.335     585.            0.846            0.947
11 tslm8    tslm  TSLM with…    0.339     601.            0.837            0.918
12 tslm9    tslm  TSLM with…    0.349     599.            0.822            0.942
13 tslm5    tslm  TSLM with…    0.350     704.            0.861            0.938
14 tslm1    tslm  TSLM with…    0.364     710.            0.856            0.938
15 tslm2    tslm  TSLM with…    0.527     998.            0.856            0.928
plot_model(models)
# best model
models$forecast$tslm13$model$coefficients
 (Intercept)        trend      season2      season3      season4      season5 
 2433.888649    -1.093482  -119.209296  -226.165624  -179.529472   -72.761886 
     season6      season7      season8      season9     season10     season11 
 -175.363786   -57.942078   -86.062556  -224.845254  -289.738224   -14.972566 
    season12     season13     season14     season15     season16     season17 
 -161.431343  -197.412617   -58.493033   140.947579    13.959957    51.012517 
    season18     season19     season20     season21     season22     season23 
  294.744278   109.655646   -28.821772   211.203405   362.500017   453.634421 
    season24     season25     season26     season27     season28     season29 
  495.830578   772.749435  1137.972682   951.760214   758.621490   729.193232 
    season30     season31     season32     season33     season34     season35 
 1078.145352   812.350492   733.521192   470.328690   547.182381   611.490272 
    season36     season37     season38     season39     season40     season41 
  382.224418   151.081751    31.439084   -57.030028  -355.235759  -441.827210 
    season42     season43     season44     season45     season46     season47 
 -278.872619  -356.765376  -325.794141  -156.434245  -263.370849  -170.408823 
    season48     season49     season50     season51     season52        price 
 -104.008010  -307.890846  -407.550307   -57.701562   -17.844332 -1428.645392 
    discount     leaflet1 
 5032.914756  1542.832332 
# forecasting (regular + promotions)
plot_forecast(models$forecast$tslm13$forecast)