About the Store Sales Dataset

The goal is to forecast the total number of products sold by each shop. The data is provided by the Russian software firm 1C. Daily historical sales for stores and products are provided from January 2013 to October 2015, and we need to forecast sales by shop and product for November 2015.

Why did I select this dataset? Store sales help us study human behavior: for instance, they help us understand the demographics of buyers, what they purchase, and how price drives the decision to purchase.

What drives variation in the variable? I expect pricing to be somewhat seasonal as well as dependent on the product's supply chain. Festive discounts and other factors can also change buying behavior. The forecast will be only moderately accurate, since there is a fair amount of variation at the daily level.

To begin the project we will install some libraries for data wrangling and import the dataset.

options(warn=-1)
rm(list = ls())
pkgs <- c("dplyr", "tidyverse", "lubridate", "data.table", "ggplot2","knitr","tseries","forecast","DT","prophet")
lib <- installed.packages()[, "Package"]
# Install only the packages that are not already present
missing_pkgs <- setdiff(pkgs, lib)
if (length(missing_pkgs) > 0) install.packages(missing_pkgs)
setwd("C:\\Users\\nawal\\OneDrive\\Documents\\Course Material\\Time Series")

sales_train<- read.csv("sales_train.csv")
head(sales_train)
##         date date_block_num shop_id item_id item_price item_cnt_day
## 1 02.01.2013              0      59   22154     999.00            1
## 2 03.01.2013              0      25    2552     899.00            1
## 3 05.01.2013              0      25    2552     899.00           -1
## 4 06.01.2013              0      25    2554    1709.05            1
## 5 15.01.2013              0      25    2555    1099.00            1
## 6 10.01.2013              0      25    2564     349.00            1
dim(sales_train)
## [1] 2935849       6
str(sales_train)
## 'data.frame':    2935849 obs. of  6 variables:
##  $ date          : chr  "02.01.2013" "03.01.2013" "05.01.2013" "06.01.2013" ...
##  $ date_block_num: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ shop_id       : int  59 25 25 25 25 25 25 25 25 25 ...
##  $ item_id       : int  22154 2552 2552 2554 2555 2564 2565 2572 2572 2573 ...
##  $ item_price    : num  999 899 899 1709 1099 ...
##  $ item_cnt_day  : num  1 1 -1 1 1 1 1 1 1 3 ...
summary(sales_train)
##      date           date_block_num     shop_id      item_id     
##  Length:2935849     Min.   : 0.00   Min.   : 0   Min.   :    0  
##  Class :character   1st Qu.: 7.00   1st Qu.:22   1st Qu.: 4476  
##  Mode  :character   Median :14.00   Median :31   Median : 9343  
##                     Mean   :14.57   Mean   :33   Mean   :10197  
##                     3rd Qu.:23.00   3rd Qu.:47   3rd Qu.:15684  
##                     Max.   :33.00   Max.   :59   Max.   :22169  
##    item_price        item_cnt_day     
##  Min.   :    -1.0   Min.   : -22.000  
##  1st Qu.:   249.0   1st Qu.:   1.000  
##  Median :   399.0   Median :   1.000  
##  Mean   :   890.9   Mean   :   1.243  
##  3rd Qu.:   999.0   3rd Qu.:   1.000  
##  Max.   :307980.0   Max.   :2169.000

Description of the sales training dataset

shop_id: unique identifier of a shop

item_id: unique identifier of a product

item_price: current price of an item

date: date in the format dd.mm.yyyy

item_cnt_day: number of products sold on that day

date_block_num: a consecutive month number, used for convenience. January 2013 is 0, February 2013 is 1, ..., October 2015 is 33 (see the sanity-check sketch below)
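
As a sanity check (not part of the original pipeline), the month index can be reconstructed from the calendar date; a minimal sketch:

# Sanity check: reconstruct the month index from the calendar date
d <- as.Date(sales_train$date, "%d.%m.%Y")
block_check <- (as.integer(format(d, "%Y")) - 2013) * 12 + (as.integer(format(d, "%m")) - 1)
all(block_check == sales_train$date_block_num)   # expected to be TRUE if the column is consistent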

Along with the sales training dataset we have files with item names, category names, and shop names, which we won't be using, since the names are in Russian and would not be easy for an Indian to interpret :) . We will build our forecast on the shop_id with the highest sales volume. We will do summary statistics and data wrangling in the next step.

options(warn=-1)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(knitr)
# Parse the date string (dd.mm.yyyy) and derive year, month and day columns
sales_train$date1<- as.Date(sales_train$date,"%d.%m.%Y")
sales_train$year<- lubridate::year(sales_train$date1)
sales_train$month<- lubridate::month(sales_train$date1)
sales_train$day<- lubridate::day(sales_train$date1)

sales_train_store <- sales_train%>%group_by(shop_id,date_block_num)%>%
  summarise(package_moved =sum(item_cnt_day))%>%arrange(shop_id,date_block_num)
## `summarise()` has grouped output by 'shop_id'. You can override using the
## `.groups` argument.
sales_train_store <- filter(sales_train_store , date_block_num %in% 22:33)

summary(sales_train_store)
##     shop_id   date_block_num package_moved  
##  Min.   : 2   Min.   :22.0   Min.   :   -1  
##  1st Qu.:18   1st Qu.:24.0   1st Qu.:  966  
##  Median :34   Median :27.0   Median : 1340  
##  Mean   :32   Mean   :27.3   Mean   : 1915  
##  3rd Qu.:47   3rd Qu.:30.0   3rd Qu.: 2084  
##  Max.   :59   Max.   :33.0   Max.   :14610
sales_store_req <- sales_train %>% group_by(shop_id)%>%
  summarise(package_moved = sum(item_cnt_day))%>%arrange(desc(package_moved))

### shop_id 31 has the highest number of packages moved, so we restrict the data to this shop

sales_train_top_store <- filter(sales_train, shop_id == 31)

sales_train_top_store$shop_id<- as.vector(sales_train_top_store$shop_id)

sales_train_top_store_month <- sales_train_top_store%>% group_by(shop_id,date1)%>% 
  summarise(package_moved = sum(item_cnt_day))%>% arrange(date1)
## `summarise()` has grouped output by 'shop_id'. You can override using the
## `.groups` argument.
ggplot2::ggplot(sales_train_top_store_month, ggplot2::aes(x=date1,y=package_moved,color=shop_id))+xlab("Year")+ylab("Package Moved")+ggtitle("Goods bought from Shop ID 31 over time")+
  geom_point()+geom_line()+theme_classic()+theme(panel.grid=element_blank())

options(warn=-1)
boxplot(sales_train_top_store_month$package_moved, main = "Daily packages moved by Shop 31") 

summary(sales_train_top_store_month$package_moved)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   102.0   221.5   275.0   301.4   358.5  1080.0

As the box plot and summary statistics suggest, there are some outliers that we need to either remove or cap at a maximum value.

options(warn=-1)

# Cap daily totals above 700 at 650 (treated as outliers)
sales_train_top_store_month$package_moved <- 
  ifelse(sales_train_top_store_month$package_moved > 700 ,650,sales_train_top_store_month$package_moved)

ggplot2::ggplot(sales_train_top_store_month, ggplot2::aes(x=date1,y=package_moved,color=shop_id))+xlab("Year")+ylab("Package Moved")+ggtitle("Goods bought from Shop ID 31 over time")+
  geom_point()+geom_line()+theme_classic()+theme(panel.grid=element_blank())

The visualization looks better than before, as we have replaced the outliers (daily totals above 700) with a capped value, intended to approximate mean + 3·sd.
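
As an alternative to the hard-coded cutoff, the cap could be computed directly from the uncapped daily totals; a minimal sketch (illustrative only, using a temporary raw data frame rather than the object modified above):

# Sketch: derive the outlier cap as mean + 3*sd of the uncapped daily totals
raw <- sales_train_top_store %>%
  group_by(date1) %>%
  summarise(package_moved = sum(item_cnt_day))
cap_value <- mean(raw$package_moved) + 3 * sd(raw$package_moved)
raw$package_moved_capped <- pmin(raw$package_moved, cap_value)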

sales_train_sale_log = sales_train_top_store_month%>% mutate(log_sales= log1p(package_moved))

ggplot(sales_train_sale_log,aes(x=date1,y=log_sales))+
  geom_line()+theme_classic()+
  ylab("Total Store Sale")+
  xlab("Date")+
  ggtitle("Time Series with log transformation")

We will make the series stationary in the mean by first-differencing it, i.e. subtracting from each value the value that precedes it.

sales_train_sale_log_mean <- sales_train_sale_log%>% arrange(date1)%>%  mutate(lag_diff = log_sales-lag(log_sales))


sales_train_sale_log_mean<- na.omit(sales_train_sale_log_mean)

ggplot(sales_train_sale_log_mean,aes(x=date1,y=lag_diff))+
  geom_line()+theme_classic()+
  ylab("Total Store Sale")+
  xlab("Date")+
  ggtitle("Time Series with log transformation & Mean Transformation")

options(warn=-1)
library(tseries)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
adf.test(sales_train_sale_log_mean$lag_diff)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  sales_train_sale_log_mean$lag_diff
## Dickey-Fuller = -11.686, Lag order = 10, p-value = 0.01
## alternative hypothesis: stationary
options(warn=-1)
kpss.test(sales_train_sale_log_mean$lag_diff)
## 
##  KPSS Test for Level Stationarity
## 
## data:  sales_train_sale_log_mean$lag_diff
## KPSS Level = 0.012204, Truncation lag parameter = 7, p-value = 0.1

The ADF test rejects the null of a unit root (p = 0.01) and the KPSS test fails to reject level stationarity (p = 0.1), so the differenced series looks stationary. Next we will inspect the ACF and PACF to judge the order of the process.

par(mfrow = c(1,2))
acf(sales_train_sale_log_mean$lag_diff,lag.max=20)
pacf(sales_train_sale_log_mean$lag_diff,lag.max=20)

The ACF and PACF suggest a mix of AR and MA behaviour, so we will fit several candidate models and compare them to find the best fit for our data.

options(warn=-1)
BIC(
  arima(sales_train_sale_log_mean$log_sales,order=c(0,1,1)),
  arima(sales_train_sale_log_mean$log_sales,order=c(0,1,2)),
  arima(sales_train_sale_log_mean$log_sales,order=c(0,1,3)),
  arima(sales_train_sale_log_mean$log_sales,order=c(0,1,8)),
  arima(sales_train_sale_log_mean$log_sales,order=c(1,1,1)),
  arima(sales_train_sale_log_mean$log_sales,order=c(2,1,0)),
  arima(sales_train_sale_log_mean$log_sales,order=c(3,1,0)),
  arima(sales_train_sale_log_mean$log_sales,order=c(5,0,0)),
  arima(sales_train_sale_log_mean$log_sales,order=c(5,1,0))
)
##                                                                df        BIC
## arima(sales_train_sale_log_mean$log_sales, order = c(0, 1, 1))  2 186.527485
## arima(sales_train_sale_log_mean$log_sales, order = c(0, 1, 2))  3 -10.219906
## arima(sales_train_sale_log_mean$log_sales, order = c(0, 1, 3))  4  -5.810333
## arima(sales_train_sale_log_mean$log_sales, order = c(0, 1, 8))  9 -82.243017
## arima(sales_train_sale_log_mean$log_sales, order = c(1, 1, 1))  3  34.485524
## arima(sales_train_sale_log_mean$log_sales, order = c(2, 1, 0))  3 195.005310
## arima(sales_train_sale_log_mean$log_sales, order = c(3, 1, 0))  4 166.535651
## arima(sales_train_sale_log_mean$log_sales, order = c(5, 0, 0))  7  74.316992
## arima(sales_train_sale_log_mean$log_sales, order = c(5, 1, 0))  6 -68.867420
AIC(
  arima(sales_train_sale_log_mean$log_sales,order=c(0,1,1)),
  arima(sales_train_sale_log_mean$log_sales,order=c(0,1,2)),
  arima(sales_train_sale_log_mean$log_sales,order=c(0,1,3)),
  arima(sales_train_sale_log_mean$log_sales,order=c(0,1,4)),
  arima(sales_train_sale_log_mean$log_sales,order=c(0,1,8)),
  arima(sales_train_sale_log_mean$log_sales,order=c(1,1,5)),
  arima(sales_train_sale_log_mean$log_sales,order=c(1,1,6)),
  arima(sales_train_sale_log_mean$log_sales,order=c(1,1,7)),
  arima(sales_train_sale_log_mean$log_sales,order=c(5,0,0)),
  arima(sales_train_sale_log_mean$log_sales,order=c(5,1,0)),
  arima(sales_train_sale_log_mean$log_sales,order=c(4,1,0)),
  arima(sales_train_sale_log_mean$log_sales,order=c(3,1,0))

)
##                                                                df        AIC
## arima(sales_train_sale_log_mean$log_sales, order = c(0, 1, 1))  2  176.65480
## arima(sales_train_sale_log_mean$log_sales, order = c(0, 1, 2))  3  -25.02893
## arima(sales_train_sale_log_mean$log_sales, order = c(0, 1, 3))  4  -25.55570
## arima(sales_train_sale_log_mean$log_sales, order = c(0, 1, 4))  5  -31.07848
## arima(sales_train_sale_log_mean$log_sales, order = c(0, 1, 8))  9 -126.67010
## arima(sales_train_sale_log_mean$log_sales, order = c(1, 1, 5))  7  -38.43750
## arima(sales_train_sale_log_mean$log_sales, order = c(1, 1, 6))  8  -47.03917
## arima(sales_train_sale_log_mean$log_sales, order = c(1, 1, 7))  9  -80.24721
## arima(sales_train_sale_log_mean$log_sales, order = c(5, 0, 0))  7   39.75579
## arima(sales_train_sale_log_mean$log_sales, order = c(5, 1, 0))  6  -98.48548
## arima(sales_train_sale_log_mean$log_sales, order = c(4, 1, 0))  5   80.65481
## arima(sales_train_sale_log_mean$log_sales, order = c(3, 1, 0))  4  146.79028

Both AIC and BIC favour fairly high-order models, with ARIMA(0,1,8) scoring best and ARIMA(5,1,0) close behind, but we will also run auto.arima to search for the best data generating process for our dataset.

options(warn=-1)
library(forecast)
auto.arima(sales_train_sale_log_mean$log_sales,stationary=FALSE,allowdrift=FALSE,
seasonal=FALSE,stepwise=FALSE,approximation=FALSE)
## Series: sales_train_sale_log_mean$log_sales 
## ARIMA(5,1,0) 
## 
## Coefficients:
##           ar1      ar2      ar3      ar4      ar5
##       -0.3694  -0.5229  -0.4127  -0.3622  -0.4023
## s.e.   0.0285   0.0287   0.0304   0.0287   0.0286
## 
## sigma^2 = 0.05277:  log likelihood = 55.24
## AIC=-98.49   AICc=-98.4   BIC=-68.87

auto.arima selects an AR(5) process on the differenced series, ARIMA(5,1,0). To verify this we will plot the residuals and run the Ljung-Box test.

best_mod = arima(sales_train_sale_log_mean$log_sales,order=c(5,1,0))
forecast::checkresiduals(best_mod)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(5,1,0)
## Q* = 112.65, df = 5, p-value < 2.2e-16
## 
## Model df: 5.   Total lags used: 10

The residual diagnostics show that the residuals are still autocorrelated, so we will run the Ljung-Box test at several lags to see where the remaining correlation sits.

par(mfrow=c(1,2))
resid = best_mod$resid
acf(resid,lag.max=20)
pacf(resid,lag.max=20)

Box.test(resid,type='Ljung-Box',lag=1)
## 
##  Box-Ljung test
## 
## data:  resid
## X-squared = 18.131, df = 1, p-value = 2.062e-05
Box.test(resid,type='Ljung-Box',lag=2)
## 
##  Box-Ljung test
## 
## data:  resid
## X-squared = 18.604, df = 2, p-value = 9.126e-05
Box.test(resid,type='Ljung-Box',lag=3)
## 
##  Box-Ljung test
## 
## data:  resid
## X-squared = 18.88, df = 3, p-value = 0.0002895
Box.test(resid,type='Ljung-Box',lag=4)
## 
##  Box-Ljung test
## 
## data:  resid
## X-squared = 21.382, df = 4, p-value = 0.0002659
Box.test(resid,type='Ljung-Box',lag=5)
## 
##  Box-Ljung test
## 
## data:  resid
## X-squared = 23.284, df = 5, p-value = 0.0002979
Box.test(resid,type='Ljung-Box',lag=6)
## 
##  Box-Ljung test
## 
## data:  resid
## X-squared = 56.615, df = 6, p-value = 2.186e-10
Box.test(resid,type='Ljung-Box',lag=7)
## 
##  Box-Ljung test
## 
## data:  resid
## X-squared = 91.755, df = 7, p-value < 2.2e-16
Box.test(resid,type='Ljung-Box',lag=8)
## 
##  Box-Ljung test
## 
## data:  resid
## X-squared = 91.85, df = 8, p-value = 2.22e-16
Box.test(resid,type='Ljung-Box',lag=9)
## 
##  Box-Ljung test
## 
## data:  resid
## X-squared = 112.64, df = 9, p-value < 2.2e-16
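
The repeated calls above can be condensed; a minimal sketch that collects the Ljung-Box p-values for lags 1 through 9 in one pass:

# Collect Ljung-Box p-values for lags 1 to 9 in a single data frame
ljung_pvals <- sapply(1:9, function(k) Box.test(resid, type = 'Ljung-Box', lag = k)$p.value)
data.frame(lag = 1:9, p_value = ljung_pvals)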

The residual ACF/PACF plots and the Ljung-Box results suggest there is still unmodelled variation in the data, which resonates with common-sense knowledge as well: sales usually depend on other factors (price, promotions, holidays) that a univariate ARIMA does not take into account, so a regression that includes such variables could capture more of this variation.
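
For illustration only, one way to bring such information in is an ARIMA with external regressors; a minimal sketch that adds a hypothetical weekend dummy as the single covariate (not part of the original analysis):

# Sketch: ARIMA(5,1,0) with a weekend indicator supplied as an external regressor
weekend <- as.integer(lubridate::wday(sales_train_sale_log_mean$date1) %in% c(1, 7))
xreg_mod <- forecast::Arima(sales_train_sale_log_mean$log_sales, order = c(5, 1, 0), xreg = weekend)
summary(xreg_mod)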

We will use the order suggested by auto.arima to produce the in-sample fit and the forecast.

best_mod = arima(sales_train_sale_log_mean$log_sales,order=c(5,1,0))
resid = best_mod$residuals
# Fitted values on the log scale: observed minus residuals
pred = sales_train_sale_log_mean$log_sales - resid
ggplot()+
  geom_line(aes(sales_train_sale_log_mean$date1,sales_train_sale_log_mean$log_sales))+
  geom_line(aes(sales_train_sale_log_mean$date1,pred),color='blue',alpha=0.4)+
  theme_bw()+
  xlab("Date")+
  ylab("Store Value")+
  ggtitle("Time Series forecast with our best model")

RMSE = sqrt(mean((expm1(pred) - expm1(sales_train_sale_log_mean$log_sales))^2,na.rm=T))
RMSE
## [1] 98.12853
best_mod %>%
  forecast(h=30) %>% 
  autoplot()

moving_average =arima(sales_train_sale_log_mean$log_sales,order=c(0,1,8))

resid = moving_average$residuals
# Fitted values on the log scale: observed minus residuals
pred = sales_train_sale_log_mean$log_sales - resid

RMSE = sqrt(mean((expm1(pred) - expm1(sales_train_sale_log_mean$log_sales))^2,na.rm=T))
RMSE
## [1] 97.93321
moving_average %>% 
    forecast(h=30) %>% 
  autoplot()

Next we will fit Facebook Prophet, and later use a train-test split to decide between the traditional ARIMA models and Prophet.

options(warn=-1)

library(prophet)
## Loading required package: Rcpp
## Loading required package: rlang
prophet_data <- sales_train_top_store_month %>%
  mutate(ds= date1 ,y = package_moved) %>%  select(ds,y)
## Adding missing grouping variables: `shop_id`
prophet_data<- prophet_data[,c(2:3)]

model <- prophet(prophet_data)
## Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.
future = make_future_dataframe(model,periods = 365)
forecast = predict(model,future)

plot(model,forecast)+
ylab("Store Sales Forecast")+xlab("Date")+theme_bw()

options(warn=-1)

prophet_plot_components(model,forecast)

The weekly component shows that sales increase on weekends compared to weekdays.
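
This weekend effect can be cross-checked directly on the daily totals; a minimal sketch:

# Sketch: average daily sales by day of week for shop 31
prophet_data %>%
  ungroup() %>%
  mutate(weekday = lubridate::wday(ds, label = TRUE)) %>%
  group_by(weekday) %>%
  summarise(mean_daily_sales = mean(y))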

options(warn=-1)

plot(model,forecast)+add_changepoints_to_plot(model)+theme_bw()+xlab("Date")+ylab("Store Sales")

The changepoints fall in 2014 and 2015, which makes sense: they mark points where the observed level of the series shifts.
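
If the fitted trend looked too rigid or too wiggly, the changepoint prior could be tuned; a minimal sketch (0.05 is Prophet's default, larger values allow a more flexible trend; fit here for comparison only):

# Sketch: refit with a more flexible trend to see how the changepoints move
flex_model <- prophet(prophet_data, changepoint.prior.scale = 0.5)
flex_forecast <- predict(flex_model, future)
plot(flex_model, flex_forecast) + add_changepoints_to_plot(flex_model) + theme_bw()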

prophet_data$floor = 0
prophet_data$cap = 600
future$floor = 0
future$cap = 600

sat_model <- prophet(prophet_data,growth='logistic')
## Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.
sat_one_yr_forecast = predict(sat_model,future)

plot(sat_model,sat_one_yr_forecast)+ylim(0,600)+
  theme_bw()+xlab("Date")+ylab("Store Sales")

Since we don't see a multiplicative effect in the seasonality of our series, we will stick with the additive model, i.e. the default.
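
For completeness, the multiplicative variant is only a one-argument change; a minimal sketch of what that comparison would look like (not used for the final model):

# Sketch: multiplicative seasonality, fit only to compare the component plots
mult_model <- prophet(prophet_data, growth = 'logistic', seasonality.mode = 'multiplicative')
mult_forecast <- predict(mult_model, future)
prophet_plot_components(mult_model, mult_forecast)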

We also want to add the effect of holidays to the model, since sales are impacted by holidays.

model = prophet(prophet_data,growth='logistic',fit=FALSE)
model = add_country_holidays(model,country_name = 'RU')
model = fit.prophet(model,prophet_data)
## Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.
forecast = predict(model,future)
prophet_plot_components(model,forecast)

Since our data comes from Russia, we will look at the impact of Russian holidays on the sales volume.

library(tidyr)

forecast %>%
  filter(holidays != 0) %>%
  dplyr::select(-ds:-additive_terms_upper, -holidays:-holidays_upper, -weekly:-yhat, -contains("upper"), -contains("lower")) %>%
  mutate_all(~ if_else(. == 0, as.numeric(NA), .)) %>%
  summarize_if(is.numeric, ~ max(., na.rm = T)) %>%
  pivot_longer(
    cols = `Christmas Day`:`Victory Day`,
    names_to = 'holiday', 
    values_to = 'effect'
  ) %>%
ggplot() +
  geom_col(aes(effect,holiday))+
  theme_bw()

options(warn =-1)
forecast_metric_data = forecast %>% 
  as_tibble() %>% 
  mutate(ds = as.Date(ds))


# Keep only the in-sample forecast rows (dates present in the training data) before computing errors
in_sample <- forecast_metric_data %>% filter(ds %in% prophet_data$ds)

RMSE = sqrt(mean((prophet_data$y - in_sample$yhat)^2))
MAE = mean(abs(prophet_data$y - in_sample$yhat))
MAPE = mean(abs((prophet_data$y - in_sample$yhat)/prophet_data$y))
print(paste("RMSE:",round(RMSE,2)))
## [1] "RMSE: 122.45"
print(paste("MAE:",round(MAE,2)))
## [1] "MAE: 79.73"
print(paste("MAPE:",round(MAPE,2)))
## [1] "MAPE: 0.24"

The in-sample RMSE obtained with Facebook Prophet is higher than that of the ARIMA models, which would point us toward the ARIMA moving-average model; however, we will run an out-of-sample (train-test) evaluation of both approaches before selecting the final model.
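
As an aside, Prophet also ships with a rolling-origin evaluation that could complement the single split used below; a minimal sketch (the window sizes here are arbitrary choices, not from the original analysis):

# Sketch: Prophet's built-in cross-validation as an alternative to a single train-test split
cv_results <- cross_validation(model, initial = 540, period = 90, horizon = 90, units = 'days')
head(performance_metrics(cv_results))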

options(warn=-1)

# Chronological train-test split: first 721 days for training, remaining 310 days for testing
train <- sales_train_sale_log[1:721,]

test <- sales_train_sale_log[722:1031,]

model1 <- arima(train$log_sales,order=c(5,1,0))

model2 <- arima(train$log_sales,order=c(0,1,8))

prophet_data <- sales_train_sale_log %>%
  mutate(ds= date1 ,y = package_moved) %>%  select(ds,y)
## Adding missing grouping variables: `shop_id`
prophet_train <- prophet_data[1:721,]


prophet_train$cap = 600
prophet_train$floor = 0



model3  = prophet(prophet_train,growth='logistic',fit=FALSE)
model3 = add_country_holidays(model3,country_name = 'RU')
model3 =fit.prophet(model3,prophet_train)
## Disabling yearly seasonality. Run prophet with yearly.seasonality=TRUE to override this.
## Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.
yhat1 <- predict(model1, n.ahead = 310)

yhat2 <- predict(model2, n.ahead = 310)

future <- make_future_dataframe(model3,periods = 310)
future$cap = 600
future$floor = 0


yhat3<- predict(model3,future)

yhat3 <- yhat3[722:1031,]


out_data <- data.frame(
  actual =test,
  error1 = yhat1$pred - test$log_sales,
  error2 = yhat2$pred - test$log_sales,
  # Use the same log1p scale as the ARIMA errors for a fair comparison
  error3 = log1p(yhat3$yhat) - test$log_sales
)


error_metrics = out_data %>% pivot_longer(cols = error1:error3,values_to = 'error')%>%group_by(name,actual.date1) %>%
  summarise(
    RMSE = sqrt(mean(error^2)),
    MAE = mean(abs(error)),
    MAPE = mean(abs(error / actual.log_sales))
    ) %>% ungroup()
## `summarise()` has grouped output by 'name'. You can override using the `.groups`
## argument.
  error_metrics %>% ggplot()+
  geom_line(aes(actual.date1 ,RMSE,color=name))+
  theme_bw()+
  labs(title = 'RMSE at each Time Point Across all Test Value')+xlab("Date")

  error_metrics %>% ggplot()+
  geom_line(aes(actual.date1 ,MAE,color=name))+
  theme_bw()+
  labs(title = 'MAE at each Time Point Across all Test Value')+xlab("Date")

  error_metrics %>% ggplot()+
  geom_line(aes(actual.date1 ,MAPE,color=name))+
  theme_bw()+
  labs(title = 'MAPE at each Time Point Across all Test Value')+xlab("Date")

options(warn=-1)
library(DT)


error_metrics1 = out_data %>% pivot_longer(cols = error1:error3,values_to = 'error')%>%group_by(name) %>%
  summarise(
    RMSE = sqrt(mean(error^2)),
    MAE = mean(abs(error)),
    MAPE = mean(abs(error / actual.log_sales))
    )

tibble(
  Model = c("ARIMA(5,1,0)","ARIMA(0,1,8)","Facebook Prophet"),
  `Out of Sample RMSE` = error_metrics1$RMSE,
  `Out of Sample MAE` = error_metrics1$MAE,
  `Out of Sample MAPE` = error_metrics1$MAPE
) %>%
datatable()

So, on the basis of the out-of-sample errors from the train-test split, we can say that among the ARIMA models our time series is better described by ARIMA(0,1,8), but when selecting the overall model, Facebook Prophet provides the better fit.
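
Finally, to close the loop on the original goal, the selected Prophet specification could be refit on the full daily history for shop 31 and pushed 30 days past October 2015 to produce the November 2015 forecast; a minimal sketch:

# Sketch: refit the chosen Prophet model on all data and forecast the next 30 days (November 2015)
final_data <- sales_train_sale_log %>% ungroup() %>%
  transmute(ds = date1, y = package_moved, cap = 600, floor = 0)
final_model <- prophet(final_data, growth = 'logistic', fit = FALSE)
final_model <- add_country_holidays(final_model, country_name = 'RU')
final_model <- fit.prophet(final_model, final_data)
final_future <- make_future_dataframe(final_model, periods = 30)
final_future$cap <- 600
final_future$floor <- 0
final_forecast <- predict(final_model, final_future)
plot(final_model, final_forecast) + theme_bw() + xlab("Date") + ylab("Store Sales")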