The goal is to forecast the total amount of product sold per shop for this data set. The data is provided by the Russian software firm 1C. Daily historical data for stores and products is provided from Jan-2013 to Oct-2015, and we need to forecast sales for each shop and product for Nov-2015.
Why have I selected this dataset? Store sales data is a window into human behavior: for instance, it helps us understand the demographics of shoppers, what they purchase, and how price drives the decision to buy.
What drives variation in the variable? I expect pricing to be somewhat seasonal, as well as dependent on the supply chain of the product. Festive discounts and other factors can also shift buying behavior. The forecast should be moderately difficult, since there is considerable variation at the daily level.
To begin the project, we install the libraries needed for data wrangling and import the data set.
options(warn = -1)
rm(list = ls())
# Install any required packages that are not already present
pkgs <- c("dplyr", "tidyverse", "lubridate", "data.table", "ggplot2", "knitr",
          "tseries", "forecast", "DT", "prophet")
lib <- installed.packages()[, "Package"]
install.packages(setdiff(pkgs, lib))
setwd("C:\\Users\\nawal\\OneDrive\\Documents\\Course Material\\Time Series")
sales_train <- read.csv("sales_train.csv")
head(sales_train)
## date date_block_num shop_id item_id item_price item_cnt_day
## 1 02.01.2013 0 59 22154 999.00 1
## 2 03.01.2013 0 25 2552 899.00 1
## 3 05.01.2013 0 25 2552 899.00 -1
## 4 06.01.2013 0 25 2554 1709.05 1
## 5 15.01.2013 0 25 2555 1099.00 1
## 6 10.01.2013 0 25 2564 349.00 1
dim(sales_train)
## [1] 2935849 6
str(sales_train)
## 'data.frame': 2935849 obs. of 6 variables:
## $ date : chr "02.01.2013" "03.01.2013" "05.01.2013" "06.01.2013" ...
## $ date_block_num: int 0 0 0 0 0 0 0 0 0 0 ...
## $ shop_id : int 59 25 25 25 25 25 25 25 25 25 ...
## $ item_id : int 22154 2552 2552 2554 2555 2564 2565 2572 2572 2573 ...
## $ item_price : num 999 899 899 1709 1099 ...
## $ item_cnt_day : num 1 1 -1 1 1 1 1 1 1 3 ...
summary(sales_train)
## date date_block_num shop_id item_id
## Length:2935849 Min. : 0.00 Min. : 0 Min. : 0
## Class :character 1st Qu.: 7.00 1st Qu.:22 1st Qu.: 4476
## Mode :character Median :14.00 Median :31 Median : 9343
## Mean :14.57 Mean :33 Mean :10197
## 3rd Qu.:23.00 3rd Qu.:47 3rd Qu.:15684
## Max. :33.00 Max. :59 Max. :22169
## item_price item_cnt_day
## Min. : -1.0 Min. : -22.000
## 1st Qu.: 249.0 1st Qu.: 1.000
## Median : 399.0 Median : 1.000
## Mean : 890.9 Mean : 1.243
## 3rd Qu.: 999.0 3rd Qu.: 1.000
## Max. :307980.0 Max. :2169.000
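The summary already flags two quirks worth noting: item_price has a minimum of -1, which looks like a data-entry error, while the negative values of item_cnt_day are returns and legitimately net against sales. A minimal cleaning sketch (the sales_clean name is illustrative; the analysis below proceeds on the raw table):
# Drop the impossible negative price; keep returns (negative counts are genuine)
sales_clean <- subset(sales_train, item_price >= 0)
dim(sales_clean)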
shop_id: unique identifier of a shop
item_id: unique identifier of a product
item_price: current price of an item
date: date in the format dd.mm.yyyy
item_cnt_day: number of products sold that day (a negative value indicates a return)
date_block_num: a consecutive month number, used for convenience. January 2013 is 0, February 2013 is 1, …, October 2015 is 33
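As a quick illustration of how these fields fit together, here is a minimal sketch (assuming dplyr is loaded) that rolls the daily rows up to the monthly shop-item totals the competition ultimately asks us to forecast:
# Monthly items sold per shop and item; date_block_num already encodes the month
monthly_target <- sales_train %>%
  group_by(shop_id, item_id, date_block_num) %>%
  summarise(item_cnt_month = sum(item_cnt_day), .groups = "drop")
head(monthly_target)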
Along with the sales training data set, we have files with item names, category names, and shop names. We won't use them, since the names are in Russian, which won't be easy for an Indian to interpret :). We will build our forecast for the shop_id with the highest sales volume. Summary statistics and data wrangling come next.
options(warn=-1)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(knitr)
# Parse the dd.mm.yyyy date string and derive calendar features
sales_train$date1 <- as.Date(sales_train$date, "%d.%m.%Y")
sales_train$year  <- lubridate::year(sales_train$date1)
sales_train$month <- lubridate::month(sales_train$date1)
sales_train$day   <- lubridate::day(sales_train$date1)
# Monthly packages moved per shop
sales_train_store <- sales_train %>% group_by(shop_id, date_block_num) %>%
  summarise(package_moved = sum(item_cnt_day)) %>% arrange(shop_id, date_block_num)
## `summarise()` has grouped output by 'shop_id'. You can override using the
## `.groups` argument.
# Keep the most recent 12 months (date_block_num 22-33)
sales_train_store <- filter(sales_train_store, date_block_num %in% 22:33)
summary(sales_train_store)
## shop_id date_block_num package_moved
## Min. : 2 Min. :22.0 Min. : -1
## 1st Qu.:18 1st Qu.:24.0 1st Qu.: 966
## Median :34 Median :27.0 Median : 1340
## Mean :32 Mean :27.3 Mean : 1915
## 3rd Qu.:47 3rd Qu.:30.0 3rd Qu.: 2084
## Max. :59 Max. :33.0 Max. :14610
# Total packages moved per shop across the full history
sales_store_req <- sales_train %>% group_by(shop_id) %>%
  summarise(package_moved = sum(item_cnt_day)) %>% arrange(desc(package_moved))
### shop_id 31 has the highest number of packages moved, so we restrict the data to this shop
sales_train_top_store <- filter(sales_train, shop_id == 31)
sales_train_top_store$shop_id <- as.factor(sales_train_top_store$shop_id)  # treat as a category for plotting
# Daily packages moved for shop 31 (one row per calendar day)
sales_train_top_store_month <- sales_train_top_store %>% group_by(shop_id, date1) %>%
  summarise(package_moved = sum(item_cnt_day)) %>% arrange(date1)
## `summarise()` has grouped output by 'shop_id'. You can override using the
## `.groups` argument.
ggplot2::ggplot(sales_train_top_store_month, ggplot2::aes(x = date1, y = package_moved, color = shop_id)) +
  xlab("Year") + ylab("Packages Moved") + ggtitle("Daily goods bought from Shop ID 31") +
  geom_point() + geom_line() + theme_classic() + theme(panel.grid = element_blank())
options(warn=-1)
boxplot(sales_train_top_store_month$package_moved, main = "Packages moved per day by Shop 31")
summary(sales_train_top_store_month$package_moved)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 102.0 221.5 275.0 301.4 358.5 1080.0
The daily maximum (1,080) sits far above the third quartile (358.5), so we winsorize the extreme spike days to limit their influence on the model.
options(warn=-1)
# Cap daily totals above 700 at 650 (simple winsorization of outliers)
sales_train_top_store_month$package_moved <-
  ifelse(sales_train_top_store_month$package_moved > 700, 650, sales_train_top_store_month$package_moved)
ggplot2::ggplot(sales_train_top_store_month, ggplot2::aes(x = date1, y = package_moved, color = shop_id)) +
  xlab("Year") + ylab("Packages Moved") + ggtitle("Daily goods bought from Shop ID 31 (outliers capped)") +
  geom_point() + geom_line() + theme_classic() + theme(panel.grid = element_blank())
# log1p stabilizes the variance and safely handles any zero-sale days
sales_train_sale_log = sales_train_top_store_month %>% mutate(log_sales = log1p(package_moved))
ggplot(sales_train_sale_log,aes(x=date1,y=log_sales))+
geom_line()+theme_classic()+
ylab("Total Store Sale")+
xlab("Date")+
ggtitle("Time Series with log transformation")
# First difference of the log series (approximately the day-over-day growth rate)
sales_train_sale_log_mean <- sales_train_sale_log %>% arrange(date1) %>% mutate(lag_diff = log_sales - lag(log_sales))
sales_train_sale_log_mean <- na.omit(sales_train_sale_log_mean)  # drop the NA created by lag()
ggplot(sales_train_sale_log_mean,aes(x=date1,y=lag_diff))+
geom_line()+theme_classic()+
ylab("Total Store Sale")+
xlab("Date")+
ggtitle("Time Series with log transformation & Mean Transformation")
options(warn=-1)
library(tseries)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
# ADF: the null hypothesis is a unit root (non-stationarity)
adf.test(sales_train_sale_log_mean$lag_diff)
##
## Augmented Dickey-Fuller Test
##
## data: sales_train_sale_log_mean$lag_diff
## Dickey-Fuller = -11.686, Lag order = 10, p-value = 0.01
## alternative hypothesis: stationary
options(warn=-1)
# KPSS: the null hypothesis is level stationarity (the complement of ADF)
kpss.test(sales_train_sale_log_mean$lag_diff)
##
## KPSS Test for Level Stationarity
##
## data: sales_train_sale_log_mean$lag_diff
## KPSS Level = 0.012204, Truncation lag parameter = 7, p-value = 0.1
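The two tests agree: ADF rejects its unit-root null (p = 0.01) and KPSS fails to reject its level-stationarity null (p = 0.1), so the differenced log series can be treated as stationary. As an optional cross-check (a sketch, assuming the forecast package is installed), ndiffs() estimates the differencing order directly from the undifferenced log series:
library(forecast)
# Number of differences needed for stationarity; we expect this to return 1
ndiffs(sales_train_sale_log$log_sales, test = "kpss")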
# Inspect the autocorrelation structure to guide the choice of p and q
par(mfrow = c(1, 2))
acf(sales_train_sale_log_mean$lag_diff, lag.max = 20)
pacf(sales_train_sale_log_mean$lag_diff, lag.max = 20)
options(warn=-1)
# Compare candidate ARIMA orders on the log series by BIC (d = 1 from the differencing above)
BIC(
arima(sales_train_sale_log_mean$log_sales,order=c(0,1,1)),
arima(sales_train_sale_log_mean$log_sales,order=c(0,1,2)),
arima(sales_train_sale_log_mean$log_sales,order=c(0,1,3)),
arima(sales_train_sale_log_mean$log_sales,order=c(0,1,8)),
arima(sales_train_sale_log_mean$log_sales,order=c(1,1,1)),
arima(sales_train_sale_log_mean$log_sales,order=c(2,1,0)),
arima(sales_train_sale_log_mean$log_sales,order=c(3,1,0)),
arima(sales_train_sale_log_mean$log_sales,order=c(5,0,0)),
arima(sales_train_sale_log_mean$log_sales,order=c(5,1,0))
)
## df BIC
## arima(sales_train_sale_log_mean$log_sales, order = c(0, 1, 1)) 2 186.527485
## arima(sales_train_sale_log_mean$log_sales, order = c(0, 1, 2)) 3 -10.219906
## arima(sales_train_sale_log_mean$log_sales, order = c(0, 1, 3)) 4 -5.810333
## arima(sales_train_sale_log_mean$log_sales, order = c(0, 1, 8)) 9 -82.243017
## arima(sales_train_sale_log_mean$log_sales, order = c(1, 1, 1)) 3 34.485524
## arima(sales_train_sale_log_mean$log_sales, order = c(2, 1, 0)) 3 195.005310
## arima(sales_train_sale_log_mean$log_sales, order = c(3, 1, 0)) 4 166.535651
## arima(sales_train_sale_log_mean$log_sales, order = c(5, 0, 0)) 7 74.316992
## arima(sales_train_sale_log_mean$log_sales, order = c(5, 1, 0)) 6 -68.867420
# ...and by AIC, over a slightly wider set of candidates
AIC(
arima(sales_train_sale_log_mean$log_sales,order=c(0,1,1)),
arima(sales_train_sale_log_mean$log_sales,order=c(0,1,2)),
arima(sales_train_sale_log_mean$log_sales,order=c(0,1,3)),
arima(sales_train_sale_log_mean$log_sales,order=c(0,1,4)),
arima(sales_train_sale_log_mean$log_sales,order=c(0,1,8)),
arima(sales_train_sale_log_mean$log_sales,order=c(1,1,5)),
arima(sales_train_sale_log_mean$log_sales,order=c(1,1,6)),
arima(sales_train_sale_log_mean$log_sales,order=c(1,1,7)),
arima(sales_train_sale_log_mean$log_sales,order=c(5,0,0)),
arima(sales_train_sale_log_mean$log_sales,order=c(5,1,0)),
arima(sales_train_sale_log_mean$log_sales,order=c(4,1,0)),
arima(sales_train_sale_log_mean$log_sales,order=c(3,1,0))
)
## df AIC
## arima(sales_train_sale_log_mean$log_sales, order = c(0, 1, 1)) 2 176.65480
## arima(sales_train_sale_log_mean$log_sales, order = c(0, 1, 2)) 3 -25.02893
## arima(sales_train_sale_log_mean$log_sales, order = c(0, 1, 3)) 4 -25.55570
## arima(sales_train_sale_log_mean$log_sales, order = c(0, 1, 4)) 5 -31.07848
## arima(sales_train_sale_log_mean$log_sales, order = c(0, 1, 8)) 9 -126.67010
## arima(sales_train_sale_log_mean$log_sales, order = c(1, 1, 5)) 7 -38.43750
## arima(sales_train_sale_log_mean$log_sales, order = c(1, 1, 6)) 8 -47.03917
## arima(sales_train_sale_log_mean$log_sales, order = c(1, 1, 7)) 9 -80.24721
## arima(sales_train_sale_log_mean$log_sales, order = c(5, 0, 0)) 7 39.75579
## arima(sales_train_sale_log_mean$log_sales, order = c(5, 1, 0)) 6 -98.48548
## arima(sales_train_sale_log_mean$log_sales, order = c(4, 1, 0)) 5 80.65481
## arima(sales_train_sale_log_mean$log_sales, order = c(3, 1, 0)) 4 146.79028
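Among these hand-picked candidates, ARIMA(0,1,8) minimizes both criteria, with ARIMA(5,1,0) close behind; auto.arima below caps the combined AR and MA order it searches by default, so it never considers (0,1,8) and settles on (5,1,0). We carry both models forward. As an aside, the manual calls above can be condensed into a small grid search (a base-R sketch; the candidate list is illustrative):
# Fit a grid of candidate orders and tabulate both criteria in one pass
orders <- list(c(0, 1, 2), c(0, 1, 8), c(1, 1, 1), c(5, 1, 0))
ic <- sapply(orders, function(o) {
  fit <- arima(sales_train_sale_log_mean$log_sales, order = o)
  c(AIC = AIC(fit), BIC = BIC(fit))
})
colnames(ic) <- sapply(orders, function(o) paste0("(", paste(o, collapse = ","), ")"))
round(t(ic), 2)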
options(warn=-1)
library(forecast)
# Exhaustive (non-stepwise, non-approximated) search with no seasonal terms or drift
auto.arima(sales_train_sale_log_mean$log_sales, stationary = FALSE, allowdrift = FALSE,
           seasonal = FALSE, stepwise = FALSE, approximation = FALSE)
## Series: sales_train_sale_log_mean$log_sales
## ARIMA(5,1,0)
##
## Coefficients:
## ar1 ar2 ar3 ar4 ar5
## -0.3694 -0.5229 -0.4127 -0.3622 -0.4023
## s.e. 0.0285 0.0287 0.0304 0.0287 0.0286
##
## sigma^2 = 0.05277: log likelihood = 55.24
## AIC=-98.49 AICc=-98.4 BIC=-68.87
best_mod = arima(sales_train_sale_log_mean$log_sales, order = c(5, 1, 0))
forecast::checkresiduals(best_mod)  # residual plots plus a Ljung-Box test
##
## Ljung-Box test
##
## data: Residuals from ARIMA(5,1,0)
## Q* = 112.65, df = 5, p-value < 2.2e-16
##
## Model df: 5. Total lags used: 10
The Ljung-Box statistic is large (Q* = 112.65, p < 2.2e-16), so significant autocorrelation remains in the residuals; we confirm this lag by lag below.
par(mfrow = c(1, 2))
resid = best_mod$residuals
acf(resid, lag.max = 20)
pacf(resid, lag.max = 20)
Box.test(resid, type = 'Ljung-Box', lag = 1)
##
## Box-Ljung test
##
## data: resid
## X-squared = 18.131, df = 1, p-value = 2.062e-05
Box.test(resid,type='Ljung-Box',lag=2)
##
## Box-Ljung test
##
## data: resid
## X-squared = 18.604, df = 2, p-value = 9.126e-05
Box.test(resid,type='Ljung-Box',lag=3)
##
## Box-Ljung test
##
## data: resid
## X-squared = 18.88, df = 3, p-value = 0.0002895
Box.test(resid,type='Ljung-Box',lag=4)
##
## Box-Ljung test
##
## data: resid
## X-squared = 21.382, df = 4, p-value = 0.0002659
Box.test(resid,type='Ljung-Box',lag=5)
##
## Box-Ljung test
##
## data: resid
## X-squared = 23.284, df = 5, p-value = 0.0002979
Box.test(resid,type='Ljung-Box',lag=6)
##
## Box-Ljung test
##
## data: resid
## X-squared = 56.615, df = 6, p-value = 2.186e-10
Box.test(resid,type='Ljung-Box',lag=7)
##
## Box-Ljung test
##
## data: resid
## X-squared = 91.755, df = 7, p-value < 2.2e-16
Box.test(resid,type='Ljung-Box',lag=8)
##
## Box-Ljung test
##
## data: resid
## X-squared = 91.85, df = 8, p-value = 2.22e-16
Box.test(resid,type='Ljung-Box',lag=9)
##
## Box-Ljung test
##
## data: resid
## X-squared = 112.64, df = 9, p-value < 2.2e-16
The ACF and PACF plots of the residuals, together with the Ljung-Box tests, suggest that there is still unmodeled variation in our data, which matches our domain intuition: sales usually depend on other factors (promotions, holidays, supply) that a univariate ARIMA cannot capture.
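One way to fold such factors in is regression with ARIMA errors, which arima() supports through its xreg argument. A sketch (the weekend indicator below is hypothetical and only illustrates the mechanics; a real analysis would use promotion or holiday data):
# Hypothetical external regressor: weekend indicator (ISO weekday 6 or 7)
weekend_flag <- as.integer(format(sales_train_sale_log_mean$date1, "%u") %in% c("6", "7"))
arimax_mod <- arima(sales_train_sale_log_mean$log_sales, order = c(5, 1, 0),
                    xreg = weekend_flag)
AIC(arimax_mod)  # compare against the plain ARIMA(5,1,0)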
best_mod = arima(sales_train_sale_log_mean$log_sales, order = c(5, 1, 0))
resid = best_mod$residuals
pred = sales_train_sale_log_mean$log_sales - resid  # fitted values = observed - residuals
ggplot() +
  geom_line(aes(sales_train_sale_log_mean$date1, sales_train_sale_log_mean$log_sales)) +
  geom_line(aes(sales_train_sale_log_mean$date1, pred), color = 'blue', alpha = 0.4) +
  theme_bw() +
  xlab("Date") +
  ylab("Log Store Sales") +
  ggtitle("In-sample fit of our best model")
# In-sample RMSE on the original scale (expm1 reverses the log1p transform)
RMSE = sqrt(mean((expm1(pred) - expm1(sales_train_sale_log_mean$log_sales))^2, na.rm = T))
RMSE
## [1] 98.12853
best_mod %>%
forecast(h=30) %>%
autoplot()
# The MA-heavy alternative favored by the information criteria above
moving_average = arima(sales_train_sale_log_mean$log_sales, order = c(0, 1, 8))
resid = moving_average$residuals
pred = sales_train_sale_log_mean$log_sales - resid  # fitted values = observed - residuals
RMSE = sqrt(mean((expm1(pred) - expm1(sales_train_sale_log_mean$log_sales))^2, na.rm = T))
RMSE
## [1] 97.93321
moving_average %>%
forecast(h=30) %>%
autoplot()
options(warn=-1)
library(prophet)
## Loading required package: Rcpp
## Loading required package: rlang
# Prophet expects a data frame with columns ds (date) and y (value)
prophet_data <- sales_train_top_store_month %>%
  mutate(ds = date1, y = package_moved) %>% select(ds, y)
## Adding missing grouping variables: `shop_id`
prophet_data <- prophet_data[, c(2:3)]  # drop the shop_id grouping column
model <- prophet(prophet_data)
## Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.
# Forecast one year beyond the training window
future = make_future_dataframe(model, periods = 365)
forecast = predict(model, future)
plot(model, forecast) +
  ylab("Store Sales Forecast") + xlab("Date") + theme_bw()
options(warn=-1)
prophet_plot_components(model,forecast)
options(warn=-1)
plot(model,forecast)+add_changepoints_to_plot(model)+theme_bw()+xlab("Date")+ylab("Store Sales")
# Logistic growth requires cap (and optionally floor) columns on both the
# history and the future data frame
prophet_data$floor = 0
prophet_data$cap = 600
future$floor = 0
future$cap = 600
sat_model <- prophet(prophet_data,growth='logistic')
## Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.
sat_one_yr_forecast = predict(sat_model,future)
plot(sat_model,sat_one_yr_forecast)+ylim(0,600)+
theme_bw()+xlab("Date")+ylab("Store Sales")
# Build the model unfitted so Russian public holidays can be attached first
model = prophet(prophet_data, growth = 'logistic', fit = FALSE)
model = add_country_holidays(model, country_name = 'RU')
model = fit.prophet(model, prophet_data)
## Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.
forecast = predict(model,future)
prophet_plot_components(model,forecast)
library(tidyr)
# Keep only the per-holiday effect columns, take each holiday's peak effect,
# and plot them side by side
forecast %>%
  filter(holidays != 0) %>%
  dplyr::select(-ds:-additive_terms_upper, -holidays:-holidays_upper, -weekly:-yhat, -contains("upper"), -contains("lower")) %>%
  mutate_all(~ if_else(. == 0, as.numeric(NA), .)) %>%  # zero means the holiday is inactive that day
  summarize_if(is.numeric, ~ max(., na.rm = T)) %>%
  pivot_longer(
    cols = `Christmas Day`:`Victory Day`,
    names_to = 'holiday',
    values_to = 'effect'
  ) %>%
  ggplot() +
  geom_col(aes(effect, holiday)) +
  theme_bw()
options(warn =-1)
# Restrict the forecast to in-sample dates so it lines up row-for-row with the history
forecast_metric_data = forecast %>%
  as_tibble() %>%
  mutate(ds = as.Date(ds)) %>%
  filter(ds %in% prophet_data$ds)
RMSE = sqrt(mean((prophet_data$y - forecast_metric_data$yhat)^2))
MAE = mean(abs(prophet_data$y - forecast_metric_data$yhat))
MAPE = mean(abs((prophet_data$y - forecast_metric_data$yhat)/prophet_data$y))
print(paste("RMSE:",round(RMSE,2)))
## [1] "RMSE: 122.45"
print(paste("MAE:",round(MAE,2)))
## [1] "MAE: 79.73"
print(paste("MAPE:",round(MAPE,2)))
## [1] "MAPE: 0.24"
options(warn=-1)
# Hold out the last 310 days (roughly 30%) as a test set
train <- sales_train_sale_log[1:721,]
test <- sales_train_sale_log[722:1031,]
model1 <- arima(train$log_sales, order = c(5, 1, 0))
model2 <- arima(train$log_sales, order = c(0, 1, 8))
prophet_data <- sales_train_sale_log %>%
  mutate(ds = date1, y = package_moved) %>% select(ds, y)
## Adding missing grouping variables: `shop_id`
prophet_train <- prophet_data[1:721,]
prophet_train$cap = 600
prophet_train$floor = 0
# Refit the logistic-growth Prophet model with Russian holidays on the training window only
model3 = prophet(prophet_train, growth = 'logistic', fit = FALSE)
model3 = add_country_holidays(model3, country_name = 'RU')
model3 = fit.prophet(model3, prophet_train)
## Disabling yearly seasonality. Run prophet with yearly.seasonality=TRUE to override this.
## Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.
# 310-step-ahead forecasts from the two ARIMA models
yhat1 <- predict(model1, n.ahead = 310)
yhat2 <- predict(model2, n.ahead = 310)
future <- make_future_dataframe(model3, periods = 310)
future$cap = 600
future$floor = 0
yhat3 <- predict(model3, future)
yhat3 <- yhat3[722:1031,]  # keep only the rows that overlap the test window
# Errors on the log scale (log1p and log differ negligibly at these magnitudes)
out_data <- data.frame(
  actual = test,
  error1 = yhat1$pred - test$log_sales,
  error2 = yhat2$pred - test$log_sales,
  error3 = log(yhat3$yhat) - log(test$package_moved)
)
# Per-day error metrics for each model across the test window
error_metrics = out_data %>% pivot_longer(cols = error1:error3, values_to = 'error') %>%
  group_by(name, actual.date1) %>%
  summarise(
    RMSE = sqrt(mean(error^2)),
    MAE = mean(abs(error)),
    MAPE = mean(abs(error / actual.log_sales))
  ) %>% ungroup()
## `summarise()` has grouped output by 'name'. You can override using the `.groups`
## argument.
error_metrics %>% ggplot() +
  geom_line(aes(actual.date1, RMSE, color = name)) +
  theme_bw() +
  labs(title = 'RMSE at each time point across the test set') + xlab("Date")
error_metrics %>% ggplot() +
  geom_line(aes(actual.date1, MAE, color = name)) +
  theme_bw() +
  labs(title = 'MAE at each time point across the test set') + xlab("Date")
error_metrics %>% ggplot() +
  geom_line(aes(actual.date1, MAPE, color = name)) +
  theme_bw() +
  labs(title = 'MAPE at each time point across the test set') + xlab("Date")
options(warn=-1)
library(DT)
# Overall (whole-test-window) error metrics per model
error_metrics1 = out_data %>% pivot_longer(cols = error1:error3, values_to = 'error') %>%
  group_by(name) %>%
  summarise(
    RMSE = sqrt(mean(error^2)),
    MAE = mean(abs(error)),
    MAPE = mean(abs(error / actual.log_sales))
  )
# Rows are ordered by name (error1, error2, error3), matching the model labels below
tibble(
  Model = c("ARIMA(5,1,0)", "ARIMA(0,1,8)", "Facebook Prophet"),
  `Out of Sample RMSE` = error_metrics1$RMSE,
  `Out of Sample MAE` = error_metrics1$MAE,
  `Out of Sample MAPE` = error_metrics1$MAPE
) %>%
  datatable()