Section 1

The data comes from Flipkart, one of India's largest e-commerce companies.

library(astsa, quietly=TRUE, warn.conflicts=FALSE)
## Warning: package 'astsa' was built under R version 4.1.1
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.1.1
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Import data
setwd("C:\\Users\\shree\\Downloads")
retail_df <- read.csv("flipkart_com-ecommerce_sample.csv")
colnames(retail_df)
##  [1] "uniq_id"                 "crawl_timestamp"        
##  [3] "product_url"             "product_name"           
##  [5] "product_category_tree"   "pid"                    
##  [7] "retail_price"            "discounted_price"       
##  [9] "image"                   "is_FK_Advantage_product"
## [11] "description"             "product_rating"         
## [13] "overall_rating"          "brand"                  
## [15] "product_specifications"
str(retail_df)
## 'data.frame':    20000 obs. of  15 variables:
##  $ uniq_id                : chr  "c2d766ca982eca8304150849735ffef9" "7f7036a6d550aaa89d34c77bd39a5e48" "f449ec65dcbc041b6ae5e6a32717d01b" "0973b37acd0c664e3de26e97e5571454" ...
##  $ crawl_timestamp        : chr  "2016-03-25 22:59:23 +0000" "2016-03-25 22:59:23 +0000" "2016-03-25 22:59:23 +0000" "2016-03-25 22:59:23 +0000" ...
##  $ product_url            : chr  "http://www.flipkart.com/alisha-solid-women-s-cycling-shorts/p/itmeh2ffvzetthbb?pid=SRTEH2FF9KEDEFGF" "http://www.flipkart.com/fabhomedecor-fabric-double-sofa-bed/p/itmeh3qgfamccfpy?pid=SBEEH3QGU7MFYJFY" "http://www.flipkart.com/aw-bellies/p/itmeh4grgfbkexnt?pid=SHOEH4GRSUBJGZXE" "http://www.flipkart.com/alisha-solid-women-s-cycling-shorts/p/itmeh2f6sdgah2pq?pid=SRTEH2F6HUZMQ6SJ" ...
##  $ product_name           : chr  "Alisha Solid Women's Cycling Shorts" "FabHomeDecor Fabric Double Sofa Bed" "AW Bellies" "Alisha Solid Women's Cycling Shorts" ...
##  $ product_category_tree  : chr  "[\"Clothing >> Women's Clothing >> Lingerie, Sleep & Swimwear >> Shorts >> Alisha Shorts >> Alisha Solid Women'"| __truncated__ "[\"Furniture >> Living Room Furniture >> Sofa Beds & Futons >> FabHomeDecor Fabric Double Sofa Bed (Finish Colo...\"]" "[\"Footwear >> Women's Footwear >> Ballerinas >> AW Bellies\"]" "[\"Clothing >> Women's Clothing >> Lingerie, Sleep & Swimwear >> Shorts >> Alisha Shorts >> Alisha Solid Women'"| __truncated__ ...
##  $ pid                    : chr  "SRTEH2FF9KEDEFGF" "SBEEH3QGU7MFYJFY" "SHOEH4GRSUBJGZXE" "SRTEH2F6HUZMQ6SJ" ...
##  $ retail_price           : int  999 32157 999 699 220 430 1199 32157 699 1199 ...
##  $ discounted_price       : int  379 22646 499 267 210 430 479 22646 349 479 ...
##  $ image                  : chr  "[\"http://img5a.flixcart.com/image/short/u/4/a/altht-3p-21-alisha-38-original-imaeh2d5vm5zbtgg.jpeg\", \"http:/"| __truncated__ "[\"http://img6a.flixcart.com/image/sofa-bed/j/f/y/fhd112-double-foam-fabhomedecor-leatherette-black-leatherette"| __truncated__ "[\"http://img5a.flixcart.com/image/shoe/7/z/z/red-as-454-aw-11-original-imaeebfwsdf6jdf6.jpeg\", \"http://img6a"| __truncated__ "[\"http://img5a.flixcart.com/image/short/6/2/h/altght-11-alisha-38-original-imaeh2d5uq9thnyg.jpeg\", \"http://i"| __truncated__ ...
##  $ is_FK_Advantage_product: chr  "false" "false" "false" "false" ...
##  $ description            : chr  "Key Features of Alisha Solid Women's Cycling Shorts Cotton Lycra Navy, Red, Navy,Specifications of Alisha Solid"| __truncated__ "FabHomeDecor Fabric Double Sofa Bed (Finish Color - Leatherette Black Mechanism Type - Pull Out) Price: Rs. 22,"| __truncated__ "Key Features of AW Bellies Sandals Wedges Heel Casuals,AW Bellies Price: Rs. 499 Material: Synthetic Lifestyle:"| __truncated__ "Key Features of Alisha Solid Women's Cycling Shorts Cotton Lycra Black, Red,Specifications of Alisha Solid Wome"| __truncated__ ...
##  $ product_rating         : chr  "No rating available" "No rating available" "No rating available" "No rating available" ...
##  $ overall_rating         : chr  "No rating available" "No rating available" "No rating available" "No rating available" ...
##  $ brand                  : chr  "Alisha" "FabHomeDecor" "AW" "Alisha" ...
##  $ product_specifications : chr  "{\"product_specification\"=>[{\"key\"=>\"Number of Contents in Sales Package\", \"value\"=>\"Pack of 3\"}, {\"k"| __truncated__ "{\"product_specification\"=>[{\"key\"=>\"Installation & Demo Details\", \"value\"=>\"Installation and demo for "| __truncated__ "{\"product_specification\"=>[{\"key\"=>\"Ideal For\", \"value\"=>\"Women\"}, {\"key\"=>\"Occasion\", \"value\"="| __truncated__ "{\"product_specification\"=>[{\"key\"=>\"Number of Contents in Sales Package\", \"value\"=>\"Pack of 2\"}, {\"k"| __truncated__ ...
library(lubridate)
## Warning: package 'lubridate' was built under R version 4.1.2
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
# Parse the crawl timestamp to a calendar date, then aggregate retail price by day
data <- retail_df %>%
  select(crawl_timestamp, retail_price) %>%
  mutate(date = as.Date(as.POSIXct(crawl_timestamp, tz = "GMT")))

data <- data %>%
  group_by(date) %>%
  summarize(sum_retail = sum(retail_price, na.rm = TRUE)) %>%
  mutate(year = format(date, format = "%Y"),
         month = month(date))

# Keep February through June and drop any duplicate rows
data <- data %>% filter(month %in% c(2, 3, 4, 5, 6))
data <- unique(data)

Convert the data into a time series
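The rest of the analysis works directly on the aggregated data frame. As a minimal sketch (not part of the original code), the daily series could also be wrapped in a ts object; the frequency of 7 is an assumption (one weekly cycle), and ts assumes consecutive, evenly spaced days:

# Sketch (assumption): represent the daily retail sums as a ts object.
# frequency = 7 treats a week as one seasonal cycle. Note: ts assumes the
# days are consecutive with no gaps; this object is not used downstream.
retail_ts <- ts(data$sum_retail, frequency = 7)
str(retail_ts)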

library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.1.2
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.4     v stringr 1.4.0
## v tidyr   1.1.3     v forcats 0.5.1
## v readr   2.0.1
## Warning: package 'ggplot2' was built under R version 4.1.2
## Warning: package 'tibble' was built under R version 4.1.1
## Warning: package 'tidyr' was built under R version 4.1.1
## Warning: package 'readr' was built under R version 4.1.1
## Warning: package 'stringr' was built under R version 4.1.1
## Warning: package 'forcats' was built under R version 4.1.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x lubridate::as.difftime() masks base::as.difftime()
## x lubridate::date()        masks base::date()
## x dplyr::filter()          masks stats::filter()
## x lubridate::intersect()   masks base::intersect()
## x dplyr::lag()             masks stats::lag()
## x lubridate::setdiff()     masks base::setdiff()
## x lubridate::union()       masks base::union()
data %>%
  ggplot() +
  geom_line(aes(date, sum_retail)) +
  theme_bw() +
  ggtitle("Sum of retail prices Over time") +
  theme(legend.position="none")+
  ylab("Retail Prices") +
  xlab("Date")

### Glancing at the time series plot: the series appears mean-reverting (mean stationary), but its variance does not appear stationary.
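One way to check the variance claim numerically is a rolling standard deviation. This is a sketch, not part of the original analysis, and assumes the zoo package is available:

# Sketch (assumption: zoo is installed): a 14-day rolling standard deviation
# of the daily retail sum; a drifting line suggests non-constant variance.
library(zoo)
roll_sd <- rollapply(data$sum_retail, width = 14, FUN = sd, fill = NA)
plot(data$date, roll_sd, type = "l",
     xlab = "Date", ylab = "14-day rolling SD of retail sum")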

Handling Variance Non-stationarity

Making the time series variance stationary

library(forecast)
## Warning: package 'forecast' was built under R version 4.1.2
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
## 
## Attaching package: 'forecast'
## The following object is masked from 'package:astsa':
## 
##     gas
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 4.1.1
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
p1<-data%>%
  ggplot() +
  geom_line(aes(date, sum_retail)) +
  theme_bw() +
  ggtitle("Retail Sum over 6 months") +
  theme(legend.position="none")+
  ylab("Retail Sum") +
  xlab("Date")


p2<-data%>%
  ggplot() +
  geom_line( aes(date, log(sum_retail)  )) +
  theme_bw() +
  ggtitle("Log of Retail Sum over 6 months") +
  ylab("Retail Sum") +
  theme(legend.position="none")+
  xlab("Date")


p3<-data%>%
  ggplot() +
  geom_line(aes(date, BoxCox(sum_retail,lambda='auto')  )) +
  theme_bw() +
  ggtitle("Box Cox::Retail Sum over 6 months") +
  ylab("Retail Sum") +
  theme(legend.position="none")+
  xlab("Date")

gridExtra::grid.arrange(p1, p2, p3, nrow = 1)

data <- data %>%
  mutate(LogRetail = log(sum_retail)) %>%
  mutate(LogRetailLag1 = LogRetail - lag(LogRetail))  # first difference of the log series
data %>%
  ggplot()+
      geom_line(aes(date,LogRetailLag1))+
      theme_bw()+
      ggtitle("Log Retail Sum(First Difference)")+
      ylab("Log Retail Lag1")+
      xlab("Date")
## Warning: Removed 1 row(s) containing missing values (geom_path).

Augmented Dickey-Fuller Test

library(tseries) # Use tseries package for adf.test function
## Warning: package 'tseries' was built under R version 4.1.2
test <- na.omit(data$LogRetailLag1)
adf.test(test)  # differenced log retail sum
## Warning in adf.test(test): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  test
## Dickey-Fuller = -7.4104, Lag order = 4, p-value = 0.01
## alternative hypothesis: stationary

KPSS Test

kpss.test(test)  # differenced log retail sum
## Warning in kpss.test(test): p-value greater than printed p-value
## 
##  KPSS Test for Level Stationarity
## 
## data:  test
## KPSS Level = 0.027199, Truncation lag parameter = 4, p-value = 0.1

KPSS Test (preferred)

The null hypothesis of the KPSS test is stationarity, so a p-value above 0.05 (here reported only as > 0.1) means we fail to reject stationarity.

Augmented Dickey-Fuller (ADF) Test

The null hypothesis of the ADF test is a unit root (non-stationarity), so a p-value below 0.05 (here 0.01) supports stationarity.

Seasonality Analysis (Visual)

The data clearly has seasonality, and it is most likely monthly: the first few days of each month usually show a peak, followed by a dip. Coming from a retail background, the most likely reason is that customers are usually paid at the beginning of the month and therefore have more spending power then than towards the end of it. (A quick day-of-month profile is sketched below.)

This points towards monthly seasonality. I also suspect a mild decreasing trend in the time series, though this could instead reflect yearly seasonality: sales usually go up in December, stay high early in the year, and dwindle after the first month, especially towards the middle of the year. However, we would need at least one more year of data to validate any yearly seasonality.
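As a quick visual check of the start-of-month peak, here is a sketch (not part of the original analysis) that averages the daily retail sum by day of month, using the dplyr, lubridate, and ggplot2 packages already loaded above:

# Sketch: mean daily retail sum by day of month; a peak around days 1-5
# would support the pay-cycle explanation above.
data %>%
  mutate(day_of_month = mday(date)) %>%
  group_by(day_of_month) %>%
  summarize(mean_retail = mean(sum_retail, na.rm = TRUE)) %>%
  ggplot(aes(day_of_month, mean_retail)) +
  geom_col() +
  theme_bw() +
  xlab("Day of month") +
  ylab("Mean daily retail sum")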

ACF and PACF plots

acf(test, lag.max = 20)   # ACF of the differenced log series

pacf(test, lag.max = 20)  # PACF of the differenced log series

From the PACF plot we observe an AR(1) signature, and from the ACF we do not observe any clear MA behaviour. We will shortly validate this visual analysis with statistical model comparison.

Section 2

Fitting some ARIMA Models:

Comparing the BIC and AIC of some candidate ARIMA models:

BIC(
  arima(data$LogRetail,order=c(0,1,1)),
  arima(data$LogRetail,order=c(0,1,2)),
  arima(data$LogRetail,order=c(0,1,3)),
  arima(data$LogRetail,order=c(0,1,4)),
  arima(data$LogRetail,order=c(0,1,5)),
  arima(data$LogRetail,order=c(1,1,1)),
  arima(data$LogRetail,order=c(1,1,2)),
  arima(data$LogRetail,order=c(1,1,3)),
  arima(data$LogRetail,order=c(1,1,4)),
  arima(data$LogRetail,order=c(1,1,5))
)
##                                           df      BIC
## arima(data$LogRetail, order = c(0, 1, 1))  2 401.9657
## arima(data$LogRetail, order = c(0, 1, 2))  3 403.9194
## arima(data$LogRetail, order = c(0, 1, 3))  4 407.3440
## arima(data$LogRetail, order = c(0, 1, 4))  5 411.9966
## arima(data$LogRetail, order = c(0, 1, 5))  6 415.2577
## arima(data$LogRetail, order = c(1, 1, 1))  3 403.4402
## arima(data$LogRetail, order = c(1, 1, 2))  4 407.9830
## arima(data$LogRetail, order = c(1, 1, 3))  5 412.0165
## arima(data$LogRetail, order = c(1, 1, 4))  6 410.9173
## arima(data$LogRetail, order = c(1, 1, 5))  7 415.2849
AIC(
  arima(data$LogRetail,order=c(0,1,1)),
  arima(data$LogRetail,order=c(0,1,2)),
  arima(data$LogRetail,order=c(0,1,3)),
  arima(data$LogRetail,order=c(0,1,4)),
  arima(data$LogRetail,order=c(0,1,5)),
  arima(data$LogRetail,order=c(1,1,1)),
  arima(data$LogRetail,order=c(1,1,2)),
  arima(data$LogRetail,order=c(1,1,3)),
  arima(data$LogRetail,order=c(1,1,4)),
  arima(data$LogRetail,order=c(1,1,5))
)
##                                           df      AIC
## arima(data$LogRetail, order = c(0, 1, 1))  2 396.6014
## arima(data$LogRetail, order = c(0, 1, 2))  3 395.8730
## arima(data$LogRetail, order = c(0, 1, 3))  4 396.6154
## arima(data$LogRetail, order = c(0, 1, 4))  5 398.5860
## arima(data$LogRetail, order = c(0, 1, 5))  6 399.1650
## arima(data$LogRetail, order = c(1, 1, 1))  3 395.3938
## arima(data$LogRetail, order = c(1, 1, 2))  4 397.2545
## arima(data$LogRetail, order = c(1, 1, 3))  5 398.6059
## arima(data$LogRetail, order = c(1, 1, 4))  6 394.8245
## arima(data$LogRetail, order = c(1, 1, 5))  7 396.5100

Auto ARIMA

auto.arima(data$LogRetail, stationary = FALSE, allowdrift = FALSE,
           seasonal = FALSE, stepwise = FALSE, approximation = FALSE)
## Series: data$LogRetail 
## ARIMA(1,1,1) 
## 
## Coefficients:
##           ar1      ma1
##       -0.1944  -0.8588
## s.e.   0.1068   0.0642
## 
## sigma^2 = 2.161:  log likelihood = -194.7
## AIC=395.39   AICc=395.62   BIC=403.44

Best Model

From the comparisons above, ARIMA(0,1,1) gives the lowest BIC, while the automated search selects ARIMA(1,1,1), which also has close to the lowest AIC. We proceed with ARIMA(1,1,1) as the best model for this data.
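The same comparison can also be done programmatically. This is a sketch (not in the original) that scans the same (p, q) grid used above and reports the order with the smallest BIC; note that arima() can occasionally fail to converge for some orders:

# Sketch: grid-search the (p, 1, q) orders compared above and pick min BIC
orders <- expand.grid(p = 0:1, q = 1:5)
bics <- mapply(function(p, q) BIC(arima(data$LogRetail, order = c(p, 1, q))),
               orders$p, orders$q)
orders[which.min(bics), ]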

Residual Diagnostics on Best Model

library('forecast')
best_mod = arima(data$LogRetail,order=c(1,1,1))
forecast::checkresiduals(best_mod)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,1,1)
## Q* = 4.8185, df = 8, p-value = 0.7768
## 
## Model df: 2.   Total lags used: 10

Ljung-Box Test Results

par(mfrow = c(1, 2))  # ACF and PACF side by side
resid <- best_mod$residuals
acf(resid, lag.max = 20)
pacf(resid, lag.max = 20)

From the ACF and PACF plots, we do not observe any remaining autocorrelation in the residuals. The Ljung-Box test further confirms that the residuals from the best-fit model are not autocorrelated, since the p-value (0.78) is well above 0.05.

So we can go ahead with ARIMA(1,1,1) as our best-fit model.

Prediction Using the Best-Fit Model

resid <- best_mod$residuals
pred <- data$LogRetail - resid  # in-sample fitted values on the log scale
ggplot()+
  geom_line(aes(data$date,data$LogRetail))+
  geom_line(aes(data$date,pred),color='blue',alpha=0.4)+
  theme_bw()+
  xlab("Date")+
  ylab("Log(Retail)")

Model Accuracy:

# Back-transform from the log scale before computing the error in INR
RMSE <- sqrt(mean((exp(pred) - exp(data$LogRetail))^2, na.rm = TRUE))
RMSE
## [1] 146085.8

The in-sample RMSE suggests that, after back-transforming to the original scale, our model's predictions of the daily retail sum are off by roughly 1.5 lakh INR on average.

Relative to typical daily totals, the model does not reproduce the retail sum accurately and is off by a large margin.
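To put that RMSE in context, here is a sketch (not in the original) comparing it against a naive baseline that predicts each day with the previous day's value:

# Sketch: RMSE of a naive "yesterday's value" forecast on the same scale;
# if the ARIMA RMSE is not clearly lower, the model adds little value.
naive_pred <- dplyr::lag(data$LogRetail)
naive_RMSE <- sqrt(mean((exp(naive_pred) - exp(data$LogRetail))^2, na.rm = TRUE))
naive_RMSE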

Building Forecasts

We built a forecast five time periods ahead (in our case, five days), and it appears to fit reasonably well. But is this model good enough to forecast further future time points? Judging by the in-sample RMSE, the straightforward answer is no.

best_mod %>%
  forecast(h=5) %>% 
  autoplot()
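Since the model is fit on the log scale, the forecasts above are log values. Here is a sketch (not in the original) of reading them back in INR; note that exp() of the point forecast gives the median on the original scale, not the mean:

# Sketch: back-transform the 5-step-ahead forecast from log scale to INR
fc <- forecast(best_mod, h = 5)
exp(fc$mean)             # point forecasts in INR (median on original scale)
exp(fc$lower[, "95%"])   # lower 95% interval bound in INR
exp(fc$upper[, "95%"])   # upper 95% interval bound in INR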

Suggestions to improve performance:

Collect more data, extending continuously into earlier time periods. This would let the model learn the seasonal structure better and forecast better, especially in capturing the peaks and spikes.