This analysis uses the Rossmann Stores Sales data provided for the Kaggle competition to forecast 6 weeks of sales for their 1115 stores.

Reading the Data:

library(data.table)
library(ggplot2)
library(forecast)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## Loading required package: timeDate
## This is forecast 6.2
library(xts)
## 
## Attaching package: 'xts'
## 
## The following object is masked from 'package:data.table':
## 
##     last
library(dygraphs)
library(caret)
## Warning: package 'caret' was built under R version 3.2.3
## Loading required package: lattice
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.2.3
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.

The files provided are in .csv format. The data.table library is used in order to speed up the reading and exploring process.

store <- fread("store.csv")
train <- fread("train.csv")
test <- fread("test.csv")

The data format and other features can be viewed by using either of the commands.

#summary(store)
#summary(train)
#summary(test)
str(store)
## Classes 'data.table' and 'data.frame':   1115 obs. of  10 variables:
##  $ Store                    : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ StoreType                : chr  "c" "a" "a" "c" ...
##  $ Assortment               : chr  "a" "a" "a" "c" ...
##  $ CompetitionDistance      : int  1270 570 14130 620 29910 310 24000 7520 2030 3160 ...
##  $ CompetitionOpenSinceMonth: int  9 11 12 9 4 12 4 10 8 9 ...
##  $ CompetitionOpenSinceYear : int  2008 2007 2006 2009 2015 2013 2013 2014 2000 2009 ...
##  $ Promo2                   : int  0 1 1 0 0 0 0 0 0 0 ...
##  $ Promo2SinceWeek          : int  NA 13 14 NA NA NA NA NA NA NA ...
##  $ Promo2SinceYear          : int  NA 2010 2011 NA NA NA NA NA NA NA ...
##  $ PromoInterval            : chr  "" "Jan,Apr,Jul,Oct" "Jan,Apr,Jul,Oct" "" ...
##  - attr(*, ".internal.selfref")=<externalptr>
str(train)
## Classes 'data.table' and 'data.frame':   1017209 obs. of  9 variables:
##  $ Store        : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ DayOfWeek    : int  5 5 5 5 5 5 5 5 5 5 ...
##  $ Date         : chr  "2015-07-31" "2015-07-31" "2015-07-31" "2015-07-31" ...
##  $ Sales        : int  5263 6064 8314 13995 4822 5651 15344 8492 8565 7185 ...
##  $ Customers    : int  555 625 821 1498 559 589 1414 833 687 681 ...
##  $ Open         : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Promo        : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ StateHoliday : chr  "0" "0" "0" "0" ...
##  $ SchoolHoliday: chr  "1" "1" "1" "1" ...
##  - attr(*, ".internal.selfref")=<externalptr>
str(test)
## Classes 'data.table' and 'data.frame':   41088 obs. of  8 variables:
##  $ Id           : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Store        : int  1 3 7 8 9 10 11 12 13 14 ...
##  $ DayOfWeek    : int  4 4 4 4 4 4 4 4 4 4 ...
##  $ Date         : chr  "2015-09-17" "2015-09-17" "2015-09-17" "2015-09-17" ...
##  $ Open         : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Promo        : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ StateHoliday : chr  "0" "0" "0" "0" ...
##  $ SchoolHoliday: chr  "0" "0" "0" "0" ...
##  - attr(*, ".internal.selfref")=<externalptr>

It can be observed that the Date is not in Date format so we need to convert it to Date format for a time based analysis.

train[,Date := as.Date(Date)]
test[,Date := as.Date(Date)]
train <- setorderv(train,"Date")
test <- setorderv(test,"Date")
#summary(train)
#summary(test)
#str(train)
#str(test)
train[is.na(train$Open)]
## Empty data.table (0 rows) of 9 cols: Store,DayOfWeek,Date,Sales,Customers,Open...
test[is.na(test$Open)]
##        Id Store DayOfWeek       Date Open Promo StateHoliday SchoolHoliday
##  1: 10752   622         6 2015-09-05   NA     0            0             0
##  2:  9040   622         1 2015-09-07   NA     0            0             0
##  3:  8184   622         2 2015-09-08   NA     0            0             0
##  4:  7328   622         3 2015-09-09   NA     0            0             0
##  5:  6472   622         4 2015-09-10   NA     0            0             0
##  6:  5616   622         5 2015-09-11   NA     0            0             0
##  7:  4760   622         6 2015-09-12   NA     0            0             0
##  8:  3048   622         1 2015-09-14   NA     1            0             0
##  9:  2192   622         2 2015-09-15   NA     1            0             0
## 10:  1336   622         3 2015-09-16   NA     1            0             0
## 11:   480   622         4 2015-09-17   NA     1            0             0

There are no NA’s in “Open Coloumn” for the train data but few NA’s in the test data and only store number 622. Since no other store is closed on these days, it can be assumed that the store is open source.

test[is.na(test)] <- 1

First, we need to check if all the stores data is given on all the days in the train dataset. For this, the data,table .N is used and also plotted using ggplot.

sum_stores <- train[,.(storecount = .N),by = Date]
ggplot(sum_stores,aes(Date,storecount)) + geom_line()

From the plot we can clearly see that for a period of 6 months, ie; (July to Dec 2014) some stores data is missing. We can get the dates by the following commands.

sum_stores.less <- sum_stores[sum_stores$storecount !=1115]
head(sum_stores.less)
##          Date storecount
## 1: 2013-01-01       1114
## 2: 2014-07-01        935
## 3: 2014-07-02        935
## 4: 2014-07-03        935
## 5: 2014-07-04        935
## 6: 2014-07-05        935

We can see that there are 935 stores for the 6 month period and also single store missing on the starting day which we missed from the plot (can be seen if we observe carefully).

#1-7-2014 to 31-12-2014
sum_stores.less.935 <- sum_stores.less[sum_stores.less$storecount ==935]
head(sum_stores.less.935)
##          Date storecount
## 1: 2014-07-01        935
## 2: 2014-07-02        935
## 3: 2014-07-03        935
## 4: 2014-07-04        935
## 5: 2014-07-05        935
## 6: 2014-07-06        935
sum_stores.less.1134 <- sum_stores.less[sum_stores.less$storecount !=935]
head(sum_stores.less.1134)
##          Date storecount
## 1: 2013-01-01       1114

For the prediction to be modelled better, the missing data should be imputed. The missing store id’s can be found.

store_ids <- unique(train$Store) 
store_ids.jul1 <- train$Store[train$Date == as.Date("2014-7-1")]
store_ids.missing <- data.frame( store_ids[!(store_ids %in% store_ids.jul1)])
head(store_ids.missing)
##   store_ids...store_ids..in..store_ids.jul1..
## 1                                          13
## 2                                          20
## 3                                          22
## 4                                          32
## 5                                          36
## 6                                          41
store_ids.jan1 <- train$Store[train$Date == as.Date("2013-01-01")]
store_ids.jan1.missing <- data.frame( store_ids[!(store_ids %in% store_ids.jan1)])
head(store_ids.jan1.missing)
##   store_ids...store_ids..in..store_ids.jan1..
## 1                                         988

Imputing Missing data:

There are many ways to impute missing data. We can impute with the mean of the values of the other data.First, we will impute the missing data for Jan 1.

d.jan1 <- as.Date("2013-1-1")
jan1 <- train[train$Date == d.jan1]
jan1[,.(.N),by = jan1$Open]
##    jan1    N
## 1:    0 1097
## 2:    1   17

So only 0.015% of the stores are open so it is safe to assume that the store 988 is closed. Based on this assumption, we can impute the missing data for Jan1, store 988

jan1.date <- d.jan1
jan1.store <- as.integer(store_ids.jan1.missing)
jan1.dayofweek <- unique(train$DayOfWeek[train$Date == d.jan1])
jan1.open <- 0
jan1.sales <- 0
jan1.customers <- 0
jan1.promo <- unique(train$Promo[train$Date == d.jan1])
jan1.stateholiday <- unique(train$StateHoliday[train$Date == d.jan1])
jan1.schoolholiday <- unique(train$SchoolHoliday[train$Date == d.jan1])

jan1.row.988 <- data.table(Store = jan1.store,
                          DayOfWeek = jan1.dayofweek,
                          Date = jan1.date,
                          Sales = as.integer(jan1.sales),
                          Customers = as.integer(jan1.customers),
                          Open = as.integer(jan1.open) ,
                          Promo = jan1.promo,
                          StateHoliday = as.character(jan1.stateholiday),
                          SchoolHoliday = as.character(jan1.schoolholiday))
str(train)
## Classes 'data.table' and 'data.frame':   1017209 obs. of  9 variables:
##  $ Store        : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ DayOfWeek    : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ Date         : Date, format: "2013-01-01" "2013-01-01" ...
##  $ Sales        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Customers    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Open         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Promo        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ StateHoliday : chr  "a" "a" "a" "a" ...
##  $ SchoolHoliday: chr  "1" "1" "1" "1" ...
##  - attr(*, ".internal.selfref")=<externalptr>
str(jan1.row.988)
## Classes 'data.table' and 'data.frame':   1 obs. of  9 variables:
##  $ Store        : int 988
##  $ DayOfWeek    : int 2
##  $ Date         : Date, format: "2013-01-01"
##  $ Sales        : int 0
##  $ Customers    : int 0
##  $ Open         : int 0
##  $ Promo        : int 0
##  $ StateHoliday : chr "a"
##  $ SchoolHoliday: chr "1"
##  - attr(*, ".internal.selfref")=<externalptr>
l <- list(train,jan1.row.988)
train <- rbindlist(l)

Now we need to impute data in to all the stores missing in gap (July’14 - Dec’14). We need to make a data.table to impute all the missing values and then combine with the main train data. The “Date” coloumn in the missing data table can be filled by

missingperiod <- seq(as.Date("2014-7-1"),as.Date("2014-12-31"),by="day")
num_rows_missing <- length(missingperiod)*nrow(store_ids.missing)
mising_datatable <- data.table(Store = integer(num_rows_missing),
                         DayOfWeek = integer(num_rows_missing),
                         Date = rep(missingperiod,nrow(store_ids.missing)),
                         Sales = integer(num_rows_missing),
                         Customers = integer(num_rows_missing),
                         Open = integer(num_rows_missing),
                         Promo = integer(num_rows_missing),
                         StateHoliday = character(num_rows_missing),
                         SchoolHoliday = integer(num_rows_missing)
                        )
#str(mising_datatable
#for( date in mising_datatable){
#  mising_datatable$Store[mising_datatable$Date == date] <- data.frame(store_ids.missing)
#}

Forecasting

I have decided to forecast for a random store “9” since my computational resources are poor.

train_final <- fread("train_filled_gap.csv")
train_final[,Date := as.Date(Date)]
train_final <- setorderv(train_final,"Date")
store9 <- train_final[train_final$Store == 9]
store9 <- setcolorder(store9, c("Date", setdiff(names(store9), "Date")))
store9_test <- test[test$Store == 9]
store9_test <- setcolorder(store9_test, c("Date", setdiff(names(store9_test), "Date")))

#str(store9)
store9.sales <- store9[,list(Date,Sales)]
#ggplot(store9,aes(Date,Sales)) + geom_line()
dygraph(store9.sales)

holiday <- 1 - store9$Open
holidayf <- 1 - store9_test$Open
promo <- store9$Promo
promof <- store9_test$Promo
test_period <- max(store9_test$Date) - min(store9_test$Date) + 1

y <- ts(store9$Sales, frequency=7)
z <- fourier(ts(store9$Sales, frequency=365.25), K=5)
zf <- fourierf(ts(store9$Sales, frequency=365.25), K=5, h=test_period)
fit <- auto.arima(y, xreg=cbind(z,holiday,promo), seasonal=FALSE)
## Warning in auto.arima(y, xreg = cbind(z, holiday, promo), seasonal =
## FALSE): Unable to fit final model using maximum likelihood. AIC value
## approximated
fc <- forecast(fit, xreg=cbind(zf,holidayf,promof), h=test_period)
plot(fc)