Reading the Libraries

library(data.table)
library(ggplot2)
library(dplyr)
library(forecast)
library(zoo)
library(plotly)
library(lubridate)
library(reshape2)
library(tseries)
library(urca)
library(randomForest)
options(scipen=999)

Loading the data

test <-fread("C:/Users/6430/Desktop/Project/test.csv/test.csv")
train<-fread("C:/Users/6430/Desktop/Project/train.csv/train.csv")
Store<- fread("C:/Users/6430/Desktop/Project/store.csv/store.csv")

First look at the data

str(train)
## Classes 'data.table' and 'data.frame':   1017209 obs. of  9 variables:
##  $ Store        : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ DayOfWeek    : int  5 5 5 5 5 5 5 5 5 5 ...
##  $ Date         : chr  "2015-07-31" "2015-07-31" "2015-07-31" "2015-07-31" ...
##  $ Sales        : int  5263 6064 8314 13995 4822 5651 15344 8492 8565 7185 ...
##  $ Customers    : int  555 625 821 1498 559 589 1414 833 687 681 ...
##  $ Open         : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Promo        : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ StateHoliday : chr  "0" "0" "0" "0" ...
##  $ SchoolHoliday: int  1 1 1 1 1 1 1 1 1 1 ...
##  - attr(*, ".internal.selfref")=<externalptr>
str(test)
## Classes 'data.table' and 'data.frame':   41088 obs. of  8 variables:
##  $ Id           : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Store        : int  1 3 7 8 9 10 11 12 13 14 ...
##  $ DayOfWeek    : int  4 4 4 4 4 4 4 4 4 4 ...
##  $ Date         : chr  "2015-09-17" "2015-09-17" "2015-09-17" "2015-09-17" ...
##  $ Open         : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Promo        : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ StateHoliday : chr  "0" "0" "0" "0" ...
##  $ SchoolHoliday: int  0 0 0 0 0 0 0 0 0 0 ...
##  - attr(*, ".internal.selfref")=<externalptr>
str(Store)
## Classes 'data.table' and 'data.frame':   1115 obs. of  10 variables:
##  $ Store                    : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ StoreType                : chr  "c" "a" "a" "c" ...
##  $ Assortment               : chr  "a" "a" "a" "c" ...
##  $ CompetitionDistance      : int  1270 570 14130 620 29910 310 24000 7520 2030 3160 ...
##  $ CompetitionOpenSinceMonth: int  9 11 12 9 4 12 4 10 8 9 ...
##  $ CompetitionOpenSinceYear : int  2008 2007 2006 2009 2015 2013 2013 2014 2000 2009 ...
##  $ Promo2                   : int  0 1 1 0 0 0 0 0 0 0 ...
##  $ Promo2SinceWeek          : int  NA 13 14 NA NA NA NA NA NA NA ...
##  $ Promo2SinceYear          : int  NA 2010 2011 NA NA NA NA NA NA NA ...
##  $ PromoInterval            : chr  "" "Jan,Apr,Jul,Oct" "Jan,Apr,Jul,Oct" "" ...
##  - attr(*, ".internal.selfref")=<externalptr>
train[, Date := as.Date(Date)]
test[, Date := as.Date(Date)]
train <- train[order(Date)]
test <- test[order(Date)]
summary(train)
##      Store          DayOfWeek          Date                Sales      
##  Min.   :   1.0   Min.   :1.000   Min.   :2013-01-01   Min.   :    0  
##  1st Qu.: 280.0   1st Qu.:2.000   1st Qu.:2013-08-17   1st Qu.: 3727  
##  Median : 558.0   Median :4.000   Median :2014-04-02   Median : 5744  
##  Mean   : 558.4   Mean   :3.998   Mean   :2014-04-11   Mean   : 5774  
##  3rd Qu.: 838.0   3rd Qu.:6.000   3rd Qu.:2014-12-12   3rd Qu.: 7856  
##  Max.   :1115.0   Max.   :7.000   Max.   :2015-07-31   Max.   :41551  
##    Customers           Open            Promo        StateHoliday      
##  Min.   :   0.0   Min.   :0.0000   Min.   :0.0000   Length:1017209    
##  1st Qu.: 405.0   1st Qu.:1.0000   1st Qu.:0.0000   Class :character  
##  Median : 609.0   Median :1.0000   Median :0.0000   Mode  :character  
##  Mean   : 633.1   Mean   :0.8301   Mean   :0.3815                     
##  3rd Qu.: 837.0   3rd Qu.:1.0000   3rd Qu.:1.0000                     
##  Max.   :7388.0   Max.   :1.0000   Max.   :1.0000                     
##  SchoolHoliday   
##  Min.   :0.0000  
##  1st Qu.:0.0000  
##  Median :0.0000  
##  Mean   :0.1786  
##  3rd Qu.:0.0000  
##  Max.   :1.0000
summary(test)
##        Id            Store          DayOfWeek          Date           
##  Min.   :    1   Min.   :   1.0   Min.   :1.000   Min.   :2015-08-01  
##  1st Qu.:10273   1st Qu.: 279.8   1st Qu.:2.000   1st Qu.:2015-08-12  
##  Median :20545   Median : 553.5   Median :4.000   Median :2015-08-24  
##  Mean   :20545   Mean   : 555.9   Mean   :3.979   Mean   :2015-08-24  
##  3rd Qu.:30816   3rd Qu.: 832.2   3rd Qu.:6.000   3rd Qu.:2015-09-05  
##  Max.   :41088   Max.   :1115.0   Max.   :7.000   Max.   :2015-09-17  
##                                                                       
##       Open            Promo        StateHoliday       SchoolHoliday   
##  Min.   :0.0000   Min.   :0.0000   Length:41088       Min.   :0.0000  
##  1st Qu.:1.0000   1st Qu.:0.0000   Class :character   1st Qu.:0.0000  
##  Median :1.0000   Median :0.0000   Mode  :character   Median :0.0000  
##  Mean   :0.8543   Mean   :0.3958                      Mean   :0.4435  
##  3rd Qu.:1.0000   3rd Qu.:1.0000                      3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :1.0000                      Max.   :1.0000  
##  NA's   :11
test[is.na(test$Open), ] # Only store 622
##        Id Store DayOfWeek       Date Open Promo StateHoliday SchoolHoliday
##  1: 10752   622         6 2015-09-05   NA     0            0             0
##  2:  9040   622         1 2015-09-07   NA     0            0             0
##  3:  8184   622         2 2015-09-08   NA     0            0             0
##  4:  7328   622         3 2015-09-09   NA     0            0             0
##  5:  6472   622         4 2015-09-10   NA     0            0             0
##  6:  5616   622         5 2015-09-11   NA     0            0             0
##  7:  4760   622         6 2015-09-12   NA     0            0             0
##  8:  3048   622         1 2015-09-14   NA     1            0             0
##  9:  2192   622         2 2015-09-15   NA     1            0             0
## 10:  1336   622         3 2015-09-16   NA     1            0             0
## 11:   480   622         4 2015-09-17   NA     1            0             0
test$Open[test$Store == 622]
##  [1]  1  0  1  1  1  1  1  1  0  1  1  1  1  1  1  0  1  1  1  1  1  1  0
## [24]  1  1  1  1  1  1  0  1  1  1  1  1 NA  0 NA NA NA NA NA NA  0 NA NA
## [47] NA NA

The test set has 41088 rows of date while the train set has 1017209 rows. Additonally store 622 has 11 missing values for the “open” column, so to predict correctly I have decided to input “1” for open column of store 622. Otherwise our prediction will not be correct.

Further, the whole “Customers” column is not availab“le in test date but that will be fixed by combining the two data set (”test" and “Store”) at later stage.

test[is.na(test)] <- 1 # Treating the closed store as open in test data

Also when we compare train and test data, we find all test stores are available in train data but 259 stores are not in test data

sum(unique(test$Store) %in% unique(train$Store))
## [1] 856
sum(!(unique(train$Store) %in% unique(test$Store))) 
## [1] 259
table(train$StateHoliday) / nrow(train) # State holidays contribution in train data
## 
##           0           a           b           c 
## 0.969475300 0.019917244 0.006576820 0.004030637
table(test$StateHoliday) / nrow(test) # This indicates that there is no Easter or Xmas holidays in test data
## 
##           0           a 
## 0.995619159 0.004380841
table(train$SchoolHoliday) / nrow(train) # School holiday Contribution in train data
## 
##         0         1 
## 0.8213533 0.1786467
table(test$SchoolHoliday) / nrow(test) # School holiday contribution in test data
## 
##         0         1 
## 0.5565129 0.4434871

Now let’s examine the columns in train dataset that are most relevant for our EDA in train data. We will not include the stores with sales=0 as the stores is either closed or just didn’t have any sale for some other reasons and includsion of those stores would be biased.

hist(train$Sales, 40)

hist(aggregate(train[Sales != 0]$Sales, 
               by = list(train[Sales != 0]$Store), mean)$x, 75,
     main = "Mean sales of each store when not closed")

hist(train$Customers, 40)

hist(aggregate(train[Sales != 0]$Customers, 
               by = list(train[Sales != 0]$Store), mean)$x, 100,
     main = "Mean customers/store when not closed")

Sales more than 20,000 looks to be outliers. We will check the data in ordet to make sure if the data really exhibit outliers or it is just the case for some stores.

boxplot(train$Sales)

Store_outliers <- subset(train[train$Sales > 20000, ])
Store_outliers
##       Store DayOfWeek       Date Sales Customers Open Promo StateHoliday
##    1:   513         3 2013-01-02 20445      2424    1     0            0
##    2:   523         3 2013-01-02 20493      2424    1     0            0
##    3:   817         3 2013-01-02 25357      3462    1     0            0
##    4:   842         3 2013-01-02 20355      1257    1     0            0
##    5:  1114         3 2013-01-02 20642      3401    1     0            0
##   ---                                                                   
## 4095:   562         5 2015-07-31 20303      3255    1     1            0
## 4096:   756         5 2015-07-31 25280      2901    1     1            0
## 4097:   817         5 2015-07-31 23093      2884    1     1            0
## 4098:   842         5 2015-07-31 23996      1362    1     1            0
## 4099:  1114         5 2015-07-31 27508      3745    1     1            0
##       SchoolHoliday
##    1:             1
##    2:             1
##    3:             1
##    4:             1
##    5:             1
##   ---              
## 4095:             1
## 4096:             1
## 4097:             1
## 4098:             0
## 4099:             1

It is evident from the above data that high sales is just the case for few stores and it is also spread across the different dates i.e not particularly a case of outliers.

As predicted sales and customers are quite positively related. Also it is obvious to think that Promo and Sales will be corelated as generally promo tend to atttract more customers and as a result customers tend to spend more. Let’s support this through the correlation test available in R.

tapply(train$Sales,train$DayOfWeek,mean)#Sale is highest on Sunday and Monday and almost even on other days
##         1         2         3         4         5         6         7 
## 7809.0445 7005.2445 6555.8841 6247.5759 6723.2743 5847.5626  204.1832
with(train[train$Sales != 0 & train$Promo == 0], mean(Sales / Customers))# No promotion
## [1] 8.941128
with(train[train$Sales != 0 & train$Promo == 1], mean(Sales / Customers))# With Promotion
## [1] 10.17896

There are few Stores that were open but didn’t have any sales. This could be due to tht fact that some customers just did window shopping, checked the deals or items and didn’t buy anything.

train[Open == 1 & Sales == 0]
##     Store DayOfWeek       Date Sales Customers Open Promo StateHoliday
##  1:   762         4 2013-01-17     0         0    1     0            0
##  2:   232         4 2013-01-24     0         0    1     1            0
##  3:   339         3 2013-01-30     0         0    1     0            0
##  4:   339         4 2013-01-31     0         0    1     0            0
##  5:   259         4 2013-02-07     0         0    1     1            0
##  6:   353         6 2013-03-16     0         0    1     0            0
##  7:   948         4 2013-04-25     0         5    1     1            0
##  8:   589         1 2013-04-29     0         0    1     1            0
##  9:   364         2 2013-05-07     0         0    1     0            0
## 10:   364         3 2013-05-08     0         0    1     0            0
## 11:   681         5 2013-05-10     0         0    1     0            0
## 12:   700         3 2013-06-05     0         0    1     1            0
## 13:   665         5 2013-06-28     0         0    1     0            0
## 14:   665         6 2013-06-29     0         0    1     0            0
## 15:  1039         2 2013-07-09     0         0    1     0            0
## 16:  1039         3 2013-07-10     0         0    1     0            0
## 17:   927         4 2013-08-08     0         0    1     0            0
## 18:   391         3 2013-08-28     0         0    1     1            0
## 19:   663         1 2013-09-02     0         0    1     0            0
## 20:   983         5 2014-01-17     0         0    1     0            0
## 21:   983         6 2014-01-18     0         0    1     0            0
## 22:   623         5 2014-01-24     0         0    1     1            0
## 23:   623         6 2014-01-25     0         0    1     0            0
## 24:    25         3 2014-02-12     0         0    1     0            0
## 25:    25         4 2014-02-13     0         0    1     0            0
## 26:   327         3 2014-03-12     0         0    1     0            0
## 27:   986         2 2014-03-18     0         0    1     1            0
## 28:   850         6 2014-03-29     0         0    1     0            0
## 29:   661         5 2014-04-04     0         0    1     1            0
## 30:  1100         2 2014-04-29     0         3    1     1            0
## 31:  1100         3 2014-04-30     0         0    1     1            0
## 32:  1017         3 2014-06-04     0         0    1     1            0
## 33:  1017         4 2014-06-05     0         0    1     1            0
## 34:    57         2 2014-07-01     0         0    1     1            0
## 35:   925         4 2014-07-03     0         0    1     1            0
## 36:   102         6 2014-07-12     0         0    1     0            0
## 37:   882         3 2014-07-23     0         0    1     0            0
## 38:   887         3 2014-07-23     0         0    1     0            0
## 39:   102         4 2014-07-24     0         0    1     0            0
## 40:   238         4 2014-07-24     0         0    1     0            0
## 41:   303         4 2014-07-24     0         0    1     0            0
## 42:   387         4 2014-07-24     0         0    1     0            0
## 43:    28         2 2014-09-02     0         0    1     1            0
## 44:    28         3 2014-09-03     0         0    1     1            0
## 45:    28         4 2014-09-04     0         0    1     1            0
## 46:   548         5 2014-09-05     0         0    1     1            0
## 47:   835         3 2014-09-10     0         0    1     0            0
## 48:   227         4 2014-09-11     0         0    1     0            0
## 49:   835         4 2014-09-11     0         0    1     0            0
## 50:   357         1 2014-09-22     0         0    1     0            0
## 51:   708         3 2014-10-01     0         0    1     1            0
## 52:   699         4 2015-02-05     0         0    1     1            0
## 53:   674         4 2015-03-26     0         0    1     0            0
## 54:   971         5 2015-05-15     0         0    1     0            0
##     Store DayOfWeek       Date Sales Customers Open Promo StateHoliday
##     SchoolHoliday
##  1:             0
##  2:             0
##  3:             0
##  4:             0
##  5:             0
##  6:             0
##  7:             0
##  8:             0
##  9:             0
## 10:             0
## 11:             0
## 12:             0
## 13:             0
## 14:             0
## 15:             0
## 16:             0
## 17:             1
## 18:             1
## 19:             1
## 20:             0
## 21:             0
## 22:             0
## 23:             0
## 24:             0
## 25:             0
## 26:             0
## 27:             0
## 28:             0
## 29:             0
## 30:             0
## 31:             0
## 32:             0
## 33:             0
## 34:             0
## 35:             0
## 36:             0
## 37:             1
## 38:             0
## 39:             1
## 40:             1
## 41:             1
## 42:             1
## 43:             1
## 44:             1
## 45:             0
## 46:             1
## 47:             0
## 48:             0
## 49:             0
## 50:             0
## 51:             0
## 52:             0
## 53:             0
## 54:             1
##     SchoolHoliday

Feature Selection Now let’s see the effect of other paramaters on Sales such as StoreType, AssortmentType etc.

We will merge the two dataset “train” and “Store” by the Store column.

train_store <- merge(train, Store, by = "Store")
ggplot(train_store[Sales != 0], aes(x = factor(PromoInterval), y = Sales)) + 
  geom_jitter(alpha = 0.1) + 
  geom_boxplot(color = "yellow", outlier.colour = NA, fill = NA)

ggplot(train_store[Sales != 0], 
       aes(x = as.Date(Date), y = Sales, color = factor(StoreType))) + 
  geom_smooth(size = 2)

Sales of all the stores over the given period

qplot(Date, Sales, data = train_store[Sales!=0],
      geom = "smooth", span =0.5)+
  scale_x_date(date_breaks="3 months", date_labels = "%m/%y")

Distribution of four different store types and sales pattern

ggplot(train_store, aes(StoreType, fill= StoreType)) +geom_bar()+
  ylab("Store count of total store") +
  ggtitle("Distribution of avilable StoreTypes")

Distribution of available Assortment types and sales

ggplot(train_store, aes(Assortment, fill= Assortment)) +
 geom_bar()+xlab("AssortmentType")+ggtitle("Distribution of available AssortmentTypes")

ggplot(train_store[Sales != 0], aes(x = Assortment , y = Sales, fill= Assortment)) + 
 geom_boxplot() + scale_y_continuous(breaks = seq(0,100000,5000))+
ggtitle("Boxplot showing the effect of Assortment Type on Sales")

Effect of Single day Promotion on Sales

 ggplot(train_store, aes(x = Promo2 , y = Sales, color = factor (Promo2))) + 
    
    geom_boxplot() + scale_y_continuous(breaks = seq(0,100000,10000))+
    scale_x_continuous(breaks = seq(0,1,1))+xlab("Promotion on/off")+
    ylab("Sales of Stores")+
    ggtitle("Boxplot of the effect of the promotion on sales")

Effect of Competition on Sales since Competition Started. Surprisingly competition staretd in year 1900.

qplot(factor(CompetitionOpenSinceYear), Sales, data = train_store, 
      fill = factor(CompetitionOpenSinceYear),geom = "boxplot")+
  scale_y_continuous(breaks = seq(0,100000,5000))+xlab("Competition Year")

Sales Trend of any chosen store on open days

  ggplot(train_store[Store == 256])+geom_line(aes(x= Date, y = Sales))+
    scale_y_continuous(breaks = seq(0,100000,4000))+xlab("Timeline")+
    ggtitle("Sales trend of a chosen store only for open days")

DayofWeek feature:

ggplot(train_store[Sales != 0], aes(x = weekdays(DayOfWeek), y = Sales, 
  fill = factor(weekdays(DayOfWeek))))+ geom_boxplot()+
  xlab("Sales distribution of each weekday")

Effect of StateHoliday on Sales

  ggplot(train_store[Sales != 0], aes(x = StateHoliday, y = Sales)) + 
    
    geom_boxplot() + scale_y_continuous(breaks = seq(0,100000,5000))

Type B stores never closes even on Sunday, we will examine Type B stores sales data with time

df<-train_store
subdf=subset(df, StoreType=="b")
subdf$Year <- format(subdf$Date, "%Y")
subdf$Month <- format(subdf$Date, "%b")
subdf$Day <- format(subdf$Date, "%d")
train_store$Day<-format(train_store$Date, "%d")
subdf$MonthDay <- format(subdf$Date, "%d-%b")
subdf$CommonDate <- as.Date(paste0("2013-",format(subdf$Date, "%j")), "%Y-%j")
qplot(x= Date, Sales, data = subdf, 
      color = factor(Store),geom = "line")+
scale_x_date(date_breaks="3 months", date_labels = "%m/%y")

Distribution of Promotion on each Day of Month

ggplot(train_store, aes(x = Day, y=Promo,
 fill = factor(Day)))+geom_bar(stat="identity")+
  xlab("Days of month")+ylab("No of days on which promotion is offered")+
  ggtitle("promotion distribution on the days of months")

Weekly Average Sales Distribution on Store 85

qplot(Date, Sales, data = subdf[Store==85], geom ="line", col= as.factor(Year))+
  scale_x_date(date_breaks="1 week", date_labels = "%W")+xlab("week of year")+
  ggtitle("Average weekly sales of each year")

Learnings from the above EDA