Reading the Libraries
library(data.table)
library(ggplot2)
library(dplyr)
library(forecast)
library(zoo)
library(plotly)
library(lubridate)
library(reshape2)
library(tseries)
library(urca)
library(randomForest)
options(scipen=999)
Loading the data
test <-fread("C:/Users/6430/Desktop/Project/test.csv/test.csv")
train<-fread("C:/Users/6430/Desktop/Project/train.csv/train.csv")
Store<- fread("C:/Users/6430/Desktop/Project/store.csv/store.csv")
First look at the data
str(train)
## Classes 'data.table' and 'data.frame': 1017209 obs. of 9 variables:
## $ Store : int 1 2 3 4 5 6 7 8 9 10 ...
## $ DayOfWeek : int 5 5 5 5 5 5 5 5 5 5 ...
## $ Date : chr "2015-07-31" "2015-07-31" "2015-07-31" "2015-07-31" ...
## $ Sales : int 5263 6064 8314 13995 4822 5651 15344 8492 8565 7185 ...
## $ Customers : int 555 625 821 1498 559 589 1414 833 687 681 ...
## $ Open : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Promo : int 1 1 1 1 1 1 1 1 1 1 ...
## $ StateHoliday : chr "0" "0" "0" "0" ...
## $ SchoolHoliday: int 1 1 1 1 1 1 1 1 1 1 ...
## - attr(*, ".internal.selfref")=<externalptr>
str(test)
## Classes 'data.table' and 'data.frame': 41088 obs. of 8 variables:
## $ Id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Store : int 1 3 7 8 9 10 11 12 13 14 ...
## $ DayOfWeek : int 4 4 4 4 4 4 4 4 4 4 ...
## $ Date : chr "2015-09-17" "2015-09-17" "2015-09-17" "2015-09-17" ...
## $ Open : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Promo : int 1 1 1 1 1 1 1 1 1 1 ...
## $ StateHoliday : chr "0" "0" "0" "0" ...
## $ SchoolHoliday: int 0 0 0 0 0 0 0 0 0 0 ...
## - attr(*, ".internal.selfref")=<externalptr>
str(Store)
## Classes 'data.table' and 'data.frame': 1115 obs. of 10 variables:
## $ Store : int 1 2 3 4 5 6 7 8 9 10 ...
## $ StoreType : chr "c" "a" "a" "c" ...
## $ Assortment : chr "a" "a" "a" "c" ...
## $ CompetitionDistance : int 1270 570 14130 620 29910 310 24000 7520 2030 3160 ...
## $ CompetitionOpenSinceMonth: int 9 11 12 9 4 12 4 10 8 9 ...
## $ CompetitionOpenSinceYear : int 2008 2007 2006 2009 2015 2013 2013 2014 2000 2009 ...
## $ Promo2 : int 0 1 1 0 0 0 0 0 0 0 ...
## $ Promo2SinceWeek : int NA 13 14 NA NA NA NA NA NA NA ...
## $ Promo2SinceYear : int NA 2010 2011 NA NA NA NA NA NA NA ...
## $ PromoInterval : chr "" "Jan,Apr,Jul,Oct" "Jan,Apr,Jul,Oct" "" ...
## - attr(*, ".internal.selfref")=<externalptr>
train[, Date := as.Date(Date)]
test[, Date := as.Date(Date)]
train <- train[order(Date)]
test <- test[order(Date)]
summary(train)
## Store DayOfWeek Date Sales
## Min. : 1.0 Min. :1.000 Min. :2013-01-01 Min. : 0
## 1st Qu.: 280.0 1st Qu.:2.000 1st Qu.:2013-08-17 1st Qu.: 3727
## Median : 558.0 Median :4.000 Median :2014-04-02 Median : 5744
## Mean : 558.4 Mean :3.998 Mean :2014-04-11 Mean : 5774
## 3rd Qu.: 838.0 3rd Qu.:6.000 3rd Qu.:2014-12-12 3rd Qu.: 7856
## Max. :1115.0 Max. :7.000 Max. :2015-07-31 Max. :41551
## Customers Open Promo StateHoliday
## Min. : 0.0 Min. :0.0000 Min. :0.0000 Length:1017209
## 1st Qu.: 405.0 1st Qu.:1.0000 1st Qu.:0.0000 Class :character
## Median : 609.0 Median :1.0000 Median :0.0000 Mode :character
## Mean : 633.1 Mean :0.8301 Mean :0.3815
## 3rd Qu.: 837.0 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :7388.0 Max. :1.0000 Max. :1.0000
## SchoolHoliday
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.1786
## 3rd Qu.:0.0000
## Max. :1.0000
summary(test)
## Id Store DayOfWeek Date
## Min. : 1 Min. : 1.0 Min. :1.000 Min. :2015-08-01
## 1st Qu.:10273 1st Qu.: 279.8 1st Qu.:2.000 1st Qu.:2015-08-12
## Median :20545 Median : 553.5 Median :4.000 Median :2015-08-24
## Mean :20545 Mean : 555.9 Mean :3.979 Mean :2015-08-24
## 3rd Qu.:30816 3rd Qu.: 832.2 3rd Qu.:6.000 3rd Qu.:2015-09-05
## Max. :41088 Max. :1115.0 Max. :7.000 Max. :2015-09-17
##
## Open Promo StateHoliday SchoolHoliday
## Min. :0.0000 Min. :0.0000 Length:41088 Min. :0.0000
## 1st Qu.:1.0000 1st Qu.:0.0000 Class :character 1st Qu.:0.0000
## Median :1.0000 Median :0.0000 Mode :character Median :0.0000
## Mean :0.8543 Mean :0.3958 Mean :0.4435
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000
## NA's :11
test[is.na(test$Open), ] # Only store 622
## Id Store DayOfWeek Date Open Promo StateHoliday SchoolHoliday
## 1: 10752 622 6 2015-09-05 NA 0 0 0
## 2: 9040 622 1 2015-09-07 NA 0 0 0
## 3: 8184 622 2 2015-09-08 NA 0 0 0
## 4: 7328 622 3 2015-09-09 NA 0 0 0
## 5: 6472 622 4 2015-09-10 NA 0 0 0
## 6: 5616 622 5 2015-09-11 NA 0 0 0
## 7: 4760 622 6 2015-09-12 NA 0 0 0
## 8: 3048 622 1 2015-09-14 NA 1 0 0
## 9: 2192 622 2 2015-09-15 NA 1 0 0
## 10: 1336 622 3 2015-09-16 NA 1 0 0
## 11: 480 622 4 2015-09-17 NA 1 0 0
test$Open[test$Store == 622]
## [1] 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0
## [24] 1 1 1 1 1 1 0 1 1 1 1 1 NA 0 NA NA NA NA NA NA 0 NA NA
## [47] NA NA
The test set has 41088 rows of date while the train set has 1017209 rows. Additonally store 622 has 11 missing values for the “open” column, so to predict correctly I have decided to input “1” for open column of store 622. Otherwise our prediction will not be correct.
Further, the whole “Customers” column is not availab“le in test date but that will be fixed by combining the two data set (”test" and “Store”) at later stage.
test[is.na(test)] <- 1 # Treating the closed store as open in test data
Also when we compare train and test data, we find all test stores are available in train data but 259 stores are not in test data
sum(unique(test$Store) %in% unique(train$Store))
## [1] 856
sum(!(unique(train$Store) %in% unique(test$Store)))
## [1] 259
table(train$StateHoliday) / nrow(train) # State holidays contribution in train data
##
## 0 a b c
## 0.969475300 0.019917244 0.006576820 0.004030637
table(test$StateHoliday) / nrow(test) # This indicates that there is no Easter or Xmas holidays in test data
##
## 0 a
## 0.995619159 0.004380841
table(train$SchoolHoliday) / nrow(train) # School holiday Contribution in train data
##
## 0 1
## 0.8213533 0.1786467
table(test$SchoolHoliday) / nrow(test) # School holiday contribution in test data
##
## 0 1
## 0.5565129 0.4434871
Now let’s examine the columns in train dataset that are most relevant for our EDA in train data. We will not include the stores with sales=0 as the stores is either closed or just didn’t have any sale for some other reasons and includsion of those stores would be biased.
hist(train$Sales, 40)
hist(aggregate(train[Sales != 0]$Sales,
by = list(train[Sales != 0]$Store), mean)$x, 75,
main = "Mean sales of each store when not closed")
hist(train$Customers, 40)
hist(aggregate(train[Sales != 0]$Customers,
by = list(train[Sales != 0]$Store), mean)$x, 100,
main = "Mean customers/store when not closed")
Sales more than 20,000 looks to be outliers. We will check the data in ordet to make sure if the data really exhibit outliers or it is just the case for some stores.
boxplot(train$Sales)
Store_outliers <- subset(train[train$Sales > 20000, ])
Store_outliers
## Store DayOfWeek Date Sales Customers Open Promo StateHoliday
## 1: 513 3 2013-01-02 20445 2424 1 0 0
## 2: 523 3 2013-01-02 20493 2424 1 0 0
## 3: 817 3 2013-01-02 25357 3462 1 0 0
## 4: 842 3 2013-01-02 20355 1257 1 0 0
## 5: 1114 3 2013-01-02 20642 3401 1 0 0
## ---
## 4095: 562 5 2015-07-31 20303 3255 1 1 0
## 4096: 756 5 2015-07-31 25280 2901 1 1 0
## 4097: 817 5 2015-07-31 23093 2884 1 1 0
## 4098: 842 5 2015-07-31 23996 1362 1 1 0
## 4099: 1114 5 2015-07-31 27508 3745 1 1 0
## SchoolHoliday
## 1: 1
## 2: 1
## 3: 1
## 4: 1
## 5: 1
## ---
## 4095: 1
## 4096: 1
## 4097: 1
## 4098: 0
## 4099: 1
It is evident from the above data that high sales is just the case for few stores and it is also spread across the different dates i.e not particularly a case of outliers.
As predicted sales and customers are quite positively related. Also it is obvious to think that Promo and Sales will be corelated as generally promo tend to atttract more customers and as a result customers tend to spend more. Let’s support this through the correlation test available in R.
tapply(train$Sales,train$DayOfWeek,mean)#Sale is highest on Sunday and Monday and almost even on other days
## 1 2 3 4 5 6 7
## 7809.0445 7005.2445 6555.8841 6247.5759 6723.2743 5847.5626 204.1832
with(train[train$Sales != 0 & train$Promo == 0], mean(Sales / Customers))# No promotion
## [1] 8.941128
with(train[train$Sales != 0 & train$Promo == 1], mean(Sales / Customers))# With Promotion
## [1] 10.17896
There are few Stores that were open but didn’t have any sales. This could be due to tht fact that some customers just did window shopping, checked the deals or items and didn’t buy anything.
train[Open == 1 & Sales == 0]
## Store DayOfWeek Date Sales Customers Open Promo StateHoliday
## 1: 762 4 2013-01-17 0 0 1 0 0
## 2: 232 4 2013-01-24 0 0 1 1 0
## 3: 339 3 2013-01-30 0 0 1 0 0
## 4: 339 4 2013-01-31 0 0 1 0 0
## 5: 259 4 2013-02-07 0 0 1 1 0
## 6: 353 6 2013-03-16 0 0 1 0 0
## 7: 948 4 2013-04-25 0 5 1 1 0
## 8: 589 1 2013-04-29 0 0 1 1 0
## 9: 364 2 2013-05-07 0 0 1 0 0
## 10: 364 3 2013-05-08 0 0 1 0 0
## 11: 681 5 2013-05-10 0 0 1 0 0
## 12: 700 3 2013-06-05 0 0 1 1 0
## 13: 665 5 2013-06-28 0 0 1 0 0
## 14: 665 6 2013-06-29 0 0 1 0 0
## 15: 1039 2 2013-07-09 0 0 1 0 0
## 16: 1039 3 2013-07-10 0 0 1 0 0
## 17: 927 4 2013-08-08 0 0 1 0 0
## 18: 391 3 2013-08-28 0 0 1 1 0
## 19: 663 1 2013-09-02 0 0 1 0 0
## 20: 983 5 2014-01-17 0 0 1 0 0
## 21: 983 6 2014-01-18 0 0 1 0 0
## 22: 623 5 2014-01-24 0 0 1 1 0
## 23: 623 6 2014-01-25 0 0 1 0 0
## 24: 25 3 2014-02-12 0 0 1 0 0
## 25: 25 4 2014-02-13 0 0 1 0 0
## 26: 327 3 2014-03-12 0 0 1 0 0
## 27: 986 2 2014-03-18 0 0 1 1 0
## 28: 850 6 2014-03-29 0 0 1 0 0
## 29: 661 5 2014-04-04 0 0 1 1 0
## 30: 1100 2 2014-04-29 0 3 1 1 0
## 31: 1100 3 2014-04-30 0 0 1 1 0
## 32: 1017 3 2014-06-04 0 0 1 1 0
## 33: 1017 4 2014-06-05 0 0 1 1 0
## 34: 57 2 2014-07-01 0 0 1 1 0
## 35: 925 4 2014-07-03 0 0 1 1 0
## 36: 102 6 2014-07-12 0 0 1 0 0
## 37: 882 3 2014-07-23 0 0 1 0 0
## 38: 887 3 2014-07-23 0 0 1 0 0
## 39: 102 4 2014-07-24 0 0 1 0 0
## 40: 238 4 2014-07-24 0 0 1 0 0
## 41: 303 4 2014-07-24 0 0 1 0 0
## 42: 387 4 2014-07-24 0 0 1 0 0
## 43: 28 2 2014-09-02 0 0 1 1 0
## 44: 28 3 2014-09-03 0 0 1 1 0
## 45: 28 4 2014-09-04 0 0 1 1 0
## 46: 548 5 2014-09-05 0 0 1 1 0
## 47: 835 3 2014-09-10 0 0 1 0 0
## 48: 227 4 2014-09-11 0 0 1 0 0
## 49: 835 4 2014-09-11 0 0 1 0 0
## 50: 357 1 2014-09-22 0 0 1 0 0
## 51: 708 3 2014-10-01 0 0 1 1 0
## 52: 699 4 2015-02-05 0 0 1 1 0
## 53: 674 4 2015-03-26 0 0 1 0 0
## 54: 971 5 2015-05-15 0 0 1 0 0
## Store DayOfWeek Date Sales Customers Open Promo StateHoliday
## SchoolHoliday
## 1: 0
## 2: 0
## 3: 0
## 4: 0
## 5: 0
## 6: 0
## 7: 0
## 8: 0
## 9: 0
## 10: 0
## 11: 0
## 12: 0
## 13: 0
## 14: 0
## 15: 0
## 16: 0
## 17: 1
## 18: 1
## 19: 1
## 20: 0
## 21: 0
## 22: 0
## 23: 0
## 24: 0
## 25: 0
## 26: 0
## 27: 0
## 28: 0
## 29: 0
## 30: 0
## 31: 0
## 32: 0
## 33: 0
## 34: 0
## 35: 0
## 36: 0
## 37: 1
## 38: 0
## 39: 1
## 40: 1
## 41: 1
## 42: 1
## 43: 1
## 44: 1
## 45: 0
## 46: 1
## 47: 0
## 48: 0
## 49: 0
## 50: 0
## 51: 0
## 52: 0
## 53: 0
## 54: 1
## SchoolHoliday
Feature Selection Now let’s see the effect of other paramaters on Sales such as StoreType, AssortmentType etc.
We will merge the two dataset “train” and “Store” by the Store column.
train_store <- merge(train, Store, by = "Store")
ggplot(train_store[Sales != 0], aes(x = factor(PromoInterval), y = Sales)) +
geom_jitter(alpha = 0.1) +
geom_boxplot(color = "yellow", outlier.colour = NA, fill = NA)
ggplot(train_store[Sales != 0],
aes(x = as.Date(Date), y = Sales, color = factor(StoreType))) +
geom_smooth(size = 2)
Sales of all the stores over the given period
qplot(Date, Sales, data = train_store[Sales!=0],
geom = "smooth", span =0.5)+
scale_x_date(date_breaks="3 months", date_labels = "%m/%y")
Distribution of four different store types and sales pattern
ggplot(train_store, aes(StoreType, fill= StoreType)) +geom_bar()+
ylab("Store count of total store") +
ggtitle("Distribution of avilable StoreTypes")
Distribution of available Assortment types and sales
ggplot(train_store, aes(Assortment, fill= Assortment)) +
geom_bar()+xlab("AssortmentType")+ggtitle("Distribution of available AssortmentTypes")
ggplot(train_store[Sales != 0], aes(x = Assortment , y = Sales, fill= Assortment)) +
geom_boxplot() + scale_y_continuous(breaks = seq(0,100000,5000))+
ggtitle("Boxplot showing the effect of Assortment Type on Sales")
Effect of Single day Promotion on Sales
ggplot(train_store, aes(x = Promo2 , y = Sales, color = factor (Promo2))) +
geom_boxplot() + scale_y_continuous(breaks = seq(0,100000,10000))+
scale_x_continuous(breaks = seq(0,1,1))+xlab("Promotion on/off")+
ylab("Sales of Stores")+
ggtitle("Boxplot of the effect of the promotion on sales")
Effect of Competition on Sales since Competition Started. Surprisingly competition staretd in year 1900.
qplot(factor(CompetitionOpenSinceYear), Sales, data = train_store,
fill = factor(CompetitionOpenSinceYear),geom = "boxplot")+
scale_y_continuous(breaks = seq(0,100000,5000))+xlab("Competition Year")
Sales Trend of any chosen store on open days
ggplot(train_store[Store == 256])+geom_line(aes(x= Date, y = Sales))+
scale_y_continuous(breaks = seq(0,100000,4000))+xlab("Timeline")+
ggtitle("Sales trend of a chosen store only for open days")
DayofWeek feature:
ggplot(train_store[Sales != 0], aes(x = weekdays(DayOfWeek), y = Sales,
fill = factor(weekdays(DayOfWeek))))+ geom_boxplot()+
xlab("Sales distribution of each weekday")
Effect of StateHoliday on Sales
ggplot(train_store[Sales != 0], aes(x = StateHoliday, y = Sales)) +
geom_boxplot() + scale_y_continuous(breaks = seq(0,100000,5000))
Type B stores never closes even on Sunday, we will examine Type B stores sales data with time
df<-train_store
subdf=subset(df, StoreType=="b")
subdf$Year <- format(subdf$Date, "%Y")
subdf$Month <- format(subdf$Date, "%b")
subdf$Day <- format(subdf$Date, "%d")
train_store$Day<-format(train_store$Date, "%d")
subdf$MonthDay <- format(subdf$Date, "%d-%b")
subdf$CommonDate <- as.Date(paste0("2013-",format(subdf$Date, "%j")), "%Y-%j")
qplot(x= Date, Sales, data = subdf,
color = factor(Store),geom = "line")+
scale_x_date(date_breaks="3 months", date_labels = "%m/%y")
Distribution of Promotion on each Day of Month
ggplot(train_store, aes(x = Day, y=Promo,
fill = factor(Day)))+geom_bar(stat="identity")+
xlab("Days of month")+ylab("No of days on which promotion is offered")+
ggtitle("promotion distribution on the days of months")
Weekly Average Sales Distribution on Store 85
qplot(Date, Sales, data = subdf[Store==85], geom ="line", col= as.factor(Year))+
scale_x_date(date_breaks="1 week", date_labels = "%W")+xlab("week of year")+
ggtitle("Average weekly sales of each year")
Learnings from the above EDA