DATA CLEANSING

Load packages

library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.0.2
library(sqldf)
## Warning: package 'sqldf' was built under R version 4.0.2
## Loading required package: gsubfn
## Warning: package 'gsubfn' was built under R version 4.0.2
## Loading required package: proto
## Warning: package 'proto' was built under R version 4.0.2
## Loading required package: RSQLite
## Warning: package 'RSQLite' was built under R version 4.0.2
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Import train data

train <- read.csv("train.csv")

Show the structure of train data

str(train)
## 'data.frame':    1017209 obs. of  9 variables:
##  $ Store        : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ DayOfWeek    : int  5 5 5 5 5 5 5 5 5 5 ...
##  $ Date         : chr  "2015-07-31" "2015-07-31" "2015-07-31" "2015-07-31" ...
##  $ Sales        : int  5263 6064 8314 13995 4822 5651 15344 8492 8565 7185 ...
##  $ Customers    : int  555 625 821 1498 559 589 1414 833 687 681 ...
##  $ Open         : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Promo        : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ StateHoliday : chr  "0" "0" "0" "0" ...
##  $ SchoolHoliday: int  1 1 1 1 1 1 1 1 1 1 ...

Some data fields should be converted to a more suitable data type for the convenience of the explanatory process.

Change the data type of “Date” from “char” to “date”

train$Date <- as.Date(as.character(train$Date))

Factorize categorical data fields in train data

train$DayOfWeek <- as.factor(as.integer(train$DayOfWeek))
train$StateHoliday <- as.factor(as.character(train$StateHoliday))
train$Open <- as.factor(as.character(train$Open))
train$Promo <- as.factor(as.character(train$Promo))
train$SchoolHoliday <- as.factor(as.character(train$SchoolHoliday))

Check if there’s any NA to deal with

table (complete.cases (train))
## 
##    TRUE 
## 1017209

no NULL values found

summary(train)
##      Store        DayOfWeek       Date                Sales      
##  Min.   :   1.0   1:144730   Min.   :2013-01-01   Min.   :    0  
##  1st Qu.: 280.0   2:145664   1st Qu.:2013-08-17   1st Qu.: 3727  
##  Median : 558.0   3:145665   Median :2014-04-02   Median : 5744  
##  Mean   : 558.4   4:145845   Mean   :2014-04-11   Mean   : 5774  
##  3rd Qu.: 838.0   5:145845   3rd Qu.:2014-12-12   3rd Qu.: 7856  
##  Max.   :1115.0   6:144730   Max.   :2015-07-31   Max.   :41551  
##                   7:144730                                       
##    Customers      Open       Promo      StateHoliday SchoolHoliday
##  Min.   :   0.0   0:172817   0:629129   0:986159     0:835488     
##  1st Qu.: 405.0   1:844392   1:388080   a: 20260     1:181721     
##  Median : 609.0                         b:  6690                  
##  Mean   : 633.1                         c:  4100                  
##  3rd Qu.: 837.0                                                   
##  Max.   :7388.0                                                   
## 

Now the train data is ready for the analysis process. We will move to cleaning the store data.

Assign column names for store data

colNames <- c ("Store", "StoreType",    "Assortment",   "CompetitionDistance",  
              "CompetitionOpenSinceMonth", "CompetitionOpenSinceYear",
              "PromoContinuation",  "PromoParticipationSinceWeek",  
              "PromoParticipationSinceYear", "PromoInterval")

Import store data

store <- read.table ("store.csv", header = TRUE, sep = ",",
                       strip.white = TRUE, col.names = colNames,
                       na.strings = "?", stringsAsFactors = TRUE)

Show the structure of the store data

str (store)
## 'data.frame':    1115 obs. of  10 variables:
##  $ Store                      : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ StoreType                  : Factor w/ 4 levels "a","b","c","d": 3 1 1 3 1 1 1 1 1 1 ...
##  $ Assortment                 : Factor w/ 3 levels "a","b","c": 1 1 1 3 1 1 3 1 3 1 ...
##  $ CompetitionDistance        : int  1270 570 14130 620 29910 310 24000 7520 2030 3160 ...
##  $ CompetitionOpenSinceMonth  : int  9 11 12 9 4 12 4 10 8 9 ...
##  $ CompetitionOpenSinceYear   : int  2008 2007 2006 2009 2015 2013 2013 2014 2000 2009 ...
##  $ PromoContinuation          : int  0 1 1 0 0 0 0 0 0 0 ...
##  $ PromoParticipationSinceWeek: int  NA 13 14 NA NA NA NA NA NA NA ...
##  $ PromoParticipationSinceYear: int  NA 2010 2011 NA NA NA NA NA NA NA ...
##  $ PromoInterval              : Factor w/ 4 levels "","Feb,May,Aug,Nov",..: 1 3 3 1 1 1 1 1 1 1 ...

Identify NA cases

table (complete.cases (store))
## 
## FALSE  TRUE 
##   750   365

Take a look at the table summary to identify the NAs

summary(store)
##      Store        StoreType Assortment CompetitionDistance
##  Min.   :   1.0   a:602     a:593      Min.   :   20.0    
##  1st Qu.: 279.5   b: 17     b:  9      1st Qu.:  717.5    
##  Median : 558.0   c:148     c:513      Median : 2325.0    
##  Mean   : 558.0   d:348                Mean   : 5404.9    
##  3rd Qu.: 836.5                        3rd Qu.: 6882.5    
##  Max.   :1115.0                        Max.   :75860.0    
##                                        NA's   :3          
##  CompetitionOpenSinceMonth CompetitionOpenSinceYear PromoContinuation
##  Min.   : 1.000            Min.   :1900             Min.   :0.0000   
##  1st Qu.: 4.000            1st Qu.:2006             1st Qu.:0.0000   
##  Median : 8.000            Median :2010             Median :1.0000   
##  Mean   : 7.225            Mean   :2009             Mean   :0.5121   
##  3rd Qu.:10.000            3rd Qu.:2013             3rd Qu.:1.0000   
##  Max.   :12.000            Max.   :2015             Max.   :1.0000   
##  NA's   :354               NA's   :354                               
##  PromoParticipationSinceWeek PromoParticipationSinceYear          PromoInterval
##  Min.   : 1.0                Min.   :2009                                :544  
##  1st Qu.:13.0                1st Qu.:2011                Feb,May,Aug,Nov :130  
##  Median :22.0                Median :2012                Jan,Apr,Jul,Oct :335  
##  Mean   :23.6                Mean   :2012                Mar,Jun,Sept,Dec:106  
##  3rd Qu.:37.0                3rd Qu.:2013                                      
##  Max.   :50.0                Max.   :2015                                      
##  NA's   :544                 NA's   :544

Replace the NAs in Competition Distance by its median

store$CompetitionDistance[is.na(store$CompetitionDistance)] <- median(store$CompetitionDistance, na.rm=TRUE)

Replace the remaining NA’s by 0

store[is.na(store)] <- 0

EXPLORATION

Join train and store tables to further explore other correlations between the data fields of 2 tables.

train_store <- merge(train, store, by = "Store")

For graph to display number in full (E.g. 1000000 instead of 10e6)

options("scipen" = 10)

For our exploratory data analysis, we will look into the relationship between Sales and other data fields.

Sales vs Store

Here we made a vector containing the mean sales of 1115 stores

MeanSalesPerStore <- vector(mode = "numeric",length = 1115)
for (i in 1:1115) {
  MeanSalesPerStore[i] <- mean(train_store$Sales[train_store$Store==i])
}
match(max(MeanSalesPerStore),MeanSalesPerStore) 
## [1] 262
match(min(MeanSalesPerStore),MeanSalesPerStore) 
## [1] 307

Store 262 has the max mean sales. Store 307 has the min mean sales.

hist(MeanSalesPerStore,100)

summary(MeanSalesPerStore)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2245    4412    5459    5763    6634   20719

Average sales mostly range from 2500 - 7500. The outliers are stores with over 10000 in mean sales. Sales vary from stores to stores –> Strong predictor value.

Sales vs DayOfWeek

boxplot(Sales ~ DayOfWeek,data=train_store)

Sales of day 7 of the week (Sunday) is extremely low compared to other dates. –> Strong predictor value.

We will take a closer look at Day 7.

Day7Sales <- subset(train_store,DayOfWeek==7)
summary(Day7Sales) 
##      Store        DayOfWeek       Date                Sales        
##  Min.   :   1.0   1:     0   Min.   :2013-01-06   Min.   :    0.0  
##  1st Qu.: 280.0   2:     0   1st Qu.:2013-08-18   1st Qu.:    0.0  
##  Median : 558.0   3:     0   Median :2014-03-30   Median :    0.0  
##  Mean   : 558.4   4:     0   Mean   :2014-04-11   Mean   :  204.2  
##  3rd Qu.: 838.0   5:     0   3rd Qu.:2014-12-14   3rd Qu.:    0.0  
##  Max.   :1115.0   6:     0   Max.   :2015-07-26   Max.   :37376.0  
##                   7:144730                                         
##    Customers       Open       Promo      StateHoliday SchoolHoliday StoreType
##  Min.   :   0.00   0:141137   0:144730   0:144421     0:142006      a:78484  
##  1st Qu.:   0.00   1:  3593   1:     0   a:   309     1:  2724      b: 2252  
##  Median :   0.00                         b:     0                   c:19468  
##  Mean   :  35.79                         c:     0                   d:44526  
##  3rd Qu.:   0.00                                                             
##  Max.   :5145.00                                                             
##                                                                              
##  Assortment CompetitionDistance CompetitionOpenSinceMonth
##  a:76472    Min.   :   20       Min.   : 0.000           
##  b: 1180    1st Qu.:  710       1st Qu.: 0.000           
##  c:67078    Median : 2325       Median : 4.000           
##             Mean   : 5422       Mean   : 4.927           
##             3rd Qu.: 6880       3rd Qu.: 9.000           
##             Max.   :75860       Max.   :12.000           
##                                                          
##  CompetitionOpenSinceYear PromoContinuation PromoParticipationSinceWeek
##  Min.   :   0             Min.   :0.0000    Min.   : 0.00              
##  1st Qu.:   0             1st Qu.:0.0000    1st Qu.: 0.00              
##  Median :2006             Median :1.0000    Median : 1.00              
##  Mean   :1370             Mean   :0.5006    Mean   :11.65              
##  3rd Qu.:2011             3rd Qu.:1.0000    3rd Qu.:22.00              
##  Max.   :2015             Max.   :1.0000    Max.   :50.00              
##                                                                        
##  PromoParticipationSinceYear          PromoInterval  
##  Min.   :   0                                :72272  
##  1st Qu.:   0                Feb,May,Aug,Nov :16874  
##  Median :2009                Jan,Apr,Jul,Oct :41718  
##  Mean   :1007                Mar,Jun,Sept,Dec:13866  
##  3rd Qu.:2012                                        
##  Max.   :2015                                        
## 

Look at “Sales” data, we see that most stores have 0 sales on Sunday, and this is because 97.5% (141137/144730) of the records indicated that the stores were closed on those dates.

Now we will check if the stores that were open on Day 7 had sales or not.

summary(subset(Day7Sales,Open==1))
##      Store        DayOfWeek      Date                Sales         Customers   
##  Min.   :  85.0   1:   0    Min.   :2013-01-06   Min.   :  286   Min.   :  40  
##  1st Qu.: 310.0   2:   0    1st Qu.:2013-08-18   1st Qu.: 3314   1st Qu.: 438  
##  Median : 530.0   3:   0    Median :2014-05-04   Median : 6876   Median :1262  
##  Mean   : 573.9   4:   0    Mean   :2014-04-21   Mean   : 8225   Mean   :1442  
##  3rd Qu.: 769.0   5:   0    3rd Qu.:2014-12-07   3rd Qu.:11418   3rd Qu.:2175  
##  Max.   :1099.0   6:   0    Max.   :2015-07-26   Max.   :37376   Max.   :5145  
##                   7:3593                                                       
##  Open     Promo    StateHoliday SchoolHoliday StoreType Assortment
##  0:   0   0:3593   0:3591       0:3511        a:1100    a:1018    
##  1:3593   1:   0   a:   2       1:  82        b:2216    b:1170    
##                    b:   0                     c:   0    c:1405    
##                    c:   0                     d: 277              
##                                                                   
##                                                                   
##                                                                   
##  CompetitionDistance CompetitionOpenSinceMonth CompetitionOpenSinceYear
##  Min.   :   90       Min.   : 0.000            Min.   :   0            
##  1st Qu.:  840       1st Qu.: 0.000            1st Qu.:   0            
##  Median : 1410       Median : 4.000            Median :2006            
##  Mean   :10623       Mean   : 4.389            Mean   :1199            
##  3rd Qu.:18160       3rd Qu.: 9.000            3rd Qu.:2012            
##  Max.   :75860       Max.   :12.000            Max.   :2014            
##                                                                        
##  PromoContinuation PromoParticipationSinceWeek PromoParticipationSinceYear
##  Min.   :0.0000    Min.   : 0.000              Min.   :   0               
##  1st Qu.:0.0000    1st Qu.: 0.000              1st Qu.:   0               
##  Median :0.0000    Median : 0.000              Median :   0               
##  Mean   :0.2797    Mean   : 5.413              Mean   : 563               
##  3rd Qu.:1.0000    3rd Qu.: 5.000              3rd Qu.:2012               
##  Max.   :1.0000    Max.   :48.000              Max.   :2014               
##                                                                           
##           PromoInterval 
##                  :2588  
##  Feb,May,Aug,Nov : 133  
##  Jan,Apr,Jul,Oct : 664  
##  Mar,Jun,Sept,Dec: 208  
##                         
##                         
## 

All stores that were opened on Day 7 had sales.

Holiday might be a factor of store closure, so we will check that as follow:

sqldf("select Open, sum(StateHoliday), sum(SchoolHoliday) from Day7Sales group by Open")
##   Open sum(StateHoliday) sum(SchoolHoliday)
## 1    0                 0               2642
## 2    1                 0                 82

On all Day 7 records, no stores open on State Holiday. 2642 records that was on School Holiday indicated that the store was closed, while in total we have 141137 closed stores. Therefore, holidays are not a strong factor of store closure like we assumed.

Sales vs Date

ggplot(train_store, aes(x=Date,y=Sales)) + geom_smooth() 
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

Sales increased from 2013 to 2015 –> Strong predictor value.

Sales vs Customers

ggplot(train_store, aes(x=Date,y=Customers)) + geom_smooth() 
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

Customers increased from 2013 to 2015. –> Customers and Sales are positively correlated.

Sales vs StateHoliday

boxplot(Sales ~ StateHoliday,data=train_store)

Sales are significantly lower on holidays. –> Strong predictor value.

Now we want to see which holidays had the most sales.

OpenOnHoliday <- subset(train_store,Open==1)
mean_a <- mean(OpenOnHoliday$Sales[OpenOnHoliday$StateHoliday=="a"])
mean_b <- mean(OpenOnHoliday$Sales[OpenOnHoliday$StateHoliday=="b"])
mean_c <- mean(OpenOnHoliday$Sales[OpenOnHoliday$StateHoliday=="c"])
barplot(c(mean_a,mean_b,mean_c),main="Mean sales on each holiday",names.arg = c("a","b","c"))

Within 910 stores that are open on holiday, public holiday (a) saw the highest sale (8487.471 - mean_a), significantly more than Easter (b) (9887.89 - mean_b) and Christmas (c) (9743.746 - mean_c).

Sales vs SchoolHoliday

boxplot(Sales ~ SchoolHoliday,data=train_store)

Sales on School Holiday were just slightly higher than that of non-School Holiday. Therefore, School Holiday is not a strong predictor value.

Sales vs StoreType

boxplot(Sales ~ StoreType,data=train_store)

Type b has the highest mean sales –> Strong predictor value.

Determine the sales of each PromoInterval

boxplot(Sales ~ PromoInterval, data = train_store,
        main = "Sales based on the PromoInterval",
        xlab = "PromoInterval", ylab = "Sales", col = "blue")

Overall, all intervals share relatively same mean, quartiles, and minimum and maximum values. However, The “Jan, Apr, Jun, Oct” interval had the highest mean and number of outliers.

Take a closer look at the “Jan, Apr, Jun, Oct” interval

summary(train_store[train_store$PromoInterval == "Jan,Apr,Jul,Oct",]$Sales)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0    3680    5610    5562    7614   37646
sqldf("SELECT PromoInterval, COUNT(PromoInterval) FROM train_store WHERE Sales > 7614 + 1.5 * (7614-3680) GROUP BY PromoInterval")
##      PromoInterval COUNT(PromoInterval)
## 1                                 22590
## 2  Feb,May,Aug,Nov                 1572
## 3  Jan,Apr,Jul,Oct                 6320
## 4 Mar,Jun,Sept,Dec                 1334

The “Jan, Apr, Jun, Oct” interval’s number of outliers are more than 5 times bigger than those of other intervals.

Distribution of each assortment

boxplot(Sales ~ Assortment,data=train_store)

We can see that assortment b had the best sales among three assortments.

Determine the sales of each assortment in the timeline

ggplot(train_store["Sales" != 0], 
       aes(x = as.Date(Date), y = Sales, color = factor(Assortment))) + 
  geom_smooth(size = 2)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

Assortment b (b>c>a) always had higher sales than the other assortments.

Let’s see if the Average sales per assortment of b assortment would be the same

avg_a <- mean(train_store$Sales[train_store$Assortment == "a"])
avg_b <- mean(train_store$Sales[train_store$Assortment == "b"])
avg_c <- mean(train_store$Sales[train_store$Assortment == "c"])
barplot(c(avg_a,avg_b,avg_c), main = "Average sales per assortment", names.arg = c("a","b","c"))

Yes, even its average sales is higher than the others’ average sales.

Customers per assortment

cust_a <- sum(train_store$Customers[train_store$Assortment == "a"])
cust_b <- sum(train_store$Customers[train_store$Assortment == "b"])
cust_c <- sum(train_store$Customers[train_store$Assortment == "c"])
barplot(c(cust_a,cust_b,cust_c), main = "Customers per assortment", names.arg = c("a","b","c"))

Furthermore, the number of customers who bought b are extremely low compare to other assortments’ –> B’s sales performance was far better thanthose of the other two.

Move on to the correlation between Competition Distance and Sales

plot(Sales ~ CompetitionDistance, train_store)

There is a clear correlation between sales and competition distance. The plot indicates that the closer the competitor, the lower the sales.

Days since start of promo2

!! Need to find a way to leverage the data promo2 !!

PromoContinuation vs Sales

boxplot(Sales ~ PromoContinuation, data = train_store,
        main = "Sales based on the PromoContinuation",
        xlab = "PromoContinuation", ylab = "Sales", col = "yellow")

Sales when having a 2nd Promo were less than without a 2nd Promo but not significant

Since there is 0 sales on closed days, I want to specifically look at the promo data on Open days

row_to_keep = which(as.integer(train_store$Open) > 0)
openday <- train_store[row_to_keep,]
summary(openday)
##      Store        DayOfWeek       Date                Sales      
##  Min.   :   1.0   1:144730   Min.   :2013-01-01   Min.   :    0  
##  1st Qu.: 280.0   2:145664   1st Qu.:2013-08-17   1st Qu.: 3727  
##  Median : 558.0   3:145665   Median :2014-04-02   Median : 5744  
##  Mean   : 558.4   4:145845   Mean   :2014-04-11   Mean   : 5774  
##  3rd Qu.: 838.0   5:145845   3rd Qu.:2014-12-12   3rd Qu.: 7856  
##  Max.   :1115.0   6:144730   Max.   :2015-07-31   Max.   :41551  
##                   7:144730                                       
##    Customers      Open       Promo      StateHoliday SchoolHoliday StoreType 
##  Min.   :   0.0   0:172817   0:629129   0:986159     0:835488      a:551627  
##  1st Qu.: 405.0   1:844392   1:388080   a: 20260     1:181721      b: 15830  
##  Median : 609.0                         b:  6690                   c:136840  
##  Mean   : 633.1                         c:  4100                   d:312912  
##  3rd Qu.: 837.0                                                              
##  Max.   :7388.0                                                              
##                                                                              
##  Assortment CompetitionDistance CompetitionOpenSinceMonth
##  a:537445   Min.   :   20       Min.   : 0.000           
##  b:  8294   1st Qu.:  710       1st Qu.: 0.000           
##  c:471470   Median : 2325       Median : 4.000           
##             Mean   : 5422       Mean   : 4.927           
##             3rd Qu.: 6880       3rd Qu.: 9.000           
##             Max.   :75860       Max.   :12.000           
##                                                          
##  CompetitionOpenSinceYear PromoContinuation PromoParticipationSinceWeek
##  Min.   :   0             Min.   :0.0000    Min.   : 0.00              
##  1st Qu.:   0             1st Qu.:0.0000    1st Qu.: 0.00              
##  Median :2006             Median :1.0000    Median : 1.00              
##  Mean   :1370             Mean   :0.5006    Mean   :11.65              
##  3rd Qu.:2011             3rd Qu.:1.0000    3rd Qu.:22.00              
##  Max.   :2015             Max.   :1.0000    Max.   :50.00              
##                                                                        
##  PromoParticipationSinceYear          PromoInterval   
##  Min.   :   0                                :508031  
##  1st Qu.:   0                Feb,May,Aug,Nov :118596  
##  Median :2009                Jan,Apr,Jul,Oct :293122  
##  Mean   :1007                Mar,Jun,Sept,Dec: 97460  
##  3rd Qu.:2012                                         
##  Max.   :2015                                         
## 

The number of two categories of PromoContinuation are nearly equal, which is good for the comparison of sales between the two.

We compare sales between promo day and not promo day

ggplot(openday["Sales" != 0], 
       aes(x = as.Date(Date), y = Sales, color = factor(Promo))) + 
  geom_smooth(size = 2) + xlab("Date")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

promoY <- mean(train_store$Sales[train_store$Promo == 1])
promoN <- mean(train_store$Sales[train_store$Promo == 0])
barplot(c(promoY,promoN), main = "Average sales per Promo", names.arg = c("1","0"))

Sales nearly doubled when there was a promo on that day.