email: tarunasudhakar@gmail.com college: SSN College of Engineering(16-20)

This is the Capstone project of Data Analytics with managerial applications under professor Dr. Sameer Mathur which aims to do an introductory analysis of customer purchase patterns in a drugstore based in the US.

The dataset can be found at: https://www.kaggle.com/c/rossmann-store-sales/data

drugs <- read.csv(file="capstone.csv",head=TRUE,sep=",")
nrow(drugs)
## [1] 942
ncol(drugs)
## [1] 7

hence there are 942 rows and 7 columns in this data set.

drugs$Date<- as.Date(drugs$Date,'%m/%d/%Y')
drugs$DateYear <- as.numeric(format(drugs$Date, format = "%Y"))
drugs$DateMonth <- as.numeric(format(drugs$Date, format = "%m"))
drugs$DateDay <- as.numeric(format(drugs$Date, format = "%d"))

thus new columns have been added to separately represent year, month and date.

# Adding additional Quarter Column to specify financial quarter of purchase.

drugs$Quarter[drugs$DateMonth < 7] <- 1
drugs$Quarter[drugs$DateMonth > 6] <- 2

# Converting them to factors.

drugs$DateYear<- factor(drugs$DateYear)
drugs$DateMonth<- factor(drugs$DateMonth)
drugs$DateDay<- factor(drugs$DateDay)
drugs$Quarter<- factor(drugs$Quarter)

ncol(drugs)
## [1] 11

Thus there are 11 useful columns;DayofWeek,Sales,Customers,Promo, Schoolholiday,Quarter,DateMonth, DateYear,DateDay,Date,Open

View(drugs)
drugs$DayOfWeek<- factor(drugs$DayOfWeek)
drugs$Open<- factor(drugs$Open)
drugs$Promo<- factor(drugs$Promo)
drugs$SchoolHoliday<- factor(drugs$SchoolHoliday)
summary(drugs)
##  DayOfWeek      Date                Sales        Customers      Open   
##  1:134     Min.   :2013-01-01   Min.   :   0   Min.   :   0.0   0:161  
##  2:135     1st Qu.:2013-08-24   1st Qu.:3588   1st Qu.: 463.0   1:781  
##  3:135     Median :2014-04-16   Median :4374   Median : 529.0          
##  4:135     Mean   :2014-04-16   Mean   :3946   Mean   : 467.6          
##  5:135     3rd Qu.:2014-12-07   3rd Qu.:5220   3rd Qu.: 598.8          
##  6:134     Max.   :2015-07-31   Max.   :9528   Max.   :1130.0          
##  7:134                                                                 
##  Promo   SchoolHoliday DateYear     DateMonth      DateDay    Quarter
##  0:582   0:749         2013:365   1      : 93   1      : 31   1:543  
##  1:360   1:193         2014:365   3      : 93   2      : 31   2:399  
##                        2015:212   5      : 93   3      : 31          
##                                   7      : 93   4      : 31          
##                                   4      : 90   5      : 31          
##                                   6      : 90   6      : 31          
##                                   (Other):390   (Other):756

BOXPLOTS

library(lattice)

boxplot(drugs$Sales ~drugs$DayOfWeek ,col = "Brown",
        xlab = "day of week (1-7)", ylab ="Total sales", main = "Effect of day of the week on Total Sales.")

Note:The drug store is closed on sundays and hence zero sales are recorded.

boxplot(drugs$Sales ~drugs$DateMonth ,col = "yellow",
        xlab = "Month", ylab ="Total Sales", main = "Effect of Month on Total Sales.")

highest sales are recorded during the month of december.

boxplot(drugs$Sales ~drugs$DateYear ,col = "grey",
        xlab = "year", ylab ="Sales", main = "Effect of year on Total Sales")

boxplot(drugs$Sales ~drugs$Promo ,col = "grey",
        xlab = "promotional offer (Y/N)", ylab =" Sales", main = "Effect of promotional offers on Total Sales")

Thus there is a significant increase in sales when there are promotional offers available.

boxplot(drugs$Sales ~drugs$SchoolHoliday ,col = "pink",
        xlab = "school holiday (Y/N)", ylab =" Sales", main = "Effect of school holidays on Total Sales")

boxplot(drugs$Sales ~drugs$Quarter ,col = "pink",
        xlab = "Financial quarter
        
        
        ", ylab =" Sales", main = "Effect of financial quarter on Total Sales")

boxplot(drugs$Customers ~drugs$DayOfWeek ,col = "Pink",
        xlab = "Day of week", ylab ="Number of customers", main = "Effect of day of week on number of customers")

boxplot(drugs$Customers ~drugs$Quarter ,col = "Pink",
        xlab = "Financial quarter", ylab ="Number of customers", main = "Effect of financial quarter on number of customers")

boxplot(drugs$Customers ~drugs$Promo ,col = "Pink",
        xlab = "Promo (Y/n)", ylab ="Number of customers", main = "Effect of promotional offers on number of customers")

hist(drugs$Sales, col ="pink")

hist(drugs$Customers, col="yellow")

Countday <- table(drugs$DayOfWeek )
barplot(Countday, main="Distribution of Sales according to day of the week (mon-sun) in log scale.",log='y',
        names.arg=rownames(Countday), cex.names=0.8, col="Red")

CountDateMonth <- table(drugs$DateMonth )
barplot(CountDateMonth, main="Distribution of sales according to Month in log scale.",log='y',
        names.arg=rownames(CountDateMonth), cex.names=0.8, col="Orange")

CountDateDay <- table(drugs$DateDay )
barplot(CountDateDay, main="Distribution of sales according to Date in log scale.",log='y',
        names.arg=rownames(CountDateDay), cex.names=0.8, col="Cyan")

library(car)

plot(drugs$Sales~drugs$Customers,spread=FALSE, smoother.args=list(lty=2), pch=19, main="plot of log of Total Sales per day vs. log of Customers per day ", log="xy", col = "Orange",xlab="Customers", ylab="Total Sales")
## Warning in xy.coords(x, y, xlabel, ylabel, log): 161 x values <= 0 omitted
## from logarithmic plot
## Warning in xy.coords(x, y, xlabel, ylabel, log): 161 y values <= 0 omitted
## from logarithmic plot
## Warning in plot.window(...): "spread" is not a graphical parameter
## Warning in plot.window(...): "smoother.args" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "spread" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "smoother.args" is not a graphical
## parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "spread" is
## not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...):
## "smoother.args" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "spread" is
## not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...):
## "smoother.args" is not a graphical parameter
## Warning in box(...): "spread" is not a graphical parameter
## Warning in box(...): "smoother.args" is not a graphical parameter
## Warning in title(...): "spread" is not a graphical parameter
## Warning in title(...): "smoother.args" is not a graphical parameter

thus there exists a linear relationship between sales and number of customers.

COVARIANCE AND CORRELATION

drugshop <- drugs[,c(3,4)]
covdrug <-cov(drugshop)
covdrug
##               Sales Customers
## Sales     4061909.1 454166.49
## Customers  454166.5  52409.33
cordrug <- cov2cor(covdrug)
cordrug
##               Sales Customers
## Sales     1.0000000 0.9843412
## Customers 0.9843412 1.0000000
library("corrgram") 
corrgram(drugs, order=FALSE, lower.panel=panel.shade,
         upper.panel=panel.pie, text.panel=panel.txt,
         main="Corrgram of variables")

scatterplotMatrix(formula = ~ drugs$Sales + drugs$Customers, cex=0.6,
                  data=drugs, main = "Effect of Number of  customers on daily sales." )

pairs(~DayOfWeek+Customers +Sales+DateDay + DateYear + DateMonth + Quarter+Promo+SchoolHoliday, data= drugs, main=" Scatterplot Matrix of the Ecommerce transactions.")

T test between total sales and School holidays

t.test(drugs$Sales~ drugs$SchoolHoliday)
## 
##  Welch Two Sample t-test
## 
## data:  drugs$Sales by drugs$SchoolHoliday
## t = -0.22852, df = 287.27, p-value = 0.8194
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -369.0290  292.2526
## sample estimates:
## mean in group 0 mean in group 1 
##        3937.840        3976.228

As p-value=0.8194(>0.05).We can accept the null hypothesis for SchoolHoliday column. Therefore There is no strong correlation between Sales and school holidays.

Regression model

reg<-lm(Sales~DayOfWeek+Promo+SchoolHoliday+DateMonth+DateYear-1,data=drugs)
summary(reg)
## 
## Call:
## lm(formula = Sales ~ DayOfWeek + Promo + SchoolHoliday + DateMonth + 
##     DateYear - 1, data = drugs)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5337.5  -320.3    39.7   400.0  4266.9 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## DayOfWeek1     4424.126    147.106  30.074  < 2e-16 ***
## DayOfWeek2     4133.651    146.209  28.272  < 2e-16 ***
## DayOfWeek3     3946.863    145.674  27.094  < 2e-16 ***
## DayOfWeek4     3597.525    145.011  24.809  < 2e-16 ***
## DayOfWeek5     4018.872    145.812  27.562  < 2e-16 ***
## DayOfWeek6     5145.602    139.622  36.854  < 2e-16 ***
## DayOfWeek7      196.238    140.260   1.399   0.1621    
## Promo1         1352.651     76.229  17.745  < 2e-16 ***
## SchoolHoliday1 -180.494     93.958  -1.921   0.0550 .  
## DateMonth2      133.744    151.281   0.884   0.3769    
## DateMonth3      146.444    146.342   1.001   0.3172    
## DateMonth4     -150.014    145.642  -1.030   0.3033    
## DateMonth5     -324.055    147.532  -2.197   0.0283 *  
## DateMonth6     -236.477    148.744  -1.590   0.1122    
## DateMonth7        6.707    144.490   0.046   0.9630    
## DateMonth8     -220.199    168.563  -1.306   0.1918    
## DateMonth9     -419.526    167.151  -2.510   0.0122 *  
## DateMonth10    -301.747    163.843  -1.842   0.0658 .  
## DateMonth11     195.381    168.460   1.160   0.2464    
## DateMonth12    1017.513    163.912   6.208 8.13e-10 ***
## DateYear2014   -176.682     72.892  -2.424   0.0155 *  
## DateYear2015   -360.053     90.621  -3.973 7.65e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 984.6 on 920 degrees of freedom
## Multiple R-squared:  0.9518, Adjusted R-squared:  0.9506 
## F-statistic:   825 on 22 and 920 DF,  p-value: < 2.2e-16

Results

-> Highest sales occur in the month of December. -> there exists a strong relation between sales and the existence of promotional offers.