email: tarunasudhakar@gmail.com college: SSN College of Engineering(16-20)

This is the Capstone project of Data Analytics with managerial applications under professor Dr. Sameer Mathur which aims to do an introductory analysis of customer purchase patterns in a drugstore based in the US.

The dataset can be found at: https://www.kaggle.com/c/rossmann-store-sales/data

drugs <- read.csv(file="capstone.csv",head=TRUE,sep=",")
nrow(drugs)
## [1] 942
ncol(drugs)
## [1] 7

hence there are 942 rows and 7 columns in this data set.

drugs$Date<- as.Date(drugs$Date,'%m/%d/%Y')
drugs$DateYear <- as.numeric(format(drugs$Date, format = "%Y"))
drugs$DateMonth <- as.numeric(format(drugs$Date, format = "%m"))
drugs$DateDay <- as.numeric(format(drugs$Date, format = "%d"))

thus new columns have been added to separately represent year, month and date.

# Adding additional Quarter Column to specify financial quarter of purchase.

drugs$Quarter[drugs$DateMonth < 7] <- 1
drugs$Quarter[drugs$DateMonth > 6] <- 2

# Converting them to factors.

drugs$DateYear<- factor(drugs$DateYear)
drugs$DateMonth<- factor(drugs$DateMonth)
drugs$DateDay<- factor(drugs$DateDay)
drugs$Quarter<- factor(drugs$Quarter)

ncol(drugs)
## [1] 11

Thus there are 11 useful columns

View(drugs)
drugs$DayOfWeek<- factor(drugs$DayOfWeek)
drugs$Open<- factor(drugs$Open)
drugs$Promo<- factor(drugs$Promo)
drugs$SchoolHoliday<- factor(drugs$SchoolHoliday)
summary(drugs)
##  DayOfWeek      Date                Sales        Customers      Open   
##  1:134     Min.   :2013-01-01   Min.   :   0   Min.   :   0.0   0:161  
##  2:135     1st Qu.:2013-08-24   1st Qu.:3588   1st Qu.: 463.0   1:781  
##  3:135     Median :2014-04-16   Median :4374   Median : 529.0          
##  4:135     Mean   :2014-04-16   Mean   :3946   Mean   : 467.6          
##  5:135     3rd Qu.:2014-12-07   3rd Qu.:5220   3rd Qu.: 598.8          
##  6:134     Max.   :2015-07-31   Max.   :9528   Max.   :1130.0          
##  7:134                                                                 
##  Promo   SchoolHoliday DateYear     DateMonth      DateDay    Quarter
##  0:582   0:749         2013:365   1      : 93   1      : 31   1:543  
##  1:360   1:193         2014:365   3      : 93   2      : 31   2:399  
##                        2015:212   5      : 93   3      : 31          
##                                   7      : 93   4      : 31          
##                                   4      : 90   5      : 31          
##                                   6      : 90   6      : 31          
##                                   (Other):390   (Other):756

BOXPLOTS

library(lattice)

boxplot(drugs$Sales ~drugs$DayOfWeek ,col = "Brown",
        xlab = "day of week (1-7)", ylab ="Total sales", main = "Effect of day of the week on Total Sales.")

Note:The drug store is closed on sundays and hence zero sales are recorded.

boxplot(drugs$Sales ~drugs$DateMonth ,col = "yellow",
        xlab = "Month", ylab ="Total Sales", main = "Effect of Month on Total Sales.")

highest sales are recorded during the month of december.

boxplot(drugs$Sales ~drugs$DateYear ,col = "grey",
        xlab = "year", ylab ="Sales", main = "Effect of year on Total Sales")

boxplot(drugs$Sales ~drugs$Promo ,col = "grey",
        xlab = "promotional offer (Y/N)", ylab =" Sales", main = "Effect of promotional offers on Total Sales")

Thus there is a significant increase in sales when there are promotional offers available.

boxplot(drugs$Sales ~drugs$SchoolHoliday ,col = "pink",
        xlab = "school holiday (Y/N)", ylab =" Sales", main = "Effect of school holidays on Total Sales")

there is no significant correlation between the ocuurence of school holidays and sales.

boxplot(drugs$Customers ~drugs$DayOfWeek ,col = "Pink",
        xlab = "Day of week", ylab ="Number of customers", main = "Effect of day of week on number of customers")

boxplot(drugs$Customers ~drugs$Quarter ,col = "Pink",
        xlab = "Financial quarter", ylab ="Number of customers", main = "Effect of financial quarter on number of customers")

hist(drugs$Sales, col ="pink")

hist(drugs$Customers, col="yellow")

Countday <- table(drugs$DayOfWeek )
barplot(Countday, main="Distribution of Sales according to day of the week (mon-sun) in log scale.",log='y',
        names.arg=rownames(Countday), cex.names=0.8, col="Red")

CountDateMonth <- table(drugs$DateMonth )
barplot(CountDateMonth, main="Distribution of sales according to Month in log scale.",log='y',
        names.arg=rownames(CountDateMonth), cex.names=0.8, col="Orange")

CountDateDay <- table(drugs$DateDay )
barplot(CountDateDay, main="Distribution of sales according to Date in log scale.",log='y',
        names.arg=rownames(CountDateDay), cex.names=0.8, col="Cyan")

library(car)

plot(drugs$Sales~drugs$Customers,spread=FALSE, smoother.args=list(lty=2), pch=19, main="plot of log of Total Sales per day vs. log of Customers per day ", log="xy", col = "Orange",xlab="Customers", ylab="Total Sales")
## Warning in xy.coords(x, y, xlabel, ylabel, log): 161 x values <= 0 omitted
## from logarithmic plot
## Warning in xy.coords(x, y, xlabel, ylabel, log): 161 y values <= 0 omitted
## from logarithmic plot
## Warning in plot.window(...): "spread" is not a graphical parameter
## Warning in plot.window(...): "smoother.args" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "spread" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "smoother.args" is not a graphical
## parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "spread" is
## not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...):
## "smoother.args" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "spread" is
## not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...):
## "smoother.args" is not a graphical parameter
## Warning in box(...): "spread" is not a graphical parameter
## Warning in box(...): "smoother.args" is not a graphical parameter
## Warning in title(...): "spread" is not a graphical parameter
## Warning in title(...): "smoother.args" is not a graphical parameter

thus there exists a linear relationship between sales and number of customers.

COVARIANCE AND CORRELATION

drugshop <- drugs[,c(3,4)]
covdrug <-cov(drugshop)
covdrug
##               Sales Customers
## Sales     4061909.1 454166.49
## Customers  454166.5  52409.33
cordrug <- cov2cor(covdrug)
cordrug
##               Sales Customers
## Sales     1.0000000 0.9843412
## Customers 0.9843412 1.0000000
library("corrgram") 
corrgram(drugs, order=FALSE, lower.panel=panel.shade,
         upper.panel=panel.pie, text.panel=panel.txt,
         main="Corrgram of variables")

scatterplotMatrix(formula = ~ drugs$Sales + drugs$Customers, cex=0.6,
                  data=drugs, main = "Effect of Number of  customers on daily sales." )

pairs(~DayOfWeek+Customers +Sales+DateDay + DateYear + DateMonth + Quarter+Promo+SchoolHoliday, data= drugs, main=" Scatterplot Matrix of the Ecommerce transactions.")

T test between total sales and School holidays

t.test(drugs$Sales~ drugs$SchoolHoliday)
## 
##  Welch Two Sample t-test
## 
## data:  drugs$Sales by drugs$SchoolHoliday
## t = -0.22852, df = 287.27, p-value = 0.8194
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -369.0290  292.2526
## sample estimates:
## mean in group 0 mean in group 1 
##        3937.840        3976.228

As p-value=0.8194(>0.05).We can accept the null hypothesis for SchoolHoliday column. Therefore There is no strong correlation between Sales and school holidays.