This is the Capstone project of Data Analytics with managerial applications under professor Dr. Sameer Mathur which aims to do an introductory analysis of customer purchase patterns in a drugstore based in the US.
The dataset can be found at: https://www.kaggle.com/c/rossmann-store-sales/data
drugs <- read.csv(file="capstone.csv",head=TRUE,sep=",")
nrow(drugs)
## [1] 942
ncol(drugs)
## [1] 7
hence there are 942 rows and 7 columns in this data set.
drugs$Date<- as.Date(drugs$Date,'%m/%d/%Y')
drugs$DateYear <- as.numeric(format(drugs$Date, format = "%Y"))
drugs$DateMonth <- as.numeric(format(drugs$Date, format = "%m"))
drugs$DateDay <- as.numeric(format(drugs$Date, format = "%d"))
thus new columns have been added to separately represent year, month and date.
# Adding additional Quarter Column to specify financial quarter of purchase.
drugs$Quarter[drugs$DateMonth < 7] <- 1
drugs$Quarter[drugs$DateMonth > 6] <- 2
# Converting them to factors.
drugs$DateYear<- factor(drugs$DateYear)
drugs$DateMonth<- factor(drugs$DateMonth)
drugs$DateDay<- factor(drugs$DateDay)
drugs$Quarter<- factor(drugs$Quarter)
ncol(drugs)
## [1] 11
Thus there are 11 useful columns;DayofWeek,Sales,Customers,Promo, Schoolholiday,Quarter,DateMonth, DateYear,DateDay,Date,Open
View(drugs)
drugs$DayOfWeek<- factor(drugs$DayOfWeek)
drugs$Open<- factor(drugs$Open)
drugs$Promo<- factor(drugs$Promo)
drugs$SchoolHoliday<- factor(drugs$SchoolHoliday)
summary(drugs)
## DayOfWeek Date Sales Customers Open
## 1:134 Min. :2013-01-01 Min. : 0 Min. : 0.0 0:161
## 2:135 1st Qu.:2013-08-24 1st Qu.:3588 1st Qu.: 463.0 1:781
## 3:135 Median :2014-04-16 Median :4374 Median : 529.0
## 4:135 Mean :2014-04-16 Mean :3946 Mean : 467.6
## 5:135 3rd Qu.:2014-12-07 3rd Qu.:5220 3rd Qu.: 598.8
## 6:134 Max. :2015-07-31 Max. :9528 Max. :1130.0
## 7:134
## Promo SchoolHoliday DateYear DateMonth DateDay Quarter
## 0:582 0:749 2013:365 1 : 93 1 : 31 1:543
## 1:360 1:193 2014:365 3 : 93 2 : 31 2:399
## 2015:212 5 : 93 3 : 31
## 7 : 93 4 : 31
## 4 : 90 5 : 31
## 6 : 90 6 : 31
## (Other):390 (Other):756
library(lattice)
boxplot(drugs$Sales ~drugs$DayOfWeek ,col = "Brown",
xlab = "day of week (1-7)", ylab ="Total sales", main = "Effect of day of the week on Total Sales.")
Note:The drug store is closed on sundays and hence zero sales are recorded.
boxplot(drugs$Sales ~drugs$DateMonth ,col = "yellow",
xlab = "Month", ylab ="Total Sales", main = "Effect of Month on Total Sales.")
highest sales are recorded during the month of december.
boxplot(drugs$Sales ~drugs$DateYear ,col = "grey",
xlab = "year", ylab ="Sales", main = "Effect of year on Total Sales")
boxplot(drugs$Sales ~drugs$Promo ,col = "grey",
xlab = "promotional offer (Y/N)", ylab =" Sales", main = "Effect of promotional offers on Total Sales")
Thus there is a significant increase in sales when there are promotional offers available.
boxplot(drugs$Sales ~drugs$SchoolHoliday ,col = "pink",
xlab = "school holiday (Y/N)", ylab =" Sales", main = "Effect of school holidays on Total Sales")
boxplot(drugs$Sales ~drugs$Quarter ,col = "pink",
xlab = "Financial quarter
", ylab =" Sales", main = "Effect of financial quarter on Total Sales")
boxplot(drugs$Customers ~drugs$DayOfWeek ,col = "Pink",
xlab = "Day of week", ylab ="Number of customers", main = "Effect of day of week on number of customers")
boxplot(drugs$Customers ~drugs$Quarter ,col = "Pink",
xlab = "Financial quarter", ylab ="Number of customers", main = "Effect of financial quarter on number of customers")
boxplot(drugs$Customers ~drugs$Promo ,col = "Pink",
xlab = "Promo (Y/n)", ylab ="Number of customers", main = "Effect of promotional offers on number of customers")
hist(drugs$Sales, col ="pink")
hist(drugs$Customers, col="yellow")
Countday <- table(drugs$DayOfWeek )
barplot(Countday, main="Distribution of Sales according to day of the week (mon-sun) in log scale.",log='y',
names.arg=rownames(Countday), cex.names=0.8, col="Red")
CountDateMonth <- table(drugs$DateMonth )
barplot(CountDateMonth, main="Distribution of sales according to Month in log scale.",log='y',
names.arg=rownames(CountDateMonth), cex.names=0.8, col="Orange")
CountDateDay <- table(drugs$DateDay )
barplot(CountDateDay, main="Distribution of sales according to Date in log scale.",log='y',
names.arg=rownames(CountDateDay), cex.names=0.8, col="Cyan")
library(car)
plot(drugs$Sales~drugs$Customers,spread=FALSE, smoother.args=list(lty=2), pch=19, main="plot of log of Total Sales per day vs. log of Customers per day ", log="xy", col = "Orange",xlab="Customers", ylab="Total Sales")
## Warning in xy.coords(x, y, xlabel, ylabel, log): 161 x values <= 0 omitted
## from logarithmic plot
## Warning in xy.coords(x, y, xlabel, ylabel, log): 161 y values <= 0 omitted
## from logarithmic plot
## Warning in plot.window(...): "spread" is not a graphical parameter
## Warning in plot.window(...): "smoother.args" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "spread" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "smoother.args" is not a graphical
## parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "spread" is
## not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...):
## "smoother.args" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "spread" is
## not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...):
## "smoother.args" is not a graphical parameter
## Warning in box(...): "spread" is not a graphical parameter
## Warning in box(...): "smoother.args" is not a graphical parameter
## Warning in title(...): "spread" is not a graphical parameter
## Warning in title(...): "smoother.args" is not a graphical parameter
thus there exists a linear relationship between sales and number of customers.
drugshop <- drugs[,c(3,4)]
covdrug <-cov(drugshop)
covdrug
## Sales Customers
## Sales 4061909.1 454166.49
## Customers 454166.5 52409.33
cordrug <- cov2cor(covdrug)
cordrug
## Sales Customers
## Sales 1.0000000 0.9843412
## Customers 0.9843412 1.0000000
library("corrgram")
corrgram(drugs, order=FALSE, lower.panel=panel.shade,
upper.panel=panel.pie, text.panel=panel.txt,
main="Corrgram of variables")
scatterplotMatrix(formula = ~ drugs$Sales + drugs$Customers, cex=0.6,
data=drugs, main = "Effect of Number of customers on daily sales." )
pairs(~DayOfWeek+Customers +Sales+DateDay + DateYear + DateMonth + Quarter+Promo+SchoolHoliday, data= drugs, main=" Scatterplot Matrix of the Ecommerce transactions.")
t.test(drugs$Sales~ drugs$SchoolHoliday)
##
## Welch Two Sample t-test
##
## data: drugs$Sales by drugs$SchoolHoliday
## t = -0.22852, df = 287.27, p-value = 0.8194
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -369.0290 292.2526
## sample estimates:
## mean in group 0 mean in group 1
## 3937.840 3976.228
As p-value=0.8194(>0.05).We can accept the null hypothesis for SchoolHoliday column. Therefore There is no strong correlation between Sales and school holidays.
reg<-lm(Sales~DayOfWeek+Promo+SchoolHoliday+DateMonth+DateYear-1,data=drugs)
summary(reg)
##
## Call:
## lm(formula = Sales ~ DayOfWeek + Promo + SchoolHoliday + DateMonth +
## DateYear - 1, data = drugs)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5337.5 -320.3 39.7 400.0 4266.9
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## DayOfWeek1 4424.126 147.106 30.074 < 2e-16 ***
## DayOfWeek2 4133.651 146.209 28.272 < 2e-16 ***
## DayOfWeek3 3946.863 145.674 27.094 < 2e-16 ***
## DayOfWeek4 3597.525 145.011 24.809 < 2e-16 ***
## DayOfWeek5 4018.872 145.812 27.562 < 2e-16 ***
## DayOfWeek6 5145.602 139.622 36.854 < 2e-16 ***
## DayOfWeek7 196.238 140.260 1.399 0.1621
## Promo1 1352.651 76.229 17.745 < 2e-16 ***
## SchoolHoliday1 -180.494 93.958 -1.921 0.0550 .
## DateMonth2 133.744 151.281 0.884 0.3769
## DateMonth3 146.444 146.342 1.001 0.3172
## DateMonth4 -150.014 145.642 -1.030 0.3033
## DateMonth5 -324.055 147.532 -2.197 0.0283 *
## DateMonth6 -236.477 148.744 -1.590 0.1122
## DateMonth7 6.707 144.490 0.046 0.9630
## DateMonth8 -220.199 168.563 -1.306 0.1918
## DateMonth9 -419.526 167.151 -2.510 0.0122 *
## DateMonth10 -301.747 163.843 -1.842 0.0658 .
## DateMonth11 195.381 168.460 1.160 0.2464
## DateMonth12 1017.513 163.912 6.208 8.13e-10 ***
## DateYear2014 -176.682 72.892 -2.424 0.0155 *
## DateYear2015 -360.053 90.621 -3.973 7.65e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 984.6 on 920 degrees of freedom
## Multiple R-squared: 0.9518, Adjusted R-squared: 0.9506
## F-statistic: 825 on 22 and 920 DF, p-value: < 2.2e-16
-> Highest sales occur in the month of December. -> there exists a strong relation between sales and the existence of promotional offers.