This is the Capstone project of Data Analytics with managerial applications under professor Dr. Sameer Mathur which aims to do an introductory analysis of customer purchase patterns in a drugstore based in the US.
The dataset can be found at: https://www.kaggle.com/c/rossmann-store-sales/data
drugs <- read.csv(file="capstone.csv",head=TRUE,sep=",")
nrow(drugs)
## [1] 942
ncol(drugs)
## [1] 7
hence there are 942 rows and 7 columns in this data set.
drugs$Date<- as.Date(drugs$Date,'%m/%d/%Y')
drugs$DateYear <- as.numeric(format(drugs$Date, format = "%Y"))
drugs$DateMonth <- as.numeric(format(drugs$Date, format = "%m"))
drugs$DateDay <- as.numeric(format(drugs$Date, format = "%d"))
thus new columns have been added to separately represent year, month and date.
# Adding additional Quarter Column to specify financial quarter of purchase.
drugs$Quarter[drugs$DateMonth < 7] <- 1
drugs$Quarter[drugs$DateMonth > 6] <- 2
# Converting them to factors.
drugs$DateYear<- factor(drugs$DateYear)
drugs$DateMonth<- factor(drugs$DateMonth)
drugs$DateDay<- factor(drugs$DateDay)
drugs$Quarter<- factor(drugs$Quarter)
ncol(drugs)
## [1] 11
Thus there are 11 useful columns
View(drugs)
drugs$DayOfWeek<- factor(drugs$DayOfWeek)
drugs$Open<- factor(drugs$Open)
drugs$Promo<- factor(drugs$Promo)
drugs$SchoolHoliday<- factor(drugs$SchoolHoliday)
summary(drugs)
## DayOfWeek Date Sales Customers Open
## 1:134 Min. :2013-01-01 Min. : 0 Min. : 0.0 0:161
## 2:135 1st Qu.:2013-08-24 1st Qu.:3588 1st Qu.: 463.0 1:781
## 3:135 Median :2014-04-16 Median :4374 Median : 529.0
## 4:135 Mean :2014-04-16 Mean :3946 Mean : 467.6
## 5:135 3rd Qu.:2014-12-07 3rd Qu.:5220 3rd Qu.: 598.8
## 6:134 Max. :2015-07-31 Max. :9528 Max. :1130.0
## 7:134
## Promo SchoolHoliday DateYear DateMonth DateDay Quarter
## 0:582 0:749 2013:365 1 : 93 1 : 31 1:543
## 1:360 1:193 2014:365 3 : 93 2 : 31 2:399
## 2015:212 5 : 93 3 : 31
## 7 : 93 4 : 31
## 4 : 90 5 : 31
## 6 : 90 6 : 31
## (Other):390 (Other):756
library(lattice)
boxplot(drugs$Sales ~drugs$DayOfWeek ,col = "Brown",
xlab = "day of week (1-7)", ylab ="Total sales", main = "Effect of day of the week on Total Sales.")
Note:The drug store is closed on sundays and hence zero sales are recorded.
boxplot(drugs$Sales ~drugs$DateMonth ,col = "yellow",
xlab = "Month", ylab ="Total Sales", main = "Effect of Month on Total Sales.")
highest sales are recorded during the month of december.
boxplot(drugs$Sales ~drugs$DateYear ,col = "grey",
xlab = "year", ylab ="Sales", main = "Effect of year on Total Sales")
boxplot(drugs$Sales ~drugs$Promo ,col = "grey",
xlab = "promotional offer (Y/N)", ylab =" Sales", main = "Effect of promotional offers on Total Sales")
Thus there is a significant increase in sales when there are promotional offers available.
boxplot(drugs$Sales ~drugs$SchoolHoliday ,col = "pink",
xlab = "school holiday (Y/N)", ylab =" Sales", main = "Effect of school holidays on Total Sales")
there is no significant correlation between the ocuurence of school holidays and sales.
boxplot(drugs$Customers ~drugs$DayOfWeek ,col = "Pink",
xlab = "Day of week", ylab ="Number of customers", main = "Effect of day of week on number of customers")
boxplot(drugs$Customers ~drugs$Quarter ,col = "Pink",
xlab = "Financial quarter", ylab ="Number of customers", main = "Effect of financial quarter on number of customers")
hist(drugs$Sales, col ="pink")
hist(drugs$Customers, col="yellow")
Countday <- table(drugs$DayOfWeek )
barplot(Countday, main="Distribution of Sales according to day of the week (mon-sun) in log scale.",log='y',
names.arg=rownames(Countday), cex.names=0.8, col="Red")
CountDateMonth <- table(drugs$DateMonth )
barplot(CountDateMonth, main="Distribution of sales according to Month in log scale.",log='y',
names.arg=rownames(CountDateMonth), cex.names=0.8, col="Orange")
CountDateDay <- table(drugs$DateDay )
barplot(CountDateDay, main="Distribution of sales according to Date in log scale.",log='y',
names.arg=rownames(CountDateDay), cex.names=0.8, col="Cyan")
library(car)
plot(drugs$Sales~drugs$Customers,spread=FALSE, smoother.args=list(lty=2), pch=19, main="plot of log of Total Sales per day vs. log of Customers per day ", log="xy", col = "Orange",xlab="Customers", ylab="Total Sales")
## Warning in xy.coords(x, y, xlabel, ylabel, log): 161 x values <= 0 omitted
## from logarithmic plot
## Warning in xy.coords(x, y, xlabel, ylabel, log): 161 y values <= 0 omitted
## from logarithmic plot
## Warning in plot.window(...): "spread" is not a graphical parameter
## Warning in plot.window(...): "smoother.args" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "spread" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "smoother.args" is not a graphical
## parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "spread" is
## not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...):
## "smoother.args" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "spread" is
## not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...):
## "smoother.args" is not a graphical parameter
## Warning in box(...): "spread" is not a graphical parameter
## Warning in box(...): "smoother.args" is not a graphical parameter
## Warning in title(...): "spread" is not a graphical parameter
## Warning in title(...): "smoother.args" is not a graphical parameter
thus there exists a linear relationship between sales and number of customers.
drugshop <- drugs[,c(3,4)]
covdrug <-cov(drugshop)
covdrug
## Sales Customers
## Sales 4061909.1 454166.49
## Customers 454166.5 52409.33
cordrug <- cov2cor(covdrug)
cordrug
## Sales Customers
## Sales 1.0000000 0.9843412
## Customers 0.9843412 1.0000000
library("corrgram")
corrgram(drugs, order=FALSE, lower.panel=panel.shade,
upper.panel=panel.pie, text.panel=panel.txt,
main="Corrgram of variables")
scatterplotMatrix(formula = ~ drugs$Sales + drugs$Customers, cex=0.6,
data=drugs, main = "Effect of Number of customers on daily sales." )
pairs(~DayOfWeek+Customers +Sales+DateDay + DateYear + DateMonth + Quarter+Promo+SchoolHoliday, data= drugs, main=" Scatterplot Matrix of the Ecommerce transactions.")
t.test(drugs$Sales~ drugs$SchoolHoliday)
##
## Welch Two Sample t-test
##
## data: drugs$Sales by drugs$SchoolHoliday
## t = -0.22852, df = 287.27, p-value = 0.8194
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -369.0290 292.2526
## sample estimates:
## mean in group 0 mean in group 1
## 3937.840 3976.228
As p-value=0.8194(>0.05).We can accept the null hypothesis for SchoolHoliday column. Therefore There is no strong correlation between Sales and school holidays.