library(readr)
library(plyr)
library(corrplot)
## corrplot 0.92 loaded
The Black Friday and non-Black Friday data are merged into a single file, which is read into R.
# Mapping from each Product category label to its numeric code (as a string).
product_codes <- c(Entertainment = "1", Movies = "2", Sports = "3", Kids = "4")

# Read the merged data set without printing the column-type summary.
PBI <- read_csv("rmd.csv", show_col_types = FALSE)

# revalue() swaps every category label for its code string; as.numeric()
# then turns those strings into numbers.
PBI$Product <- as.numeric(revalue(PBI$Product, product_codes))
The Indicator field distinguishes Black Friday sales from rest-of-year sales. The data is split into these two groups, and a separate data frame is produced for each.
# Split the data on Indicator ("0" -> BF, "1" -> NBF), then drop selected
# rows addressed by their position within each subset.
# NOTE(review): the dropped rows are presumably outliers -- confirm.
BF <- PBI[PBI$Indicator == "0", ][-c(59, 71), ]
NBF <- PBI[PBI$Indicator == "1", ][-c(41, 58, 68, 85), ]
Correlation plots are drawn to check whether the numerical features are correlated with one another. Before testing whether the group means differ, these correlations need to be understood: if they are high, a multivariate hypothesis test should be performed instead of separate univariate tests on each variable.
# Pairwise correlations among columns 4-6 of the Black Friday data.
# cor() already returns the correlation matrix, so it is plotted directly;
# applying cor() a second time would correlate the columns of that matrix
# with each other instead of showing the correlations between the variables.
corr_brew <- cor(BF[, 4:6])
mat <- corr_brew
# Colour-coded plot, variables ordered by angular order of eigenvectors
# ("AOE"), with each coefficient printed in gray.
corrplot(mat, order = "AOE", method = "color", addCoef.col = "gray")
Correlations between variables in the Black Friday data
# Pairwise correlations among columns 4-6 of the non-Black Friday data.
# cor() already returns the correlation matrix, so it is plotted directly;
# applying cor() a second time would correlate the columns of that matrix
# with each other instead of showing the correlations between the variables.
corr_brew <- cor(NBF[, 4:6])
mat <- corr_brew
# Colour-coded plot, variables ordered by angular order of eigenvectors
# ("AOE"), with each coefficient printed in gray.
corrplot(mat, order = "AOE", method = "color", addCoef.col = "gray")
Correlations between variables in the non-Black Friday data
Because the correlations are high, both variables are considered jointly, and a multivariate hypothesis test (Hotelling's $T^2$ test) is carried out to check whether a significant difference exists between the group means of the variables Price_paid and Days_before_activation.
# Mean vector of columns 5-6 (Price_paid, Days_before_activation) for the
# Black Friday group, stored as a 2 x 1 column matrix for the matrix algebra
# that follows. colMeans() works on the data frame directly, so there is no
# need to coerce it through apply().
y_bar_BF <- as.matrix(colMeans(BF[, 5:6]))
y_bar_BF
## [,1]
## Price_paid 64.65408
## Days_before_activation 33.85714
# Mean vector of columns 5-6 (Price_paid, Days_before_activation) for the
# non-Black Friday group, stored as a 2 x 1 column matrix for the matrix
# algebra that follows. colMeans() works on the data frame directly, so there
# is no need to coerce it through apply().
y_bar_NBF <- as.matrix(colMeans(NBF[, 5:6]))
y_bar_NBF
## [,1]
## Price_paid 44.728437
## Days_before_activation 2.302083
# Two-sample Hotelling's T^2 statistic for the difference between the BF and
# NBF mean vectors of columns 5-6.

# Sample sizes are taken from the data frames the mean vectors were computed
# on, so they cannot drift out of sync with the data.
n1 <- nrow(BF)
n2 <- nrow(NBF)
p <- 2 # number of response variables being compared

# Covariances use exactly the rows behind y_bar_BF / y_bar_NBF. BF and NBF
# have already had rows removed when they were created; dropping row positions
# again here would discard additional, different observations.
S_BF <- cov(BF[, 5:6])
S_NBF <- cov(NBF[, 5:6])

# Pooled covariance, scaled by (1/n1 + 1/n2) to give the covariance of the
# difference between the two mean vectors.
Sp <- (((n1 - 1) * S_BF + (n2 - 1) * S_NBF) / (n1 + n2 - 2)) *
  ((1 / n1) + (1 / n2))

# T2 = d' Sp^{-1} d; solve(Sp, d) avoids forming the explicit inverse.
mean_diff <- y_bar_BF - y_bar_NBF
T2 <- t(mean_diff) %*% solve(Sp, mean_diff)
# NOTE: any printed output recorded after this statement came from the earlier
# calculation that dropped rows twice; re-run to refresh it.
T2
## [,1]
## [1,] 9617.976
# Rescale Hotelling's T2 to an F statistic:
#   F = (n1 + n2 - p - 1) / (p * (n1 + n2 - 2)) * T2
n_total <- n1 + n2
F <- ((n_total - p - 1) / (p * (n_total - 2))) * T2
F
## [,1]
## [1,] 4783.941
# Upper-tail p-value of the F statistic. The reference distribution has
# p and n1 + n2 - p - 1 degrees of freedom, matching the numerator used to
# scale T2 into F. lower.tail = FALSE avoids the cancellation in 1 - pf()
# that rounds very small p-values to exactly 0.
# The result is far below 0.05, so we reject the null hypothesis that the two
# mean vectors are equal.
pf(F, p, n1 + n2 - p - 1, lower.tail = FALSE)
## [,1]
## [1,] 0
# Density-scaled histogram of Price_paid for the BF group, overlaid with a
# normal curve that uses the sample mean and standard deviation, evaluated
# on the integer grid 0..240.
with(BF, {
  hist(
    Price_paid,
    freq = FALSE,
    main = "Histogram of Price Paid by Black Friday Customers",
    xlab = "Price Paid"
  )
  lines(
    x = 0:240,
    y = dnorm(0:240, mean = mean(Price_paid), sd = sd(Price_paid)),
    type = "l", lwd = 2, col = "maroon"
  )
})
Histogram of Price_paid (Black Friday)
# Density-scaled histogram of Days_before_activation for the BF group,
# overlaid with a normal curve that uses the sample mean and standard
# deviation, evaluated on the integer grid 0..50.
with(BF, {
  hist(
    Days_before_activation,
    freq = FALSE,
    main = "Histogram of Days before Activation for Black Friday Customers",
    xlab = "Days before Activation"
  )
  lines(
    x = 0:50,
    y = dnorm(
      0:50,
      mean = mean(Days_before_activation),
      sd = sd(Days_before_activation)
    ),
    type = "l", lwd = 2, col = "maroon"
  )
})
Histogram of Days_before_activation (Black Friday)
# Density-scaled histogram of Price_paid for the NBF group, overlaid with a
# normal curve that uses the sample mean and standard deviation, evaluated
# on the integer grid 0..240.
with(NBF, {
  hist(
    Price_paid,
    freq = FALSE,
    main = "Histogram of Price Paid by Non Black Friday Customers",
    xlab = "Price Paid"
  )
  lines(
    x = 0:240,
    y = dnorm(0:240, mean = mean(Price_paid), sd = sd(Price_paid)),
    type = "l", lwd = 2, col = "maroon"
  )
})
Histogram of Price_paid (non-Black Friday)
# Density-scaled histogram of Days_before_activation for the NBF group,
# overlaid with a normal curve that uses the sample mean and standard
# deviation, evaluated on the integer grid 0..50.
with(NBF, {
  hist(
    Days_before_activation,
    freq = FALSE,
    main = "Histogram of Days before Activation for Non Black Friday Customers",
    xlab = "Days before Activation"
  )
  lines(
    x = 0:50,
    y = dnorm(
      0:50,
      mean = mean(Days_before_activation),
      sd = sd(Days_before_activation)
    ),
    type = "l", lwd = 2, col = "maroon"
  )
})
Histogram of Days_before_activation (non-Black Friday)