DAM TEAM 2
7th Dec, 2018
With the neck to neck competition amongst ecommerce companies and their fight for market share, Amazon conducts the “Great Indian Shopping festival” to entice customers to spend more on its platform. We also saw Amazon conducting multiple sales just within a few days of each other.
setwd("C:/Users/SARIM/Desktop/DAM2018/Project")
library(readxl)
dam_project.df <- read_excel("C:/Users/SARIM/Desktop/DAM2018/Project/Dam_Data.xlsx")
head(dam_project.df)
# A tibble: 6 x 9
ASIN Is_Electronics Subcategory MRP SP Sale In_Newspaper
<chr> <dbl> <chr> <dbl> <dbl> <dbl> <dbl>
1 B077~ 1 Mobile Acc~ 1999 1499 0 0
2 B073~ 1 Mobile Acc~ 600 349 0 0
3 B073~ 1 Mobile Acc~ 1000 672 0 0
4 B077~ 1 Mobile Acc~ 1199 799 0 0
5 B075~ 1 Mobile Acc~ 1149 695 0 0
6 B06Y~ 1 Mobile Acc~ 2199 1199 0 0
# ... with 2 more variables: On_Flipkart <dbl>, Discount <dbl>
attach(dam_project.df)
library(psych)
describe(dam_project.df)
vars n mean sd median trimmed mad min
ASIN* 1 56750 NaN NA NA NaN NA Inf
Is_Electronics 2 56750 0.45 0.50 0.00 0.43 0.00 0
Subcategory* 3 56750 NaN NA NA NaN NA Inf
MRP 4 56750 1784.30 3369.22 985.00 1075.82 690.89 2
SP 5 56750 1463.38 2791.61 789.00 866.97 564.87 1
Sale 6 56750 2.50 1.71 2.00 2.50 2.97 0
In_Newspaper 7 56750 0.00 0.04 0.00 0.00 0.00 0
On_Flipkart 8 56750 1.73 0.63 2.00 1.91 0.00 0
Discount 9 56750 0.19 0.07 0.19 0.18 0.06 0
max range skew kurtosis se
ASIN* -Inf -Inf NA NA NA
Is_Electronics 1.00 1.00 0.22 -1.95 0.00
Subcategory* -Inf -Inf NA NA NA
MRP 134990.00 134988.00 13.77 396.98 14.14
SP 125249.00 125248.00 14.04 428.96 11.72
Sale 5.00 5.00 0.00 -1.27 0.01
In_Newspaper 1.00 1.00 24.78 611.83 0.00
On_Flipkart 2.00 2.00 -2.11 2.81 0.00
Discount 0.76 0.76 1.65 3.51 0.00
aggdata1 <-aggregate(Discount, by=list(Is_Electronics), FUN=mean, na.rm=TRUE)
colnames(aggdata1) <- c("Is_Electronics","Mean Discount")
aggdata1
Is_Electronics Mean Discount
1 0 0.1893155
2 1 0.1942325
aggdata2 <-aggregate(Discount, by=list(Subcategory), FUN=mean, na.rm=TRUE)
colnames(aggdata2) <- c("Sub Category","Mean Discount")
aggdata2
Sub Category Mean Discount
1 Beauty 0.1367282
2 Computer Accessories 0.1602442
3 Fashion 0.2415938
4 Miscellaneous 0.1666187
5 Miscellaneous_E 0.1816214
6 Mobile Accessories 0.2163866
7 Sports 0.1551708
aggdata3 <-aggregate(Discount, by=list(Sale), FUN=mean, na.rm=TRUE)
colnames(aggdata3) <- c("Sales Period","Mean Discount")
rownames(aggdata3) <- c("1st Sale (10-15th Oct)","After 1st Sale","2nd Sale (24-28th Oct)","After 2nd Sale","Diwali Sale (3-5th Nov)","Post Diwali Sale Period")
aggdata3
Sales Period Mean Discount
1st Sale (10-15th Oct) 0 0.2149316
After 1st Sale 1 0.1736692
2nd Sale (24-28th Oct) 2 0.2079721
After 2nd Sale 3 0.1729543
Diwali Sale (3-5th Nov) 4 0.2065348
Post Diwali Sale Period 5 0.1727392
boxplot(Discount ~ Sale, data = dam_project.df, main="Discount vs Sale",xlab="Sale",ylab="Discount")
library(lattice)
bwplot(Discount ~ Subcategory | Sale, data = dam_project.df, main = "Boxplot of Discount across SubCategory and Sale", ylab = "Discount", col = "black")
library(gplots)
suppressWarnings(plotmeans(Discount ~ Sale, xlab = "Sale", ylab = "Discount", main = "Mean Plot with 95% CI", mean.labels=TRUE, n.label = FALSE))
suppressWarnings(plotmeans(Discount ~ Subcategory, xlab = "SubCategory", ylab = "Discount", main = "Mean Plot with 95% CI", mean.labels = TRUE, n.label = FALSE))
twoWayfit <- aov(Discount ~ Subcategory * Sale, data = dam_project.df)
# normal Q-Q plot
plot(twoWayfit, 2)
plot(twoWayfit, 1)
with(dam_project.df, tapply(Discount[7000:9999], Is_Electronics[7000:9999], shapiro.test))
$`0`
Shapiro-Wilk normality test
data: X[[i]]
W = 0.95636, p-value < 2.2e-16
$`1`
Shapiro-Wilk normality test
data: X[[i]]
W = 0.79604, p-value < 2.2e-16
library(car)
leveneTest(Discount ~ Subcategory, data = dam_project.df)
Levene's Test for Homogeneity of Variance (center = median)
Df F value Pr(>F)
group 6 668.75 < 2.2e-16 ***
56743
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
dam_project.df$LogDiscount <- log(dam_project.df$Discount)
head(dam_project.df)
# A tibble: 6 x 10
ASIN Is_Electronics Subcategory MRP SP Sale In_Newspaper
<chr> <dbl> <chr> <dbl> <dbl> <dbl> <dbl>
1 B077~ 1 Mobile Acc~ 1999 1499 0 0
2 B073~ 1 Mobile Acc~ 600 349 0 0
3 B073~ 1 Mobile Acc~ 1000 672 0 0
4 B077~ 1 Mobile Acc~ 1199 799 0 0
5 B075~ 1 Mobile Acc~ 1149 695 0 0
6 B06Y~ 1 Mobile Acc~ 2199 1199 0 0
# ... with 3 more variables: On_Flipkart <dbl>, Discount <dbl>,
# LogDiscount <dbl>
with(dam_project.df, tapply(LogDiscount[7000:9999], Is_Electronics[7000:9999], shapiro.test))
$`0`
Shapiro-Wilk normality test
data: X[[i]]
W = 0.95205, p-value < 2.2e-16
$`1`
Shapiro-Wilk normality test
data: X[[i]]
W = 0.84774, p-value < 2.2e-16
library(car)
leveneTest(dam_project.df$LogDiscount ~ Subcategory, data = dam_project.df)
Levene's Test for Homogeneity of Variance (center = median)
Df F value Pr(>F)
group 6 285.65 < 2.2e-16 ***
56743
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
diet.welch = oneway.test(Discount~Sale,data=dam_project.df)
print(diet.welch)
One-way analysis of means (not assuming equal variances)
data: Discount and Sale
F = 822.61, num df = 5, denom df = 25845, p-value < 2.2e-16
diet.kruskal = kruskal.test(Discount~Sale,data=dam_project.df)
print(diet.kruskal)
Kruskal-Wallis rank sum test
data: Discount by Sale
Kruskal-Wallis chi-squared = 1017.3, df = 5, p-value < 2.2e-16