Dataset from IBM Watson Community - Marketing A/B Test Evaluation and Results

library(multcomp)

## Loading required package: mvtnorm

## Loading required package: survival

## Loading required package: TH.data

## Loading required package: MASS

## 
## Attaching package: 'TH.data'

## The following object is masked from 'package:MASS':
## 
##     geyser

library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──

## ✓ ggplot2 3.3.2     ✓ purrr   0.3.4
## ✓ tibble  3.0.4     ✓ dplyr   1.0.2
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.0

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## x dplyr::select() masks MASS::select()

library(cowplot)
library(VIM)

## Loading required package: colorspace

## Loading required package: grid

## VIM is ready to use.

## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues

## 
## Attaching package: 'VIM'

## The following object is masked from 'package:datasets':
## 
##     sleep

Import and inspect data

df<-read.csv("https://raw.githubusercontent.com/pthiagu2/DataMining/master/WA_Fn-UseC_-Marketing-Campaign-Eff-UseC_-FastF.csv")

#check results
head(df)

#check for missing data using VIM package
aggr(df, prop = F, numbers = T) # no red - no missing values

#summary sales statistics
(grouped.df <- df %>% 
  group_by(Promotion) %>% 
  summarize(
    count = n(),
    totalSales = sum(SalesInThousands),
    meanSales = mean(SalesInThousands),
    sd = sd(SalesInThousands)))

## `summarise()` ungrouping output (override with `.groups` argument)

-We can see that group 3 created the most sales followed by groups 1 & 2 -We can also see that there were 172 stores that were in promotion 1 while there were 188 stores in promotion 2. This is technically not balanced, but nearly-balanced. -As long as we have equal variances in our groups, this shouldn’t be a problem.

library("ggpubr")

## 
## Attaching package: 'ggpubr'

## The following object is masked from 'package:cowplot':
## 
##     get_legend

ggboxplot(df, x = "Promotion", y = "SalesInThousands", 
          color = "Promotion", palette = c("#00AFBB", "#E7B800", "#FC4E07"),
          ylab = "Sales", xlab = "Promotion")

We see that promotion 1 has the most average sales followed by promotion 3 just like in our summary statistics table

Data visualization and exploration

(viz_1 <- ggplot(df, aes(x=MarketSize))+
  geom_bar(stat="count", width=0.7)+
  theme_minimal())

#Create a subset of data
#market size by promotion data frame
market_df <- df %>% 
  group_by(Promotion) %>% 
  count(MarketSize) %>% 
  mutate(percent = n/sum(n))
market_df #check results

(viz_2 <- ggplot(data = df, aes(x=MarketSize, fill = factor(Promotion))) +
  geom_bar(position = "fill") + 
  theme(
    plot.background = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_blank(),
    plot.title = element_text(hjust = 0.5),
    text=element_text(size=14,  family="Helvetica")) + labs(x = " ", title = "MarketSize"))

(viz_3 <- ggplot(data = df, aes(x = factor(Promotion), y = LocationID, fill = Promotion))+geom_boxplot())

(viz_4 <- ggplot(data = df, aes(x = factor(Promotion), y = AgeOfStore, fill = Promotion))+geom_boxplot())

(viz_5 <- ggplot(data=df, aes(x=factor(AgeOfStore)))+
  geom_bar(stat="count", width=0.7, fill="#b2df8a")+
  theme_minimal())

#there is a bit of a delay before plots appear
plot_grid(viz_1, viz_2, viz_3, viz_4, viz_5, labels = "AUTO")

Data cleaning

#check the promotion variable
str(df$Promotion) #an integer object, we need to change this

##  int [1:548] 3 3 3 3 2 2 2 2 1 1 ...

#factor the promotion variable before we model it
df$Promotion <- as.factor(df$Promotion)

#check results
str(df$Promotion)

##  Factor w/ 3 levels "1","2","3": 3 3 3 3 2 2 2 2 1 1 ...

Data question & hypothesis test

Does store sales differ by promotion?

aggregate(SalesInThousands ~ Promotion, df, mean)

#promotion 1 has the highest level of sales, but
# is it statistically significant?

Significance Testing - ANOVA

Promotion 1 has the highest mean of sales, but is it statistically significant?

#We plot the ANOVA model to visualize confidence
#intervals for mean sales by promotion
df.anova <- aov(SalesInThousands ~  Promotion, data = df)
summary(df.anova)

##              Df Sum Sq Mean Sq F value   Pr(>F)    
## Promotion     2  11449    5725   21.95 6.77e-10 ***
## Residuals   545 142114     261                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Conclusions and interpretation: We see that the sales differs by Promotion, and the model is statistically significant but we don’t know which pair groups are significant

How can we change this?? We need to perform additional testing

Post hoc testing

#Use glht() to perform multiple pairwise-comparisons for
# a one-way ANOVA: (with confidence interval and p-values)
summary(glht(df.anova, linfct = mcp(Promotion = "Tukey")))

## 
##   Simultaneous Tests for General Linear Hypotheses
## 
## Multiple Comparisons of Means: Tukey Contrasts
## 
## 
## Fit: aov(formula = SalesInThousands ~ Promotion, data = df)
## 
## Linear Hypotheses:
##            Estimate Std. Error t value Pr(>|t|)    
## 2 - 1 == 0  -10.770      1.704  -6.321   <1e-04 ***
## 3 - 1 == 0   -2.735      1.704  -1.605    0.244    
## 3 - 2 == 0    8.035      1.666   4.824   <1e-04 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## (Adjusted p values reported -- single-step method)

#group 2 is significant against group 1
#group 3 is significant against group 2

TukeyHSD(aov(df.anova), "Promotion") #does same as glht function but includes the confidence intervals

##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = df.anova)
## 
## $Promotion
##           diff        lwr       upr     p adj
## 2-1 -10.769597 -14.773842 -6.765351 0.0000000
## 3-1  -2.734544  -6.738789  1.269702 0.2443878
## 3-2   8.035053   4.120802 11.949304 0.0000055

# plot difference in mean levels of promotion
plot(TukeyHSD(df.anova))

#Tukey comparison of means - much better and has confidence intervals
a1 <- aov(formula = df$SalesInThousands ~ df$Promotion)
plot(a1) # plots to check normality

#Post hoc testing
posthoc <- TukeyHSD(x=a1, conf.level = 0.95)
posthoc

##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = df$SalesInThousands ~ df$Promotion)
## 
## $`df$Promotion`
##           diff        lwr       upr     p adj
## 2-1 -10.769597 -14.773842 -6.765351 0.0000000
## 3-1  -2.734544  -6.738789  1.269702 0.2443878
## 3-2   8.035053   4.120802 11.949304 0.0000055

library(gplots)

## 
## Attaching package: 'gplots'

## The following object is masked from 'package:stats':
## 
##     lowess

plotmeans(SalesInThousands ~ Promotion, data = df,
  frame = FALSE, connect = TRUE, mean.labels = TRUE,
  digits = 2, col=" dark red",
  barwidth=2, pch = " ",
          main = "Groups 1 & 2 and 3 & 2 are significant")

## Warning in text.default(x, y, label = labels, col = col, ...): "frame" is not a
## graphical parameter

## Warning in plot.xy(xy.coords(x, y), type = type, ...): "frame" is not a
## graphical parameter

## Warning in axis(1, at = 1:length(means), labels = legends, ...): "frame" is not
## a graphical parameter

## Warning in plot.xy(xy.coords(x, y), type = type, ...): "frame" is not a
## graphical parameter

#With intercept removed, glht gives us the mean value for each segment
df.anova2 <- aov(SalesInThousands ~  -1 + Promotion, data = df)
glht(df.anova2)

## 
##   General Linear Hypotheses
## 
## Linear Hypotheses:
##                 Estimate
## Promotion1 == 0    58.10
## Promotion2 == 0    47.33
## Promotion3 == 0    55.36

# Now we plot difference in mean levels of promotion
plot(glht(df.anova2), xlab="Average sales by promotion (95% CI)")

#The dot shows the mean for each segment and bars reflect the confidence intervals.

With all 3 plotted with confidence intervals, Promo 2 is significantly worse than Promo 1 and 3, but we cannot say that Promo 1 and 3 are significant as their confidence intervals overlap.

Check ANOVA Assumptions

#I'm doing this as the residuals were a bit skewed

#1. Homogeneity of variances
plot(df.anova) #first anova model -- Looks good

#2. Levene's test for non-normal distribution - we check due to skew in residuals
library(car)

## Loading required package: carData

## 
## Attaching package: 'car'

## The following object is masked from 'package:dplyr':
## 
##     recode

## The following object is masked from 'package:purrr':
## 
##     some

leveneTest(SalesInThousands ~ Promotion, data = df)

#We see that the p-value for Promotion 2 is large and therefore not significant. So we are good here. Promotion 2 is still not significant.


#3. Shapiro-Wilk (Has better power than K-S test)

# Extract the residuals
aov_residuals <- residuals(object = df.anova)
# Run Shapiro-Wilk test
shapiro.test(x = aov_residuals )

## 
##  Shapiro-Wilk normality test
## 
## data:  aov_residuals
## W = 0.92208, p-value = 3.155e-16

#We reject null hypothesis that residuals are normally distributed

#4. Kruskal-Wallis
#Non-parametric alternative to ANOVA
# It’s recommended when the assumptions of one-way ANOVA test are not
# met. One of those assumptions are that the residuals are normally
# distributed

kruskal.test(SalesInThousands ~ Promotion, data = df)

## 
##  Kruskal-Wallis rank sum test
## 
## data:  SalesInThousands by Promotion
## Kruskal-Wallis chi-squared = 53.295, df = 2, p-value = 2.674e-12

#The p-value is tiny; therefore, we can reject null hypothesis that there
# are no differences in group means, but we don't know which groups.

#5. We do pairwise comparisons and adjust for multiple groups
pairwise.wilcox.test(df$SalesInThousands, df$Promotion,
                 p.adjust.method = "bonferroni", paired = FALSE)

## 
##  Pairwise comparisons using Wilcoxon rank sum test with continuity correction 
## 
## data:  df$SalesInThousands and df$Promotion 
## 
##   1       2      
## 2 1.8e-11 -      
## 3 0.11    3.6e-07
## 
## P value adjustment method: bonferroni

This validates what we have done above with original anova model. Our conclusions from are original findings are still valid most likely due to having a very large sample size to make the group comparisons.

###Summary: What should you tell the marketing & sales team?

Let’s run again with just promotion 1 & 3 to see if we can get a significant result. The test should not take as long to run as we only have 2 groups to compare so we could see significant results quite fast.

Having a proper control group for comparison to be able to calculate the impact of the promotions

It appeared in group 1 there were some stores that were slightly younger than those in Group 3 it may not have made a difference but we should try to control for this.