library(readr)
# Read the marketing-campaign CSV from the local Downloads folder into a tibble.
WA_Marketing_Campaign <- read_csv(
  file = "~/Downloads/WA_Marketing-Campaign.csv"
)
## Rows: 548 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): MarketSize
## dbl (6): MarketID, LocationID, AgeOfStore, Promotion, week, SalesInThousands
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Open the spreadsheet-style data viewer only in an interactive session;
# View() has nothing useful to show when the report is rendered
# non-interactively and may fail there.
if (interactive()) {
  View(WA_Marketing_Campaign)
}

Data source: https://bit.ly/3YQQVfF

Scenario

A fast-food chain plans to add a new item to its menu. However, they are still undecided between three possible marketing campaigns for promoting the new product. In order to determine which promotion has the greatest effect on sales, the new item is introduced at locations in several randomly selected markets. A different promotion is used at each location, and the weekly sales of the new item are recorded for the first four weeks.

Goal

Evaluate A/B testing results and decide which marketing strategy works the best.

Columns

MarketID: unique identifier for market

MarketSize: size of market area by sales

LocationID: unique identifier for store location

AgeOfStore: age of store in years

Promotion: one of three promotions that were tested

week: one of four weeks when the promotions were run

SalesInThousands: sales amount for a specific LocationID, Promotion, and week

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0      ✔ dplyr   1.0.10
## ✔ tibble  3.1.8      ✔ stringr 1.4.1 
## ✔ tidyr   1.2.1      ✔ forcats 0.5.2 
## ✔ purrr   0.3.5      
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(dplyr)
library(ggplot2)
library(broom)
library(plotly)
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
# Preview the first six rows of the raw data.
head(x = WA_Marketing_Campaign, n = 6L)
## # A tibble: 6 × 7
##   MarketID MarketSize LocationID AgeOfStore Promotion  week SalesInThousands
##      <dbl> <chr>           <dbl>      <dbl>     <dbl> <dbl>            <dbl>
## 1        1 Medium              1          4         3     1             33.7
## 2        1 Medium              1          4         3     2             35.7
## 3        1 Medium              1          4         3     3             29.0
## 4        1 Medium              1          4         3     4             39.2
## 5        1 Medium              2          5         2     1             27.8
## 6        1 Medium              2          5         2     2             34.7
# Show the column names, types and first few values of the raw data.
str(object = WA_Marketing_Campaign)
## spc_tbl_ [548 × 7] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ MarketID        : num [1:548] 1 1 1 1 1 1 1 1 1 1 ...
##  $ MarketSize      : chr [1:548] "Medium" "Medium" "Medium" "Medium" ...
##  $ LocationID      : num [1:548] 1 1 1 1 2 2 2 2 3 3 ...
##  $ AgeOfStore      : num [1:548] 4 4 4 4 5 5 5 5 12 12 ...
##  $ Promotion       : num [1:548] 3 3 3 3 2 2 2 2 1 1 ...
##  $ week            : num [1:548] 1 2 3 4 1 2 3 4 1 2 ...
##  $ SalesInThousands: num [1:548] 33.7 35.7 29 39.2 27.8 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   MarketID = col_double(),
##   ..   MarketSize = col_character(),
##   ..   LocationID = col_double(),
##   ..   AgeOfStore = col_double(),
##   ..   Promotion = col_double(),
##   ..   week = col_double(),
##   ..   SalesInThousands = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
# Total number of observations in the raw data.
nrow(x = WA_Marketing_Campaign)
## [1] 548
# Number of distinct rows; compare with the total row count to spot duplicates.
WA_Marketing_Campaign %>%
  n_distinct()
## [1] 548
# Count the missing (NA) cells across the whole data set.
sum(is.na(WA_Marketing_Campaign))
## [1] 0
# Convert the numeric Promotion codes (1, 2, 3) into a labelled factor.
# The level names are kept exactly as originally written (note the capital
# "P" in Promotion_3) so that downstream output labels are unchanged.
my_data <- mutate(
  WA_Marketing_Campaign,
  Promotion = fct_recode(
    as.factor(Promotion),
    promotion_1 = "1",
    promotion_2 = "2",
    Promotion_3 = "3"
  )
)

# Confirm that recoding Promotion did not change the number of rows.
nrow(x = my_data)
## [1] 548
# Box plot of sales per promotion, used to look for outliers in the
# "SalesInThousands" column; rendered as an interactive plotly widget.
sales_boxplot <- my_data %>%
  ggplot(aes(x = Promotion, y = SalesInThousands, fill = Promotion)) +
  geom_boxplot()
ggplotly(sales_boxplot)
# Per-promotion mean and standard deviation of sales BEFORE removing
# outliers, ordered from the highest to the lowest mean.
average_sales <- my_data %>%
  group_by(Promotion) %>%
  summarise(
    mean = mean(SalesInThousands, na.rm = TRUE),
    sd = sd(SalesInThousands, na.rm = TRUE)
  ) %>%
  arrange(desc(mean))
average_sales
## # A tibble: 3 × 3
##   Promotion    mean    sd
##   <fct>       <dbl> <dbl>
## 1 promotion_1  58.1  16.6
## 2 Promotion_3  55.4  16.8
## 3 promotion_2  47.3  15.1

- There are outliers in the sales data. We need to remove these outliers for a more reliable analysis.

# Collect the sales values that lie beyond the boxplot whiskers (default
# 1.5 * IQR rule). The statistics are computed on the whole SalesInThousands
# column, i.e. pooled across promotions rather than per promotion.
sales_outliers <- boxplot.stats(my_data$SalesInThousands)$out
sales_outliers
##  [1] 88.73 94.17 89.70 88.12 90.30 89.77 88.91 87.70 94.21 96.48 91.98 93.71
## [13] 96.01 93.03 97.61 88.07 94.43 89.44 88.64 87.90 91.60 93.86 94.89 93.63
## [25] 91.61 99.65 88.64 89.25 89.32 99.12 93.32 91.29

There are 32 outliers in the ‘SalesInThousands’ column.

# Remove the rows whose sales value was flagged as an outlier.
# A logical mask is used instead of `-which(...)`: when nothing matches,
# `-which(...)` is an empty index and would drop every row, whereas the
# negated mask correctly keeps them all.
is_sales_outlier <- my_data$SalesInThousands %in% sales_outliers
mydata <- my_data[!is_sales_outlier, ]
nrow(mydata)
## [1] 516

-After removing the sales outliers, the data set dropped from 548 to 516 observations.

# Per-promotion mean and standard deviation of sales AFTER removing
# outliers, ordered from the highest to the lowest mean.
average_sales <- mydata %>%
  group_by(Promotion) %>%
  summarise(
    mean = mean(SalesInThousands, na.rm = TRUE),
    sd = sd(SalesInThousands, na.rm = TRUE)
  ) %>%
  arrange(desc(mean))
average_sales
## # A tibble: 3 × 3
##   Promotion    mean    sd
##   <fct>       <dbl> <dbl>
## 1 promotion_1  54.0  12.1
## 2 Promotion_3  52.9  14.3
## 3 promotion_2  46.9  14.6

Compute one-way ANOVA test

Hypothesis Testing:

Null hypothesis: There is no difference in the means of the 3 promotions

Alternative: The means of the 3 promotion groups are not all equal.

Anova Model

# Fit sales as a function of the single factor Promotion on the
# outlier-free data, then derive the ANOVA table from that fitted model.
model1 <- lm(
  formula = SalesInThousands ~ Promotion,
  data = mydata
)
myanova <- aov(formula = model1)
summary(object = myanova)
##              Df Sum Sq Mean Sq F value   Pr(>F)    
## Promotion     2   5202    2601   13.69 1.61e-06 ***
## Residuals   513  97453     190                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The p-value is less than 0.05. Therefore, we reject the null hypothesis that there is no difference in the means of the 3 promotions.

However, we need to investigate the differences further by performing a Tukey HSD test.

Tukey HSD test for ‘promotion’ group

# Pairwise comparison of the promotion means with Tukey's Honest Significant
# Difference at the default 95% family-wise confidence level.
tukey.test <- TukeyHSD(x = myanova, conf.level = 0.95)
tukey.test
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = model1)
## 
## $Promotion
##                              diff       lwr       upr     p adj
## promotion_2-promotion_1 -7.149522 -10.67915 -3.619895 0.0000075
## Promotion_3-promotion_1 -1.131623  -4.70638  2.443133 0.7373352
## Promotion_3-promotion_2  6.017898   2.61109  9.424706 0.0001141

- There is a statistically significant difference between the means of promotions 1 and 2, since the adjusted p-value < 0.05.

- There is no statistically significant difference between the means of promotions 3 and 1, since the adjusted p-value > 0.05.

- There is a statistically significant difference between the means of promotions 3 and 2, since the adjusted p-value < 0.05.

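Therefore, either promotion 1 or promotion 3 (or both) is recommended: both significantly outperform promotion 2 and do not differ significantly from each other.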

Thank you for your precious time!