# Install pacman if needed
if (!require("pacman")) install.packages("pacman")
## Loading required package: pacman
# load packages
pacman::p_load(pacman,
tidyverse, openxlsx, ggpubr)
#Dataset is in subfolder
(games <- read.xlsx("datasets/games.xlsx", sheet = 3))
(games_long <- games %>%
pivot_longer(cols = starts_with("Price"), #columns to pivot to rows
names_to = "price", #name the new column for the price variable
values_to = "sales")) #the new sales column
games_long <- games_long %>%
mutate(advertising = factor(Advertising, levels = c("Low", "Medium", "High")),
price = factor(price, levels = c("PriceLow", "PriceMedium", "PriceHigh"))) %>%
select(-Advertising)
str(games_long)
## tibble [27 × 3] (S3: tbl_df/tbl/data.frame)
## $ price : Factor w/ 3 levels "PriceLow","PriceMedium",..: 1 2 3 1 2 3 1 2 3 1 ...
## $ sales : num [1:27] 41 21 15 25 20 14 23 16 13 28 ...
## $ advertising: Factor w/ 3 levels "Low","Medium",..: 1 1 1 1 1 1 1 1 1 2 ...
#Plot using ggpubr
ggline(games_long, x = "advertising", y = "sales",
add = c("mean_se", "jitter"),
color = "price", palette = "jco",
title = "Sales increase when ad spending increases",
subtitle = "(but not when prices are high)",
legend.title = "price levels"
)
The aov function models each combination of advertising and price on sales.
The f-values and p-values suggests that we reject the null hypothesis for advertising, price, and the interaction between advertising and price.
two_way_aov <- aov(sales ~ advertising*price, data = games_long)
#The anova table
summary(two_way_aov)
## Df Sum Sq Mean Sq F value Pr(>F)
## advertising 2 829.0 414.5 24.22 7.86e-06 ***
## price 2 2498.7 1249.4 73.02 2.31e-09 ***
## advertising:price 4 509.9 127.5 7.45 0.00101 **
## Residuals 18 308.0 17.1
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
How do we use this information from the aov table to forecast?
Since the p-value for the interaction (advertising*price) is very small, we can forecast sales for any price and advertising combination equal to the mean of the observations.
#histogram with kernel density line
#hist(two_way_aov$residuals, prob = TRUE)
#lines(density(two_way_aov$residuals))
#However, when you have a small dataset it's better to visualize the residuals using qqplot.
qqnorm(two_way_aov$residuals)
qqline(two_way_aov$residuals)
#We have a few points quite far away from the line
(forecast_tab <- games_long %>%
group_by(advertising, price) %>%
summarize(forecast_sales = round(mean(sales),2),
std_dev = round(sd(sales),2)))
## `summarise()` has grouped output by 'advertising'. You can override using the `.groups` argument.
Because the p-value for the interaction is low, we have a significant interaction between advertising and price. Forecast if advertising is high and price is medium is 35.33
A key insight for marketing stakeholders, would be to cut back on advertising when the price is high.
Two-way ANOVA (with replication), if interaction effect is significant, then the predicted value is the value of the response variable (y) is equal to the mean of all observations having that combination of factor levels.