Set up environment

# Install pacman only when it is not already available.
# requireNamespace() tests availability without attaching the package as a
# side effect (and without a startup message); p_load() below does the
# attaching.
if (!requireNamespace("pacman", quietly = TRUE)) {
  install.packages("pacman")
}

# Load pacman plus the packages used in this analysis
# (p_load installs any that are missing before attaching them).
pacman::p_load(
  pacman,
  tidyverse, openxlsx, ggthemes
)

Scenario 1: If the differences between groups are not significant

Weekly sales (in hundreds)

# Read the "insignificant" scenario sheet: one column of weekly sales per
# shelf location (Front, Back, Middle).
bookstore <- read.xlsx(
  "datasets/OnewayANOVA.xlsx",
  sheet = "insignificant",
  skipEmptyRows = TRUE
)

head(bookstore)

# Mean weekly sales per location, ignoring missing values.
colMeans(bookstore, na.rm = TRUE)
##  Front   Back Middle 
##      9     14     11

# Reshape to long format: one row per (week, location) pair, with the row
# position in the sheet used as the week number.
books <- pivot_longer(
  mutate(bookstore, week_num = row_number()),
  cols = -week_num,
  names_to = "location",
  values_to = "sales"
)

books

# Use the Wall Street Journal theme for every plot that follows.
theme_set(theme_wsj())

ggplot(books, aes(x = location, y = sales)) +
  geom_boxplot(na.rm = TRUE) +
  labs(title = "Book sales by shelf location")

# Normality check: run a Shapiro-Wilk test on the sales of each location.
with(books, tapply(sales, location, FUN = shapiro.test))
## $Back
## 
##  Shapiro-Wilk normality test
## 
## data:  X[[i]]
## W = 0.94943, p-value = 0.7331
## 
## 
## $Front
## 
##  Shapiro-Wilk normality test
## 
## data:  X[[i]]
## W = 0.83668, p-value = 0.1223
## 
## 
## $Middle
## 
##  Shapiro-Wilk normality test
## 
## data:  X[[i]]
## W = 0.94466, p-value = 0.683

# Equal-variance check: Bartlett test of sales across the locations.
bartlett.test(
  sales ~ location,
  data = books
)
## 
##  Bartlett test of homogeneity of variances
## 
## data:  sales by location
## Bartlett's K-squared = 0.44658, df = 2, p-value = 0.7999

Shapiro-Wilk normality test - all the p-values are well above 0.05, so we do not reject normality for any group and can proceed as if the data are normally distributed.

Bartlett variance test - the p-value is well above 0.05, so we do not reject equal variances and can assume homogeneity of variance.

# One-way ANOVA on sales by location, assuming equal variances across groups.
anova_results <- oneway.test(
  sales ~ location,
  data = books,
  var.equal = TRUE
)
anova_results
## 
##  One-way analysis of means
## 
## data:  sales and location
## F = 0.71498, num df = 2, denom df = 12, p-value = 0.5089

# Compare the p-value with the 5% significance level. Below it, we reject the
# null hypothesis of equal means; otherwise we fail to reject it, i.e. we have
# no evidence that mean sales differ between shelf positions.
print(
  if (anova_results$p.value < 0.05) {
    "Means are different"
  } else {
    "Means are not different"
  }
)
## [1] "Means are not different"

Null hypothesis: Group means are equal. Alternative hypothesis: Group means are not all equal (at least one group mean differs).

One-way analysis of means - the p-value is large, at 0.5089.

INTERPRETATION OF ONE-WAY ANOVA RESULT: The p-value of the test is greater than the significance level alpha = 0.05, so we cannot conclude that sales are significantly different based on shelf location. (The p-value is higher than 5%, so we fail to reject the null hypothesis that the means across groups are equal.) In other words, the data provide no evidence that sales differ across shelf positions, so we proceed as if the group means are equal.

Forecasting for scenario 1: The predicted mean for each group is the overall mean.

- The forecast will be the overall mean of weekly sales, irrespective of shelf location.

# Overall mean of weekly sales across all shelf locations, ignoring missing
# values.
with(books, mean(sales, na.rm = TRUE))
## [1] 11.2

We can expect about 1,120 books to be sold per week (11.2 hundred), regardless of shelf location.

Scenario 2: If at least one group differs significantly

# Read the "significant" scenario sheet. The path is project-relative, the
# same form used when the "insignificant" sheet of this workbook is read,
# so the script does not depend on one user's home-directory layout.
bookstore2 <- read.xlsx(
  "datasets/OnewayANOVA.xlsx",
  skipEmptyRows = TRUE,
  sheet = "significant"
)

head(bookstore2)

# Reshape to long format (one row per week and location) and print it.
(books2 <- bookstore2 %>%
  mutate(week_num = row_number()) %>%
  pivot_longer(
    !week_num,
    names_to = "location",
    values_to = "sales"
  ))

# Boxplot of sales by shelf location for scenario 2.
ggplot(data = books2, aes(x = location, y = sales)) +
  geom_boxplot(na.rm = TRUE) +
  ggtitle("Book sales by shelf location", subtitle = "scenario 2")

# Normality check: run a Shapiro-Wilk test on the sales of each location.
with(books2, tapply(sales, location, FUN = shapiro.test))
## $Back
## 
##  Shapiro-Wilk normality test
## 
## data:  X[[i]]
## W = 0.94971, p-value = 0.7143
## 
## 
## $Front
## 
##  Shapiro-Wilk normality test
## 
## data:  X[[i]]
## W = 0.98676, p-value = 0.9672
## 
## 
## $Middle
## 
##  Shapiro-Wilk normality test
## 
## data:  X[[i]]
## W = 1, p-value = 1

# Equal-variance check: Bartlett test of sales across the locations.
bartlett.test(
  sales ~ location,
  data = books2
)
## 
##  Bartlett test of homogeneity of variances
## 
## data:  sales by location
## Bartlett's K-squared = 0.66029, df = 2, p-value = 0.7188
# One-way ANOVA on sales by location, assuming equal variances across groups.
anova_results2 <- oneway.test(
  sales ~ location,
  data = books2,
  var.equal = TRUE
)
anova_results2
## 
##  One-way analysis of means
## 
## data:  sales and location
## F = 11.386, num df = 2, denom df = 9, p-value = 0.003426

# TRUE means the p-value is below the 5% significance level, so we reject the
# null hypothesis of equal means in favour of the alternative. FALSE means we
# fail to reject it, i.e. no evidence that mean sales differ by shelf position.
anova_results2$p.value < 0.05
## [1] TRUE

INTERPRETATION OF ONE-WAY ANOVA RESULT: The p-value is very small (0.003426), so we reject the null hypothesis and conclude that sales are significantly different across shelf locations.

Forecasting for scenario 2: The predicted mean for each group equals the group mean

# Forecast per location: the mean of that location's observed weekly sales,
# ignoring missing values.
booksales_forecast_signif <- summarize(
  group_by(books2, location),
  mean_sales = mean(sales, na.rm = TRUE)
)
booksales_forecast_signif

We can expect weekly sales of about 900 books when they are located at the front, 1,100 when located in the middle, and 1,400 when located at the back of the book section.