Sameer Mathur
Two-way ANOVA
---
# Load the advertising data set (CSV expected in the working directory).
resGrades.df <- read.csv(file = "AdvertisingDataV2.csv")
# Expose the columns on the search path; later chunks refer to them by bare name.
attach(resGrades.df)
# Number of rows and columns in the data frame.
dim(resGrades.df)
[1] 30000 6
# Cross-tabulate ad type against restaurant type, then append the
# row and column totals.
adByRestaurant <- table(adType, restaurantType)
addmargins(adByRestaurant)
restaurantType
adType chain independent Sum
Curr Ads 4000 6000 10000
New Ads 4000 6000 10000
No Ads 4000 6000 10000
Sum 12000 18000 30000
# Descriptive statistics of reservations for each ad type:
# group size, mean, standard deviation, median, minimum and maximum.
library(data.table)
dt <- data.table(resGrades.df)
dt[, list(Count = .N,
          mean = round(mean(reservations), 3),
          # Standard deviation must come from sd(); the earlier version
          # called mean() here, so the sd column simply repeated the mean.
          sd = round(sd(reservations), 3),
          median = round(median(reservations), 3),
          min = min(reservations),
          max = max(reservations)),
   by = list(adType)]
adType Count mean sd median min max
1: No Ads 10000 33.960 33.960 33 15 58
2: Curr Ads 10000 34.021 34.021 33 18 59
3: New Ads 10000 41.681 41.681 40 18 79
# Descriptive statistics of reservations for each restaurant type:
# group size, mean, standard deviation, median, minimum and maximum.
library(data.table)
dt <- data.table(resGrades.df)
dt[, list(Count = .N,
          mean = round(mean(reservations), 3),
          # Standard deviation must come from sd(); the earlier version
          # called mean() here, so the sd column simply repeated the mean.
          sd = round(sd(reservations), 3),
          median = round(median(reservations), 3),
          min = min(reservations),
          max = max(reservations)),
   by = list(restaurantType)]
restaurantType Count mean sd median min max
1: chain 12000 42.676 42.676 42 18 79
2: independent 18000 32.473 32.473 32 15 51
# Box plot of reservations, one box per restaurant type.
boxplot(reservations ~ restaurantType,
        data = resGrades.df,
        xlab = "Restaurant Type",
        ylab = "Number of Reservations",
        main = "Boxplot of Restaurant Type")
# Lattice box-and-whisker plot: reservations by ad type, with one panel
# per restaurant type.
library(lattice)
bwplot(reservations ~ adType | restaurantType,
       data = resGrades.df,
       col = "black",
       ylab = "Number of Reservations",
       main = "Boxplot of Reservations by Ad Type and Restaurant Type")
# Plot of group means with confidence intervals, by restaurant type.
library(gplots)
# `legends` is the vector of x-axis group labels and defaults to the group
# names. The earlier call passed legends = TRUE, which replaces those names
# on the axis, so the argument is left at its default here.
plotmeans(reservations ~ restaurantType, data = resGrades.df,
          xlab = "Restaurant Type", ylab = "Number of Reservations",
          digits = 2, col = "black", ccol = "blue", barwidth = 2,
          mean.labels = TRUE, frame = TRUE)
# Interaction plot: mean reservations for each ad type (x-axis), with one
# trace per restaurant type.
# with() resolves the columns inside resGrades.df instead of relying on
# whatever happens to be attached to the search path.
with(resGrades.df,
     interaction.plot(x.factor = adType,
                      trace.factor = restaurantType,
                      response = reservations,
                      type = "b", col = c(1:3), leg.bty = "o",
                      leg.bg = "beige", lwd = 2, pch = c(18, 24, 22),
                      # adType is the x factor, so the x-axis is labelled
                      # "Ad Type" (it was mislabelled "Restaurant type").
                      xlab = "Ad Type", ylab = "Number of Reservations",
                      trace.label = "Restaurant Type",
                      main = "Interaction Plot of Ad Type and Restaurant Type"))
# Two-way ANOVA of reservations on ad type, restaurant type and their
# interaction (the `*` operator expands to both main effects plus the
# interaction term).
twoWayfit <- aov(
  formula = reservations ~ adType * restaurantType,
  data = resGrades.df
)
# ANOVA table for the fitted model.
summary(twoWayfit)
Df Sum Sq Mean Sq F value Pr(>F)
adType 2 394228 197114 7658.285 < 2e-16 ***
restaurantType 1 749570 749570 29122.292 < 2e-16 ***
adType:restaurantType 2 442 221 8.594 0.000186 ***
Residuals 29994 772006 26
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Normal Q-Q plot of the model residuals (diagnostic plot number 2).
plot(twoWayfit, which = 2)
# Anderson-Darling test of normality, applied to the raw reservations column.
# NOTE(review): the ANOVA normality assumption concerns the residuals;
# consider testing residuals(twoWayfit) instead - confirm intent.
library(nortest)
ad.test(x = reservations)
Anderson-Darling normality test
data: reservations
A = 222.08, p-value < 2.2e-16
Null hypothesis (\( H_0 \)): The data are normally distributed.
Because the p-value (< 2.2e-16) is far below 0.05, the test rejects the null hypothesis. So, we cannot assume the normality of the data.
# Residuals-versus-fitted plot for the model (diagnostic plot number 1).
plot(twoWayfit, which = 1)
# Levene's test for equal variances across the adType x restaurantType cells.
library(car)
leveneTest(
  reservations ~ adType * restaurantType,
  data = resGrades.df
)
Levene's Test for Homogeneity of Variance (center = median)
Df F value Pr(>F)
group 5 974.04 < 2.2e-16 ***
29994
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Null hypothesis (\( H_0 \)): The variance of the number of reservations is homogeneous across groups.
Because the p-value (< 2.2e-16) is far below 0.05, we reject the null hypothesis. So, we cannot assume the homogeneity of variance between the groups.
We apply a log transformation to the dependent variable (number of reservations) and check normality and homogeneity of variance again.
# Add the natural log of reservations as a new column.
resGrades.df$logReservations <- log(resGrades.df$reservations)
# Refresh the attached copy so that logReservations is visible by bare name.
# The stale copy is detached first: attaching the data frame a second time
# without detaching stacks a duplicate on the search path and masks every
# column of the earlier copy.
detach(resGrades.df)
attach(resGrades.df)
# First few rows of the data frame, now including logReservations.
head(resGrades.df)
adType pageViews phoneCalls reservations businessID restaurantType
1 No Ads 643 44 39 1 chain
2 No Ads 621 41 44 2 chain
3 No Ads 581 40 38 3 chain
4 No Ads 592 35 31 4 chain
5 No Ads 648 45 46 5 chain
6 No Ads 519 37 41 6 chain
logReservations
1 3.663562
2 3.784190
3 3.637586
4 3.433987
5 3.828641
6 3.713572
# Two-way ANOVA on the log scale: log(reservations) on ad type, restaurant
# type and their interaction.
twoWayTransfit <- aov(
  formula = logReservations ~ adType * restaurantType,
  data = resGrades.df
)
# ANOVA table for the log-scale model.
summary(twoWayTransfit)
Df Sum Sq Mean Sq F value Pr(>F)
adType 2 278.1 139.1 7588.0 <2e-16 ***
restaurantType 1 530.9 530.9 28967.7 <2e-16 ***
adType:restaurantType 2 4.3 2.2 118.2 <2e-16 ***
Residuals 29994 549.7 0.0
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Since the p-values for adType and restaurantType are below 0.05, we conclude that the number of reservations differs significantly between ad types as well as between restaurant types.
The interaction between adType and restaurantType is also significant.
Before Log-transformation
# Normal Q-Q plot of residuals from the untransformed model.
plot(twoWayfit, which = 2)
After Log-transformation
# Normal Q-Q plot of residuals from the log-scale model.
plot(twoWayTransfit, which = 2)
Before Log-transformation
# Anderson-Darling normality test on the untransformed reservations.
library(nortest)
ad.test(x = reservations)
Anderson-Darling normality test
data: reservations
A = 222.08, p-value < 2.2e-16
After Log-transformation
# Anderson-Darling normality test on the log-transformed reservations.
library(nortest)
ad.test(x = logReservations)
Anderson-Darling normality test
data: logReservations
A = 36.301, p-value < 2.2e-16
Before Log-transformations
# Residuals-versus-fitted plot for the untransformed model.
plot(twoWayfit, which = 1)
After Log-transformations
# Residuals-versus-fitted plot for the log-scale model.
plot(twoWayTransfit, which = 1)
Before Log-transformation
# Levene's test for homogeneity of variance BEFORE the log transformation
# (the response here is the untransformed reservations column).
library(car)
leveneTest(
  reservations ~ adType * restaurantType,
  data = resGrades.df
)
Levene's Test for Homogeneity of Variance (center = median)
Df F value Pr(>F)
group 5 974.04 < 2.2e-16 ***
29994
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
After Log-transformation
# Levene's test for homogeneity of variance on the log-transformed response.
library(car)
leveneTest(
  logReservations ~ adType * restaurantType,
  data = resGrades.df
)
Levene's Test for Homogeneity of Variance (center = median)
Df F value Pr(>F)
group 5 249.59 < 2.2e-16 ***
29994
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
After the log transformation of the dependent variable (number of reservations), the homogeneity problem remains, i.e. the variances are still not equal between the groups.
The same holds for the normality assumption: we reject the null hypothesis, so we cannot assume normality of the data even after the transformation.
# Tukey honest-significant-difference comparisons for every term of the
# log-scale model, at the default 95% family-wise confidence level.
TukeyHSD(twoWayTransfit, conf.level = 0.95)
Tukey multiple comparisons of means
95% family-wise confidence level
Fit: aov(formula = logReservations ~ adType * restaurantType, data = resGrades.df)
$adType
diff lwr upr p adj
New Ads-Curr Ads 0.202877282 0.198390382 0.207364182 0.0000000
No Ads-Curr Ads -0.002709695 -0.007196595 0.001777205 0.3328927
No Ads-New Ads -0.205586978 -0.210073878 -0.201100078 0.0000000
$restaurantType
diff lwr upr p adj
independent-chain -0.2715314 -0.2746583 -0.2684046 0
$`adType:restaurantType`
diff lwr
New Ads:chain-Curr Ads:chain 0.170776091 0.162149996
No Ads:chain-Curr Ads:chain -0.004558874 -0.013184969
Curr Ads:independent-Curr Ads:chain -0.290392755 -0.298267266
New Ads:independent-Curr Ads:chain -0.066114678 -0.073989190
No Ads:independent-Curr Ads:chain -0.291869665 -0.299744176
No Ads:chain-New Ads:chain -0.175334964 -0.183961060
Curr Ads:independent-New Ads:chain -0.461168846 -0.469043357
New Ads:independent-New Ads:chain -0.236890769 -0.244765280
No Ads:independent-New Ads:chain -0.462645755 -0.470520267
Curr Ads:independent-No Ads:chain -0.285833881 -0.293708393
New Ads:independent-No Ads:chain -0.061555804 -0.069430316
No Ads:independent-No Ads:chain -0.287310791 -0.295185302
New Ads:independent-Curr Ads:independent 0.224278077 0.217234900
No Ads:independent-Curr Ads:independent -0.001476910 -0.008520087
No Ads:independent-New Ads:independent -0.225754986 -0.232798164
upr p adj
New Ads:chain-Curr Ads:chain 0.179402186 0.0000000
No Ads:chain-Curr Ads:chain 0.004067221 0.6603760
Curr Ads:independent-Curr Ads:chain -0.282518243 0.0000000
New Ads:independent-Curr Ads:chain -0.058240167 0.0000000
No Ads:independent-Curr Ads:chain -0.283995153 0.0000000
No Ads:chain-New Ads:chain -0.166708869 0.0000000
Curr Ads:independent-New Ads:chain -0.453294334 0.0000000
New Ads:independent-New Ads:chain -0.229016257 0.0000000
No Ads:independent-New Ads:chain -0.454771244 0.0000000
Curr Ads:independent-No Ads:chain -0.277959370 0.0000000
New Ads:independent-No Ads:chain -0.053681293 0.0000000
No Ads:independent-No Ads:chain -0.279436279 0.0000000
New Ads:independent-Curr Ads:independent 0.231321254 0.0000000
No Ads:independent-Curr Ads:independent 0.005566268 0.9912292
No Ads:independent-New Ads:independent -0.218711809 0.0000000
# Tukey pairwise-comparison plots, one per model term. `which` selects the
# term by position: 1 = adType, 2 = restaurantType, 3 = their interaction.
# Differences in mean levels of adType
plot(TukeyHSD(twoWayTransfit, which = 1))
# Differences in mean levels of restaurant type
plot(TukeyHSD(twoWayTransfit, which = 2))
# Differences in mean levels of the adType:restaurantType cells
plot(TukeyHSD(twoWayTransfit, which = 3))
A non-parametric alternative to ANOVA is Kruskal-Wallis rank sum test, which can be used when ANOVA assumptions are not met.
# Kruskal-Wallis rank sum test across the six adType x restaurantType cells.
kruskal.test(
  logReservations ~ interaction(adType, restaurantType),
  data = resGrades.df
)
Kruskal-Wallis rank sum test
data: logReservations by interaction(adType, restaurantType)
Kruskal-Wallis chi-squared = 18667, df = 5, p-value < 2.2e-16
# Pairwise t tests with non-pooled SD between all adType x restaurantType
# cells, with Benjamini-Hochberg p-value adjustment.
# pairwise.t.test() takes vectors and has no `data` argument (the one passed
# before was silently swallowed by `...`), so the columns are supplied
# explicitly through with() rather than via attach().
with(resGrades.df,
     pairwise.t.test(logReservations, interaction(adType, restaurantType),
                     p.adjust.method = "BH", pool.sd = FALSE))
Pairwise comparisons using t tests with non-pooled SD
data: logReservations and interaction(adType, restaurantType)
Curr Ads.chain New Ads.chain No Ads.chain
New Ads.chain <2e-16 - -
No Ads.chain 0.12 <2e-16 -
Curr Ads.independent <2e-16 <2e-16 <2e-16
New Ads.independent <2e-16 <2e-16 <2e-16
No Ads.independent <2e-16 <2e-16 <2e-16
Curr Ads.independent New Ads.independent
New Ads.chain - -
No Ads.chain - -
Curr Ads.independent - -
New Ads.independent <2e-16 -
No Ads.independent 0.53 <2e-16
P value adjustment method: BH
The classical ANOVA test requires the assumption of equal variances across all groups. In our example, the homogeneity-of-variance assumption was violated. Hence, we use the Welch one-way test, which does not assume equal variances.
# Welch one-way test across the six adType x restaurantType cells;
# var.equal = FALSE (the default) means equal variances are not assumed.
oneway.test(
  formula = logReservations ~ interaction(adType, restaurantType),
  data = resGrades.df,
  var.equal = FALSE
)
One-way analysis of means (not assuming equal variances)
data: logReservations and interaction(adType, restaurantType)
F = 7794, num df = 5, denom df = 12999, p-value < 2.2e-16