Understanding ANOVA

Olesya Volchenko

It can be hard to imagine what ‘variances’ mean in an ANOVA.

Here is an example.

We are going to simulate 4 different datasets, varying the distance between the group means and the spread within each group, to see how the ANOVA results change.

Dataset 1

# Dataset 1: conditions A, B and C with 200 normal draws each.
# Group means are -3, 0 and 3; sd is left at rnorm()'s default of 1.
# The groups are drawn one after another in the order A, B, C.
dat1 <- data.frame(
  cond = gl(3, 200, labels = c("A", "B", "C")),
  rating = unlist(lapply(c(-3, 0, 3), function(mu) rnorm(200, mean = mu)))
)

Plot 1

mean = -3, 0, 3; sd = 1; normal distribution

# Overlaid, semi-transparent density curves of rating, one per condition.
# The axes are fixed (x: -15 to 15, y: 0 to 0.5); every plot in this
# document uses these same limits.
p1 <- ggplot(data = dat1, mapping = aes(x = rating, fill = cond)) +
  geom_density(alpha = 0.3) +
  xlim(-15, 15) +
  ylim(0, 0.5)
p1

Dataset 2

# Dataset 2: conditions A, B and C with 200 normal draws each.
# Group means are -1, 0 and 1; sd is left at rnorm()'s default of 1.
# The groups are drawn one after another in the order A, B, C.
dat2 <- data.frame(
  cond = gl(3, 200, labels = c("A", "B", "C")),
  rating = unlist(lapply(c(-1, 0, 1), function(mu) rnorm(200, mean = mu)))
)

Plot 2

mean = -1, 0, 1; sd = 1; normal distribution

# Overlaid, semi-transparent density curves of rating, one per condition.
# The axes are fixed (x: -15 to 15, y: 0 to 0.5); every plot in this
# document uses these same limits.
p2 <- ggplot(data = dat2, mapping = aes(x = rating, fill = cond)) +
  geom_density(alpha = 0.3) +
  xlim(-15, 15) +
  ylim(0, 0.5)
p2

Dataset 3:

# Dataset 3: conditions A, B and C with 200 normal draws each.
# Group means are -3, 0 and 3, each with sd = 3.
# The groups are drawn one after another in the order A, B, C.
dat3 <- data.frame(
  cond = gl(3, 200, labels = c("A", "B", "C")),
  rating = unlist(
    lapply(c(-3, 0, 3), function(mu) rnorm(200, mean = mu, sd = 3))
  )
)

Plot 3

mean = -3, 0, 3; sd = 3; normal distribution

# Overlaid, semi-transparent density curves of rating, one per condition.
# The axes are fixed (x: -15 to 15, y: 0 to 0.5); every plot in this
# document uses these same limits.
p3 <- ggplot(data = dat3, mapping = aes(x = rating, fill = cond)) +
  geom_density(alpha = 0.3) +
  xlim(-15, 15) +
  ylim(0, 0.5)
p3

Dataset 4:

# Dataset 4: conditions A, B and C with 200 normal draws each.
# Group means are -1, 0 and 1, each with sd = 3.
# The groups are drawn one after another in the order A, B, C.
dat4 <- data.frame(
  cond = gl(3, 200, labels = c("A", "B", "C")),
  rating = unlist(
    lapply(c(-1, 0, 1), function(mu) rnorm(200, mean = mu, sd = 3))
  )
)

Plot 4

mean = -1, 0, 1; sd = 3; normal distribution

# Overlaid, semi-transparent density curves of rating, one per condition.
# The axes are fixed (x: -15 to 15, y: 0 to 0.5); every plot in this
# document uses these same limits.
p4 <- ggplot(data = dat4, mapping = aes(x = rating, fill = cond)) +
  geom_density(alpha = 0.3) +
  xlim(-15, 15) +
  ylim(0, 0.5)
p4

All the plots together

Compare one-way ANOVAs for the 4 datasets

# One-way ANOVA of rating by condition, run separately on each dataset.
# var.equal = TRUE requests the classical F test that assumes equal group
# variances; oneway.test() would otherwise apply the Welch correction.
# TRUE is spelled out because T is an ordinary variable that can be
# reassigned, which would silently change the test being run.
# The dat$column formula form is kept so that the "data:" line of each
# printed result still names the dataset it came from.
anova1 <- oneway.test(dat1$rating ~ dat1$cond, var.equal = TRUE)
anova2 <- oneway.test(dat2$rating ~ dat2$cond, var.equal = TRUE)
anova3 <- oneway.test(dat3$rating ~ dat3$cond, var.equal = TRUE)
anova4 <- oneway.test(dat4$rating ~ dat4$cond, var.equal = TRUE)

# Print the four results one after another.
anova1
anova2
anova3
anova4
## 
##  One-way analysis of means
## 
## data:  dat1$rating and dat1$cond
## F = 1983.3, num df = 2, denom df = 597, p-value < 2.2e-16
## 
##  One-way analysis of means
## 
## data:  dat2$rating and dat2$cond
## F = 211.55, num df = 2, denom df = 597, p-value < 2.2e-16
## 
##  One-way analysis of means
## 
## data:  dat3$rating and dat3$cond
## F = 199.11, num df = 2, denom df = 597, p-value < 2.2e-16
## 
##  One-way analysis of means
## 
## data:  dat4$rating and dat4$cond
## F = 24.895, num df = 2, denom df = 597, p-value = 4.126e-11

Summary table:

data1 data2 data3 data4
group mean 1 -3.000 -1.0000 -3.0000 -1.00000
group mean 2 0.000 0.0000 0.0000 0.00000
group mean 3 3.000 1.0000 3.0000 1.00000
SD 1.000 1.0000 3.0000 3.00000
F-statistic 1983.288 211.5492 199.1143 24.89488
omega-squared 0.853 0.3710 0.3830 0.10200
df num 2.000 2.0000 2.0000 2.00000
df denom 597.000 597.0000 597.0000 597.00000
p-value 0.000 0.0000 0.0000 0.00000