## load/install libraries
.libPaths(c("./Rpackages",.libPaths()))
library(knitr)
library(tidyverse)
library(deSolve)
library(minpack.lm)
library(ggpubr)
library(readxl)
library(gamlss)
library(data.table)
library(grid)
library(png)
library(nlme)
library(gridExtra)
library(mvtnorm)
library(e1071)
library(lattice)
library(ggplot2)
library(dslabs)
library(NHANES)
library(plyr)
library(dplyr)
library(nasaweather)
library(ggplot2)
library(gganimate)
library(av)
library(gifski)
library(foreach)
library("DAAG")
library(DT)
library(TeachingDemos)
library(gridExtra)

One-Way ANOVA

One-way ANOVA is used when we want to compare the mean of a continuous outcome across more than two groups.

Hypotheses

For a one-way ANOVA with \(k\) groups:

  • \(H_0\): all group means are equal
  • \(H_A\): at least one group mean differs

General workflow

  1. Visualize the outcome by group.
  2. Check assumptions:
    • independence
    • approximate normality within groups / residuals
    • roughly equal variances
  3. Fit the ANOVA model.
  4. Interpret the F-test and p-value.

Example 1: Iris data

data(iris)

str(iris)
## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
table(iris$Species)
## 
##     setosa versicolor  virginica 
##         50         50         50

Side-by-side boxplots

boxplot(Sepal.Length ~ Species,
        data = iris,
        col = c("gray85", "gray70", "gray55"),
        main = "Sepal Length by Species",
        xlab = "Species",
        ylab = "Sepal Length")

Group means

aggregate(Sepal.Length ~ Species, data = iris, mean)
##      Species Sepal.Length
## 1     setosa        5.006
## 2 versicolor        5.936
## 3  virginica        6.588

Fit one-way ANOVA

fit_iris <- aov(Sepal.Length ~ Species, data = iris)
summary(fit_iris)
##              Df Sum Sq Mean Sq F value Pr(>F)    
## Species       2  63.21  31.606   119.3 <2e-16 ***
## Residuals   147  38.96   0.265                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Diagnostic checks

par(mfrow = c(1, 2))
plot(fitted(fit_iris), resid(fit_iris),
     xlab = "Fitted values",
     ylab = "Residuals",
     main = "Residuals vs Fitted")
abline(h = 0, lty = 2, col = "red")

qqnorm(resid(fit_iris), main = "Normal Q-Q Plot")
qqline(resid(fit_iris), col = "blue", lwd = 2)

par(mfrow = c(1, 1))

Variability by group

tapply(iris$Sepal.Length, iris$Species, sd)
##     setosa versicolor  virginica 
##  0.3524897  0.5161711  0.6358796

Two-Way ANOVA

Two-way ANOVA is used when a continuous outcome is explained by two categorical predictors.
We usually examine:

  • the main effect of factor 1
  • the main effect of factor 2
  • the interaction effect between the two factors

Hypotheses

For factors \(G_1\) and \(G_2\):

  • \(H_0\): \(G_1\) has no main effect
  • \(H_0\): \(G_2\) has no main effect
  • \(H_0\): there is no interaction between \(G_1\) and \(G_2\)

Example: warpbreaks

data(warpbreaks)

str(warpbreaks)
## 'data.frame':    54 obs. of  3 variables:
##  $ breaks : num  26 30 54 25 70 52 51 26 67 18 ...
##  $ wool   : Factor w/ 2 levels "A","B": 1 1 1 1 1 1 1 1 1 1 ...
##  $ tension: Factor w/ 3 levels "L","M","H": 1 1 1 1 1 1 1 1 1 2 ...
table(warpbreaks$wool, warpbreaks$tension)
##    
##     L M H
##   A 9 9 9
##   B 9 9 9

Means by wool and tension

wb_means <- aggregate(warpbreaks$breaks,
                      by = list(wool = warpbreaks$wool,
                                tension = warpbreaks$tension),
                      FUN = mean)
wb_means
##   wool tension        x
## 1    A       L 44.55556
## 2    B       L 28.22222
## 3    A       M 24.00000
## 4    B       M 28.77778
## 5    A       H 24.55556
## 6    B       H 18.77778

One-way ANOVA on each factor separately

summary(aov(breaks ~ wool, data = warpbreaks))
##             Df Sum Sq Mean Sq F value Pr(>F)
## wool         1    451   450.7   2.668  0.108
## Residuals   52   8782   168.9
summary(aov(breaks ~ tension, data = warpbreaks))
##             Df Sum Sq Mean Sq F value  Pr(>F)   
## tension      2   2034  1017.1   7.206 0.00175 **
## Residuals   51   7199   141.1                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Full two-way ANOVA with interaction

fit_int <- aov(breaks ~ wool * tension, data = warpbreaks)
summary(fit_int)
##              Df Sum Sq Mean Sq F value   Pr(>F)    
## wool          1    451   450.7   3.765 0.058213 .  
## tension       2   2034  1017.1   8.498 0.000693 ***
## wool:tension  2   1003   501.4   4.189 0.021044 *  
## Residuals    48   5745   119.7                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Interaction plot

interaction.plot(x.factor = warpbreaks$tension,
                 trace.factor = warpbreaks$wool,
                 response = warpbreaks$breaks,
                 xlab = "Tension",
                 ylab = "Mean number of breaks",
                 trace.label = "Wool",
                 col = c("black", "gray40"),
                 lwd = 2)

Kruskal-Wallis Test

The Kruskal-Wallis test is a nonparametric alternative to one-way ANOVA.
It is useful when normality is doubtful and you want to compare several groups.

Hypotheses

  • \(H_0\): group distributions / medians are equal
  • \(H_A\): at least one group differs

Example: survey data from MASS

library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:DAAG':
## 
##     hills
## The following object is masked from 'package:dplyr':
## 
##     select
data(survey)

str(survey)
## 'data.frame':    237 obs. of  12 variables:
##  $ Sex   : Factor w/ 2 levels "Female","Male": 1 2 2 2 2 1 2 1 2 2 ...
##  $ Wr.Hnd: num  18.5 19.5 18 18.8 20 18 17.7 17 20 18.5 ...
##  $ NW.Hnd: num  18 20.5 13.3 18.9 20 17.7 17.7 17.3 19.5 18.5 ...
##  $ W.Hnd : Factor w/ 2 levels "Left","Right": 2 1 2 2 2 2 2 2 2 2 ...
##  $ Fold  : Factor w/ 3 levels "L on R","Neither",..: 3 3 1 3 2 1 1 3 3 3 ...
##  $ Pulse : int  92 104 87 NA 35 64 83 74 72 90 ...
##  $ Clap  : Factor w/ 3 levels "Left","Neither",..: 1 1 2 2 3 3 3 3 3 3 ...
##  $ Exer  : Factor w/ 3 levels "Freq","None",..: 3 2 2 2 3 3 1 1 3 3 ...
##  $ Smoke : Factor w/ 4 levels "Heavy","Never",..: 2 4 3 2 2 2 2 2 2 2 ...
##  $ Height: num  173 178 NA 160 165 ...
##  $ M.I   : Factor w/ 2 levels "Imperial","Metric": 2 1 NA 2 2 1 1 2 2 2 ...
##  $ Age   : num  18.2 17.6 16.9 20.3 23.7 ...
table(survey$Smoke, useNA = "ifany")
## 
## Heavy Never Occas Regul  <NA> 
##    11   189    19    17     1

Kruskal-Wallis test

kruskal.test(Age ~ Smoke, data = survey)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  Age by Smoke
## Kruskal-Wallis chi-squared = 3.9262, df = 3, p-value = 0.2695