## load/install libraries
.libPaths(c("./Rpackages",.libPaths()))
library(knitr)
library(tidyverse)
library(deSolve)
library(minpack.lm)
library(ggpubr)
library(readxl)
library(gamlss)
library(data.table)
library(grid)
library(png)
library(nlme)
library(gridExtra)
library(mvtnorm)
library(e1071)
library(lattice)
library(ggplot2)
library(dslabs)
library(NHANES)
library(plyr)
library(dplyr)
library(nasaweather)
library(ggplot2)
library(gganimate)
library(av)
library(gifski)
library(foreach)
library("DAAG")
library(DT)
library(TeachingDemos)
library(gridExtra)
One-way ANOVA is used when we want to compare the mean of a continuous outcome across more than two groups.
For a one-way ANOVA with \(k\) groups:
data(iris)
str(iris)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
table(iris$Species)
##
## setosa versicolor virginica
## 50 50 50
boxplot(Sepal.Length ~ Species,
data = iris,
col = c("gray85", "gray70", "gray55"),
main = "Sepal Length by Species",
xlab = "Species",
ylab = "Sepal Length")
aggregate(Sepal.Length ~ Species, data = iris, mean)
## Species Sepal.Length
## 1 setosa 5.006
## 2 versicolor 5.936
## 3 virginica 6.588
fit_iris <- aov(Sepal.Length ~ Species, data = iris)
summary(fit_iris)
## Df Sum Sq Mean Sq F value Pr(>F)
## Species 2 63.21 31.606 119.3 <2e-16 ***
## Residuals 147 38.96 0.265
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
par(mfrow = c(1, 2))
plot(fitted(fit_iris), resid(fit_iris),
xlab = "Fitted values",
ylab = "Residuals",
main = "Residuals vs Fitted")
abline(h = 0, lty = 2, col = "red")
qqnorm(resid(fit_iris), main = "Normal Q-Q Plot")
qqline(resid(fit_iris), col = "blue", lwd = 2)
par(mfrow = c(1, 1))
tapply(iris$Sepal.Length, iris$Species, sd)
## setosa versicolor virginica
## 0.3524897 0.5161711 0.6358796
Two-way ANOVA is used when a continuous outcome is explained by two categorical predictors.
We usually examine:
For factors \(G_1\) and \(G_2\):
warpbreaksdata(warpbreaks)
str(warpbreaks)
## 'data.frame': 54 obs. of 3 variables:
## $ breaks : num 26 30 54 25 70 52 51 26 67 18 ...
## $ wool : Factor w/ 2 levels "A","B": 1 1 1 1 1 1 1 1 1 1 ...
## $ tension: Factor w/ 3 levels "L","M","H": 1 1 1 1 1 1 1 1 1 2 ...
table(warpbreaks$wool, warpbreaks$tension)
##
## L M H
## A 9 9 9
## B 9 9 9
wb_means <- aggregate(warpbreaks$breaks,
by = list(wool = warpbreaks$wool,
tension = warpbreaks$tension),
FUN = mean)
wb_means
## wool tension x
## 1 A L 44.55556
## 2 B L 28.22222
## 3 A M 24.00000
## 4 B M 28.77778
## 5 A H 24.55556
## 6 B H 18.77778
summary(aov(breaks ~ wool, data = warpbreaks))
## Df Sum Sq Mean Sq F value Pr(>F)
## wool 1 451 450.7 2.668 0.108
## Residuals 52 8782 168.9
summary(aov(breaks ~ tension, data = warpbreaks))
## Df Sum Sq Mean Sq F value Pr(>F)
## tension 2 2034 1017.1 7.206 0.00175 **
## Residuals 51 7199 141.1
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
fit_int <- aov(breaks ~ wool * tension, data = warpbreaks)
summary(fit_int)
## Df Sum Sq Mean Sq F value Pr(>F)
## wool 1 451 450.7 3.765 0.058213 .
## tension 2 2034 1017.1 8.498 0.000693 ***
## wool:tension 2 1003 501.4 4.189 0.021044 *
## Residuals 48 5745 119.7
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
interaction.plot(x.factor = warpbreaks$tension,
trace.factor = warpbreaks$wool,
response = warpbreaks$breaks,
xlab = "Tension",
ylab = "Mean number of breaks",
trace.label = "Wool",
col = c("black", "gray40"),
lwd = 2)
The Kruskal-Wallis test is a nonparametric alternative to one-way ANOVA.
It is useful when normality is doubtful and you want to compare several groups.
survey data from MASSlibrary(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:DAAG':
##
## hills
## The following object is masked from 'package:dplyr':
##
## select
data(survey)
str(survey)
## 'data.frame': 237 obs. of 12 variables:
## $ Sex : Factor w/ 2 levels "Female","Male": 1 2 2 2 2 1 2 1 2 2 ...
## $ Wr.Hnd: num 18.5 19.5 18 18.8 20 18 17.7 17 20 18.5 ...
## $ NW.Hnd: num 18 20.5 13.3 18.9 20 17.7 17.7 17.3 19.5 18.5 ...
## $ W.Hnd : Factor w/ 2 levels "Left","Right": 2 1 2 2 2 2 2 2 2 2 ...
## $ Fold : Factor w/ 3 levels "L on R","Neither",..: 3 3 1 3 2 1 1 3 3 3 ...
## $ Pulse : int 92 104 87 NA 35 64 83 74 72 90 ...
## $ Clap : Factor w/ 3 levels "Left","Neither",..: 1 1 2 2 3 3 3 3 3 3 ...
## $ Exer : Factor w/ 3 levels "Freq","None",..: 3 2 2 2 3 3 1 1 3 3 ...
## $ Smoke : Factor w/ 4 levels "Heavy","Never",..: 2 4 3 2 2 2 2 2 2 2 ...
## $ Height: num 173 178 NA 160 165 ...
## $ M.I : Factor w/ 2 levels "Imperial","Metric": 2 1 NA 2 2 1 1 2 2 2 ...
## $ Age : num 18.2 17.6 16.9 20.3 23.7 ...
table(survey$Smoke, useNA = "ifany")
##
## Heavy Never Occas Regul <NA>
## 11 189 19 17 1
kruskal.test(Age ~ Smoke, data = survey)
##
## Kruskal-Wallis rank sum test
##
## data: Age by Smoke
## Kruskal-Wallis chi-squared = 3.9262, df = 3, p-value = 0.2695