# ************************************************************************
# MultipleComparisonsWithGLS.R
# ************************************************************************
# Brief description: This script explores multiple comparisons using a
# generalized least squares regression model with a multi-level factor
# variable. Packages required: nlme, multcomp, MASS Keywords (less than
# 10): Multiple comparisons, gls, regression, factor, multcomp, generalized
# least squares EPA Contact Information Name: Phone Number: E-mail Address:
# Is data available upon request? Yes/No Date script was written: Feb. 13,
# 2014
#### Clean up workspace
rm(list = ls())
#### Include packages.
library(nlme)
library(multcomp)
## Warning: package 'multcomp' was built under R version 3.0.2
## Loading required package: mvtnorm
## Loading required package: TH.data
## Warning: package 'TH.data' was built under R version 3.0.2
library(MASS)
#### Setup some fake data.
set.seed(1111)
site <- rep(c("A", "B", "C", "D", "E", "F"), each = 20) # Specify site levels
siteMns <- c(A = 10, B = 20, C = 35, D = 20, E = 15, F = 20) # Declare true site means.
resp <- as.vector(mvrnorm(20, mu = siteMns, Sigma = diag(c(2, 4, 6, 4, 2, 4)))) # Grab random values by site.
error <- rnorm(length(site), 0, 0.5) # Introduce some more error
dat <- data.frame(y = resp + error, site = as.factor(site)) # Make a data frame
aggregate(y ~ site, dat, summary)
## site y.Min. y.1st Qu. y.Median y.Mean y.3rd Qu. y.Max.
## 1 A 7.79 9.80 10.90 10.90 11.70 13.90
## 2 B 16.10 18.10 19.20 19.40 20.30 24.80
## 3 C 28.10 34.40 35.80 35.30 37.50 38.90
## 4 D 18.00 19.20 20.50 20.60 21.60 24.50
## 5 E 13.10 14.60 15.30 15.60 16.50 19.20
## 6 F 17.70 19.80 21.20 21.30 22.80 25.40
# This script works the same with an unbalanced design:
if (FALSE) {
dat <- dat[-c(1, 40), ]
}
#### Use gls() to setup a model using the 6-level factor as a predictor.
glsModel1 <- gls(y ~ site, data = dat)
anova(glsModel1) # There is clearly evidence that means by site differ in some capacity.
## Denom. DF: 114
## numDF F-value p-value
## (Intercept) 1 11461 <.0001
## site 5 306 <.0001
summary(glsModel1) # The p-values are all 'significant', but that is misleading. See below.
## Generalized least squares fit by REML
## Model: y ~ site
## Data: dat
## AIC BIC logLik
## 524.6 543.8 -255.3
##
## Coefficients:
## Value Std.Error t-value p-value
## (Intercept) 10.887 0.4696 23.19 0
## siteB 8.514 0.6641 12.82 0
## siteC 24.431 0.6641 36.79 0
## siteD 9.740 0.6641 14.67 0
## siteE 4.754 0.6641 7.16 0
## siteF 10.370 0.6641 15.62 0
##
## Correlation:
## (Intr) siteB siteC siteD siteE
## siteB -0.707
## siteC -0.707 0.500
## siteD -0.707 0.500 0.500
## siteE -0.707 0.500 0.500 0.500
## siteF -0.707 0.500 0.500 0.500 0.500
##
## Standardized residuals:
## Min Q1 Med Q3 Max
## -3.459501 -0.628343 -0.005658 0.606989 2.572364
##
## Residual standard error: 2.1
## Degrees of freedom: 120 total; 114 residual
# The estimates for each listed coefficient are from t-tests (really Wald
# tests) with the null hypothesis that the difference between the reference
# level mean (Intercept) and the other factor level mean is zero. Since site
# 'A' has a true mean of 10 and no other site has a true mean of 10, we
# expect these p-values to be 'significant'. But that does not mean that
# all the levels of the factor have different means. To check this, we need
# a multiple comparisons test that will test the pairwise differences. Doing
# this be re-ordering the factor level several times and re-running the
# gls() model will not inflate the standard errors, and this will bring the
# familywise confidence level down.
#### Note that if we re-order the factor levels, the p-values change. But the
#### standard errors don't account for the fact that we're potentially running
#### several tests. Post-hoc analysis like this can be dangerous.
dat$site <- relevel(dat$site, ref = "B")
glsModel2 <- gls(y ~ site, data = dat)
anova(glsModel2) # This is identical to the anova above with 'A' as the reference.
## Denom. DF: 114
## numDF F-value p-value
## (Intercept) 1 11461 <.0001
## site 5 306 <.0001
summary(glsModel2) # The coefficient for site 'D' is not significantly different from the Intercept
## Generalized least squares fit by REML
## Model: y ~ site
## Data: dat
## AIC BIC logLik
## 524.6 543.8 -255.3
##
## Coefficients:
## Value Std.Error t-value p-value
## (Intercept) 19.402 0.4696 41.32 0.0000
## siteA -8.514 0.6641 -12.82 0.0000
## siteC 15.916 0.6641 23.97 0.0000
## siteD 1.225 0.6641 1.84 0.0676
## siteE -3.761 0.6641 -5.66 0.0000
## siteF 1.856 0.6641 2.79 0.0061
##
## Correlation:
## (Intr) siteA siteC siteD siteE
## siteA -0.707
## siteC -0.707 0.500
## siteD -0.707 0.500 0.500
## siteE -0.707 0.500 0.500 0.500
## siteF -0.707 0.500 0.500 0.500 0.500
##
## Standardized residuals:
## Min Q1 Med Q3 Max
## -3.459501 -0.628343 -0.005658 0.606989 2.572364
##
## Residual standard error: 2.1
## Degrees of freedom: 120 total; 114 residual
#### Multiple comparisons procedures
glht(glsModel1, linfct = mcp(site = "Tukey")) # Doesn't work, as you know.
## Error: no 'model.matrix' method for 'model' found!
## Run the following lines. These introduce methods for 'gls' objects.
model.matrix.gls <- function(object, ...) {
model.matrix(terms(object), data = getData(object), ...)
}
model.frame.gls <- function(object, ...) {
model.frame(formula(object), data = getData(object), ...)
}
terms.gls <- function(object, ...) {
terms(model.frame(object), ...)
}
# The line below gives pairwise comparisons now. Note that the above
# performs t-tests for all pairwise differences.
multCompTukey <- glht(glsModel1, linfct = mcp(site = "Tukey"))