# ************************************************************************
# MultipleComparisonsWithGLS.R
# ************************************************************************
# Brief description: This script explores multiple comparisons using a
# generalized least squares regression model with a multi-level factor
# variable.  Packages required: nlme, multcomp, MASS Keywords (less than
# 10): Multiple comparisons, gls, regression, factor, multcomp, generalized
# least squares EPA Contact Information Name: Phone Number: E-mail Address:
# Is data available upon request? Yes/No Date script was written: Feb. 13,
# 2014


#### Clean up workspace
rm(list = ls())

#### Include packages.
library(nlme)
library(multcomp)

## Warning: package 'multcomp' was built under R version 3.0.2

## Loading required package: mvtnorm
## Loading required package: TH.data

## Warning: package 'TH.data' was built under R version 3.0.2

library(MASS)


#### Setup some fake data.
set.seed(1111)
site <- rep(c("A", "B", "C", "D", "E", "F"), each = 20)  # Specify site levels
siteMns <- c(A = 10, B = 20, C = 35, D = 20, E = 15, F = 20)  # Declare true site means.
resp <- as.vector(mvrnorm(20, mu = siteMns, Sigma = diag(c(2, 4, 6, 4, 2, 4))))  # Grab random values by site.
error <- rnorm(length(site), 0, 0.5)  # Introduce some more error
dat <- data.frame(y = resp + error, site = as.factor(site))  # Make a data frame
aggregate(y ~ site, dat, summary)

##   site y.Min. y.1st Qu. y.Median y.Mean y.3rd Qu. y.Max.
## 1    A   7.79      9.80    10.90  10.90     11.70  13.90
## 2    B  16.10     18.10    19.20  19.40     20.30  24.80
## 3    C  28.10     34.40    35.80  35.30     37.50  38.90
## 4    D  18.00     19.20    20.50  20.60     21.60  24.50
## 5    E  13.10     14.60    15.30  15.60     16.50  19.20
## 6    F  17.70     19.80    21.20  21.30     22.80  25.40


# This script works the same with an unbalanced design:
if (FALSE) {
    dat <- dat[-c(1, 40), ]
}

#### Use gls() to setup a model using the 6-level factor as a predictor.
glsModel1 <- gls(y ~ site, data = dat)
anova(glsModel1)  # There is clearly evidence that means by site differ in some capacity.

## Denom. DF: 114 
##             numDF F-value p-value
## (Intercept)     1   11461  <.0001
## site            5     306  <.0001

summary(glsModel1)  # The p-values are all 'significant', but that is misleading. See below.

## Generalized least squares fit by REML
##   Model: y ~ site 
##   Data: dat 
##     AIC   BIC logLik
##   524.6 543.8 -255.3
## 
## Coefficients:
##              Value Std.Error t-value p-value
## (Intercept) 10.887    0.4696   23.19       0
## siteB        8.514    0.6641   12.82       0
## siteC       24.431    0.6641   36.79       0
## siteD        9.740    0.6641   14.67       0
## siteE        4.754    0.6641    7.16       0
## siteF       10.370    0.6641   15.62       0
## 
##  Correlation: 
##       (Intr) siteB  siteC  siteD  siteE 
## siteB -0.707                            
## siteC -0.707  0.500                     
## siteD -0.707  0.500  0.500              
## siteE -0.707  0.500  0.500  0.500       
## siteF -0.707  0.500  0.500  0.500  0.500
## 
## Standardized residuals:
##       Min        Q1       Med        Q3       Max 
## -3.459501 -0.628343 -0.005658  0.606989  2.572364 
## 
## Residual standard error: 2.1 
## Degrees of freedom: 120 total; 114 residual


# The estimates for each listed coefficient are from t-tests (really Wald
# tests) with the null hypothesis that the difference between the reference
# level mean (Intercept) and the other factor level mean is zero. Since site
# 'A' has a true mean of 10 and no other site has a true mean of 10, we
# expect these p-values to be 'significant'.  But that does not mean that
# all the levels of the factor have different means. To check this, we need
# a multiple comparisons test that will test the pairwise differences. Doing
# this be re-ordering the factor level several times and re-running the
# gls() model will not inflate the standard errors, and this will bring the
# familywise confidence level down.

#### Note that if we re-order the factor levels, the p-values change. But the
#### standard errors don't account for the fact that we're potentially running
#### several tests. Post-hoc analysis like this can be dangerous.
dat$site <- relevel(dat$site, ref = "B")
glsModel2 <- gls(y ~ site, data = dat)
anova(glsModel2)  # This is identical to the anova above with 'A' as the reference.

## Denom. DF: 114 
##             numDF F-value p-value
## (Intercept)     1   11461  <.0001
## site            5     306  <.0001

summary(glsModel2)  # The coefficient for site 'D' is not significantly different from the Intercept

## Generalized least squares fit by REML
##   Model: y ~ site 
##   Data: dat 
##     AIC   BIC logLik
##   524.6 543.8 -255.3
## 
## Coefficients:
##              Value Std.Error t-value p-value
## (Intercept) 19.402    0.4696   41.32  0.0000
## siteA       -8.514    0.6641  -12.82  0.0000
## siteC       15.916    0.6641   23.97  0.0000
## siteD        1.225    0.6641    1.84  0.0676
## siteE       -3.761    0.6641   -5.66  0.0000
## siteF        1.856    0.6641    2.79  0.0061
## 
##  Correlation: 
##       (Intr) siteA  siteC  siteD  siteE 
## siteA -0.707                            
## siteC -0.707  0.500                     
## siteD -0.707  0.500  0.500              
## siteE -0.707  0.500  0.500  0.500       
## siteF -0.707  0.500  0.500  0.500  0.500
## 
## Standardized residuals:
##       Min        Q1       Med        Q3       Max 
## -3.459501 -0.628343 -0.005658  0.606989  2.572364 
## 
## Residual standard error: 2.1 
## Degrees of freedom: 120 total; 114 residual



#### Multiple comparisons procedures
glht(glsModel1, linfct = mcp(site = "Tukey"))  # Doesn't work, as you know.

## Error: no 'model.matrix' method for 'model' found!

## Run the following lines. These introduce methods for 'gls' objects.
model.matrix.gls <- function(object, ...) {
    model.matrix(terms(object), data = getData(object), ...)
}
model.frame.gls <- function(object, ...) {
    model.frame(formula(object), data = getData(object), ...)
}
terms.gls <- function(object, ...) {
    terms(model.frame(object), ...)
}
# The line below gives pairwise comparisons now.  Note that the above
# performs t-tests for all pairwise differences.
multCompTukey <- glht(glsModel1, linfct = mcp(site = "Tukey"))