Module 5

Leo_Ohyama

Last Updated 11 September, 2022

5A. ANCOVAs in R

We will need the following libraries:

library(tidyverse)   
library(car) ### helpful for analyzing linear models
library(emmeans) ### helpful for getting means from linear models

Here we assemble data with a covariate (don’t worry about the code):

set.seed(17)
data("InsectSprays")
d <- InsectSprays %>% filter(spray=='A'|spray=='B'|spray=='C'|spray=='F') %>%
  droplevels()
d$count[13:24] <- d$count[13:24]+5
d$weeds <- abs(round(rnorm(48,2*d$count,10),1))
d$weeds[25:36] <- c(55.3,46.8,30.2,62.3,24.2,33.2,18.2,12.6,39.7,41.0,46.9,42.8)

Let’s plot the raw data:

ggplot(d, aes(x=spray,y=count)) +
  geom_boxplot(outlier.shape = NA) +
  geom_jitter(height=0,width=.1)

Let’s run a model with counts as a function of spray (an ANOVA) and get the pairwise means:

anova_means <- emmeans(lm(count~spray, data=d), pairwise~spray) 
anova_means

## $emmeans
##  spray emmean   SE df lower.CL upper.CL
##  A      14.50 1.32 44   11.849    17.15
##  B      20.33 1.32 44   17.683    22.98
##  C       2.08 1.32 44   -0.567     4.73
##  F      16.67 1.32 44   14.016    19.32
## 
## Confidence level used: 0.95 
## 
## $contrasts
##  contrast estimate   SE df t.ratio p.value
##  A - B       -5.83 1.86 44  -3.136  0.0155
##  A - C       12.42 1.86 44   6.676  <.0001
##  A - F       -2.17 1.86 44  -1.165  0.6518
##  B - C       18.25 1.86 44   9.812  <.0001
##  B - F        3.67 1.86 44   1.971  0.2143
##  C - F      -14.58 1.86 44  -7.841  <.0001
## 
## P value adjustment: tukey method for comparing a family of 4 estimates

Now let’s plot out the data with weed cover:

ggplot(d, aes(x=weeds,y=count)) +
  geom_point() +
  facet_wrap(~spray) +
  geom_smooth(method='lm')

ANCOVA: multiple intercept model

Let’s run our ANCOVA model:

lm1i <- lm(count ~ spray + weeds, data=d)

Let’s get an ANOVA table with Type II sums of squares (aka Type III SS):

Anova(lm1i, type=2)

## Anova Table (Type II tests)
## 
## Response: count
##            Sum Sq Df F value    Pr(>F)    
## spray     2098.05  3  68.134 2.229e-16 ***
## weeds      471.88  1  45.973 2.679e-08 ***
## Residuals  441.37 43                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Don’t use the base anova! Switch terms around in the model and see what happens:

anova(lm1i)

## Analysis of Variance Table
## 
## Response: count
##           Df  Sum Sq Mean Sq F value    Pr(>F)    
## spray      3 2256.23  752.08  73.270 < 2.2e-16 ***
## weeds      1  471.88  471.88  45.973 2.679e-08 ***
## Residuals 43  441.37   10.26                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

summary(lm1i) ## model coefficients

## 
## Call:
## lm(formula = count ~ spray + weeds, data = d)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -6.030 -2.155 -0.057  1.744  6.526 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   7.69502    1.36479   5.638 1.23e-06 ***
## sprayB        2.88374    1.37840   2.092   0.0424 *  
## sprayC      -13.64507    1.32044 -10.334 3.16e-13 ***
## sprayF        1.48599    1.31180   1.133   0.2636    
## weeds         0.21271    0.03137   6.780 2.68e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.204 on 43 degrees of freedom
## Multiple R-squared:  0.8607, Adjusted R-squared:  0.8478 
## F-statistic: 66.45 on 4 and 43 DF,  p-value: < 2.2e-16

Here we can calculate estimated marginal mean for each group (ie. groups means after accounting for the effect of weeds). We can extract the emmean means (ie. group means after accounting for the effect of weeds):

ancova_means <- emmeans(lm1i, pairwise~spray)  
ancova_means

## $emmeans
##  spray emmean    SE df lower.CL upper.CL
##  A      15.71 0.942 43   13.815    17.61
##  B      18.60 0.960 43   16.663    20.53
##  C       2.07 0.925 43    0.204     3.93
##  F      17.20 0.928 43   15.329    19.07
## 
## Confidence level used: 0.95 
## 
## $contrasts
##  contrast estimate   SE df t.ratio p.value
##  A - B       -2.88 1.38 43  -2.092  0.1720
##  A - C       13.65 1.32 43  10.334  <.0001
##  A - F       -1.49 1.31 43  -1.133  0.6716
##  B - C       16.53 1.33 43  12.406  <.0001
##  B - F        1.40 1.35 43   1.035  0.7298
##  C - F      -15.13 1.31 43 -11.547  <.0001
## 
## P value adjustment: tukey method for comparing a family of 4 estimates

This code is just for adding features of the model to the graph:

lm1i_coef <- as.data.frame(emmeans(lm1i, ~spray))

Now let’s extract intercepts and add slopes into new dataframe:

lm1i_coef2 <- as.data.frame(emmeans(lm1i, ~spray, at=list(weeds=0)))
lm1i_coef2$slope <- coef(lm1i)[5]

We can now plot the data with the fitted model:

ggplot(data=d, aes(x=weeds,y=count)) +
  geom_point() +
  facet_wrap(~spray) + 
  geom_abline(data=lm1i_coef2,
              aes(intercept=emmean, slope=slope)) +
  geom_point(data=lm1i_coef2,
             aes(x=0,y=emmean),color="red") +
  geom_point(data=lm1i_coef,
             aes(x=mean(d$weeds),y=emmean),
             color="blue", size=2)

ANCOVA: multiple intercept AND slope model

This is how we do an ANCOVA with multiple intercept and slopes:

lm1is <- lm(count ~ spray + weeds + spray:weeds, data=d)

Another way to code the above would be:

lm(count ~ spray * weeds, data=d)

## 
## Call:
## lm(formula = count ~ spray * weeds, data = d)
## 
## Coefficients:
##  (Intercept)        sprayB        sprayC        sprayF         weeds  
##     5.619631      2.440328     -2.441189      0.613378      0.277584  
## sprayB:weeds  sprayC:weeds  sprayF:weeds  
##    -0.009947     -0.306581      0.018897

Let’s get an ANOVA table and a summary of the model:

Anova(lm1is, type=2)

## Anova Table (Type II tests)
## 
## Response: count
##              Sum Sq Df  F value    Pr(>F)    
## spray       2098.05  3 108.0786 < 2.2e-16 ***
## weeds        471.88  1  72.9254 1.489e-10 ***
## spray:weeds  182.54  3   9.4033 7.898e-05 ***
## Residuals    258.83 40                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

summary(lm1is)

## 
## Call:
## lm(formula = count ~ spray + weeds + spray:weeds, data = d)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -5.780 -1.323 -0.246  1.230  5.843 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   5.619631   1.837847   3.058 0.003965 ** 
## sprayB        2.440328   3.408876   0.716 0.478227    
## sprayC       -2.441189   2.788364  -0.875 0.386534    
## sprayF        0.613378   2.439710   0.251 0.802781    
## weeds         0.277584   0.052663   5.271 4.98e-06 ***
## sprayB:weeds -0.009947   0.080228  -0.124 0.901948    
## sprayC:weeds -0.306581   0.074015  -4.142 0.000173 ***
## sprayF:weeds  0.018897   0.066459   0.284 0.777614    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.544 on 40 degrees of freedom
## Multiple R-squared:  0.9183, Adjusted R-squared:  0.904 
## F-statistic: 64.26 on 7 and 40 DF,  p-value: < 2.2e-16

Here we calculate estimated marginal mean for each group (ie. groups means after accounting for the effect of weeds):

ancova_is_means <- emmeans(lm1is, pairwise~spray)  
ancova_is_means

## $emmeans
##  spray emmean    SE df lower.CL upper.CL
##  A      16.09 0.794 40   14.481    17.69
##  B      18.15 0.885 40   16.362    19.94
##  C       2.09 0.734 40    0.601     3.57
##  F      17.41 0.741 40   15.913    18.91
## 
## Confidence level used: 0.95 
## 
## $contrasts
##  contrast estimate   SE df t.ratio p.value
##  A - B      -2.065 1.19 40  -1.738  0.3182
##  A - C      14.000 1.08 40  12.949  <.0001
##  A - F      -1.326 1.09 40  -1.221  0.6175
##  B - C      16.065 1.15 40  13.972  <.0001
##  B - F       0.739 1.15 40   0.641  0.9182
##  C - F     -15.326 1.04 40 -14.687  <.0001
## 
## P value adjustment: tukey method for comparing a family of 4 estimates

We can calculate the slope for each group:

ancova_is_slopes <- emtrends(lm1is, pairwise~spray, var="weeds")  
ancova_is_slopes

## $emtrends
##  spray weeds.trend     SE df lower.CL upper.CL
##  A           0.278 0.0527 40    0.171   0.3840
##  B           0.268 0.0605 40    0.145   0.3900
##  C          -0.029 0.0520 40   -0.134   0.0761
##  F           0.296 0.0405 40    0.215   0.3784
## 
## Confidence level used: 0.95 
## 
## $contrasts
##  contrast estimate     SE df t.ratio p.value
##  A - B     0.00995 0.0802 40   0.124  0.9993
##  A - C     0.30658 0.0740 40   4.142  0.0010
##  A - F    -0.01890 0.0665 40  -0.284  0.9919
##  B - C     0.29663 0.0798 40   3.717  0.0033
##  B - F    -0.02884 0.0728 40  -0.396  0.9787
##  C - F    -0.32548 0.0659 40  -4.936  0.0001
## 
## P value adjustment: tukey method for comparing a family of 4 estimates

We can extract the emmean means (ie. group means after accounting for the effect of weeds):

lm1is_coef <- as.data.frame(emmeans(lm1is, ~spray))

Also we can extract intercepts and add slopes into new dataframe:

lm1is_coef2a <- as.data.frame(emmeans(lm1is, ~spray, at=list(weeds=0)))
lm1is_coef2b <- as.data.frame(emtrends(lm1is, var="weeds"))
lm1is_coef2 <- full_join(lm1is_coef2a,lm1is_coef2b,by="spray")

Finally we can plot the data of the fitted model:

ggplot(data=d, aes(x=weeds,y=count)) +
  geom_point() +
  facet_wrap(~spray) + 
  geom_abline(data=lm1is_coef2, aes(intercept=emmean,
                                    slope=weeds.trend), lty=2) +
  geom_point(data=lm1is_coef2, 
             aes(x=0,y=emmean),
             color="orange") +
  geom_point(data=lm1is_coef,
             aes(x=mean(d$weeds),y=emmean),
             color="purple", size=2)

Here’s an alternative nice plot of the data with weed cover:

ggplot(d, aes(x=weeds,y=count)) +
  geom_point() + 
  facet_wrap(~spray) + 
  geom_smooth(method='lm', color='black')+
  theme_bw(base_size = 16)

5B. Block Designs

Data and Plotting

Let’s load the packages and data:

library(tidyverse)
library(car)
data("InsectSprays")

Now we can add our blocks to the data:

InsectSprays$block <- as.factor(rep(c(1:12), 6))
d<- InsectSprays %>%
  filter(spray=='A'|spray=='B'|spray=='C'|spray=='F')
glimpse(d)

## Rows: 48
## Columns: 3
## $ count <dbl> 10, 7, 20, 14, 14, 12, 10, 23, 17, 20, 14, 13, 11, 17, 21, 11, 1…
## $ spray <fct> A, A, A, A, A, A, A, A, A, A, A, A, B, B, B, B, B, B, B, B, B, B…
## $ block <fct> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6, 7, 8, 9…

We can graph the data by treatment group:

#plot data by treatment group
ggplot(d, aes(x=spray,y=count)) +
  geom_boxplot(outlier.shape = NA) +
  geom_jitter(height=0,width=.1)

A plot by treatment and block (note one observation per block):

ggplot(d, aes(x=spray,y=count)) +
  geom_boxplot(outlier.shape = NA) +
  geom_jitter(height=0,width=.1) +
  facet_wrap(~block)  #12 blocks

Models

This is a no block model:

lm1 <- lm(count~spray, data=d)
anova(lm1)

## Analysis of Variance Table
## 
## Response: count
##           Df  Sum Sq Mean Sq F value    Pr(>F)    
## spray      3 1648.73  549.58  26.478 6.091e-10 ***
## Residuals 44  913.25   20.76                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

This is a block model (the block as a fixed effect).

lm2 <- lm(count~spray+block, data=d)
anova(lm2)

## Analysis of Variance Table
## 
## Response: count
##           Df  Sum Sq Mean Sq F value    Pr(>F)    
## spray      3 1648.73  549.58 32.5298 5.685e-10 ***
## block     11  355.73   32.34  1.9142   0.07378 .  
## Residuals 33  557.52   16.89                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Blocks as Random Effects

We need to load additional packages for this section

library(lme4)
library(lmerTest)

Here we set up a linear mixed effect model with a block as a random effect (random intercept):

# block as random effect
lm3 <- lmer(count~spray+(1|block), data=d)
anova(lm3)

## Type III Analysis of Variance Table with Satterthwaite's method
##       Sum Sq Mean Sq NumDF DenDF F value    Pr(>F)    
## spray 1648.7  549.58     3    33   32.53 5.685e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Now let’s compare the model for blocks as fixed vs. random effects:

# compare summary() for fixed vs. random blocking effect
summary(lm2)

## 
## Call:
## lm(formula = count ~ spray + block, data = d)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.6875 -1.6250 -0.1458  1.6458  7.9792 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  10.3542     2.2977   4.506 7.84e-05 ***
## sprayB        0.8333     1.6780   0.497  0.62275    
## sprayC      -12.4167     1.6780  -7.400 1.68e-08 ***
## sprayF        2.1667     1.6780   1.291  0.20561    
## block2        0.5000     2.9064   0.172  0.86446    
## block3        7.7500     2.9064   2.667  0.01178 *  
## block4        4.2500     2.9064   1.462  0.15312    
## block5        4.0000     2.9064   1.376  0.17801    
## block6        2.7500     2.9064   0.946  0.35093    
## block7        2.5000     2.9064   0.860  0.39590    
## block8        4.7500     2.9064   1.634  0.11170    
## block9        8.2500     2.9064   2.839  0.00770 ** 
## block10       8.7500     2.9064   3.011  0.00497 ** 
## block11       3.5000     2.9064   1.204  0.23707    
## block12       2.7500     2.9064   0.946  0.35093    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.11 on 33 degrees of freedom
## Multiple R-squared:  0.7824, Adjusted R-squared:  0.6901 
## F-statistic: 8.475 on 14 and 33 DF,  p-value: 2.595e-07

summary(lm3)

## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: count ~ spray + (1 | block)
##    Data: d
## 
## REML criterion at convergence: 266.3
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -1.95239 -0.58269 -0.07399  0.60466  1.99778 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  block    (Intercept)  3.861   1.965   
##  Residual             16.895   4.110   
## Number of obs: 48, groups:  block, 12
## 
## Fixed effects:
##             Estimate Std. Error       df t value Pr(>|t|)    
## (Intercept)  14.5000     1.3152  39.8617  11.025 1.13e-13 ***
## sprayB        0.8333     1.6780  33.0000   0.497    0.623    
## sprayC      -12.4167     1.6780  33.0000  -7.400 1.68e-08 ***
## sprayF        2.1667     1.6780  33.0000   1.291    0.206    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##        (Intr) sprayB sprayC
## sprayB -0.638              
## sprayC -0.638  0.500       
## sprayF -0.638  0.500  0.500

coef(lm3)    ## prints model coefficients

## $block
##    (Intercept)    sprayB    sprayC   sprayF
## 1     12.52004 0.8333333 -12.41667 2.166667
## 2     12.75883 0.8333333 -12.41667 2.166667
## 3     16.22128 0.8333333 -12.41667 2.166667
## 4     14.54975 0.8333333 -12.41667 2.166667
## 5     14.43035 0.8333333 -12.41667 2.166667
## 6     13.83338 0.8333333 -12.41667 2.166667
## 7     13.71398 0.8333333 -12.41667 2.166667
## 8     14.78854 0.8333333 -12.41667 2.166667
## 9     16.46007 0.8333333 -12.41667 2.166667
## 10    16.69885 0.8333333 -12.41667 2.166667
## 11    14.19156 0.8333333 -12.41667 2.166667
## 12    13.83338 0.8333333 -12.41667 2.166667
## 
## attr(,"class")
## [1] "coef.mer"

We can also print out the variance components from the model:

print(VarCorr(lm3), comp=c("Variance"))  ## print variance components from model

##  Groups   Name        Variance
##  block    (Intercept)  3.8611 
##  Residual             16.8946

Let’s check residuals:

plot(lm1$residuals~lm1$fitted.values) # check residuals of no block and block models

plot(resid(lm3)~fitted(lm3))

hist(lm1$residuals)

hist(resid(lm3))

Finally, we can compare estimated marginal means:

emmeans(lm2, pairwise~spray) ## fixed effect block

## $emmeans
##  spray emmean   SE df lower.CL upper.CL
##  A      14.50 1.19 33   12.086     16.9
##  B      15.33 1.19 33   12.919     17.7
##  C       2.08 1.19 33   -0.331      4.5
##  F      16.67 1.19 33   14.253     19.1
## 
## Results are averaged over the levels of: block 
## Confidence level used: 0.95 
## 
## $contrasts
##  contrast estimate   SE df t.ratio p.value
##  A - B      -0.833 1.68 33  -0.497  0.9593
##  A - C      12.417 1.68 33   7.400  <.0001
##  A - F      -2.167 1.68 33  -1.291  0.5749
##  B - C      13.250 1.68 33   7.896  <.0001
##  B - F      -1.333 1.68 33  -0.795  0.8564
##  C - F     -14.583 1.68 33  -8.691  <.0001
## 
## Results are averaged over the levels of: block 
## P value adjustment: tukey method for comparing a family of 4 estimates

emmeans(lm3, pairwise~spray) ## random effect block

## $emmeans
##  spray emmean   SE   df lower.CL upper.CL
##  A      14.50 1.32 39.9   11.842    17.16
##  B      15.33 1.32 39.9   12.675    17.99
##  C       2.08 1.32 39.9   -0.575     4.74
##  F      16.67 1.32 39.9   14.008    19.32
## 
## Degrees-of-freedom method: kenward-roger 
## Confidence level used: 0.95 
## 
## $contrasts
##  contrast estimate   SE df t.ratio p.value
##  A - B      -0.833 1.68 33  -0.497  0.9593
##  A - C      12.417 1.68 33   7.400  <.0001
##  A - F      -2.167 1.68 33  -1.291  0.5749
##  B - C      13.250 1.68 33   7.896  <.0001
##  B - F      -1.333 1.68 33  -0.795  0.8564
##  C - F     -14.583 1.68 33  -8.691  <.0001
## 
## Degrees-of-freedom method: kenward-roger 
## P value adjustment: tukey method for comparing a family of 4 estimates

5C. Variance Components

In this section we will cover how to calculate variance components from correlation coefficicents. First we need to load the necessary packages:

library(tidyverse)
library(car)
library(lme4)
library(lmerTest)
library(emmeans)
library(patchwork)
library(MuMIn)

R² m & R² c example

Let’s make up some data as an example:

set.seed(10)
fakedata <- data.frame(Site=factor(40), ID=factor(40), temp=double(40), size=double(40), stringsAsFactors = F)
fakedata$Site <- rep(1:8, each=5)
fakedata$ID <- rep(1:5, times=8)
fakedata$temp <- rep(c(10,18,12,15,8,11,10,16), each=5)
fakedata$size <- round(rnorm(40, (2*fakedata$temp), 8), 1)

The fake data above represents measurements of body size for 5 insects at 8 sites, so 5 sub-replicates per site:

head(fakedata)

##   Site ID temp size
## 1    1  1   10 20.1
## 2    1  2   10 18.5
## 3    1  3   10  9.0
## 4    1  4   10 15.2
## 5    1  5   10 22.4
## 6    2  1   18 39.1

hist(fakedata$size)

Let’s make plot of fake data:

ggplot(fakedata, aes(x=temp, y=size)) +
  geom_point() +
  geom_smooth(method="lm") +
  theme_bw(base_size=16)

Now let’s calculate the R² for a linear model and a linear mixed model:

lm1 <- lm(size ~ temp + Site, data=fakedata)
summary(lm1) # look at R2

## 
## Call:
## lm(formula = size ~ temp + Site, data = fakedata)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -12.9260  -5.2771   0.3738   4.2517  13.0510 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -3.2989     5.1305  -0.643    0.524    
## temp          2.2225     0.3475   6.395 1.84e-07 ***
## Site         -0.6110     0.4915  -1.243    0.222    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.106 on 37 degrees of freedom
## Multiple R-squared:  0.5415, Adjusted R-squared:  0.5168 
## F-statistic: 21.85 on 2 and 37 DF,  p-value: 5.419e-07

lmm1 <- lmer(size ~ temp + (1|Site), data=fakedata)
summary(lmm1)

## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: size ~ temp + (1 | Site)
##    Data: fakedata
## 
## REML criterion at convergence: 262.1
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.8504 -0.8172  0.2309  0.6294  1.8574 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  Site     (Intercept) 18.24    4.271   
##  Residual             36.82    6.068   
## Number of obs: 40, groups:  Site, 8
## 
## Fixed effects:
##             Estimate Std. Error      df t value Pr(>|t|)   
## (Intercept)  -6.4118     7.1295  6.0000  -0.899  0.40312   
## temp          2.2515     0.5521  6.0000   4.078  0.00652 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##      (Intr)
## temp -0.968

Notice for the mixed effect model we don’t see an R² but we can use the MuMIn package to calculate R² for mixed models:

library(MuMIn)
r.squaredGLMM(lmm1)

##            R2m       R2c
## [1,] 0.4978533 0.6641872

Now let’s load in the ChickWeight dataset. It contains weight (g) of small chickens grown on four different diets. Chickens were weighed every few days for 21 days:

data("ChickWeight")
head(ChickWeight)

## Grouped Data: weight ~ Time | Chick
##   weight Time Chick Diet
## 1     42    0     1    1
## 2     51    2     1    1
## 3     59    4     1    1
## 4     64    6     1    1
## 5     76    8     1    1
## 6     93   10     1    1

ChickWeight$Diet <- as.factor(ChickWeight$Diet)

Let’s plot out the data:

ggplot(ChickWeight,aes(x=Time,y=weight)) +
  geom_point() +
  facet_wrap(~Diet) +
  geom_smooth(method="lm")

Let’s run a regular linear model ignoring chick:

cw0 <- lm(weight ~ Time * Diet , data=ChickWeight)
Anova(cw0)

## Anova Table (Type II tests)
## 
## Response: weight
##            Sum Sq  Df  F value    Pr(>F)    
## Time      2016357   1 1737.367 < 2.2e-16 ***
## Diet       129876   3   37.302 < 2.2e-16 ***
## Time:Diet   80804   3   23.208 3.474e-14 ***
## Residuals  661532 570                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Let’s examine means at time 20:

emmeans(cw0, pairwise~Diet, at=list(Time=20)) # All contrasts are significant

## $emmeans
##  Diet emmean   SE  df lower.CL upper.CL
##  1       168 3.97 570      160      176
##  2       201 5.20 570      191      211
##  3       247 5.20 570      236      257
##  4       225 5.34 570      215      236
## 
## Confidence level used: 0.95 
## 
## $contrasts
##  contrast estimate   SE  df t.ratio p.value
##  1 - 2       -33.0 6.55 570  -5.049  <.0001
##  1 - 3       -78.9 6.55 570 -12.059  <.0001
##  1 - 4       -57.3 6.65 570  -8.613  <.0001
##  2 - 3       -45.9 7.36 570  -6.239  <.0001
##  2 - 4       -24.3 7.45 570  -3.256  0.0066
##  3 - 4        21.6 7.45 570   2.902  0.0200
## 
## P value adjustment: tukey method for comparing a family of 4 estimates

Now let’s construct a new model with chick as a fixed effect:

cw1a <- lm(weight ~ Time * Diet + Chick , data=ChickWeight)
Anova(cw1a)

## Anova Table (Type II tests)
## 
## Response: weight
##            Sum Sq  Df  F value    Pr(>F)    
## Time      1962914   1 3049.280 < 2.2e-16 ***
## Diet                0                       
## Chick      324217  46   10.949 < 2.2e-16 ***
## Time:Diet   84222   3   43.612 < 2.2e-16 ***
## Residuals  337315 524                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

summary(cw1a) ## look for R2

## 
## Call:
## lm(formula = weight ~ Time * Diet + Chick, data = ChickWeight)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -79.06 -14.78  -1.71  14.71  87.54 
## 
## Coefficients: (3 not defined because of singularities)
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  196.2618    51.8763   3.783 0.000173 ***
## Time           6.6906     0.2598  25.752  < 2e-16 ***
## Diet2       -133.6649    31.3708  -4.261 2.42e-05 ***
## Diet3         93.6591   139.0537   0.674 0.500897    
## Diet4       -800.0806   334.8490  -2.389 0.017230 *  
## Chick.L     1502.0066   565.9956   2.654 0.008202 ** 
## Chick.Q     1070.1487   582.0340   1.839 0.066534 .  
## Chick.C      673.7440   380.4009   1.771 0.077118 .  
## Chick^4       43.8751    71.2020   0.616 0.538027    
## Chick^5     -618.1610   316.5601  -1.953 0.051382 .  
## Chick^6     -595.7233   294.8603  -2.020 0.043855 *  
## Chick^7      -39.1387    19.3620  -2.021 0.043744 *  
## Chick^8      451.8879   205.2816   2.201 0.028149 *  
## Chick^9      364.8077   185.9140   1.962 0.050264 .  
## Chick^10      16.9199    46.2525   0.366 0.714651    
## Chick^11    -178.3476    85.9134  -2.076 0.038390 *  
## Chick^12    -177.5330   104.0664  -1.706 0.088608 .  
## Chick^13    -116.1828    68.8889  -1.687 0.092290 .  
## Chick^14     -20.6745    15.0876  -1.370 0.171183    
## Chick^15      89.9792    52.9947   1.698 0.090122 .  
## Chick^16     163.1244    92.6556   1.761 0.078899 .  
## Chick^17     146.0519    75.7630   1.928 0.054427 .  
## Chick^18       6.7008    19.6523   0.341 0.733263    
## Chick^19    -200.1902    98.0010  -2.043 0.041579 *  
## Chick^20    -200.9099    99.8239  -2.013 0.044663 *  
## Chick^21     -20.5819    15.6958  -1.311 0.190331    
## Chick^22     145.1498    71.3571   2.034 0.042441 *  
## Chick^23     156.5123    79.4184   1.971 0.049280 *  
## Chick^24      61.5086    40.1348   1.533 0.125990    
## Chick^25     -36.2348    12.9476  -2.799 0.005322 ** 
## Chick^26     -36.5756    35.5035  -1.030 0.303392    
## Chick^27     -82.5707    53.6074  -1.540 0.124094    
## Chick^28     -98.9499    52.8243  -1.873 0.061598 .  
## Chick^29     -24.8503    16.8082  -1.478 0.139885    
## Chick^30      78.3097    44.6066   1.756 0.079747 .  
## Chick^31     156.3761    81.8686   1.910 0.056668 .  
## Chick^32      12.0154     8.4130   1.428 0.153834    
## Chick^33      60.8395    28.1312   2.163 0.031015 *  
## Chick^34      87.6693    42.1235   2.081 0.037896 *  
## Chick^35      17.7932    10.7078   1.662 0.097169 .  
## Chick^36      42.9840    27.0162   1.591 0.112203    
## Chick^37     -27.0017    17.8036  -1.517 0.129959    
## Chick^38      96.3419    53.3925   1.804 0.071741 .  
## Chick^39    -189.9708    97.3521  -1.951 0.051545 .  
## Chick^40     -95.4898    62.2727  -1.533 0.125777    
## Chick^41      92.7144    52.0812   1.780 0.075624 .  
## Chick^42     -52.6440    19.9868  -2.634 0.008690 ** 
## Chick^43     -96.4083    34.9387  -2.759 0.005994 ** 
## Chick^44      60.4535    26.1515   2.312 0.021183 *  
## Chick^45    -102.5662    47.4313  -2.162 0.031038 *  
## Chick^46     -13.8246    14.5963  -0.947 0.344009    
## Chick^47           NA         NA      NA       NA    
## Chick^48           NA         NA      NA       NA    
## Chick^49           NA         NA      NA       NA    
## Time:Diet2     1.9185     0.4294   4.468 9.67e-06 ***
## Time:Diet3     4.7322     0.4294  11.022  < 2e-16 ***
## Time:Diet4     2.9653     0.4350   6.817 2.57e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 25.37 on 524 degrees of freedom
## Multiple R-squared:  0.8843, Adjusted R-squared:  0.8726 
## F-statistic: 75.54 on 53 and 524 DF,  p-value: < 2.2e-16

Now let’s again examine means at time 20:

emmeans(cw1a, pairwise~Diet, at=list(Time=20)) # Contrasts similar to above

## $emmeans
##  Diet emmean   SE  df lower.CL upper.CL
##  1       165 3.22 524      159      172
##  2       201 3.87 524      193      208
##  3       247 3.87 524      239      254
##  4       224 3.99 524      216      232
## 
## Results are averaged over the levels of: Chick 
## Confidence level used: 0.95 
## 
## $contrasts
##  contrast estimate   SE  df t.ratio p.value
##  1 - 2       -35.4 5.04 524  -7.030  <.0001
##  1 - 3       -81.3 5.04 524 -16.140  <.0001
##  1 - 4       -58.9 5.13 524 -11.475  <.0001
##  2 - 3       -45.9 5.48 524  -8.377  <.0001
##  2 - 4       -23.5 5.56 524  -4.216  0.0002
##  3 - 4        22.4 5.56 524   4.033  0.0004
## 
## Results are averaged over the levels of: Chick 
## P value adjustment: tukey method for comparing a family of 4 estimates

Finally, let’s use chick as a random (block) effect:

cw1 <- lmer(weight ~ Time * Diet + (1|Chick), data=ChickWeight)
summary(cw1) ## look for variance component. Where is R2 ???

## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: weight ~ Time * Diet + (1 | Chick)
##    Data: ChickWeight
## 
## REML criterion at convergence: 5466.9
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.3158 -0.5900 -0.0693  0.5361  3.6024 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  Chick    (Intercept) 545.7    23.36   
##  Residual             643.3    25.36   
## Number of obs: 578, groups:  Chick, 50
## 
## Fixed effects:
##             Estimate Std. Error       df t value Pr(>|t|)    
## (Intercept)  31.5143     6.1163  70.7030   5.152 2.23e-06 ***
## Time          6.7115     0.2584 532.8900  25.976  < 2e-16 ***
## Diet2        -2.8807    10.5479  69.6438  -0.273    0.786    
## Diet3       -13.2640    10.5479  69.6438  -1.258    0.213    
## Diet4        -0.4016    10.5565  69.8601  -0.038    0.970    
## Time:Diet2    1.8977     0.4284 527.6886   4.430 1.15e-05 ***
## Time:Diet3    4.7114     0.4284 527.6886  10.998  < 2e-16 ***
## Time:Diet4    2.9506     0.4340 528.0372   6.799 2.86e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##            (Intr) Time   Diet2  Diet3  Diet4  Tm:Dt2 Tm:Dt3
## Time       -0.426                                          
## Diet2      -0.580  0.247                                   
## Diet3      -0.580  0.247  0.336                            
## Diet4      -0.579  0.247  0.336  0.336                     
## Time:Diet2  0.257 -0.603 -0.431 -0.149 -0.149              
## Time:Diet3  0.257 -0.603 -0.149 -0.431 -0.149  0.364       
## Time:Diet4  0.254 -0.595 -0.147 -0.147 -0.432  0.359  0.359

We can use the VarCorr() to print off variance component:

### Print off only variance component
cvar <- VarCorr(cw1)
print(cvar, comp=c("Variance","Std.Dev."))

##  Groups   Name        Variance Std.Dev.
##  Chick    (Intercept) 545.72   23.361  
##  Residual             643.31   25.364

Let’s print off anova tables:

anova(cw1) ## anova() from lmerTest package is best when using lmer()

## Type III Analysis of Variance Table with Satterthwaite's method
##            Sum Sq Mean Sq NumDF  DenDF   F value Pr(>F)    
## Time      2021052 2021052     1 526.15 3141.6571 <2e-16 ***
## Diet         1127     376     3  69.52    0.5838 0.6277    
## Time:Diet   83885   27962     3 526.53   43.4657 <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Anova(cw1) ## Anova() from car package. Diet p-value is low, but note it uses Wald Chi-sq test. lmerTest::anova() is more appropriate in this case.

## Analysis of Deviance Table (Type II Wald chisquare tests)
## 
## Response: weight
##              Chisq Df Pr(>Chisq)    
## Time      3064.408  1  < 2.2e-16 ***
## Diet        18.601  3  0.0003306 ***
## Time:Diet  130.397  3  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Now let’s examine model residuals:

plot(resid(cw1)~fitted(cw1)) ## residuals not great, but we'll return to this later. For now we will proceed with caution.

Also let’s examine using emmeans and contrasts:

emmeans(cw1, pairwise~Diet, at=list(Time=20)) # 4 of 6 differ at time 20

## $emmeans
##  Diet emmean   SE   df lower.CL upper.CL
##  1       166 6.10 67.7      154      178
##  2       201 8.34 60.8      184      217
##  3       247 8.34 60.8      230      263
##  4       224 8.40 62.4      208      241
## 
## Degrees-of-freedom method: kenward-roger 
## Confidence level used: 0.95 
## 
## $contrasts
##  contrast estimate   SE   df t.ratio p.value
##  1 - 2       -35.1 10.3 63.1  -3.394  0.0064
##  1 - 3       -81.0 10.3 63.1  -7.836  <.0001
##  1 - 4       -58.6 10.4 64.2  -5.648  <.0001
##  2 - 3       -45.9 11.8 60.8  -3.891  0.0014
##  2 - 4       -23.5 11.8 61.6  -1.989  0.2033
##  3 - 4        22.4 11.8 61.6   1.889  0.2434
## 
## Degrees-of-freedom method: kenward-roger 
## P value adjustment: tukey method for comparing a family of 4 estimates

We can use the MuMIn package to calculate R² for the mixed-model:

r.squaredGLMM(cw1)

##           R2m       R2c
## [1,] 0.765202 0.8729656

Followup Question

How much of the variance in weight is explained by Diet and Time? How much by Chick?
Harder question: calculate the R² c by hand based (based on lecture notes) on the R² m and variance components (just to check).

More variance components

Let’s calculate a rough approximation of the variance component. First we need to extract the random effect estimates.

randoms<-ranef(cw1, condVar = TRUE) ## extract the random effect estimates
rand.blups<-randoms$Chick 
head(rand.blups,15) ## print off the random effect estimates.

##    (Intercept)
## 18  -0.7712244
## 16 -18.8880797
## 15 -16.0104705
## 13 -33.6430174
## 9  -21.5023292
## 20 -24.0063461
## 10 -19.7571052
## 8   -5.9874470
## 17 -11.1827442
## 19 -16.4184160
## 4   -4.9606415
## 6    8.1664776
## 11  22.8870621
## 3   10.0634602
## 1    6.2694951

Above, the row labels are the chick number and the (intercept) values are the random effect estimates for each chick. Basically, each chick gets its own adjustment to the base intercept in the model. In this case the base intercept is 31.5143, so if you make the adjustments most of the individual chick intercepts are positive.

Let’s extract some values for plotting:

qq <- attr(ranef(cw1, condVar = TRUE)[[1]], "postVar") ## convert list to vector
sd(qq)  ## approximate Variance Component. Note this basic calculation usually biases the VC estimates downwards (they are too low)

## [1] 22.16376

var(qq)  ## approximate Variance Component.

## [1] 491.2321

cw1am <- emmeans(cw1a, ~Chick, at=list(Time=0)) %>%
  as.data.frame() %>%
  mutate(Chick=as.character(Chick)) # extract means for plotting later


df<-data.frame(Intercepts=randoms$Chick[,1],
               sd.interc=2*sqrt(qq[,,1:length(qq)]),
               Chick=rownames(rand.blups))
df$Chick<-factor(df$Chick,levels=df$Chick[order(cw1am$Chick)])
#df$Intercepts1 <- df$Intercepts+121.8183

df1 <- full_join(df,cw1am)

Now make a plot of random effect BLUPs (best linear unbiased predictor):

p1 <- ggplot(df1,aes(x=Chick, y=Intercepts, color=Diet)) +
  geom_hline(yintercept=0) +
  geom_errorbar(aes(ymin = Intercepts-sd.interc,
                    ymax=Intercepts+sd.interc), 
                width=0,color="black") + 
  geom_point(size=2) +
  xlab("Chick ID") +
  ylab("Intercepts") + 
  geom_hline(yintercept=-23.361, color="red") +
  geom_hline(yintercept=23.361, color="red") +
  ggtitle("Random effect BLUPs") + 
  coord_flip() +
  theme_bw() 
p1

We can also make plot of fixed-estimates intercepts for each chick from linear model (sometimes called BLUEs, best linear unbiased estimate):

p2 <- ggplot(df1,aes(x=Chick,y= emmean- 31.5143, color=Diet)) +
  geom_hline(yintercept=0) +
  geom_errorbar(aes(ymin=(lower.CL-31.5143), 
                    ymax=upper.CL-31.5143), 
                width=0,color="black") + 
  geom_point(size=2) + 
  xlab("Chick ID") + 
  ylab("Intercepts") + 
  ggtitle("Fixed effects EMMEANS") + 
  coord_flip() +
  theme_bw() +
  facet_wrap(~Diet)
p2

Now let’s plot histogram of the random intercepts. Thin red bars represent 1 SD of the variance component, which is similar (although not exactly the same) as 1 SD of the of the random intercepts:

ggplot(df1, aes(x=Intercepts)) +
  geom_histogram(color="grey", bins=7) + 
  geom_vline(xintercept = 0, color="red", lwd=3) +
  geom_vline(xintercept = 23.361, color="red", lwd=2) +
  geom_vline(xintercept = -23.361, color="red", lwd=2)

Heritability

Let’s load new packages and data for this section:

library(inti)
library(agridat)
dt <- john.alpha
?john.alpha  ## read about the design. Each genotype has 3 replicates planted in each block.

This design allows us to estimate variance due to genotype (ie. heritibility) and environment (ie. blocking effect) plus random variation in traits (ie. residual error).

Examine data. View and sort by genotypes to inspect more thoroughly:

head(dt)

##   plot rep block gen  yield row col
## 1    1  R1    B1 G11 4.1172   1   1
## 2    2  R1    B1 G04 4.4461   2   1
## 3    3  R1    B1 G05 5.8757   3   1
## 4    4  R1    B1 G22 4.5784   4   1
## 5    5  R1    B2 G21 4.6540   5   1
## 6    6  R1    B2 G10 4.1736   6   1

We can calculate heritability using the H2cal() function in inti package:

hr <- H2cal(data = dt
            , trait = "yield"
            , gen.name = "gen"
            , rep.n = 3
            , fixed.model = "(1|rep:block) + gen"
            , random.model = "(1|gen)+(1|rep:block) "
            , emmeans = TRUE
            , plot_diag = TRUE
            , outliers.rm = FALSE
)

hr$model %>% summary()

## Linear mixed model fit by REML ['lmerMod']
## Formula: yield ~ (1 | gen) + (1 | rep:block)
##    Data: dt.rm
## Weights: weights
## 
## REML criterion at convergence: 101.9
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -1.89350 -0.36226  0.09156  0.39282  2.59865 
## 
## Random effects:
##  Groups    Name        Variance Std.Dev.
##  gen       (Intercept) 0.14020  0.3744  
##  rep:block (Intercept) 0.15920  0.3990  
##  Residual              0.08098  0.2846  
## Number of obs: 72, groups:  gen, 24; rep:block, 18
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept)   4.4795     0.1257   35.62

hr$tabsmr

##   trait rep geno env year     mean       std      min      max       V.g
## 1 yield   3   24   1    1 4.479517 0.4102628 3.470785 5.091577 0.1402035
##          V.e       V.p repeatability      H2.s      H2.p      H2.c
## 1 0.08097805 0.1671962     0.8385569 0.8385569 0.7927501 0.7828254

hr$blups

## # A tibble: 24 × 2
##    gen   yield
##    <chr> <dbl>
##  1 G01    4.96
##  2 G02    4.48
##  3 G03    3.74
##  4 G04    4.50
##  5 G05    4.95
##  6 G06    4.49
##  7 G07    4.17
##  8 G08    4.59
##  9 G09    3.66
## 10 G10    4.39
## # … with 14 more rows

hr$blues

## # A tibble: 24 × 6
##    gen   yield    SE    df lower.CL upper.CL
##    <fct> <dbl> <dbl> <dbl>    <dbl>    <dbl>
##  1 G01    5.09 0.212  46.8     4.66     5.52
##  2 G02    4.47 0.212  46.8     4.05     4.90
##  3 G03    3.55 0.212  46.8     3.13     3.98
##  4 G04    4.51 0.212  46.8     4.09     4.94
##  5 G05    5.03 0.212  46.7     4.61     5.46
##  6 G06    4.48 0.212  46.7     4.06     4.91
##  7 G07    4.11 0.212  46.8     3.68     4.54
##  8 G08    4.59 0.212  46.8     4.17     5.02
##  9 G09    3.47 0.212  46.7     3.04     3.90
## 10 G10    4.37 0.212  46.8     3.94     4.79
## # … with 14 more rows

We can calculate heritability manually by fitting a model with lmer(). Use intercept only fixed-effects, block/rep or just block:rep as the blocking term and gen as a random effect:

h1 <- lmer(yield ~ 1  + (1|gen) + (1|rep:block), data=dt)
summary(h1)

## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: yield ~ 1 + (1 | gen) + (1 | rep:block)
##    Data: dt
## 
## REML criterion at convergence: 101.9
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -1.89350 -0.36226  0.09156  0.39282  2.59865 
## 
## Random effects:
##  Groups    Name        Variance Std.Dev.
##  gen       (Intercept) 0.14020  0.3744  
##  rep:block (Intercept) 0.15920  0.3990  
##  Residual              0.08098  0.2846  
## Number of obs: 72, groups:  gen, 24; rep:block, 18
## 
## Fixed effects:
##             Estimate Std. Error      df t value Pr(>|t|)    
## (Intercept)   4.4795     0.1257 29.9786   35.62   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Now we can extract variance components (VC) from model:

mod1v <- as.data.frame(VarCorr(h1))
Vg <- mod1v$vcov[1]   ## extract VC for gen (i.e. genetically based variation in yield)
Vp <- (mod1v$vcov[1]+(sum(mod1v$vcov[1:2])/6)+(mod1v$vcov[3]/(6+3))) ## sum up all variance components (i.e. total phenotypic variation), this formula is roughly from Schmidt et al. 2019 (see link at bottom)

h2 <- Vg/Vp
h2 ## broad sense heritability

## [1] 0.7041827

Note that the H2 estimate is different from inti package, and you need to read inti package and references to see different ways to calculate Vp.

Sources: https://cran.r-project.org/web/packages/inti/vignettes/heritability.html#ref-zystro2018Alternative

https://acsess.onlinelibrary.wiley.com/doi/full/10.2135/cropsci2018.06.0376

Side note: There is a massive literature on heritability, so proceed with caution. Lynch and Walsh 1998 is a great resource.

5D. Contrasts

For this section let’s load the packages we will need:

library(tidyverse)
library(lme4)
library(lmerTest)
library(emmeans)
library(MuMIn)
library(agridat) ## install and load package for datasets
library(multcomp) ## install and load package for multiple comparisons

Let’s load a rice experiment and process the data:

data("gomez.multilocsplitplot")
gomez.multilocsplitplot$nitro <- as.factor(gomez.multilocsplitplot$nitro)
gomez <- gomez.multilocsplitplot
head(gomez)

##   loc nitro rep gen yield
## 1  L1     0  R1  G1  1979
## 2  L1    30  R1  G1  4572
## 3  L1    60  R1  G1  5630
## 4  L1    90  R1  G1  7153
## 5  L1   120  R1  G1  7223
## 6  L1   150  R1  G1  7239

Let’s make a preliminary plot of the data:

ggplot(gomez, aes(x=gen, y=yield,
                  fill=nitro)) +
  geom_boxplot(outlier.shape = NA) +
  geom_point(position =position_jitterdodge(jitter.height=0,
                                    jitter.width=.1)) +
  facet_wrap(~loc)

Let’s average data by loc and nitro to account for pseudo-replication (n = 36 plots):

gomez_summarized <- gomez %>% group_by(loc,nitro,gen) %>% summarize(yield=mean(yield, na.rm=T))

Now let’s run a regular two-way anova using summarized dataset with no blocking:

mm0 <- lm(yield ~ gen*nitro, data=gomez_summarized)
anova(mm0)

## Analysis of Variance Table
## 
## Response: yield
##           Df   Sum Sq Mean Sq F value    Pr(>F)    
## gen        1  3111892 3111892  7.7656   0.01024 *  
## nitro      5 45616365 9123273 22.7669 2.166e-08 ***
## gen:nitro  5   381701   76340  0.1905   0.96329    
## Residuals 24  9617414  400726                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

We can also run a two-way anova with block as a random effect:

mm1 <- lmer(yield ~ gen*nitro+(1|loc), data=gomez_summarized)
anova(mm1)

## Type III Analysis of Variance Table with Satterthwaite's method
##             Sum Sq Mean Sq NumDF DenDF F value    Pr(>F)    
## gen        3111893 3111893     1    22  8.3998  0.008339 ** 
## nitro     45616365 9123273     5    22 24.6261 2.502e-08 ***
## gen:nitro   381701   76340     5    22  0.2061  0.956414    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Also we can run a two-way anova with block and nitro nested within block as random effects:

mm2 <- lmer(yield ~ gen*nitro+(1|loc/nitro), data=gomez_summarized)
anova(mm2)

## Type III Analysis of Variance Table with Satterthwaite's method
##             Sum Sq Mean Sq NumDF DenDF F value    Pr(>F)    
## gen        3111893 3111893     1    12  19.119 0.0009082 ***
## nitro     11981116 2396223     5    10  14.722 0.0002453 ***
## gen:nitro   381701   76340     5    12   0.469 0.7923429    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

summary(mm2)

## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: yield ~ gen * nitro + (1 | loc/nitro)
##    Data: gomez_summarized
## 
## REML criterion at convergence: 385.7
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -1.47833 -0.39994 -0.05636  0.45615  1.13929 
## 
## Random effects:
##  Groups    Name        Variance Std.Dev.
##  nitro:loc (Intercept) 228472   477.99  
##  loc       (Intercept)   9482    97.38  
##  Residual              162767   403.44  
## Number of obs: 36, groups:  nitro:loc, 18; loc, 3
## 
## Fixed effects:
##                Estimate Std. Error      df t value Pr(>|t|)    
## (Intercept)     3392.67     365.48   17.67   9.283 3.26e-08 ***
## genG2            305.89     329.41   12.00   0.929 0.371402    
## nitro30         1652.67     510.71   15.08   3.236 0.005510 ** 
## nitro60         2380.67     510.71   15.08   4.661 0.000303 ***
## nitro90         2994.11     510.71   15.08   5.863 3.06e-05 ***
## nitro120        3119.11     510.71   15.08   6.107 1.96e-05 ***
## nitro150        2794.89     510.71   15.08   5.473 6.31e-05 ***
## genG2:nitro30    465.00     465.86   12.00   0.998 0.337904    
## genG2:nitro60    275.33     465.86   12.00   0.591 0.565470    
## genG2:nitro90    531.67     465.86   12.00   1.141 0.276023    
## genG2:nitro120    24.89     465.86   12.00   0.053 0.958272    
## genG2:nitro150   395.89     465.86   12.00   0.850 0.412066    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) genG2  nitr30 nitr60 nitr90 ntr120 ntr150 gG2:30 gG2:60
## genG2       -0.451                                                        
## nitro30     -0.699  0.323                                                 
## nitro60     -0.699  0.323  0.500                                          
## nitro90     -0.699  0.323  0.500  0.500                                   
## nitro120    -0.699  0.323  0.500  0.500  0.500                            
## nitro150    -0.699  0.323  0.500  0.500  0.500  0.500                     
## genG2:ntr30  0.319 -0.707 -0.456 -0.228 -0.228 -0.228 -0.228              
## genG2:ntr60  0.319 -0.707 -0.228 -0.456 -0.228 -0.228 -0.228  0.500       
## genG2:ntr90  0.319 -0.707 -0.228 -0.228 -0.456 -0.228 -0.228  0.500  0.500
## gnG2:ntr120  0.319 -0.707 -0.228 -0.228 -0.228 -0.456 -0.228  0.500  0.500
## gnG2:ntr150  0.319 -0.707 -0.228 -0.228 -0.228 -0.228 -0.456  0.500  0.500
##             gG2:90 gG2:12
## genG2                    
## nitro30                  
## nitro60                  
## nitro90                  
## nitro120                 
## nitro150                 
## genG2:ntr30              
## genG2:ntr60              
## genG2:ntr90              
## gnG2:ntr120  0.500       
## gnG2:ntr150  0.500  0.500

We can use emmeans for just for nitro (let’s not worry about genotype):

emmeans(mm2, pairwise~nitro)

## $emmeans
##  nitro emmean  SE   df lower.CL upper.CL
##  0       3546 326 11.9     2834     4257
##  30      5431 326 11.9     4720     6142
##  60      6064 326 11.9     5353     6775
##  90      6806 326 11.9     6094     7517
##  120     6677 326 11.9     5966     7388
##  150     6538 326 11.9     5827     7250
## 
## Results are averaged over the levels of: gen 
## Degrees-of-freedom method: kenward-roger 
## Confidence level used: 0.95 
## 
## $contrasts
##  contrast  estimate  SE df t.ratio p.value
##  0 - 30       -1885 455 10  -4.148  0.0180
##  0 - 60       -2518 455 10  -5.541  0.0025
##  0 - 90       -3260 455 10  -7.173  0.0003
##  0 - 120      -3132 455 10  -6.890  0.0004
##  0 - 150      -2993 455 10  -6.585  0.0006
##  30 - 60       -633 455 10  -1.393  0.7306
##  30 - 90      -1375 455 10  -3.025  0.0985
##  30 - 120     -1246 455 10  -2.742  0.1495
##  30 - 150     -1108 455 10  -2.437  0.2303
##  60 - 90       -742 455 10  -1.632  0.5983
##  60 - 120      -613 455 10  -1.349  0.7539
##  60 - 150      -474 455 10  -1.044  0.8923
##  90 - 120       128 455 10   0.282  0.9997
##  90 - 150       267 455 10   0.588  0.9896
##  120 - 150      139 455 10   0.305  0.9995
## 
## Results are averaged over the levels of: gen 
## Degrees-of-freedom method: kenward-roger 
## P value adjustment: tukey method for comparing a family of 6 estimates

A two-way anova with block and nitro nested within block as random effects can be run where we fully embrace the nestedness and include plot as a random effect instead of averaging (n=108 data points, use them all!):

mm3 <- lmer(yield ~ gen*nitro+(1|loc/nitro/gen), data=gomez)
anova(mm3) ## identical to averaged model in mm2

## Type III Analysis of Variance Table with Satterthwaite's method
##             Sum Sq Mean Sq NumDF DenDF F value    Pr(>F)    
## gen        8450767 8450767     1    12  19.119 0.0009081 ***
## nitro     32535317 6507063     5    10  14.722 0.0002453 ***
## gen:nitro  1036562  207312     5    12   0.469 0.7923377    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

summary(mm3)  ## additional variance component, so no information sacrificed

## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: yield ~ gen * nitro + (1 | loc/nitro/gen)
##    Data: gomez
## 
## REML criterion at convergence: 1565.5
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -2.45880 -0.47848 -0.02926  0.61971  2.98686 
## 
## Random effects:
##  Groups          Name        Variance Std.Dev.
##  gen:(nitro:loc) (Intercept)  15428   124.21  
##  nitro:loc       (Intercept) 228478   477.99  
##  loc             (Intercept)   9484    97.38  
##  Residual                    442009   664.84  
## Number of obs: 108, groups:  gen:(nitro:loc), 36; nitro:loc, 18; loc, 3
## 
## Fixed effects:
##                Estimate Std. Error      df t value Pr(>|t|)    
## (Intercept)     3392.67     365.48   17.67   9.283 3.26e-08 ***
## genG2            305.89     329.41   12.00   0.929 0.371398    
## nitro30         1652.67     510.71   15.08   3.236 0.005510 ** 
## nitro60         2380.67     510.71   15.08   4.661 0.000303 ***
## nitro90         2994.11     510.71   15.08   5.863 3.06e-05 ***
## nitro120        3119.11     510.71   15.08   6.107 1.96e-05 ***
## nitro150        2794.89     510.71   15.08   5.473 6.31e-05 ***
## genG2:nitro30    465.00     465.85   12.00   0.998 0.337900    
## genG2:nitro60    275.33     465.85   12.00   0.591 0.565467    
## genG2:nitro90    531.67     465.85   12.00   1.141 0.276019    
## genG2:nitro120    24.89     465.85   12.00   0.053 0.958271    
## genG2:nitro150   395.89     465.85   12.00   0.850 0.412062    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) genG2  nitr30 nitr60 nitr90 ntr120 ntr150 gG2:30 gG2:60
## genG2       -0.451                                                        
## nitro30     -0.699  0.322                                                 
## nitro60     -0.699  0.322  0.500                                          
## nitro90     -0.699  0.322  0.500  0.500                                   
## nitro120    -0.699  0.322  0.500  0.500  0.500                            
## nitro150    -0.699  0.322  0.500  0.500  0.500  0.500                     
## genG2:ntr30  0.319 -0.707 -0.456 -0.228 -0.228 -0.228 -0.228              
## genG2:ntr60  0.319 -0.707 -0.228 -0.456 -0.228 -0.228 -0.228  0.500       
## genG2:ntr90  0.319 -0.707 -0.228 -0.228 -0.456 -0.228 -0.228  0.500  0.500
## gnG2:ntr120  0.319 -0.707 -0.228 -0.228 -0.228 -0.456 -0.228  0.500  0.500
## gnG2:ntr150  0.319 -0.707 -0.228 -0.228 -0.228 -0.228 -0.456  0.500  0.500
##             gG2:90 gG2:12
## genG2                    
## nitro30                  
## nitro60                  
## nitro90                  
## nitro120                 
## nitro150                 
## genG2:ntr30              
## genG2:ntr60              
## genG2:ntr90              
## gnG2:ntr120  0.500       
## gnG2:ntr150  0.500  0.500

Now look at emmeans for nitro and gen:

emmeans(mm3, pairwise~nitro)

## $emmeans
##  nitro emmean  SE   df lower.CL upper.CL
##  0       3546 326 11.9     2834     4257
##  30      5431 326 11.9     4720     6142
##  60      6064 326 11.9     5353     6775
##  90      6806 326 11.9     6094     7517
##  120     6677 326 11.9     5966     7388
##  150     6538 326 11.9     5827     7250
## 
## Results are averaged over the levels of: gen 
## Degrees-of-freedom method: kenward-roger 
## Confidence level used: 0.95 
## 
## $contrasts
##  contrast  estimate  SE df t.ratio p.value
##  0 - 30       -1885 455 10  -4.148  0.0180
##  0 - 60       -2518 455 10  -5.541  0.0025
##  0 - 90       -3260 455 10  -7.173  0.0003
##  0 - 120      -3132 455 10  -6.890  0.0004
##  0 - 150      -2993 455 10  -6.585  0.0006
##  30 - 60       -633 455 10  -1.393  0.7306
##  30 - 90      -1375 455 10  -3.025  0.0985
##  30 - 120     -1246 455 10  -2.742  0.1495
##  30 - 150     -1108 455 10  -2.437  0.2304
##  60 - 90       -742 455 10  -1.632  0.5983
##  60 - 120      -613 455 10  -1.349  0.7539
##  60 - 150      -474 455 10  -1.044  0.8923
##  90 - 120       128 455 10   0.282  0.9997
##  90 - 150       267 455 10   0.588  0.9896
##  120 - 150      139 455 10   0.305  0.9995
## 
## Results are averaged over the levels of: gen 
## Degrees-of-freedom method: kenward-roger 
## P value adjustment: tukey method for comparing a family of 6 estimates

emmeans(mm3, pairwise~gen)

## $emmeans
##  gen emmean  SE   df lower.CL upper.CL
##  G1    5550 158 2.96     5044     6055
##  G2    6138 158 2.96     5632     6643
## 
## Results are averaged over the levels of: nitro 
## Degrees-of-freedom method: kenward-roger 
## Confidence level used: 0.95 
## 
## $contrasts
##  contrast estimate  SE df t.ratio p.value
##  G1 - G2      -588 134 12  -4.373  0.0009
## 
## Results are averaged over the levels of: nitro 
## Degrees-of-freedom method: kenward-roger

Let’s examine pairwise comparisons as compact letter displays (ie. CLD, sometimes called tukey groupings).

mm3em <- emmeans(mm3, pairwise~nitro)
cld(mm3em)

##  nitro emmean  SE   df lower.CL upper.CL .group
##  0       3546 326 11.9     2834     4257  1    
##  30      5431 326 11.9     4720     6142   2   
##  60      6064 326 11.9     5353     6775   2   
##  150     6538 326 11.9     5827     7250   2   
##  120     6677 326 11.9     5966     7388   2   
##  90      6806 326 11.9     6094     7517   2   
## 
## Results are averaged over the levels of: gen 
## Degrees-of-freedom method: kenward-roger 
## Confidence level used: 0.95 
## P value adjustment: tukey method for comparing a family of 6 estimates 
## significance level used: alpha = 0.05 
## NOTE: Compact letter displays can be misleading
##       because they show NON-findings rather than findings.
##       Consider using 'pairs()', 'pwpp()', or 'pwpm()' instead.

Treatments that share the same number in the “.group” column do not differ significantly, while treatments with different numbers are significantly different.

Now let’s extract means and make plot of nitrogen emmeans with CLD:

n1 <- emmeans(mm3, ~nitro) %>% as.data.frame()

Let’s plot this:

ggplot(n1, aes(x=nitro, y=emmean)) + 
  geom_point(size=5) + 
  geom_errorbar(aes(ymin=lower.CL,
                    ymax=upper.CL), width=0, lwd=2) + 
  ylab("yield (g) +/- 95% CI") +
  theme_bw(base_size = 20)+
  annotate("text",
           x=c(1,2,3,4,5,6), y=7750, 
           label=c("A","B","B","B","B","B"),
           size=10)

R CHALLENGE

Do grazing or Nitrogen affect insect abundance?

Construct an appropriate linear model
Model assumptions met?
Do grazing or Nitrogen affect insect abundance?
How do the results change depending on whether you include a block or split-plot?
How big is the blocking and split-plot effect?

For the challenge here is the data:

d1 <-read_csv("InsectData.csv")
head(d1)

## # A tibble: 6 × 4
##   abund N_Add  site  grazed
##   <dbl> <chr>  <chr> <chr> 
## 1    25 High.N A     YES   
## 2    24 Low.N  A     YES   
## 3    20 Med.N  A     YES   
## 4    17 No.N   A     YES   
## 5    20 High.N A     NO    
## 6    22 Low.N  A     NO

d1$N_Add <- factor(d1$N_Add, levels=c("No.N","Low.N","Med.N","High.N"))  ## reorder factor levels

Extra: Revisiting residuals

For this section let’s load the following packages:

library(tidyverse)
library(car)
library(lme4)
library(lmerTest)
library(emmeans)
library(viridis)
library(MuMIn)
library(glmmTMB)

Let’s use the chickweight dataset:

data("ChickWeight")
ChickWeight$Diet <- as.factor(ChickWeight$Diet)
?ChickWeight

Once again let’s quickly plot out the data:

## plot out data
ggplot(data=ChickWeight) +
  geom_point(data=ChickWeight, aes(x=Time,
                                   y=weight,
                                   color=as.numeric(Chick))) +
  scale_color_viridis() +
  facet_wrap(~Diet) +
  geom_smooth(data=ChickWeight, method="lm",
              aes(x=Time,y=weight)) +
  theme_bw(base_size = 16)

Here we use Chick as a random effect:

cw1 <- glmmTMB(weight ~ Time * Diet + (1|Chick), data=ChickWeight)
summary(cw1)

##  Family: gaussian  ( identity )
## Formula:          weight ~ Time * Diet + (1 | Chick)
## Data: ChickWeight
## 
##      AIC      BIC   logLik deviance df.resid 
##   5508.0   5551.6  -2744.0   5488.0      568 
## 
## Random effects:
## 
## Conditional model:
##  Groups   Name        Variance Std.Dev.
##  Chick    (Intercept) 498.0    22.32   
##  Residual             638.4    25.27   
## Number of obs: 578, groups:  Chick, 50
## 
## Dispersion estimate for gaussian family (sigma^2):  638 
## 
## Conditional model:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  31.5081     5.9113   5.330 9.82e-08 ***
## Time          6.7130     0.2573  26.086  < 2e-16 ***
## Diet2        -2.8812    10.1919  -0.283    0.777    
## Diet3       -13.2564    10.1919  -1.301    0.193    
## Diet4        -0.3927    10.2007  -0.038    0.969    
## Time:Diet2    1.8962     0.4267   4.444 8.85e-06 ***
## Time:Diet3    4.7098     0.4267  11.037  < 2e-16 ***
## Time:Diet4    2.9494     0.4323   6.823 8.91e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Anova(cw1)

## Analysis of Deviance Table (Type II Wald chisquare tests)
## 
## Response: weight
##              Chisq Df Pr(>Chisq)    
## Time      3088.530  1  < 2.2e-16 ***
## Diet        20.222  3  0.0001527 ***
## Time:Diet  131.333  3  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Note we are using Anova() from the car package. Diet p-value is low, but note it uses Wald Chi-sq test.

Residuals:

hist(residuals(cw1))

plot(residuals(cw1)~fitted(cw1))

So how can we fix those residuals? We can try using a model incorporating an autoregressive covariance structure with the package glmmTMB:

cw1ar <- glmmTMB(weight ~ Time * Diet + ar1(0 + as.factor(Time)|Chick), data=ChickWeight)

Now let’s examine model results:

summary(cw1ar)

##  Family: gaussian  ( identity )
## Formula:          weight ~ Time * Diet + ar1(0 + as.factor(Time) | Chick)
## Data: ChickWeight
## 
##      AIC      BIC   logLik deviance df.resid 
##   4484.3   4532.2  -2231.1   4462.3      567 
## 
## Random effects:
## 
## Conditional model:
##  Groups   Name             Variance  Std.Dev.  Corr      
##  Chick    as.factor(Time)0 1.651e+03 40.634167 0.97 (ar1)
##  Residual                  4.121e-05  0.006419           
## Number of obs: 578, groups:  Chick, 50
## 
## Dispersion estimate for gaussian family (sigma^2): 4.12e-05 
## 
## Conditional model:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  40.3594     9.0782   4.446 8.76e-06 ***
## Time          6.0679     0.3465  17.510  < 2e-16 ***
## Diet2        -0.9261    15.7206  -0.059 0.953024    
## Diet3        -2.3382    15.7240  -0.149 0.881786    
## Diet4        -1.0029    15.7211  -0.064 0.949136    
## Time:Diet2    2.2042     0.5837   3.776 0.000159 ***
## Time:Diet3    4.8553     0.5837   8.319  < 2e-16 ***
## Time:Diet4    3.1306     0.5864   5.339 9.36e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Anova(cw1ar)

## Analysis of Deviance Table (Type II Wald chisquare tests)
## 
## Response: weight
##              Chisq Df Pr(>Chisq)    
## Time      1459.448  1  < 2.2e-16 ***
## Diet        11.624  3    0.00879 ** 
## Time:Diet   75.942  3  2.277e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

and the residuals:

hist(residuals(cw1ar)) ## hist looks ok

plot(residuals(cw1ar)~fitted(cw1ar))

We can compare AIC for models:

AIC(cw1,cw1ar)

##       df      AIC
## cw1   10 5508.017
## cw1ar 11 4484.270

As well as compare the emmeans:

emmeans(cw1, pairwise~Diet, at=list(Time=20)) # Yes, 4 of 6 differ at time 20

## $emmeans
##  Diet emmean   SE  df lower.CL upper.CL
##  1       166 5.89 568      154      177
##  2       201 8.04 568      185      217
##  3       247 8.04 568      231      263
##  4       224 8.10 568      208      240
## 
## Confidence level used: 0.95 
## 
## $contrasts
##  contrast estimate    SE  df t.ratio p.value
##  1 - 2       -35.0  9.97 568  -3.516  0.0027
##  1 - 3       -80.9  9.97 568  -8.120  <.0001
##  1 - 4       -58.6 10.01 568  -5.852  <.0001
##  2 - 3       -45.9 11.37 568  -4.035  0.0004
##  2 - 4       -23.6 11.41 568  -2.063  0.1665
##  3 - 4        22.3 11.41 568   1.958  0.2054
## 
## P value adjustment: tukey method for comparing a family of 4 estimates

emmeans(cw1ar, pairwise~Diet, at=list(Time=20)) # Yes, 4 of 6 differ at time 20

## $emmeans
##  Diet emmean    SE  df lower.CL upper.CL
##  1       162  9.15 567      144      180
##  2       205 12.64 567      180      230
##  3       256 12.65 567      232      281
##  4       223 12.69 567      198      248
## 
## Confidence level used: 0.95 
## 
## $contrasts
##  contrast estimate   SE  df t.ratio p.value
##  1 - 2       -43.2 15.6 567  -2.766  0.0298
##  1 - 3       -94.8 15.6 567  -6.072  <.0001
##  1 - 4       -61.6 15.6 567  -3.938  0.0005
##  2 - 3       -51.6 17.9 567  -2.888  0.0210
##  2 - 4       -18.5 17.9 567  -1.030  0.7318
##  3 - 4        33.2 17.9 567   1.851  0.2506
## 
## P value adjustment: tukey method for comparing a family of 4 estimates

TRY ON YOUR OWN

Use the data sets from agridat:

?nass.sorghum

nass1 <- nass.cotton %>% filter(state=="Florida") %>%
  mutate(crop="cotton")
nass2 <- nass.corn %>% filter(state=="Florida") %>%
  mutate(crop="corn")
nass3 <- nass.hay %>% filter(state=="Florida") %>%
  mutate(crop="hay")

Put together the agridat data:

nassdata <- rbind(nass1,nass2,nass3)
head(nassdata)

##   year   state  acres yield   crop
## 1 1866 Florida 155000   123 cotton
## 2 1867 Florida 145000   181 cotton
## 3 1868 Florida 120000   130 cotton
## 4 1869 Florida 132000   121 cotton
## 5 1870 Florida 148000   170 cotton
## 6 1871 Florida 162000    90 cotton

Now plot it:

ggplot(nassdata, aes(x=year, y=yield)) +
  geom_point() +
  geom_line() +
  facet_wrap(~crop, scales="free_y")

On your own: construct some models you think are appropriate for fitting to these data