Modified from hybridInflo Biomass.Rmd (April 2017)
Identify reproductive barriers between two sympatric moth-pollinated plant species, Schiedea kaalae and S. hookeri by fitting a generalized linear mixed model (GLMM).
In the experimental design, the following crosstypes were made:
In this analysis the response variable is the biomass of the offspring produced by each cross. Other barriers (hybrid survival, flowering) could be analyzed in a similar framework, with appropriate changes to the underlying distribution.
Fixed effects:
Potential random effects:
ib <- read.table("vegbiomass.csv", header=T, sep="\t",
colClasses=c(collect.date="Date", weigh.date="Date"))
crosses <- read.table("hybrids.csv", header=T, sep="\t", colClasses=c(mompop="factor", dadpop="factor"))
crosscol <- c("green","blue","orange","red")
#treat populations as factors
ib$mompop <- crosses$mompop[match(ib$crossid, crosses$crossid)]
ib$momid <- crosses$momid[match(ib$crossid, crosses$crossid)]
ib$species <- crosses$momsp[match(ib$crossid, crosses$crossid)]
ib$dadpop <- crosses$dadpop[match(ib$crossid, crosses$crossid)]
ib$dadid <- crosses$dadid[match(ib$crossid, crosses$crossid)]
ib$dadsp <- crosses$dadsp[match(ib$crossid, crosses$crossid)]
ib$crosstype <- crosses$crosstype[match(ib$crossid, crosses$crossid)]
ib$cross <- crosses$cross[match(ib$crossid, crosses$crossid)]
ib <- ib[!is.na(ib$crosstype),] #missing information for 2 samples
#rename crosstype codes
ib$crosstype <- factor(ib$crosstype, levels=c("between", "within", "hybrid"))
#made "between" the first reference level to facilitate comparison between outcrossing populations and hybridizing species
ib$mompop <- sapply(ib$mompop, mapvalues, from = c("794","866","899","879","892","904","3587"), to = c("WK","WK","WK","879WKG","892WKG","904WPG","3587WP"))
ib$dadpop <- sapply(ib$dadpop, mapvalues, from = c("794","866","899","879","892","904","3587"), to = c("WK","WK","WK","879WKG","892WKG","904WPG","3587WP"))
#define interactions
ib <- within(ib, sxc <- interaction(species,crosstype))
ib <- within(ib, sxcxm <- interaction(species,crosstype,mompop,momid))
ib <- within(ib, mompid <- as.factor(paste(mompop,momid,sep=".")))
ib <- within(ib, dadpid <- as.factor(paste(dadpop,dadid,sep=".")))
ib <- within(ib, smompop <- as.factor(paste(species,mompop,sep="")))
ib$collect.date <- round(difftime(ib$collect.date, "2016-03-10"))
#check final structure
str(ib)
'data.frame': 1067 obs. of 22 variables:
$ crossid : int 93 76 67 3 29 7 2 89 63 60 ...
$ plantid : Factor w/ 29 levels "","`4","1","10",..: 11 17 9 23 9 3 14 15 23 24 ...
$ cross : Factor w/ 4 levels "HH","HK","KH",..: 1 1 2 1 2 2 1 1 1 1 ...
$ block1 : Factor w/ 4 levels "","A","B","V": 1 2 2 1 2 2 1 2 1 2 ...
$ blockAB : Factor w/ 10 levels "","\"HH\"","\"KK\"",..: 1 1 1 1 1 1 1 1 1 1 ...
$ collect.date: 'difftime' num 280 280 280 282 ...
..- attr(*, "units")= chr "days"
$ mass : num 14.41 10.28 11.22 4.26 12.98 ...
$ weigh.date : Date, format: "2017-08-03" "2017-08-03" ...
$ comments : Factor w/ 37 levels "","?\"KK\"-Sara",..: 1 1 1 1 1 1 1 1 1 1 ...
$ linenum : num 1 2 3 4 5 6 7 8 9 10 ...
$ mompop : Factor w/ 5 levels "3587WP","WK",..: 2 2 2 3 3 3 3 2 2 2 ...
$ momid : Factor w/ 17 levels "1","10","10-1",..: 6 10 6 7 3 7 7 6 6 6 ...
$ species : Factor w/ 2 levels "hook","kaal": 1 1 1 1 1 1 1 1 1 1 ...
$ dadpop : Factor w/ 5 levels "3587WP","WK",..: 2 3 4 3 1 4 3 3 2 3 ...
$ dadid : Factor w/ 23 levels "1","10","10-1",..: 14 19 14 22 6 13 3 10 14 3 ...
$ dadsp : Factor w/ 2 levels "hook","kaal": 1 1 2 1 2 2 1 1 1 1 ...
$ crosstype : Factor w/ 3 levels "between","within",..: 1 1 3 2 3 3 2 1 2 1 ...
$ sxc : Factor w/ 6 levels "hook.between",..: 1 1 5 3 5 5 3 1 3 1 ...
$ sxcxm : Factor w/ 510 levels "hook.between.3587WP.1",..: 157 277 161 195 77 197 195 157 159 157 ...
$ mompid : Factor w/ 21 levels "3587WP.10","3587WP.14",..: 19 21 19 8 7 8 8 19 19 19 ...
$ dadpid : Factor w/ 23 levels "3587WP.10","3587WP.14",..: 23 9 15 11 3 14 7 8 23 7 ...
$ smompop : Factor w/ 5 levels "hook879WKG","hookWK",..: 2 2 2 1 1 1 1 2 2 2 ...
library(ggplot2)
ggplot(ib, aes(x=log10(mass), fill=cross, color=cross)) + geom_density(alpha=0.1, bw=0.06) + scale_fill_manual(values=crosscol) +scale_color_manual(values=crosscol)
The sample sizes are unbalanced at all levels, including maternal population:
reptab <- with(ib, table(smompop,crosstype))
mosaic(reptab, pop=F)
labeling_cells(text = reptab, margin = 0)(reptab)
Replication is low for some within-population crosses. The replication is even lower for each maternal plant, so we need to be wary of estimates when subsetting at this level:
with(ib, kable(table(mompid,crosstype)))
| between | within | hybrid | |
|---|---|---|---|
| 3587WP.10 | 0 | 1 | 0 |
| 3587WP.14 | 15 | 0 | 7 |
| 3587WP.15 | 4 | 0 | 0 |
| 3587WP.7 | 18 | 15 | 11 |
| 3587WP.A | 5 | 0 | 0 |
| 3587WP.C | 11 | 1 | 0 |
| 879WKG.10-1 | 51 | 1 | 38 |
| 879WKG.2-2 | 31 | 42 | 93 |
| 879WKG.G-2 | 17 | 7 | 30 |
| 879WKG.H-2 | 0 | 1 | 8 |
| 879WKG.N-5 | 9 | 6 | 21 |
| 892WKG.1 | 25 | 0 | 8 |
| 892WKG.3 | 1 | 0 | 3 |
| 892WKG.4 | 1 | 0 | 1 |
| 892WKG.5 | 7 | 5 | 6 |
| 904WPG.2 | 12 | 8 | 8 |
| 904WPG.3 | 15 | 26 | 4 |
| 904WPG.5 | 79 | 29 | 23 |
| WK.2 | 137 | 18 | 103 |
| WK.2E- 1 | 0 | 0 | 17 |
| WK.4 | 59 | 7 | 22 |
To identify the best-fitting distribution, we make quantile-quantile plots of the raw data against various distributions. The more points within the confidence interval envelopes, the better the fit. Later, we present quantile-quantile plots of the model residuals to assess model fit.
#QQ plots against various distributions
set.seed(1)
par(mfrow=c(1,3))
normal <- fitdistr(log10(ib$mass+1), "normal")
qqp(log10(ib$mass+1), "norm", main="Normal")
[1] 940 983
lognormal <- fitdistr(ib$mass+1, "lognormal")
qqp(ib$mass+1, "lnorm", main="Log Normal")
[1] 1044 535
#pois <- fitdistr(ib$mass+1, "Poisson")
#qqp(ib$mass, "pois", pois$estimate, main="Poisson")
#nbinom <- fitdistr(ib$mass+1, "Negative Binomial")
#qqp(ib$mass+1, "nbinom", size = nbinom$estimate[[1]], mu=nbinom$estimate[[2]], main="Negative Binomial")
gamma <- fitdistr(ib$mass+1, "gamma")
qqp(ib$mass+1, "gamma", shape = gamma$estimate[[1]], rate = gamma$estimate[[2]], main="Gamma")
[1] 1044 535
ggplot(ib, aes(x = mass, fill=species)) +
geom_histogram(data=subset(ib,species == "hook"), aes(y=-..density..),binwidth=2)+
geom_histogram(data=subset(ib,species == "kaal"), aes(y= ..density..),binwidth=2)+
coord_flip() + facet_grid(~crosstype) + labs(y="Histogram", x="Vegetative biomass (g)") +
scale_fill_manual("Maternal species", labels = c("S. hookeri ", "S. kaalae "), values=brewer.pal(name="Set1", n=3)[c(3,2)]) + theme_classic() + theme(legend.text=element_text(face="italic", size=rel(1)), legend.position="bottom", axis.text = element_text(colour="black", size=rel(1)), text=element_text(size=14), axis.ticks.x = element_blank())
ggplot(aes(y=mass, x=mompid, color=crosstype), data=ib) + geom_count(alpha=0.8) + coord_flip() + labs(x="Maternal plant", y="Mass")
Our mixed model uses one parameter to capture random effect variance, which is assumed to be homogeneous. Plotting on a log scale should uncouple variances from means to assess this visually. Subsets are species * crosstype * maternal plant.
Subset variances are not homogeneous:
ggplot(aes(y=log10(mass+1), x=sxcxm, color=crosstype), data=ib) + geom_boxplot() + coord_flip() + labs(y="ln(Vegetative Biomass + 1)",x="Subsets")
Various distributions make different assumptions about the mean-variance (µ-Var) ratio.
grpVars <- with(ib, tapply(mass, list(sxcxm), var))
grpMeans <- with(ib, tapply(mass, list(sxcxm), mean))
grpCounts <- with(ib, tapply(mass, list(sxcxm), length))
#set weight=grpCounts to weight loess by sample sizes
ggplot(na.omit(data.frame(grpMeans,grpVars,grpCounts)),
aes(x=grpMeans,y=grpVars, weight=1))+geom_point(aes(size=grpCounts))+
guides(colour=guide_legend(title="Fit"),size=guide_legend(title="Sample size")) + labs(x="Subset Mean", y="Subset Variance") + labs(subtitle="Subset: species*crosstype*mompid")
Effects and interactions in these plots are simply given by the mean, which may be unduly influenced by high values.
intplot <- ggplot(ib,aes(x=crosstype,y=mass))+
geom_count(aes(size = ..prop.., group=sxc),alpha=0.5)+
stat_summary(aes(x=as.numeric(crosstype)),fun.y=mean,geom="line")+ facet_grid(~species)
intplot + aes(group=species, color=species)
intplot + aes(group=mompop, color=mompop)
intplot + aes(group=mompid, color=mompop)
intplot + aes(group=dadpop, color=dadpop)
Run many generalized linear models on subsets of the data defined by crosstype | mompid to see if effects estimates are consistent within maternal plants.
Most maternal plant subsets agree, but some are problematic outliers. These plants can be picked out visually from the random effects interaction plot above, the estimated parameters of each subset model, and the QQ plot of the estimated parameters:
#had to get rid of species or mompid since mompid is nested inside species. dadpop also works
glm.lis <- lmList(log10(mass)~crosstype|mompid,data=ib, family="gaussian")
plot.lmList(glm.lis,scale=list(x=list(relation="free")))
Loading required package: reshape
Attaching package: 'reshape'
The following object is masked from 'package:dplyr':
rename
The following objects are masked from 'package:plyr':
rename, round_any
The following object is masked from 'package:Matrix':
expand
Using grp as id variables
qqmath.lmList(glm.lis)#
Using as id variables
We constructed the following models with the package glmmADMB. They all have the same fixed effects, species x crosstype, and response variable, log10(mass)
| Distribution, Random Effects: | None | Maternal plant | Maternal population |
|---|---|---|---|
| normal (norm) | X | X | X |
#Normal (Gaussian) distribution, identity link
ib <- ib[!is.na(ib$collect.date),]
sc.norm.l <- lm(log10(mass)~species*crosstype+collect.date, data=ib)
sc.mix.mompid.l <- lmer(log10(mass)~species*crosstype+collect.date + (1|mompid), data=ib)
sc.nd.mix.momdadpid.l <- lmer(log10(mass)~species*crosstype + (1|mompid)+ (1|dadpid), data=ib)
sc.mix.mompop.l <- lmer(log10(mass)~species*crosstype+collect.date + (1|mompop), data=ib)
sc.mix.momdadpid.l <- lmer(log10(mass)~species*crosstype+collect.date + (1|mompid) + (1|dadpid), data=ib)
sc.mix.momdadpop.l <- lmer(log10(mass)~species*crosstype+collect.date + (1|mompop) + (1|dadpop), data=ib)
We will use the Aikake Information Criterion to pick the model the best fits the data, penalized by the number of parameters. Differences of 2 units are significant.
sc.names <- c("sc.norm.l","sc.mix.mompid.l","sc.mix.mompop.l","sc.mix.momdadpid.l","sc.mix.momdadpop.l")
sc.list <- sapply(sc.names, get, USE.NAMES=T)
sc.AIC <- ICtab(sc.list,mnames=sc.names,type="AIC", base=T, delta=F) # for AICc, nobs=nobs(sc.list[[1]])
class(sc.AIC)<-"data.frame"
all.names <- c(sc.names)
all.list <- sapply(all.names, get, USE.NAMES=T)
all.AIC <- dfun(rbind(sc.AIC))
all.AIC <- all.AIC[order(all.AIC$dAIC),]
kable(all.AIC, format.arg=list(digits=3))
| dAIC | df | |
|---|---|---|
| sc.norm.l | 0.0 | 8 |
| sc.mix.mompid.l | 45.7 | 9 |
| sc.mix.momdadpid.l | 46.6 | 10 |
| sc.mix.momdadpop.l | 48.9 | 10 |
| sc.mix.mompop.l | 51.7 | 9 |
The chosen model is a mixed model with the following components:
Looking at the normal, fixed effects model, we see that the residuals are not normal:
shapiro.test(sc.norm.l$residuals)#raw residuals!
Shapiro-Wilk normality test
data: sc.norm.l$residuals
W = 0.82461, p-value < 2.2e-16
The coefficients estimated for each model agree qualitatively.
sc.log.names <- sc.names
sc.log <- sapply(sc.log.names, get, USE.NAMES=T)
coefplot2(sc.log, legend.x="topright",legend=T,legend.args=list(cex=0.8, xpd=T, inset=c(-0.1,0)), col.pts=sample(gg_color_hue(length(sc.log.names))), spacing=0.05, lwd.2=2, lwd.1=4, intercept=F)
We chose the model with nearly the best (lowest) AIC, to carry out inference tests and parameter estimation.
mod <- sc.mix.momdadpid.l
print(mod)
Linear mixed model fit by REML ['lmerMod']
Formula:
log10(mass) ~ species * crosstype + collect.date + (1 | mompid) +
(1 | dadpid)
Data: ib
REML criterion at convergence: -33.4985
Random effects:
Groups Name Std.Dev.
dadpid (Intercept) 0.01992
mompid (Intercept) 0.03578
Residual 0.23136
Number of obs: 1052, groups: dadpid, 23; mompid, 21
Fixed Effects:
(Intercept) specieskaal
0.9346396 -0.2940649
crosstypewithin crosstypehybrid
0.0196676 -0.0075099
collect.date specieskaal:crosstypewithin
0.0006075 -0.0924778
specieskaal:crosstypehybrid
0.2200991
By dropping it from the model and performing a likelihood-ratio test, we see that the species x crosstype interaction is significant:
sxc.chisq <- drop1(mod, test="Chisq") #load from file
dfun(sxc.chisq)
Single term deletions
Model:
log10(mass) ~ species * crosstype + collect.date + (1 | mompid) +
(1 | dadpid)
Df dAIC LRT Pr(Chi)
<none> 0.000
collect.date 1 11.138 13.138 0.0002894 ***
species:crosstype 2 22.229 26.229 2.016e-06 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The model estimated the following parameters, with individual parameter significance determined by the Wald z-test, and fixed effect significance determined by analysis of deviance Wald test.
summary(mod)
Linear mixed model fit by REML ['lmerMod']
Formula:
log10(mass) ~ species * crosstype + collect.date + (1 | mompid) +
(1 | dadpid)
Data: ib
REML criterion at convergence: -33.5
Scaled residuals:
Min 1Q Median 3Q Max
-9.8218 -0.4103 0.1157 0.5712 2.3843
Random effects:
Groups Name Variance Std.Dev.
dadpid (Intercept) 0.0003967 0.01992
mompid (Intercept) 0.0012800 0.03578
Residual 0.0535294 0.23136
Number of obs: 1052, groups: dadpid, 23; mompid, 21
Fixed effects:
Estimate Std. Error t value
(Intercept) 0.9346396 0.0575279 16.247
specieskaal -0.2940649 0.0315574 -9.318
crosstypewithin 0.0196676 0.0306259 0.642
crosstypehybrid -0.0075099 0.0219931 -0.341
collect.date 0.0006075 0.0001667 3.644
specieskaal:crosstypewithin -0.0924778 0.0446544 -2.071
specieskaal:crosstypehybrid 0.2200991 0.0433032 5.083
Correlation of Fixed Effects:
(Intr) spcskl crsstypw crsstyph cllct. spcskl:crsstypw
specieskaal -0.228
crsstypwthn -0.139 0.231
crsstyphybr -0.174 0.462 0.342
collect.dat -0.924 -0.042 0.013 -0.050
spcskl:crsstypw 0.087 -0.370 -0.688 -0.247 0.002
spcskl:crsstyph 0.095 -0.519 -0.167 -0.603 0.040 0.275
Anova(mod, type=3)
Analysis of Deviance Table (Type III Wald chisquare tests)
Response: log10(mass)
Chisq Df Pr(>Chisq)
(Intercept) 263.9556 1 < 2.2e-16 ***
species 86.8325 1 < 2.2e-16 ***
crosstype 0.7693 2 0.6806900
collect.date 13.2784 1 0.0002685 ***
species:crosstype 38.8599 2 3.645e-09 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The least square means procedure can generate predictor estimates of each type, and give their significance groupings with a post-hoc Tukey test. S. hookeri-produced hybrids produce less Vegetative Biomass than either crosses between or within S. hookeri populations. The other differences are not significant, but remember that the fixed effect of hybrid (vs. between) was significant (model summary).
#Count
rg <- ref.grid(mod)
Loading required namespace: lmerTest
#summary(rg)
sxc.lsm <- lsmeans(rg, ~ crosstype*species)
plot(sxc.lsm)
options(digits=4)
cld.mod <- cld(sxc.lsm, Letters=letters) #tukey letterings
cld.mod$response <- 10 ^ cld.mod$lsmean
cld.mod$uSE <- 10 ^ (cld.mod$lsmean+cld.mod$SE)
cld.mod$lSE <- 10 ^ (cld.mod$lsmean-cld.mod$SE)
cld.mod[rev(order(cld.mod$species, cld.mod$crosstype)),]
crosstype species lsmean SE df lower.CL upper.CL .group response
hybrid kaal 1.0493 0.03235 96.63 0.9811 1.1176 b 11.203
within kaal 0.7639 0.03115 63.09 0.6982 0.8297 a 5.807
between kaal 0.8367 0.02258 34.68 0.7891 0.8844 a 6.867
hybrid hook 1.1233 0.02026 16.25 1.0805 1.1661 b 13.283
within hook 1.1505 0.03121 71.05 1.0846 1.2164 b 14.141
between hook 1.1308 0.02202 16.91 1.0843 1.1773 b 13.515
uSE lSE
12.069 10.399
6.239 5.405
7.233 6.519
13.917 12.678
15.195 13.160
14.218 12.847
Degrees-of-freedom method: satterthwaite
Results are given on the log10 (not the response) scale.
Confidence level used: 0.95
P value adjustment: tukey method for comparing a family of 6 estimates
significance level used: alpha = 0.05
H.wb <- with(cld.mod[cld.mod$species=="hook",], response[crosstype=="within"]/response[crosstype=="between"] - 1)
K.wb <- with(cld.mod[cld.mod$species=="kaal",], response[crosstype=="within"]/response[crosstype=="between"] - 1)
K.H <- with(cld.mod[cld.mod$crosstype=="between",], response[species=="kaal"]/response[species=="hook"] - 1)
maxsp <- ifelse(K.H>0, "kaal","hook")
minsp <- ifelse(K.H<0, "kaal","hook")
maxresp <- with(cld.mod, response[species==maxsp & crosstype=="between"])
minresp <- with(cld.mod, response[species==minsp & crosstype=="between"])
HK.resp <- with(cld.mod, response[species=="hook" & crosstype=="hybrid"])
KH.resp <- with(cld.mod, response[species=="kaal" & crosstype=="hybrid"])
HK.int <- with(cld.mod, ifelse(HK.resp > minresp & HK.resp < maxresp, (HK.resp-minresp)/(maxresp-minresp),
ifelse(HK.resp < minresp, HK.resp/minresp-1, HK.resp/maxresp-1)))
KH.int <- with(cld.mod, ifelse(KH.resp > minresp & KH.resp < maxresp, (KH.resp-minresp)/(maxresp-minresp),
ifelse(KH.resp < minresp, KH.resp/minresp-1, KH.resp/maxresp-1)))
intermed <- (minresp + maxresp) / 2
with(ib, wilcox.test(ib[species=="kaal" & crosstype=="hybrid","mass"], mu=intermed))
Wilcoxon signed rank test with continuity correction
data: ib[species == "kaal" & crosstype == "hybrid", "mass"]
V = 1938, p-value = 1e-05
alternative hypothesis: true location is not equal to 10.19
with(ib, wilcox.test(ib[species=="hook" & crosstype=="hybrid","mass"], mu=intermed))
Wilcoxon signed rank test with continuity correction
data: ib[species == "hook" & crosstype == "hybrid", "mass"]
V = 44724, p-value <2e-16
alternative hypothesis: true location is not equal to 10.19
round(c(H.wb,K.wb,K.H,HK.int,KH.int),2)
[1] 0.05 -0.15 -0.49 0.97 0.65
ggplot(as.data.frame(cld.mod), aes(y=response, x=relevel(crosstype, "within"), fill=species)) +
geom_col(position=position_dodge2()) +
geom_linerange(aes(ymin=lSE, ymax=uSE), position=position_dodge(0.9)) +
labs(x="", y="Vegetative biomass (g)",fill="Maternal species") +
scale_fill_manual(labels = c("S. hookeri ", "S. kaalae "), values=c("grey70","grey30")) +
scale_x_discrete(labels = c("Intrapopulation", "Interpopulation", "Hybrid")) +
geom_text(aes(label=.group), position=position_dodge(0.9), hjust=0, vjust=-1) +
scale_y_continuous(expand = expand_scale(add=c(0,0)), breaks = scales::pretty_breaks(n = 5)) +
theme_classic() + theme(legend.text=element_text(face="italic", size=rel(1)), legend.position="bottom", axis.text = element_text(colour="black", size=rel(1)), text=element_text(size=14), axis.ticks.x = element_blank()) + geom_segment(aes(x=2.5, y=intermed, xend=3.5, yend=intermed))