Modified from hybridViability.Rmd (Oct 2017)
Identify reproductive barriers between two sympatric moth-pollinated plant species, Schiedea kaalae and S. hookeri by fitting a generalized linear mixed model (GLMM).
In the experimental design, the following crosstypes were made:
In this analysis the response variable is the pollen viability of each cross. Other barriers (hybrid survival, flowering) could be analyzed in a similar framework, with appropriate changes to the underlying distribution.
Fixed effects:
Potential random effects:
pol <- read.table("hybridpollen.csv", header=T, sep="\t", colClasses=c(date="Date"))
pol$v <- rowSums(pol[,grepl("V", colnames(pol))])/10 #average viable counts
pol$i <- rowSums(pol[,grepl("I", colnames(pol))])/10 #average inviable counts
pol$vpf <- (pol$v * 444.4/20) * 10 #number viable per anther (5 flrs x 4 anthers)
pol$ipf <- (pol$i * 444.4/20) * 10 #number inviable per anther (5 flrs x 4 anthers)
pol$vp <- pol$vpf / (pol$vpf + pol$ipf) #proportion viable
pol$tpf <- pol$vpf + pol$ipf
pol <- pol[pol$crosstype!="field",]
momdadid <- as.data.frame(matrix(unlist(matrix(lapply(matrix(unlist(strsplit(as.character(pol$fullcross), " x ", fixed=T)), nrow=2), function(x) { spl <- unlist(strsplit(unlist(strsplit(x, " ", fixed=T)), "-", fixed=T)); c(spl[1],gsub("-NA","",paste(spl[2],spl[3],spl[4], sep="-"))) }), ncol=2)), ncol=4, byrow=T))
colnames(momdadid) <- c("mompop","momid","dadpop","dadid")
pol <- cbind(pol, momdadid)
pol$species
NULL
#treat populations as factors
pol$species <- factor(ifelse(pol$mompop %in% c("904","3587", "892"), "kaal", "hook"))
pol$dadsp <- factor(ifelse(pol$dadpop %in% c("904","3587", "892"), "kaal", "hook"))
pol$cross <- factor(toupper(paste0(substr(pol$species,0,1), substr(pol$dadsp,0,1))))
crosscol <- c("green","blue","orange","red")
#rename crosstype codes
pol$crosstype <- factor(pol$crosstype, levels=c("between", "within", "hybrid"))
#made "between" the first reference level to facilitate comparison between outcrossing populations and hybridizing species
pol$mompop <- sapply(pol$mompop, mapvalues, from = c("794","866","899","879","892","904","3587"), to = c("WK","WK","WK","879WKG","892WKG","904WPG","3587WP"))
pol$dadpop <- sapply(pol$dadpop, mapvalues, from = c("794","866","899","879","892","904","3587"), to = c("WK","WK","WK","879WKG","892WKG","904WPG","3587WP"), warn_missing=F)
#define interactions
pol <- within(pol, sxc <- interaction(species,crosstype))
pol <- within(pol, sxcxm <- interaction(species,crosstype,mompop,momid))
pol <- within(pol, mompid <- as.factor(paste(mompop,momid,sep=".")))
pol <- within(pol, dadpid <- as.factor(paste(dadpop,dadid,sep=".")))
pol <- within(pol, smompop <- as.factor(paste(species,mompop,sep="")))
#check final structure
str(pol)
'data.frame': 197 obs. of 45 variables:
$ date : Date, format: "15-10-19" "16-08-17" ...
$ cell : int 3 8 2 5 3 1 3 3 4 NA ...
$ fullcross: Factor w/ 105 levels "3587-12","3587-14",..: 54 54 54 53 53 53 53 53 53 53 ...
$ V1 : int 30 17 13 16 20 3 26 26 24 5 ...
$ I1 : int 28 54 40 43 36 42 35 43 62 42 ...
$ V2 : int 24 2 20 18 11 6 21 50 18 6 ...
$ I2 : int 42 29 27 46 35 31 35 37 71 68 ...
$ V3 : int 26 9 5 17 12 0 24 24 12 4 ...
$ I3 : int 33 43 23 57 34 33 21 27 59 36 ...
$ V4 : int 30 10 6 21 24 6 30 49 15 5 ...
$ I4 : int 28 50 37 36 47 23 44 37 53 55 ...
$ V5 : int 24 10 12 13 10 4 19 40 9 5 ...
$ I5 : int 37 41 36 57 32 35 30 32 27 48 ...
$ V6 : int 26 14 7 13 12 5 30 41 19 5 ...
$ I6 : int 35 43 49 39 21 26 28 28 47 50 ...
$ V7 : int 37 7 7 22 13 1 31 31 15 2 ...
$ I7 : int 45 54 42 55 37 33 25 31 43 60 ...
$ V8 : int 40 14 8 15 8 2 15 30 24 6 ...
$ I8 : int 43 59 44 47 27 29 46 30 40 42 ...
$ V9 : int 38 14 8 23 26 1 24 44 7 4 ...
$ I9 : int 25 50 37 65 43 32 42 57 53 69 ...
$ V10 : int 33 12 14 14 12 2 33 50 14 7 ...
$ I10 : int 57 54 44 37 39 33 37 49 47 59 ...
$ crosstype: Factor w/ 3 levels "between","within",..: 3 3 3 3 3 3 3 3 3 3 ...
$ set : int 2 2 2 2 2 2 2 2 2 NA ...
$ id : Factor w/ 131 levels "","1","10","102-7",..: 93 112 19 67 93 104 3 3 44 68 ...
$ notes : Factor w/ 8 levels "","3587-14 (4/3/13)",..: 1 1 1 1 1 1 1 1 1 1 ...
$ v : num 30.8 10.9 10 17.2 14.8 3 25.3 38.5 15.7 4.9 ...
$ i : num 37.3 47.7 37.9 48.2 35.1 31.7 34.3 37.1 50.2 52.9 ...
$ vpf : num 6844 2422 2222 3822 3289 ...
$ ipf : num 8288 10599 8421 10710 7799 ...
$ vp : num 0.452 0.186 0.209 0.263 0.297 ...
$ tpf : num 15132 13021 10643 14532 11088 ...
$ mompop : Factor w/ 5 levels "3587WP","WK",..: 3 3 3 3 3 3 3 3 3 3 ...
$ momid : Factor w/ 16 levels "1","10","10-1",..: 7 7 7 7 7 7 7 7 7 7 ...
$ dadpop : Factor w/ 5 levels "3587WP","WK",..: 4 4 4 4 4 4 4 4 4 4 ...
$ dadid : Factor w/ 14 levels "1","10","10-1",..: 9 9 9 2 2 2 2 2 2 2 ...
$ species : Factor w/ 2 levels "hook","kaal": 1 1 1 1 1 1 1 1 1 1 ...
$ dadsp : Factor w/ 2 levels "hook","kaal": 2 2 2 2 2 2 2 2 2 2 ...
$ cross : Factor w/ 4 levels "HH","HK","KH",..: 2 2 2 2 2 2 2 2 2 2 ...
$ sxc : Factor w/ 6 levels "hook.between",..: 5 5 5 5 5 5 5 5 5 5 ...
$ sxcxm : Factor w/ 480 levels "hook.between.3587WP.1",..: 197 197 197 197 197 197 197 197 197 197 ...
$ mompid : Factor w/ 21 levels "3587WP.14","3587WP.15",..: 7 7 7 7 7 7 7 7 7 7 ...
$ dadpid : Factor w/ 18 levels "3587WP.14","3587WP.7",..: 11 11 11 9 9 9 9 9 9 9 ...
$ smompop : Factor w/ 5 levels "hook879WKG","hookWK",..: 1 1 1 1 1 1 1 1 1 1 ...
The sample sizes are unbalanced at all levels, including maternal population:
reptab <- with(pol, table(smompop,crosstype))
mosaic(reptab, pop=F)
labeling_cells(text = reptab, margin = 0)(reptab)
Replication is low for some within-population crosses. The replication is even lower for each maternal plant, so we need to be wary of estimates when subsetting at this level:
with(pol, kable(table(mompid,crosstype)))
| between | within | hybrid | |
|---|---|---|---|
| 3587WP.14 | 2 | 0 | 6 |
| 3587WP.15 | 1 | 2 | 1 |
| 3587WP.7 | 3 | 3 | 5 |
| 3587WP.A | 0 | 2 | 4 |
| 3587WP.C | 3 | 0 | 0 |
| 879WKG.10-1 | 2 | 2 | 12 |
| 879WKG.2-2 | 2 | 2 | 17 |
| 879WKG.G-2 | 0 | 0 | 6 |
| 879WKG.N-5 | 2 | 1 | 0 |
| 892WKG.1 | 5 | 3 | 1 |
| 892WKG.10 | 0 | 0 | 2 |
| 892WKG.3 | 5 | 1 | 3 |
| 892WKG.4 | 0 | 0 | 2 |
| 892WKG.5 | 6 | 3 | 0 |
| 904WPG.2 | 2 | 1 | 10 |
| 904WPG.3 | 3 | 3 | 10 |
| 904WPG.5 | 5 | 2 | 7 |
| WK.2 | 10 | 2 | 15 |
| WK.2E-1 | 1 | 0 | 7 |
| WK.3 | 0 | 1 | 0 |
| WK.4 | 5 | 0 | 4 |
To identify the best-fitting distribution, we make quantile-quantile plots of the raw data against various distributions. The more points within the confidence interval envelopes, the better the fit. Later, we present quantile-quantile plots of the model residuals to assess model fit.
#QQ plots against various distributions
set.seed(1)
par(mfrow=c(1,3))
normal <- fitdistr(pol$tpf, "normal")
qqp(pol$tpf, "norm", main="Normal")
lognormal <- fitdistr(pol$tpf, "lognormal")
qqp(pol$tpf, "lnorm", main="Log Normal")
pois <- fitdistr(pol$tpf, "Poisson")
qqp(pol$tpf, "pois", pois$estimate, main="Poisson")
gamma <- fitdistr(pol$tpf, "gamma")
qqp(pol$tpf, "gamma", shape = gamma$estimate[[1]], rate = gamma$estimate[[2]], main="Gamma")
ggplot(pol, aes(x = tpf, fill=species)) +
geom_histogram(data=subset(pol,species == "hook"), aes(y=-..density..),binwidth=1000)+
geom_histogram(data=subset(pol,species == "kaal"), aes(y= ..density..),binwidth=1000)+
coord_flip() + facet_grid(~crosstype) + labs(y="Histogram", x="Viability")
ggplot(aes(y=tpf, x=mompid, color=crosstype), data=pol) + geom_count(alpha=0.8) + coord_flip() + labs(x="Maternal plant", y="tpf")
Our mixed model uses one parameter to capture random effect variance, which is assumed to be homogeneous. Plotting on a log scale should uncouple variances from means to assess this visually. Subsets are species * crosstype * maternal plant.
Subset variances are not homogeneous:
ggplot(aes(y=log(tpf+1), x=sxcxm, color=crosstype), data=pol) + geom_boxplot() + coord_flip() + labs(y="ln(Viability + 1)",x="Subsets")
Various distributions make different assumptions about the mean-variance (µ-Var) ratio.
grpVars <- with(pol, tapply(tpf, list(sxcxm), var))
grpMeans <- with(pol, tapply(tpf, list(sxcxm), mean))
grpCounts <- with(pol, tapply(tpf, list(sxcxm), length))
#set weight=grpCounts to weight loess by sample sizes
ggplot(na.omit(data.frame(grpMeans,grpVars,grpCounts)),
aes(x=grpMeans,y=grpVars, weight=1))+geom_point(aes(size=grpCounts))+
guides(colour=guide_legend(title="Fit"),size=guide_legend(title="Sample size")) + labs(x="Subset Mean", y="Subset Variance") + labs(subtitle="Subset: species*crosstype*mompid")
Effects and interactions in these plots are simply given by the mean, which may be unduly influenced by high values.
intplot <- ggplot(pol,aes(x=crosstype,y=tpf))+
geom_count(aes(size = ..prop.., group=sxc),alpha=0.5)+
stat_summary(aes(x=as.numeric(crosstype)),fun.y=mean,geom="line")+ facet_grid(~species)
intplot + aes(group=species, color=species)
intplot + aes(group=mompop, color=mompop)
intplot + aes(group=mompid, color=mompop)
intplot + aes(group=dadpop, color=dadpop)
Run many generalized linear models on subsets of the data defined by crosstype | mompid to see if effects estimates are consistent within maternal plants.
Most maternal plant subsets agree, but some are problematic outliers. These plants can be picked out visually from the random effects interaction plot above, the estimated parameters of each subset model, and the QQ plot of the estimated parameters:
#had to get rid of species or mompid since mompid is nested inside species. dadpop also works
glm.lis <- lmList(tpf~crosstype|mompid,data=pol, family="gaussian", weights=pol$tpf)
plot.lmList(glm.lis,scale=list(x=list(relation="free")))
Loading required package: reshape
Attaching package: 'reshape'
The following objects are masked from 'package:plyr':
rename, round_any
The following object is masked from 'package:Matrix':
expand
Using grp as id variables
qqmath.lmList(glm.lis)#
Using as id variables
We constructed the following models with the package glmmADMB. They all have the same fixed effects, species x crosstype, and response variable, tpf
| distribution, Random Effects: | None | Maternal plant | Maternal population |
|---|---|---|---|
| normal (norm) | X | X | X |
sc <- lm(tpf~species*crosstype, data=pol)
sc.mix.mompid <- lmer(tpf~species*crosstype + (1|mompid), data=pol)
sc.mix.mompop <- lmer(tpf~species*crosstype + (1|mompop), data=pol)
sc.mix.momdadpid <- lmer(tpf~species*crosstype + (1|mompid) + (1|dadpid), data=pol)
We will use the Aikake Information Criterion to pick the model the best fits the data, penalized by the number of parameters. Differences of 2 units are significant.
#AICtab(sc.b, sc.mix.mompop.b, sc.mix.mompid.b)
sc.names <- c("sc", "sc.mix.mompid", "sc.mix.mompop","sc.mix.momdadpid")
sc.list <- sapply(sc.names, get, USE.NAMES=T)
sc.AIC <- ICtab(sc.list,mnames=sc.names,type="AIC", base=T, delta=F) # for AICc, nobs=nobs(sc.list[[1]])
class(sc.AIC)<-"data.frame"
all.names <- c(sc.names)
all.list <- sapply(all.names, get, USE.NAMES=T)
all.AIC <- dfun(rbind(sc.AIC))
all.AIC <- all.AIC[order(all.AIC$dAIC),]
kable(all.AIC, format.arg=list(digits=3))
| dAIC | df | |
|---|---|---|
| sc.mix.momdadpid | 0.00 | 9 |
| sc.mix.mompid | 6.82 | 8 |
| sc.mix.mompop | 8.77 | 8 |
| sc | 93.52 | 7 |
The best-fiting model is a mixed model with the following components:
Looking at the normal, fixed effects model, we see that the residuals are not normal:
shapiro.test(sc$residuals)#raw residuals!
Shapiro-Wilk normality test
data: sc$residuals
W = 0.98331, p-value = 0.01939
The coefficients estimated for each model agree qualitatively.
sc.log.names <- sc.names
sc.log <- sapply(sc.log.names, get, USE.NAMES=T)
coefplot2(sc.log, legend.x="topright",legend=T,legend.args=list(cex=0.8, xpd=T, inset=c(-0.1,0)), col.pts=sample(gg_color_hue(length(sc.log.names))), spacing=0.05, lwd.2=2, lwd.1=4, intercept=T)
We chose the model with nearly the best (lowest) AIC, to carry out inference tests and parameter estimation.
mod <- sc.mix.momdadpid
print(mod)
Linear mixed model fit by REML ['lmerMod']
Formula: tpf ~ species * crosstype + (1 | mompid) + (1 | dadpid)
Data: pol
REML criterion at convergence: 3593.64
Random effects:
Groups Name Std.Dev.
mompid (Intercept) 493.3
dadpid (Intercept) 1086.4
Residual 2666.2
Number of obs: 197, groups: mompid, 21; dadpid, 18
Fixed Effects:
(Intercept) specieskaal
10765.9 752.4
crosstypewithin crosstypehybrid
-658.5 1235.6
specieskaal:crosstypewithin specieskaal:crosstypehybrid
481.0 -2909.3
Using a likelihood ratio test, with a null hypothesis of zero variance, the random effect (maternal plant) is significant for both model parts:
anova(sc, sc.mix.momdadpid) #double this p-value. or simulate null by permuting data.
By dropping it from the model and performing a likelihood-ratio test, we see that the species x crosstype interaction is not significant.
sxc.chisq <- drop1(mod, test="Chisq") #load from file
dfun(sxc.chisq)
Single term deletions
Model:
tpf ~ species * crosstype + (1 | mompid) + (1 | dadpid)
Df dAIC LRT Pr(Chi)
<none> 0.00000
species:crosstype 2 0.13755 4.1375 0.1263
The model estimated the following parameters, with individual parameter significance determined by the Wald z-test, and fixed effect significance determined by analysis of deviance Wald test.
summary(mod)
Linear mixed model fit by REML ['lmerMod']
Formula: tpf ~ species * crosstype + (1 | mompid) + (1 | dadpid)
Data: pol
REML criterion at convergence: 3593.6
Scaled residuals:
Min 1Q Median 3Q Max
-3.2808 -0.5761 -0.0379 0.5410 2.8167
Random effects:
Groups Name Variance Std.Dev.
mompid (Intercept) 243387 493.3
dadpid (Intercept) 1180340 1086.4
Residual 7108792 2666.2
Number of obs: 197, groups: mompid, 21; dadpid, 18
Fixed effects:
Estimate Std. Error t value
(Intercept) 10765.9 779.6 13.810
specieskaal 752.4 989.2 0.761
crosstypewithin -658.5 1144.6 -0.575
crosstypehybrid 1235.6 903.2 1.368
specieskaal:crosstypewithin 481.0 1392.0 0.346
specieskaal:crosstypehybrid -2909.3 1425.0 -2.042
Correlation of Fixed Effects:
(Intr) spcskl crsstypw crsstyph spcskl:crsstypw
specieskaal -0.782
crsstypwthn -0.436 0.346
crsstyphybr -0.798 0.764 0.374
spcskl:crsstypw 0.358 -0.458 -0.823 -0.326
spcskl:crsstyph 0.635 -0.831 -0.236 -0.835 0.325
Anova(mod, type=3)
Analysis of Deviance Table (Type III Wald chisquare tests)
Response: tpf
Chisq Df Pr(>Chisq)
(Intercept) 190.7107 1 < 2e-16 ***
species 0.5786 1 0.44687
crosstype 3.2457 2 0.19733
species:crosstype 5.3045 2 0.07049 .
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
These are box and QQ (to check normality) plots of the estimated random effect of each maternal plant.
predre <- setNames(data.frame(RE=ranef(mod)$mompid,SD=ranef(mod, sd=T)$`1`),c("RE","SD"))
ggplot(predre, aes(x = rownames(predre),y=RE)) +
geom_point(size = 2) + coord_flip()+
geom_errorbar(aes(ymin = RE-SD, ymax = RE+SD)) + labs(x="Maternal plants", y="Predicted random effects")
#Count
reStack <- ldply(ranef(mod))
print( qqmath( ~`(Intercept)`|.id, data=reStack, scales=list(relation="free"),
prepanel = prepanel.qqmathline,
panel = function(x, ...) {
panel.qqmathline(x, ...)
panel.qqmath(x, ...)
},
layout=c(1,1)))
The least square means procedure can generate predictor estimates of each type, and give their significance groupings with a post-hoc Tukey test. S. hookeri-produced hybrids produce less Viability than either crosses between or within S. hookeri populations. The other differences are not significant, but remember that the fixed effect of hybrid (vs. between) was significant (model summary).
#Count
rg <- ref.grid(mod)
Loading required namespace: lmerTest
#summary(rg)
sxc.lsm <- lsmeans(rg, ~ crosstype*species)
plot(sxc.lsm)
cld.mod <- cld(sxc.lsm, Letters=letters) #tukey letterings
cld.mod$response <- cld.mod$lsmean
cld.mod$uSE <- cld.mod$lsmean+cld.mod$SE
cld.mod$lSE <- cld.mod$lsmean-cld.mod$SE
options(digits=4)
cld.mod[rev(order(cld.mod$species, cld.mod$crosstype)),]
crosstype species lsmean SE df lower.CL upper.CL .group response
hybrid kaal 9845 622.8 21.80 8552 11137 a 9845
within kaal 11341 725.5 60.94 9890 12792 a 11341
between kaal 11518 616.3 36.84 10269 12767 a 11518
hybrid hook 12002 547.9 20.50 10860 13143 a 12002
within hook 10107 1067.9 75.02 7980 12235 a 10107
between hook 10766 779.6 36.06 9185 12347 a 10766
uSE lSE
10467 9222
12066 10615
12135 10902
12549 11454
11175 9039
11545 9986
Degrees-of-freedom method: satterthwaite
Confidence level used: 0.95
P value adjustment: tukey method for comparing a family of 6 estimates
significance level used: alpha = 0.05
H.wb <- with(cld.mod[cld.mod$species=="hook",], response[crosstype=="within"]/response[crosstype=="between"] - 1)
K.wb <- with(cld.mod[cld.mod$species=="kaal",], response[crosstype=="within"]/response[crosstype=="between"] - 1)
K.H <- with(cld.mod[cld.mod$crosstype=="between",], response[species=="kaal"]/response[species=="hook"] - 1)
maxsp <- ifelse(K.H>0, "kaal","hook")
minsp <- ifelse(K.H<0, "kaal","hook")
maxresp <- with(cld.mod, response[species==maxsp & crosstype=="between"])
minresp <- with(cld.mod, response[species==minsp & crosstype=="between"])
HK.resp <- with(cld.mod, response[species=="hook" & crosstype=="hybrid"])
KH.resp <- with(cld.mod, response[species=="kaal" & crosstype=="hybrid"])
HK.int <- with(cld.mod, ifelse(HK.resp > minresp & HK.resp < maxresp, (HK.resp-minresp)/(maxresp-minresp),
ifelse(HK.resp < minresp, HK.resp/minresp-1, HK.resp/maxresp-1)))
KH.int <- with(cld.mod, ifelse(KH.resp > minresp & KH.resp < maxresp, (KH.resp-minresp)/(maxresp-minresp),
ifelse(KH.resp < minresp, KH.resp/minresp-1, KH.resp/maxresp-1)))
intermed <- (minresp + maxresp) / 2
round(c(H.wb,K.wb,K.H,HK.int,KH.int),2)
[1] -0.06 -0.02 0.07 0.04 -0.09
ggplot(as.data.frame(cld.mod), aes(y=response, x=relevel(crosstype, "within"), fill=species)) +
geom_col(position=position_dodge2()) +
geom_linerange(aes(ymin=lSE, ymax=uSE), position=position_dodge(0.9)) +
labs(x="", y="Pollen grains per anther",fill="Maternal plant") +
scale_fill_manual(labels = c("S. hookeri ", "S. kaalae "), values=c("grey70","grey30")) +
scale_x_discrete(labels = c("Intrapopulation", "Interpopulation", "Hybrid")) +
geom_text(aes(label=.group), position=position_dodge(0.9), hjust=0, vjust=-1) +
scale_y_continuous(expand = expand_scale(mult=c(0,0.05)), breaks = scales::pretty_breaks(n = 5)) +
theme_classic() + theme(legend.text=element_text(face="italic", size=rel(1)), legend.position="bottom", axis.text = element_text(colour="black", size=rel(1)), text=element_text(size=13), axis.ticks.x = element_blank()) + geom_segment(aes(x=2.5, y=intermed, xend=3.5, yend=intermed))