This is the code used to analyse the data in Parasites as biological tags for stock discrimination of beaked redfish (Sebastes mentella): parasite infra-communities vs. limited resolution of cytochrome markers by Klapper et al. The data is on Figshare.
First we have to set some things up: libraries, and a change to one function, that will be used later.
library(gdata)
library(parallel)
library(boral)
library(RColorBrewer)
library(MASS)
# Function to and 'and' in the right place in the text
pasteand <- function(vec) {
v1 <- paste(vec[-length(vec)], collapse=", ")
paste(v1, vec[length(vec)], sep=", and ")
}
# replacement to lvsplot in boral: main difference is that colours an be specified
lvsplot2 <- function (x, jitter = FALSE, a = 1, biplot = TRUE, ind.spp = NULL, which=c(1,2),
alpha = 0.5, main = NULL, est = "median",cols.lvs="black", cols.coefs="red", ...) {
if (x$num.lv == 0)
stop("No latent variables to plot.")
if (x$num.lv > 2 & length(which)!=2)
stop("which should be of length 2.")
if (x$num.lv > 2 & any(which>x$num.lv))
stop("Fewer latent varibales than chosen by which")
n <- nrow(x$lv.median)
p <- nrow(x$lv.coefs.median)
if (!is.null(ind.spp)) {
if (ind.spp > p) {
ind.spp <- p
}
}
if (biplot == TRUE & !is.null(ind.spp)) {
message("Only the first", ind.spp, "`most important' latent variable coefficients included in biplot")
}
if (biplot == TRUE & is.null(ind.spp)) {
ind.spp <- p
if(x$num.lv == 2) message("All latent variable coefficients included in biplot")
if(x$num.lv > 2) message(paste0("Latent variable coefficients ", which[1], " and ", which[2], " included in biplot"))
}
par(cex = a, cex.axis = a, cex.lab = a + 0.5, mar = c(5,
5, 3, 1), mfrow = c(1, 1), las = 1, cex.main = a + 0.5,
...)
if (x$num.lv == 1) {
choose.x <- x$lv.median
main <- "Plot of latent variable posterior medians"
if (est == "mean") {
choose.x <- x$lv.mean
main <- "Plot of latent variable posterior means"
}
plot(1:n, choose.x, xlab = "Row index", ylab = "Latent variable 1",
main = main, cex = 1.2 * a, type = "n", ...)
text(x = 1:n, y = x$lv.median, label = 1:n, cex = 1.2 *
a)
}
if (x$num.lv >= 2) {
# No point in calculating this if est=="mean"
# testcov <- x$lv.median[, which] %*% t(x$lv.coefs.median[, 1+which])
if (est == "mean") {
testcov <- x$lv.mean[, which] %*% t(x$lv.coefs.mean[, 1+which])
} else {
testcov <- x$lv.median[, which] %*% t(x$lv.coefs.median[, 1+which])
}
do.svd <- svd(testcov, length(which), length(which)) # replaced x$num.lv by length(which)
choose.lvs <- scale(do.svd$u * matrix(do.svd$d[1:length(which)]^alpha,
nrow = x$n, ncol = 2, byrow = T), center = T, scale = F)
choose.lv.coefs <- scale(do.svd$v * matrix(do.svd$d[1:length(which)]^(1 -
alpha), nrow = x$p, ncol = 2, byrow = T), center = T,
scale = F)
if (!biplot) {
if (is.null(main) & est == "median") {
main = "Plot of latent variable posterior medians"
}
if (is.null(main) & est == "mean") {
main = "Plot of latent variable posterior means"
}
plot(choose.lvs, xlab = "Latent variable 1", ylab = "Latent variable 2",
main = main, cex = 1.2 * a, type = "n", ...)
if (!jitter)
text(choose.lvs, label = 1:n, cex = 1.2 * a)
if (jitter)
text(jitter(choose.lvs[, 1]), jitter(choose.lvs[,
2]), label = 1:n, cex = 1.2 * a)
}
if (biplot) {
if (is.null(main) & est == "median") {
main = "Biplot of latent variable posterior medians"
}
if (is.null(main) & est == "mean") {
main = "Biplot of latent variable posterior means"
}
largest.lnorms <- order(rowSums(choose.lv.coefs^2),
decreasing = TRUE)[1:ind.spp]
plot(rbind(choose.lvs, choose.lv.coefs), xlab = paste0("Latent variable ", which[1]),
ylab = paste0("Latent variable ", which[2]), main = main, cex = a,
type = "n", xlim = 1.1 * range(rbind(choose.lvs,
choose.lv.coefs)[, 1]), ylim = 1.1 * range(rbind(choose.lvs,
choose.lv.coefs)[, 2]))
if (!jitter)
text(choose.lvs, label = 1:n, cex = 1.2 * a, col=cols.lvs)
if (jitter)
text(jitter(choose.lvs[, 1]), jitter(choose.lvs[, 2]),
label = 1:n, cex = 1.2 * a, col=cols.lvs)
text(choose.lv.coefs[largest.lnorms, ],
label = rownames(x$lv.coefs.mean[largest.lnorms, ]), col = cols.coefs, cex = 0.9 * a)
}
}
par(mfrow = c(1, 1), las = 0, cex = 1, cex.axis = 1, cex.lab = 1,
ask = FALSE, cex.main = 1.2, mar = c(5.1, 4.1, 4.1, 2.1))
}
First we read in the data and then change it a bit, and add a few variables.
url <- "https://ndownloader.figshare.com/files/4938157" # Hopefully this is stable!
Data <- read.csv(url, stringsAsFactors = FALSE)
Data$Origin <- gsub("rI", "r I", Data$Origin) # BearIsland -> Bear Island
Data$Origin <- factor(Data$Origin)
Data$Greenland=(Data$Origin=="Greenland")
Data$Tampen=(Data$Origin=="Tampen")
Data$GenderF=(Data$gender=="f")
# Scale covariates
Data$TL.sc <- scale(Data$TL)
Data$TW.sc <- scale(Data$TW)
Data$SL.sc <- scale(Data$SL)
Data$FultonK.sc <- scale(Data$FultonK)
names(Data) <- gsub("_", "\n", names(Data))
# Get variable names for covariates and responses
NotUse <- c("Origin", "Fish", "TW", "TL", "FultonK")
Covars <- c("Greenland", "Tampen", "TL.sc", "FultonK.sc")
Species <- names(Data)[grep("\n", names(Data), fixed=TRUE)]
SpLabs <- gsub("\n", " ", Species)
As a preliminary, we check to see if there is a correlation between total length and condition. There is no evidence for one (the correlation is 0.1), but there seem to be two small fish.
Col <- brewer.pal(3, "Dark2")[as.numeric(factor(Data$gender))]
par(mfrow=c(1,1), mar=c(4.1,4.1,1,1), oma=rep(0,4))
plot(Data$FultonK, Data$TL, xlab="Fulton's K", ylab="Total Length, cm", col=Col, pch=16)
legend(1.4,33, levels(factor(Data$gender)), fill=brewer.pal(3, "Dark2"))
Fig. 1: Plot of Fulton’s K against total fish length
We can also plot the means of the different covariates. Although the data are mixed, Tampen fish may be slightly larger, and Bear Island fish a bit smaller. But there is plenty of overlap.
Data$OriginGender <- paste(Data$Origin, Data$gender, sep="_")
SizeData <- data.frame(
TL=tapply(Data$TL, list(Data$OriginGender), mean, na.rm=TRUE),
TL.sd=tapply(Data$TL, list(Data$OriginGender), sd, na.rm=TRUE),
TW=tapply(Data$TW, list(Data$OriginGender), mean, na.rm=TRUE),
TW.sd=tapply(Data$TW, list(Data$OriginGender), sd, na.rm=TRUE),
SL=tapply(Data$SL, list(Data$OriginGender), mean, na.rm=TRUE),
SL.sd=tapply(Data$SL, list(Data$OriginGender), sd, na.rm=TRUE),
N=tapply(Data$TW, list(Data$OriginGender), length)
)
SizeData$Origin <- gsub("_[xmf]", "",rownames(SizeData))
SizeData$Gender <- gsub("[ a-zA-Z]*_", "",rownames(SizeData))
SizeData$At=as.numeric(factor(SizeData$Origin))*3+as.numeric(factor(SizeData$Gender))-1.5
SizeData$Col=brewer.pal(4, "Dark2")[as.numeric(factor(SizeData$Gender))]
SizeData$TL.LCI=SizeData$TL-SizeData$TL.sd; SizeData$TL.UCI=SizeData$TL+SizeData$TL.sd;
SizeData$SL.LCI=SizeData$SL-SizeData$SL.sd; SizeData$SL.UCI=SizeData$SL+SizeData$SL.sd;
SizeData$TW.LCI=SizeData$TW-SizeData$TW.sd; SizeData$TW.UCI=SizeData$TW+SizeData$TW.sd;
SizeData <- SizeData[!SizeData$Gender=="x",] # Only use male & female
par(mfrow=c(1,2), mar=c(4,1,1,1), oma=c(2,5,0,0))
plot(SizeData$TL, SizeData$At, col=SizeData$Col,
xlim=c(min(SizeData$TL.LCI, na.rm=TRUE), max(SizeData$TL.UCI, na.rm=TRUE)), yaxt="n", ann=FALSE)
segments(SizeData$TL.LCI, SizeData$At, SizeData$TL.UCI, SizeData$At, lwd=1.5, col=SizeData$Col)
mtext("Total length, cm" , 1, line=3)
axis(2, unique(factor(SizeData$Origin)), at=as.numeric(unique(factor(SizeData$Origin)))*3, las=1)
plot(SizeData$TW, SizeData$At, col=SizeData$Col,
xlim=c(min(SizeData$TW.LCI, na.rm=TRUE), max(SizeData$TW.UCI, na.rm=TRUE)), yaxt="n", ann=FALSE)
segments(SizeData$TW.LCI, SizeData$At, SizeData$TW.UCI, SizeData$At, lwd=1.5, col=SizeData$Col)
legend(920, 6, c("Male", "Female"), fill=brewer.pal(4, "Dark2")[2:1])
mtext("Total Weight, g" , 1, line=3)
Fig. 2: Differences in Covariates, by population. Means and +/- 1 standard deviation.
We can look at the effects of the covariates on the total abundance. First we fit a GLM assuming a negative binomial repsonse.
Data$TotAbund <- apply(Data[,Species],1,sum)
Abund <- glm.nb(TotAbund ~ TL.sc+FultonK.sc+gender+Origin, data=Data) #, family="poisson")
Summ <- cbind(summary(Abund)$coefficients[-1,], confint(Abund)[-1,])
The coefficients are plotted below. The only large effect is that abundance is lower in fish from Greenland (ignoring the Unknown Gender effect, which is only based on 2 fish, and is poorly estimated): this is clearly larger than any other effect: the next largest effect is of being Male, but this is 4.7 times smaller.
VarNames <- c("Total\nLength", "Fulton's K", "Male", "Unknown\nGender", "Greenland", "Tampen")
par(mfrow=c(1,1), mar=c(4.1,5,1,1))
plot(Summ[,"Estimate"], 1:nrow(Summ), xlim=c(min(Summ[,"2.5 %"]), max(Summ[,"97.5 %"])),
yaxt="n", ann=FALSE)
segments(Summ[,"2.5 %"], 1:nrow(Summ), Summ[,"97.5 %"], 1:nrow(Summ))
mtext("Estimated Effect Size", 1, line=2.4)
axis(2, VarNames, at=1:nrow(Summ), las=1)
abline(v=0, lty=3)
Fig. 3: Estimated Coefficients for effects of covariates on total abundance of parasites. Note that Total Length and Fulton’s K are standardised, so the coefficient represents the change in log abundance when the covariate (e.g. total length) changes by one standard deviation
We use boral (F. K. Hui 2015) to do a parametric ordination (see also Warton et al. (2015)). This fits a GLM to each species, but then adds some terms to model the correlations between species, which are the same as a constrained ordination. We can then plot these correlations as an ordination plot. I have Quick as a variable for testing: set it to true if you just want to check that the code works.
Quick <- FALSE
if(Quick) {
nBurn=1e1; nIter=1e2; nThin=1
} else {
nBurn=1e3; nIter=1e5; nThin=5e2
}
# Without, then with, covariates
bor0 <- boral(y=Data[,Species], X=NULL, family="negative.binomial", n.burnin = nBurn,
n.iteration = nIter,n.thin=nThin, hypparams = c(10,10,10,10), num.lv=2)
bor <- boral(y=Data[,Species], X=Data[,Covars], family="negative.binomial",
n.burnin = nBurn, n.iteration = nIter, n.thin=nThin,
hypparams = c(10,10,10,10), num.lv=2, ssvs.index = 0)
# Calculate Bayes factors and effect sizes
BigEffs <- bor$ssvs.indcoefs.mean[,"Greenland"]>(10/11)
GreenlandPostPs <- bor$ssvs.indcoefs.mean[BigEffs, "Greenland"]
GreenlandBFs <- GreenlandPostPs/(1-GreenlandPostPs)
GreenlandSizes <- bor$X.coefs.median[BigEffs,"Greenland"]
# save(bor, bor0, Data, file="Posts.RData")
First, we can plot the estimated effects of the covariates. The locations are contrasted to Bear Island (if another location is better as a baseline, tell me). The darker the estimate, the higher the evidence for an effect. We can see a few effects: in particular, the main effects are differences between Bear Island and Greenland: Chondracanthus nodosus is only found in Greenland fish (where it was found in 9 out of 39 fish), and Anisakis simplex, Hysterothylacium aduncum, and Bothriocephalus scorpii are all more prevalent in Bear Island fish: in particular B. scorpii was not found is Greenland fish but was found in about 24% of fish from Tampen and Bear Island. No other effects are worth mentioning (i.e. their Bayes Factor are all less than 3).
# Function to plot the coefficients
PlotCoefs <- function(name, obj, labely=TRUE, use.ssvs=TRUE, ...) {
if(use.ssvs & exists("ssvs.indcoefs.mean", where=obj))
Col=grey(2-2*pmax(0.6,obj$ssvs.indcoefs.mean[,name]))
At.y <- 1:nrow(obj$X.coefs.median)
plot(obj$X.coefs.median[,name], At.y, yaxt="n", col=Col,
xlim=c(min(obj$hpdintervals$X.coefs.lower[,name]),
max(obj$hpdintervals$X.coefs.upper[,name])),...)
segments(obj$hpdintervals$X.coefs.lower[,name], At.y,
obj$hpdintervals$X.coefs.upper[,name], At.y, col=Col)
abline(v=0, lty=3)
if(length(labely)==nrow(obj$X.coefs.median)) {
axis(2, at=At.y, labels=labely, las=1)
} else {
if(labely) axis(2, at=At.y, labels=rownames(obj$X.coefs.median), las=1)
}
}
par(mfrow=c(2,2), mar=c(2,1,3,1), oma=c(2,12,0,0))
PlotCoefs(name = "Greenland", obj = bor, labely=SpLabs, use.ssvs=TRUE,
main="Greenland", xlab="", ylab="")
PlotCoefs(name = "Tampen", obj = bor, labely=FALSE, use.ssvs=TRUE,
main="Tampen", xlab="", ylab="")
PlotCoefs(name = "TL.sc", obj = bor, labely=SpLabs, use.ssvs=TRUE,
main="Total Length", xlab="", ylab="")
PlotCoefs(name = "FultonK.sc", obj = bor, labely=FALSE, use.ssvs=TRUE,
main="Fulton's K", xlab="", ylab="")
mtext("Coefficient", 1, outer=TRUE, line=1)
Fig. 4: Posterior estimates of effects of covariates on the abundance of parasites. Points: posterior mean, Lines: 95% Highest Posterior Density Interval.
We can look at the ordination below, first with no covariates, so we can see the effects of population
Fill <- brewer.pal(3, "Paired"); Fill[2] <- "red" # 2 = Greenland
Cols <- Fill[as.numeric(Data$Origin)]
par(mar=c(2,1,3,1), oma=c(2,2,0,0))
lvsplot2(bor0, alpha=0.6, main="", cols.lvs = Cols, cols.coefs = "grey60", a=0.6)
legend(1,0.8, levels(Data$Origin), fill=Fill, cex=0.7)
Fig. 5: Ordination of data (i.e. with no covariates).
The pattern is Scolex pleuronectis sitting out from the other species: it is only present in fish 18, 29, 30, 106, and 108, where it has a mean abundance of 16.4 individuals, so this pulls these fish away from the rest.
par(mar=c(2,1,3,1), oma=c(2,2,0,0))
lvsplot2(bor, alpha=0.6, main="", cols.lvs = Cols, cols.coefs = "grey60", a=0.6)
legend(1,0.8, levels(Data$Origin), fill=Fill, cex=0.6)
Fig. 6: Ordination of residual variation (i.e. after accounting for covariates)
We can use the presence of individual parasite species to classify the fish into population. If the proportion of fish in population \(s\) with a parasite is is \(p_s\), and the number of fish in population \(S\) is \(N_S\) then the probability that a fish with a parasite comes from population \(s\) is \(Pr(S=s|X)\) which is
\[ Pr(S=s|X) = \frac{Pr(X|S=s)Pr(S=s)}{\sum_t Pr(X|S=t)Pr(S=t)} \]
where \(Pr(X|S=s)\) is simply the proportion of infected fish in population \(s\) and \(Pr(S=s)\) is the probability that a randon fish comes from population \(s\), i.e. \(Pr(S=s) = {N_s}/{\sum_t N_t}\). If we assume equal probabilities (for simplicity), \(Pr(S=s)\) cancels out, so we can calculate this simply from the proportions of infected fish in each population.
UseSp <- names(GreenlandSizes)
GetProp <- function(dat, lst) tapply(dat, lst, function(v) mean(v>0))
Props <- t(apply(Data[,UseSp], 2, GetProp, lst=list(Data$Origin)))
rownames(Props) <- gsub("\n"," ",rownames(Props))
Props <- sweep(Props, 1, apply(Props,1,sum), "/")
knitr::kable(Props, digits=2)
| Bear Island | Greenland | Tampen | |
|---|---|---|---|
| Anisakis simplex | 0.42 | 0.20 | 0.38 |
| Hysterothylacium aduncum | 0.54 | 0.17 | 0.29 |
| Bothriocephalus scorpii | 0.52 | 0.00 | 0.48 |
| Chondracanthus nodosus | 0.00 | 1.00 | 0.00 |
We can see, for example, that a fish with Chondracanthus nodosus, for example, must be from Greenland whereas one with Bothriocephalus scorpii cannot be. We also see that presence of Anisakis simplex is a poor biomarker, on the other hand this means that as it is present in 77% of fish overall, and is the most frequent, any genetic differentiation will be more useful.
Hui, Francis K.C. 2015. “boral - Bayesian Ordination and Regression Analysis of Multivariate Abundance Data in R.” Methods in Ecology and Evolution, November, n/a–/a. doi:10.1111/2041-210X.12514.
Warton, David I, F Guillaume Blanchet, Robert B O Hara, Otso Ovaskainen, Sara Taskinen, Steven C Walker, and Francis K C Hui. 2015. “So many variables: joint modeling in community ecology.” Trends in Ecology & Evolution xx (12). Elsevier Ltd: 1–14. doi:10.1016/j.tree.2015.09.007.