Data used here are from our experiments on Providencia rettgeri injected in Drosophila melanogaster. The aim of the experiment is to compare the Pathogen Load Upon Death (PLUD) among several mutant lines of D. melanogaster.
The data consists in CFU counts taken from various dilutions of individual flies homogeneized just after they died from a P. rettgeri infection. Each fly has a unique sample_ID and PLUD must first be estimated on individual flies, before different mutants are compared.
We load data and first multiply the dilution factor by 250, because each fly is homogeneized in 250 microliters of PBS. This way, the estimates will corespond to the number of bacterial cell per infected fly.
d <- read.table("data_CFU_PLUD_microRNA.csv",header=TRUE,sep=";")
d$Dilution <- d$Dilution*250
head(d)## X sample_ID Replicate_plating Volume Dilution Bacteria microRNA Sex
## 1 1 A1_Plate66 1 5 2500 Inf Mir-955 Male
## 2 2 A2_Plate66 1 5 2500 Inf Mir-955 Female
## 3 3 A3_Plate66 1 5 2500 Inf Mir-955 Female
## 4 4 A4_Plate66 1 5 2500 Inf Mir-955 Male
## 5 5 A5_Plate66 1 5 2500 Inf Mir-955 Female
## 6 6 A6_Plate66 1 5 2500 Inf Mir-955 Female
## Treatment
## 1 Balancer
## 2 Balancer
## 3 Balancer
## 4 Sponge
## 5 Balancer
## 6 Balancer
## extract the list of lines/sex/treatment
dlines <- unique(d[,c("sample_ID","microRNA","Sex","Treatment")])
rownames(dlines) <- dlines$sample_ID
head(dlines)## sample_ID microRNA Sex Treatment
## A1_Plate66 A1_Plate66 Mir-955 Male Balancer
## A2_Plate66 A2_Plate66 Mir-955 Female Balancer
## A3_Plate66 A3_Plate66 Mir-955 Female Balancer
## A4_Plate66 A4_Plate66 Mir-955 Male Sponge
## A5_Plate66 A5_Plate66 Mir-955 Female Balancer
## A6_Plate66 A6_Plate66 Mir-955 Female Balancer
with(subset(d,!is.infinite(Bacteria)),tapply(Bacteria,Dilution,max))## 2500 250000 1250000 6250000 31250000 156250000 312500000
## 68 29 88 102 81 49 28
with(d,tapply(is.infinite(Bacteria),Dilution,sum))## 2500 250000 1250000 6250000 31250000 156250000 312500000
## 419 412 781 742 165 5 0
hist(d$Bacteria)The number of CFU in one droplet goes up to 100.
We will test two cases :
maxCFU=50, so that all counts above this value are considered infinite,
maxCFU=100, so that most counts (only one is above 100) will be anayzed. The infinite values will be those already set as such in the dataset.
Let us first analyze one sample…
library(CFUfit)
## estimation for the first sample in the list
## compare the three available methods and two values of maxCFU
dtmp <- subset(d,sample_ID=="A1_Plate66")
l <- with(dtmp,!is.infinite(Bacteria))
res <- data.frame()
res <- rbind(res,with(dtmp,loadEstimate(Bacteria[l],1/Dilution[l],Volume[l],maxCFU = NA,logbase=10)))
res <- rbind(res,with(dtmp,loadEstimate(Bacteria,1/Dilution,Volume,maxCFU = 50, ignoreAboveMaxCFU = TRUE,logbase=10)))
res <- rbind(res,with(dtmp,loadEstimate(Bacteria,1/Dilution,Volume,maxCFU = 100, ignoreAboveMaxCFU = TRUE,logbase=10)))
res <- rbind(res,with(dtmp,loadEstimate(Bacteria,1/Dilution,Volume,maxCFU = 50, ignoreAboveMaxCFU = FALSE,logbase=10)))
res <- rbind(res,with(dtmp,loadEstimate(Bacteria,1/Dilution,Volume,maxCFU = 100, ignoreAboveMaxCFU = FALSE,logbase=10)))
knitr::kable(head(res));| logload | se.logload | load | se.load | code | sumCFU | maxCFU | ignoreAboveMaxCFU |
|---|---|---|---|---|---|---|---|
| 8.375653 | 0.0406754 | 237494316 | 22243381 | 1 | 114 | NA | TRUE |
| 8.378959 | 0.0426976 | 239308979 | 23527647 | 1 | 114 | 50 | TRUE |
| 8.375664 | 0.0406754 | 237500019 | 22243915 | 1 | 114 | 100 | TRUE |
| 8.444853 | 0.0306519 | 278517710 | 19657383 | 1 | 114 | 50 | FALSE |
| 8.600842 | 0.0245995 | 398880017 | 22593546 | 1 | 114 | 100 | FALSE |
Note that in this situation the sum of CFU counts does not change with maxCFU, because no counts fall inbetween 50 and 100. Increasing maxCFU from 50 to 100 has nevertheless an effect on the estimated load, particularly in the Binomial-Poisson mixture model.
To facilitate the interpretation, load estimations are re-scaled as log10 values.
# Truncated Poisson model
est_TP_50 <- with(d,loadEstimate(Bacteria,1/Dilution,Volume,sample_ID,maxCFU = 50, ignoreAboveMaxCFU = TRUE,logbase=10))
colnames(est_TP_50)[-1] <- paste0(colnames(est_TP_50)[-1],"_TP_50")
est_TP_100 <- with(d,loadEstimate(Bacteria,1/Dilution,Volume,sample_ID,maxCFU = 100, ignoreAboveMaxCFU = TRUE,logbase=10))
colnames(est_TP_100)[-1] <- paste0(colnames(est_TP_100)[-1],"_TP_100")
# Binomial-Poisson mixture
est_BP_50 <- with(d,loadEstimate(Bacteria,1/Dilution,Volume,sample_ID,maxCFU = 50, ignoreAboveMaxCFU = FALSE,logbase=10))
colnames(est_BP_50)[-1] <- paste0(colnames(est_BP_50)[-1],"_BP_50")
est_BP_100 <- with(d,loadEstimate(Bacteria,1/Dilution,Volume,sample_ID,maxCFU = 100, ignoreAboveMaxCFU = FALSE,logbase=10))
colnames(est_BP_100)[-1] <- paste0(colnames(est_BP_100)[-1],"_BP_100")
dlines <- cbind(dlines,est_TP_50[,-1],est_TP_100[,-1],est_BP_50[,-1],est_BP_100[,-1])#knitr::kable();
datatable(dlines, rownames = FALSE, filter="top", options = list(pageLength = 5, scrollX=T) )library(beanplot)
color <- list(adjustcolor("blue",alpha.f = 0.5),adjustcolor("red",alpha.f = 0.5))
for(rna in unique(dlines$microRNA)) {
layout(matrix(1:2,ncol=2))
for(sex in unique(dlines$Sex)) {
dtmp <- subset(dlines,microRNA==rna & Sex==sex)
beanplot(logload_TP_50~Treatment,data=dtmp,what = c(F,T,T,F),col = color,main=paste(rna,sex),log = "",ylab="log10 bacteria per fly")
points(logload_TP_50~jitter(as.numeric(Treatment)),data=dtmp,pch=21,bg="white",cex=0.75)
}
}library(beanplot)
color <- list(adjustcolor("blue",alpha.f = 0.5),adjustcolor("red",alpha.f = 0.5))
for(rna in unique(dlines$microRNA)) {
layout(matrix(1:2,ncol=2))
for(sex in unique(dlines$Sex)) {
dtmp <- subset(dlines,microRNA==rna & Sex==sex)
beanplot(logload_BP_100~Treatment,data=dtmp,what = c(F,T,T,F),col = color,main=paste(rna,sex),log="",ylab="log10 bacteria per fly")
points(logload_BP_100~jitter(as.numeric(Treatment)),data=dtmp,pch=21,bg="white",cex=0.75)
}
}lmAnalysis of Binomial-Poisson estimates using a linear model. The logload is analyzed here.
m00 <- glm(logload_BP_100~Treatment*microRNA,
data=subset(dlines,Sex=="Female"))
coef(m00)## (Intercept) TreatmentSponge
## 7.91433247 0.03389514
## microRNAMir-11 microRNAMir-955
## 0.10963002 0.05572384
## microRNAMir-966 TreatmentSponge:microRNAMir-11
## 0.05743920 -0.41161925
## TreatmentSponge:microRNAMir-955 TreatmentSponge:microRNAMir-966
## 0.15946578 -0.04173410
layout(matrix(1:4,ncol=2))
plot(m00)We now use the inverse variance in estimation as weights.
m0 <- glm(logload_BP_100~Treatment*microRNA,
weight=(1/se.logload_BP_100^2),
data=subset(dlines,Sex=="Female"))
coef(m0)## (Intercept) TreatmentSponge
## 8.00147581 0.07256144
## microRNAMir-11 microRNAMir-955
## 0.04845881 0.02235766
## microRNAMir-966 TreatmentSponge:microRNAMir-11
## 0.03780731 0.01507777
## TreatmentSponge:microRNAMir-955 TreatmentSponge:microRNAMir-966
## 0.02717007 -0.13701202
layout(matrix(1:4,ncol=2)); plot(m0)shapiro.test(residuals(m0))##
## Shapiro-Wilk normality test
##
## data: residuals(m0)
## W = 0.9647, p-value = 1.131e-05
This does improve the fit, by quite a lot! But residuals still depart from Gaussian distribution.
glmAnalysis of Binomial-Poisson estimates using a Gamma glm. The actual load is analyzed with inverse estimation variance as initial weights.
m1 <- glm(load_BP_100~Treatment*microRNA,
weight=(1/se.load_BP_100^2),
family=Gamma(link="log"),
data=subset(dlines,!is.na(logload_BP_100) & Sex=="Female"))
summary(m1)##
## Call:
## glm(formula = load_BP_100 ~ Treatment * microRNA, family = Gamma(link = "log"),
## data = subset(dlines, !is.na(logload_BP_100) & Sex == "Female"),
## weights = (1/se.load_BP_100^2))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.138e-05 1.116e-07 1.880e-07 3.553e-07 2.539e-05
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 17.268789 13.821960 1.249 0.213
## TreatmentSponge -1.153346 34.605962 -0.033 0.973
## microRNAMir-11 -0.006323 74.149996 0.000 1.000
## microRNAMir-955 0.139041 74.959992 0.002 0.999
## microRNAMir-966 -6.759852 13.838023 -0.488 0.626
## TreatmentSponge:microRNAMir-11 -3.031335 82.198318 -0.037 0.971
## TreatmentSponge:microRNAMir-955 1.294928 165.889433 0.008 0.994
## TreatmentSponge:microRNAMir-966 8.494143 177.207928 0.048 0.962
##
## (Dispersion parameter for Gamma family taken to be 2.583978e-08)
##
## Null deviance: 4.0991e-08 on 240 degrees of freedom
## Residual deviance: 6.5054e-09 on 233 degrees of freedom
## AIC: 18
##
## Number of Fisher Scoring iterations: 2
The logLik is ridiculously small, which is probably a consequence of very large se… I can’t even simulate from this model to test its goodness of fit. Overall, the model is probably useless!
We shall now turn to an analysis of the log estimates (still with Binomial-Poisson model and maxCFU=100)
m1log <- glm(logload_BP_100~Treatment*microRNA,
weight=(1/se.logload_BP_100^2),
family=Gamma(link="identity"),
data=subset(dlines,!is.na(logload_BP_100) & Sex=="Female"))
summary(m1log)##
## Call:
## glm(formula = logload_BP_100 ~ Treatment * microRNA, family = Gamma(link = "identity"),
## data = subset(dlines, !is.na(logload_BP_100) & Sex == "Female"),
## weights = (1/se.logload_BP_100^2))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -10.1122 -1.3018 -0.4944 0.8728 6.5642
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.00148 0.05830 137.256 <2e-16 ***
## TreatmentSponge 0.07256 0.09751 0.744 0.458
## microRNAMir-11 0.04846 0.09345 0.519 0.605
## microRNAMir-955 0.02236 0.08728 0.256 0.798
## microRNAMir-966 0.03781 0.09610 0.393 0.694
## TreatmentSponge:microRNAMir-11 0.01508 0.22289 0.068 0.946
## TreatmentSponge:microRNAMir-955 0.02717 0.15797 0.172 0.864
## TreatmentSponge:microRNAMir-966 -0.13701 0.16815 -0.815 0.416
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Gamma family taken to be 3.904161)
##
## Null deviance: 945.54 on 240 degrees of freedom
## Residual deviance: 937.99 on 233 degrees of freedom
## AIC: 373986
##
## Number of Fisher Scoring iterations: 3
# testing goodness of fit using simulations
simdev <- rep(NA,100)
for(i in 1:length(simdev)) {
y <- simulate(m1log)
msim <- update(m1log, y[,1] ~ .)
simdev[i] <- msim$deviance
}
hist(simdev); abline(v=m1log$deviance,col="red")This is much better behaved! Simulations show that the residual deviance falls in the range of expected values under the null that the model is correct.
anova(m1log,test="F")## Analysis of Deviance Table
##
## Model: Gamma, link: identity
##
## Response: logload_BP_100
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev F Pr(>F)
## NULL 240 945.54
## Treatment 1 2.0989 239 943.44 0.5376 0.4642
## microRNA 3 1.7779 236 941.66 0.1518 0.9285
## Treatment:microRNA 3 3.6757 233 937.99 0.3138 0.8154
We first assume that CFU count is Poisson and that random differences among flies are Gaussian. Note that dilution and volume are accounted for by the offset.
library(spaMM)## Registered S3 methods overwritten by 'registry':
## method from
## print.registry_field proxy
## print.registry_entry proxy
## spaMM (Rousset & Ferdy, 2014, version 3.8.0) is loaded.
## Type 'help(spaMM)' for a short introduction,
## 'news(package='spaMM')' for news,
## and 'citation('spaMM')' for proper citation.
m2 <- fitme(Bacteria~Treatment*microRNA+(1|sample_ID)+offset(log(Volume)-log(Dilution)),
family=poisson(link="log"),
data=subset(d,Sex=="Female" & is.finite(Bacteria) & !is.na(Bacteria)))
m2$fixef/log(10)## (Intercept) TreatmentSponge
## 7.72916962 0.26210067
## microRNAMir-11 microRNAMir-955
## -0.23075941 0.65086835
## microRNAMir-966 TreatmentSponge:microRNAMir-11
## 0.28506275 0.06768503
## TreatmentSponge:microRNAMir-955 TreatmentSponge:microRNAMir-966
## 0.11971695 -0.19103136
Same as above, but random differences among flies are now assumed to be Gamma distributed, so that the distribution of CFU is negative binomial.
m3 <- fitme(Bacteria~Treatment*microRNA+(1|sample_ID)+offset(log(Volume)-log(Dilution)),
family=poisson(link="log"),
rand.family=Gamma(link="log"),
data=subset(d,Sex=="Female" & is.finite(Bacteria) & !is.na(Bacteria)))
m3$fixef/log(10)## (Intercept) TreatmentSponge
## 7.88081105 0.22506733
## microRNAMir-11 microRNAMir-955
## -0.16586643 0.67484060
## microRNAMir-966 TreatmentSponge:microRNAMir-11
## 0.23396779 -0.07046731
## TreatmentSponge:microRNAMir-955 TreatmentSponge:microRNAMir-966
## 0.01421428 -0.19041006
A third option with a negative binomial response and random differences among flies assumed to be Gamma distributed. This model yields the highest likelihood but still does not account from truncation of CFU counts…
m4 <- fitme(Bacteria~Treatment*microRNA+(1|sample_ID)+offset(log(Volume)-log(Dilution)),
family=negbin(link="log"),
rand.family=Gamma(link="log"),
data=subset(d,Sex=="Female" & is.finite(Bacteria) & !is.na(Bacteria)))
m4$fixef/log(10)## (Intercept) TreatmentSponge
## 7.9110794 0.2051855
## microRNAMir-11 microRNAMir-955
## -0.1658413 0.6640014
## microRNAMir-966 TreatmentSponge:microRNAMir-11
## 0.2227209 -0.0338885
## TreatmentSponge:microRNAMir-955 TreatmentSponge:microRNAMir-966
## 0.0215753 -0.1660939
Overall, the three analysis above yield similar outcomes but produce estimates which are slightly different from those obtained with glms.
We shall finaly use the last model to test the significance of Treatment and interaction effects.
m4.a <- fitme(Bacteria~Treatment+microRNA+(1|sample_ID)+offset(log(Volume)-log(Dilution)),
family=negbin(link="log"),
rand.family=Gamma(link="log"),
data=subset(d,Sex=="Female" & is.finite(Bacteria) & !is.na(Bacteria)))
m4.b <- fitme(Bacteria~microRNA+(1|sample_ID)+offset(log(Volume)-log(Dilution)),
family=negbin(link="log"),
rand.family=Gamma(link="log"),
data=subset(d,Sex=="Female" & is.finite(Bacteria) & !is.na(Bacteria)))
# test of Treatment effect
anova(m4.b,m4.a)## chi2_LR df p_value
## p_v 13.21119 1 0.000278283
# test of interaction effect
anova(m4,m4.a)## chi2_LR df p_value
## p_v 2.317012 3 0.5092696
The Treatment effect is found significant, while it was not in the glm approach ! I am not sure what to conclude from this… It may be that the potential problem caused by truncation (which is not accounted for in the spaMM approach) is mild compared to the lack of power of the glm analysis.