Data used here are from our experiments on Providencia rettgeri injected in Drosophila melanogaster. The aim of the experiment is to compare the Pathogen Load Upon Death (PLUD) among several mutant lines of D. melanogaster.

The data consists in CFU counts taken from various dilutions of individual flies homogeneized just after they died from a P. rettgeri infection. Each fly has a unique sample_ID and PLUD must first be estimated on individual flies, before different mutants are compared.

Inspecting data

We load data and first multiply the dilution factor by 250, because each fly is homogeneized in 250 microliters of PBS. This way, the estimates will corespond to the number of bacterial cell per infected fly.

d <- read.table("data_CFU_PLUD_microRNA.csv",header=TRUE,sep=";")
d$Dilution <- d$Dilution*250 
head(d)
##   X  sample_ID Replicate_plating Volume Dilution Bacteria microRNA    Sex
## 1 1 A1_Plate66                 1      5     2500      Inf  Mir-955   Male
## 2 2 A2_Plate66                 1      5     2500      Inf  Mir-955 Female
## 3 3 A3_Plate66                 1      5     2500      Inf  Mir-955 Female
## 4 4 A4_Plate66                 1      5     2500      Inf  Mir-955   Male
## 5 5 A5_Plate66                 1      5     2500      Inf  Mir-955 Female
## 6 6 A6_Plate66                 1      5     2500      Inf  Mir-955 Female
##   Treatment
## 1  Balancer
## 2  Balancer
## 3  Balancer
## 4    Sponge
## 5  Balancer
## 6  Balancer
## extract the list of lines/sex/treatment
dlines <- unique(d[,c("sample_ID","microRNA","Sex","Treatment")])
rownames(dlines) <- dlines$sample_ID
head(dlines)
##             sample_ID microRNA    Sex Treatment
## A1_Plate66 A1_Plate66  Mir-955   Male  Balancer
## A2_Plate66 A2_Plate66  Mir-955 Female  Balancer
## A3_Plate66 A3_Plate66  Mir-955 Female  Balancer
## A4_Plate66 A4_Plate66  Mir-955   Male    Sponge
## A5_Plate66 A5_Plate66  Mir-955 Female  Balancer
## A6_Plate66 A6_Plate66  Mir-955 Female  Balancer
with(subset(d,!is.infinite(Bacteria)),tapply(Bacteria,Dilution,max))
##      2500    250000   1250000   6250000  31250000 156250000 312500000 
##        68        29        88       102        81        49        28
with(d,tapply(is.infinite(Bacteria),Dilution,sum))
##      2500    250000   1250000   6250000  31250000 156250000 312500000 
##       419       412       781       742       165         5         0
hist(d$Bacteria)

The number of CFU in one droplet goes up to 100.

We will test two cases :

  • maxCFU=50, so that all counts above this value are considered infinite,

  • maxCFU=100, so that most counts (only one is above 100) will be anayzed. The infinite values will be those already set as such in the dataset.

Estimation on one sample

Let us first analyze one sample…

library(CFUfit)

## estimation for the first sample in the list
## compare the three available methods and two values of maxCFU
dtmp <- subset(d,sample_ID=="A1_Plate66")
l <- with(dtmp,!is.infinite(Bacteria))
res <- data.frame()
res <- rbind(res,with(dtmp,loadEstimate(Bacteria[l],1/Dilution[l],Volume[l],maxCFU = NA,logbase=10)))
res <- rbind(res,with(dtmp,loadEstimate(Bacteria,1/Dilution,Volume,maxCFU = 50, ignoreAboveMaxCFU = TRUE,logbase=10)))
res <- rbind(res,with(dtmp,loadEstimate(Bacteria,1/Dilution,Volume,maxCFU = 100, ignoreAboveMaxCFU = TRUE,logbase=10)))
res <- rbind(res,with(dtmp,loadEstimate(Bacteria,1/Dilution,Volume,maxCFU = 50, ignoreAboveMaxCFU = FALSE,logbase=10)))
res <- rbind(res,with(dtmp,loadEstimate(Bacteria,1/Dilution,Volume,maxCFU = 100, ignoreAboveMaxCFU = FALSE,logbase=10)))

knitr::kable(head(res));
logload se.logload load se.load code sumCFU maxCFU ignoreAboveMaxCFU
8.375653 0.0406754 237494316 22243381 1 114 NA TRUE
8.378959 0.0426976 239308979 23527647 1 114 50 TRUE
8.375664 0.0406754 237500019 22243915 1 114 100 TRUE
8.444853 0.0306519 278517710 19657383 1 114 50 FALSE
8.600842 0.0245995 398880017 22593546 1 114 100 FALSE

Note that in this situation the sum of CFU counts does not change with maxCFU, because no counts fall inbetween 50 and 100. Increasing maxCFU from 50 to 100 has nevertheless an effect on the estimated load, particularly in the Binomial-Poisson mixture model.

Estimation for all samples

To facilitate the interpretation, load estimations are re-scaled as log10 values.

# Truncated Poisson model
est_TP_50 <- with(d,loadEstimate(Bacteria,1/Dilution,Volume,sample_ID,maxCFU = 50, ignoreAboveMaxCFU = TRUE,logbase=10))
colnames(est_TP_50)[-1] <- paste0(colnames(est_TP_50)[-1],"_TP_50")

est_TP_100 <- with(d,loadEstimate(Bacteria,1/Dilution,Volume,sample_ID,maxCFU = 100, ignoreAboveMaxCFU = TRUE,logbase=10))
colnames(est_TP_100)[-1] <- paste0(colnames(est_TP_100)[-1],"_TP_100")

# Binomial-Poisson mixture
est_BP_50 <- with(d,loadEstimate(Bacteria,1/Dilution,Volume,sample_ID,maxCFU = 50, ignoreAboveMaxCFU = FALSE,logbase=10))
colnames(est_BP_50)[-1] <- paste0(colnames(est_BP_50)[-1],"_BP_50")

est_BP_100 <- with(d,loadEstimate(Bacteria,1/Dilution,Volume,sample_ID,maxCFU = 100, ignoreAboveMaxCFU = FALSE,logbase=10))
colnames(est_BP_100)[-1] <- paste0(colnames(est_BP_100)[-1],"_BP_100")

dlines <- cbind(dlines,est_TP_50[,-1],est_TP_100[,-1],est_BP_50[,-1],est_BP_100[,-1])
#knitr::kable();
datatable(dlines, rownames = FALSE, filter="top", options = list(pageLength = 5, scrollX=T) )

Analysis of the load estimation

Truncated Poisson model

library(beanplot)
color <- list(adjustcolor("blue",alpha.f = 0.5),adjustcolor("red",alpha.f = 0.5))
for(rna in unique(dlines$microRNA)) {
  layout(matrix(1:2,ncol=2))
  for(sex in unique(dlines$Sex)) {
    dtmp <- subset(dlines,microRNA==rna & Sex==sex)
    beanplot(logload_TP_50~Treatment,data=dtmp,what = c(F,T,T,F),col = color,main=paste(rna,sex),log = "",ylab="log10 bacteria per fly")
    points(logload_TP_50~jitter(as.numeric(Treatment)),data=dtmp,pch=21,bg="white",cex=0.75)
  }
}

Binomial-Poisson mixture

library(beanplot)
color <- list(adjustcolor("blue",alpha.f = 0.5),adjustcolor("red",alpha.f = 0.5))
for(rna in unique(dlines$microRNA)) {
  layout(matrix(1:2,ncol=2))
  for(sex in unique(dlines$Sex)) {
    dtmp <- subset(dlines,microRNA==rna & Sex==sex)
    beanplot(logload_BP_100~Treatment,data=dtmp,what = c(F,T,T,F),col = color,main=paste(rna,sex),log="",ylab="log10 bacteria per fly")
    points(logload_BP_100~jitter(as.numeric(Treatment)),data=dtmp,pch=21,bg="white",cex=0.75)
  }
}

Statistical inference

Using lm

Analysis of Binomial-Poisson estimates using a linear model. The logload is analyzed here.

m00 <- glm(logload_BP_100~Treatment*microRNA,
         data=subset(dlines,Sex=="Female"))
coef(m00)
##                     (Intercept)                 TreatmentSponge 
##                      7.91433247                      0.03389514 
##                  microRNAMir-11                 microRNAMir-955 
##                      0.10963002                      0.05572384 
##                 microRNAMir-966  TreatmentSponge:microRNAMir-11 
##                      0.05743920                     -0.41161925 
## TreatmentSponge:microRNAMir-955 TreatmentSponge:microRNAMir-966 
##                      0.15946578                     -0.04173410
layout(matrix(1:4,ncol=2))
plot(m00)

We now use the inverse variance in estimation as weights.

m0 <- glm(logload_BP_100~Treatment*microRNA,
         weight=(1/se.logload_BP_100^2),
         data=subset(dlines,Sex=="Female"))
coef(m0)
##                     (Intercept)                 TreatmentSponge 
##                      8.00147581                      0.07256144 
##                  microRNAMir-11                 microRNAMir-955 
##                      0.04845881                      0.02235766 
##                 microRNAMir-966  TreatmentSponge:microRNAMir-11 
##                      0.03780731                      0.01507777 
## TreatmentSponge:microRNAMir-955 TreatmentSponge:microRNAMir-966 
##                      0.02717007                     -0.13701202
layout(matrix(1:4,ncol=2)); plot(m0)

shapiro.test(residuals(m0))
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(m0)
## W = 0.9647, p-value = 1.131e-05

This does improve the fit, by quite a lot! But residuals still depart from Gaussian distribution.

Using glm

Analysis of Binomial-Poisson estimates using a Gamma glm. The actual load is analyzed with inverse estimation variance as initial weights.

m1 <- glm(load_BP_100~Treatment*microRNA,
         weight=(1/se.load_BP_100^2),
         family=Gamma(link="log"),
         data=subset(dlines,!is.na(logload_BP_100) & Sex=="Female"))
summary(m1)
## 
## Call:
## glm(formula = load_BP_100 ~ Treatment * microRNA, family = Gamma(link = "log"), 
##     data = subset(dlines, !is.na(logload_BP_100) & Sex == "Female"), 
##     weights = (1/se.load_BP_100^2))
## 
## Deviance Residuals: 
##        Min          1Q      Median          3Q         Max  
## -3.138e-05   1.116e-07   1.880e-07   3.553e-07   2.539e-05  
## 
## Coefficients:
##                                   Estimate Std. Error t value Pr(>|t|)
## (Intercept)                      17.268789  13.821960   1.249    0.213
## TreatmentSponge                  -1.153346  34.605962  -0.033    0.973
## microRNAMir-11                   -0.006323  74.149996   0.000    1.000
## microRNAMir-955                   0.139041  74.959992   0.002    0.999
## microRNAMir-966                  -6.759852  13.838023  -0.488    0.626
## TreatmentSponge:microRNAMir-11   -3.031335  82.198318  -0.037    0.971
## TreatmentSponge:microRNAMir-955   1.294928 165.889433   0.008    0.994
## TreatmentSponge:microRNAMir-966   8.494143 177.207928   0.048    0.962
## 
## (Dispersion parameter for Gamma family taken to be 2.583978e-08)
## 
##     Null deviance: 4.0991e-08  on 240  degrees of freedom
## Residual deviance: 6.5054e-09  on 233  degrees of freedom
## AIC: 18
## 
## Number of Fisher Scoring iterations: 2

The logLik is ridiculously small, which is probably a consequence of very large se… I can’t even simulate from this model to test its goodness of fit. Overall, the model is probably useless!

We shall now turn to an analysis of the log estimates (still with Binomial-Poisson model and maxCFU=100)

m1log <- glm(logload_BP_100~Treatment*microRNA,
         weight=(1/se.logload_BP_100^2),
         family=Gamma(link="identity"),
         data=subset(dlines,!is.na(logload_BP_100) & Sex=="Female"))
summary(m1log)
## 
## Call:
## glm(formula = logload_BP_100 ~ Treatment * microRNA, family = Gamma(link = "identity"), 
##     data = subset(dlines, !is.na(logload_BP_100) & Sex == "Female"), 
##     weights = (1/se.logload_BP_100^2))
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -10.1122   -1.3018   -0.4944    0.8728    6.5642  
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      8.00148    0.05830 137.256   <2e-16 ***
## TreatmentSponge                  0.07256    0.09751   0.744    0.458    
## microRNAMir-11                   0.04846    0.09345   0.519    0.605    
## microRNAMir-955                  0.02236    0.08728   0.256    0.798    
## microRNAMir-966                  0.03781    0.09610   0.393    0.694    
## TreatmentSponge:microRNAMir-11   0.01508    0.22289   0.068    0.946    
## TreatmentSponge:microRNAMir-955  0.02717    0.15797   0.172    0.864    
## TreatmentSponge:microRNAMir-966 -0.13701    0.16815  -0.815    0.416    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Gamma family taken to be 3.904161)
## 
##     Null deviance: 945.54  on 240  degrees of freedom
## Residual deviance: 937.99  on 233  degrees of freedom
## AIC: 373986
## 
## Number of Fisher Scoring iterations: 3
# testing goodness of fit using simulations
simdev <- rep(NA,100)
for(i in 1:length(simdev)) {
  y <- simulate(m1log)
  msim <- update(m1log, y[,1] ~ .)
  simdev[i] <- msim$deviance
}
hist(simdev); abline(v=m1log$deviance,col="red")

This is much better behaved! Simulations show that the residual deviance falls in the range of expected values under the null that the model is correct.

anova(m1log,test="F")
## Analysis of Deviance Table
## 
## Model: Gamma, link: identity
## 
## Response: logload_BP_100
## 
## Terms added sequentially (first to last)
## 
## 
##                    Df Deviance Resid. Df Resid. Dev      F Pr(>F)
## NULL                                 240     945.54              
## Treatment           1   2.0989       239     943.44 0.5376 0.4642
## microRNA            3   1.7779       236     941.66 0.1518 0.9285
## Treatment:microRNA  3   3.6757       233     937.99 0.3138 0.8154

Analysis of count data using spaMM

We first assume that CFU count is Poisson and that random differences among flies are Gaussian. Note that dilution and volume are accounted for by the offset.

library(spaMM)
## Registered S3 methods overwritten by 'registry':
##   method               from 
##   print.registry_field proxy
##   print.registry_entry proxy
## spaMM (Rousset & Ferdy, 2014, version 3.8.0) is loaded.
## Type 'help(spaMM)' for a short introduction,
## 'news(package='spaMM')' for news,
## and 'citation('spaMM')' for proper citation.
m2 <- fitme(Bacteria~Treatment*microRNA+(1|sample_ID)+offset(log(Volume)-log(Dilution)),
      family=poisson(link="log"),
      data=subset(d,Sex=="Female" & is.finite(Bacteria) & !is.na(Bacteria)))
m2$fixef/log(10)
##                     (Intercept)                 TreatmentSponge 
##                      7.72916962                      0.26210067 
##                  microRNAMir-11                 microRNAMir-955 
##                     -0.23075941                      0.65086835 
##                 microRNAMir-966  TreatmentSponge:microRNAMir-11 
##                      0.28506275                      0.06768503 
## TreatmentSponge:microRNAMir-955 TreatmentSponge:microRNAMir-966 
##                      0.11971695                     -0.19103136

Same as above, but random differences among flies are now assumed to be Gamma distributed, so that the distribution of CFU is negative binomial.

m3 <- fitme(Bacteria~Treatment*microRNA+(1|sample_ID)+offset(log(Volume)-log(Dilution)),
      family=poisson(link="log"),
      rand.family=Gamma(link="log"),
      data=subset(d,Sex=="Female" & is.finite(Bacteria) & !is.na(Bacteria)))
m3$fixef/log(10)
##                     (Intercept)                 TreatmentSponge 
##                      7.88081105                      0.22506733 
##                  microRNAMir-11                 microRNAMir-955 
##                     -0.16586643                      0.67484060 
##                 microRNAMir-966  TreatmentSponge:microRNAMir-11 
##                      0.23396779                     -0.07046731 
## TreatmentSponge:microRNAMir-955 TreatmentSponge:microRNAMir-966 
##                      0.01421428                     -0.19041006

A third option with a negative binomial response and random differences among flies assumed to be Gamma distributed. This model yields the highest likelihood but still does not account from truncation of CFU counts…

m4 <- fitme(Bacteria~Treatment*microRNA+(1|sample_ID)+offset(log(Volume)-log(Dilution)),
      family=negbin(link="log"),
      rand.family=Gamma(link="log"),
      data=subset(d,Sex=="Female" & is.finite(Bacteria) & !is.na(Bacteria)))
m4$fixef/log(10)
##                     (Intercept)                 TreatmentSponge 
##                       7.9110794                       0.2051855 
##                  microRNAMir-11                 microRNAMir-955 
##                      -0.1658413                       0.6640014 
##                 microRNAMir-966  TreatmentSponge:microRNAMir-11 
##                       0.2227209                      -0.0338885 
## TreatmentSponge:microRNAMir-955 TreatmentSponge:microRNAMir-966 
##                       0.0215753                      -0.1660939

Overall, the three analysis above yield similar outcomes but produce estimates which are slightly different from those obtained with glms.

We shall finaly use the last model to test the significance of Treatment and interaction effects.

m4.a <- fitme(Bacteria~Treatment+microRNA+(1|sample_ID)+offset(log(Volume)-log(Dilution)),
      family=negbin(link="log"),
      rand.family=Gamma(link="log"),
      data=subset(d,Sex=="Female" & is.finite(Bacteria) & !is.na(Bacteria)))
m4.b <- fitme(Bacteria~microRNA+(1|sample_ID)+offset(log(Volume)-log(Dilution)),
      family=negbin(link="log"),
      rand.family=Gamma(link="log"),
      data=subset(d,Sex=="Female" & is.finite(Bacteria) & !is.na(Bacteria)))

# test of Treatment effect
anova(m4.b,m4.a)
##      chi2_LR df     p_value
## p_v 13.21119  1 0.000278283
# test of interaction effect
anova(m4,m4.a)
##      chi2_LR df   p_value
## p_v 2.317012  3 0.5092696

The Treatment effect is found significant, while it was not in the glm approach ! I am not sure what to conclude from this… It may be that the potential problem caused by truncation (which is not accounted for in the spaMM approach) is mild compared to the lack of power of the glm analysis.