Simulation of load estimation from CFU counts

Simulating a dilution scheme

The dilution scheme

Defining the dilution scheme :

from volume taken from the source tube
to volume in the receiving tube
rep number of replicate platings for that dilution
volume.fly volume in which one fly is homogeneized
volume.droplet volume of one droplet

(dilution <- data.frame(from=c(10,10,10,40,40,40,40,50),
                       to=c(90,90,90,160,160,160,160,50),
                       rep=c(1,0,1,2,4,4,4,2)))

##   from  to rep
## 1   10  90   1
## 2   10  90   0
## 3   10  90   1
## 4   40 160   2
## 5   40 160   4
## 6   40 160   4
## 7   40 160   4
## 8   50  50   2

volume.fly <- 250
volume.droplet <- 5

The simulation routine

Sample are simulated with the folowing sequence of events: * for each dilution, the number of cells transferred from the source tube is randomly drawn from a Poisson distribution * the concentration in the destination tube is computed as from/(from+to) * CFU counts are randomly drawn from a Poisson distribution with average the concentration just computed multiplied by the volume of the droplet.

Proceeding this way, replicate platings of a given dilution are truly independent, but successive dilutions are not. This mimics the method we actually use in our experiments.

simulation.sample <- function(lambda,dilution,volume.fly,volume.droplet) {
  nb <- nrow(dilution)
  cfu <- rep(NA,sum(dilution$rep))
  ## number of CFU extracted from the fly (drawn from a Poisson)
  n <- c(rpois(1,lambda),rep(NA,nb))
  ## concentration in the extraction tube
  conc <- c(n[1]/volume.fly,rep(NA,nb))
  nbcfu <- 0
  for(i in 1:nb) {
    ## the number of cells transferred from source is drawn from a Poisson distribution
    n[i+1] <- rpois(1,conc[i]*dilution$from[i])  
    ## concentration in the destination tube
    conc[i+1] <- n[i+1]/(dilution$from[i]+dilution$to[i])
    if(dilution$rep[i]>0) {
      ## number of CFU obtained from the destination tube
      cfu[nbcfu+1:dilution$rep[i]] <- rpois(dilution$rep[i],conc[i+1]*volume.droplet)
      nbcfu <- nbcfu + dilution$rep[i]
    }
  }
  ## computes dilution factors
  df <- cumprod(with(dilution,(from+to)/from))
  return(data.frame(CFU=cfu,Dilution=rep(df,dilution$rep)))
}

simulation.dataset <- function(lambda,dilution,volume.fly,volume.droplet,sampleID=NULL) {
  sim <- NULL
  for(i in 1:length(lambda)) {
    s <- simulation.sample(lambda[i],dilution,volume.fly,volume.droplet)
    if(is.null(sampleID)) {
      s$sampleID <- i  
    } else  {
      s$sampleID <- sampleID[i]
    }
    
    sim <- rbind(sim,s)
  }
  return(sim)
}

Random flies

Randomly draws lines : for each fly, a load is randomly drawn from a Gamma distribution with scale fixed to 1e4 and average varying from 1e4 to 1e9. These different average loads are assumed to correspond to different lines. Each fly is identified by a unique value of sampleID.

nbfly <- 50
scale <- 10^4
v.shape <- 10^(4:9)/scale
dlines <- data.frame(lambda=rgamma(nbfly*length(v.shape),shape=v.shape,scale=scale),
                    Line=paste0("L",rep(1:length(v.shape),nbfly)),
                    sampleID=paste0("L",rep(1:length(v.shape),nbfly),"_",rep(1:nbfly,each=length(v.shape))))
rownames(dlines) <- dlines$sampleID
plot(lambda~Line,data=dlines,log="y")

Randomly draw CFU counts for each fly.

d <- simulation.dataset(dlines$lambda,sampleID=dlines$sampleID,dilution,volume.fly,volume.droplet)

subset(d,sampleID=="L1_1")

##    CFU Dilution sampleID
## 1    4       10     L1_1
## 2    0     1000     L1_1
## 3    0     5000     L1_1
## 4    0     5000     L1_1
## 5    0    25000     L1_1
## 6    0    25000     L1_1
## 7    0    25000     L1_1
## 8    0    25000     L1_1
## 9    0   125000     L1_1
## 10   0   125000     L1_1
## 11   0   125000     L1_1
## 12   0   125000     L1_1
## 13   0   625000     L1_1
## 14   0   625000     L1_1
## 15   0   625000     L1_1
## 16   0   625000     L1_1
## 17   0  1250000     L1_1
## 18   0  1250000     L1_1

subset(d,sampleID=="L2_1")

##    CFU Dilution sampleID
## 19 186       10     L2_1
## 20   2     1000     L2_1
## 21   1     5000     L2_1
## 22   0     5000     L2_1
## 23   0    25000     L2_1
## 24   0    25000     L2_1
## 25   0    25000     L2_1
## 26   0    25000     L2_1
## 27   0   125000     L2_1
## 28   0   125000     L2_1
## 29   0   125000     L2_1
## 30   1   125000     L2_1
## 31   0   625000     L2_1
## 32   0   625000     L2_1
## 33   0   625000     L2_1
## 34   0   625000     L2_1
## 35   0  1250000     L2_1
## 36   0  1250000     L2_1

subset(d,sampleID=="L3_1")

##     CFU Dilution sampleID
## 37 2181       10     L3_1
## 38   24     1000     L3_1
## 39    3     5000     L3_1
## 40    4     5000     L3_1
## 41    1    25000     L3_1
## 42    0    25000     L3_1
## 43    1    25000     L3_1
## 44    0    25000     L3_1
## 45    0   125000     L3_1
## 46    0   125000     L3_1
## 47    0   125000     L3_1
## 48    0   125000     L3_1
## 49    0   625000     L3_1
## 50    0   625000     L3_1
## 51    0   625000     L3_1
## 52    0   625000     L3_1
## 53    0  1250000     L3_1
## 54    0  1250000     L3_1

subset(d,sampleID=="L4_1")

##      CFU Dilution sampleID
## 55 19599       10     L4_1
## 56   218     1000     L4_1
## 57    36     5000     L4_1
## 58    34     5000     L4_1
## 59     8    25000     L4_1
## 60     9    25000     L4_1
## 61     5    25000     L4_1
## 62    10    25000     L4_1
## 63     3   125000     L4_1
## 64     2   125000     L4_1
## 65     1   125000     L4_1
## 66     0   125000     L4_1
## 67     0   625000     L4_1
## 68     0   625000     L4_1
## 69     0   625000     L4_1
## 70     0   625000     L4_1
## 71     0  1250000     L4_1
## 72     0  1250000     L4_1

subset(d,sampleID=="L5_1")

##       CFU Dilution sampleID
## 73 202854       10     L5_1
## 74   2043     1000     L5_1
## 75    389     5000     L5_1
## 76    392     5000     L5_1
## 77     74    25000     L5_1
## 78     90    25000     L5_1
## 79     82    25000     L5_1
## 80     82    25000     L5_1
## 81     16   125000     L5_1
## 82     21   125000     L5_1
## 83     20   125000     L5_1
## 84     16   125000     L5_1
## 85      4   625000     L5_1
## 86      3   625000     L5_1
## 87      5   625000     L5_1
## 88      1   625000     L5_1
## 89      4  1250000     L5_1
## 90      2  1250000     L5_1

subset(d,sampleID=="L6_1")

##         CFU Dilution sampleID
## 91  1992771       10     L6_1
## 92    19777     1000     L6_1
## 93     3920     5000     L6_1
## 94     4035     5000     L6_1
## 95      726    25000     L6_1
## 96      833    25000     L6_1
## 97      797    25000     L6_1
## 98      799    25000     L6_1
## 99      152   125000     L6_1
## 100     161   125000     L6_1
## 101     158   125000     L6_1
## 102     168   125000     L6_1
## 103      35   625000     L6_1
## 104      25   625000     L6_1
## 105      37   625000     L6_1
## 106      34   625000     L6_1
## 107      12  1250000     L6_1
## 108      17  1250000     L6_1

Analysis of complete CFU counts

Check that a Poisson glm using complete data produces unbiased estimates of load. This is not warantied because the autocorrelation between successive dilutions are not accounted for in this analysis.

m <- glm(CFU~sampleID,offset=log(volume.droplet)-log(Dilution),data=d,family=poisson(link="log"))


v <- coef(m)/log(10)
v[-1] <- v[-1]+v[1]+log10(volume.fly)
names(v) <- gsub("sampleID","",names(v))
names(v)[1] <- "L1_1"

plot(log10(dlines[names(v),"lambda"]),v,xlab="log lambda",ylab ="estimated log lambda (complete data)")
abline(0,1)
abline(h=4:9,col="gray")
abline(v=4:9,col="gray")

Analysis of right truncated CFU counts

Now consider that numbers of CFU above 100 are impossible to count.

d$CFUtr <- d$CFU
d$CFUtr[d$CFU>100] <- Inf

We use loadEstimate to estimate load. The three available methods will be tried, with maxCFU=50 or maxCFU=100 in the case of Truncated Poisson and Binomial-Poisson mixture.

library(CFUfit)
est1 <- loadEstimate(cfu = d$CFUtr,dilution = 1/(volume.fly*d$Dilution),volume = volume.droplet,
                     sampleID = d$sampleID,maxCFU=100,ignoreAboveMaxCFU=FALSE,logbase = 10)
colnames(est1)[-1] <- paste0(colnames(est1)[-1],"_BP_100")
est2 <- loadEstimate(cfu = d$CFUtr,dilution = 1/(volume.fly*d$Dilution),volume = volume.droplet,
                     sampleID=d$sampleID,maxCFU=50,ignoreAboveMaxCFU=FALSE,logbase = 10)
colnames(est2)[-1] <- paste0(colnames(est2)[-1],"_BP_50")
est3 <- loadEstimate(cfu = d$CFUtr,dilution = 1/(volume.fly*d$Dilution),volume = volume.droplet,
                     sampleID=d$sampleID,maxCFU=100,ignoreAboveMaxCFU=TRUE,logbase = 10)
colnames(est3)[-1] <- paste0(colnames(est3)[-1],"_TP_100")
est4 <- loadEstimate(cfu = d$CFUtr,dilution = 1/(volume.fly*d$Dilution),volume = volume.droplet,
                     sampleID=d$sampleID,maxCFU=50,ignoreAboveMaxCFU=TRUE,logbase = 10)
colnames(est4)[-1] <- paste0(colnames(est4)[-1],"_TP_50")
l <- is.finite(d$CFUtr)
est5 <- loadEstimate(cfu = d$CFUtr[l],dilution = 1/(volume.fly*d$Dilution[l]),volume = volume.droplet,
                     sampleID=d$sampleID[l],maxCFU=NA,logbase = 10)
colnames(est5)[-1] <- paste0(colnames(est5)[-1],"_P_100")


# merge with dlines
est <- cbind(dlines, est1[match(dlines$sampleID,est1$sampleID),]
                    ,est2[match(dlines$sampleID,est2$sampleID),]
                    ,est3[match(dlines$sampleID,est3$sampleID),]
                    ,est4[match(dlines$sampleID,est4$sampleID),]
                    ,est5[match(dlines$sampleID,est5$sampleID),])

Plotting estimates against actual values.

plot(logload_BP_100~log10(lambda),data=est,
     main="Binomial-Poisson mixture",
     xlab="log lambda",ylab="estimated log lambda (truncated data)")
abline(0,1)
abline(lm(logload_BP_100~log10(lambda),data=est),col="indianred4")
abline(h=4:9,col="gray")
abline(v=4:9,col="gray")

plot(logload_TP_100~log10(lambda),data=est,
     main="Truncated Poisson",
     xlab="log lambda",ylab="estimated log lambda (truncated data)")
abline(0,1)
abline(lm(logload_TP_100~log10(lambda),data=est),col="indianred4")
abline(h=4:9,col="gray")
abline(v=4:9,col="gray")

plot(logload_P_100~log10(lambda),data=est,
     main="Poisson",
     xlab="log lambda",ylab="estimated log lambda (truncated data)")
abline(0,1)
abline(lm(logload_P_100~log10(lambda),data=est),col="indianred4")
abline(h=4:9,col="gray")
abline(v=4:9,col="gray")

Binomial-Poisson is a bit better than Truncated Poisson and plain Poisson in terms of precision (the two latter yield low estimates for intermediate lambda). Plain Poisson sometimes fails to estimate load for the lowest \(\lambda\).

library(beanplot)
beanplot(logload_BP_100~Line,data=est,what = c(F,T,T,F),col = as.list(adjustcolor(terrain.colors(5),alpha.f = 0.75)),
         log="",ylab="log10 bacteria per fly")
points(logload_BP_100~jitter(as.numeric(Line)),data=est,pch=3,
       col=adjustcolor("black",alpha.f=0.5),cex=0.5)

beanplot(logload_BP_50~Line,data=est,what = c(F,T,T,F),col = as.list(adjustcolor(terrain.colors(5),alpha.f = 0.75)),
         log="",ylab="log10 bacteria per fly")
points(logload_BP_50~jitter(as.numeric(Line)),data=est,pch=3,
       col=adjustcolor("black",alpha.f=0.5),cex=0.5)

beanplot(logload_TP_100~Line,data=est,what = c(F,T,T,F),col = as.list(adjustcolor(terrain.colors(5),alpha.f = 0.75)),
         log="",ylab="log10 bacteria per fly")
points(logload_TP_100~jitter(as.numeric(Line)),data=est,pch=3,
       col=adjustcolor("black",alpha.f=0.5),cex=0.5)

beanplot(logload_TP_50~Line,data=est,what = c(F,T,T,F),col = as.list(adjustcolor(terrain.colors(5),alpha.f = 0.75)),
         log="",ylab="log10 bacteria per fly")
points(logload_TP_50~jitter(as.numeric(Line)),data=est,pch=3,
       col=adjustcolor("black",alpha.f=0.5),cex=0.5)

beanplot(logload_P_100~Line,data=est,what = c(F,T,T,F),col = as.list(adjustcolor(terrain.colors(5),alpha.f = 0.75)),
         log="",ylab="log10 bacteria per fly")
points(logload_P_100~jitter(as.numeric(Line)),data=est,pch=3,
       col=adjustcolor("black",alpha.f=0.5),cex=0.5)

Increasing maxCFU increases precision. Load estimates have a multimodal distribution for the two lowest values of \(\lambda\). Changing method does not change much from that prospective…

d.mv <- data.frame(Line=unique(est$Line))
for(i in grep("^logload_",colnames(est))) {
  dtmp <- data.frame(tapply(est[,i],est$Line,mean,na.rm=TRUE),
                     tapply(est[,i],est$Line,var,na.rm=TRUE))
  colnames(dtmp) <- paste(c("mean","var"),colnames(est)[i],sep="_")
  d.mv <- cbind(d.mv,dtmp)
}

d.mv$mean <- with(dlines,tapply(log10(lambda),Line,mean,na.rm=TRUE))
d.mv$var <- with(dlines,tapply(log10(lambda),Line,var,na.rm=TRUE))

plot(var~mean,data=d.mv,type="l",col="red",xlab="mean",ylab="variance")
points(var_logload_BP_100~mean_logload_BP_100,data=d.mv,type="b",pch=1)
points(var_logload_BP_50~mean_logload_BP_100,data=d.mv,type="b",pch=2)
points(var_logload_TP_100~mean_logload_BP_100,data=d.mv,type="b",pch=3)
points(var_logload_TP_50~mean_logload_BP_100,data=d.mv,type="b",pch=4)
points(var_logload_P_100~mean_logload_BP_100,data=d.mv,type="b",pch=5)

The relationship between variance and mean of log estimation (which comes from the Gamma distribution of \(\lambda\)) is best preserved by the Binomial-Poisson mixture. Note that this is not true if load (instead of log load) is analyzed.