Introduction

DAPI (4’,6-diamidino-2-phenylindole) is a fluorescent stain that binds to DNA and allows visualization of cell nuclei. It is also useful for quantifying the amount of DNA in a cell, which is approximately proportional to the integrated fluorescent intensity of the nuclear signal. Because many cells are stationary at 2N or 4N DNA, distributions of cellular DNA show peaks corresponding to those amounts, with 4N cells approximately double the integrated intensity of 2N cells.

Data

The data for this analysis comes from

require(ggplot2)
require(reshape)
require(dplyr)
require(MASS) # fitdistr() function
require(pracma) # quadgk() function
require(ppc) # ppc.peaks() function
cells <- read.csv("./CellData.csv")

Cells are labeled by the well and region (colony) to which they belong.

cellSubset <- filter(cells,WellNo==1,RegionNo==1)
ggplot(cellSubset,aes(x=Location_Center_X_MOS,y=Location_Center_Y_MOS)) + 
  geom_point() + scale_y_reverse() + ggtitle("Cellular Map, Colony 1")

When we combined cellular DAPI levels across an entire experiment, we observed an uncharacteristically noisy distribution.

hist(cells$Intensity_IntegratedIntensity_DNA,100,
     main = "Cellular DNA content distribution, all colonies combined",
     xlab = "Integrated Intensity of DNA (AU)",
     xlim = c(0,350))

However, when we looked at the distributions from single colonies, they were much less noisy.

hist(cellSubset$Intensity_IntegratedIntensity_DNA,100,
     main = "Cellular DNA content distribution, Colony 1",
     xlab = "Integrated Intensity of DNA (AU)",
     xlim = c(0,350))

Methods

We eventually discovered that the artifact was due to regional variation in background and staining that particularly affected the DAPI signal due to mounting media fluorescence in the blue wavelength.

To address this problem, we leveraged our data pipeline, which segregates cells into local colony structures. Using a Gaussian kernel density estimation, the DNA distribution for each colony was modeled, and 2N and 4N peaks were identified. Each colony then became an internal control with its own cellular DNA distribution containing peaks at 2N and 4N DNA. Plotting the 2N and 4N peaks, we found that most colonies indeed had 4N peaks at approximately twice the DAPI intensity of the 2N peaks. Colonies that fell outside a narrow linear range were found to be small colonies (less than 100 cells) which were poorly modeled, and were discarded from further analysis. Because the peaks define DNA content at an absolute level, we normalized the DAPI values of every cell by globally aligning the colony peaks to values of X and 2X:

# Initialize variables
dapiRescaledTotal = numeric()
testG1 <-numeric()
testG2 <- numeric()
count <- 1

for (w in 1:max(cells$WellNo)){
  cellSubsetW <- filter(cells, WellNo == w)
  for (r in 1:max(cellSubsetW$RegionNo)){
    # Filter cells per well number and region
    cellsSubset <- filter(cells, WellNo == w & RegionNo == r)
    dapi <- cellsSubset$Intensity_IntegratedIntensity_DNA
    
    # Find the kernal density estimation of the DNA distribution
    F1 <- density(dapi,n = 512, from=1,to=350, width=3);
    
    # Find locations of G1 and G2 peaks
    peakLocs <- ppc.peaks(F1$y,.3)
    g1_peak <- F1$x[peakLocs][1]
    g2_peak <- F1$x[peakLocs][2]
    
    # Create the density function data frame on first loop
    # with x data
    if (count==1){
      F_total1 <- data.frame(F1$x)
      F_total2 <- data.frame(F1$x)
    }
    
    # Store identified G1 and G2 peaks in testG1 and testG2
    testG1[count] <- g1_peak
    testG2[count] <- g2_peak
    
    # Apply scaling factors per colony
    u <- 2*g1_peak - g2_peak
    v <- 100/(g1_peak - u)
    dapiRescaled <- (dapi-u)*v
    dapiRescaledTotal <- c(dapiRescaledTotal,dapiRescaled)
    
    # Find the kernal density function of the rescaled DNA distribution
    F2 <- density(dapiRescaled,n = 512, from=1,to=350, width=4)
    F_total1 <- data.frame(F_total1,F1$y)
    F_total2 <- data.frame(F_total2,F2$y)
    count <- count+1
  }
}
# Plot G1 peaks vs G2 peaks for each colony
ggplot(data.frame(testG1,testG2),aes(x=testG1,y=testG2)) +
  geom_point(shape=1) + geom_smooth(method=lm) + xlab("G1 peak") +
  ylab("G2 peak") + ggtitle("G1 vs G2 peaks in individual colonies")

df <- melt(F_total1[,1:12] ,  id = 'F1.x', variable_name = 'series')
ggplot(df, aes(F1.x,value)) + geom_line(aes(color = series)) +
  ggtitle('Density estimation for unnormalized DNA content') + labs(y="probability")

df <- melt(F_total2[,1:12] ,  id = 'F1.x', variable_name = 'series')
ggplot(df, aes(F1.x,value)) + geom_line(aes(color = series)) +
  ggtitle('Density estimation for normalized DNA content') + labs(y="probability")

Cell Cycle Modeling

Model based on: Improved Program for the Analysis of DNA Histograms M.G. Ormerod, A.W.R. Payne, and J.V. Watson Cytometry 8:637-641 (1987).

# pdfG1 calculates PDF of x|G1
pdfG1 <- function(x,mu1,sigma1){
  1/(sqrt(2*pi*sigma1^2)) * exp(-(x-mu1)^2/(2*sigma1^2))
}

# pdfG2 calculates PDF of x|G2
pdfG2 <- function(x,mu2,sigma2){
  1/(sqrt(2*pi*sigma2^2)) * exp(-(x-mu2)^2/(2*sigma2^2))
}

# pdfS.unnormalized calculates unnormalized PDF of x|S
pdfS.unnormalized <- function(x,mu1,sigma1,mu2,sigma2){
  erf((x-mu1)/sigma1) - erf((x-mu2)/sigma2)
}

# pdfS calculates PDF of x|S
pdfS <- function(x,mu1,sigma1,mu2,sigma2){
  normalization <- quadgk(function(x) pdfS.unnormalized(x,mu1,sigma1,mu2,sigma2),0,2000)
  pdfS.unnormalized(x,mu1,sigma1,mu2,sigma2)/normalization
}

# pdfCellCycle calculates the total probability density function
pdfCellCycle <- function(x,mu1,sigma1,mu2,sigma2,pG1,pG2){
  if(pG1+pG2 > 1){stop("Frequencies cannot add up to more than 1.")}
  pS = 1-pG1-pG2
  pG1*pdfG1(x,mu1,sigma1)+pS*pdfS(x,mu1,sigma1,mu2,sigma2)+pG2*pdfG2(x,mu2,sigma2)
}

# pdfCellCycleDF creates a dataframe with total and conditional PDFs
pdfCellCycleDF <- function(x,mu1,sigma1,mu2,sigma2,pG1,pG2){
  Total <- pdfCellCycle(x,mu1,sigma1,mu2,sigma2,pG1,pG2)
  G1 <- pG1*pdfG1(x,mu1,sigma1)
  S <- (1-pG1-pG2)*pdfS(x,mu1,sigma1,mu2,sigma2)
  G2 <- pG2*pdfG2(x,mu2,sigma2)
  data.frame(x,Total,G1,S,G2)
}

Below is an example probability density function for the cell cycle profile:

# Set parameters
params = list(mu1 = 100, sigma1 = 5, mu2 = 200, sigma2 = 10, pG1 = .2, pG2 = .3)
mu1 <- params$mu1; sigma1 = params$sigma1; mu2 = params$mu2; sigma2 = params$sigma2
pG1 = params$pG1; pG2 = params$pG2

xGrid <- seq(0,350,length=1000)
df <- pdfCellCycleDF(xGrid,mu1,sigma1,mu2,sigma2,pG1,pG2)
df <- melt(df ,  id = 'x', variable_name = 'series')
ggplot(df, aes(x,value)) + geom_line(aes(color = series)) +
  ggtitle('Cell Cycle Model Probability Density Function') + labs(y="probability")

We can then look at the posterior probability functions for the G1, S, and G2 states.

# P(G1|x)
pG1x <- function(x,mu1,sigma1,mu2,sigma2,pG1,pG2){
  pdfG1(x,mu1,sigma1) * pG1 / pdfCellCycle(x,mu1,sigma1,mu2,sigma2,pG1,pG2)
}

# P(S|x)
pSx <- function(x,mu1,sigma1,mu2,sigma2,pG1,pG2){
  pS <- 1-pG1-pG2
  pdfS(x,mu1,sigma1,mu2,sigma2) * pS / pdfCellCycle(x,mu1,sigma1,mu2,sigma2,pG1,pG2)
}
  
# P(G2|x)
pG2x <- function(x,mu1,sigma1,mu2,sigma2,pG1,pG2){
  pdfG1(x,mu2,sigma2) * pG2 / pdfCellCycle(x,mu1,sigma1,mu2,sigma2,pG1,pG2)
}

# Data frame with conditional probabilities on each column
pCondDF <- function(x,mu1,sigma1,mu2,sigma2,pG1,pG2){
  G1 <- pG1x(x,mu1,sigma1,mu2,sigma2,pG1,pG2)
  S <- pSx(x,mu1,sigma1,mu2,sigma2,pG1,pG2)
  G2 <- pG2x(x,mu1,sigma1,mu2,sigma2,pG1,pG2)
  data.frame(x,G1,S,G2)
}

The posterior probability functions are plotted below:

params = list(mu1 = 100, sigma1 = 5, mu2 = 200, sigma2 = 10, pG1 = .2, pG2 = .3)
mu1 <- params$mu1; sigma1 = params$sigma1; mu2 = params$mu2; sigma2 = params$sigma2
pG1 = params$pG1; pG2 = params$pG2

xGrid <- seq(3,350,length=1000)
df <- pCondDF(xGrid,mu1,sigma1,mu2,sigma2,pG1,pG2)
df <- melt(df ,  id = 'x', variable_name = 'series')
ggplot(df, aes(x,value)) + geom_line(aes(color = series)) +
  ggtitle('Posterior probabilities P(G1|x),P(S|x),P(G2|x)') + labs(y="probability")

Model Fitting

Maximum Likelihood Estimation was used to find the parameters for both the unnormalized and normalized DNA distributions.

# Unnormalized data

dapi <- cells$Intensity_IntegratedIntensity_DNA
DNA <- dapi[dapi < 300 & dapi > 50]
hist(DNA,1000)

params <- list(mu1 = 90, sigma1 = 20, mu2 = 180, sigma2 = 40, pG1 = .2, pG2 = .2)
lower <- list(mu1 = 70, sigma1 = 5, mu2 = 140, sigma2 = 10, pG1 = 0, pG2 = 0)
upper <- list(mu1 = 100, sigma1 = 30, mu2 = 200, sigma2 = 60, pG1 = .5, pG2 = .5)

fit <- fitdistr(DNA,
         function(x,mu1,sigma1,mu2,sigma2,pG1,pG2) pdfCellCycle(x,mu1,sigma1,mu2,sigma2,pG1,pG2),
         start = params,upper=upper,lower=lower)
fit
##        mu1           sigma1          mu2           sigma2   
##   9.761467e+01   1.601368e+01   1.837165e+02   3.985179e+01 
##  (4.199247e-01) (1.073553e-01) (1.972797e+00) (4.574038e-01)
##        pG1            pG2     
##   3.049117e-01   5.000000e-01 
##  (6.938977e-03) (3.281191e-02)
fitUnnormalized <- fit
mu1 <- fit$estimate[1];  sigma1 <- fit$estimate[2]; mu2 <- fit$estimate[3]; 
sigma2 <- fit$estimate[4]; pG1 <- fit$estimate[5];  pG2 <- fit$estimate[6]
hist(DNA,20,freq=F,main = )
par(new=T)
curve(pdfCellCycle(x,mu1,sigma1,mu2,sigma2,pG1,pG2),from=50,to=300,xlab="",
      ylab = "")

par(new=F)


# Normalized data

dapiRescaled <- dapiRescaledTotal[dapiRescaledTotal > 50 & dapiRescaledTotal < 250]
params = list(mu1 = 100, sigma1 = 5, mu2 = 200, sigma2 = 10, pG1 = .2, pG2 = .3)
lower <- list(mu1 = 90, sigma1 = 1, mu2 = 180, sigma2 = 2, pG1 = .1, pG2 = .1)
upper <- list(mu1 = 110, sigma1 = 10, mu2 = 220, sigma2 = 20, pG1 = .4, pG2 = .4)
fit <- fitdistr(dapiRescaled,
         function(x,mu1,sigma1,mu2,sigma2,pG1,pG2) pdfCellCycle(x,mu1,sigma1,mu2,sigma2,pG1,pG2),
         start = params,upper=upper,lower=lower)
fit
##        mu1           sigma1          mu2           sigma2   
##   9.889816e+01   1.000000e+01   2.017578e+02   1.445421e+01 
##  (7.404323e-02) (4.342101e-02) (9.321407e-02) (6.280588e-02)
##        pG1            pG2     
##   2.894091e-01   3.497422e-01 
##  (1.801729e-03) (1.993583e-03)
fitNormalized <- fit
mu1 <- fit$estimate[1];  sigma1 <- fit$estimate[2]; mu2 <- fit$estimate[3]; 
sigma2 <- fit$estimate[4]; pG1 <- fit$estimate[5];  pG2 <- fit$estimate[6]
hist(dapiRescaled,20,freq=F)
par(new=T)
curve(pdfCellCycle(x,mu1,sigma1,mu2,sigma2,pG1,pG2),from=50,to=250,xlab="",
      ylab = "")

par(new=F)

Validation

We tested whether applying the rescaling methodology improved the predictive nature of DNA intensity on cell cycle state identification. In addition to the DNA intensities, we added another dimension: EdU intensity, which is a marker for S-phase. We clustered data with this additional dimension using a k-means algorithm to label each cell. We then compared the cluster labels to labels derived from using just the DNA single variable, based on the posterior distribution with a p>.05 threshold.

cl <- kmeans(cells$Intensity_IntegratedIntensity_DNA,2)
clusterLabel <- cl$cluster

hist(cells$Intensity_MeanIntensity_EdU,100)

clusterLabel[cells$Intensity_MeanIntensity_EdU > .03] <- 3

df <- data.frame(DNA = cells$Intensity_IntegratedIntensity_DNA, meanEdU = cells$Intensity_MeanIntensity_EdU,
                 clusterLabel)
ggplot(df, aes(x=DNA, y=meanEdU,color=clusterLabel)) + geom_point(alpha=.05) +
  coord_cartesian(xlim = c(0,300),ylim=c(0,.15)) 

fit <- fitUnnormalized
mu1 <- fit$estimate[1];  sigma1 <- fit$estimate[2]; mu2 <- fit$estimate[3]; 
sigma2 <- fit$estimate[4]; pG1 <- fit$estimate[5];  pG2 <- fit$estimate[6]
condDF <- pCondDF(cells$Intensity_IntegratedIntensity_DNA,mu1,sigma1,mu2,sigma2,pG1,pG2)
clusterLabelSV <- numeric(length(clusterLabel))
clusterLabelSV[condDF$G1 > .5] <- clusterLabel[1]
clusterLabelSV[condDF$S > .5] <- 3
clusterLabelSV[condDF$G2 > .5] <- 3-clusterLabel[1]

frac <- sum(clusterLabelSV == clusterLabel)/length(clusterLabel)

The fraction of cells successfully classified using the unnormalized DNA distribution as a single variable predictor was: 0.4778347.

fit <- fitNormalized
mu1 <- fit$estimate[1];  sigma1 <- fit$estimate[2]; mu2 <- fit$estimate[3]; 
sigma2 <- fit$estimate[4]; pG1 <- fit$estimate[5];  pG2 <- fit$estimate[6]
condDF <- pCondDF(dapiRescaledTotal,mu1,sigma1,mu2,sigma2,pG1,pG2)
condDF <- pCondDF(cells$Intensity_IntegratedIntensity_DNA,mu1,sigma1,mu2,sigma2,pG1,pG2)
clusterLabelSV <- numeric(length(clusterLabel))
clusterLabelSV[condDF$G1 > .5] <- clusterLabel[1]
clusterLabelSV[condDF$S > .5] <- 3
clusterLabelSV[condDF$G2 > .5] <- 3-clusterLabel[1]

frac <- sum(clusterLabelSV == clusterLabel)/length(clusterLabel)

The fraction of cells successfully classified using the normalized DNA distribution as a single variable predictor was: 0.6112845.