DAPI (4’,6-diamidino-2-phenylindole) is a fluorescent stain that binds to DNA and allows visualization of cell nuclei. It is also useful for quantifying the amount of DNA in a cell, which is approximately proportional to the integrated fluorescent intensity of the nuclear signal. Because many cells are stationary at 2N or 4N DNA, distributions of cellular DNA show peaks corresponding to those amounts, with 4N cells approximately double the integrated intensity of 2N cells.
The data for this analysis comes from
require(ggplot2)
require(reshape)
require(dplyr)
require(MASS) # fitdistr() function
require(pracma) # quadgk() function
require(ppc) # ppc.peaks() function
cells <- read.csv("./CellData.csv")
Cells are labeled by the well and region (colony) to which they belong.
cellSubset <- filter(cells,WellNo==1,RegionNo==1)
ggplot(cellSubset,aes(x=Location_Center_X_MOS,y=Location_Center_Y_MOS)) +
geom_point() + scale_y_reverse() + ggtitle("Cellular Map, Colony 1")
When we combined cellular DAPI levels across an entire experiment, we observed an uncharacteristically noisy distribution.
hist(cells$Intensity_IntegratedIntensity_DNA,100,
main = "Cellular DNA content distribution, all colonies combined",
xlab = "Integrated Intensity of DNA (AU)",
xlim = c(0,350))
However, when we looked at the distributions from single colonies, they were much less noisy.
hist(cellSubset$Intensity_IntegratedIntensity_DNA,100,
main = "Cellular DNA content distribution, Colony 1",
xlab = "Integrated Intensity of DNA (AU)",
xlim = c(0,350))
We eventually discovered that the artifact was due to regional variation in background and staining that particularly affected the DAPI signal due to mounting media fluorescence in the blue wavelength.
To address this problem, we leveraged our data pipeline, which segregates cells into local colony structures. Using a Gaussian kernel density estimation, the DNA distribution for each colony was modeled, and 2N and 4N peaks were identified. Each colony then became an internal control with its own cellular DNA distribution containing peaks at 2N and 4N DNA. Plotting the 2N and 4N peaks, we found that most colonies indeed had 4N peaks at approximately twice the DAPI intensity of the 2N peaks. Colonies that fell outside a narrow linear range were found to be small colonies (less than 100 cells) which were poorly modeled, and were discarded from further analysis. Because the peaks define DNA content at an absolute level, we normalized the DAPI values of every cell by globally aligning the colony peaks to values of X and 2X:
# Initialize variables
dapiRescaledTotal = numeric()
testG1 <-numeric()
testG2 <- numeric()
count <- 1
for (w in 1:max(cells$WellNo)){
cellSubsetW <- filter(cells, WellNo == w)
for (r in 1:max(cellSubsetW$RegionNo)){
# Filter cells per well number and region
cellsSubset <- filter(cells, WellNo == w & RegionNo == r)
dapi <- cellsSubset$Intensity_IntegratedIntensity_DNA
# Find the kernal density estimation of the DNA distribution
F1 <- density(dapi,n = 512, from=1,to=350, width=3);
# Find locations of G1 and G2 peaks
peakLocs <- ppc.peaks(F1$y,.3)
g1_peak <- F1$x[peakLocs][1]
g2_peak <- F1$x[peakLocs][2]
# Create the density function data frame on first loop
# with x data
if (count==1){
F_total1 <- data.frame(F1$x)
F_total2 <- data.frame(F1$x)
}
# Store identified G1 and G2 peaks in testG1 and testG2
testG1[count] <- g1_peak
testG2[count] <- g2_peak
# Apply scaling factors per colony
u <- 2*g1_peak - g2_peak
v <- 100/(g1_peak - u)
dapiRescaled <- (dapi-u)*v
dapiRescaledTotal <- c(dapiRescaledTotal,dapiRescaled)
# Find the kernal density function of the rescaled DNA distribution
F2 <- density(dapiRescaled,n = 512, from=1,to=350, width=4)
F_total1 <- data.frame(F_total1,F1$y)
F_total2 <- data.frame(F_total2,F2$y)
count <- count+1
}
}
# Plot G1 peaks vs G2 peaks for each colony
ggplot(data.frame(testG1,testG2),aes(x=testG1,y=testG2)) +
geom_point(shape=1) + geom_smooth(method=lm) + xlab("G1 peak") +
ylab("G2 peak") + ggtitle("G1 vs G2 peaks in individual colonies")
df <- melt(F_total1[,1:12] , id = 'F1.x', variable_name = 'series')
ggplot(df, aes(F1.x,value)) + geom_line(aes(color = series)) +
ggtitle('Density estimation for unnormalized DNA content') + labs(y="probability")
df <- melt(F_total2[,1:12] , id = 'F1.x', variable_name = 'series')
ggplot(df, aes(F1.x,value)) + geom_line(aes(color = series)) +
ggtitle('Density estimation for normalized DNA content') + labs(y="probability")
Model based on: Improved Program for the Analysis of DNA Histograms M.G. Ormerod, A.W.R. Payne, and J.V. Watson Cytometry 8:637-641 (1987).
# pdfG1 calculates PDF of x|G1
pdfG1 <- function(x,mu1,sigma1){
1/(sqrt(2*pi*sigma1^2)) * exp(-(x-mu1)^2/(2*sigma1^2))
}
# pdfG2 calculates PDF of x|G2
pdfG2 <- function(x,mu2,sigma2){
1/(sqrt(2*pi*sigma2^2)) * exp(-(x-mu2)^2/(2*sigma2^2))
}
# pdfS.unnormalized calculates unnormalized PDF of x|S
pdfS.unnormalized <- function(x,mu1,sigma1,mu2,sigma2){
erf((x-mu1)/sigma1) - erf((x-mu2)/sigma2)
}
# pdfS calculates PDF of x|S
pdfS <- function(x,mu1,sigma1,mu2,sigma2){
normalization <- quadgk(function(x) pdfS.unnormalized(x,mu1,sigma1,mu2,sigma2),0,2000)
pdfS.unnormalized(x,mu1,sigma1,mu2,sigma2)/normalization
}
# pdfCellCycle calculates the total probability density function
pdfCellCycle <- function(x,mu1,sigma1,mu2,sigma2,pG1,pG2){
if(pG1+pG2 > 1){stop("Frequencies cannot add up to more than 1.")}
pS = 1-pG1-pG2
pG1*pdfG1(x,mu1,sigma1)+pS*pdfS(x,mu1,sigma1,mu2,sigma2)+pG2*pdfG2(x,mu2,sigma2)
}
# pdfCellCycleDF creates a dataframe with total and conditional PDFs
pdfCellCycleDF <- function(x,mu1,sigma1,mu2,sigma2,pG1,pG2){
Total <- pdfCellCycle(x,mu1,sigma1,mu2,sigma2,pG1,pG2)
G1 <- pG1*pdfG1(x,mu1,sigma1)
S <- (1-pG1-pG2)*pdfS(x,mu1,sigma1,mu2,sigma2)
G2 <- pG2*pdfG2(x,mu2,sigma2)
data.frame(x,Total,G1,S,G2)
}
Below is an example probability density function for the cell cycle profile:
# Set parameters
params = list(mu1 = 100, sigma1 = 5, mu2 = 200, sigma2 = 10, pG1 = .2, pG2 = .3)
mu1 <- params$mu1; sigma1 = params$sigma1; mu2 = params$mu2; sigma2 = params$sigma2
pG1 = params$pG1; pG2 = params$pG2
xGrid <- seq(0,350,length=1000)
df <- pdfCellCycleDF(xGrid,mu1,sigma1,mu2,sigma2,pG1,pG2)
df <- melt(df , id = 'x', variable_name = 'series')
ggplot(df, aes(x,value)) + geom_line(aes(color = series)) +
ggtitle('Cell Cycle Model Probability Density Function') + labs(y="probability")
We can then look at the posterior probability functions for the G1, S, and G2 states.
# P(G1|x)
pG1x <- function(x,mu1,sigma1,mu2,sigma2,pG1,pG2){
pdfG1(x,mu1,sigma1) * pG1 / pdfCellCycle(x,mu1,sigma1,mu2,sigma2,pG1,pG2)
}
# P(S|x)
pSx <- function(x,mu1,sigma1,mu2,sigma2,pG1,pG2){
pS <- 1-pG1-pG2
pdfS(x,mu1,sigma1,mu2,sigma2) * pS / pdfCellCycle(x,mu1,sigma1,mu2,sigma2,pG1,pG2)
}
# P(G2|x)
pG2x <- function(x,mu1,sigma1,mu2,sigma2,pG1,pG2){
pdfG1(x,mu2,sigma2) * pG2 / pdfCellCycle(x,mu1,sigma1,mu2,sigma2,pG1,pG2)
}
# Data frame with conditional probabilities on each column
pCondDF <- function(x,mu1,sigma1,mu2,sigma2,pG1,pG2){
G1 <- pG1x(x,mu1,sigma1,mu2,sigma2,pG1,pG2)
S <- pSx(x,mu1,sigma1,mu2,sigma2,pG1,pG2)
G2 <- pG2x(x,mu1,sigma1,mu2,sigma2,pG1,pG2)
data.frame(x,G1,S,G2)
}
The posterior probability functions are plotted below:
params = list(mu1 = 100, sigma1 = 5, mu2 = 200, sigma2 = 10, pG1 = .2, pG2 = .3)
mu1 <- params$mu1; sigma1 = params$sigma1; mu2 = params$mu2; sigma2 = params$sigma2
pG1 = params$pG1; pG2 = params$pG2
xGrid <- seq(3,350,length=1000)
df <- pCondDF(xGrid,mu1,sigma1,mu2,sigma2,pG1,pG2)
df <- melt(df , id = 'x', variable_name = 'series')
ggplot(df, aes(x,value)) + geom_line(aes(color = series)) +
ggtitle('Posterior probabilities P(G1|x),P(S|x),P(G2|x)') + labs(y="probability")
Maximum Likelihood Estimation was used to find the parameters for both the unnormalized and normalized DNA distributions.
# Unnormalized data
dapi <- cells$Intensity_IntegratedIntensity_DNA
DNA <- dapi[dapi < 300 & dapi > 50]
hist(DNA,1000)
params <- list(mu1 = 90, sigma1 = 20, mu2 = 180, sigma2 = 40, pG1 = .2, pG2 = .2)
lower <- list(mu1 = 70, sigma1 = 5, mu2 = 140, sigma2 = 10, pG1 = 0, pG2 = 0)
upper <- list(mu1 = 100, sigma1 = 30, mu2 = 200, sigma2 = 60, pG1 = .5, pG2 = .5)
fit <- fitdistr(DNA,
function(x,mu1,sigma1,mu2,sigma2,pG1,pG2) pdfCellCycle(x,mu1,sigma1,mu2,sigma2,pG1,pG2),
start = params,upper=upper,lower=lower)
fit
## mu1 sigma1 mu2 sigma2
## 9.761467e+01 1.601368e+01 1.837165e+02 3.985179e+01
## (4.199247e-01) (1.073553e-01) (1.972797e+00) (4.574038e-01)
## pG1 pG2
## 3.049117e-01 5.000000e-01
## (6.938977e-03) (3.281191e-02)
fitUnnormalized <- fit
mu1 <- fit$estimate[1]; sigma1 <- fit$estimate[2]; mu2 <- fit$estimate[3];
sigma2 <- fit$estimate[4]; pG1 <- fit$estimate[5]; pG2 <- fit$estimate[6]
hist(DNA,20,freq=F,main = )
par(new=T)
curve(pdfCellCycle(x,mu1,sigma1,mu2,sigma2,pG1,pG2),from=50,to=300,xlab="",
ylab = "")
par(new=F)
# Normalized data
dapiRescaled <- dapiRescaledTotal[dapiRescaledTotal > 50 & dapiRescaledTotal < 250]
params = list(mu1 = 100, sigma1 = 5, mu2 = 200, sigma2 = 10, pG1 = .2, pG2 = .3)
lower <- list(mu1 = 90, sigma1 = 1, mu2 = 180, sigma2 = 2, pG1 = .1, pG2 = .1)
upper <- list(mu1 = 110, sigma1 = 10, mu2 = 220, sigma2 = 20, pG1 = .4, pG2 = .4)
fit <- fitdistr(dapiRescaled,
function(x,mu1,sigma1,mu2,sigma2,pG1,pG2) pdfCellCycle(x,mu1,sigma1,mu2,sigma2,pG1,pG2),
start = params,upper=upper,lower=lower)
fit
## mu1 sigma1 mu2 sigma2
## 9.889816e+01 1.000000e+01 2.017578e+02 1.445421e+01
## (7.404323e-02) (4.342101e-02) (9.321407e-02) (6.280588e-02)
## pG1 pG2
## 2.894091e-01 3.497422e-01
## (1.801729e-03) (1.993583e-03)
fitNormalized <- fit
mu1 <- fit$estimate[1]; sigma1 <- fit$estimate[2]; mu2 <- fit$estimate[3];
sigma2 <- fit$estimate[4]; pG1 <- fit$estimate[5]; pG2 <- fit$estimate[6]
hist(dapiRescaled,20,freq=F)
par(new=T)
curve(pdfCellCycle(x,mu1,sigma1,mu2,sigma2,pG1,pG2),from=50,to=250,xlab="",
ylab = "")
par(new=F)
We tested whether applying the rescaling methodology improved the predictive nature of DNA intensity on cell cycle state identification. In addition to the DNA intensities, we added another dimension: EdU intensity, which is a marker for S-phase. We clustered data with this additional dimension using a k-means algorithm to label each cell. We then compared the cluster labels to labels derived from using just the DNA single variable, based on the posterior distribution with a p>.05 threshold.
cl <- kmeans(cells$Intensity_IntegratedIntensity_DNA,2)
clusterLabel <- cl$cluster
hist(cells$Intensity_MeanIntensity_EdU,100)
clusterLabel[cells$Intensity_MeanIntensity_EdU > .03] <- 3
df <- data.frame(DNA = cells$Intensity_IntegratedIntensity_DNA, meanEdU = cells$Intensity_MeanIntensity_EdU,
clusterLabel)
ggplot(df, aes(x=DNA, y=meanEdU,color=clusterLabel)) + geom_point(alpha=.05) +
coord_cartesian(xlim = c(0,300),ylim=c(0,.15))
fit <- fitUnnormalized
mu1 <- fit$estimate[1]; sigma1 <- fit$estimate[2]; mu2 <- fit$estimate[3];
sigma2 <- fit$estimate[4]; pG1 <- fit$estimate[5]; pG2 <- fit$estimate[6]
condDF <- pCondDF(cells$Intensity_IntegratedIntensity_DNA,mu1,sigma1,mu2,sigma2,pG1,pG2)
clusterLabelSV <- numeric(length(clusterLabel))
clusterLabelSV[condDF$G1 > .5] <- clusterLabel[1]
clusterLabelSV[condDF$S > .5] <- 3
clusterLabelSV[condDF$G2 > .5] <- 3-clusterLabel[1]
frac <- sum(clusterLabelSV == clusterLabel)/length(clusterLabel)
The fraction of cells successfully classified using the unnormalized DNA distribution as a single variable predictor was: 0.4778347.
fit <- fitNormalized
mu1 <- fit$estimate[1]; sigma1 <- fit$estimate[2]; mu2 <- fit$estimate[3];
sigma2 <- fit$estimate[4]; pG1 <- fit$estimate[5]; pG2 <- fit$estimate[6]
condDF <- pCondDF(dapiRescaledTotal,mu1,sigma1,mu2,sigma2,pG1,pG2)
condDF <- pCondDF(cells$Intensity_IntegratedIntensity_DNA,mu1,sigma1,mu2,sigma2,pG1,pG2)
clusterLabelSV <- numeric(length(clusterLabel))
clusterLabelSV[condDF$G1 > .5] <- clusterLabel[1]
clusterLabelSV[condDF$S > .5] <- 3
clusterLabelSV[condDF$G2 > .5] <- 3-clusterLabel[1]
frac <- sum(clusterLabelSV == clusterLabel)/length(clusterLabel)
The fraction of cells successfully classified using the normalized DNA distribution as a single variable predictor was: 0.6112845.