load the gated data

library(flowWorkspace)
library(NPflow)
dataDir <- system.file("extdata",package="flowWorkspaceData")
gs <- load_gs(list.files(dataDir, pattern = "gs_manual",full = TRUE))
gh <- gs[[1]]

get singelts cells and all markers

fr <- getData(gh, "singlets")
chnls <- cytoEx:::getfluorescentChannels(fr)
mat <- exprs(fr)[, chnls]

set.seed(1234)

Define hyperprior

d <- 7
hyperG0 <- list()
hyperG0[["b_xi"]] <- colMeans(mat)
hyperG0[["b_psi"]] <- rep(0,d)
hyperG0[["kappa"]] <- 0.001
hyperG0[["D_xi"]] <- 100
hyperG0[["D_psi"]] <- 100
hyperG0[["nu"]] <- d+1
hyperG0[["lambda"]] <- diag(diag(var(mat)))

subsample the data to reduce the computation

n <- nrow(mat)
ind <- sample(1:n, size = 0.05*n)

visualize it in cd4, cd8 dimensions

# library(hexbin)
# plot(hexbin(mat))

mat <- mat[ind, ]
nTotal <- nrow(mat)
z <- t(mat)

time1 <- Sys.time()

Run NPflow with parallel (Estimate posterior)

MCMCsample_n <- DPMpost(data=z, hyperG0=hyperG0, N=5000, ncluster_init =15, 
   distrib="skewt", diagVar=TRUE
   , doPlot = F
 )
saveRDS(MCMCsample_n, file = "/loc/no-backup/mike/MCMCsample_n.rds")

time used for DPMpost

Summarizing Dirichlet Process Mixture Models

MCMCsample_n <- readRDS("/loc/no-backup/mike/MCMCsample_n.rds")
s <- summary(MCMCsample_n, burnin = 4000, thin=20, lossFn = "F-measure")
saveRDS(s, file = "/loc/no-backup/mike/s.rds")

visualize clusters

s
## summaryDPMMclust object with 50 observations sampled from the posterior:
## ------------------------------------------------------------------------
## Burnin: 4000 MCMC iterations discarded
## 
## Point estimate of the partition with a skewt mixture:
##       1       2       4       6       7       8       9      11      12 
## 0.0078%  0.063%   0.07%  0.026%  0.019%   0.05%  0.061%  0.028%   0.02% 
##      16      17      18      19      20      21      23      25      26 
##  0.017%   0.02%  0.042%   0.08%   0.15%  0.013%   0.03%   0.19%  0.024% 
##      27      28 
##  0.012%  0.084%
plot(s, ellipses = F)
## Plotting point estimate (may take a few sec)...
## DONE!

plot the manual gates for leaf nodes

leaf.nodes <- cytoUtils:::getLeafNode(gs, path = "auto")
plotGate(gh, leaf.nodes)

extract memberships and estimate its boundary (as gate) and plot it

# dd <- s$point_estim
# ind <- dd$c_est == 12
# cells <- mat[ind, ]
# sb <- chull(cells)
# plot(mat, pch = ".")
# lines(cells[sb,], col = "red")

get indices from manual gates

manual.ind <- sapply(leaf.nodes, function(n)getIndices(gh, n), simplify = FALSE)

convert it relative to singlets

singlets.ind <- getIndices(gh, "singlets")
manual.ind <- lapply(manual.ind, function(i)i[singlets.ind])

apply subsample ind

manual.ind <- lapply(manual.ind, function(i)i[ind])

verify manual ind for the sub-sampled data with the plots

df <- data.frame(mat, check.names = F)
fr1 <- fr
exprs(fr1) <- mat
library(ggcyto)
autoplot(fr1, "CD4", "CD8") + geom_point(data = df[manual.ind[["DNT"]],], col = "red", alpha = 0.5) + geom_point(data = df[manual.ind[["DPT"]],], alpha = 0.5)

convert auto clustering memberships to bool indices

dd <- s$point_estim
memberships <- dd$c_est 
clusterIDs <- as.character(unique(memberships))
auto.ind <- sapply(clusterIDs, function(id)memberships == id, simplify = F)

match between auto and manual by their hamming distance of event indices

library(data.table)
mapping <- sapply(manual.ind, function(i){
  dist <- sapply(auto.ind, function(j)sum(xor(i,j)))
  min.ind <- which.min(dist)
  auto.id <- names(min.ind)
  data.table(auto.id = auto.id, dist = dist[min.ind], manual.count = sum(i), auto.count = sum(auto.ind[[auto.id]]))
}, simplify = F)
mapping <- rbindlist(mapping, idcol = "manual")
mapping
##              manual auto.id dist manual.count auto.count
##  1:     CD4/38- DR+       1   94           60         34
##  2:     CD4/38+ DR+       1   94           60         34
##  3:     CD4/38+ DR-      20  313          737        638
##  4:     CD4/38- DR-      25  388          824        810
##  5: CD4/CCR7- 45RA+       1   72           38         34
##  6: CD4/CCR7+ 45RA+      20  125          615        638
##  7: CD4/CCR7+ 45RA-      25  285          717        810
##  8: CD4/CCR7- 45RA-       8  214          311        217
##  9:     CD8/38- DR+       1  110           76         34
## 10:     CD8/38+ DR+       1  111           77         34
## 11:     CD8/38+ DR-       9  167          215        266
## 12:     CD8/38- DR-      19  239          368        347
## 13: CD8/CCR7- 45RA+       1  244          210         34
## 14: CD8/CCR7+ 45RA+       9   83          321        266
## 15: CD8/CCR7+ 45RA-       1  115           81         34
## 16: CD8/CCR7- 45RA-       1  158          124         34
## 17:             DNT      21   69           65         56
## 18:             DPT      17   68           51         89

visually check the pop matching

check CD3+ by combining all the matched leaf nodes

clusterID.matched <- unique(mapping[, auto.id])
clusterID.matched.ind <- lapply(clusterID.matched, function(i)auto.ind[[i]])
merged.ind <- Reduce(`|`, clusterID.matched.ind)
autoplot(fr1, "CD3", "CD4") + geom_point(data = df[merged.ind,], alpha = 0.5, size = 0.5)

check DNT vs DPT

pop.DNT <- geom_point(data = df[auto.ind[[mapping[manual == "DNT", auto.id]]],], col = "red") 
pop.DPT <-  geom_point(data = df[auto.ind[[mapping[manual == "DPT", auto.id]]],]) 
autoplot(fr1, "CD4", "CD8") + pop.DNT + pop.DPT

check CD4 vs CD8

clusterID.matched <- mapping[dirname(manual) == "CD4", auto.id]
clusterID.matched.ind <- lapply(clusterID.matched, function(i)auto.ind[[i]])
merged.ind <- Reduce(`|`, clusterID.matched.ind)
pop.cd4 <- geom_point(data = df[merged.ind,], alpha = 0.5, size = 0.5, col = "red")

clusterID.matched <- mapping[dirname(manual) == "CD8", auto.id]
clusterID.matched.ind <- lapply(clusterID.matched, function(i)auto.ind[[i]])
merged.ind <- Reduce(`|`, clusterID.matched.ind)
pop.cd8 <- geom_point(data = df[merged.ind,], alpha = 0.5, size = 0.5, col = "black")

autoplot(fr1, "CD4", "CD8") + pop.cd4 + pop.cd8

check CD4: CD38 vs HLDR

pop1 <- geom_point(data = df[auto.ind[[mapping[manual == "CD4/38- DR+", auto.id]]],], col = "blue", alpha = 0.5)
pop2 <- geom_point(data = df[auto.ind[[mapping[manual == "CD4/38+ DR+", auto.id]]],], col = "red", alpha = 0.5) 
pop3 <- geom_point(data = df[auto.ind[[mapping[manual == "CD4/38+ DR-", auto.id]]],], col = "purple", alpha = 0.5)
pop4 <- geom_point(data = df[auto.ind[[mapping[manual == "CD4/38- DR-", auto.id]]],], alpha = 0.5)
autoplot(fr1, "38", "DR") + pop1 + pop2 + pop3 + pop4

check CD8: CD38 vs HLDR

pop1 <- geom_point(data = df[auto.ind[[mapping[manual == "CD8/38- DR+", auto.id]]],], col = "blue", alpha = 0.5) 
pop2 <- geom_point(data = df[auto.ind[[mapping[manual == "CD8/38+ DR+", auto.id]]],], col = "red", alpha = 0.5) 
pop3 <- geom_point(data = df[auto.ind[[mapping[manual == "CD8/38+ DR-", auto.id]]],], col = "purple", alpha = 0.5)
pop4 <- geom_point(data = df[auto.ind[[mapping[manual == "CD8/38- DR-", auto.id]]],], alpha = 0.5)
autoplot(fr1, "38", "DR") + pop1 + pop2 + pop3 + pop4

check CD4: CCR7 vs CD45

pop1 <- geom_point(data = df[auto.ind[[mapping[manual == "CD4/CCR7+ 45RA+", auto.id]]],], col = "red", alpha = 0.5) 
pop2 <- geom_point(data = df[auto.ind[[mapping[manual == "CD4/CCR7- 45RA+", auto.id]]],], col = "purple", alpha = 0.5)
pop3 <- geom_point(data = df[auto.ind[[mapping[manual == "CD4/CCR7- 45RA-", auto.id]]],], alpha = 0.5)
pop4 <- geom_point(data = df[auto.ind[[mapping[manual == "CD4/CCR7- 45RA+", auto.id]]],], col = "blue", alpha = 0.5)
autoplot(fr1, "CCR", "CD45") + pop1 + pop2 + pop3 + pop4

check CD8: CCR7 vs CD45

pop1 <- geom_point(data = df[auto.ind[[mapping[manual == "CD8/CCR7+ 45RA+", auto.id]]],], col = "red", alpha = 0.5) 
pop2 <- geom_point(data = df[auto.ind[[mapping[manual == "CD8/CCR7- 45RA+", auto.id]]],], col = "purple", alpha = 0.5)
pop3 <- geom_point(data = df[auto.ind[[mapping[manual == "CD8/CCR7- 45RA-", auto.id]]],], alpha = 0.5)
pop1 <- geom_point(data = df[auto.ind[[mapping[manual == "CD8/CCR7- 45RA+", auto.id]]],], col = "blue", alpha = 0.5) 
autoplot(fr1, "CCR", "CD45") + pop1 + pop2 + pop3 + pop4