load the gated data

library(flowWorkspace)
library(NPflow)
dataDir <- system.file("extdata",package="flowWorkspaceData")
gs <- load_gs(list.files(dataDir, pattern = "gs_manual",full = TRUE))
gh <- gs[[1]]

get singelts cells and all markers

fr <- getData(gh, "singlets")
chnls <- cytoEx:::getfluorescentChannels(fr)
mat <- exprs(fr)[, chnls]

set.seed(1234)

Define hyperprior

d <- 7
hyperG0 <- list()
hyperG0[["b_xi"]] <- colMeans(mat)
hyperG0[["b_psi"]] <- rep(0,d)
hyperG0[["kappa"]] <- 0.001
hyperG0[["D_xi"]] <- 100
hyperG0[["D_psi"]] <- 100
hyperG0[["nu"]] <- d+1
hyperG0[["lambda"]] <- diag(diag(var(mat)))

subsample the data to reduce the computation

n <- nrow(mat)
sampleRate <- 1#0.05
ind <- sample(1:n, size = sampleRate * n)
saveRDS(ind, file = "/loc/no-backup/mike/npflow/ind.rds")

visualize it in cd4, cd8 dimensions

# library(hexbin)
# plot(hexbin(mat))

mat <- mat[ind, ]
nTotal <- nrow(mat)
z <- t(mat)

time1 <- Sys.time()

Run NPflow with parallel (Estimate posterior)

MCMCsample_n <- DPMpost(data=z, hyperG0=hyperG0, N=5000, ncluster_init =15, 
   distrib="skewt", diagVar=TRUE
   , doPlot = F
 )
saveRDS(MCMCsample_n, file = "/loc/no-backup/mike/npflow/MCMCsample_n.rds")

time used for DPMpost

Summarizing Dirichlet Process Mixture Models

MCMCsample_n <- readRDS("/loc/no-backup/mike/npflow/MCMCsample_n.rds")
s <- summary(MCMCsample_n, burnin = 4000, thin=20, lossFn = "F-measure")
saveRDS(s, file = "/loc/no-backup/mike/npflow/s.rds")

visualize clusters

s
## summaryDPMMclust object with 50 observations sampled from the posterior:
## ------------------------------------------------------------------------
## Burnin: 4000 MCMC iterations discarded
## 
## Point estimate of the partition with a skewt mixture:
##        1        2        3        4        5        6        7        8 
##    0.01%  0.0033%  0.0034%  0.0041%   0.053%    0.02%  0.0056%   0.013% 
##        9       10       11       12       13       14       15       16 
##   0.071%    0.41%  0.0064%   0.039%  0.0045%   0.044%  0.0084%  0.0058% 
##       17       18       19       20       21       22       23       24 
##   0.029%   0.066%   0.025%   0.052%  0.0081%  0.0047%   0.023% 0.00032% 
##       25       26       27       28       29       30 
##   0.041%   0.014%   0.008%  0.0035%   0.012%  0.0073%
plot(s, ellipses = F)
## Plotting point estimate (may take a few sec)...
## DONE!

plot the manual gates for leaf nodes

leaf.nodes <- cytoUtils:::getLeafNode(gs, path = "auto")
plotGate(gh, leaf.nodes)

extract memberships and estimate its boundary (as gate) and plot it

# dd <- s$point_estim
# ind <- dd$c_est == 12
# cells <- mat[ind, ]
# sb <- chull(cells)
# plot(mat, pch = ".")
# lines(cells[sb,], col = "red")

get indices from manual gates

manual.ind <- sapply(leaf.nodes, function(n)getIndices(gh, n), simplify = FALSE)

convert it relative to singlets

singlets.ind <- getIndices(gh, "singlets")
manual.ind <- lapply(manual.ind, function(i)i[singlets.ind])

apply subsample ind

manual.ind <- lapply(manual.ind, function(i)i[ind])

verify manual ind for the sub-sampled data with the plots

df <- data.frame(mat, check.names = F)
fr1 <- fr
exprs(fr1) <- mat
library(ggcyto)
autoplot(fr1, "CD4", "CD8") + geom_point(data = df[manual.ind[["DNT"]],], col = "red", alpha = 0.5) + geom_point(data = df[manual.ind[["DPT"]],], alpha = 0.5)

convert auto clustering memberships to bool indices

dd <- s$point_estim
memberships <- dd$c_est 
clusterIDs <- as.character(unique(memberships))
auto.ind <- sapply(clusterIDs, function(id)memberships == id, simplify = F)

match between auto and manual

library(data.table)
mapping <- sapply(manual.ind, function(i){
  dist <- sapply(auto.ind, function(j){
    d1 <- sum(!(i&j)) #non-overlapped events
    d2 <- sum(xor(i,j))#  hamming distance 
    0.5 * d1 + d2
    })
  min.ind <- which.min(dist)
  auto.id <- names(min.ind)
  data.table(auto.id = auto.id, dist = dist[min.ind], manual.count = sum(i), auto.count = sum(auto.ind[[auto.id]]))
}, simplify = F)
mapping <- rbindlist(mapping, idcol = "manual")
mapping
##              manual auto.id    dist manual.count auto.count
##  1:     CD4/38- DR+       3 44603.5         1123        297
##  2:     CD4/38+ DR+       3 44512.0         1044        297
##  3:     CD4/38+ DR-      10 57653.0        14682      36040
##  4:     CD4/38- DR-      10 53901.5        17183      36040
##  5: CD4/CCR7- 45RA+      24 44279.0          740         28
##  6: CD4/CCR7+ 45RA+      24 55655.0        12116         28
##  7: CD4/CCR7+ 45RA-      10 57641.5        14743      36040
##  8: CD4/CCR7- 45RA-       3 49643.5         6433        297
##  9:     CD8/38- DR+      24 44847.0         1308         28
## 10:     CD8/38+ DR+      24 44874.0         1335         28
## 11:     CD8/38+ DR-      18 46044.5         4450       5701
## 12:     CD8/38- DR-      20 46811.0         7471       4539
## 13: CD8/CCR7- 45RA+      17 44389.5         3919       2567
## 14: CD8/CCR7+ 45RA+      18 42731.5         6802       5701
## 15: CD8/CCR7+ 45RA-      24 45108.0         1569         28
## 16: CD8/CCR7- 45RA-      19 45249.5         2274       2217
## 17:             DNT       6 44457.0         1293       1763
## 18:             DPT      29 43838.5          819       1061

visually check the pop matching

check CD3+ by combining all the matched leaf nodes

clusterID.matched <- unique(mapping[, auto.id])
clusterID.matched.ind <- lapply(clusterID.matched, function(i)auto.ind[[i]])
merged.ind <- Reduce(`|`, clusterID.matched.ind)
autoplot(fr1, "CD3", "CD4") + geom_point(data = df[merged.ind,], alpha = 0.5, size = 0.5)

check DNT vs DPT

pop.DNT <- geom_point(data = df[auto.ind[[mapping[manual == "DNT", auto.id]]],], col = "red") 
pop.DPT <-  geom_point(data = df[auto.ind[[mapping[manual == "DPT", auto.id]]],]) 
autoplot(fr1, "CD4", "CD8") + pop.DNT + pop.DPT

check CD4 vs CD8

clusterID.matched <- mapping[dirname(manual) == "CD4", auto.id]
clusterID.matched.ind <- lapply(clusterID.matched, function(i)auto.ind[[i]])
merged.ind <- Reduce(`|`, clusterID.matched.ind)
pop.cd4 <- geom_point(data = df[merged.ind,], alpha = 0.5, size = 0.5, col = "red")

clusterID.matched <- mapping[dirname(manual) == "CD8", auto.id]
clusterID.matched.ind <- lapply(clusterID.matched, function(i)auto.ind[[i]])
merged.ind <- Reduce(`|`, clusterID.matched.ind)
pop.cd8 <- geom_point(data = df[merged.ind,], alpha = 0.5, size = 0.5, col = "black")

autoplot(fr1, "CD4", "CD8") + pop.cd4 + pop.cd8

check CD4: CD38 vs HLDR

pop1 <- geom_point(data = df[auto.ind[[mapping[manual == "CD4/38- DR+", auto.id]]],], col = "blue", alpha = 0.5)
pop2 <- geom_point(data = df[auto.ind[[mapping[manual == "CD4/38+ DR+", auto.id]]],], col = "red", alpha = 0.5) 
pop3 <- geom_point(data = df[auto.ind[[mapping[manual == "CD4/38+ DR-", auto.id]]],], col = "purple", alpha = 0.5)
pop4 <- geom_point(data = df[auto.ind[[mapping[manual == "CD4/38- DR-", auto.id]]],], alpha = 0.5)
autoplot(fr1, "38", "DR") + pop1 + pop2 + pop3 + pop4

check CD8: CD38 vs HLDR

pop1 <- geom_point(data = df[auto.ind[[mapping[manual == "CD8/38- DR+", auto.id]]],], col = "blue", alpha = 0.5) 
pop2 <- geom_point(data = df[auto.ind[[mapping[manual == "CD8/38+ DR+", auto.id]]],], col = "red", alpha = 0.5) 
pop3 <- geom_point(data = df[auto.ind[[mapping[manual == "CD8/38+ DR-", auto.id]]],], col = "purple", alpha = 0.5)
pop4 <- geom_point(data = df[auto.ind[[mapping[manual == "CD8/38- DR-", auto.id]]],], alpha = 0.5)
autoplot(fr1, "38", "DR") + pop1 + pop2 + pop3 + pop4

check CD4: CCR7 vs CD45

pop1 <- geom_point(data = df[auto.ind[[mapping[manual == "CD4/CCR7+ 45RA+", auto.id]]],], col = "red", alpha = 0.5) 
pop2 <- geom_point(data = df[auto.ind[[mapping[manual == "CD4/CCR7- 45RA+", auto.id]]],], col = "purple", alpha = 0.5)
pop3 <- geom_point(data = df[auto.ind[[mapping[manual == "CD4/CCR7- 45RA-", auto.id]]],], alpha = 0.5)
pop4 <- geom_point(data = df[auto.ind[[mapping[manual == "CD4/CCR7- 45RA+", auto.id]]],], col = "blue", alpha = 0.5)
autoplot(fr1, "CCR", "CD45") + pop1 + pop2 + pop3 + pop4

check CD8: CCR7 vs CD45

pop1 <- geom_point(data = df[auto.ind[[mapping[manual == "CD8/CCR7+ 45RA+", auto.id]]],], col = "red", alpha = 0.5) 
pop2 <- geom_point(data = df[auto.ind[[mapping[manual == "CD8/CCR7- 45RA+", auto.id]]],], col = "purple", alpha = 0.5)
pop3 <- geom_point(data = df[auto.ind[[mapping[manual == "CD8/CCR7- 45RA-", auto.id]]],], alpha = 0.5)
pop1 <- geom_point(data = df[auto.ind[[mapping[manual == "CD8/CCR7- 45RA+", auto.id]]],], col = "blue", alpha = 0.5) 
autoplot(fr1, "CCR", "CD45") + pop1 + pop2 + pop3 + pop4