load the gated data
library(flowWorkspace)
library(NPflow)
dataDir <- system.file("extdata",package="flowWorkspaceData")
gs <- load_gs(list.files(dataDir, pattern = "gs_manual",full = TRUE))
gh <- gs[[1]]
get singelts cells and all markers
fr <- getData(gh, "singlets")
chnls <- cytoEx:::getfluorescentChannels(fr)
mat <- exprs(fr)[, chnls]
set.seed(1234)
Define hyperprior
d <- 7
hyperG0 <- list()
hyperG0[["b_xi"]] <- colMeans(mat)
hyperG0[["b_psi"]] <- rep(0,d)
hyperG0[["kappa"]] <- 0.001
hyperG0[["D_xi"]] <- 100
hyperG0[["D_psi"]] <- 100
hyperG0[["nu"]] <- d+1
hyperG0[["lambda"]] <- diag(diag(var(mat)))
subsample the data to reduce the computation
n <- nrow(mat)
ind <- sample(1:n, size = 0.05*n)
visualize it in cd4, cd8 dimensions
# library(hexbin)
# plot(hexbin(mat))
mat <- mat[ind, ]
nTotal <- nrow(mat)
z <- t(mat)
time1 <- Sys.time()
Run NPflow with parallel (Estimate posterior)
MCMCsample_n <- DPMpost(data=z, hyperG0=hyperG0, N=5000, ncluster_init =15,
distrib="skewt", diagVar=TRUE
, doPlot = F
)
saveRDS(MCMCsample_n, file = "/loc/no-backup/mike/MCMCsample_n.rds")
time used for DPMpost
Summarizing Dirichlet Process Mixture Models
MCMCsample_n <- readRDS("/loc/no-backup/mike/MCMCsample_n.rds")
s <- summary(MCMCsample_n, burnin = 4000, thin=20, lossFn = "F-measure")
saveRDS(s, file = "/loc/no-backup/mike/s.rds")
visualize clusters
s
## summaryDPMMclust object with 50 observations sampled from the posterior:
## ------------------------------------------------------------------------
## Burnin: 4000 MCMC iterations discarded
##
## Point estimate of the partition with a skewt mixture:
## 1 2 4 6 7 8 9 11 12
## 0.0078% 0.063% 0.07% 0.026% 0.019% 0.05% 0.061% 0.028% 0.02%
## 16 17 18 19 20 21 23 25 26
## 0.017% 0.02% 0.042% 0.08% 0.15% 0.013% 0.03% 0.19% 0.024%
## 27 28
## 0.012% 0.084%
plot(s, ellipses = F)
## Plotting point estimate (may take a few sec)...
## DONE!
plot the manual gates for leaf nodes
leaf.nodes <- cytoUtils:::getLeafNode(gs, path = "auto")
plotGate(gh, leaf.nodes)
extract memberships and estimate its boundary (as gate) and plot it
# dd <- s$point_estim
# ind <- dd$c_est == 12
# cells <- mat[ind, ]
# sb <- chull(cells)
# plot(mat, pch = ".")
# lines(cells[sb,], col = "red")
get indices from manual gates
manual.ind <- sapply(leaf.nodes, function(n)getIndices(gh, n), simplify = FALSE)
convert it relative to singlets
singlets.ind <- getIndices(gh, "singlets")
manual.ind <- lapply(manual.ind, function(i)i[singlets.ind])
apply subsample ind
manual.ind <- lapply(manual.ind, function(i)i[ind])
verify manual ind for the sub-sampled data with the plots
df <- data.frame(mat, check.names = F)
fr1 <- fr
exprs(fr1) <- mat
library(ggcyto)
autoplot(fr1, "CD4", "CD8") + geom_point(data = df[manual.ind[["DNT"]],], col = "red", alpha = 0.5) + geom_point(data = df[manual.ind[["DPT"]],], alpha = 0.5)
convert auto clustering memberships to bool indices
dd <- s$point_estim
memberships <- dd$c_est
clusterIDs <- as.character(unique(memberships))
auto.ind <- sapply(clusterIDs, function(id)memberships == id, simplify = F)
match between auto and manual by their hamming distance of event indices
library(data.table)
mapping <- sapply(manual.ind, function(i){
dist <- sapply(auto.ind, function(j)sum(xor(i,j)))
min.ind <- which.min(dist)
auto.id <- names(min.ind)
data.table(auto.id = auto.id, dist = dist[min.ind], manual.count = sum(i), auto.count = sum(auto.ind[[auto.id]]))
}, simplify = F)
mapping <- rbindlist(mapping, idcol = "manual")
mapping
## manual auto.id dist manual.count auto.count
## 1: CD4/38- DR+ 1 94 60 34
## 2: CD4/38+ DR+ 1 94 60 34
## 3: CD4/38+ DR- 20 313 737 638
## 4: CD4/38- DR- 25 388 824 810
## 5: CD4/CCR7- 45RA+ 1 72 38 34
## 6: CD4/CCR7+ 45RA+ 20 125 615 638
## 7: CD4/CCR7+ 45RA- 25 285 717 810
## 8: CD4/CCR7- 45RA- 8 214 311 217
## 9: CD8/38- DR+ 1 110 76 34
## 10: CD8/38+ DR+ 1 111 77 34
## 11: CD8/38+ DR- 9 167 215 266
## 12: CD8/38- DR- 19 239 368 347
## 13: CD8/CCR7- 45RA+ 1 244 210 34
## 14: CD8/CCR7+ 45RA+ 9 83 321 266
## 15: CD8/CCR7+ 45RA- 1 115 81 34
## 16: CD8/CCR7- 45RA- 1 158 124 34
## 17: DNT 21 69 65 56
## 18: DPT 17 68 51 89
clusterID.matched <- unique(mapping[, auto.id])
clusterID.matched.ind <- lapply(clusterID.matched, function(i)auto.ind[[i]])
merged.ind <- Reduce(`|`, clusterID.matched.ind)
autoplot(fr1, "CD3", "CD4") + geom_point(data = df[merged.ind,], alpha = 0.5, size = 0.5)
pop.DNT <- geom_point(data = df[auto.ind[[mapping[manual == "DNT", auto.id]]],], col = "red")
pop.DPT <- geom_point(data = df[auto.ind[[mapping[manual == "DPT", auto.id]]],])
autoplot(fr1, "CD4", "CD8") + pop.DNT + pop.DPT
clusterID.matched <- mapping[dirname(manual) == "CD4", auto.id]
clusterID.matched.ind <- lapply(clusterID.matched, function(i)auto.ind[[i]])
merged.ind <- Reduce(`|`, clusterID.matched.ind)
pop.cd4 <- geom_point(data = df[merged.ind,], alpha = 0.5, size = 0.5, col = "red")
clusterID.matched <- mapping[dirname(manual) == "CD8", auto.id]
clusterID.matched.ind <- lapply(clusterID.matched, function(i)auto.ind[[i]])
merged.ind <- Reduce(`|`, clusterID.matched.ind)
pop.cd8 <- geom_point(data = df[merged.ind,], alpha = 0.5, size = 0.5, col = "black")
autoplot(fr1, "CD4", "CD8") + pop.cd4 + pop.cd8
pop1 <- geom_point(data = df[auto.ind[[mapping[manual == "CD4/38- DR+", auto.id]]],], col = "blue", alpha = 0.5)
pop2 <- geom_point(data = df[auto.ind[[mapping[manual == "CD4/38+ DR+", auto.id]]],], col = "red", alpha = 0.5)
pop3 <- geom_point(data = df[auto.ind[[mapping[manual == "CD4/38+ DR-", auto.id]]],], col = "purple", alpha = 0.5)
pop4 <- geom_point(data = df[auto.ind[[mapping[manual == "CD4/38- DR-", auto.id]]],], alpha = 0.5)
autoplot(fr1, "38", "DR") + pop1 + pop2 + pop3 + pop4
pop1 <- geom_point(data = df[auto.ind[[mapping[manual == "CD8/38- DR+", auto.id]]],], col = "blue", alpha = 0.5)
pop2 <- geom_point(data = df[auto.ind[[mapping[manual == "CD8/38+ DR+", auto.id]]],], col = "red", alpha = 0.5)
pop3 <- geom_point(data = df[auto.ind[[mapping[manual == "CD8/38+ DR-", auto.id]]],], col = "purple", alpha = 0.5)
pop4 <- geom_point(data = df[auto.ind[[mapping[manual == "CD8/38- DR-", auto.id]]],], alpha = 0.5)
autoplot(fr1, "38", "DR") + pop1 + pop2 + pop3 + pop4
pop1 <- geom_point(data = df[auto.ind[[mapping[manual == "CD4/CCR7+ 45RA+", auto.id]]],], col = "red", alpha = 0.5)
pop2 <- geom_point(data = df[auto.ind[[mapping[manual == "CD4/CCR7- 45RA+", auto.id]]],], col = "purple", alpha = 0.5)
pop3 <- geom_point(data = df[auto.ind[[mapping[manual == "CD4/CCR7- 45RA-", auto.id]]],], alpha = 0.5)
pop4 <- geom_point(data = df[auto.ind[[mapping[manual == "CD4/CCR7- 45RA+", auto.id]]],], col = "blue", alpha = 0.5)
autoplot(fr1, "CCR", "CD45") + pop1 + pop2 + pop3 + pop4
pop1 <- geom_point(data = df[auto.ind[[mapping[manual == "CD8/CCR7+ 45RA+", auto.id]]],], col = "red", alpha = 0.5)
pop2 <- geom_point(data = df[auto.ind[[mapping[manual == "CD8/CCR7- 45RA+", auto.id]]],], col = "purple", alpha = 0.5)
pop3 <- geom_point(data = df[auto.ind[[mapping[manual == "CD8/CCR7- 45RA-", auto.id]]],], alpha = 0.5)
pop1 <- geom_point(data = df[auto.ind[[mapping[manual == "CD8/CCR7- 45RA+", auto.id]]],], col = "blue", alpha = 0.5)
autoplot(fr1, "CCR", "CD45") + pop1 + pop2 + pop3 + pop4