suppressPackageStartupMessages({
library(flowCore)
library(flowWorkspace)
library(ggcyto)
library(ape)
library(dplyr)
library(tidyr)
library(tibble)
library(TreeSummarizedExperiment)
library(TreeHeatmap)
library(treeclimbR)
library(ggtree)
library(cowplot)
})
## Warning: replacing previous import 'data.table:::=' by 'ggplot2:::=' when
## loading 'ggcyto'
GatingSetThis is the full FAUST-generated GatingSet for a subset of samples from ImmPort SDY212, using 5 markers.
gs <- load_gs(file.path(basedir, "gs5", "gs_faust"))
gs
## A GatingSet with 186 samples
plot(gs)
GatingSet to TreeSummarizedExperimentThis will roughly follow some of the same steps as the treeclimbR toy example, for the sake of comparison.
flowWorkspace::gstreeexperiment yields TreeSummarizedExperiment with counts by leaf
tse <- gstreeexperiment(gs)
tse
## class: TreeSummarizedExperiment
## dim: 29 186
## metadata(0):
## assays(1): ''
## rownames(29): hCD4-/hCD27-/hCD3E-/hCD8A-/hPTPRC_iso:h6-
## hCD4-/hCD27-/hCD3E-/hCD8A-/hPTPRC_iso:h6+ ...
## hCD4+/hCD27+/hCD3E+/hCD8A+/hPTPRC_iso:h6-
## hCD4+/hCD27+/hCD3E+/hCD8A+/hPTPRC_iso:h6+
## rowData names(0):
## colnames(186): 011V3_B11_B11.391209.fcs 015V3_E5_E05.391227.fcs ...
## 15097_D2_D02.454436.fcs 15098_D3_D03.454446.fcs
## colData names(16): subtype race ... name file_info_name
## reducedDimNames(0):
## altExpNames(0):
## rowLinks: a LinkDataFrame (29 rows)
## rowTree: a phylo (29 leaves)
## colLinks: NULL
## colTree: NULL
Some glimpses of the structure.
# rows are leaves, columns are samples, values are counts
assays(tse)[[1]][1:5, 1:5]
## 011V3_B11_B11.391209.fcs
## hCD4-/hCD27-/hCD3E-/hCD8A-/hPTPRC_iso:h6- 182
## hCD4-/hCD27-/hCD3E-/hCD8A-/hPTPRC_iso:h6+ 1565
## hCD4-/hCD27-/hCD3E-/hCD8A+/hPTPRC_iso:h6- 3
## hCD27-/hCD3E-/hCD8A+/hPTPRC_iso:h6+ 374
## hCD4-/hCD27-/hCD3E+/hCD8A-/hPTPRC_iso:h6- 6
## 015V3_E5_E05.391227.fcs
## hCD4-/hCD27-/hCD3E-/hCD8A-/hPTPRC_iso:h6- 50
## hCD4-/hCD27-/hCD3E-/hCD8A-/hPTPRC_iso:h6+ 1590
## hCD4-/hCD27-/hCD3E-/hCD8A+/hPTPRC_iso:h6- 1
## hCD27-/hCD3E-/hCD8A+/hPTPRC_iso:h6+ 260
## hCD4-/hCD27-/hCD3E+/hCD8A-/hPTPRC_iso:h6- 8
## 024V3_D5_D05.391239.fcs
## hCD4-/hCD27-/hCD3E-/hCD8A-/hPTPRC_iso:h6- 209
## hCD4-/hCD27-/hCD3E-/hCD8A-/hPTPRC_iso:h6+ 733
## hCD4-/hCD27-/hCD3E-/hCD8A+/hPTPRC_iso:h6- 3
## hCD27-/hCD3E-/hCD8A+/hPTPRC_iso:h6+ 408
## hCD4-/hCD27-/hCD3E+/hCD8A-/hPTPRC_iso:h6- 6
## 049V3_C5_C05.391251.fcs
## hCD4-/hCD27-/hCD3E-/hCD8A-/hPTPRC_iso:h6- 129
## hCD4-/hCD27-/hCD3E-/hCD8A-/hPTPRC_iso:h6+ 1117
## hCD4-/hCD27-/hCD3E-/hCD8A+/hPTPRC_iso:h6- 9
## hCD27-/hCD3E-/hCD8A+/hPTPRC_iso:h6+ 606
## hCD4-/hCD27-/hCD3E+/hCD8A-/hPTPRC_iso:h6- 25
## 078V3_D11_D11.391257.fcs
## hCD4-/hCD27-/hCD3E-/hCD8A-/hPTPRC_iso:h6- 59
## hCD4-/hCD27-/hCD3E-/hCD8A-/hPTPRC_iso:h6+ 866
## hCD4-/hCD27-/hCD3E-/hCD8A+/hPTPRC_iso:h6- 0
## hCD27-/hCD3E-/hCD8A+/hPTPRC_iso:h6+ 34
## hCD4-/hCD27-/hCD3E+/hCD8A-/hPTPRC_iso:h6- 8
# colData has the pData for each sample
colData(tse)[1:5,]
## DataFrame with 5 rows and 16 columns
## subtype race study_accession
## <character> <character> <character>
## 011V3_B11_B11.391209.fcs NA White SDY212
## 015V3_E5_E05.391227.fcs NA White SDY212
## 024V3_D5_D05.391239.fcs NA White SDY212
## 049V3_C5_C05.391251.fcs NA Asian SDY212
## 078V3_D11_D11.391257.fcs NA Other SDY212
## study_time_collected age_reported gender
## <character> <character> <character>
## 011V3_B11_B11.391209.fcs 28 Age at Study Day 0 Female
## 015V3_E5_E05.391227.fcs 28 Age at Study Day 0 Male
## 024V3_D5_D05.391239.fcs 28 Age at Study Day 0 Male
## 049V3_C5_C05.391251.fcs 28 Age at Study Day 0 Male
## 078V3_D11_D11.391257.fcs 28 Age at Study Day 0 Female
## to_impute batch participant_id
## <character> <character> <character>
## 011V3_B11_B11.391209.fcs FALSE FLU 021209 SUB134243
## 015V3_E5_E05.391227.fcs FALSE FLU 021209 SUB134247
## 024V3_D5_D05.391239.fcs FALSE FLU 021209 SUB134255
## 049V3_C5_C05.391251.fcs FALSE FLU 021209 SUB134277
## 078V3_D11_D11.391257.fcs FALSE FLU 021209 SUB134306
## study_time_collected_unit cohort imputed
## <character> <character> <character>
## 011V3_B11_B11.391209.fcs Days Cohort_2 FALSE
## 015V3_E5_E05.391227.fcs Days Cohort_2 FALSE
## 024V3_D5_D05.391239.fcs Days Cohort_2 FALSE
## 049V3_C5_C05.391251.fcs Days Cohort_1 FALSE
## 078V3_D11_D11.391257.fcs Days Cohort_1 FALSE
## description type
## <character> <character>
## 011V3_B11_B11.391209.fcs Flow cytometry result Whole blood
## 015V3_E5_E05.391227.fcs Flow cytometry result Whole blood
## 024V3_D5_D05.391239.fcs Flow cytometry result Whole blood
## 049V3_C5_C05.391251.fcs Flow cytometry result Whole blood
## 078V3_D11_D11.391257.fcs Flow cytometry result Whole blood
## name file_info_name
## <character> <character>
## 011V3_B11_B11.391209.fcs 011V3_B11_B11.391209.. 011V3_B11_B11.391209..
## 015V3_E5_E05.391227.fcs 015V3_E5_E05.391227... 015V3_E5_E05.391227...
## 024V3_D5_D05.391239.fcs 024V3_D5_D05.391239... 024V3_D5_D05.391239...
## 049V3_C5_C05.391251.fcs 049V3_C5_C05.391251... 049V3_C5_C05.391251...
## 078V3_D11_D11.391257.fcs 078V3_D11_D11.391257.. 078V3_D11_D11.391257..
This is built on the phylo object whose components are built by cytolib and wrapped up into ape::phylo object by flowWorkspace::gs_get_phylo,
tr <- tse@rowTree$phylo
tr
##
## Phylogenetic tree with 29 tips and 35 internal nodes.
##
## Tip labels:
## hCD4-/hCD27-/hCD3E-/hCD8A-/hPTPRC_iso:h6-, hCD4-/hCD27-/hCD3E-/hCD8A-/hPTPRC_iso:h6+, hCD4-/hCD27-/hCD3E-/hCD8A+/hPTPRC_iso:h6-, hCD27-/hCD3E-/hCD8A+/hPTPRC_iso:h6+, hCD4-/hCD27-/hCD3E+/hCD8A-/hPTPRC_iso:h6-, hCD4-/hCD27-/hCD3E+/hCD8A-/hPTPRC_iso:h6+, ...
##
## Rooted; no branch lengths.
Aggregate counts for use by treeclimbR.
all_node <- showNode(tree = tr, only.leaf = FALSE)
tse_agg <- aggValue(x = tse, rowLevel = all_node, FUN = sum)
# Simple phylo plot with FAUST-generated gating paths as leaf names
plot(tr, cex = 0.5)
# For use as skeleton in later plots
simple_treefig <- ggtree(tr = tr, layout = "rectangular")
simple_treefig
There are 186 samples in two cohorts
table(colData(tse_agg)$cohort)
##
## Cohort_1 Cohort_2
## 68 118
Just to get the gist, subsample 10 from each cohort and plot tree with heatmap.
set.seed(42)
# TreeHeatMap view
count <- assays(tse)[[1]]
# Subsampling to make heatmap less unruly
cohort_1_sample <- sample(which(colData(tse)$cohort == "Cohort_1"), 10)
cohort_2_sample <- sample(which(colData(tse)$cohort == "Cohort_2"), 10)
count_small <- count[,c(cohort_1_sample, cohort_2_sample)]
cohorts <- colData(tse)[c(cohort_1_sample, cohort_2_sample), "cohort"]
# scale counts
scale_count <- t(apply(count_small, 1, FUN = function(x) {
xx <- x
rx <- (max(xx)-min(xx))
(xx - min(xx))/max(rx, 1)
}))
rownames(scale_count) <- rownames(count_small)
colnames(scale_count) <- colnames(count_small)
# tree + heatmap
vv <- cohorts
names(vv) <- colnames(scale_count)
anno_c <- vv
names(anno_c) <- vv
tree_heatmap <- TreeHeatmap(tree = tr, tree_fig = simple_treefig,
hm_data = scale_count,
column_split = vv, rel_width = 0.6,
tree_hm_gap = 0.3,
column_split_label = anno_c) +
scale_fill_viridis_c(option = "B") +
scale_y_continuous(expand = c(0, 10))
## Scale for 'fill' is already present. Adding another scale for 'fill', which
## will replace the existing scale.
## Scale for 'y' is already present. Adding another scale for 'y', which will
## replace the existing scale.
tree_heatmap
treeclimbRFollowing treeclimbR example, just using Wilcoxon two-sample rank sum test on all nodes for differential abundance between the two cohorts.
test.func <- function(X, Y) {
Y <- as.numeric(factor(Y))
obj <- apply(X, 1, function(x) {
p.value <- suppressWarnings(wilcox.test(x ~ Y)$p.value)
e.sign <- sign(mean(x[Y == 2]) - mean(x[Y == 1]))
c(p.value, e.sign)
})
return(list(p.value=obj[1, ], e.sign=obj[2, ]))
}
Y <- colData(tse_agg)$cohort
X <- assays(tse_agg)[[1]]
resW <- test.func(X,Y)
outW <- data.frame(node = rowLinks(tse_agg)$nodeNum,
pvalue = resW$p.value,
sign = resW$e.sign)
cand <- suppressMessages(getCand(tree = rowTree(tse_agg), score_data = outW,
node_column = "node", p_column = "pvalue",
threshold = 0.05,
sign_column = "sign", message = TRUE))
best <- evalCand(tree = rowTree(tse_agg), levels = cand$candidate_list,
score_data = outW, node_column = "node",
p_column = "pvalue", sign_column = "sign")
infoCand(object = best)
## t upper_t is_valid method limit_rej level_name best rej_leaf rej_node
## 1 0.00 0.0200000 TRUE BH 0.05 0 FALSE 6 6
## 2 0.01 0.0200000 TRUE BH 0.05 0.01 FALSE 6 5
## 3 0.02 0.0200000 FALSE BH 0.05 0.02 FALSE 6 5
## 4 0.03 0.0200000 FALSE BH 0.05 0.03 FALSE 6 5
## 5 0.04 0.0200000 FALSE BH 0.05 0.04 FALSE 6 5
## 6 0.05 0.0400000 FALSE BH 0.05 0.05 FALSE 7 5
## 7 0.10 0.1333333 TRUE BH 0.05 0.1 TRUE 7 3
## 8 0.15 0.1333333 FALSE BH 0.05 0.15 FALSE 7 3
## 9 0.20 0.1333333 FALSE BH 0.05 0.2 FALSE 7 3
## 10 0.25 0.1333333 FALSE BH 0.05 0.25 FALSE 7 3
## 11 0.30 0.1333333 FALSE BH 0.05 0.3 FALSE 7 3
## 12 0.35 0.1333333 FALSE BH 0.05 0.35 FALSE 7 3
## 13 0.40 0.1333333 FALSE BH 0.05 0.4 FALSE 7 3
## 14 0.45 0.3500000 FALSE BH 0.05 0.45 FALSE 9 2
## 15 0.50 0.3500000 FALSE BH 0.05 0.5 FALSE 9 2
## 16 0.55 0.3500000 FALSE BH 0.05 0.55 FALSE 9 2
## 17 0.60 0.3500000 FALSE BH 0.05 0.6 FALSE 9 2
## 18 0.65 0.3500000 FALSE BH 0.05 0.65 FALSE 9 2
## 19 0.70 0.3500000 FALSE BH 0.05 0.7 FALSE 9 2
## 20 0.75 0.3500000 FALSE BH 0.05 0.75 FALSE 9 2
## 21 0.80 0.3500000 FALSE BH 0.05 0.8 FALSE 9 2
## 22 0.85 0.3500000 FALSE BH 0.05 0.85 FALSE 9 2
## 23 0.90 0.7000000 FALSE BH 0.05 0.9 FALSE 8 1
## 24 0.95 0.7000000 FALSE BH 0.05 0.95 FALSE 8 1
## 25 1.00 0.7000000 FALSE BH 0.05 1 FALSE 8 1
out_best <- topNodes(object = best, n = Inf, p_value = 0.05)
The number of nodes in each candidate.
cand_list <- cand$candidate_list
unlist(lapply(cand_list, length))
## 0 0.01 0.02 0.03 0.04 0.05 0.1 0.15 0.2 0.25 0.3 0.35 0.4 0.45 0.5 0.55
## 29 28 28 28 28 27 24 24 24 24 24 24 24 21 21 21
## 0.6 0.65 0.7 0.75 0.8 0.85 0.9 0.95 1
## 21 21 21 21 21 21 20 20 20
Plots of nodes in each candidate.
candidate_plots <- lapply(seq_along(cand_list), FUN = function(x) {
cand.x <- cand_list[[x]]
fig.x <- simple_treefig +
geom_point2(aes(subset = (node %in% cand.x)), color = "navy", size = 0.5) +
labs(title = names(cand_list)[x]) +
theme(legend.position = "none",
plot.title = element_text(color="navy", size=7,
hjust = 0.5, vjust = -0.08))
})
legend <- get_legend(simple_treefig)
plot_grid(plotlist = c(candidate_plots, list(legend)), nrow = 3,
labels = paste0(letters[seq_along(cand_list)], "."),
label_size = 9, label_y = 0.99)
Plot of tree highlighting top nodes from treeclimbR along with their branches.
loc_tree <- out_best$node
br <- unlist(findOS(tree = tr, node = loc_tree, only.leaf = FALSE, self.include = TRUE))
df_color <- data.frame(node = showNode(tree = tr, only.leaf = FALSE)) %>%
mutate(signal = ifelse(node %in% br, "YES", "NO"))
final <- ggtree(tr = tr, layout = "rectangular", branch.length = "none", aes(color = signal)) %<+%
df_color +
scale_color_manual(values = c("NO" = "grey", "YES" = "orange")) +
geom_point2(aes(subset = node %in% loc_tree),
color = "red")
final
GatingSet for those top nodes.These node indices are in the phylo object indexing scheme
out_best$node
## [1] 5 47 48
node_idx_map attached to the phylo by gs_get_phylo provides the ability to get the original GatingSet nodes.
idx_map <- attr(tse_agg@rowTree$phylo, "node_idx_map")
idx_map
## phylo_idx gs_idx node_names is_leaf
## 1 1 10 hCD4-/hCD27-/hCD3E-/hCD8A-/hPTPRC_iso:h6- TRUE
## 2 2 11 hCD4-/hCD27-/hCD3E-/hCD8A-/hPTPRC_iso:h6+ TRUE
## 3 3 13 hCD4-/hCD27-/hCD3E-/hCD8A+/hPTPRC_iso:h6- TRUE
## 4 4 14 hCD27-/hCD3E-/hCD8A+/hPTPRC_iso:h6+ TRUE
## 5 5 17 hCD4-/hCD27-/hCD3E+/hCD8A-/hPTPRC_iso:h6- TRUE
## 6 6 18 hCD4-/hCD27-/hCD3E+/hCD8A-/hPTPRC_iso:h6+ TRUE
## 7 7 20 hCD4-/hCD27-/hCD3E+/hCD8A+/hPTPRC_iso:h6- TRUE
## 8 8 21 hCD4-/hCD27-/hCD3E+/hCD8A+/hPTPRC_iso:h6+ TRUE
## 9 9 25 hCD4-/hCD27+/hCD3E-/hCD8A-/hPTPRC_iso:h6- TRUE
## 10 10 26 hCD4-/hCD27+/hCD3E-/hCD8A-/hPTPRC_iso:h6+ TRUE
## 11 11 28 hCD27+/hCD3E-/hCD8A+/hPTPRC_iso:h6- TRUE
## 12 12 29 hCD27+/hCD3E-/hCD8A+/hPTPRC_iso:h6+ TRUE
## 13 13 32 hCD4-/hCD27+/hCD3E+/hCD8A-/hPTPRC_iso:h6- TRUE
## 14 14 33 hCD4-/hCD27+/hCD3E+/hCD8A-/hPTPRC_iso:h6+ TRUE
## 15 15 35 hCD4-/hCD27+/hCD3E+/hCD8A+/hPTPRC_iso:h6- TRUE
## 16 16 36 hCD4-/hCD27+/hCD3E+/hCD8A+/hPTPRC_iso:h6+ TRUE
## 17 17 41 hCD4+/hCD27-/hCD3E-/hCD8A-/hPTPRC_iso:h6- TRUE
## 18 18 42 hCD4+/hCD27-/hCD3E-/hCD8A-/hPTPRC_iso:h6+ TRUE
## 19 19 44 hCD4+/hCD27-/hCD3E-/hCD8A+/hPTPRC_iso:h6- TRUE
## 20 20 47 hCD4+/hCD27-/hCD3E+/hCD8A-/hPTPRC_iso:h6- TRUE
## 21 21 48 hCD4+/hCD27-/hCD3E+/hCD8A-/hPTPRC_iso:h6+ TRUE
## 22 22 50 hCD4+/hCD27-/hCD3E+/hCD8A+/hPTPRC_iso:h6- TRUE
## 23 23 51 hCD4+/hCD27-/hCD3E+/hCD8A+/hPTPRC_iso:h6+ TRUE
## 24 24 55 hCD4+/hCD27+/hCD3E-/hCD8A-/hPTPRC_iso:h6- TRUE
## 25 25 56 hCD4+/hCD27+/hCD3E-/hCD8A-/hPTPRC_iso:h6+ TRUE
## 26 26 59 hCD4+/hCD27+/hCD3E+/hCD8A-/hPTPRC_iso:h6- TRUE
## 27 27 60 hCD4+/hCD27+/hCD3E+/hCD8A-/hPTPRC_iso:h6+ TRUE
## 28 28 62 hCD4+/hCD27+/hCD3E+/hCD8A+/hPTPRC_iso:h6- TRUE
## 29 29 63 hCD4+/hCD27+/hCD3E+/hCD8A+/hPTPRC_iso:h6+ TRUE
## 30 30 0 root FALSE
## 31 31 1 pos FALSE
## 32 32 2 SCFSC FALSE
## 33 33 3 SCSSC FALSE
## 34 34 4 Nondebris FALSE
## 35 35 5 Lymphocytes FALSE
## 36 36 6 hCD4- FALSE
## 37 37 7 hCD4-/hCD27- FALSE
## 38 38 8 hCD4-/hCD27-/hCD3E- FALSE
## 39 39 9 hCD4-/hCD27-/hCD3E-/hCD8A- FALSE
## 40 40 12 hCD4-/hCD27-/hCD3E-/hCD8A+ FALSE
## 41 41 15 hCD4-/hCD27-/hCD3E+ FALSE
## 42 42 16 hCD4-/hCD27-/hCD3E+/hCD8A- FALSE
## 43 43 19 hCD4-/hCD27-/hCD3E+/hCD8A+ FALSE
## 44 44 22 hCD4-/hCD27+ FALSE
## 45 45 23 hCD4-/hCD27+/hCD3E- FALSE
## 46 46 24 hCD4-/hCD27+/hCD3E-/hCD8A- FALSE
## 47 47 27 hCD27+/hCD3E-/hCD8A+ FALSE
## 48 48 30 hCD4-/hCD27+/hCD3E+ FALSE
## 49 49 31 hCD4-/hCD27+/hCD3E+/hCD8A- FALSE
## 50 50 34 hCD4-/hCD27+/hCD3E+/hCD8A+ FALSE
## 51 51 37 hCD4+ FALSE
## 52 52 38 hCD4+/hCD27- FALSE
## 53 53 39 hCD4+/hCD27-/hCD3E- FALSE
## 54 54 40 hCD4+/hCD27-/hCD3E-/hCD8A- FALSE
## 55 55 43 hCD4+/hCD27-/hCD3E-/hCD8A+ FALSE
## 56 56 45 hCD4+/hCD27-/hCD3E+ FALSE
## 57 57 46 hCD4+/hCD27-/hCD3E+/hCD8A- FALSE
## 58 58 49 hCD4+/hCD27-/hCD3E+/hCD8A+ FALSE
## 59 59 52 hCD4+/hCD27+ FALSE
## 60 60 53 hCD4+/hCD27+/hCD3E- FALSE
## 61 61 54 hCD4+/hCD27+/hCD3E-/hCD8A- FALSE
## 62 62 57 hCD4+/hCD27+/hCD3E+ FALSE
## 63 63 58 hCD4+/hCD27+/hCD3E+/hCD8A- FALSE
## 64 64 61 hCD4+/hCD27+/hCD3E+/hCD8A+ FALSE
node_names <- idx_map[idx_map$phylo_idx %in% out_best$node, "node_names"]
node_names
## [1] "hCD4-/hCD27-/hCD3E+/hCD8A-/hPTPRC_iso:h6-"
## [2] "hCD27+/hCD3E-/hCD8A+"
## [3] "hCD4-/hCD27+/hCD3E+"
Those node names are valid for indexing in to the original GatingSet and can be used accordingly.
gs_pop_get_stats(gs[[1]], node_names)
## sample pop count
## 1: 011V3_B11_B11.391209.fcs hCD4-/hCD27-/hCD3E+/hCD8A-/hPTPRC_iso:h6- 6
## 2: 011V3_B11_B11.391209.fcs hCD27+/hCD3E-/hCD8A+ 44
## 3: 011V3_B11_B11.391209.fcs hCD4-/hCD27+/hCD3E+ 1606
ggcyto(gs[[1]], aes("hCD8A"), subset = node_names[[3]]) +
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.