suppressPackageStartupMessages({
    library(flowCore)
    library(flowWorkspace)
    library(ggcyto)
    library(ape)
    library(dplyr)
    library(tidyr)
    library(tibble)
    library(TreeSummarizedExperiment)
    library(TreeHeatmap)
    library(treeclimbR)
    library(ggtree)
    library(cowplot)  
})
## Warning: replacing previous import 'data.table:::=' by 'ggplot2:::=' when
## loading 'ggcyto'

Starting with a FAUST-generated GatingSet

This is the full FAUST-generated GatingSet for a subset of samples from ImmPort SDY212, using 5 markers.

gs <- load_gs(file.path(basedir, "gs5", "gs_faust"))
gs
## A GatingSet with 186 samples
plot(gs)

GatingSet to TreeSummarizedExperiment

This will roughly follow some of the same steps as the treeclimbR toy example, for the sake of comparison.

flowWorkspace::gstreeexperiment yields TreeSummarizedExperiment with counts by leaf

tse <- gstreeexperiment(gs)
tse
## class: TreeSummarizedExperiment 
## dim: 29 186 
## metadata(0):
## assays(1): ''
## rownames(29): hCD4-/hCD27-/hCD3E-/hCD8A-/hPTPRC_iso:h6-
##   hCD4-/hCD27-/hCD3E-/hCD8A-/hPTPRC_iso:h6+ ...
##   hCD4+/hCD27+/hCD3E+/hCD8A+/hPTPRC_iso:h6-
##   hCD4+/hCD27+/hCD3E+/hCD8A+/hPTPRC_iso:h6+
## rowData names(0):
## colnames(186): 011V3_B11_B11.391209.fcs 015V3_E5_E05.391227.fcs ...
##   15097_D2_D02.454436.fcs 15098_D3_D03.454446.fcs
## colData names(16): subtype race ... name file_info_name
## reducedDimNames(0):
## altExpNames(0):
## rowLinks: a LinkDataFrame (29 rows)
## rowTree: a phylo (29 leaves)
## colLinks: NULL
## colTree: NULL

Some glimpses of the structure.

# rows are leaves, columns are samples, values are counts
assays(tse)[[1]][1:5, 1:5]
##                                           011V3_B11_B11.391209.fcs
## hCD4-/hCD27-/hCD3E-/hCD8A-/hPTPRC_iso:h6-                      182
## hCD4-/hCD27-/hCD3E-/hCD8A-/hPTPRC_iso:h6+                     1565
## hCD4-/hCD27-/hCD3E-/hCD8A+/hPTPRC_iso:h6-                        3
## hCD27-/hCD3E-/hCD8A+/hPTPRC_iso:h6+                            374
## hCD4-/hCD27-/hCD3E+/hCD8A-/hPTPRC_iso:h6-                        6
##                                           015V3_E5_E05.391227.fcs
## hCD4-/hCD27-/hCD3E-/hCD8A-/hPTPRC_iso:h6-                      50
## hCD4-/hCD27-/hCD3E-/hCD8A-/hPTPRC_iso:h6+                    1590
## hCD4-/hCD27-/hCD3E-/hCD8A+/hPTPRC_iso:h6-                       1
## hCD27-/hCD3E-/hCD8A+/hPTPRC_iso:h6+                           260
## hCD4-/hCD27-/hCD3E+/hCD8A-/hPTPRC_iso:h6-                       8
##                                           024V3_D5_D05.391239.fcs
## hCD4-/hCD27-/hCD3E-/hCD8A-/hPTPRC_iso:h6-                     209
## hCD4-/hCD27-/hCD3E-/hCD8A-/hPTPRC_iso:h6+                     733
## hCD4-/hCD27-/hCD3E-/hCD8A+/hPTPRC_iso:h6-                       3
## hCD27-/hCD3E-/hCD8A+/hPTPRC_iso:h6+                           408
## hCD4-/hCD27-/hCD3E+/hCD8A-/hPTPRC_iso:h6-                       6
##                                           049V3_C5_C05.391251.fcs
## hCD4-/hCD27-/hCD3E-/hCD8A-/hPTPRC_iso:h6-                     129
## hCD4-/hCD27-/hCD3E-/hCD8A-/hPTPRC_iso:h6+                    1117
## hCD4-/hCD27-/hCD3E-/hCD8A+/hPTPRC_iso:h6-                       9
## hCD27-/hCD3E-/hCD8A+/hPTPRC_iso:h6+                           606
## hCD4-/hCD27-/hCD3E+/hCD8A-/hPTPRC_iso:h6-                      25
##                                           078V3_D11_D11.391257.fcs
## hCD4-/hCD27-/hCD3E-/hCD8A-/hPTPRC_iso:h6-                       59
## hCD4-/hCD27-/hCD3E-/hCD8A-/hPTPRC_iso:h6+                      866
## hCD4-/hCD27-/hCD3E-/hCD8A+/hPTPRC_iso:h6-                        0
## hCD27-/hCD3E-/hCD8A+/hPTPRC_iso:h6+                             34
## hCD4-/hCD27-/hCD3E+/hCD8A-/hPTPRC_iso:h6-                        8
# colData has the pData for each sample
colData(tse)[1:5,]
## DataFrame with 5 rows and 16 columns
##                              subtype        race study_accession
##                          <character> <character>     <character>
## 011V3_B11_B11.391209.fcs          NA       White          SDY212
## 015V3_E5_E05.391227.fcs           NA       White          SDY212
## 024V3_D5_D05.391239.fcs           NA       White          SDY212
## 049V3_C5_C05.391251.fcs           NA       Asian          SDY212
## 078V3_D11_D11.391257.fcs          NA       Other          SDY212
##                          study_time_collected       age_reported      gender
##                                   <character>        <character> <character>
## 011V3_B11_B11.391209.fcs                   28 Age at Study Day 0      Female
## 015V3_E5_E05.391227.fcs                    28 Age at Study Day 0        Male
## 024V3_D5_D05.391239.fcs                    28 Age at Study Day 0        Male
## 049V3_C5_C05.391251.fcs                    28 Age at Study Day 0        Male
## 078V3_D11_D11.391257.fcs                   28 Age at Study Day 0      Female
##                            to_impute       batch participant_id
##                          <character> <character>    <character>
## 011V3_B11_B11.391209.fcs       FALSE  FLU 021209      SUB134243
## 015V3_E5_E05.391227.fcs        FALSE  FLU 021209      SUB134247
## 024V3_D5_D05.391239.fcs        FALSE  FLU 021209      SUB134255
## 049V3_C5_C05.391251.fcs        FALSE  FLU 021209      SUB134277
## 078V3_D11_D11.391257.fcs       FALSE  FLU 021209      SUB134306
##                          study_time_collected_unit      cohort     imputed
##                                        <character> <character> <character>
## 011V3_B11_B11.391209.fcs                      Days    Cohort_2       FALSE
## 015V3_E5_E05.391227.fcs                       Days    Cohort_2       FALSE
## 024V3_D5_D05.391239.fcs                       Days    Cohort_2       FALSE
## 049V3_C5_C05.391251.fcs                       Days    Cohort_1       FALSE
## 078V3_D11_D11.391257.fcs                      Days    Cohort_1       FALSE
##                                    description        type
##                                    <character> <character>
## 011V3_B11_B11.391209.fcs Flow cytometry result Whole blood
## 015V3_E5_E05.391227.fcs  Flow cytometry result Whole blood
## 024V3_D5_D05.391239.fcs  Flow cytometry result Whole blood
## 049V3_C5_C05.391251.fcs  Flow cytometry result Whole blood
## 078V3_D11_D11.391257.fcs Flow cytometry result Whole blood
##                                            name         file_info_name
##                                     <character>            <character>
## 011V3_B11_B11.391209.fcs 011V3_B11_B11.391209.. 011V3_B11_B11.391209..
## 015V3_E5_E05.391227.fcs  015V3_E5_E05.391227... 015V3_E5_E05.391227...
## 024V3_D5_D05.391239.fcs  024V3_D5_D05.391239... 024V3_D5_D05.391239...
## 049V3_C5_C05.391251.fcs  049V3_C5_C05.391251... 049V3_C5_C05.391251...
## 078V3_D11_D11.391257.fcs 078V3_D11_D11.391257.. 078V3_D11_D11.391257..

This is built on the phylo object whose components are built by cytolib and wrapped up into ape::phylo object by flowWorkspace::gs_get_phylo,

tr <- tse@rowTree$phylo
tr
## 
## Phylogenetic tree with 29 tips and 35 internal nodes.
## 
## Tip labels:
##   hCD4-/hCD27-/hCD3E-/hCD8A-/hPTPRC_iso:h6-, hCD4-/hCD27-/hCD3E-/hCD8A-/hPTPRC_iso:h6+, hCD4-/hCD27-/hCD3E-/hCD8A+/hPTPRC_iso:h6-, hCD27-/hCD3E-/hCD8A+/hPTPRC_iso:h6+, hCD4-/hCD27-/hCD3E+/hCD8A-/hPTPRC_iso:h6-, hCD4-/hCD27-/hCD3E+/hCD8A-/hPTPRC_iso:h6+, ...
## 
## Rooted; no branch lengths.

Aggregate counts for use by treeclimbR.

all_node <- showNode(tree = tr, only.leaf = FALSE)
tse_agg <- aggValue(x = tse, rowLevel = all_node, FUN = sum)

Basic Plots

# Simple phylo plot with FAUST-generated gating paths as leaf names
plot(tr, cex = 0.5)

# For use as skeleton in later plots
simple_treefig <- ggtree(tr = tr, layout = "rectangular")
simple_treefig

There are 186 samples in two cohorts

table(colData(tse_agg)$cohort)
## 
## Cohort_1 Cohort_2 
##       68      118

Just to get the gist, subsample 10 from each cohort and plot tree with heatmap.

set.seed(42)

# TreeHeatMap view
count <- assays(tse)[[1]]

# Subsampling to make heatmap less unruly
cohort_1_sample <- sample(which(colData(tse)$cohort == "Cohort_1"), 10)
cohort_2_sample <- sample(which(colData(tse)$cohort == "Cohort_2"), 10)
count_small <- count[,c(cohort_1_sample, cohort_2_sample)]
cohorts <- colData(tse)[c(cohort_1_sample, cohort_2_sample), "cohort"]

# scale counts
scale_count <- t(apply(count_small, 1, FUN = function(x) {
    xx <- x
    rx <- (max(xx)-min(xx))
    (xx - min(xx))/max(rx, 1)
}))
rownames(scale_count) <- rownames(count_small)
colnames(scale_count) <- colnames(count_small)

# tree + heatmap
vv <- cohorts
names(vv) <- colnames(scale_count)
anno_c <- vv
names(anno_c) <- vv
tree_heatmap <- TreeHeatmap(tree = tr, tree_fig = simple_treefig, 
                   hm_data = scale_count,
                   column_split = vv, rel_width = 0.6,
                   tree_hm_gap = 0.3,
                   column_split_label = anno_c) +
                   scale_fill_viridis_c(option = "B") +
                   scale_y_continuous(expand = c(0, 10))
## Scale for 'fill' is already present. Adding another scale for 'fill', which
## will replace the existing scale.
## Scale for 'y' is already present. Adding another scale for 'y', which will
## replace the existing scale.
tree_heatmap

treeclimbR

Following treeclimbR example, just using Wilcoxon two-sample rank sum test on all nodes for differential abundance between the two cohorts.

test.func <- function(X, Y) {
    Y <- as.numeric(factor(Y))
    obj <- apply(X, 1, function(x) {                
        p.value <- suppressWarnings(wilcox.test(x ~ Y)$p.value)
        e.sign <- sign(mean(x[Y == 2]) - mean(x[Y == 1]))
        c(p.value, e.sign)          
    })
    return(list(p.value=obj[1, ], e.sign=obj[2, ])) 
}

Y <- colData(tse_agg)$cohort
X <- assays(tse_agg)[[1]]
resW <- test.func(X,Y)
outW <- data.frame(node = rowLinks(tse_agg)$nodeNum,
                   pvalue = resW$p.value,
                   sign = resW$e.sign)

cand <- suppressMessages(getCand(tree = rowTree(tse_agg), score_data = outW, 
                        node_column = "node", p_column = "pvalue",
                        threshold = 0.05,
                        sign_column = "sign", message = TRUE))

best <- evalCand(tree = rowTree(tse_agg), levels = cand$candidate_list, 
                 score_data = outW, node_column = "node",
                 p_column = "pvalue", sign_column = "sign")
infoCand(object = best)
##       t   upper_t is_valid method limit_rej level_name  best rej_leaf rej_node
## 1  0.00 0.0200000     TRUE     BH      0.05          0 FALSE        6        6
## 2  0.01 0.0200000     TRUE     BH      0.05       0.01 FALSE        6        5
## 3  0.02 0.0200000    FALSE     BH      0.05       0.02 FALSE        6        5
## 4  0.03 0.0200000    FALSE     BH      0.05       0.03 FALSE        6        5
## 5  0.04 0.0200000    FALSE     BH      0.05       0.04 FALSE        6        5
## 6  0.05 0.0400000    FALSE     BH      0.05       0.05 FALSE        7        5
## 7  0.10 0.1333333     TRUE     BH      0.05        0.1  TRUE        7        3
## 8  0.15 0.1333333    FALSE     BH      0.05       0.15 FALSE        7        3
## 9  0.20 0.1333333    FALSE     BH      0.05        0.2 FALSE        7        3
## 10 0.25 0.1333333    FALSE     BH      0.05       0.25 FALSE        7        3
## 11 0.30 0.1333333    FALSE     BH      0.05        0.3 FALSE        7        3
## 12 0.35 0.1333333    FALSE     BH      0.05       0.35 FALSE        7        3
## 13 0.40 0.1333333    FALSE     BH      0.05        0.4 FALSE        7        3
## 14 0.45 0.3500000    FALSE     BH      0.05       0.45 FALSE        9        2
## 15 0.50 0.3500000    FALSE     BH      0.05        0.5 FALSE        9        2
## 16 0.55 0.3500000    FALSE     BH      0.05       0.55 FALSE        9        2
## 17 0.60 0.3500000    FALSE     BH      0.05        0.6 FALSE        9        2
## 18 0.65 0.3500000    FALSE     BH      0.05       0.65 FALSE        9        2
## 19 0.70 0.3500000    FALSE     BH      0.05        0.7 FALSE        9        2
## 20 0.75 0.3500000    FALSE     BH      0.05       0.75 FALSE        9        2
## 21 0.80 0.3500000    FALSE     BH      0.05        0.8 FALSE        9        2
## 22 0.85 0.3500000    FALSE     BH      0.05       0.85 FALSE        9        2
## 23 0.90 0.7000000    FALSE     BH      0.05        0.9 FALSE        8        1
## 24 0.95 0.7000000    FALSE     BH      0.05       0.95 FALSE        8        1
## 25 1.00 0.7000000    FALSE     BH      0.05          1 FALSE        8        1
out_best <- topNodes(object = best, n = Inf, p_value = 0.05)

The number of nodes in each candidate.

cand_list <- cand$candidate_list
unlist(lapply(cand_list, length))
##    0 0.01 0.02 0.03 0.04 0.05  0.1 0.15  0.2 0.25  0.3 0.35  0.4 0.45  0.5 0.55 
##   29   28   28   28   28   27   24   24   24   24   24   24   24   21   21   21 
##  0.6 0.65  0.7 0.75  0.8 0.85  0.9 0.95    1 
##   21   21   21   21   21   21   20   20   20

Plots of nodes in each candidate.

candidate_plots <- lapply(seq_along(cand_list), FUN = function(x) {
cand.x <- cand_list[[x]]
fig.x <- simple_treefig +
         geom_point2(aes(subset = (node %in% cand.x)), color = "navy", size = 0.5) +
         labs(title = names(cand_list)[x]) +
         theme(legend.position = "none",
         plot.title = element_text(color="navy", size=7, 
                                   hjust = 0.5, vjust = -0.08))
})

legend <- get_legend(simple_treefig)
plot_grid(plotlist = c(candidate_plots, list(legend)), nrow = 3,
          labels = paste0(letters[seq_along(cand_list)], "."), 
          label_size = 9, label_y = 0.99)

Plot of tree highlighting top nodes from treeclimbR along with their branches.

loc_tree <- out_best$node

br <- unlist(findOS(tree = tr, node = loc_tree, only.leaf = FALSE, self.include = TRUE))
df_color <- data.frame(node = showNode(tree = tr, only.leaf = FALSE)) %>%
            mutate(signal = ifelse(node %in% br, "YES", "NO"))
final <- ggtree(tr = tr, layout = "rectangular", branch.length = "none", aes(color = signal)) %<+% 
                df_color +
                scale_color_manual(values = c("NO" = "grey", "YES" = "orange")) + 
                geom_point2(aes(subset = node %in% loc_tree),
                color = "red")
final

Pulling information from the GatingSet for those top nodes.

These node indices are in the phylo object indexing scheme

out_best$node
## [1]  5 47 48

node_idx_map attached to the phylo by gs_get_phylo provides the ability to get the original GatingSet nodes.

idx_map <- attr(tse_agg@rowTree$phylo, "node_idx_map")
idx_map
##    phylo_idx gs_idx                                node_names is_leaf
## 1          1     10 hCD4-/hCD27-/hCD3E-/hCD8A-/hPTPRC_iso:h6-    TRUE
## 2          2     11 hCD4-/hCD27-/hCD3E-/hCD8A-/hPTPRC_iso:h6+    TRUE
## 3          3     13 hCD4-/hCD27-/hCD3E-/hCD8A+/hPTPRC_iso:h6-    TRUE
## 4          4     14       hCD27-/hCD3E-/hCD8A+/hPTPRC_iso:h6+    TRUE
## 5          5     17 hCD4-/hCD27-/hCD3E+/hCD8A-/hPTPRC_iso:h6-    TRUE
## 6          6     18 hCD4-/hCD27-/hCD3E+/hCD8A-/hPTPRC_iso:h6+    TRUE
## 7          7     20 hCD4-/hCD27-/hCD3E+/hCD8A+/hPTPRC_iso:h6-    TRUE
## 8          8     21 hCD4-/hCD27-/hCD3E+/hCD8A+/hPTPRC_iso:h6+    TRUE
## 9          9     25 hCD4-/hCD27+/hCD3E-/hCD8A-/hPTPRC_iso:h6-    TRUE
## 10        10     26 hCD4-/hCD27+/hCD3E-/hCD8A-/hPTPRC_iso:h6+    TRUE
## 11        11     28       hCD27+/hCD3E-/hCD8A+/hPTPRC_iso:h6-    TRUE
## 12        12     29       hCD27+/hCD3E-/hCD8A+/hPTPRC_iso:h6+    TRUE
## 13        13     32 hCD4-/hCD27+/hCD3E+/hCD8A-/hPTPRC_iso:h6-    TRUE
## 14        14     33 hCD4-/hCD27+/hCD3E+/hCD8A-/hPTPRC_iso:h6+    TRUE
## 15        15     35 hCD4-/hCD27+/hCD3E+/hCD8A+/hPTPRC_iso:h6-    TRUE
## 16        16     36 hCD4-/hCD27+/hCD3E+/hCD8A+/hPTPRC_iso:h6+    TRUE
## 17        17     41 hCD4+/hCD27-/hCD3E-/hCD8A-/hPTPRC_iso:h6-    TRUE
## 18        18     42 hCD4+/hCD27-/hCD3E-/hCD8A-/hPTPRC_iso:h6+    TRUE
## 19        19     44 hCD4+/hCD27-/hCD3E-/hCD8A+/hPTPRC_iso:h6-    TRUE
## 20        20     47 hCD4+/hCD27-/hCD3E+/hCD8A-/hPTPRC_iso:h6-    TRUE
## 21        21     48 hCD4+/hCD27-/hCD3E+/hCD8A-/hPTPRC_iso:h6+    TRUE
## 22        22     50 hCD4+/hCD27-/hCD3E+/hCD8A+/hPTPRC_iso:h6-    TRUE
## 23        23     51 hCD4+/hCD27-/hCD3E+/hCD8A+/hPTPRC_iso:h6+    TRUE
## 24        24     55 hCD4+/hCD27+/hCD3E-/hCD8A-/hPTPRC_iso:h6-    TRUE
## 25        25     56 hCD4+/hCD27+/hCD3E-/hCD8A-/hPTPRC_iso:h6+    TRUE
## 26        26     59 hCD4+/hCD27+/hCD3E+/hCD8A-/hPTPRC_iso:h6-    TRUE
## 27        27     60 hCD4+/hCD27+/hCD3E+/hCD8A-/hPTPRC_iso:h6+    TRUE
## 28        28     62 hCD4+/hCD27+/hCD3E+/hCD8A+/hPTPRC_iso:h6-    TRUE
## 29        29     63 hCD4+/hCD27+/hCD3E+/hCD8A+/hPTPRC_iso:h6+    TRUE
## 30        30      0                                      root   FALSE
## 31        31      1                                       pos   FALSE
## 32        32      2                                     SCFSC   FALSE
## 33        33      3                                     SCSSC   FALSE
## 34        34      4                                 Nondebris   FALSE
## 35        35      5                               Lymphocytes   FALSE
## 36        36      6                                     hCD4-   FALSE
## 37        37      7                              hCD4-/hCD27-   FALSE
## 38        38      8                       hCD4-/hCD27-/hCD3E-   FALSE
## 39        39      9                hCD4-/hCD27-/hCD3E-/hCD8A-   FALSE
## 40        40     12                hCD4-/hCD27-/hCD3E-/hCD8A+   FALSE
## 41        41     15                       hCD4-/hCD27-/hCD3E+   FALSE
## 42        42     16                hCD4-/hCD27-/hCD3E+/hCD8A-   FALSE
## 43        43     19                hCD4-/hCD27-/hCD3E+/hCD8A+   FALSE
## 44        44     22                              hCD4-/hCD27+   FALSE
## 45        45     23                       hCD4-/hCD27+/hCD3E-   FALSE
## 46        46     24                hCD4-/hCD27+/hCD3E-/hCD8A-   FALSE
## 47        47     27                      hCD27+/hCD3E-/hCD8A+   FALSE
## 48        48     30                       hCD4-/hCD27+/hCD3E+   FALSE
## 49        49     31                hCD4-/hCD27+/hCD3E+/hCD8A-   FALSE
## 50        50     34                hCD4-/hCD27+/hCD3E+/hCD8A+   FALSE
## 51        51     37                                     hCD4+   FALSE
## 52        52     38                              hCD4+/hCD27-   FALSE
## 53        53     39                       hCD4+/hCD27-/hCD3E-   FALSE
## 54        54     40                hCD4+/hCD27-/hCD3E-/hCD8A-   FALSE
## 55        55     43                hCD4+/hCD27-/hCD3E-/hCD8A+   FALSE
## 56        56     45                       hCD4+/hCD27-/hCD3E+   FALSE
## 57        57     46                hCD4+/hCD27-/hCD3E+/hCD8A-   FALSE
## 58        58     49                hCD4+/hCD27-/hCD3E+/hCD8A+   FALSE
## 59        59     52                              hCD4+/hCD27+   FALSE
## 60        60     53                       hCD4+/hCD27+/hCD3E-   FALSE
## 61        61     54                hCD4+/hCD27+/hCD3E-/hCD8A-   FALSE
## 62        62     57                       hCD4+/hCD27+/hCD3E+   FALSE
## 63        63     58                hCD4+/hCD27+/hCD3E+/hCD8A-   FALSE
## 64        64     61                hCD4+/hCD27+/hCD3E+/hCD8A+   FALSE
node_names <- idx_map[idx_map$phylo_idx %in% out_best$node, "node_names"]
node_names
## [1] "hCD4-/hCD27-/hCD3E+/hCD8A-/hPTPRC_iso:h6-"
## [2] "hCD27+/hCD3E-/hCD8A+"                     
## [3] "hCD4-/hCD27+/hCD3E+"

Those node names are valid for indexing in to the original GatingSet and can be used accordingly.

gs_pop_get_stats(gs[[1]], node_names)
##                      sample                                       pop count
## 1: 011V3_B11_B11.391209.fcs hCD4-/hCD27-/hCD3E+/hCD8A-/hPTPRC_iso:h6-     6
## 2: 011V3_B11_B11.391209.fcs                      hCD27+/hCD3E-/hCD8A+    44
## 3: 011V3_B11_B11.391209.fcs                       hCD4-/hCD27+/hCD3E+  1606
ggcyto(gs[[1]], aes("hCD8A"), subset = node_names[[3]]) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.