Overview

Within this B-ALL model, two propagating populations have been characterised by bulk RNAseq: LSK_IL7R and LK_CLP. This script uses differential gene signatures from bulk RNAseq comparisons (Supplemental Tables S4 and S5) to score and annotate single-cell clusters in the leukaemia subset.

Analysis Strategy

BM and CNS leukaemia populations are clustered independently to allow niche-specific transcriptional states to be resolved without one compartment dominating the clustering. Signature scores are then applied to the cognate compartment:

  • The BM-derived LSK_IL7R vs LK/CLP signature (Supplemental Table S4) is applied to BM clusters
  • The CNS-derived LSK_IL7R vs LK/CLP signature (Supplemental Table S5) is applied to CNS clusters

This approach avoids potential confounding from tissue-specific transcriptional programmes unrelated to leukaemia propagating population identity.


Setup

library(Seurat)
## Loading required package: SeuratObject
## Loading required package: sp
## 
## Attaching package: 'SeuratObject'
## The following objects are masked from 'package:base':
## 
##     intersect, t
library(ggplot2)
library(patchwork)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(clustree)
## Loading required package: ggraph
## 
## Attaching package: 'ggraph'
## The following object is masked from 'package:sp':
## 
##     geometry
library(viridis)
## Loading required package: viridisLite
library(knitr)
library(qs2)
## qs2 0.1.7

Load data

leuk_obj <- qs_read("/exports/eddie/scratch/aduguid3/harmony_clustering/03_harmony_recluster_leuk_only_cellcycleanno_march26.qs")
# Tissue colours used consistently throughout
tissue_cols <- c("BM" = "steelblue", "CNS" = "firebrick")

# PCA dims — consistent with original clustering of parent object
pca_dims <- 1:25

# Resolutions to test — same range as parent object
resolutions <- c(0.01, 0.04, 0.05, seq(0.1, 1, by = 0.1))
# Confirm leukaemia object is present
# If starting a fresh session, load with:
# leuk_obj <- readRDS("leuk_obj.rds")

stopifnot("leuk_obj not found - please load object" = exists("leuk_obj"))

cat("Leukaemia object loaded\n")
## Leukaemia object loaded
cat("Total cells : ", ncol(leuk_obj),                    "\n")
## Total cells :  91826
cat("BM cells    : ", sum(leuk_obj$Tissue == "BM"),  "\n")
## BM cells    :  44606
cat("CNS cells   : ", sum(leuk_obj$Tissue == "CNS"), "\n")
## CNS cells   :  47220

Split Object by Tissue

BM and CNS leukaemia cells are separated into independent Seurat objects for tissue-specific clustering and signature scoring.

leuk_BM  <- subset(leuk_obj, subset = Tissue == "BM")
leuk_CNS <- subset(leuk_obj, subset = Tissue == "CNS")

cat("BM leukaemia cells :", ncol(leuk_BM),  "\n")
## BM leukaemia cells : 44606
cat("CNS leukaemia cells:", ncol(leuk_CNS), "\n")
## CNS leukaemia cells: 47220

Re-clustering by Tissue

Each tissue compartment is independently normalised, dimensionality-reduced, and clustered across multiple resolutions. The same pipeline used for the parent object is applied here to ensure consistency.

recluster <- function(obj, dims = pca_dims, res = resolutions) {

  obj <- FindVariableFeatures(obj,
                               selection.method = "vst",
                               nfeatures        = 2000)

  obj <- ScaleData(obj, verbose = FALSE)

  obj <- RunPCA(obj, verbose = FALSE)

  obj <- RunUMAP(obj, dims = dims)

  obj <- FindNeighbors(obj, reduction = "pca", dims = dims)

  obj <- FindClusters(obj, resolution = res)

  return(obj)
}
bm_qs_path <- "/exports/eddie/scratch/aduguid3/harmony_clustering/04_harmony_recluster_leuk_BM_march26.qs"

file_valid <- file.exists(bm_qs_path) && 
              tryCatch({ qs_read(bm_qs_path); TRUE }, 
                       error = function(e) FALSE)

if (!file_valid) {
  leuk_BM <- recluster(leuk_BM)
  qs_save(leuk_BM, bm_qs_path)
  
  cluster_columns <- grep("snn_res\\.", names(leuk_BM@meta.data), value = TRUE)
  srt_clusters_metadata_BM <- leuk_BM[[cluster_columns]]
  qs_save(srt_clusters_metadata_BM,
          "/exports/eddie/scratch/aduguid3/harmony_clustering/04_harmony_recluster_leuk_BM_march26_metadata.qs")
} else {
  leuk_BM <- qs_read(bm_qs_path)
}

bm_cluster_cols <- grep("snn_res\\.", names(leuk_BM@meta.data), value = TRUE)
cat("BM reclustering complete\n")
## BM reclustering complete
cat("Resolutions available:\n")
## Resolutions available:
print(bm_cluster_cols)
##  [1] "originalexp_snn_res.0.5"  "originalexp_snn_res.0.01"
##  [3] "originalexp_snn_res.0.04" "originalexp_snn_res.0.05"
##  [5] "originalexp_snn_res.0.1"  "originalexp_snn_res.0.2" 
##  [7] "originalexp_snn_res.0.3"  "originalexp_snn_res.0.4" 
##  [9] "originalexp_snn_res.0.6"  "originalexp_snn_res.0.7" 
## [11] "originalexp_snn_res.0.8"  "originalexp_snn_res.0.9" 
## [13] "originalexp_snn_res.1"
# Save clustered srt and metadata
qs_save(leuk_BM, "/exports/eddie/scratch/aduguid3/harmony_clustering/04_harmony_recluster_leuk_BM_march26.qs")

# Extract clustering metadata
cluster_columns <- grep("snn_res\\.", names(leuk_BM@meta.data), value = TRUE)
srt_clusters_metadata_BM <- leuk_BM[[cluster_columns]]
qs_save(
  srt_clusters_metadata_BM,
  file = "/exports/eddie/scratch/aduguid3/harmony_clustering/04_harmony_recluster_leuk_BM_march26._metadata.qs")
cns_qs_path <- "/exports/eddie/scratch/aduguid3/harmony_clustering/04_harmony_recluster_leuk_CNS_march26.qs"

file_valid <- file.exists(cns_qs_path) && 
              tryCatch({ qs_read(cns_qs_path); TRUE }, 
                       error = function(e) FALSE)

if (!file_valid) {
  leuk_CNS <- recluster(leuk_CNS)
  qs_save(leuk_CNS, cns_qs_path)
  
  cluster_columns <- grep("snn_res\\.", names(leuk_CNS@meta.data), value = TRUE)
  srt_clusters_metadata_CNS <- leuk_CNS[[cluster_columns]]
  qs_save(srt_clusters_metadata_CNS,
          "/exports/eddie/scratch/aduguid3/harmony_clustering/04_harmony_recluster_leuk_CNS_march26_metadata.qs")
} else {
  leuk_CNS <- qs_read(cns_qs_path)
}

cns_cluster_cols <- grep("snn_res\\.", names(leuk_CNS@meta.data), value = TRUE)
cat("CNS reclustering complete\n")
## CNS reclustering complete
cat("Resolutions available:\n")
## Resolutions available:
print(cns_cluster_cols)
##  [1] "originalexp_snn_res.0.5"  "originalexp_snn_res.0.01"
##  [3] "originalexp_snn_res.0.04" "originalexp_snn_res.0.05"
##  [5] "originalexp_snn_res.0.1"  "originalexp_snn_res.0.2" 
##  [7] "originalexp_snn_res.0.3"  "originalexp_snn_res.0.4" 
##  [9] "originalexp_snn_res.0.6"  "originalexp_snn_res.0.7" 
## [11] "originalexp_snn_res.0.8"  "originalexp_snn_res.0.9" 
## [13] "originalexp_snn_res.1"
# Save clustered srt and metadata
qs_save(leuk_CNS, "/exports/eddie/scratch/aduguid3/harmony_clustering/04_harmony_recluster_leuk_CNS_march26.qs")

# Extract clustering metadata
cluster_columns <- grep("snn_res\\.", names(leuk_CNS@meta.data), value = TRUE)
srt_clusters_metadata_CNS <- leuk_CNS[[cluster_columns]]
qs_save(
  srt_clusters_metadata_CNS,
  file = "/exports/eddie/scratch/aduguid3/harmony_clustering/04_harmony_recluster_leuk_CNS_march26._metadata.qs")

Resolution Selection

BM — Clustree Stability Analysis

# Check your actual column names
bm_cluster_cols <- grep("snn_res\\.", names(leuk_BM@meta.data), value = TRUE)
print(bm_cluster_cols)
##  [1] "originalexp_snn_res.0.5"  "originalexp_snn_res.0.01"
##  [3] "originalexp_snn_res.0.04" "originalexp_snn_res.0.05"
##  [5] "originalexp_snn_res.0.1"  "originalexp_snn_res.0.2" 
##  [7] "originalexp_snn_res.0.3"  "originalexp_snn_res.0.4" 
##  [9] "originalexp_snn_res.0.6"  "originalexp_snn_res.0.7" 
## [11] "originalexp_snn_res.0.8"  "originalexp_snn_res.0.9" 
## [13] "originalexp_snn_res.1"
# Extract prefix by removing the trailing number correctly
# This removes everything from the last digit sequence onwards
bm_prefix <- gsub("[0-9]+\\.?[0-9]*$", "", bm_cluster_cols[1])
cat("BM prefix:", bm_prefix, "\n")
## BM prefix: originalexp_snn_res.
# Do the same for CNS
cns_cluster_cols <- grep("snn_res\\.", names(leuk_CNS@meta.data), value = TRUE)
cns_prefix <- gsub("[0-9]+\\.?[0-9]*$", "", cns_cluster_cols[1])
cat("CNS prefix:", cns_prefix, "\n")
## CNS prefix: originalexp_snn_res.
# Now run clustree
bm_cluster_meta  <- leuk_BM[[bm_cluster_cols]]
cns_cluster_meta <- leuk_CNS[[cns_cluster_cols]]

clustree(bm_cluster_meta,  prefix = bm_prefix,  edge_arrow = FALSE) +
  ggtitle("BM leukaemia — resolution stability")
Clustree stability analysis for BM leukaemia subset

Clustree stability analysis for BM leukaemia subset

clustree(cns_cluster_meta, prefix = cns_prefix, edge_arrow = FALSE) +
  ggtitle("CNS leukaemia — resolution stability")
Clustree stability analysis for BM leukaemia subset

Clustree stability analysis for BM leukaemia subset

Resolution Selection — BM Leukaemia

Resolution 0.3 seems about right.

Resolution Selection — CNS Leukaemia

Clustree analysis of the CNS leukaemia subset revealed a finding: two completely separate cluster trees emerge from as early as res 0.04, indicating the presence of two highly transcriptionally distinct major populations within the CNS leukaemia compartment. This degree of separation — maintained from the lowest resolutions — suggests these are not subtle sub-states but fundamentally different cell populations.

The larger left tree fragments progressively with increasing resolution, with cross-arrows appearing from res 0.4 onwards indicating instability at higher resolutions. The smaller right tree remains comparatively stable throughout. Based on this analysis, res 0.3 was selected for CNS leukaemia as it captures both major branches and their early sub-structure without over-fragmenting into unstable clusters.

The biological identity of these two populations requires further investigation. Possibilities include:

  • LSK_IL7R vs LK_CLP — the two known propagating populations in this model, which may be transcriptionally more divergent in the CNS niche than in BM
  • CNS-adapted leukaemia vs BM-contaminating leukaemia — cells that have genuinely colonised and adapted to the CNS niche vs cells recently arrived from BM that retain a BM-like transcriptional programme
  • A CNS-specific leukaemia state — a novel transcriptional programme not represented in the BM compartment

Distinguishing between these possibilities is the primary objective of the downstream analysis.

Unpicking the divergent CNS transcriptional states

# Both tissues — res 0.3
bm_res_col  <- "originalexp_snn_res.0.3"
cns_res_col <- "originalexp_snn_res.0.3"

Idents(leuk_BM)  <- bm_res_col
Idents(leuk_CNS) <- cns_res_col

# Immediately check the two-tree structure in CNS
DimPlot(leuk_CNS,
         reduction  = "umap",
         label      = TRUE,
         label.size = 3) +
  ggtitle("CNS leukaemia — res 0.3")

# 1. Sample distribution — does each island contain all 6 mice?
DimPlot(leuk_CNS, 
        reduction = "umap",
        group.by  = "Mouse_ID") +
  ggtitle("CNS — by sample")

# 2. GFP expression — are both islands GFP+?
FeaturePlot(leuk_CNS,
             features   = "GFP",
             cols       = c("lightgrey", "darkgreen"),
             order      = TRUE,
             min.cutoff = 0) +
  ggtitle("CNS — GFP expression")

# 3. Cell counts per island per mouse
# Label islands manually based on UMAP
leuk_CNS$umap_island <- case_when(
  leuk_CNS$originalexp_snn_res.0.3 %in% 
    c("0", "1", "8", "9", "10") ~ "Right_island",
  leuk_CNS$originalexp_snn_res.0.3 %in% 
    c("2", "3", "4", "5", "6", "7") ~ "Left_island",
  TRUE ~ "Other"
)

table(leuk_CNS$Mouse_ID, leuk_CNS$umap_island)
##          
##           Left_island Other Right_island
##   1838161        3634    12         5034
##   1838162        3286     6         5729
##   1838163        2577     3         3962
##   1838171        1121    14         6291
##   1838279        8391     4          102
##   1838280        6905     0          149

Interpretation - it’s the mice!

Samples 1838279 and 1838280 are almost entirely in the left island — fewer than 150 cells each in the right island. The other four samples distribute across both islands relatively normally.

Is this distribution also try for the BM samples

# Add island labels to BM using same cluster logic
leuk_BM$umap_island <- case_when(
  leuk_BM[[bm_res_col]] %in% c("0", "1", "8", "9", "10") ~ "Right_island",
  leuk_BM[[bm_res_col]] %in% c("2", "3", "4", "5", "6", "7") ~ "Left_island",
  TRUE ~ "Other"
)

# Check same samples in BM
table(leuk_BM$Mouse_ID, leuk_BM$umap_island)
##          
##           Other
##   1838161  5063
##   1838162  7390
##   1838163  5084
##   1838171  7078
##   1838279  9927
##   1838280 10064
# Visual check
DimPlot(leuk_BM, 
        reduction = "umap",
        group.by  = "Mouse_ID") +
  ggtitle("BM — by sample")

# Check QC metrics - rule out technical differences
leuk_BM@meta.data %>%
  group_by(Mouse_ID) %>%
  summarise(
    n_cells       = n(),
    mean_nFeature = round(mean(nFeature_originalexp), 0),
    mean_nCount   = round(mean(nCount_originalexp), 0),
    mean_mt       = round(mean(subsets_mt_percent), 2),
    mean_ribo     = round(mean(subsets_ribo_pct), 2),
    .groups       = "drop"
  ) %>% knitr::kable()
Mouse_ID n_cells mean_nFeature mean_nCount mean_mt mean_ribo
1838161 5063 4618 20197 2.56 24.18
1838162 7390 4326 18343 2.34 26.72
1838163 5084 4338 18092 2.55 26.21
1838171 7078 4511 18854 2.36 24.17
1838279 9927 4910 21618 2.44 23.12
1838280 10064 4597 19361 2.44 25.12
leuk_CNS@meta.data %>%
  group_by(Mouse_ID) %>%
  summarise(
    n_cells       = n(),
    mean_nFeature = round(mean(nFeature_originalexp), 0),
    mean_nCount   = round(mean(nCount_originalexp), 0),
    mean_mt       = round(mean(subsets_mt_percent), 2),
    mean_ribo     = round(mean(subsets_ribo_pct), 2),
    .groups       = "drop"
  ) %>% knitr::kable()
Mouse_ID n_cells mean_nFeature mean_nCount mean_mt mean_ribo
1838161 8680 4223 17478 2.52 26.06
1838162 9021 4247 17640 2.36 26.35
1838163 6542 4397 18775 2.42 26.39
1838171 7426 4415 18102 2.33 25.06
1838279 8497 4880 21859 2.39 23.83
1838280 7054 4885 22172 2.33 24.82
# Check transplant source
leuk_BM@meta.data %>%
  select(Mouse_ID, transplant_source, Sex) %>%
  distinct() %>%
  arrange(Mouse_ID) %>%
  knitr::kable()
Mouse_ID transplant_source Sex
5_AAACCAGCAACATGCT-1 1838161 AD4_EM3 FALSE
7_AAACCAAAGACGACGA-1 1838162 AD4_EM3 FALSE
1_AAACCAAAGACATGGG-1 1838163 AD2.2_EM4 FALSE
3_AAACCAAAGAACCCAC-1 1838171 AD2.2_EM4 FALSE
9_AAACCAAAGACGGAGC-1 1838279 AD5_EM4 FALSE
11_AAACCAAAGCTGAATT-1 1838280 AD5_EM4 FALSE

Same seen in BM! This is therefore a mouse-level biological variability based on transplant material source (two cluster together, one sepeartely), not a CNS-specific or QC.phenomenon.

Set Working Resolutions

Based on clustree stability analysis above, working resolutions are set for each tissue compartment. Higher resolutions are preferred here to maximise the chance of resolving tissue-specific transcriptional sub-states.

# !! Review clustree outputs above before setting these !!
bm_res_col  <- paste0(bm_prefix,  "0.3")  # update as appropriate
cns_res_col <- paste0(cns_prefix, "0.3")  # update as appropriate

Idents(leuk_BM)  <- bm_res_col
Idents(leuk_CNS) <- cns_res_col

cat("BM working resolution  :", bm_res_col,
    "—", length(unique(leuk_BM@meta.data[[bm_res_col]])),   "clusters\n")
## BM working resolution  : originalexp_snn_res.0.3 — 12 clusters
cat("CNS working resolution :", cns_res_col,
    "—", length(unique(leuk_CNS@meta.data[[cns_res_col]])), "clusters\n")
## CNS working resolution : originalexp_snn_res.0.3 — 12 clusters

UMAP Overview

p1 <- DimPlot(leuk_BM,
               reduction  = "umap",
               label      = TRUE,
               label.size = 3) +
  ggtitle("BM — clusters") + NoLegend()

p2 <- DimPlot(leuk_CNS,
               reduction  = "umap",
               label      = TRUE,
               label.size = 3) +
  ggtitle("CNS — clusters") + NoLegend()

p3 <- DimPlot(leuk_BM,
               reduction = "umap",
               group.by  = "Mouse_ID") +
  ggtitle("BM — by sample") +
  theme(legend.text = element_text(size = 7))

p4 <- DimPlot(leuk_CNS,
               reduction = "umap",
               group.by  = "Mouse_ID") +
  ggtitle("CNS — by sample") +
  theme(legend.text = element_text(size = 7))

(p1 + p2) / (p3 + p4)
UMAP of BM and CNS leukaemia subsets coloured by cluster and sample

UMAP of BM and CNS leukaemia subsets coloured by cluster and sample

Cells should be distributed across samples within each cluster, confirming that clusters represent genuine transcriptional states rather than sample-specific batch effects.


STOPPED AT THIS POINT - procceded with script 02_transplant_source_analysis

Session Information

sessionInfo()
## R version 4.4.1 (2024-06-14)
## Platform: x86_64-pc-linux-gnu
## Running under: Rocky Linux 9.5 (Blue Onyx)
## 
## Matrix products: default
## BLAS/LAPACK: /opt/intel/oneapi/mkl/2024.0/lib/libmkl_gf_lp64.so.2;  LAPACK version 3.10.1
## 
## locale:
##  [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_GB.UTF-8        LC_COLLATE=en_GB.UTF-8    
##  [5] LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
##  [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: Europe/London
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] qs2_0.1.7          knitr_1.51         viridis_0.6.5      viridisLite_0.4.2 
##  [5] clustree_0.5.1     ggraph_2.2.2       dplyr_1.1.4        patchwork_1.3.2   
##  [9] ggplot2_4.0.2      Seurat_5.4.0       SeuratObject_5.3.0 sp_2.1-4          
## 
## loaded via a namespace (and not attached):
##   [1] RColorBrewer_1.1-3     rstudioapi_0.16.0      jsonlite_2.0.0        
##   [4] magrittr_2.0.3         spatstat.utils_3.2-1   farver_2.1.2          
##   [7] rmarkdown_2.30         vctrs_0.6.5            ROCR_1.0-12           
##  [10] memoise_2.0.1          spatstat.explore_3.7-0 htmltools_0.5.8.1     
##  [13] sass_0.4.9             sctransform_0.4.3      parallelly_1.37.1     
##  [16] KernSmooth_2.23-24     bslib_0.7.0            htmlwidgets_1.6.4     
##  [19] ica_1.0-3              plyr_1.8.9             plotly_4.12.0         
##  [22] zoo_1.8-15             cachem_1.1.0           igraph_2.2.2          
##  [25] mime_0.12              lifecycle_1.0.5        pkgconfig_2.0.3       
##  [28] Matrix_1.7-0           R6_2.6.1               fastmap_1.2.0         
##  [31] fitdistrplus_1.2-6     future_1.33.2          shiny_1.12.1          
##  [34] digest_0.6.35          tensor_1.5.1           RSpectra_0.16-2       
##  [37] irlba_2.3.7            labeling_0.4.3         progressr_0.18.0      
##  [40] fansi_1.0.6            spatstat.sparse_3.1-0  httr_1.4.7            
##  [43] polyclip_1.10-7        abind_1.4-5            compiler_4.4.1        
##  [46] withr_3.0.2            backports_1.4.1        S7_0.2.1              
##  [49] fastDummies_1.7.5      ggforce_0.5.0          MASS_7.3-60.2         
##  [52] tools_4.4.1            lmtest_0.9-40          otel_0.2.0            
##  [55] httpuv_1.6.16          future.apply_1.11.2    goftest_1.2-3         
##  [58] glue_1.8.0             nlme_3.1-164           promises_1.5.0        
##  [61] grid_4.4.1             checkmate_2.3.1        Rtsne_0.17            
##  [64] cluster_2.1.6          reshape2_1.4.4         generics_0.1.3        
##  [67] gtable_0.3.6           spatstat.data_3.1-9    tidyr_1.3.1           
##  [70] data.table_1.15.4      tidygraph_1.3.1        stringfish_0.18.0     
##  [73] utf8_1.2.4             spatstat.geom_3.7-0    RcppAnnoy_0.0.23      
##  [76] ggrepel_0.9.6          RANN_2.6.2             pillar_1.9.0          
##  [79] stringr_1.5.1          spam_2.11-3            RcppHNSW_0.6.0        
##  [82] later_1.4.6            splines_4.4.1          tweenr_2.0.3          
##  [85] lattice_0.22-6         survival_3.6-4         deldir_2.0-4          
##  [88] tidyselect_1.2.1       miniUI_0.1.2           pbapply_1.7-4         
##  [91] gridExtra_2.3          scattermore_1.2        xfun_0.56             
##  [94] graphlayouts_1.2.2     matrixStats_1.5.0      stringi_1.8.4         
##  [97] lazyeval_0.2.2         yaml_2.3.12            evaluate_1.0.5        
## [100] codetools_0.2-20       tibble_3.3.1           cli_3.6.5             
## [103] RcppParallel_5.1.8     uwot_0.2.4             xtable_1.8-4          
## [106] reticulate_1.45.0      jquerylib_0.1.4        dichromat_2.0-0.1     
## [109] Rcpp_1.0.12            globals_0.16.3         spatstat.random_3.4-4 
## [112] png_0.1-8              spatstat.univar_3.1-6  parallel_4.4.1        
## [115] dotCall64_1.2          listenv_0.9.1          scales_1.4.0          
## [118] ggridges_0.5.6         purrr_1.0.2            rlang_1.1.7           
## [121] cowplot_1.2.0