Overview

Within this B-ALL model, two propagating populations have been characterised by bulk RNAseq: LSK_IL7R and LK_CLP. This script uses differential gene signatures from bulk RNAseq comparisons (Supplemental Tables S4 and S5) to score and annotate single-cell clusters in the leukaemia subset.

Analysis Strategy

BM and CNS leukaemia populations are clustered independently to allow niche-specific transcriptional states to be resolved without one compartment dominating the clustering. Signature scores are then applied to the cognate compartment:

The BM-derived LSK_IL7R vs LK/CLP signature (Supplemental Table S4) is applied to BM clusters
The CNS-derived LSK_IL7R vs LK/CLP signature (Supplemental Table S5) is applied to CNS clusters

This approach avoids potential confounding from tissue-specific transcriptional programmes unrelated to leukaemia propagating population identity.

Setup

library(Seurat)

## Loading required package: SeuratObject

## Loading required package: sp

## 
## Attaching package: 'SeuratObject'

## The following objects are masked from 'package:base':
## 
##     intersect, t

library(ggplot2)
library(patchwork)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(clustree)

## Loading required package: ggraph

## 
## Attaching package: 'ggraph'

## The following object is masked from 'package:sp':
## 
##     geometry

library(viridis)

## Loading required package: viridisLite

library(knitr)
library(qs2)

## qs2 0.1.7

Load data

leuk_obj <- qs_read("/exports/eddie/scratch/aduguid3/harmony_clustering/03_harmony_recluster_leuk_only_cellcycleanno_march26.qs")

# Tissue colours used consistently throughout
tissue_cols <- c("BM" = "steelblue", "CNS" = "firebrick")

# PCA dims — consistent with original clustering of parent object
pca_dims <- 1:25

# Resolutions to test — same range as parent object
resolutions <- c(0.01, 0.04, 0.05, seq(0.1, 1, by = 0.1))

# Confirm leukaemia object is present
# If starting a fresh session, load with:
# leuk_obj <- readRDS("leuk_obj.rds")

stopifnot("leuk_obj not found - please load object" = exists("leuk_obj"))

cat("Leukaemia object loaded\n")

## Leukaemia object loaded

cat("Total cells : ", ncol(leuk_obj),                    "\n")

## Total cells :  91826

cat("BM cells    : ", sum(leuk_obj$Tissue == "BM"),  "\n")

## BM cells    :  44606

cat("CNS cells   : ", sum(leuk_obj$Tissue == "CNS"), "\n")

## CNS cells   :  47220

Split Object by Tissue

BM and CNS leukaemia cells are separated into independent Seurat objects for tissue-specific clustering and signature scoring.

leuk_BM  <- subset(leuk_obj, subset = Tissue == "BM")
leuk_CNS <- subset(leuk_obj, subset = Tissue == "CNS")

cat("BM leukaemia cells :", ncol(leuk_BM),  "\n")

## BM leukaemia cells : 44606

cat("CNS leukaemia cells:", ncol(leuk_CNS), "\n")

## CNS leukaemia cells: 47220

Re-clustering by Tissue

Each tissue compartment is independently normalised, dimensionality-reduced, and clustered across multiple resolutions. The same pipeline used for the parent object is applied here to ensure consistency.

recluster <- function(obj, dims = pca_dims, res = resolutions) {

  obj <- FindVariableFeatures(obj,
                               selection.method = "vst",
                               nfeatures        = 2000)

  obj <- ScaleData(obj, verbose = FALSE)

  obj <- RunPCA(obj, verbose = FALSE)

  obj <- RunUMAP(obj, dims = dims)

  obj <- FindNeighbors(obj, reduction = "pca", dims = dims)

  obj <- FindClusters(obj, resolution = res)

  return(obj)
}

bm_qs_path <- "/exports/eddie/scratch/aduguid3/harmony_clustering/04_harmony_recluster_leuk_BM_march26.qs"

file_valid <- file.exists(bm_qs_path) && 
              tryCatch({ qs_read(bm_qs_path); TRUE }, 
                       error = function(e) FALSE)

if (!file_valid) {
  leuk_BM <- recluster(leuk_BM)
  qs_save(leuk_BM, bm_qs_path)
  
  cluster_columns <- grep("snn_res\\.", names(leuk_BM@meta.data), value = TRUE)
  srt_clusters_metadata_BM <- leuk_BM[[cluster_columns]]
  qs_save(srt_clusters_metadata_BM,
          "/exports/eddie/scratch/aduguid3/harmony_clustering/04_harmony_recluster_leuk_BM_march26_metadata.qs")
} else {
  leuk_BM <- qs_read(bm_qs_path)
}

bm_cluster_cols <- grep("snn_res\\.", names(leuk_BM@meta.data), value = TRUE)
cat("BM reclustering complete\n")

## BM reclustering complete

cat("Resolutions available:\n")

## Resolutions available:

print(bm_cluster_cols)

##  [1] "originalexp_snn_res.0.5"  "originalexp_snn_res.0.01"
##  [3] "originalexp_snn_res.0.04" "originalexp_snn_res.0.05"
##  [5] "originalexp_snn_res.0.1"  "originalexp_snn_res.0.2" 
##  [7] "originalexp_snn_res.0.3"  "originalexp_snn_res.0.4" 
##  [9] "originalexp_snn_res.0.6"  "originalexp_snn_res.0.7" 
## [11] "originalexp_snn_res.0.8"  "originalexp_snn_res.0.9" 
## [13] "originalexp_snn_res.1"

# Save clustered srt and metadata
qs_save(leuk_BM, "/exports/eddie/scratch/aduguid3/harmony_clustering/04_harmony_recluster_leuk_BM_march26.qs")

# Extract clustering metadata
cluster_columns <- grep("snn_res\\.", names(leuk_BM@meta.data), value = TRUE)
srt_clusters_metadata_BM <- leuk_BM[[cluster_columns]]
qs_save(
  srt_clusters_metadata_BM,
  file = "/exports/eddie/scratch/aduguid3/harmony_clustering/04_harmony_recluster_leuk_BM_march26._metadata.qs")

cns_qs_path <- "/exports/eddie/scratch/aduguid3/harmony_clustering/04_harmony_recluster_leuk_CNS_march26.qs"

file_valid <- file.exists(cns_qs_path) && 
              tryCatch({ qs_read(cns_qs_path); TRUE }, 
                       error = function(e) FALSE)

if (!file_valid) {
  leuk_CNS <- recluster(leuk_CNS)
  qs_save(leuk_CNS, cns_qs_path)
  
  cluster_columns <- grep("snn_res\\.", names(leuk_CNS@meta.data), value = TRUE)
  srt_clusters_metadata_CNS <- leuk_CNS[[cluster_columns]]
  qs_save(srt_clusters_metadata_CNS,
          "/exports/eddie/scratch/aduguid3/harmony_clustering/04_harmony_recluster_leuk_CNS_march26_metadata.qs")
} else {
  leuk_CNS <- qs_read(cns_qs_path)
}

cns_cluster_cols <- grep("snn_res\\.", names(leuk_CNS@meta.data), value = TRUE)
cat("CNS reclustering complete\n")

## CNS reclustering complete

cat("Resolutions available:\n")

## Resolutions available:

print(cns_cluster_cols)

##  [1] "originalexp_snn_res.0.5"  "originalexp_snn_res.0.01"
##  [3] "originalexp_snn_res.0.04" "originalexp_snn_res.0.05"
##  [5] "originalexp_snn_res.0.1"  "originalexp_snn_res.0.2" 
##  [7] "originalexp_snn_res.0.3"  "originalexp_snn_res.0.4" 
##  [9] "originalexp_snn_res.0.6"  "originalexp_snn_res.0.7" 
## [11] "originalexp_snn_res.0.8"  "originalexp_snn_res.0.9" 
## [13] "originalexp_snn_res.1"

# Save clustered srt and metadata
qs_save(leuk_CNS, "/exports/eddie/scratch/aduguid3/harmony_clustering/04_harmony_recluster_leuk_CNS_march26.qs")

# Extract clustering metadata
cluster_columns <- grep("snn_res\\.", names(leuk_CNS@meta.data), value = TRUE)
srt_clusters_metadata_CNS <- leuk_CNS[[cluster_columns]]
qs_save(
  srt_clusters_metadata_CNS,
  file = "/exports/eddie/scratch/aduguid3/harmony_clustering/04_harmony_recluster_leuk_CNS_march26._metadata.qs")

Resolution Selection

BM — Clustree Stability Analysis

# Check your actual column names
bm_cluster_cols <- grep("snn_res\\.", names(leuk_BM@meta.data), value = TRUE)
print(bm_cluster_cols)

##  [1] "originalexp_snn_res.0.5"  "originalexp_snn_res.0.01"
##  [3] "originalexp_snn_res.0.04" "originalexp_snn_res.0.05"
##  [5] "originalexp_snn_res.0.1"  "originalexp_snn_res.0.2" 
##  [7] "originalexp_snn_res.0.3"  "originalexp_snn_res.0.4" 
##  [9] "originalexp_snn_res.0.6"  "originalexp_snn_res.0.7" 
## [11] "originalexp_snn_res.0.8"  "originalexp_snn_res.0.9" 
## [13] "originalexp_snn_res.1"

# Extract prefix by removing the trailing number correctly
# This removes everything from the last digit sequence onwards
bm_prefix <- gsub("[0-9]+\\.?[0-9]*$", "", bm_cluster_cols[1])
cat("BM prefix:", bm_prefix, "\n")

## BM prefix: originalexp_snn_res.

# Do the same for CNS
cns_cluster_cols <- grep("snn_res\\.", names(leuk_CNS@meta.data), value = TRUE)
cns_prefix <- gsub("[0-9]+\\.?[0-9]*$", "", cns_cluster_cols[1])
cat("CNS prefix:", cns_prefix, "\n")

## CNS prefix: originalexp_snn_res.

# Now run clustree
bm_cluster_meta  <- leuk_BM[[bm_cluster_cols]]
cns_cluster_meta <- leuk_CNS[[cns_cluster_cols]]

clustree(bm_cluster_meta,  prefix = bm_prefix,  edge_arrow = FALSE) +
  ggtitle("BM leukaemia — resolution stability")

Clustree stability analysis for BM leukaemia subset

clustree(cns_cluster_meta, prefix = cns_prefix, edge_arrow = FALSE) +
  ggtitle("CNS leukaemia — resolution stability")

Clustree stability analysis for BM leukaemia subset

Resolution Selection — BM Leukaemia

Resolution 0.3 seems about right.

Resolution Selection — CNS Leukaemia

Clustree analysis of the CNS leukaemia subset revealed a finding: two completely separate cluster trees emerge from as early as res 0.04, indicating the presence of two highly transcriptionally distinct major populations within the CNS leukaemia compartment. This degree of separation — maintained from the lowest resolutions — suggests these are not subtle sub-states but fundamentally different cell populations.

The larger left tree fragments progressively with increasing resolution, with cross-arrows appearing from res 0.4 onwards indicating instability at higher resolutions. The smaller right tree remains comparatively stable throughout. Based on this analysis, res 0.3 was selected for CNS leukaemia as it captures both major branches and their early sub-structure without over-fragmenting into unstable clusters.

The biological identity of these two populations requires further investigation. Possibilities include:

LSK_IL7R vs LK_CLP — the two known propagating populations in this model, which may be transcriptionally more divergent in the CNS niche than in BM
CNS-adapted leukaemia vs BM-contaminating leukaemia — cells that have genuinely colonised and adapted to the CNS niche vs cells recently arrived from BM that retain a BM-like transcriptional programme
A CNS-specific leukaemia state — a novel transcriptional programme not represented in the BM compartment

Distinguishing between these possibilities is the primary objective of the downstream analysis.

Unpicking the divergent CNS transcriptional states

# Both tissues — res 0.3
bm_res_col  <- "originalexp_snn_res.0.3"
cns_res_col <- "originalexp_snn_res.0.3"

Idents(leuk_BM)  <- bm_res_col
Idents(leuk_CNS) <- cns_res_col

# Immediately check the two-tree structure in CNS
DimPlot(leuk_CNS,
         reduction  = "umap",
         label      = TRUE,
         label.size = 3) +
  ggtitle("CNS leukaemia — res 0.3")

# 1. Sample distribution — does each island contain all 6 mice?
DimPlot(leuk_CNS, 
        reduction = "umap",
        group.by  = "Mouse_ID") +
  ggtitle("CNS — by sample")

# 2. GFP expression — are both islands GFP+?
FeaturePlot(leuk_CNS,
             features   = "GFP",
             cols       = c("lightgrey", "darkgreen"),
             order      = TRUE,
             min.cutoff = 0) +
  ggtitle("CNS — GFP expression")

# 3. Cell counts per island per mouse
# Label islands manually based on UMAP
leuk_CNS$umap_island <- case_when(
  leuk_CNS$originalexp_snn_res.0.3 %in% 
    c("0", "1", "8", "9", "10") ~ "Right_island",
  leuk_CNS$originalexp_snn_res.0.3 %in% 
    c("2", "3", "4", "5", "6", "7") ~ "Left_island",
  TRUE ~ "Other"
)

table(leuk_CNS$Mouse_ID, leuk_CNS$umap_island)

##          
##           Left_island Other Right_island
##   1838161        3634    12         5034
##   1838162        3286     6         5729
##   1838163        2577     3         3962
##   1838171        1121    14         6291
##   1838279        8391     4          102
##   1838280        6905     0          149

Interpretation - it’s the mice!

Samples 1838279 and 1838280 are almost entirely in the left island — fewer than 150 cells each in the right island. The other four samples distribute across both islands relatively normally.

Is this distribution also try for the BM samples

# Add island labels to BM using same cluster logic
leuk_BM$umap_island <- case_when(
  leuk_BM[[bm_res_col]] %in% c("0", "1", "8", "9", "10") ~ "Right_island",
  leuk_BM[[bm_res_col]] %in% c("2", "3", "4", "5", "6", "7") ~ "Left_island",
  TRUE ~ "Other"
)

# Check same samples in BM
table(leuk_BM$Mouse_ID, leuk_BM$umap_island)

##          
##           Other
##   1838161  5063
##   1838162  7390
##   1838163  5084
##   1838171  7078
##   1838279  9927
##   1838280 10064

# Visual check
DimPlot(leuk_BM, 
        reduction = "umap",
        group.by  = "Mouse_ID") +
  ggtitle("BM — by sample")

# Check QC metrics - rule out technical differences
leuk_BM@meta.data %>%
  group_by(Mouse_ID) %>%
  summarise(
    n_cells       = n(),
    mean_nFeature = round(mean(nFeature_originalexp), 0),
    mean_nCount   = round(mean(nCount_originalexp), 0),
    mean_mt       = round(mean(subsets_mt_percent), 2),
    mean_ribo     = round(mean(subsets_ribo_pct), 2),
    .groups       = "drop"
  ) %>% knitr::kable()

Mouse_ID	n_cells	mean_nFeature	mean_nCount	mean_mt	mean_ribo
1838161	5063	4618	20197	2.56	24.18
1838162	7390	4326	18343	2.34	26.72
1838163	5084	4338	18092	2.55	26.21
1838171	7078	4511	18854	2.36	24.17
1838279	9927	4910	21618	2.44	23.12
1838280	10064	4597	19361	2.44	25.12

leuk_CNS@meta.data %>%
  group_by(Mouse_ID) %>%
  summarise(
    n_cells       = n(),
    mean_nFeature = round(mean(nFeature_originalexp), 0),
    mean_nCount   = round(mean(nCount_originalexp), 0),
    mean_mt       = round(mean(subsets_mt_percent), 2),
    mean_ribo     = round(mean(subsets_ribo_pct), 2),
    .groups       = "drop"
  ) %>% knitr::kable()

Mouse_ID	n_cells	mean_nFeature	mean_nCount	mean_mt	mean_ribo
1838161	8680	4223	17478	2.52	26.06
1838162	9021	4247	17640	2.36	26.35
1838163	6542	4397	18775	2.42	26.39
1838171	7426	4415	18102	2.33	25.06
1838279	8497	4880	21859	2.39	23.83
1838280	7054	4885	22172	2.33	24.82

# Check transplant source
leuk_BM@meta.data %>%
  select(Mouse_ID, transplant_source, Sex) %>%
  distinct() %>%
  arrange(Mouse_ID) %>%
  knitr::kable()

	Mouse_ID	transplant_source	Sex
5_AAACCAGCAACATGCT-1	1838161	AD4_EM3	FALSE
7_AAACCAAAGACGACGA-1	1838162	AD4_EM3	FALSE
1_AAACCAAAGACATGGG-1	1838163	AD2.2_EM4	FALSE
3_AAACCAAAGAACCCAC-1	1838171	AD2.2_EM4	FALSE
9_AAACCAAAGACGGAGC-1	1838279	AD5_EM4	FALSE
11_AAACCAAAGCTGAATT-1	1838280	AD5_EM4	FALSE

Same seen in BM! This is therefore a mouse-level biological variability based on transplant material source (two cluster together, one sepeartely), not a CNS-specific or QC.phenomenon.

Set Working Resolutions

Based on clustree stability analysis above, working resolutions are set for each tissue compartment. Higher resolutions are preferred here to maximise the chance of resolving tissue-specific transcriptional sub-states.

# !! Review clustree outputs above before setting these !!
bm_res_col  <- paste0(bm_prefix,  "0.3")  # update as appropriate
cns_res_col <- paste0(cns_prefix, "0.3")  # update as appropriate

Idents(leuk_BM)  <- bm_res_col
Idents(leuk_CNS) <- cns_res_col

cat("BM working resolution  :", bm_res_col,
    "—", length(unique(leuk_BM@meta.data[[bm_res_col]])),   "clusters\n")

## BM working resolution  : originalexp_snn_res.0.3 — 12 clusters

cat("CNS working resolution :", cns_res_col,
    "—", length(unique(leuk_CNS@meta.data[[cns_res_col]])), "clusters\n")

## CNS working resolution : originalexp_snn_res.0.3 — 12 clusters

UMAP Overview

p1 <- DimPlot(leuk_BM,
               reduction  = "umap",
               label      = TRUE,
               label.size = 3) +
  ggtitle("BM — clusters") + NoLegend()

p2 <- DimPlot(leuk_CNS,
               reduction  = "umap",
               label      = TRUE,
               label.size = 3) +
  ggtitle("CNS — clusters") + NoLegend()

p3 <- DimPlot(leuk_BM,
               reduction = "umap",
               group.by  = "Mouse_ID") +
  ggtitle("BM — by sample") +
  theme(legend.text = element_text(size = 7))

p4 <- DimPlot(leuk_CNS,
               reduction = "umap",
               group.by  = "Mouse_ID") +
  ggtitle("CNS — by sample") +
  theme(legend.text = element_text(size = 7))

(p1 + p2) / (p3 + p4)

UMAP of BM and CNS leukaemia subsets coloured by cluster and sample

Cells should be distributed across samples within each cluster, confirming that clusters represent genuine transcriptional states rather than sample-specific batch effects.

STOPPED AT THIS POINT - procceded with script 02_transplant_source_analysis

Session Information

sessionInfo()

## R version 4.4.1 (2024-06-14)
## Platform: x86_64-pc-linux-gnu
## Running under: Rocky Linux 9.5 (Blue Onyx)
## 
## Matrix products: default
## BLAS/LAPACK: /opt/intel/oneapi/mkl/2024.0/lib/libmkl_gf_lp64.so.2;  LAPACK version 3.10.1
## 
## locale:
##  [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_GB.UTF-8        LC_COLLATE=en_GB.UTF-8    
##  [5] LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
##  [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: Europe/London
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] qs2_0.1.7          knitr_1.51         viridis_0.6.5      viridisLite_0.4.2 
##  [5] clustree_0.5.1     ggraph_2.2.2       dplyr_1.1.4        patchwork_1.3.2   
##  [9] ggplot2_4.0.2      Seurat_5.4.0       SeuratObject_5.3.0 sp_2.1-4          
## 
## loaded via a namespace (and not attached):
##   [1] RColorBrewer_1.1-3     rstudioapi_0.16.0      jsonlite_2.0.0        
##   [4] magrittr_2.0.3         spatstat.utils_3.2-1   farver_2.1.2          
##   [7] rmarkdown_2.30         vctrs_0.6.5            ROCR_1.0-12           
##  [10] memoise_2.0.1          spatstat.explore_3.7-0 htmltools_0.5.8.1     
##  [13] sass_0.4.9             sctransform_0.4.3      parallelly_1.37.1     
##  [16] KernSmooth_2.23-24     bslib_0.7.0            htmlwidgets_1.6.4     
##  [19] ica_1.0-3              plyr_1.8.9             plotly_4.12.0         
##  [22] zoo_1.8-15             cachem_1.1.0           igraph_2.2.2          
##  [25] mime_0.12              lifecycle_1.0.5        pkgconfig_2.0.3       
##  [28] Matrix_1.7-0           R6_2.6.1               fastmap_1.2.0         
##  [31] fitdistrplus_1.2-6     future_1.33.2          shiny_1.12.1          
##  [34] digest_0.6.35          tensor_1.5.1           RSpectra_0.16-2       
##  [37] irlba_2.3.7            labeling_0.4.3         progressr_0.18.0      
##  [40] fansi_1.0.6            spatstat.sparse_3.1-0  httr_1.4.7            
##  [43] polyclip_1.10-7        abind_1.4-5            compiler_4.4.1        
##  [46] withr_3.0.2            backports_1.4.1        S7_0.2.1              
##  [49] fastDummies_1.7.5      ggforce_0.5.0          MASS_7.3-60.2         
##  [52] tools_4.4.1            lmtest_0.9-40          otel_0.2.0            
##  [55] httpuv_1.6.16          future.apply_1.11.2    goftest_1.2-3         
##  [58] glue_1.8.0             nlme_3.1-164           promises_1.5.0        
##  [61] grid_4.4.1             checkmate_2.3.1        Rtsne_0.17            
##  [64] cluster_2.1.6          reshape2_1.4.4         generics_0.1.3        
##  [67] gtable_0.3.6           spatstat.data_3.1-9    tidyr_1.3.1           
##  [70] data.table_1.15.4      tidygraph_1.3.1        stringfish_0.18.0     
##  [73] utf8_1.2.4             spatstat.geom_3.7-0    RcppAnnoy_0.0.23      
##  [76] ggrepel_0.9.6          RANN_2.6.2             pillar_1.9.0          
##  [79] stringr_1.5.1          spam_2.11-3            RcppHNSW_0.6.0        
##  [82] later_1.4.6            splines_4.4.1          tweenr_2.0.3          
##  [85] lattice_0.22-6         survival_3.6-4         deldir_2.0-4          
##  [88] tidyselect_1.2.1       miniUI_0.1.2           pbapply_1.7-4         
##  [91] gridExtra_2.3          scattermore_1.2        xfun_0.56             
##  [94] graphlayouts_1.2.2     matrixStats_1.5.0      stringi_1.8.4         
##  [97] lazyeval_0.2.2         yaml_2.3.12            evaluate_1.0.5        
## [100] codetools_0.2-20       tibble_3.3.1           cli_3.6.5             
## [103] RcppParallel_5.1.8     uwot_0.2.4             xtable_1.8-4          
## [106] reticulate_1.45.0      jquerylib_0.1.4        dichromat_2.0-0.1     
## [109] Rcpp_1.0.12            globals_0.16.3         spatstat.random_3.4-4 
## [112] png_0.1-8              spatstat.univar_3.1-6  parallel_4.4.1        
## [115] dotCall64_1.2          listenv_0.9.1          scales_1.4.0          
## [118] ggridges_0.5.6         purrr_1.0.2            rlang_1.1.7           
## [121] cowplot_1.2.0

B-ALL Leukaemia Cluster Annotation - LSK_IL7R vs LK_CLP

Alasdair Duguid

2026-02-19