Within this B-ALL model, two propagating populations have been characterised by bulk RNAseq: LSK_IL7R and LK_CLP. This script uses differential gene signatures from bulk RNAseq comparisons (Supplemental Tables S4 and S5) to score and annotate single-cell clusters in the leukaemia subset.
BM and CNS leukaemia populations are clustered independently to allow niche-specific transcriptional states to be resolved without one compartment dominating the clustering. Signature scores are then applied to the cognate compartment:
This approach avoids potential confounding from tissue-specific transcriptional programmes unrelated to leukaemia propagating population identity.
library(Seurat)
## Loading required package: SeuratObject
## Loading required package: sp
##
## Attaching package: 'SeuratObject'
## The following objects are masked from 'package:base':
##
## intersect, t
library(ggplot2)
library(patchwork)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(clustree)
## Loading required package: ggraph
##
## Attaching package: 'ggraph'
## The following object is masked from 'package:sp':
##
## geometry
library(viridis)
## Loading required package: viridisLite
library(knitr)
library(qs2)
## qs2 0.1.7
leuk_obj <- qs_read("/exports/eddie/scratch/aduguid3/harmony_clustering/03_harmony_recluster_leuk_only_cellcycleanno_march26.qs")
# Tissue colours used consistently throughout
tissue_cols <- c("BM" = "steelblue", "CNS" = "firebrick")
# PCA dims — consistent with original clustering of parent object
pca_dims <- 1:25
# Resolutions to test — same range as parent object
resolutions <- c(0.01, 0.04, 0.05, seq(0.1, 1, by = 0.1))
# Confirm leukaemia object is present
# If starting a fresh session, load with:
# leuk_obj <- readRDS("leuk_obj.rds")
stopifnot("leuk_obj not found - please load object" = exists("leuk_obj"))
cat("Leukaemia object loaded\n")
## Leukaemia object loaded
cat("Total cells : ", ncol(leuk_obj), "\n")
## Total cells : 91826
cat("BM cells : ", sum(leuk_obj$Tissue == "BM"), "\n")
## BM cells : 44606
cat("CNS cells : ", sum(leuk_obj$Tissue == "CNS"), "\n")
## CNS cells : 47220
BM and CNS leukaemia cells are separated into independent Seurat objects for tissue-specific clustering and signature scoring.
leuk_BM <- subset(leuk_obj, subset = Tissue == "BM")
leuk_CNS <- subset(leuk_obj, subset = Tissue == "CNS")
cat("BM leukaemia cells :", ncol(leuk_BM), "\n")
## BM leukaemia cells : 44606
cat("CNS leukaemia cells:", ncol(leuk_CNS), "\n")
## CNS leukaemia cells: 47220
Each tissue compartment is independently normalised, dimensionality-reduced, and clustered across multiple resolutions. The same pipeline used for the parent object is applied here to ensure consistency.
recluster <- function(obj, dims = pca_dims, res = resolutions) {
obj <- FindVariableFeatures(obj,
selection.method = "vst",
nfeatures = 2000)
obj <- ScaleData(obj, verbose = FALSE)
obj <- RunPCA(obj, verbose = FALSE)
obj <- RunUMAP(obj, dims = dims)
obj <- FindNeighbors(obj, reduction = "pca", dims = dims)
obj <- FindClusters(obj, resolution = res)
return(obj)
}
bm_qs_path <- "/exports/eddie/scratch/aduguid3/harmony_clustering/04_harmony_recluster_leuk_BM_march26.qs"
file_valid <- file.exists(bm_qs_path) &&
tryCatch({ qs_read(bm_qs_path); TRUE },
error = function(e) FALSE)
if (!file_valid) {
leuk_BM <- recluster(leuk_BM)
qs_save(leuk_BM, bm_qs_path)
cluster_columns <- grep("snn_res\\.", names(leuk_BM@meta.data), value = TRUE)
srt_clusters_metadata_BM <- leuk_BM[[cluster_columns]]
qs_save(srt_clusters_metadata_BM,
"/exports/eddie/scratch/aduguid3/harmony_clustering/04_harmony_recluster_leuk_BM_march26_metadata.qs")
} else {
leuk_BM <- qs_read(bm_qs_path)
}
bm_cluster_cols <- grep("snn_res\\.", names(leuk_BM@meta.data), value = TRUE)
cat("BM reclustering complete\n")
## BM reclustering complete
cat("Resolutions available:\n")
## Resolutions available:
print(bm_cluster_cols)
## [1] "originalexp_snn_res.0.5" "originalexp_snn_res.0.01"
## [3] "originalexp_snn_res.0.04" "originalexp_snn_res.0.05"
## [5] "originalexp_snn_res.0.1" "originalexp_snn_res.0.2"
## [7] "originalexp_snn_res.0.3" "originalexp_snn_res.0.4"
## [9] "originalexp_snn_res.0.6" "originalexp_snn_res.0.7"
## [11] "originalexp_snn_res.0.8" "originalexp_snn_res.0.9"
## [13] "originalexp_snn_res.1"
# Save clustered srt and metadata
qs_save(leuk_BM, "/exports/eddie/scratch/aduguid3/harmony_clustering/04_harmony_recluster_leuk_BM_march26.qs")
# Extract clustering metadata
cluster_columns <- grep("snn_res\\.", names(leuk_BM@meta.data), value = TRUE)
srt_clusters_metadata_BM <- leuk_BM[[cluster_columns]]
qs_save(
srt_clusters_metadata_BM,
file = "/exports/eddie/scratch/aduguid3/harmony_clustering/04_harmony_recluster_leuk_BM_march26._metadata.qs")
cns_qs_path <- "/exports/eddie/scratch/aduguid3/harmony_clustering/04_harmony_recluster_leuk_CNS_march26.qs"
file_valid <- file.exists(cns_qs_path) &&
tryCatch({ qs_read(cns_qs_path); TRUE },
error = function(e) FALSE)
if (!file_valid) {
leuk_CNS <- recluster(leuk_CNS)
qs_save(leuk_CNS, cns_qs_path)
cluster_columns <- grep("snn_res\\.", names(leuk_CNS@meta.data), value = TRUE)
srt_clusters_metadata_CNS <- leuk_CNS[[cluster_columns]]
qs_save(srt_clusters_metadata_CNS,
"/exports/eddie/scratch/aduguid3/harmony_clustering/04_harmony_recluster_leuk_CNS_march26_metadata.qs")
} else {
leuk_CNS <- qs_read(cns_qs_path)
}
cns_cluster_cols <- grep("snn_res\\.", names(leuk_CNS@meta.data), value = TRUE)
cat("CNS reclustering complete\n")
## CNS reclustering complete
cat("Resolutions available:\n")
## Resolutions available:
print(cns_cluster_cols)
## [1] "originalexp_snn_res.0.5" "originalexp_snn_res.0.01"
## [3] "originalexp_snn_res.0.04" "originalexp_snn_res.0.05"
## [5] "originalexp_snn_res.0.1" "originalexp_snn_res.0.2"
## [7] "originalexp_snn_res.0.3" "originalexp_snn_res.0.4"
## [9] "originalexp_snn_res.0.6" "originalexp_snn_res.0.7"
## [11] "originalexp_snn_res.0.8" "originalexp_snn_res.0.9"
## [13] "originalexp_snn_res.1"
# Save clustered srt and metadata
qs_save(leuk_CNS, "/exports/eddie/scratch/aduguid3/harmony_clustering/04_harmony_recluster_leuk_CNS_march26.qs")
# Extract clustering metadata
cluster_columns <- grep("snn_res\\.", names(leuk_CNS@meta.data), value = TRUE)
srt_clusters_metadata_CNS <- leuk_CNS[[cluster_columns]]
qs_save(
srt_clusters_metadata_CNS,
file = "/exports/eddie/scratch/aduguid3/harmony_clustering/04_harmony_recluster_leuk_CNS_march26._metadata.qs")
# Check your actual column names
bm_cluster_cols <- grep("snn_res\\.", names(leuk_BM@meta.data), value = TRUE)
print(bm_cluster_cols)
## [1] "originalexp_snn_res.0.5" "originalexp_snn_res.0.01"
## [3] "originalexp_snn_res.0.04" "originalexp_snn_res.0.05"
## [5] "originalexp_snn_res.0.1" "originalexp_snn_res.0.2"
## [7] "originalexp_snn_res.0.3" "originalexp_snn_res.0.4"
## [9] "originalexp_snn_res.0.6" "originalexp_snn_res.0.7"
## [11] "originalexp_snn_res.0.8" "originalexp_snn_res.0.9"
## [13] "originalexp_snn_res.1"
# Extract prefix by removing the trailing number correctly
# This removes everything from the last digit sequence onwards
bm_prefix <- gsub("[0-9]+\\.?[0-9]*$", "", bm_cluster_cols[1])
cat("BM prefix:", bm_prefix, "\n")
## BM prefix: originalexp_snn_res.
# Do the same for CNS
cns_cluster_cols <- grep("snn_res\\.", names(leuk_CNS@meta.data), value = TRUE)
cns_prefix <- gsub("[0-9]+\\.?[0-9]*$", "", cns_cluster_cols[1])
cat("CNS prefix:", cns_prefix, "\n")
## CNS prefix: originalexp_snn_res.
# Now run clustree
bm_cluster_meta <- leuk_BM[[bm_cluster_cols]]
cns_cluster_meta <- leuk_CNS[[cns_cluster_cols]]
clustree(bm_cluster_meta, prefix = bm_prefix, edge_arrow = FALSE) +
ggtitle("BM leukaemia — resolution stability")
Clustree stability analysis for BM leukaemia subset
clustree(cns_cluster_meta, prefix = cns_prefix, edge_arrow = FALSE) +
ggtitle("CNS leukaemia — resolution stability")
Clustree stability analysis for BM leukaemia subset
Resolution 0.3 seems about right.
Clustree analysis of the CNS leukaemia subset revealed a finding: two completely separate cluster trees emerge from as early as res 0.04, indicating the presence of two highly transcriptionally distinct major populations within the CNS leukaemia compartment. This degree of separation — maintained from the lowest resolutions — suggests these are not subtle sub-states but fundamentally different cell populations.
The larger left tree fragments progressively with increasing resolution, with cross-arrows appearing from res 0.4 onwards indicating instability at higher resolutions. The smaller right tree remains comparatively stable throughout. Based on this analysis, res 0.3 was selected for CNS leukaemia as it captures both major branches and their early sub-structure without over-fragmenting into unstable clusters.
The biological identity of these two populations requires further investigation. Possibilities include:
Distinguishing between these possibilities is the primary objective of the downstream analysis.
# Both tissues — res 0.3
bm_res_col <- "originalexp_snn_res.0.3"
cns_res_col <- "originalexp_snn_res.0.3"
Idents(leuk_BM) <- bm_res_col
Idents(leuk_CNS) <- cns_res_col
# Immediately check the two-tree structure in CNS
DimPlot(leuk_CNS,
reduction = "umap",
label = TRUE,
label.size = 3) +
ggtitle("CNS leukaemia — res 0.3")
# 1. Sample distribution — does each island contain all 6 mice?
DimPlot(leuk_CNS,
reduction = "umap",
group.by = "Mouse_ID") +
ggtitle("CNS — by sample")
# 2. GFP expression — are both islands GFP+?
FeaturePlot(leuk_CNS,
features = "GFP",
cols = c("lightgrey", "darkgreen"),
order = TRUE,
min.cutoff = 0) +
ggtitle("CNS — GFP expression")
# 3. Cell counts per island per mouse
# Label islands manually based on UMAP
leuk_CNS$umap_island <- case_when(
leuk_CNS$originalexp_snn_res.0.3 %in%
c("0", "1", "8", "9", "10") ~ "Right_island",
leuk_CNS$originalexp_snn_res.0.3 %in%
c("2", "3", "4", "5", "6", "7") ~ "Left_island",
TRUE ~ "Other"
)
table(leuk_CNS$Mouse_ID, leuk_CNS$umap_island)
##
## Left_island Other Right_island
## 1838161 3634 12 5034
## 1838162 3286 6 5729
## 1838163 2577 3 3962
## 1838171 1121 14 6291
## 1838279 8391 4 102
## 1838280 6905 0 149
Samples 1838279 and 1838280 are almost entirely in the left island — fewer than 150 cells each in the right island. The other four samples distribute across both islands relatively normally.
# Add island labels to BM using same cluster logic
leuk_BM$umap_island <- case_when(
leuk_BM[[bm_res_col]] %in% c("0", "1", "8", "9", "10") ~ "Right_island",
leuk_BM[[bm_res_col]] %in% c("2", "3", "4", "5", "6", "7") ~ "Left_island",
TRUE ~ "Other"
)
# Check same samples in BM
table(leuk_BM$Mouse_ID, leuk_BM$umap_island)
##
## Other
## 1838161 5063
## 1838162 7390
## 1838163 5084
## 1838171 7078
## 1838279 9927
## 1838280 10064
# Visual check
DimPlot(leuk_BM,
reduction = "umap",
group.by = "Mouse_ID") +
ggtitle("BM — by sample")
# Check QC metrics - rule out technical differences
leuk_BM@meta.data %>%
group_by(Mouse_ID) %>%
summarise(
n_cells = n(),
mean_nFeature = round(mean(nFeature_originalexp), 0),
mean_nCount = round(mean(nCount_originalexp), 0),
mean_mt = round(mean(subsets_mt_percent), 2),
mean_ribo = round(mean(subsets_ribo_pct), 2),
.groups = "drop"
) %>% knitr::kable()
| Mouse_ID | n_cells | mean_nFeature | mean_nCount | mean_mt | mean_ribo |
|---|---|---|---|---|---|
| 1838161 | 5063 | 4618 | 20197 | 2.56 | 24.18 |
| 1838162 | 7390 | 4326 | 18343 | 2.34 | 26.72 |
| 1838163 | 5084 | 4338 | 18092 | 2.55 | 26.21 |
| 1838171 | 7078 | 4511 | 18854 | 2.36 | 24.17 |
| 1838279 | 9927 | 4910 | 21618 | 2.44 | 23.12 |
| 1838280 | 10064 | 4597 | 19361 | 2.44 | 25.12 |
leuk_CNS@meta.data %>%
group_by(Mouse_ID) %>%
summarise(
n_cells = n(),
mean_nFeature = round(mean(nFeature_originalexp), 0),
mean_nCount = round(mean(nCount_originalexp), 0),
mean_mt = round(mean(subsets_mt_percent), 2),
mean_ribo = round(mean(subsets_ribo_pct), 2),
.groups = "drop"
) %>% knitr::kable()
| Mouse_ID | n_cells | mean_nFeature | mean_nCount | mean_mt | mean_ribo |
|---|---|---|---|---|---|
| 1838161 | 8680 | 4223 | 17478 | 2.52 | 26.06 |
| 1838162 | 9021 | 4247 | 17640 | 2.36 | 26.35 |
| 1838163 | 6542 | 4397 | 18775 | 2.42 | 26.39 |
| 1838171 | 7426 | 4415 | 18102 | 2.33 | 25.06 |
| 1838279 | 8497 | 4880 | 21859 | 2.39 | 23.83 |
| 1838280 | 7054 | 4885 | 22172 | 2.33 | 24.82 |
# Check transplant source
leuk_BM@meta.data %>%
select(Mouse_ID, transplant_source, Sex) %>%
distinct() %>%
arrange(Mouse_ID) %>%
knitr::kable()
| Mouse_ID | transplant_source | Sex | |
|---|---|---|---|
| 5_AAACCAGCAACATGCT-1 | 1838161 | AD4_EM3 | FALSE |
| 7_AAACCAAAGACGACGA-1 | 1838162 | AD4_EM3 | FALSE |
| 1_AAACCAAAGACATGGG-1 | 1838163 | AD2.2_EM4 | FALSE |
| 3_AAACCAAAGAACCCAC-1 | 1838171 | AD2.2_EM4 | FALSE |
| 9_AAACCAAAGACGGAGC-1 | 1838279 | AD5_EM4 | FALSE |
| 11_AAACCAAAGCTGAATT-1 | 1838280 | AD5_EM4 | FALSE |
Same seen in BM! This is therefore a mouse-level biological variability based on transplant material source (two cluster together, one sepeartely), not a CNS-specific or QC.phenomenon.
Based on clustree stability analysis above, working resolutions are set for each tissue compartment. Higher resolutions are preferred here to maximise the chance of resolving tissue-specific transcriptional sub-states.
# !! Review clustree outputs above before setting these !!
bm_res_col <- paste0(bm_prefix, "0.3") # update as appropriate
cns_res_col <- paste0(cns_prefix, "0.3") # update as appropriate
Idents(leuk_BM) <- bm_res_col
Idents(leuk_CNS) <- cns_res_col
cat("BM working resolution :", bm_res_col,
"—", length(unique(leuk_BM@meta.data[[bm_res_col]])), "clusters\n")
## BM working resolution : originalexp_snn_res.0.3 — 12 clusters
cat("CNS working resolution :", cns_res_col,
"—", length(unique(leuk_CNS@meta.data[[cns_res_col]])), "clusters\n")
## CNS working resolution : originalexp_snn_res.0.3 — 12 clusters
p1 <- DimPlot(leuk_BM,
reduction = "umap",
label = TRUE,
label.size = 3) +
ggtitle("BM — clusters") + NoLegend()
p2 <- DimPlot(leuk_CNS,
reduction = "umap",
label = TRUE,
label.size = 3) +
ggtitle("CNS — clusters") + NoLegend()
p3 <- DimPlot(leuk_BM,
reduction = "umap",
group.by = "Mouse_ID") +
ggtitle("BM — by sample") +
theme(legend.text = element_text(size = 7))
p4 <- DimPlot(leuk_CNS,
reduction = "umap",
group.by = "Mouse_ID") +
ggtitle("CNS — by sample") +
theme(legend.text = element_text(size = 7))
(p1 + p2) / (p3 + p4)
UMAP of BM and CNS leukaemia subsets coloured by cluster and sample
Cells should be distributed across samples within each cluster, confirming that clusters represent genuine transcriptional states rather than sample-specific batch effects.
sessionInfo()
## R version 4.4.1 (2024-06-14)
## Platform: x86_64-pc-linux-gnu
## Running under: Rocky Linux 9.5 (Blue Onyx)
##
## Matrix products: default
## BLAS/LAPACK: /opt/intel/oneapi/mkl/2024.0/lib/libmkl_gf_lp64.so.2; LAPACK version 3.10.1
##
## locale:
## [1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_GB.UTF-8
## [5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
## [7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
##
## time zone: Europe/London
## tzcode source: system (glibc)
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] qs2_0.1.7 knitr_1.51 viridis_0.6.5 viridisLite_0.4.2
## [5] clustree_0.5.1 ggraph_2.2.2 dplyr_1.1.4 patchwork_1.3.2
## [9] ggplot2_4.0.2 Seurat_5.4.0 SeuratObject_5.3.0 sp_2.1-4
##
## loaded via a namespace (and not attached):
## [1] RColorBrewer_1.1-3 rstudioapi_0.16.0 jsonlite_2.0.0
## [4] magrittr_2.0.3 spatstat.utils_3.2-1 farver_2.1.2
## [7] rmarkdown_2.30 vctrs_0.6.5 ROCR_1.0-12
## [10] memoise_2.0.1 spatstat.explore_3.7-0 htmltools_0.5.8.1
## [13] sass_0.4.9 sctransform_0.4.3 parallelly_1.37.1
## [16] KernSmooth_2.23-24 bslib_0.7.0 htmlwidgets_1.6.4
## [19] ica_1.0-3 plyr_1.8.9 plotly_4.12.0
## [22] zoo_1.8-15 cachem_1.1.0 igraph_2.2.2
## [25] mime_0.12 lifecycle_1.0.5 pkgconfig_2.0.3
## [28] Matrix_1.7-0 R6_2.6.1 fastmap_1.2.0
## [31] fitdistrplus_1.2-6 future_1.33.2 shiny_1.12.1
## [34] digest_0.6.35 tensor_1.5.1 RSpectra_0.16-2
## [37] irlba_2.3.7 labeling_0.4.3 progressr_0.18.0
## [40] fansi_1.0.6 spatstat.sparse_3.1-0 httr_1.4.7
## [43] polyclip_1.10-7 abind_1.4-5 compiler_4.4.1
## [46] withr_3.0.2 backports_1.4.1 S7_0.2.1
## [49] fastDummies_1.7.5 ggforce_0.5.0 MASS_7.3-60.2
## [52] tools_4.4.1 lmtest_0.9-40 otel_0.2.0
## [55] httpuv_1.6.16 future.apply_1.11.2 goftest_1.2-3
## [58] glue_1.8.0 nlme_3.1-164 promises_1.5.0
## [61] grid_4.4.1 checkmate_2.3.1 Rtsne_0.17
## [64] cluster_2.1.6 reshape2_1.4.4 generics_0.1.3
## [67] gtable_0.3.6 spatstat.data_3.1-9 tidyr_1.3.1
## [70] data.table_1.15.4 tidygraph_1.3.1 stringfish_0.18.0
## [73] utf8_1.2.4 spatstat.geom_3.7-0 RcppAnnoy_0.0.23
## [76] ggrepel_0.9.6 RANN_2.6.2 pillar_1.9.0
## [79] stringr_1.5.1 spam_2.11-3 RcppHNSW_0.6.0
## [82] later_1.4.6 splines_4.4.1 tweenr_2.0.3
## [85] lattice_0.22-6 survival_3.6-4 deldir_2.0-4
## [88] tidyselect_1.2.1 miniUI_0.1.2 pbapply_1.7-4
## [91] gridExtra_2.3 scattermore_1.2 xfun_0.56
## [94] graphlayouts_1.2.2 matrixStats_1.5.0 stringi_1.8.4
## [97] lazyeval_0.2.2 yaml_2.3.12 evaluate_1.0.5
## [100] codetools_0.2-20 tibble_3.3.1 cli_3.6.5
## [103] RcppParallel_5.1.8 uwot_0.2.4 xtable_1.8-4
## [106] reticulate_1.45.0 jquerylib_0.1.4 dichromat_2.0-0.1
## [109] Rcpp_1.0.12 globals_0.16.3 spatstat.random_3.4-4
## [112] png_0.1-8 spatstat.univar_3.1-6 parallel_4.4.1
## [115] dotCall64_1.2 listenv_0.9.1 scales_1.4.0
## [118] ggridges_0.5.6 purrr_1.0.2 rlang_1.1.7
## [121] cowplot_1.2.0