Overview

This script performs three linked analyses on the cleaned B-ALL leukaemia dataset (clusters 10 and 11 excluded from CNS):

LSK_IL7R vs LK_CLP population scoring — using bulk RNAseq-derived signatures (S4/BM, S5/CNS) to assign propagating population identity at the single-cell level
CNS signature scoring — scoring the clean DE-derived CNS and BM signatures back onto single cells using UCell, and cross-validating against a prior bulk RNAseq experiment (mir128a model)
CNS programme distribution — assessing whether the CNS transcriptional programme is concentrated in discrete subpopulations or distributed broadly across the CNS leukaemia compartment, and linking CNS-enriched genes to propagating population identity

Setup

library(Seurat)

## Loading required package: SeuratObject

## Loading required package: sp

## 
## Attaching package: 'SeuratObject'

## The following objects are masked from 'package:base':
## 
##     intersect, t

library(ggplot2)
library(patchwork)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(knitr)
library(qs2)

## qs2 0.1.7

library(viridis)

## Loading required package: viridisLite

library(tidyr)
library(tibble)
library(ggrepel)
library(scales)

## 
## Attaching package: 'scales'

## The following object is masked from 'package:viridis':
## 
##     viridis_pal

library(harmony)

## Loading required package: Rcpp

library(UCell)

tissue_cols  <- c("BM" = "steelblue", "CNS" = "firebrick")
source_cols  <- c("AD4_EM3"   = "#E69F00",
                  "AD2.2_EM4" = "#56B4E9",
                  "AD5_EM4"   = "#CC79A7")
pop_cols     <- c("LSK_IL7R"     = "#1a3a6b",
                  "LK_CLP"       = "#8b0000",
                  "Intermediate" = "grey75")

bm_res_col  <- "originalexp_snn_res.0.3"
cns_res_col <- "originalexp_snn_res.0.3"

# Ambient RNA blacklist — excluded from all scoring and interpretation
ambient_blacklist <- c("Slc1a2",  "Sparcl1", "Apod",  "Cpe",
                       "Mobp",    "S100b",   "Atp1a2","Ndrg2")

leuk_BM        <- qs_read("/exports/eddie/scratch/aduguid3/harmony_clustering/04_harmony_recluster_leuk_BM_march26.qs")
leuk_CNS_clean <- qs_read("/exports/eddie/scratch/aduguid3/harmony_clustering/04_harmony_recluster_leuk_CNS_clean_march26.qs")

stopifnot("leuk_BM not found"        = exists("leuk_BM"))
stopifnot("leuk_CNS_clean not found" = exists("leuk_CNS_clean"))

# Load clean DE results from script 09
res_clean_df <- read.csv("/exports/eddie/scratch/aduguid3/Rmarkdown/04_DE_CNSvsBM_clean.csv")
stopifnot("res_clean_df not found"   = exists("res_clean_df"))

Idents(leuk_BM)        <- bm_res_col
Idents(leuk_CNS_clean) <- cns_res_col

cat("BM  cells:", ncol(leuk_BM),        "\n")

## BM  cells: 44606

cat("CNS cells:", ncol(leuk_CNS_clean), "\n")

## CNS cells: 47017

cat("DE genes loaded:", nrow(res_clean_df), "\n")

## DE genes loaded: 16195

Harmony Integration — Joint BM and CNS Clustering

Prior to population scoring, BM leukaemia cells and cleaned CNS leukaemia cells (clusters 10 and 11 excluded) are merged into a single object and re-embedded using Harmony integration. Harmony corrects for transcriptional differences attributable to transplant source (AD4_EM3, AD2.2_EM4, AD5_EM4) while preserving genuine biological variation, including tissue-specific transcriptional states and population identity.

This integration is performed at this stage — after contamination exclusion but before population scoring — to ensure that the UMAP embedding used for visualisation reflects the true transcriptional landscape of bona fide leukaemia cells. Clustering is performed in Harmony-corrected space using the same dimensionality (dims 1:25) and resolution (0.3) as the tissue-specific clustering in script 02. All downstream population scoring and signature analyses are performed on the tissue-specific objects (leuk_BM and leuk_CNS_clean) rather than the merged object, so that tissue-specific signatures are applied in their cognate compartment. The integrated leuk_all object is used for joint visualisation only.

harmony_path <- "/exports/eddie/scratch/aduguid3/harmony_clustering/05_leuk_all_harmony.qs"

if (!file.exists(harmony_path)) {
  
  leuk_all <- merge(leuk_BM, leuk_CNS_clean,
                     add.cell.ids = c("BM", "CNS"))
  
  cat("Merged object:\n")
  cat("Total cells:", ncol(leuk_all), "\n")
  print(table(leuk_all$Tissue))
  print(table(leuk_all$Tissue, leuk_all$transplant_source))
  
  leuk_all <- FindVariableFeatures(leuk_all,
                                    selection.method = "vst",
                                    nfeatures        = 2000)
  leuk_all <- ScaleData(leuk_all, verbose = FALSE)
  leuk_all <- RunPCA(leuk_all, verbose = FALSE)
  
  leuk_all <- harmony::RunHarmony(leuk_all,
                                   group.by.vars  = "transplant_source",
                                   reduction      = "pca",
                                   reduction.save = "harmony",
                                   verbose        = TRUE)
  
  leuk_all <- RunUMAP(leuk_all,
                       reduction = "harmony",
                       dims      = 1:25)
  
  leuk_all <- FindNeighbors(leuk_all,
                             reduction = "harmony",
                             dims      = 1:25)
  
  leuk_all <- FindClusters(leuk_all, resolution = 0.3)
  
  qs_save(leuk_all, harmony_path)
  
} else {
  leuk_all <- qs_read(harmony_path)
}

# Define barcode prefix vectors — used in all subsequent transfer chunks
bm_cells  <- colnames(leuk_all)[grepl("^BM_",  colnames(leuk_all))]
cns_cells <- colnames(leuk_all)[grepl("^CNS_", colnames(leuk_all))]
bm_orig   <- sub("^BM_",  "", bm_cells)
cns_orig  <- sub("^CNS_", "", cns_cells)

cat("Harmony integration complete\n")

## Harmony integration complete

cat("Cells:", ncol(leuk_all), "\n")

## Cells: 91623

p_tissue <- DimPlot(leuk_all,
                     group.by = "Tissue",
                     cols     = tissue_cols,
                     alpha    = 0.5) +
  ggtitle("All cells — by tissue")

p_source <- DimPlot(leuk_all,
                     group.by = "transplant_source",
                     cols     = source_cols,
                     alpha    = 0.5) +
  ggtitle("All cells — by transplant source")

p_tissue + p_source

Harmony-integrated UMAP — BM and CNS leukaemia

Integration Quality

Harmony integration successfully corrected for transplant source effects, with cells from all three sources (AD4_EM3, AD2.2_EM4, AD5_EM4) intermixing across shared transcriptional states in the integrated embedding. Critically, tissue identity was preserved as a major axis of variation — BM and CNS leukaemia cells occupied largely distinct regions of the UMAP despite source correction, confirming that the CNS vs BM transcriptional differences identified by pseudobulk DE are robust and represent the dominant biological signal in this dataset after transplant source effects are removed. This integrated embedding is used for all subsequent visualisation.

Part 1 — LSK_IL7R vs LK_CLP Population Scoring

Two propagating leukaemia populations have been characterised in this B-ALL model by bulk RNAseq: LSK_IL7R (Lin-Sca1+Kit+ IL-7 receptor positive) and LK_CLP (Lin-Sca1-Kit+ common lymphoid progenitor). To assign single-cell identities, the bulk-derived LSK_IL7R signature from Supplemental Table S4 (BM-derived, 35 genes, proliferation/cell cycle enriched) is scored onto both BM and CNS leukaemia cells using AddModuleScore. The resulting score distribution is used to assign population labels at the single-cell level, with cells scoring above threshold assigned as LSK_IL7R, cells scoring below threshold as LK_CLP, and cells in the intermediate range left unassigned.

All UMAP visualisations use the Harmony-integrated embedding from the merged leuk_all object, which corrects for transplant source effects while preserving genuine tissue-specific and population-specific transcriptional variation.

Define Signatures

# S4 — BM LSK_IL7R signature (proliferation/cell cycle enriched)
lsk_il7r_bm <- c("Cenpf",  "Kpna2",  "Ccnb1", "Plk1",   "Ube2c",
                  "Aspm",   "Cdk1",   "Ckap2", "Nuf2",   "Kif18b",
                  "Tpx2",   "Tubb4b", "Mki67", "Hmmr",   "Cdc20",
                  "Cdc25b", "Kifc1",  "Nusap1","Cenpa",  "Nek2",
                  "Kif14",  "Ccnb2",  "Prc1",  "Kif4",   "Aurka",
                  "Top2a",  "Dlgap5", "Kif11", "Gas2l3", "Jpt1",
                  "Incenp", "Cenpe",  "Kif20b","Knl1",   "Arl6ip1")

# S5 — CNS LSK_IL7R signature
lsk_il7r_cns <- c("Tshz2", "Ube2c", "Ptprm", "Muc5ac")

# Filter to available genes in each object
bm_sig_found  <- lsk_il7r_bm[lsk_il7r_bm   %in% rownames(leuk_BM)]
cns_sig_found <- lsk_il7r_cns[lsk_il7r_cns %in% rownames(leuk_CNS_clean)]

cat("BM  signature genes found:", length(bm_sig_found),
    "of", length(lsk_il7r_bm), "\n")

## BM  signature genes found: 35 of 35

cat("CNS signature genes found:", length(cns_sig_found),
    "of", length(lsk_il7r_cns), "\n")

## CNS signature genes found: 4 of 4

Add Module Scores — Bulk-Derived Signatures

Module scores are computed per cell using AddModuleScore, which calculates the mean expression of the signature genes relative to a set of randomly sampled control genes of matched expression level. Scores above zero indicate relative enrichment of the signature in that cell. The S4 signature is applied to both BM and CNS objects to allow cross-tissue comparison of population proportions.

# BM object — S4 signature
leuk_BM <- AddModuleScore(leuk_BM,
                           features = list(bm_sig_found),
                           name     = "LSK_IL7R_BM",
                           seed     = 42)

# CNS object — S4 signature (cross-tissue comparison)
leuk_CNS_clean <- AddModuleScore(leuk_CNS_clean,
                                  features = list(bm_sig_found),
                                  name     = "LSK_IL7R_BM",
                                  seed     = 42)

# CNS object — S5 signature (CNS-specific)
leuk_CNS_clean <- AddModuleScore(leuk_CNS_clean,
                                  features = list(cns_sig_found),
                                  name     = "LSK_IL7R_CNS",
                                  seed     = 42)

cat("Score ranges — BM S4 signature:\n")

## Score ranges — BM S4 signature:

cat("  BM  object:", round(range(leuk_BM$LSK_IL7R_BM1),        3), "\n")

##   BM  object: -0.522 1.879

cat("  CNS object:", round(range(leuk_CNS_clean$LSK_IL7R_BM1), 3), "\n")

##   CNS object: -0.531 1.368

Score Distribution on UMAP

LSK_IL7R signature scores are visualised on the Harmony-integrated UMAP to confirm that scoring captures spatially coherent transcriptional populations rather than diffuse noise.

lsk_scores <- c(
  setNames(leuk_BM$LSK_IL7R_BM1[match(bm_orig,  colnames(leuk_BM))],        bm_cells),
  setNames(leuk_CNS_clean$LSK_IL7R_BM1[match(cns_orig, colnames(leuk_CNS_clean))], cns_cells)
)

leuk_all <- AddMetaData(leuk_all, metadata = lsk_scores, col.name = "LSK_IL7R_BM1")

leuk_all_bm  <- subset(leuk_all, subset = Tissue == "BM")
leuk_all_cns <- subset(leuk_all, subset = Tissue == "CNS")

cat("LSK_IL7R_BM1 transferred — NAs:", sum(is.na(leuk_all$LSK_IL7R_BM1)), "\n")

## LSK_IL7R_BM1 transferred — NAs: 0

p_bm_score <- FeaturePlot(leuk_all_bm,
                            features   = "LSK_IL7R_BM1",
                            cols       = c("lightgrey", "#1a3a6b"),
                            order      = TRUE,
                            min.cutoff = "q10") +
  ggtitle("BM — LSK_IL7R score (S4)")

p_cns_score <- FeaturePlot(leuk_all_cns,
                             features   = "LSK_IL7R_BM1",
                             cols       = c("lightgrey", "#8b0000"),
                             order      = TRUE,
                             min.cutoff = "q10") +
  ggtitle("CNS — LSK_IL7R score (S4)")

p_bm_score + p_cns_score

LSK_IL7R S4 signature score on Harmony-integrated UMAP

Score Distribution by Cluster — Violin Plots

Review the violin plots below before adjusting the population assignment thresholds. The S4 signature is proliferation-enriched — LSK_IL7R cells are expected to score positively and LK_CLP cells negatively.

v_bm <- VlnPlot(leuk_BM,
                 features  = "LSK_IL7R_BM1",
                 group.by  = bm_res_col,
                 pt.size   = 0,
                 cols      = viridis(length(unique(
                               leuk_BM@meta.data[[bm_res_col]])))) +
  ggtitle("BM — LSK_IL7R score by cluster") +
  geom_hline(yintercept = 0, linetype = "dashed", colour = "grey40") +
  theme(legend.position = "none",
        axis.text.x = element_text(angle = 45, hjust = 1))

v_cns <- VlnPlot(leuk_CNS_clean,
                  features  = "LSK_IL7R_BM1",
                  group.by  = cns_res_col,
                  pt.size   = 0,
                  cols      = viridis(length(unique(
                                leuk_CNS_clean@meta.data[[cns_res_col]])))) +
  ggtitle("CNS — LSK_IL7R score by cluster") +
  geom_hline(yintercept = 0, linetype = "dashed", colour = "grey40") +
  theme(legend.position = "none",
        axis.text.x = element_text(angle = 45, hjust = 1))

v_bm / v_cns

LSK_IL7R score distribution by cluster

Assign Population Labels

# !! Adjust thresholds after reviewing violin plots above !!
score_high <-  0.2    # above = LSK_IL7R
score_low  <- -0.1    # below = LK_CLP

assign_pop <- function(score, high = score_high, low = score_low) {
  case_when(
    score >  high ~ "LSK_IL7R",
    score <  low  ~ "LK_CLP",
    TRUE          ~ "Intermediate"
  )
}

leuk_BM$population        <- assign_pop(leuk_BM$LSK_IL7R_BM1)
leuk_CNS_clean$population <- assign_pop(leuk_CNS_clean$LSK_IL7R_BM1)

cat("BM population assignments:\n")

## BM population assignments:

print(table(leuk_BM$population))

## 
## Intermediate       LK_CLP     LSK_IL7R 
##         5133        29528         9945

cat("\nBM population as % of total:\n")

## 
## BM population as % of total:

print(round(prop.table(table(leuk_BM$population)) * 100, 1))

## 
## Intermediate       LK_CLP     LSK_IL7R 
##         11.5         66.2         22.3

cat("\nCNS population assignments:\n")

## 
## CNS population assignments:

print(table(leuk_CNS_clean$population))

## 
## Intermediate       LK_CLP     LSK_IL7R 
##         5063        31012        10942

cat("\nCNS population as % of total:\n")

## 
## CNS population as % of total:

print(round(prop.table(table(leuk_CNS_clean$population)) * 100, 1))

## 
## Intermediate       LK_CLP     LSK_IL7R 
##         10.8         66.0         23.3

Transfer Population Labels to Harmony Object

Population labels assigned on the tissue-specific objects are transferred to leuk_all for display on the Harmony-integrated UMAP.

pop_vec <- c(
  setNames(leuk_BM$population[match(bm_orig, colnames(leuk_BM))], bm_cells),
  setNames(leuk_CNS_clean$population[match(cns_orig, colnames(leuk_CNS_clean))], cns_cells)
)

leuk_all <- AddMetaData(leuk_all, metadata = pop_vec, col.name = "population")

cat("Population transferred — NAs:", sum(is.na(leuk_all$population)), "\n")

## Population transferred — NAs: 0

print(table(leuk_all$population, useNA = "ifany"))

## 
## Intermediate       LK_CLP     LSK_IL7R 
##        10196        60540        20887

leuk_all_bm  <- subset(leuk_all, subset = Tissue == "BM")
leuk_all_cns <- subset(leuk_all, subset = Tissue == "CNS")

Population Labels on UMAP

p_bm_pop <- DimPlot(leuk_all_bm,
                     group.by   = "population",
                     cols       = pop_cols,
                     label      = TRUE,
                     label.size = 5,
                     repel      = TRUE) +
  ggtitle("BM — LK_CLP vs LSK_IL7R") +
  NoLegend()

p_cns_pop <- DimPlot(leuk_all_cns,
                      group.by   = "population",
                      cols       = pop_cols,
                      label      = TRUE,
                      label.size = 5,
                      repel      = TRUE) +
  ggtitle("CNS — LK_CLP vs LSK_IL7R") +
  NoLegend()

p_split <- DimPlot(leuk_all,
                    group.by   = "population",
                    cols       = pop_cols,
                    split.by   = "Tissue",
                    label      = TRUE,
                    label.size = 4,
                    repel      = TRUE) +
  ggtitle("LK_CLP vs LSK_IL7R — Harmony integrated") +
  theme(legend.position = "bottom")

p_bm_pop + p_cns_pop

LK_CLP and LSK_IL7R population labels on Harmony-integrated UMAP

p_split

$LK_CLP and LSK_IL7R population labels on Harmony-integrated UMAP$

LK_CLP and LSK_IL7R population labels on Harmony-integrated UMAP

Population Proportions by Cluster and Tissue

pop_cluster <- bind_rows(
  leuk_BM@meta.data %>%
    select(Tissue, cluster = all_of(bm_res_col), population),
  leuk_CNS_clean@meta.data %>%
    select(Tissue, cluster = all_of(cns_res_col), population)
) %>%
  group_by(Tissue, cluster, population) %>%
  summarise(n = n(), .groups = "drop") %>%
  group_by(Tissue, cluster) %>%
  mutate(prop = n / sum(n))

ggplot(pop_cluster,
       aes(x = factor(cluster), y = prop, fill = population)) +
  geom_bar(stat = "identity") +
  scale_fill_manual(values = pop_cols) +
  facet_wrap(~Tissue, scales = "free_x") +
  theme_classic(base_size = 11) +
  labs(x     = "Cluster",
       y     = "Proportion",
       title = "Population composition by cluster and tissue") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Population composition by cluster in BM and CNS

pop_cluster %>%
  filter(Tissue == "CNS") %>%
  select(cluster, population, n, proportion = prop) %>%
  mutate(proportion = round(proportion, 3)) %>%
  pivot_wider(names_from  = population,
              values_from = c(n, proportion),
              values_fill = 0) %>%
  kable(caption = "CNS cluster population composition")

## Adding missing grouping variables: `Tissue`

CNS cluster population composition
Tissue	cluster	n_Intermediate	n_LK_CLP	n_LSK_IL7R	proportion_Intermediate	proportion_LK_CLP	proportion_LSK_IL7R
CNS	0	25	11814	0	0.002	0.998	0.000
CNS	1	235	8424	0	0.027	0.973	0.000
CNS	2	129	7916	0	0.016	0.984	0.000
CNS	3	1608	154	4549	0.255	0.024	0.721
CNS	4	1041	157	3089	0.243	0.037	0.721
CNS	5	796	766	1433	0.266	0.256	0.478
CNS	6	721	420	1134	0.317	0.185	0.498
CNS	7	459	838	704	0.229	0.419	0.352
CNS	8	38	300	31	0.103	0.813	0.084
CNS	9	11	223	2	0.047	0.945	0.008

Part 2 — CNS Signature Scoring

The clean pseudobulk DE CNS signature (padj < 0.05, log2FC > 0.5, ambient blacklist removed) is scored back onto individual cells in both BM and CNS objects using UCell, which provides more robust per-cell scoring than AddModuleScore by using rank-based statistics rather than mean expression. Both the CNS signature and the reciprocal BM signature are scored onto all cells, allowing each cell to be positioned on a CNS-vs-BM transcriptional axis. All UMAP visualisations use the Harmony-integrated embedding.

Prepare CNS and BM Signatures

# CNS signature — genes enriched in CNS vs BM (clean DE, ambient blacklist removed)
cns_sig_genes <- res_clean_df %>%
  filter(padj < 0.05,
         log2FoldChange > 0.5,
         !gene %in% ambient_blacklist) %>%
  arrange(desc(log2FoldChange)) %>%
  pull(gene)

# BM signature — genes enriched in BM vs CNS
bm_sig_genes <- res_clean_df %>%
  filter(padj < 0.05,
         log2FoldChange < -0.5,
         !gene %in% ambient_blacklist) %>%
  arrange(log2FoldChange) %>%
  pull(gene)

# Filter to available genes in each object
cns_sig_bm  <- cns_sig_genes[cns_sig_genes %in% rownames(leuk_BM)]
cns_sig_cns <- cns_sig_genes[cns_sig_genes %in% rownames(leuk_CNS_clean)]
bm_sig_bm   <- bm_sig_genes[bm_sig_genes   %in% rownames(leuk_BM)]
bm_sig_cns  <- bm_sig_genes[bm_sig_genes   %in% rownames(leuk_CNS_clean)]

cat("CNS signature genes:", length(cns_sig_genes), "total\n")

## CNS signature genes: 59 total

cat("  Found in BM  object:", length(cns_sig_bm),  "\n")

##   Found in BM  object: 59

cat("  Found in CNS object:", length(cns_sig_cns), "\n")

##   Found in CNS object: 59

cat("\nBM signature genes:", length(bm_sig_genes), "total\n")

## 
## BM signature genes: 95 total

cat("  Found in BM  object:", length(bm_sig_bm),  "\n")

##   Found in BM  object: 95

cat("  Found in CNS object:", length(bm_sig_cns), "\n")

##   Found in CNS object: 95

Score CNS and BM Signatures on All Cells

# Note: UCell scoring was tested but produced identical dynamic range compression
# to AddModuleScore for these gene set sizes. AddModuleScore is retained for
# consistency with the LSK_IL7R population scoring approach used in Part 1.

leuk_BM <- AddModuleScore(leuk_BM,
                           features = list(cns_sig_bm),
                           name     = "CNS_sig",
                           seed     = 42)

leuk_CNS_clean <- AddModuleScore(leuk_CNS_clean,
                                  features = list(cns_sig_cns),
                                  name     = "CNS_sig",
                                  seed     = 42)

leuk_BM <- AddModuleScore(leuk_BM,
                           features = list(bm_sig_bm),
                           name     = "BM_sig",
                           seed     = 42)

leuk_CNS_clean <- AddModuleScore(leuk_CNS_clean,
                                  features = list(bm_sig_cns),
                                  name     = "BM_sig",
                                  seed     = 42)

# AddModuleScore appends "1" to the name — CNS_sig1, BM_sig1
cat("CNS signature score ranges:\n")

## CNS signature score ranges:

cat("  BM  cells:", round(range(leuk_BM$CNS_sig1),        3), "\n")

##   BM  cells: -0.053 0.108

cat("  CNS cells:", round(range(leuk_CNS_clean$CNS_sig1), 3), "\n")

##   CNS cells: -0.066 0.191

cat("\nBM signature score ranges:\n")

## 
## BM signature score ranges:

cat("  BM  cells:", round(range(leuk_BM$BM_sig1),         3), "\n")

##   BM  cells: -0.078 0.173

cat("  CNS cells:", round(range(leuk_CNS_clean$BM_sig1),  3), "\n")

##   CNS cells: -0.062 0.11

Transfer Signature Scores to Harmony Object

cns_sig_vec <- c(
  setNames(leuk_BM$CNS_sig1[match(bm_orig, colnames(leuk_BM))], bm_cells),
  setNames(leuk_CNS_clean$CNS_sig1[match(cns_orig, colnames(leuk_CNS_clean))], cns_cells)
)
bm_sig_vec <- c(
  setNames(leuk_BM$BM_sig1[match(bm_orig, colnames(leuk_BM))], bm_cells),
  setNames(leuk_CNS_clean$BM_sig1[match(cns_orig, colnames(leuk_CNS_clean))], cns_cells)
)

leuk_all <- AddMetaData(leuk_all, metadata = cns_sig_vec, col.name = "CNS_sig1")
leuk_all <- AddMetaData(leuk_all, metadata = bm_sig_vec,  col.name = "BM_sig1")

leuk_all_bm  <- subset(leuk_all, subset = Tissue == "BM")
leuk_all_cns <- subset(leuk_all, subset = Tissue == "CNS")

cat("CNS_sig1 NAs:", sum(is.na(leuk_all$CNS_sig1)), "\n")

## CNS_sig1 NAs: 0

cat("BM_sig1  NAs:", sum(is.na(leuk_all$BM_sig1)),  "\n")

## BM_sig1  NAs: 0

Cluster Summary — CNS vs BM Signature Delta

cns_cluster_summary <- leuk_CNS_clean@meta.data %>%
  group_by(cluster = .data[[cns_res_col]]) %>%
  summarise(
    n_cells         = n(),
    mean_CNS_score  = round(mean(CNS_sig1), 4),
    mean_BM_score   = round(mean(BM_sig1),  4),
    pct_CNS_pos     = round(mean(CNS_sig1 > 0) * 100, 1),
    pct_BM_pos      = round(mean(BM_sig1  > 0) * 100, 1),
    CNS_vs_BM_delta = round(mean(CNS_sig1) - mean(BM_sig1), 4),
    .groups         = "drop"
  ) %>%
  arrange(desc(CNS_vs_BM_delta))

kable(cns_cluster_summary,
      caption = "CNS vs BM signature delta by cluster — ranked by CNS enrichment")

CNS vs BM signature delta by cluster — ranked by CNS enrichment
cluster	n_cells	mean_CNS_score	mean_BM_score	pct_CNS_pos	pct_BM_pos	CNS_vs_BM_delta
0	11839	0.0007	-0.0165	46.9	14.8	0.0173
8	369	-0.0084	-0.0224	28.7	10.0	0.0140
5	2995	-0.0045	-0.0141	38.2	13.7	0.0096
3	6311	-0.0122	-0.0207	25.4	5.2	0.0085
7	2001	-0.0054	-0.0121	38.4	24.1	0.0066
9	236	-0.0087	-0.0153	33.1	17.4	0.0066
4	4287	-0.0157	-0.0196	18.7	5.8	0.0039
1	8659	-0.0113	-0.0149	26.8	13.9	0.0036
6	2275	-0.0114	-0.0133	26.0	14.8	0.0019
2	8045	-0.0146	-0.0126	21.7	17.5	-0.0020

CNS Signature Distribution — Cluster and Threshold Independence

To determine whether the conserved CNS transcriptional programme was concentrated in a discrete leukaemia subpopulation, CNS cells were stratified by bulk CNS signature score at both the 75th and 90th percentile thresholds. At both thresholds, high-scoring cells were uniformly distributed across all ten CNS leukaemia clusters (range 20–31% per cluster at q75; similar distribution at q90), with no cluster showing significant enrichment. This threshold independence confirms the finding is not an artefact of score cutoff selection.

The uniform distribution indicates that the conserved CNS transcriptional programme is a compartment-wide property of CNS leukaemia rather than a feature restricted to a discrete subpopulation. Every CNS cluster contains cells expressing the cross-validated CNS signature at similar frequencies, suggesting that CNS engraftment imposes a broad transcriptional adaptation that is shared across leukaemia cell states rather than driving the emergence of a specialised CNS-adapted clone.

CNS Signature Scoring — Assessment and Revised Strategy

Scoring of the pseudobulk-derived CNS signature onto individual cells produced compressed score distributions with CNS-vs-BM deltas of at most 0.017 across all ten CNS clusters. No cluster showed a clearly BM-like transcriptional profile, and the separation between the most and least CNS-enriched clusters was insufficient to support a meaningful high-CNS vs low-CNS contrast at the pseudobulk level.

This compression is a known limitation of module scoring when applied to transcriptionally homogeneous populations. Critically, the uniformity of CNS signature scores across clusters is itself a biologically meaningful finding: CNS leukaemia cells do not segregate into discrete CNS-adapted and non-adapted subpopulations at this resolution, suggesting that the CNS transcriptional programme is broadly shared across the engrafted leukaemia compartment rather than restricted to a specialised subset.

Given this, the pseudobulk CNS vs BM differential expression analysis from script 04 remains the primary and most statistically robust characterisation of the CNS leukaemia transcriptional signature. Downstream analyses focus on two questions: cross-validation of this signature against a prior bulk RNAseq experiment, and whether CNS-enriched genes are differentially expressed between the LSK_IL7R and LK_CLP propagating populations within the CNS compartment.

Part 3 — Bulk RNAseq Signature Validation

To validate the single-cell CNS vs BM transcriptional differences identified in this dataset, DE genes from a prior bulk RNAseq experiment (mir128a model, padj < 0.05) are scored onto individual cells. Concordance between the bulk and single-cell signatures would confirm that the CNS-specific transcriptional programme identified here is reproducible across experimental systems and is not a feature unique to this dataset. Direction confirmed: positive log2FC in the bulk data = CNS-enriched.

Load and Prepare Bulk Signatures

bulk_de <- read.csv("/exports/eddie/scratch/aduguid3/de_genes_mir128a_sig_0.05.csv") %>%
  filter(!is.na(symbol), symbol != "NA") %>%
  rename(gene = symbol)

cat("Bulk DE genes loaded:", nrow(bulk_de), "\n")

## Bulk DE genes loaded: 212

cat("CNS-enriched (positive log2FC):", sum(bulk_de$log2FoldChange > 0), "\n")

## CNS-enriched (positive log2FC): 81

cat("BM-enriched  (negative log2FC):", sum(bulk_de$log2FoldChange < 0), "\n")

## BM-enriched  (negative log2FC): 131

bulk_cns_genes <- bulk_de %>%
  filter(log2FoldChange > 0) %>%
  arrange(desc(log2FoldChange)) %>%
  pull(gene)

bulk_bm_genes <- bulk_de %>%
  filter(log2FoldChange < 0) %>%
  arrange(log2FoldChange) %>%
  pull(gene)

# Filter to genes present in single-cell objects
bulk_cns_bm  <- bulk_cns_genes[bulk_cns_genes %in% rownames(leuk_BM)]
bulk_cns_cns <- bulk_cns_genes[bulk_cns_genes %in% rownames(leuk_CNS_clean)]
bulk_bm_bm   <- bulk_bm_genes[bulk_bm_genes   %in% rownames(leuk_BM)]
bulk_bm_cns  <- bulk_bm_genes[bulk_bm_genes   %in% rownames(leuk_CNS_clean)]

cat("\nGenes found in single-cell objects:\n")

## 
## Genes found in single-cell objects:

cat("  Bulk CNS sig — BM  object:", length(bulk_cns_bm),  "\n")

##   Bulk CNS sig — BM  object: 77

cat("  Bulk CNS sig — CNS object:", length(bulk_cns_cns), "\n")

##   Bulk CNS sig — CNS object: 77

cat("  Bulk BM  sig — BM  object:", length(bulk_bm_bm),   "\n")

##   Bulk BM  sig — BM  object: 129

cat("  Bulk BM  sig — CNS object:", length(bulk_bm_cns),  "\n")

##   Bulk BM  sig — CNS object: 129

Concordance With Single-Cell DE Signature

sc_cns_genes <- res_clean_df %>%
  filter(padj < 0.05, log2FoldChange > 0.5, !gene %in% ambient_blacklist) %>%
  pull(gene)

sc_bm_genes <- res_clean_df %>%
  filter(padj < 0.05, log2FoldChange < -0.5, !gene %in% ambient_blacklist) %>%
  pull(gene)

cns_overlap <- intersect(bulk_cns_genes, sc_cns_genes)
bm_overlap  <- intersect(bulk_bm_genes,  sc_bm_genes)

cat("CNS signature overlap (bulk ∩ single-cell):", length(cns_overlap), "genes\n")

## CNS signature overlap (bulk ∩ single-cell): 12 genes

cat("  Bulk CNS genes:       ", length(bulk_cns_genes), "\n")

##   Bulk CNS genes:        81

cat("  Single-cell CNS genes:", length(sc_cns_genes),   "\n")

##   Single-cell CNS genes: 59

cat("  Jaccard:              ",
    round(length(cns_overlap) /
            length(union(bulk_cns_genes, sc_cns_genes)), 3), "\n")

##   Jaccard:               0.094

cat("\nBM signature overlap (bulk ∩ single-cell):", length(bm_overlap), "genes\n")

## 
## BM signature overlap (bulk ∩ single-cell): 11 genes

cat("  Jaccard:",
    round(length(bm_overlap) /
            length(union(bulk_bm_genes, sc_bm_genes)), 3), "\n")

##   Jaccard: 0.051

cat("\nOverlapping CNS genes:", paste(cns_overlap, collapse = ", "), "\n")

## 
## Overlapping CNS genes: Ctla4, Gm6093, Tdrd5, Dntt, Grik1, Greb1, Tmem151b, Dsg1c, Pax7, Nectin1, Soat1, Cd96

cat("Overlapping BM  genes:", paste(bm_overlap,  collapse = ", "), "\n")

## Overlapping BM  genes: Rhob, Cdkn1a, Klf11, Fgf3, Rab44, Gas7, Vim, Lgals1, Lmna, Cldn10, Trpm1

Conserved Genes Across Both Experimental Systems

conserved_cns <- data.frame(gene = cns_overlap) %>%
  left_join(bulk_de      %>% select(gene, bulk_lfc = log2FoldChange), by = "gene") %>%
  left_join(res_clean_df %>% select(gene, sc_lfc   = log2FoldChange,
                                         sc_padj   = padj), by = "gene") %>%
  arrange(desc(bulk_lfc)) %>%
  mutate(across(where(is.numeric), ~round(., 3)))

kable(conserved_cns,
      caption = "CNS-enriched genes conserved across bulk (mir128a) and single-cell datasets")

CNS-enriched genes conserved across bulk (mir128a) and single-cell datasets
gene	bulk_lfc	sc_lfc	sc_padj
Ctla4	3.889	1.876	0.037
Gm6093	2.105	4.069	0.000
Tdrd5	0.988	1.325	0.044
Dntt	0.958	1.291	0.005
Grik1	0.949	1.628	0.008
Greb1	0.832	0.797	0.000
Tmem151b	0.811	1.451	0.026
Dsg1c	0.669	0.714	0.031
Pax7	0.642	0.893	0.010
Nectin1	0.608	0.665	0.044
Soat1	0.594	0.690	0.000
Cd96	0.572	0.585	0.000

conserved_bm <- data.frame(gene = bm_overlap) %>%
  left_join(bulk_de      %>% select(gene, bulk_lfc = log2FoldChange), by = "gene") %>%
  left_join(res_clean_df %>% select(gene, sc_lfc   = log2FoldChange,
                                         sc_padj   = padj), by = "gene") %>%
  arrange(bulk_lfc) %>%
  mutate(across(where(is.numeric), ~round(., 3)))

kable(conserved_bm,
      caption = "BM-enriched genes conserved across bulk (mir128a) and single-cell datasets")

BM-enriched genes conserved across bulk (mir128a) and single-cell datasets
gene	bulk_lfc	sc_lfc	sc_padj
Rhob	-1.530	-2.320	0.012
Cdkn1a	-1.361	-1.345	0.000
Klf11	-1.078	-1.343	0.000
Fgf3	-0.873	-0.676	0.000
Rab44	-0.800	-0.758	0.001
Gas7	-0.713	-0.648	0.002
Vim	-0.671	-0.574	0.000
Lgals1	-0.651	-0.579	0.000
Lmna	-0.630	-0.616	0.000
Cldn10	-0.576	-0.582	0.000
Trpm1	-0.527	-0.642	0.000

Cross-Experiment Validation — Conserved BM-Enriched Signature

Eleven BM-enriched genes were identified in both the bulk RNAseq mir128a dataset and the single-cell pseudobulk analysis, representing the highest-confidence BM-specific leukaemia transcriptional features across independent experimental systems. Log2 fold changes were strikingly concordant between datasets for all 11 genes, with all showing consistent BM-enrichment in both bulk and single-cell comparisons. The conserved BM signature includes the cell cycle inhibitor Cdkn1a (p21), the Rho GTPase Rhob, the transcription factor Klf11, and the immunosuppressive lectin Lgals1, suggesting that BM-resident leukaemia cells are characterised by greater cell cycle restraint, cytoskeletal remodelling, and immunosuppressive signalling relative to CNS-engrafted cells — features consistent with established properties of the BM niche.

Score Conserved Signatures onto Single Cells

The full bulk signatures showed modest overlap with single-cell DE results (Jaccard CNS = 0.094, BM = 0.051), consistent with differences between model systems and cell populations profiled. Scoring is therefore restricted to the conserved gene sets validated in both experiments, representing the highest-confidence cross-experiment features.

# Guard — recreate overlap vectors if not in session
if (!exists("cns_overlap")) {
  cns_overlap <- intersect(
    bulk_de %>% filter(log2FoldChange > 0) %>% pull(gene), sc_cns_genes)
  bm_overlap  <- intersect(
    bulk_de %>% filter(log2FoldChange < 0) %>% pull(gene), sc_bm_genes)
  cat("Overlap vectors recreated — CNS:", length(cns_overlap),
      "/ BM:", length(bm_overlap), "\n")
}

bulk_cns_conserved <- cns_overlap  # 12 genes
bulk_bm_conserved  <- bm_overlap   # 11 genes

bulk_cns_conserved_bm  <- bulk_cns_conserved[bulk_cns_conserved %in% rownames(leuk_BM)]
bulk_cns_conserved_cns <- bulk_cns_conserved[bulk_cns_conserved %in% rownames(leuk_CNS_clean)]
bulk_bm_conserved_bm   <- bulk_bm_conserved[bulk_bm_conserved   %in% rownames(leuk_BM)]
bulk_bm_conserved_cns  <- bulk_bm_conserved[bulk_bm_conserved   %in% rownames(leuk_CNS_clean)]

cat("Conserved CNS genes — BM:", length(bulk_cns_conserved_bm),
    "/ CNS:", length(bulk_cns_conserved_cns), "\n")

## Conserved CNS genes — BM: 12 / CNS: 12

cat("Conserved BM  genes — BM:", length(bulk_bm_conserved_bm),
    "/ CNS:", length(bulk_bm_conserved_cns), "\n")

## Conserved BM  genes — BM: 11 / CNS: 11

leuk_BM <- AddModuleScore(leuk_BM,
                           features = list(bulk_cns_conserved_bm),
                           name     = "bulk_CNS_sig",
                           seed     = 42)

leuk_CNS_clean <- AddModuleScore(leuk_CNS_clean,
                                  features = list(bulk_cns_conserved_cns),
                                  name     = "bulk_CNS_sig",
                                  seed     = 42)

leuk_BM <- AddModuleScore(leuk_BM,
                           features = list(bulk_bm_conserved_bm),
                           name     = "bulk_BM_sig",
                           seed     = 42)

leuk_CNS_clean <- AddModuleScore(leuk_CNS_clean,
                                  features = list(bulk_bm_conserved_cns),
                                  name     = "bulk_BM_sig",
                                  seed     = 42)

cat("\nConserved CNS signature score ranges:\n")

## 
## Conserved CNS signature score ranges:

cat("  BM  cells:", round(range(leuk_BM$bulk_CNS_sig1),        3), "\n")

##   BM  cells: -0.104 0.453

cat("  CNS cells:", round(range(leuk_CNS_clean$bulk_CNS_sig1), 3), "\n")

##   CNS cells: -0.158 0.468

cat("\nConserved BM signature score ranges:\n")

## 
## Conserved BM signature score ranges:

cat("  BM  cells:", round(range(leuk_BM$bulk_BM_sig1),         3), "\n")

##   BM  cells: -0.345 0.701

cat("  CNS cells:", round(range(leuk_CNS_clean$bulk_BM_sig1),  3), "\n")

##   CNS cells: -0.346 0.723

# Sanity check — BM cells should score higher on BM sig, CNS cells on CNS sig
bind_rows(
  leuk_BM@meta.data        %>% summarise(Tissue       = "BM",
                                          mean_CNS_sig = round(mean(bulk_CNS_sig1), 4),
                                          mean_BM_sig  = round(mean(bulk_BM_sig1),  4)),
  leuk_CNS_clean@meta.data %>% summarise(Tissue       = "CNS",
                                          mean_CNS_sig = round(mean(bulk_CNS_sig1), 4),
                                          mean_BM_sig  = round(mean(bulk_BM_sig1),  4))
) %>% kable(caption = "Sanity check — mean conserved signature scores by tissue")

Sanity check — mean conserved signature scores by tissue
Tissue	mean_CNS_sig	mean_BM_sig
BM	-0.0038	0.0484
CNS	-0.0097	-0.0301

Bulk Signature Scores — BM vs CNS

Both conserved signatures showed statistically significant tissue separation at the single-cell level. The BM signature showed stronger effect size, with BM cells scoring substantially higher than CNS cells. The CNS signature showed more modest but significant separation, driven by a subset of highly scoring CNS cells visible as a right tail in the score distribution.

bulk_scores_tissue <- bind_rows(
  leuk_BM@meta.data %>%
    select(Tissue, bulk_CNS_sig1, bulk_BM_sig1),
  leuk_CNS_clean@meta.data %>%
    select(Tissue, bulk_CNS_sig1, bulk_BM_sig1)
) %>%
  pivot_longer(c(bulk_CNS_sig1, bulk_BM_sig1),
               names_to  = "signature",
               values_to = "score") %>%
  mutate(signature = recode(signature,
                             "bulk_CNS_sig1" = "Bulk CNS signature",
                             "bulk_BM_sig1"  = "Bulk BM signature"))

wt_cns <- wilcox.test(leuk_BM$bulk_CNS_sig1, leuk_CNS_clean$bulk_CNS_sig1)
wt_bm  <- wilcox.test(leuk_BM$bulk_BM_sig1,  leuk_CNS_clean$bulk_BM_sig1)

ggplot(bulk_scores_tissue,
       aes(x = Tissue, y = score, fill = Tissue)) +
  geom_violin(trim = FALSE, alpha = 0.8) +
  geom_boxplot(width = 0.1, fill = "white", outlier.size = 0.2) +
  scale_fill_manual(values = tissue_cols) +
  facet_wrap(~signature, scales = "free_y") +
  geom_hline(yintercept = 0, linetype = "dashed", colour = "grey40") +
  theme_classic(base_size = 12) +
  labs(y     = "Module score",
       title = paste0("Bulk RNAseq signatures by tissue\n",
                      "BM sig p = ", signif(wt_bm$p.value, 3),
                      "  |  CNS sig p = ", signif(wt_cns$p.value, 3))) +
  theme(legend.position = "none")

Conserved bulk RNAseq signature scores by tissue

Part 4 — Linking CNS Biology to Population Identity

Are CNS-Enriched Genes Differentially Expressed Between LSK_IL7R and LK_CLP?

The CNS-enriched gene set identified by pseudobulk DE (script 04) is interrogated for differential expression between the LSK_IL7R and LK_CLP propagating populations within the CNS compartment. Genes that are both CNS-enriched relative to BM and differentially expressed between populations represent the most biologically informative candidates — they are part of the CNS adaptation programme and are preferentially associated with one propagating population, directly connecting tissue-specific transcriptional reprogramming to leukaemia population biology.

cns_pop_path <- "/exports/eddie/scratch/aduguid3/harmony_clustering/05_de_cns_sig_by_population.qs"

if (!file.exists(cns_pop_path)) {

  leuk_cns_pop <- subset(leuk_CNS_clean,
                          subset = population != "Intermediate")

  Idents(leuk_cns_pop) <- "population"

  markers_cns_pop <- FindMarkers(
    leuk_cns_pop,
    ident.1         = "LSK_IL7R",
    ident.2         = "LK_CLP",
    features        = cns_sig_genes[cns_sig_genes %in% rownames(leuk_cns_pop)],
    test.use        = "wilcox",
    min.pct         = 0.05,
    logfc.threshold = 0
  ) %>%
    rownames_to_column("gene") %>%
    left_join(
      res_clean_df %>% select(gene,
                               cns_log2FC = log2FoldChange,
                               cns_padj   = padj),
      by = "gene"
    ) %>%
    arrange(p_val_adj)

  qs_save(markers_cns_pop, cns_pop_path)

} else {
  markers_cns_pop <- qs_read(cns_pop_path)
}

cat("CNS signature genes tested:", nrow(markers_cns_pop), "\n")

## CNS signature genes tested: 28

cat("Significant between populations (padj < 0.05):",
    sum(markers_cns_pop$p_val_adj < 0.05, na.rm = TRUE), "\n")

## Significant between populations (padj < 0.05): 19

markers_cns_pop %>%
  filter(p_val_adj < 0.05) %>%
  select(gene, avg_log2FC, pct.1, pct.2, p_val_adj, cns_log2FC) %>%
  mutate(across(where(is.numeric), ~round(., 4))) %>%
  arrange(p_val_adj) %>%
  kable(caption = "CNS-enriched genes that differ between LSK_IL7R and LK_CLP within CNS")

CNS-enriched genes that differ between LSK_IL7R and LK_CLP within CNS
gene	avg_log2FC	pct.1	pct.2	p_val_adj	cns_log2FC
Dsg1c	1.1574	0.060	0.019	0e+00	0.7137
Grik1	0.3566	0.138	0.076	0e+00	1.6282
Gm30230	0.7821	0.072	0.031	0e+00	0.5048
Egfem1	0.2575	0.169	0.097	0e+00	0.8184
Plaat3	-0.7237	0.410	0.419	0e+00	0.5200
Plpp3	0.5384	0.065	0.030	0e+00	0.5034
Gm44067	0.4729	0.069	0.034	0e+00	0.5263
Soat1	0.0388	0.390	0.273	0e+00	0.6900
Gpr83	0.0069	0.257	0.176	0e+00	0.8085
Pdzd2	0.0427	0.107	0.065	0e+00	0.5502
Dnm3os	-0.8819	0.224	0.241	0e+00	0.6839
Dntt	-0.0654	0.645	0.538	0e+00	1.2912
Stc1	-0.1145	0.064	0.039	0e+00	0.8787
Nectin1	0.2391	0.052	0.032	0e+00	0.6646
Fgd1	0.0356	0.062	0.041	0e+00	0.7965
Tdrd5	0.0270	0.070	0.047	0e+00	1.3246
Slc5a9	-0.2499	0.306	0.250	0e+00	0.5022
Greb1	-0.3496	0.076	0.057	0e+00	0.7967
Snn	-0.9659	0.063	0.076	6e-04	0.5038

CNS Signature Genes — Tissue Enrichment vs Population Difference

Each point represents a CNS-enriched gene. Position on the x-axis reflects how strongly it is enriched in CNS vs BM leukaemia (from the pseudobulk DE); position on the y-axis reflects whether it is higher in LSK_IL7R (positive) or LK_CLP (negative) within the CNS compartment. Genes in the upper-right quadrant are CNS-enriched and LSK_IL7R-high; genes in the lower-right are CNS-enriched and LK_CLP-high. Highlighted genes are significant between populations (padj < 0.05).

markers_cns_pop %>%
  mutate(
    direction = case_when(
      p_val_adj < 0.05 & avg_log2FC > 0 ~ "LSK_IL7R-high",
      p_val_adj < 0.05 & avg_log2FC < 0 ~ "LK_CLP-high",
      TRUE                               ~ "NS"
    ),
    label = ifelse(p_val_adj < 0.05 & abs(avg_log2FC) > 0.3, gene, NA)
  ) %>%
  ggplot(aes(x = cns_log2FC, y = avg_log2FC, colour = direction)) +
  geom_point(size = 1.5, alpha = 0.7) +
  ggrepel::geom_text_repel(aes(label = label),
                            size         = 2.5,
                            max.overlaps = 25,
                            box.padding  = 0.3) +
  scale_colour_manual(
    values = c("LSK_IL7R-high" = "#1a3a6b",
               "LK_CLP-high"   = "#8b0000",
               "NS"            = "grey70")) +
  geom_hline(yintercept = 0, linetype = "dashed", colour = "grey40") +
  geom_vline(xintercept = 0, linetype = "dashed", colour = "grey40") +
  theme_classic(base_size = 12) +
  labs(x      = "log2FC (CNS vs BM) — tissue enrichment",
       y      = "log2FC (LSK_IL7R vs LK_CLP) — population difference within CNS",
       title  = "CNS signature genes — tissue enrichment vs population difference",
       colour = NULL) +
  theme(legend.position = "bottom")

## Warning: Removed 19 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).

CNS signature genes — tissue enrichment vs population difference within CNS

Violin Plots — Top Population-Specific CNS Genes

top_lsk_cns <- markers_cns_pop %>%
  filter(p_val_adj < 0.05, avg_log2FC > 0) %>%
  head(4) %>% pull(gene)

top_clp_cns <- markers_cns_pop %>%
  filter(p_val_adj < 0.05, avg_log2FC < 0) %>%
  head(4) %>% pull(gene)

top_genes <- c(top_lsk_cns, top_clp_cns)

if (length(top_genes) > 0) {
  VlnPlot(leuk_CNS_clean,
           features = top_genes,
           group.by = "population",
           cols     = pop_cols,
           pt.size  = 0,
           ncol     = 4) &
    theme(legend.position = "none",
          axis.text.x = element_text(angle = 45, hjust = 1))
} else {
  cat("No significant population-specific CNS genes to plot\n")
}

Top CNS-enriched genes by population within CNS

Summary Table — Population-Specific CNS Genes by Direction

markers_cns_pop %>%
  filter(p_val_adj < 0.05) %>%
  mutate(population_enriched = ifelse(avg_log2FC > 0, "LSK_IL7R", "LK_CLP")) %>%
  group_by(population_enriched) %>%
  summarise(
    n_genes   = n(),
    top_genes = paste(head(gene, 5), collapse = ", "),
    .groups   = "drop"
  ) %>%
  kable(caption = "CNS-enriched genes significant between populations — summary by direction")

CNS-enriched genes significant between populations — summary by direction
population_enriched	n_genes	top_genes
LK_CLP	7	Plaat3, Dnm3os, Dntt, Stc1, Slc5a9
LSK_IL7R	12	Dsg1c, Grik1, Gm30230, Egfem1, Plpp3

write.csv(markers_cns_pop,
           "10_DE_cns_sig_by_population.csv",
           row.names = FALSE)

cat("Saved: 10_DE_cns_sig_by_population.csv\n")

## Saved: 10_DE_cns_sig_by_population.csv

Part 5 — Cell Cycle Analysis

Cell cycle phase assignment was performed on the parent leukaemia object (script 03) using Seurat’s CellCycleScoring with canonical S-phase and G2M-phase gene sets. Here, cell cycle phase distributions are compared across two axes: tissue compartment (BM vs CNS) and propagating population identity (LSK_IL7R vs LK_CLP). This is of particular biological interest because the S4 bulk signature used for population scoring is proliferation and cell cycle enriched — LSK_IL7R cells are expected to be more proliferative, and the CNS niche may impose differential cell cycle constraints relative to BM.

Confirm Cell Cycle Metadata Is Present

cc_cols_bm  <- c("S.Score", "G2M.Score", "Phase") %in% colnames(leuk_BM@meta.data)
cc_cols_cns <- c("S.Score", "G2M.Score", "Phase") %in% colnames(leuk_CNS_clean@meta.data)

cat("BM  — cell cycle columns present:", all(cc_cols_bm),  "\n")

## BM  — cell cycle columns present: TRUE

cat("CNS — cell cycle columns present:", all(cc_cols_cns), "\n")

## CNS — cell cycle columns present: TRUE

if (!all(cc_cols_bm) || !all(cc_cols_cns)) {
  cat("Cell cycle scores missing — rerunning CellCycleScoring\n")
  
  s_genes   <- stringr::str_to_title(tolower(cc.genes.updated.2019$s.genes))
  g2m_genes <- stringr::str_to_title(tolower(cc.genes.updated.2019$g2m.genes))
  
  leuk_BM <- CellCycleScoring(leuk_BM,
                                s.features   = s_genes,
                                g2m.features = g2m_genes,
                                set.ident    = FALSE)
  
  leuk_CNS_clean <- CellCycleScoring(leuk_CNS_clean,
                                      s.features   = s_genes,
                                      g2m.features = g2m_genes,
                                      set.ident    = FALSE)
  cat("CellCycleScoring complete\n")
}

cat("\nBM phase distribution:\n")

## 
## BM phase distribution:

print(round(prop.table(table(leuk_BM$Phase)) * 100, 1))

## 
##   G1  G2M    S 
## 45.3 22.5 32.2

cat("\nCNS phase distribution:\n")

## 
## CNS phase distribution:

print(round(prop.table(table(leuk_CNS_clean$Phase)) * 100, 1))

## 
##   G1  G2M    S 
## 46.0 24.0 30.1

BM vs CNS — Cell Cycle Phase Distribution

phase_cols <- c("G1" = "#4a90d9", "S" = "#e67e22", "G2M" = "#27ae60")

cc_tissue <- bind_rows(
  leuk_BM@meta.data        %>% select(Tissue, Phase, S.Score, G2M.Score),
  leuk_CNS_clean@meta.data %>% select(Tissue, Phase, S.Score, G2M.Score)
) %>%
  group_by(Tissue, Phase) %>%
  summarise(n = n(), .groups = "drop") %>%
  group_by(Tissue) %>%
  mutate(prop = n / sum(n))

ggplot(cc_tissue, aes(x = Tissue, y = prop, fill = Phase)) +
  geom_bar(stat = "identity") +
  scale_fill_manual(values = phase_cols) +
  scale_y_continuous(labels = scales::percent) +
  theme_classic(base_size = 12) +
  labs(x = NULL, y = "Proportion of cells",
       title = "Cell cycle phase distribution — BM vs CNS") +
  theme(legend.position = "right")

Cell cycle phase distribution — BM vs CNS

bind_rows(
  leuk_BM@meta.data        %>% select(Tissue, S.Score, G2M.Score),
  leuk_CNS_clean@meta.data %>% select(Tissue, S.Score, G2M.Score)
) %>%
  pivot_longer(c(S.Score, G2M.Score),
               names_to  = "score_type",
               values_to = "score") %>%
  ggplot(aes(x = Tissue, y = score, fill = Tissue)) +
  geom_violin(trim = FALSE, alpha = 0.8) +
  geom_boxplot(width = 0.1, fill = "white", outlier.size = 0.2) +
  scale_fill_manual(values = tissue_cols) +
  facet_wrap(~score_type, scales = "free_y") +
  geom_hline(yintercept = 0, linetype = "dashed", colour = "grey40") +
  theme_classic(base_size = 12) +
  labs(y = "Score", title = "Cell cycle scores — BM vs CNS") +
  theme(legend.position = "none")

S and G2M scores — BM vs CNS

wt_s   <- wilcox.test(leuk_BM$S.Score,   leuk_CNS_clean$S.Score)
wt_g2m <- wilcox.test(leuk_BM$G2M.Score, leuk_CNS_clean$G2M.Score)

bind_rows(
  leuk_BM@meta.data %>%
    summarise(Tissue   = "BM",
              mean_S   = round(mean(S.Score),           4),
              mean_G2M = round(mean(G2M.Score),         4),
              pct_S    = round(mean(Phase == "S")   * 100, 1),
              pct_G2M  = round(mean(Phase == "G2M") * 100, 1),
              pct_G1   = round(mean(Phase == "G1")  * 100, 1)),
  leuk_CNS_clean@meta.data %>%
    summarise(Tissue   = "CNS",
              mean_S   = round(mean(S.Score),           4),
              mean_G2M = round(mean(G2M.Score),         4),
              pct_S    = round(mean(Phase == "S")   * 100, 1),
              pct_G2M  = round(mean(Phase == "G2M") * 100, 1),
              pct_G1   = round(mean(Phase == "G1")  * 100, 1))
) %>%
  kable(caption = paste0(
    "Cell cycle summary by tissue — ",
    "S score p = ",   signif(wt_s$p.value,   3), ", ",
    "G2M score p = ", signif(wt_g2m$p.value, 3)
  ))

Cell cycle summary by tissue — S score p = 4.49e-17, G2M score p = 0.387
Tissue	mean_S	mean_G2M	pct_S	pct_G2M	pct_G1
BM	-0.0501	-0.0320	32.2	22.5	45.3
CNS	-0.0646	-0.0205	30.1	24.0	46.0

LSK_IL7R vs LK_CLP — Cell Cycle Phase Distribution

cc_pop <- bind_rows(
  leuk_BM@meta.data        %>% select(Tissue, population, Phase, S.Score, G2M.Score),
  leuk_CNS_clean@meta.data %>% select(Tissue, population, Phase, S.Score, G2M.Score)
) %>%
  filter(population != "Intermediate") %>%
  group_by(Tissue, population, Phase) %>%
  summarise(n = n(), .groups = "drop") %>%
  group_by(Tissue, population) %>%
  mutate(prop = n / sum(n))

ggplot(cc_pop, aes(x = population, y = prop, fill = Phase)) +
  geom_bar(stat = "identity") +
  scale_fill_manual(values = phase_cols) +
  scale_y_continuous(labels = scales::percent) +
  facet_wrap(~Tissue) +
  theme_classic(base_size = 12) +
  labs(x = NULL, y = "Proportion of cells",
       title = "Cell cycle phase by population and tissue") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Cell cycle phase distribution by population — BM and CNS

bind_rows(
  leuk_BM@meta.data        %>% select(Tissue, population, S.Score, G2M.Score),
  leuk_CNS_clean@meta.data %>% select(Tissue, population, S.Score, G2M.Score)
) %>%
  filter(population != "Intermediate") %>%
  pivot_longer(c(S.Score, G2M.Score),
               names_to  = "score_type",
               values_to = "score") %>%
  ggplot(aes(x = population, y = score, fill = population)) +
  geom_violin(trim = FALSE, alpha = 0.8) +
  geom_boxplot(width = 0.1, fill = "white", outlier.size = 0.2) +
  scale_fill_manual(values = pop_cols) +
  facet_grid(score_type ~ Tissue, scales = "free_y") +
  geom_hline(yintercept = 0, linetype = "dashed", colour = "grey40") +
  theme_classic(base_size = 11) +
  labs(y = "Score", title = "Cell cycle scores by population and tissue") +
  theme(legend.position = "none",
        axis.text.x = element_text(angle = 45, hjust = 1))

S and G2M scores by population within each tissue

cc_stats <- bind_rows(
  leuk_BM@meta.data        %>% select(Tissue, population, Phase, S.Score, G2M.Score),
  leuk_CNS_clean@meta.data %>% select(Tissue, population, Phase, S.Score, G2M.Score)
) %>%
  filter(population != "Intermediate")

wt_results <- lapply(c("BM", "CNS"), function(tiss) {
  dat <- cc_stats %>% filter(Tissue == tiss)
  lsk <- dat %>% filter(population == "LSK_IL7R")
  clp <- dat %>% filter(population == "LK_CLP")
  data.frame(
    Tissue      = tiss,
    p_S_score   = signif(wilcox.test(lsk$S.Score,   clp$S.Score)$p.value,   3),
    p_G2M_score = signif(wilcox.test(lsk$G2M.Score, clp$G2M.Score)$p.value, 3)
  )
}) %>% bind_rows()

cc_stats %>%
  group_by(Tissue, population) %>%
  summarise(
    n        = n(),
    mean_S   = round(mean(S.Score),           4),
    mean_G2M = round(mean(G2M.Score),         4),
    pct_S    = round(mean(Phase == "S")   * 100, 1),
    pct_G2M  = round(mean(Phase == "G2M") * 100, 1),
    pct_G1   = round(mean(Phase == "G1")  * 100, 1),
    .groups  = "drop"
  ) %>%
  kable(caption = "Cell cycle summary by population and tissue")

Cell cycle summary by population and tissue
Tissue	population	n	mean_S	mean_G2M	pct_S	pct_G2M	pct_G1
BM	LK_CLP	29528	-0.1270	-0.2608	32.1	0.4	67.5
BM	LSK_IL7R	9945	0.0537	0.5784	15.5	84.5	0.0
CNS	LK_CLP	31012	-0.1436	-0.2641	30.4	0.5	69.1
CNS	LSK_IL7R	10942	0.0378	0.6020	13.1	86.9	0.0

kable(wt_results,
      caption = "Wilcoxon test — LSK_IL7R vs LK_CLP cell cycle scores within each tissue")

Wilcoxon test — LSK_IL7R vs LK_CLP cell cycle scores within each tissue
Tissue	p_S_score	p_G2M_score
BM	0	0
CNS	0	0

Cell Cycle on Harmony UMAP

phase_vec <- c(
  setNames(leuk_BM$Phase[match(bm_orig, colnames(leuk_BM))], bm_cells),
  setNames(leuk_CNS_clean$Phase[match(cns_orig, colnames(leuk_CNS_clean))], cns_cells)
)
s_vec <- c(
  setNames(leuk_BM$S.Score[match(bm_orig, colnames(leuk_BM))], bm_cells),
  setNames(leuk_CNS_clean$S.Score[match(cns_orig, colnames(leuk_CNS_clean))], cns_cells)
)
g2m_vec <- c(
  setNames(leuk_BM$G2M.Score[match(bm_orig, colnames(leuk_BM))], bm_cells),
  setNames(leuk_CNS_clean$G2M.Score[match(cns_orig, colnames(leuk_CNS_clean))], cns_cells)
)

leuk_all <- AddMetaData(leuk_all, metadata = phase_vec, col.name = "Phase")
leuk_all <- AddMetaData(leuk_all, metadata = s_vec,     col.name = "S.Score")
leuk_all <- AddMetaData(leuk_all, metadata = g2m_vec,   col.name = "G2M.Score")

leuk_all_bm  <- subset(leuk_all, subset = Tissue == "BM")
leuk_all_cns <- subset(leuk_all, subset = Tissue == "CNS")

p_phase_bm  <- DimPlot(leuk_all_bm,  group.by = "Phase", cols = phase_cols) +
  ggtitle("BM — cell cycle phase")
p_phase_cns <- DimPlot(leuk_all_cns, group.by = "Phase", cols = phase_cols) +
  ggtitle("CNS — cell cycle phase")

p_s_bm  <- FeaturePlot(leuk_all_bm,
                        features   = "S.Score",
                        cols       = c("lightgrey", "#e67e22"),
                        order      = TRUE,
                        min.cutoff = "q10") +
  ggtitle("BM — S score")

p_s_cns <- FeaturePlot(leuk_all_cns,
                        features   = "S.Score",
                        cols       = c("lightgrey", "#e67e22"),
                        order      = TRUE,
                        min.cutoff = "q10") +
  ggtitle("CNS — S score")

(p_phase_bm + p_phase_cns) / (p_s_bm + p_s_cns)

Cell cycle phase and S score on Harmony-integrated UMAP

# Save tissue-specific objects with all metadata added during this script:
# LSK_IL7R_BM1, population, CNS_sig1, BM_sig1,
# bulk_CNS_sig1, bulk_BM_sig1, Phase, S.Score, G2M.Score

qs_save(leuk_BM,
         "/exports/eddie/scratch/aduguid3/harmony_clustering/05_leuk_BM_scored.qs")
qs_save(leuk_CNS_clean,
         "/exports/eddie/scratch/aduguid3/harmony_clustering/05_leuk_CNS_clean_scored.qs")
qs_save(leuk_all,
         "/exports/eddie/scratch/aduguid3/harmony_clustering/05_leuk_all_scored.qs")

cat("Saved: 05_leuk_BM_scored.qs\n")

## Saved: 05_leuk_BM_scored.qs

cat("Saved: 05_leuk_CNS_clean_scored.qs\n")

## Saved: 05_leuk_CNS_clean_scored.qs

cat("Saved: 05_leuk_all_scored.qs\n")

## Saved: 05_leuk_all_scored.qs

# Confirm all expected columns are present
expected_cols <- c("LSK_IL7R_BM1", "population",
                   "CNS_sig1", "BM_sig1",
                   "bulk_CNS_sig1", "bulk_BM_sig1",
                   "Phase", "S.Score", "G2M.Score")

cat("\nColumn check — BM object:\n")

## 
## Column check — BM object:

for (col in expected_cols) {
  cat(" ", col, ":", col %in% colnames(leuk_BM@meta.data), "\n")
}

##   LSK_IL7R_BM1 : TRUE 
##   population : TRUE 
##   CNS_sig1 : TRUE 
##   BM_sig1 : TRUE 
##   bulk_CNS_sig1 : TRUE 
##   bulk_BM_sig1 : TRUE 
##   Phase : TRUE 
##   S.Score : TRUE 
##   G2M.Score : TRUE

cat("\nColumn check — CNS object:\n")

## 
## Column check — CNS object:

for (col in expected_cols) {
  cat(" ", col, ":", col %in% colnames(leuk_CNS_clean@meta.data), "\n")
}

##   LSK_IL7R_BM1 : TRUE 
##   population : TRUE 
##   CNS_sig1 : TRUE 
##   BM_sig1 : TRUE 
##   bulk_CNS_sig1 : TRUE 
##   bulk_BM_sig1 : TRUE 
##   Phase : TRUE 
##   S.Score : TRUE 
##   G2M.Score : TRUE

Session Information

sessionInfo()

## R version 4.4.1 (2024-06-14)
## Platform: x86_64-pc-linux-gnu
## Running under: Rocky Linux 9.5 (Blue Onyx)
## 
## Matrix products: default
## BLAS/LAPACK: /opt/intel/oneapi/mkl/2024.0/lib/libmkl_gf_lp64.so.2;  LAPACK version 3.10.1
## 
## locale:
##  [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_GB.UTF-8        LC_COLLATE=en_GB.UTF-8    
##  [5] LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
##  [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: Europe/London
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] UCell_2.10.1       harmony_1.2.4      Rcpp_1.0.12        scales_1.4.0      
##  [5] ggrepel_0.9.6      tibble_3.3.1       tidyr_1.3.1        viridis_0.6.5     
##  [9] viridisLite_0.4.2  qs2_0.1.7          knitr_1.51         dplyr_1.1.4       
## [13] patchwork_1.3.2    ggplot2_4.0.2      Seurat_5.4.0       SeuratObject_5.3.0
## [17] sp_2.1-4          
## 
## loaded via a namespace (and not attached):
##   [1] RColorBrewer_1.1-3          rstudioapi_0.16.0          
##   [3] jsonlite_2.0.0              magrittr_2.0.3             
##   [5] ggbeeswarm_0.7.3            spatstat.utils_3.2-1       
##   [7] farver_2.1.2                rmarkdown_2.30             
##   [9] zlibbioc_1.52.0             vctrs_0.6.5                
##  [11] ROCR_1.0-12                 spatstat.explore_3.7-0     
##  [13] S4Arrays_1.6.0              htmltools_0.5.8.1          
##  [15] BiocNeighbors_2.0.1         SparseArray_1.6.2          
##  [17] sass_0.4.9                  sctransform_0.4.3          
##  [19] parallelly_1.37.1           KernSmooth_2.23-24         
##  [21] bslib_0.7.0                 htmlwidgets_1.6.4          
##  [23] ica_1.0-3                   plyr_1.8.9                 
##  [25] plotly_4.12.0               zoo_1.8-15                 
##  [27] cachem_1.1.0                igraph_2.2.2               
##  [29] mime_0.12                   lifecycle_1.0.5            
##  [31] pkgconfig_2.0.3             Matrix_1.7-0               
##  [33] R6_2.6.1                    fastmap_1.2.0              
##  [35] GenomeInfoDbData_1.2.13     MatrixGenerics_1.18.1      
##  [37] fitdistrplus_1.2-6          future_1.33.2              
##  [39] shiny_1.12.1                digest_0.6.35              
##  [41] S4Vectors_0.44.0            tensor_1.5.1               
##  [43] RSpectra_0.16-2             irlba_2.3.7                
##  [45] GenomicRanges_1.58.0        labeling_0.4.3             
##  [47] progressr_0.18.0            fansi_1.0.6                
##  [49] spatstat.sparse_3.1-0       httr_1.4.7                 
##  [51] polyclip_1.10-7             abind_1.4-5                
##  [53] compiler_4.4.1              withr_3.0.2                
##  [55] S7_0.2.1                    BiocParallel_1.40.2        
##  [57] fastDummies_1.7.5           MASS_7.3-60.2              
##  [59] DelayedArray_0.32.0         tools_4.4.1                
##  [61] vipor_0.4.7                 lmtest_0.9-40              
##  [63] otel_0.2.0                  beeswarm_0.4.0             
##  [65] httpuv_1.6.16               future.apply_1.11.2        
##  [67] goftest_1.2-3               glue_1.8.0                 
##  [69] nlme_3.1-164                promises_1.5.0             
##  [71] grid_4.4.1                  Rtsne_0.17                 
##  [73] cluster_2.1.6               reshape2_1.4.4             
##  [75] generics_0.1.3              gtable_0.3.6               
##  [77] spatstat.data_3.1-9         data.table_1.15.4          
##  [79] XVector_0.46.0              stringfish_0.18.0          
##  [81] utf8_1.2.4                  BiocGenerics_0.52.0        
##  [83] spatstat.geom_3.7-0         RcppAnnoy_0.0.23           
##  [85] RANN_2.6.2                  pillar_1.9.0               
##  [87] stringr_1.5.1               spam_2.11-3                
##  [89] RcppHNSW_0.6.0              later_1.4.6                
##  [91] splines_4.4.1               lattice_0.22-6             
##  [93] survival_3.6-4              deldir_2.0-4               
##  [95] tidyselect_1.2.1            SingleCellExperiment_1.28.1
##  [97] miniUI_0.1.2                pbapply_1.7-4              
##  [99] gridExtra_2.3               IRanges_2.40.1             
## [101] SummarizedExperiment_1.36.0 scattermore_1.2            
## [103] stats4_4.4.1                xfun_0.56                  
## [105] Biobase_2.66.0              matrixStats_1.5.0          
## [107] UCSC.utils_1.2.0            stringi_1.8.4              
## [109] lazyeval_0.2.2              yaml_2.3.12                
## [111] evaluate_1.0.5              codetools_0.2-20           
## [113] cli_3.6.5                   uwot_0.2.4                 
## [115] RcppParallel_5.1.8          xtable_1.8-4               
## [117] reticulate_1.45.0           jquerylib_0.1.4            
## [119] GenomeInfoDb_1.42.3         dichromat_2.0-0.1          
## [121] globals_0.16.3              spatstat.random_3.4-4      
## [123] png_0.1-8                   ggrastr_1.0.2              
## [125] spatstat.univar_3.1-6       parallel_4.4.1             
## [127] dotCall64_1.2               listenv_0.9.1              
## [129] ggridges_0.5.6              crayon_1.5.2               
## [131] purrr_1.0.2                 rlang_1.1.7                
## [133] cowplot_1.2.0

B-ALL Population Scoring & CNS Signature Analysis

2026-02-19