This script performs three linked analyses on the cleaned B-ALL leukaemia dataset (clusters 10 and 11 excluded from CNS):
library(Seurat)
## Loading required package: SeuratObject
## Loading required package: sp
##
## Attaching package: 'SeuratObject'
## The following objects are masked from 'package:base':
##
## intersect, t
library(ggplot2)
library(patchwork)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(knitr)
library(qs2)
## qs2 0.1.7
library(viridis)
## Loading required package: viridisLite
library(tidyr)
library(tibble)
library(ggrepel)
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:viridis':
##
## viridis_pal
library(harmony)
## Loading required package: Rcpp
library(UCell)
tissue_cols <- c("BM" = "steelblue", "CNS" = "firebrick")
source_cols <- c("AD4_EM3" = "#E69F00",
"AD2.2_EM4" = "#56B4E9",
"AD5_EM4" = "#CC79A7")
pop_cols <- c("LSK_IL7R" = "#1a3a6b",
"LK_CLP" = "#8b0000",
"Intermediate" = "grey75")
bm_res_col <- "originalexp_snn_res.0.3"
cns_res_col <- "originalexp_snn_res.0.3"
# Ambient RNA blacklist — excluded from all scoring and interpretation
ambient_blacklist <- c("Slc1a2", "Sparcl1", "Apod", "Cpe",
"Mobp", "S100b", "Atp1a2","Ndrg2")
leuk_BM <- qs_read("/exports/eddie/scratch/aduguid3/harmony_clustering/04_harmony_recluster_leuk_BM_march26.qs")
leuk_CNS_clean <- qs_read("/exports/eddie/scratch/aduguid3/harmony_clustering/04_harmony_recluster_leuk_CNS_clean_march26.qs")
stopifnot("leuk_BM not found" = exists("leuk_BM"))
stopifnot("leuk_CNS_clean not found" = exists("leuk_CNS_clean"))
# Load clean DE results from script 09
res_clean_df <- read.csv("/exports/eddie/scratch/aduguid3/Rmarkdown/04_DE_CNSvsBM_clean.csv")
stopifnot("res_clean_df not found" = exists("res_clean_df"))
Idents(leuk_BM) <- bm_res_col
Idents(leuk_CNS_clean) <- cns_res_col
cat("BM cells:", ncol(leuk_BM), "\n")
## BM cells: 44606
cat("CNS cells:", ncol(leuk_CNS_clean), "\n")
## CNS cells: 47017
cat("DE genes loaded:", nrow(res_clean_df), "\n")
## DE genes loaded: 16195
Prior to population scoring, BM leukaemia cells and cleaned CNS leukaemia cells (clusters 10 and 11 excluded) are merged into a single object and re-embedded using Harmony integration. Harmony corrects for transcriptional differences attributable to transplant source (AD4_EM3, AD2.2_EM4, AD5_EM4) while preserving genuine biological variation, including tissue-specific transcriptional states and population identity.
This integration is performed at this stage — after contamination
exclusion but before population scoring — to ensure that the UMAP
embedding used for visualisation reflects the true transcriptional
landscape of bona fide leukaemia cells. Clustering is performed in
Harmony-corrected space using the same dimensionality (dims 1:25) and
resolution (0.3) as the tissue-specific clustering in script 02. All
downstream population scoring and signature analyses are performed on
the tissue-specific objects (leuk_BM and
leuk_CNS_clean) rather than the merged object, so that
tissue-specific signatures are applied in their cognate compartment. The
integrated leuk_all object is used for joint visualisation
only.
harmony_path <- "/exports/eddie/scratch/aduguid3/harmony_clustering/05_leuk_all_harmony.qs"
if (!file.exists(harmony_path)) {
leuk_all <- merge(leuk_BM, leuk_CNS_clean,
add.cell.ids = c("BM", "CNS"))
cat("Merged object:\n")
cat("Total cells:", ncol(leuk_all), "\n")
print(table(leuk_all$Tissue))
print(table(leuk_all$Tissue, leuk_all$transplant_source))
leuk_all <- FindVariableFeatures(leuk_all,
selection.method = "vst",
nfeatures = 2000)
leuk_all <- ScaleData(leuk_all, verbose = FALSE)
leuk_all <- RunPCA(leuk_all, verbose = FALSE)
leuk_all <- harmony::RunHarmony(leuk_all,
group.by.vars = "transplant_source",
reduction = "pca",
reduction.save = "harmony",
verbose = TRUE)
leuk_all <- RunUMAP(leuk_all,
reduction = "harmony",
dims = 1:25)
leuk_all <- FindNeighbors(leuk_all,
reduction = "harmony",
dims = 1:25)
leuk_all <- FindClusters(leuk_all, resolution = 0.3)
qs_save(leuk_all, harmony_path)
} else {
leuk_all <- qs_read(harmony_path)
}
# Define barcode prefix vectors — used in all subsequent transfer chunks
bm_cells <- colnames(leuk_all)[grepl("^BM_", colnames(leuk_all))]
cns_cells <- colnames(leuk_all)[grepl("^CNS_", colnames(leuk_all))]
bm_orig <- sub("^BM_", "", bm_cells)
cns_orig <- sub("^CNS_", "", cns_cells)
cat("Harmony integration complete\n")
## Harmony integration complete
cat("Cells:", ncol(leuk_all), "\n")
## Cells: 91623
p_tissue <- DimPlot(leuk_all,
group.by = "Tissue",
cols = tissue_cols,
alpha = 0.5) +
ggtitle("All cells — by tissue")
p_source <- DimPlot(leuk_all,
group.by = "transplant_source",
cols = source_cols,
alpha = 0.5) +
ggtitle("All cells — by transplant source")
p_tissue + p_source
Harmony-integrated UMAP — BM and CNS leukaemia
Harmony integration successfully corrected for transplant source effects, with cells from all three sources (AD4_EM3, AD2.2_EM4, AD5_EM4) intermixing across shared transcriptional states in the integrated embedding. Critically, tissue identity was preserved as a major axis of variation — BM and CNS leukaemia cells occupied largely distinct regions of the UMAP despite source correction, confirming that the CNS vs BM transcriptional differences identified by pseudobulk DE are robust and represent the dominant biological signal in this dataset after transplant source effects are removed. This integrated embedding is used for all subsequent visualisation.
Two propagating leukaemia populations have been characterised in this
B-ALL model by bulk RNAseq: LSK_IL7R (Lin-Sca1+Kit+
IL-7 receptor positive) and LK_CLP (Lin-Sca1-Kit+
common lymphoid progenitor). To assign single-cell identities, the
bulk-derived LSK_IL7R signature from Supplemental Table S4 (BM-derived,
35 genes, proliferation/cell cycle enriched) is scored onto both BM and
CNS leukaemia cells using AddModuleScore. The resulting
score distribution is used to assign population labels at the
single-cell level, with cells scoring above threshold assigned as
LSK_IL7R, cells scoring below threshold as LK_CLP, and cells in the
intermediate range left unassigned.
All UMAP visualisations use the Harmony-integrated embedding from the
merged leuk_all object, which corrects for transplant
source effects while preserving genuine tissue-specific and
population-specific transcriptional variation.
# S4 — BM LSK_IL7R signature (proliferation/cell cycle enriched)
lsk_il7r_bm <- c("Cenpf", "Kpna2", "Ccnb1", "Plk1", "Ube2c",
"Aspm", "Cdk1", "Ckap2", "Nuf2", "Kif18b",
"Tpx2", "Tubb4b", "Mki67", "Hmmr", "Cdc20",
"Cdc25b", "Kifc1", "Nusap1","Cenpa", "Nek2",
"Kif14", "Ccnb2", "Prc1", "Kif4", "Aurka",
"Top2a", "Dlgap5", "Kif11", "Gas2l3", "Jpt1",
"Incenp", "Cenpe", "Kif20b","Knl1", "Arl6ip1")
# S5 — CNS LSK_IL7R signature
lsk_il7r_cns <- c("Tshz2", "Ube2c", "Ptprm", "Muc5ac")
# Filter to available genes in each object
bm_sig_found <- lsk_il7r_bm[lsk_il7r_bm %in% rownames(leuk_BM)]
cns_sig_found <- lsk_il7r_cns[lsk_il7r_cns %in% rownames(leuk_CNS_clean)]
cat("BM signature genes found:", length(bm_sig_found),
"of", length(lsk_il7r_bm), "\n")
## BM signature genes found: 35 of 35
cat("CNS signature genes found:", length(cns_sig_found),
"of", length(lsk_il7r_cns), "\n")
## CNS signature genes found: 4 of 4
Module scores are computed per cell using
AddModuleScore, which calculates the mean expression of the
signature genes relative to a set of randomly sampled control genes of
matched expression level. Scores above zero indicate relative enrichment
of the signature in that cell. The S4 signature is applied to both BM
and CNS objects to allow cross-tissue comparison of population
proportions.
# BM object — S4 signature
leuk_BM <- AddModuleScore(leuk_BM,
features = list(bm_sig_found),
name = "LSK_IL7R_BM",
seed = 42)
# CNS object — S4 signature (cross-tissue comparison)
leuk_CNS_clean <- AddModuleScore(leuk_CNS_clean,
features = list(bm_sig_found),
name = "LSK_IL7R_BM",
seed = 42)
# CNS object — S5 signature (CNS-specific)
leuk_CNS_clean <- AddModuleScore(leuk_CNS_clean,
features = list(cns_sig_found),
name = "LSK_IL7R_CNS",
seed = 42)
cat("Score ranges — BM S4 signature:\n")
## Score ranges — BM S4 signature:
cat(" BM object:", round(range(leuk_BM$LSK_IL7R_BM1), 3), "\n")
## BM object: -0.522 1.879
cat(" CNS object:", round(range(leuk_CNS_clean$LSK_IL7R_BM1), 3), "\n")
## CNS object: -0.531 1.368
LSK_IL7R signature scores are visualised on the Harmony-integrated UMAP to confirm that scoring captures spatially coherent transcriptional populations rather than diffuse noise.
lsk_scores <- c(
setNames(leuk_BM$LSK_IL7R_BM1[match(bm_orig, colnames(leuk_BM))], bm_cells),
setNames(leuk_CNS_clean$LSK_IL7R_BM1[match(cns_orig, colnames(leuk_CNS_clean))], cns_cells)
)
leuk_all <- AddMetaData(leuk_all, metadata = lsk_scores, col.name = "LSK_IL7R_BM1")
leuk_all_bm <- subset(leuk_all, subset = Tissue == "BM")
leuk_all_cns <- subset(leuk_all, subset = Tissue == "CNS")
cat("LSK_IL7R_BM1 transferred — NAs:", sum(is.na(leuk_all$LSK_IL7R_BM1)), "\n")
## LSK_IL7R_BM1 transferred — NAs: 0
p_bm_score <- FeaturePlot(leuk_all_bm,
features = "LSK_IL7R_BM1",
cols = c("lightgrey", "#1a3a6b"),
order = TRUE,
min.cutoff = "q10") +
ggtitle("BM — LSK_IL7R score (S4)")
p_cns_score <- FeaturePlot(leuk_all_cns,
features = "LSK_IL7R_BM1",
cols = c("lightgrey", "#8b0000"),
order = TRUE,
min.cutoff = "q10") +
ggtitle("CNS — LSK_IL7R score (S4)")
p_bm_score + p_cns_score
LSK_IL7R S4 signature score on Harmony-integrated UMAP
Review the violin plots below before adjusting the population assignment thresholds. The S4 signature is proliferation-enriched — LSK_IL7R cells are expected to score positively and LK_CLP cells negatively.
v_bm <- VlnPlot(leuk_BM,
features = "LSK_IL7R_BM1",
group.by = bm_res_col,
pt.size = 0,
cols = viridis(length(unique(
leuk_BM@meta.data[[bm_res_col]])))) +
ggtitle("BM — LSK_IL7R score by cluster") +
geom_hline(yintercept = 0, linetype = "dashed", colour = "grey40") +
theme(legend.position = "none",
axis.text.x = element_text(angle = 45, hjust = 1))
v_cns <- VlnPlot(leuk_CNS_clean,
features = "LSK_IL7R_BM1",
group.by = cns_res_col,
pt.size = 0,
cols = viridis(length(unique(
leuk_CNS_clean@meta.data[[cns_res_col]])))) +
ggtitle("CNS — LSK_IL7R score by cluster") +
geom_hline(yintercept = 0, linetype = "dashed", colour = "grey40") +
theme(legend.position = "none",
axis.text.x = element_text(angle = 45, hjust = 1))
v_bm / v_cns
LSK_IL7R score distribution by cluster
# !! Adjust thresholds after reviewing violin plots above !!
score_high <- 0.2 # above = LSK_IL7R
score_low <- -0.1 # below = LK_CLP
assign_pop <- function(score, high = score_high, low = score_low) {
case_when(
score > high ~ "LSK_IL7R",
score < low ~ "LK_CLP",
TRUE ~ "Intermediate"
)
}
leuk_BM$population <- assign_pop(leuk_BM$LSK_IL7R_BM1)
leuk_CNS_clean$population <- assign_pop(leuk_CNS_clean$LSK_IL7R_BM1)
cat("BM population assignments:\n")
## BM population assignments:
print(table(leuk_BM$population))
##
## Intermediate LK_CLP LSK_IL7R
## 5133 29528 9945
cat("\nBM population as % of total:\n")
##
## BM population as % of total:
print(round(prop.table(table(leuk_BM$population)) * 100, 1))
##
## Intermediate LK_CLP LSK_IL7R
## 11.5 66.2 22.3
cat("\nCNS population assignments:\n")
##
## CNS population assignments:
print(table(leuk_CNS_clean$population))
##
## Intermediate LK_CLP LSK_IL7R
## 5063 31012 10942
cat("\nCNS population as % of total:\n")
##
## CNS population as % of total:
print(round(prop.table(table(leuk_CNS_clean$population)) * 100, 1))
##
## Intermediate LK_CLP LSK_IL7R
## 10.8 66.0 23.3
Population labels assigned on the tissue-specific objects are
transferred to leuk_all for display on the
Harmony-integrated UMAP.
pop_vec <- c(
setNames(leuk_BM$population[match(bm_orig, colnames(leuk_BM))], bm_cells),
setNames(leuk_CNS_clean$population[match(cns_orig, colnames(leuk_CNS_clean))], cns_cells)
)
leuk_all <- AddMetaData(leuk_all, metadata = pop_vec, col.name = "population")
cat("Population transferred — NAs:", sum(is.na(leuk_all$population)), "\n")
## Population transferred — NAs: 0
print(table(leuk_all$population, useNA = "ifany"))
##
## Intermediate LK_CLP LSK_IL7R
## 10196 60540 20887
leuk_all_bm <- subset(leuk_all, subset = Tissue == "BM")
leuk_all_cns <- subset(leuk_all, subset = Tissue == "CNS")
p_bm_pop <- DimPlot(leuk_all_bm,
group.by = "population",
cols = pop_cols,
label = TRUE,
label.size = 5,
repel = TRUE) +
ggtitle("BM — LK_CLP vs LSK_IL7R") +
NoLegend()
p_cns_pop <- DimPlot(leuk_all_cns,
group.by = "population",
cols = pop_cols,
label = TRUE,
label.size = 5,
repel = TRUE) +
ggtitle("CNS — LK_CLP vs LSK_IL7R") +
NoLegend()
p_split <- DimPlot(leuk_all,
group.by = "population",
cols = pop_cols,
split.by = "Tissue",
label = TRUE,
label.size = 4,
repel = TRUE) +
ggtitle("LK_CLP vs LSK_IL7R — Harmony integrated") +
theme(legend.position = "bottom")
p_bm_pop + p_cns_pop
LK_CLP and LSK_IL7R population labels on Harmony-integrated UMAP
p_split
LK_CLP and LSK_IL7R population labels on Harmony-integrated UMAP
pop_cluster <- bind_rows(
leuk_BM@meta.data %>%
select(Tissue, cluster = all_of(bm_res_col), population),
leuk_CNS_clean@meta.data %>%
select(Tissue, cluster = all_of(cns_res_col), population)
) %>%
group_by(Tissue, cluster, population) %>%
summarise(n = n(), .groups = "drop") %>%
group_by(Tissue, cluster) %>%
mutate(prop = n / sum(n))
ggplot(pop_cluster,
aes(x = factor(cluster), y = prop, fill = population)) +
geom_bar(stat = "identity") +
scale_fill_manual(values = pop_cols) +
facet_wrap(~Tissue, scales = "free_x") +
theme_classic(base_size = 11) +
labs(x = "Cluster",
y = "Proportion",
title = "Population composition by cluster and tissue") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
Population composition by cluster in BM and CNS
pop_cluster %>%
filter(Tissue == "CNS") %>%
select(cluster, population, n, proportion = prop) %>%
mutate(proportion = round(proportion, 3)) %>%
pivot_wider(names_from = population,
values_from = c(n, proportion),
values_fill = 0) %>%
kable(caption = "CNS cluster population composition")
## Adding missing grouping variables: `Tissue`
| Tissue | cluster | n_Intermediate | n_LK_CLP | n_LSK_IL7R | proportion_Intermediate | proportion_LK_CLP | proportion_LSK_IL7R |
|---|---|---|---|---|---|---|---|
| CNS | 0 | 25 | 11814 | 0 | 0.002 | 0.998 | 0.000 |
| CNS | 1 | 235 | 8424 | 0 | 0.027 | 0.973 | 0.000 |
| CNS | 2 | 129 | 7916 | 0 | 0.016 | 0.984 | 0.000 |
| CNS | 3 | 1608 | 154 | 4549 | 0.255 | 0.024 | 0.721 |
| CNS | 4 | 1041 | 157 | 3089 | 0.243 | 0.037 | 0.721 |
| CNS | 5 | 796 | 766 | 1433 | 0.266 | 0.256 | 0.478 |
| CNS | 6 | 721 | 420 | 1134 | 0.317 | 0.185 | 0.498 |
| CNS | 7 | 459 | 838 | 704 | 0.229 | 0.419 | 0.352 |
| CNS | 8 | 38 | 300 | 31 | 0.103 | 0.813 | 0.084 |
| CNS | 9 | 11 | 223 | 2 | 0.047 | 0.945 | 0.008 |
The clean pseudobulk DE CNS signature (padj < 0.05, log2FC >
0.5, ambient blacklist removed) is scored back onto individual cells in
both BM and CNS objects using UCell, which provides more robust per-cell
scoring than AddModuleScore by using rank-based statistics
rather than mean expression. Both the CNS signature and the reciprocal
BM signature are scored onto all cells, allowing each cell to be
positioned on a CNS-vs-BM transcriptional axis. All UMAP visualisations
use the Harmony-integrated embedding.
# CNS signature — genes enriched in CNS vs BM (clean DE, ambient blacklist removed)
cns_sig_genes <- res_clean_df %>%
filter(padj < 0.05,
log2FoldChange > 0.5,
!gene %in% ambient_blacklist) %>%
arrange(desc(log2FoldChange)) %>%
pull(gene)
# BM signature — genes enriched in BM vs CNS
bm_sig_genes <- res_clean_df %>%
filter(padj < 0.05,
log2FoldChange < -0.5,
!gene %in% ambient_blacklist) %>%
arrange(log2FoldChange) %>%
pull(gene)
# Filter to available genes in each object
cns_sig_bm <- cns_sig_genes[cns_sig_genes %in% rownames(leuk_BM)]
cns_sig_cns <- cns_sig_genes[cns_sig_genes %in% rownames(leuk_CNS_clean)]
bm_sig_bm <- bm_sig_genes[bm_sig_genes %in% rownames(leuk_BM)]
bm_sig_cns <- bm_sig_genes[bm_sig_genes %in% rownames(leuk_CNS_clean)]
cat("CNS signature genes:", length(cns_sig_genes), "total\n")
## CNS signature genes: 59 total
cat(" Found in BM object:", length(cns_sig_bm), "\n")
## Found in BM object: 59
cat(" Found in CNS object:", length(cns_sig_cns), "\n")
## Found in CNS object: 59
cat("\nBM signature genes:", length(bm_sig_genes), "total\n")
##
## BM signature genes: 95 total
cat(" Found in BM object:", length(bm_sig_bm), "\n")
## Found in BM object: 95
cat(" Found in CNS object:", length(bm_sig_cns), "\n")
## Found in CNS object: 95
# Note: UCell scoring was tested but produced identical dynamic range compression
# to AddModuleScore for these gene set sizes. AddModuleScore is retained for
# consistency with the LSK_IL7R population scoring approach used in Part 1.
leuk_BM <- AddModuleScore(leuk_BM,
features = list(cns_sig_bm),
name = "CNS_sig",
seed = 42)
leuk_CNS_clean <- AddModuleScore(leuk_CNS_clean,
features = list(cns_sig_cns),
name = "CNS_sig",
seed = 42)
leuk_BM <- AddModuleScore(leuk_BM,
features = list(bm_sig_bm),
name = "BM_sig",
seed = 42)
leuk_CNS_clean <- AddModuleScore(leuk_CNS_clean,
features = list(bm_sig_cns),
name = "BM_sig",
seed = 42)
# AddModuleScore appends "1" to the name — CNS_sig1, BM_sig1
cat("CNS signature score ranges:\n")
## CNS signature score ranges:
cat(" BM cells:", round(range(leuk_BM$CNS_sig1), 3), "\n")
## BM cells: -0.053 0.108
cat(" CNS cells:", round(range(leuk_CNS_clean$CNS_sig1), 3), "\n")
## CNS cells: -0.066 0.191
cat("\nBM signature score ranges:\n")
##
## BM signature score ranges:
cat(" BM cells:", round(range(leuk_BM$BM_sig1), 3), "\n")
## BM cells: -0.078 0.173
cat(" CNS cells:", round(range(leuk_CNS_clean$BM_sig1), 3), "\n")
## CNS cells: -0.062 0.11
cns_sig_vec <- c(
setNames(leuk_BM$CNS_sig1[match(bm_orig, colnames(leuk_BM))], bm_cells),
setNames(leuk_CNS_clean$CNS_sig1[match(cns_orig, colnames(leuk_CNS_clean))], cns_cells)
)
bm_sig_vec <- c(
setNames(leuk_BM$BM_sig1[match(bm_orig, colnames(leuk_BM))], bm_cells),
setNames(leuk_CNS_clean$BM_sig1[match(cns_orig, colnames(leuk_CNS_clean))], cns_cells)
)
leuk_all <- AddMetaData(leuk_all, metadata = cns_sig_vec, col.name = "CNS_sig1")
leuk_all <- AddMetaData(leuk_all, metadata = bm_sig_vec, col.name = "BM_sig1")
leuk_all_bm <- subset(leuk_all, subset = Tissue == "BM")
leuk_all_cns <- subset(leuk_all, subset = Tissue == "CNS")
cat("CNS_sig1 NAs:", sum(is.na(leuk_all$CNS_sig1)), "\n")
## CNS_sig1 NAs: 0
cat("BM_sig1 NAs:", sum(is.na(leuk_all$BM_sig1)), "\n")
## BM_sig1 NAs: 0
cns_cluster_summary <- leuk_CNS_clean@meta.data %>%
group_by(cluster = .data[[cns_res_col]]) %>%
summarise(
n_cells = n(),
mean_CNS_score = round(mean(CNS_sig1), 4),
mean_BM_score = round(mean(BM_sig1), 4),
pct_CNS_pos = round(mean(CNS_sig1 > 0) * 100, 1),
pct_BM_pos = round(mean(BM_sig1 > 0) * 100, 1),
CNS_vs_BM_delta = round(mean(CNS_sig1) - mean(BM_sig1), 4),
.groups = "drop"
) %>%
arrange(desc(CNS_vs_BM_delta))
kable(cns_cluster_summary,
caption = "CNS vs BM signature delta by cluster — ranked by CNS enrichment")
| cluster | n_cells | mean_CNS_score | mean_BM_score | pct_CNS_pos | pct_BM_pos | CNS_vs_BM_delta |
|---|---|---|---|---|---|---|
| 0 | 11839 | 0.0007 | -0.0165 | 46.9 | 14.8 | 0.0173 |
| 8 | 369 | -0.0084 | -0.0224 | 28.7 | 10.0 | 0.0140 |
| 5 | 2995 | -0.0045 | -0.0141 | 38.2 | 13.7 | 0.0096 |
| 3 | 6311 | -0.0122 | -0.0207 | 25.4 | 5.2 | 0.0085 |
| 7 | 2001 | -0.0054 | -0.0121 | 38.4 | 24.1 | 0.0066 |
| 9 | 236 | -0.0087 | -0.0153 | 33.1 | 17.4 | 0.0066 |
| 4 | 4287 | -0.0157 | -0.0196 | 18.7 | 5.8 | 0.0039 |
| 1 | 8659 | -0.0113 | -0.0149 | 26.8 | 13.9 | 0.0036 |
| 6 | 2275 | -0.0114 | -0.0133 | 26.0 | 14.8 | 0.0019 |
| 2 | 8045 | -0.0146 | -0.0126 | 21.7 | 17.5 | -0.0020 |
To determine whether the conserved CNS transcriptional programme was concentrated in a discrete leukaemia subpopulation, CNS cells were stratified by bulk CNS signature score at both the 75th and 90th percentile thresholds. At both thresholds, high-scoring cells were uniformly distributed across all ten CNS leukaemia clusters (range 20–31% per cluster at q75; similar distribution at q90), with no cluster showing significant enrichment. This threshold independence confirms the finding is not an artefact of score cutoff selection.
The uniform distribution indicates that the conserved CNS transcriptional programme is a compartment-wide property of CNS leukaemia rather than a feature restricted to a discrete subpopulation. Every CNS cluster contains cells expressing the cross-validated CNS signature at similar frequencies, suggesting that CNS engraftment imposes a broad transcriptional adaptation that is shared across leukaemia cell states rather than driving the emergence of a specialised CNS-adapted clone.
Scoring of the pseudobulk-derived CNS signature onto individual cells produced compressed score distributions with CNS-vs-BM deltas of at most 0.017 across all ten CNS clusters. No cluster showed a clearly BM-like transcriptional profile, and the separation between the most and least CNS-enriched clusters was insufficient to support a meaningful high-CNS vs low-CNS contrast at the pseudobulk level.
This compression is a known limitation of module scoring when applied to transcriptionally homogeneous populations. Critically, the uniformity of CNS signature scores across clusters is itself a biologically meaningful finding: CNS leukaemia cells do not segregate into discrete CNS-adapted and non-adapted subpopulations at this resolution, suggesting that the CNS transcriptional programme is broadly shared across the engrafted leukaemia compartment rather than restricted to a specialised subset.
Given this, the pseudobulk CNS vs BM differential expression analysis from script 04 remains the primary and most statistically robust characterisation of the CNS leukaemia transcriptional signature. Downstream analyses focus on two questions: cross-validation of this signature against a prior bulk RNAseq experiment, and whether CNS-enriched genes are differentially expressed between the LSK_IL7R and LK_CLP propagating populations within the CNS compartment.
To validate the single-cell CNS vs BM transcriptional differences identified in this dataset, DE genes from a prior bulk RNAseq experiment (mir128a model, padj < 0.05) are scored onto individual cells. Concordance between the bulk and single-cell signatures would confirm that the CNS-specific transcriptional programme identified here is reproducible across experimental systems and is not a feature unique to this dataset. Direction confirmed: positive log2FC in the bulk data = CNS-enriched.
bulk_de <- read.csv("/exports/eddie/scratch/aduguid3/de_genes_mir128a_sig_0.05.csv") %>%
filter(!is.na(symbol), symbol != "NA") %>%
rename(gene = symbol)
cat("Bulk DE genes loaded:", nrow(bulk_de), "\n")
## Bulk DE genes loaded: 212
cat("CNS-enriched (positive log2FC):", sum(bulk_de$log2FoldChange > 0), "\n")
## CNS-enriched (positive log2FC): 81
cat("BM-enriched (negative log2FC):", sum(bulk_de$log2FoldChange < 0), "\n")
## BM-enriched (negative log2FC): 131
bulk_cns_genes <- bulk_de %>%
filter(log2FoldChange > 0) %>%
arrange(desc(log2FoldChange)) %>%
pull(gene)
bulk_bm_genes <- bulk_de %>%
filter(log2FoldChange < 0) %>%
arrange(log2FoldChange) %>%
pull(gene)
# Filter to genes present in single-cell objects
bulk_cns_bm <- bulk_cns_genes[bulk_cns_genes %in% rownames(leuk_BM)]
bulk_cns_cns <- bulk_cns_genes[bulk_cns_genes %in% rownames(leuk_CNS_clean)]
bulk_bm_bm <- bulk_bm_genes[bulk_bm_genes %in% rownames(leuk_BM)]
bulk_bm_cns <- bulk_bm_genes[bulk_bm_genes %in% rownames(leuk_CNS_clean)]
cat("\nGenes found in single-cell objects:\n")
##
## Genes found in single-cell objects:
cat(" Bulk CNS sig — BM object:", length(bulk_cns_bm), "\n")
## Bulk CNS sig — BM object: 77
cat(" Bulk CNS sig — CNS object:", length(bulk_cns_cns), "\n")
## Bulk CNS sig — CNS object: 77
cat(" Bulk BM sig — BM object:", length(bulk_bm_bm), "\n")
## Bulk BM sig — BM object: 129
cat(" Bulk BM sig — CNS object:", length(bulk_bm_cns), "\n")
## Bulk BM sig — CNS object: 129
sc_cns_genes <- res_clean_df %>%
filter(padj < 0.05, log2FoldChange > 0.5, !gene %in% ambient_blacklist) %>%
pull(gene)
sc_bm_genes <- res_clean_df %>%
filter(padj < 0.05, log2FoldChange < -0.5, !gene %in% ambient_blacklist) %>%
pull(gene)
cns_overlap <- intersect(bulk_cns_genes, sc_cns_genes)
bm_overlap <- intersect(bulk_bm_genes, sc_bm_genes)
cat("CNS signature overlap (bulk ∩ single-cell):", length(cns_overlap), "genes\n")
## CNS signature overlap (bulk ∩ single-cell): 12 genes
cat(" Bulk CNS genes: ", length(bulk_cns_genes), "\n")
## Bulk CNS genes: 81
cat(" Single-cell CNS genes:", length(sc_cns_genes), "\n")
## Single-cell CNS genes: 59
cat(" Jaccard: ",
round(length(cns_overlap) /
length(union(bulk_cns_genes, sc_cns_genes)), 3), "\n")
## Jaccard: 0.094
cat("\nBM signature overlap (bulk ∩ single-cell):", length(bm_overlap), "genes\n")
##
## BM signature overlap (bulk ∩ single-cell): 11 genes
cat(" Jaccard:",
round(length(bm_overlap) /
length(union(bulk_bm_genes, sc_bm_genes)), 3), "\n")
## Jaccard: 0.051
cat("\nOverlapping CNS genes:", paste(cns_overlap, collapse = ", "), "\n")
##
## Overlapping CNS genes: Ctla4, Gm6093, Tdrd5, Dntt, Grik1, Greb1, Tmem151b, Dsg1c, Pax7, Nectin1, Soat1, Cd96
cat("Overlapping BM genes:", paste(bm_overlap, collapse = ", "), "\n")
## Overlapping BM genes: Rhob, Cdkn1a, Klf11, Fgf3, Rab44, Gas7, Vim, Lgals1, Lmna, Cldn10, Trpm1
conserved_cns <- data.frame(gene = cns_overlap) %>%
left_join(bulk_de %>% select(gene, bulk_lfc = log2FoldChange), by = "gene") %>%
left_join(res_clean_df %>% select(gene, sc_lfc = log2FoldChange,
sc_padj = padj), by = "gene") %>%
arrange(desc(bulk_lfc)) %>%
mutate(across(where(is.numeric), ~round(., 3)))
kable(conserved_cns,
caption = "CNS-enriched genes conserved across bulk (mir128a) and single-cell datasets")
| gene | bulk_lfc | sc_lfc | sc_padj |
|---|---|---|---|
| Ctla4 | 3.889 | 1.876 | 0.037 |
| Gm6093 | 2.105 | 4.069 | 0.000 |
| Tdrd5 | 0.988 | 1.325 | 0.044 |
| Dntt | 0.958 | 1.291 | 0.005 |
| Grik1 | 0.949 | 1.628 | 0.008 |
| Greb1 | 0.832 | 0.797 | 0.000 |
| Tmem151b | 0.811 | 1.451 | 0.026 |
| Dsg1c | 0.669 | 0.714 | 0.031 |
| Pax7 | 0.642 | 0.893 | 0.010 |
| Nectin1 | 0.608 | 0.665 | 0.044 |
| Soat1 | 0.594 | 0.690 | 0.000 |
| Cd96 | 0.572 | 0.585 | 0.000 |
conserved_bm <- data.frame(gene = bm_overlap) %>%
left_join(bulk_de %>% select(gene, bulk_lfc = log2FoldChange), by = "gene") %>%
left_join(res_clean_df %>% select(gene, sc_lfc = log2FoldChange,
sc_padj = padj), by = "gene") %>%
arrange(bulk_lfc) %>%
mutate(across(where(is.numeric), ~round(., 3)))
kable(conserved_bm,
caption = "BM-enriched genes conserved across bulk (mir128a) and single-cell datasets")
| gene | bulk_lfc | sc_lfc | sc_padj |
|---|---|---|---|
| Rhob | -1.530 | -2.320 | 0.012 |
| Cdkn1a | -1.361 | -1.345 | 0.000 |
| Klf11 | -1.078 | -1.343 | 0.000 |
| Fgf3 | -0.873 | -0.676 | 0.000 |
| Rab44 | -0.800 | -0.758 | 0.001 |
| Gas7 | -0.713 | -0.648 | 0.002 |
| Vim | -0.671 | -0.574 | 0.000 |
| Lgals1 | -0.651 | -0.579 | 0.000 |
| Lmna | -0.630 | -0.616 | 0.000 |
| Cldn10 | -0.576 | -0.582 | 0.000 |
| Trpm1 | -0.527 | -0.642 | 0.000 |
Eleven BM-enriched genes were identified in both the bulk RNAseq mir128a dataset and the single-cell pseudobulk analysis, representing the highest-confidence BM-specific leukaemia transcriptional features across independent experimental systems. Log2 fold changes were strikingly concordant between datasets for all 11 genes, with all showing consistent BM-enrichment in both bulk and single-cell comparisons. The conserved BM signature includes the cell cycle inhibitor Cdkn1a (p21), the Rho GTPase Rhob, the transcription factor Klf11, and the immunosuppressive lectin Lgals1, suggesting that BM-resident leukaemia cells are characterised by greater cell cycle restraint, cytoskeletal remodelling, and immunosuppressive signalling relative to CNS-engrafted cells — features consistent with established properties of the BM niche.
The full bulk signatures showed modest overlap with single-cell DE results (Jaccard CNS = 0.094, BM = 0.051), consistent with differences between model systems and cell populations profiled. Scoring is therefore restricted to the conserved gene sets validated in both experiments, representing the highest-confidence cross-experiment features.
# Guard — recreate overlap vectors if not in session
if (!exists("cns_overlap")) {
cns_overlap <- intersect(
bulk_de %>% filter(log2FoldChange > 0) %>% pull(gene), sc_cns_genes)
bm_overlap <- intersect(
bulk_de %>% filter(log2FoldChange < 0) %>% pull(gene), sc_bm_genes)
cat("Overlap vectors recreated — CNS:", length(cns_overlap),
"/ BM:", length(bm_overlap), "\n")
}
bulk_cns_conserved <- cns_overlap # 12 genes
bulk_bm_conserved <- bm_overlap # 11 genes
bulk_cns_conserved_bm <- bulk_cns_conserved[bulk_cns_conserved %in% rownames(leuk_BM)]
bulk_cns_conserved_cns <- bulk_cns_conserved[bulk_cns_conserved %in% rownames(leuk_CNS_clean)]
bulk_bm_conserved_bm <- bulk_bm_conserved[bulk_bm_conserved %in% rownames(leuk_BM)]
bulk_bm_conserved_cns <- bulk_bm_conserved[bulk_bm_conserved %in% rownames(leuk_CNS_clean)]
cat("Conserved CNS genes — BM:", length(bulk_cns_conserved_bm),
"/ CNS:", length(bulk_cns_conserved_cns), "\n")
## Conserved CNS genes — BM: 12 / CNS: 12
cat("Conserved BM genes — BM:", length(bulk_bm_conserved_bm),
"/ CNS:", length(bulk_bm_conserved_cns), "\n")
## Conserved BM genes — BM: 11 / CNS: 11
leuk_BM <- AddModuleScore(leuk_BM,
features = list(bulk_cns_conserved_bm),
name = "bulk_CNS_sig",
seed = 42)
leuk_CNS_clean <- AddModuleScore(leuk_CNS_clean,
features = list(bulk_cns_conserved_cns),
name = "bulk_CNS_sig",
seed = 42)
leuk_BM <- AddModuleScore(leuk_BM,
features = list(bulk_bm_conserved_bm),
name = "bulk_BM_sig",
seed = 42)
leuk_CNS_clean <- AddModuleScore(leuk_CNS_clean,
features = list(bulk_bm_conserved_cns),
name = "bulk_BM_sig",
seed = 42)
cat("\nConserved CNS signature score ranges:\n")
##
## Conserved CNS signature score ranges:
cat(" BM cells:", round(range(leuk_BM$bulk_CNS_sig1), 3), "\n")
## BM cells: -0.104 0.453
cat(" CNS cells:", round(range(leuk_CNS_clean$bulk_CNS_sig1), 3), "\n")
## CNS cells: -0.158 0.468
cat("\nConserved BM signature score ranges:\n")
##
## Conserved BM signature score ranges:
cat(" BM cells:", round(range(leuk_BM$bulk_BM_sig1), 3), "\n")
## BM cells: -0.345 0.701
cat(" CNS cells:", round(range(leuk_CNS_clean$bulk_BM_sig1), 3), "\n")
## CNS cells: -0.346 0.723
# Sanity check — BM cells should score higher on BM sig, CNS cells on CNS sig
bind_rows(
leuk_BM@meta.data %>% summarise(Tissue = "BM",
mean_CNS_sig = round(mean(bulk_CNS_sig1), 4),
mean_BM_sig = round(mean(bulk_BM_sig1), 4)),
leuk_CNS_clean@meta.data %>% summarise(Tissue = "CNS",
mean_CNS_sig = round(mean(bulk_CNS_sig1), 4),
mean_BM_sig = round(mean(bulk_BM_sig1), 4))
) %>% kable(caption = "Sanity check — mean conserved signature scores by tissue")
| Tissue | mean_CNS_sig | mean_BM_sig |
|---|---|---|
| BM | -0.0038 | 0.0484 |
| CNS | -0.0097 | -0.0301 |
Both conserved signatures showed statistically significant tissue separation at the single-cell level. The BM signature showed stronger effect size, with BM cells scoring substantially higher than CNS cells. The CNS signature showed more modest but significant separation, driven by a subset of highly scoring CNS cells visible as a right tail in the score distribution.
bulk_scores_tissue <- bind_rows(
leuk_BM@meta.data %>%
select(Tissue, bulk_CNS_sig1, bulk_BM_sig1),
leuk_CNS_clean@meta.data %>%
select(Tissue, bulk_CNS_sig1, bulk_BM_sig1)
) %>%
pivot_longer(c(bulk_CNS_sig1, bulk_BM_sig1),
names_to = "signature",
values_to = "score") %>%
mutate(signature = recode(signature,
"bulk_CNS_sig1" = "Bulk CNS signature",
"bulk_BM_sig1" = "Bulk BM signature"))
wt_cns <- wilcox.test(leuk_BM$bulk_CNS_sig1, leuk_CNS_clean$bulk_CNS_sig1)
wt_bm <- wilcox.test(leuk_BM$bulk_BM_sig1, leuk_CNS_clean$bulk_BM_sig1)
ggplot(bulk_scores_tissue,
aes(x = Tissue, y = score, fill = Tissue)) +
geom_violin(trim = FALSE, alpha = 0.8) +
geom_boxplot(width = 0.1, fill = "white", outlier.size = 0.2) +
scale_fill_manual(values = tissue_cols) +
facet_wrap(~signature, scales = "free_y") +
geom_hline(yintercept = 0, linetype = "dashed", colour = "grey40") +
theme_classic(base_size = 12) +
labs(y = "Module score",
title = paste0("Bulk RNAseq signatures by tissue\n",
"BM sig p = ", signif(wt_bm$p.value, 3),
" | CNS sig p = ", signif(wt_cns$p.value, 3))) +
theme(legend.position = "none")
Conserved bulk RNAseq signature scores by tissue
The CNS-enriched gene set identified by pseudobulk DE (script 04) is interrogated for differential expression between the LSK_IL7R and LK_CLP propagating populations within the CNS compartment. Genes that are both CNS-enriched relative to BM and differentially expressed between populations represent the most biologically informative candidates — they are part of the CNS adaptation programme and are preferentially associated with one propagating population, directly connecting tissue-specific transcriptional reprogramming to leukaemia population biology.
cns_pop_path <- "/exports/eddie/scratch/aduguid3/harmony_clustering/05_de_cns_sig_by_population.qs"
if (!file.exists(cns_pop_path)) {
leuk_cns_pop <- subset(leuk_CNS_clean,
subset = population != "Intermediate")
Idents(leuk_cns_pop) <- "population"
markers_cns_pop <- FindMarkers(
leuk_cns_pop,
ident.1 = "LSK_IL7R",
ident.2 = "LK_CLP",
features = cns_sig_genes[cns_sig_genes %in% rownames(leuk_cns_pop)],
test.use = "wilcox",
min.pct = 0.05,
logfc.threshold = 0
) %>%
rownames_to_column("gene") %>%
left_join(
res_clean_df %>% select(gene,
cns_log2FC = log2FoldChange,
cns_padj = padj),
by = "gene"
) %>%
arrange(p_val_adj)
qs_save(markers_cns_pop, cns_pop_path)
} else {
markers_cns_pop <- qs_read(cns_pop_path)
}
cat("CNS signature genes tested:", nrow(markers_cns_pop), "\n")
## CNS signature genes tested: 28
cat("Significant between populations (padj < 0.05):",
sum(markers_cns_pop$p_val_adj < 0.05, na.rm = TRUE), "\n")
## Significant between populations (padj < 0.05): 19
markers_cns_pop %>%
filter(p_val_adj < 0.05) %>%
select(gene, avg_log2FC, pct.1, pct.2, p_val_adj, cns_log2FC) %>%
mutate(across(where(is.numeric), ~round(., 4))) %>%
arrange(p_val_adj) %>%
kable(caption = "CNS-enriched genes that differ between LSK_IL7R and LK_CLP within CNS")
| gene | avg_log2FC | pct.1 | pct.2 | p_val_adj | cns_log2FC |
|---|---|---|---|---|---|
| Dsg1c | 1.1574 | 0.060 | 0.019 | 0e+00 | 0.7137 |
| Grik1 | 0.3566 | 0.138 | 0.076 | 0e+00 | 1.6282 |
| Gm30230 | 0.7821 | 0.072 | 0.031 | 0e+00 | 0.5048 |
| Egfem1 | 0.2575 | 0.169 | 0.097 | 0e+00 | 0.8184 |
| Plaat3 | -0.7237 | 0.410 | 0.419 | 0e+00 | 0.5200 |
| Plpp3 | 0.5384 | 0.065 | 0.030 | 0e+00 | 0.5034 |
| Gm44067 | 0.4729 | 0.069 | 0.034 | 0e+00 | 0.5263 |
| Soat1 | 0.0388 | 0.390 | 0.273 | 0e+00 | 0.6900 |
| Gpr83 | 0.0069 | 0.257 | 0.176 | 0e+00 | 0.8085 |
| Pdzd2 | 0.0427 | 0.107 | 0.065 | 0e+00 | 0.5502 |
| Dnm3os | -0.8819 | 0.224 | 0.241 | 0e+00 | 0.6839 |
| Dntt | -0.0654 | 0.645 | 0.538 | 0e+00 | 1.2912 |
| Stc1 | -0.1145 | 0.064 | 0.039 | 0e+00 | 0.8787 |
| Nectin1 | 0.2391 | 0.052 | 0.032 | 0e+00 | 0.6646 |
| Fgd1 | 0.0356 | 0.062 | 0.041 | 0e+00 | 0.7965 |
| Tdrd5 | 0.0270 | 0.070 | 0.047 | 0e+00 | 1.3246 |
| Slc5a9 | -0.2499 | 0.306 | 0.250 | 0e+00 | 0.5022 |
| Greb1 | -0.3496 | 0.076 | 0.057 | 0e+00 | 0.7967 |
| Snn | -0.9659 | 0.063 | 0.076 | 6e-04 | 0.5038 |
Each point represents a CNS-enriched gene. Position on the x-axis reflects how strongly it is enriched in CNS vs BM leukaemia (from the pseudobulk DE); position on the y-axis reflects whether it is higher in LSK_IL7R (positive) or LK_CLP (negative) within the CNS compartment. Genes in the upper-right quadrant are CNS-enriched and LSK_IL7R-high; genes in the lower-right are CNS-enriched and LK_CLP-high. Highlighted genes are significant between populations (padj < 0.05).
markers_cns_pop %>%
mutate(
direction = case_when(
p_val_adj < 0.05 & avg_log2FC > 0 ~ "LSK_IL7R-high",
p_val_adj < 0.05 & avg_log2FC < 0 ~ "LK_CLP-high",
TRUE ~ "NS"
),
label = ifelse(p_val_adj < 0.05 & abs(avg_log2FC) > 0.3, gene, NA)
) %>%
ggplot(aes(x = cns_log2FC, y = avg_log2FC, colour = direction)) +
geom_point(size = 1.5, alpha = 0.7) +
ggrepel::geom_text_repel(aes(label = label),
size = 2.5,
max.overlaps = 25,
box.padding = 0.3) +
scale_colour_manual(
values = c("LSK_IL7R-high" = "#1a3a6b",
"LK_CLP-high" = "#8b0000",
"NS" = "grey70")) +
geom_hline(yintercept = 0, linetype = "dashed", colour = "grey40") +
geom_vline(xintercept = 0, linetype = "dashed", colour = "grey40") +
theme_classic(base_size = 12) +
labs(x = "log2FC (CNS vs BM) — tissue enrichment",
y = "log2FC (LSK_IL7R vs LK_CLP) — population difference within CNS",
title = "CNS signature genes — tissue enrichment vs population difference",
colour = NULL) +
theme(legend.position = "bottom")
## Warning: Removed 19 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
CNS signature genes — tissue enrichment vs population difference within CNS
top_lsk_cns <- markers_cns_pop %>%
filter(p_val_adj < 0.05, avg_log2FC > 0) %>%
head(4) %>% pull(gene)
top_clp_cns <- markers_cns_pop %>%
filter(p_val_adj < 0.05, avg_log2FC < 0) %>%
head(4) %>% pull(gene)
top_genes <- c(top_lsk_cns, top_clp_cns)
if (length(top_genes) > 0) {
VlnPlot(leuk_CNS_clean,
features = top_genes,
group.by = "population",
cols = pop_cols,
pt.size = 0,
ncol = 4) &
theme(legend.position = "none",
axis.text.x = element_text(angle = 45, hjust = 1))
} else {
cat("No significant population-specific CNS genes to plot\n")
}
Top CNS-enriched genes by population within CNS
markers_cns_pop %>%
filter(p_val_adj < 0.05) %>%
mutate(population_enriched = ifelse(avg_log2FC > 0, "LSK_IL7R", "LK_CLP")) %>%
group_by(population_enriched) %>%
summarise(
n_genes = n(),
top_genes = paste(head(gene, 5), collapse = ", "),
.groups = "drop"
) %>%
kable(caption = "CNS-enriched genes significant between populations — summary by direction")
| population_enriched | n_genes | top_genes |
|---|---|---|
| LK_CLP | 7 | Plaat3, Dnm3os, Dntt, Stc1, Slc5a9 |
| LSK_IL7R | 12 | Dsg1c, Grik1, Gm30230, Egfem1, Plpp3 |
write.csv(markers_cns_pop,
"10_DE_cns_sig_by_population.csv",
row.names = FALSE)
cat("Saved: 10_DE_cns_sig_by_population.csv\n")
## Saved: 10_DE_cns_sig_by_population.csv
Cell cycle phase assignment was performed on the parent leukaemia
object (script 03) using Seurat’s CellCycleScoring with
canonical S-phase and G2M-phase gene sets. Here, cell cycle phase
distributions are compared across two axes: tissue compartment (BM vs
CNS) and propagating population identity (LSK_IL7R vs LK_CLP). This is
of particular biological interest because the S4 bulk signature used for
population scoring is proliferation and cell cycle enriched — LSK_IL7R
cells are expected to be more proliferative, and the CNS niche may
impose differential cell cycle constraints relative to BM.
cc_cols_bm <- c("S.Score", "G2M.Score", "Phase") %in% colnames(leuk_BM@meta.data)
cc_cols_cns <- c("S.Score", "G2M.Score", "Phase") %in% colnames(leuk_CNS_clean@meta.data)
cat("BM — cell cycle columns present:", all(cc_cols_bm), "\n")
## BM — cell cycle columns present: TRUE
cat("CNS — cell cycle columns present:", all(cc_cols_cns), "\n")
## CNS — cell cycle columns present: TRUE
if (!all(cc_cols_bm) || !all(cc_cols_cns)) {
cat("Cell cycle scores missing — rerunning CellCycleScoring\n")
s_genes <- stringr::str_to_title(tolower(cc.genes.updated.2019$s.genes))
g2m_genes <- stringr::str_to_title(tolower(cc.genes.updated.2019$g2m.genes))
leuk_BM <- CellCycleScoring(leuk_BM,
s.features = s_genes,
g2m.features = g2m_genes,
set.ident = FALSE)
leuk_CNS_clean <- CellCycleScoring(leuk_CNS_clean,
s.features = s_genes,
g2m.features = g2m_genes,
set.ident = FALSE)
cat("CellCycleScoring complete\n")
}
cat("\nBM phase distribution:\n")
##
## BM phase distribution:
print(round(prop.table(table(leuk_BM$Phase)) * 100, 1))
##
## G1 G2M S
## 45.3 22.5 32.2
cat("\nCNS phase distribution:\n")
##
## CNS phase distribution:
print(round(prop.table(table(leuk_CNS_clean$Phase)) * 100, 1))
##
## G1 G2M S
## 46.0 24.0 30.1
phase_cols <- c("G1" = "#4a90d9", "S" = "#e67e22", "G2M" = "#27ae60")
cc_tissue <- bind_rows(
leuk_BM@meta.data %>% select(Tissue, Phase, S.Score, G2M.Score),
leuk_CNS_clean@meta.data %>% select(Tissue, Phase, S.Score, G2M.Score)
) %>%
group_by(Tissue, Phase) %>%
summarise(n = n(), .groups = "drop") %>%
group_by(Tissue) %>%
mutate(prop = n / sum(n))
ggplot(cc_tissue, aes(x = Tissue, y = prop, fill = Phase)) +
geom_bar(stat = "identity") +
scale_fill_manual(values = phase_cols) +
scale_y_continuous(labels = scales::percent) +
theme_classic(base_size = 12) +
labs(x = NULL, y = "Proportion of cells",
title = "Cell cycle phase distribution — BM vs CNS") +
theme(legend.position = "right")
Cell cycle phase distribution — BM vs CNS
bind_rows(
leuk_BM@meta.data %>% select(Tissue, S.Score, G2M.Score),
leuk_CNS_clean@meta.data %>% select(Tissue, S.Score, G2M.Score)
) %>%
pivot_longer(c(S.Score, G2M.Score),
names_to = "score_type",
values_to = "score") %>%
ggplot(aes(x = Tissue, y = score, fill = Tissue)) +
geom_violin(trim = FALSE, alpha = 0.8) +
geom_boxplot(width = 0.1, fill = "white", outlier.size = 0.2) +
scale_fill_manual(values = tissue_cols) +
facet_wrap(~score_type, scales = "free_y") +
geom_hline(yintercept = 0, linetype = "dashed", colour = "grey40") +
theme_classic(base_size = 12) +
labs(y = "Score", title = "Cell cycle scores — BM vs CNS") +
theme(legend.position = "none")
S and G2M scores — BM vs CNS
wt_s <- wilcox.test(leuk_BM$S.Score, leuk_CNS_clean$S.Score)
wt_g2m <- wilcox.test(leuk_BM$G2M.Score, leuk_CNS_clean$G2M.Score)
bind_rows(
leuk_BM@meta.data %>%
summarise(Tissue = "BM",
mean_S = round(mean(S.Score), 4),
mean_G2M = round(mean(G2M.Score), 4),
pct_S = round(mean(Phase == "S") * 100, 1),
pct_G2M = round(mean(Phase == "G2M") * 100, 1),
pct_G1 = round(mean(Phase == "G1") * 100, 1)),
leuk_CNS_clean@meta.data %>%
summarise(Tissue = "CNS",
mean_S = round(mean(S.Score), 4),
mean_G2M = round(mean(G2M.Score), 4),
pct_S = round(mean(Phase == "S") * 100, 1),
pct_G2M = round(mean(Phase == "G2M") * 100, 1),
pct_G1 = round(mean(Phase == "G1") * 100, 1))
) %>%
kable(caption = paste0(
"Cell cycle summary by tissue — ",
"S score p = ", signif(wt_s$p.value, 3), ", ",
"G2M score p = ", signif(wt_g2m$p.value, 3)
))
| Tissue | mean_S | mean_G2M | pct_S | pct_G2M | pct_G1 |
|---|---|---|---|---|---|
| BM | -0.0501 | -0.0320 | 32.2 | 22.5 | 45.3 |
| CNS | -0.0646 | -0.0205 | 30.1 | 24.0 | 46.0 |
cc_pop <- bind_rows(
leuk_BM@meta.data %>% select(Tissue, population, Phase, S.Score, G2M.Score),
leuk_CNS_clean@meta.data %>% select(Tissue, population, Phase, S.Score, G2M.Score)
) %>%
filter(population != "Intermediate") %>%
group_by(Tissue, population, Phase) %>%
summarise(n = n(), .groups = "drop") %>%
group_by(Tissue, population) %>%
mutate(prop = n / sum(n))
ggplot(cc_pop, aes(x = population, y = prop, fill = Phase)) +
geom_bar(stat = "identity") +
scale_fill_manual(values = phase_cols) +
scale_y_continuous(labels = scales::percent) +
facet_wrap(~Tissue) +
theme_classic(base_size = 12) +
labs(x = NULL, y = "Proportion of cells",
title = "Cell cycle phase by population and tissue") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
Cell cycle phase distribution by population — BM and CNS
bind_rows(
leuk_BM@meta.data %>% select(Tissue, population, S.Score, G2M.Score),
leuk_CNS_clean@meta.data %>% select(Tissue, population, S.Score, G2M.Score)
) %>%
filter(population != "Intermediate") %>%
pivot_longer(c(S.Score, G2M.Score),
names_to = "score_type",
values_to = "score") %>%
ggplot(aes(x = population, y = score, fill = population)) +
geom_violin(trim = FALSE, alpha = 0.8) +
geom_boxplot(width = 0.1, fill = "white", outlier.size = 0.2) +
scale_fill_manual(values = pop_cols) +
facet_grid(score_type ~ Tissue, scales = "free_y") +
geom_hline(yintercept = 0, linetype = "dashed", colour = "grey40") +
theme_classic(base_size = 11) +
labs(y = "Score", title = "Cell cycle scores by population and tissue") +
theme(legend.position = "none",
axis.text.x = element_text(angle = 45, hjust = 1))
S and G2M scores by population within each tissue
cc_stats <- bind_rows(
leuk_BM@meta.data %>% select(Tissue, population, Phase, S.Score, G2M.Score),
leuk_CNS_clean@meta.data %>% select(Tissue, population, Phase, S.Score, G2M.Score)
) %>%
filter(population != "Intermediate")
wt_results <- lapply(c("BM", "CNS"), function(tiss) {
dat <- cc_stats %>% filter(Tissue == tiss)
lsk <- dat %>% filter(population == "LSK_IL7R")
clp <- dat %>% filter(population == "LK_CLP")
data.frame(
Tissue = tiss,
p_S_score = signif(wilcox.test(lsk$S.Score, clp$S.Score)$p.value, 3),
p_G2M_score = signif(wilcox.test(lsk$G2M.Score, clp$G2M.Score)$p.value, 3)
)
}) %>% bind_rows()
cc_stats %>%
group_by(Tissue, population) %>%
summarise(
n = n(),
mean_S = round(mean(S.Score), 4),
mean_G2M = round(mean(G2M.Score), 4),
pct_S = round(mean(Phase == "S") * 100, 1),
pct_G2M = round(mean(Phase == "G2M") * 100, 1),
pct_G1 = round(mean(Phase == "G1") * 100, 1),
.groups = "drop"
) %>%
kable(caption = "Cell cycle summary by population and tissue")
| Tissue | population | n | mean_S | mean_G2M | pct_S | pct_G2M | pct_G1 |
|---|---|---|---|---|---|---|---|
| BM | LK_CLP | 29528 | -0.1270 | -0.2608 | 32.1 | 0.4 | 67.5 |
| BM | LSK_IL7R | 9945 | 0.0537 | 0.5784 | 15.5 | 84.5 | 0.0 |
| CNS | LK_CLP | 31012 | -0.1436 | -0.2641 | 30.4 | 0.5 | 69.1 |
| CNS | LSK_IL7R | 10942 | 0.0378 | 0.6020 | 13.1 | 86.9 | 0.0 |
kable(wt_results,
caption = "Wilcoxon test — LSK_IL7R vs LK_CLP cell cycle scores within each tissue")
| Tissue | p_S_score | p_G2M_score |
|---|---|---|
| BM | 0 | 0 |
| CNS | 0 | 0 |
phase_vec <- c(
setNames(leuk_BM$Phase[match(bm_orig, colnames(leuk_BM))], bm_cells),
setNames(leuk_CNS_clean$Phase[match(cns_orig, colnames(leuk_CNS_clean))], cns_cells)
)
s_vec <- c(
setNames(leuk_BM$S.Score[match(bm_orig, colnames(leuk_BM))], bm_cells),
setNames(leuk_CNS_clean$S.Score[match(cns_orig, colnames(leuk_CNS_clean))], cns_cells)
)
g2m_vec <- c(
setNames(leuk_BM$G2M.Score[match(bm_orig, colnames(leuk_BM))], bm_cells),
setNames(leuk_CNS_clean$G2M.Score[match(cns_orig, colnames(leuk_CNS_clean))], cns_cells)
)
leuk_all <- AddMetaData(leuk_all, metadata = phase_vec, col.name = "Phase")
leuk_all <- AddMetaData(leuk_all, metadata = s_vec, col.name = "S.Score")
leuk_all <- AddMetaData(leuk_all, metadata = g2m_vec, col.name = "G2M.Score")
leuk_all_bm <- subset(leuk_all, subset = Tissue == "BM")
leuk_all_cns <- subset(leuk_all, subset = Tissue == "CNS")
p_phase_bm <- DimPlot(leuk_all_bm, group.by = "Phase", cols = phase_cols) +
ggtitle("BM — cell cycle phase")
p_phase_cns <- DimPlot(leuk_all_cns, group.by = "Phase", cols = phase_cols) +
ggtitle("CNS — cell cycle phase")
p_s_bm <- FeaturePlot(leuk_all_bm,
features = "S.Score",
cols = c("lightgrey", "#e67e22"),
order = TRUE,
min.cutoff = "q10") +
ggtitle("BM — S score")
p_s_cns <- FeaturePlot(leuk_all_cns,
features = "S.Score",
cols = c("lightgrey", "#e67e22"),
order = TRUE,
min.cutoff = "q10") +
ggtitle("CNS — S score")
(p_phase_bm + p_phase_cns) / (p_s_bm + p_s_cns)
Cell cycle phase and S score on Harmony-integrated UMAP
# Save tissue-specific objects with all metadata added during this script:
# LSK_IL7R_BM1, population, CNS_sig1, BM_sig1,
# bulk_CNS_sig1, bulk_BM_sig1, Phase, S.Score, G2M.Score
qs_save(leuk_BM,
"/exports/eddie/scratch/aduguid3/harmony_clustering/05_leuk_BM_scored.qs")
qs_save(leuk_CNS_clean,
"/exports/eddie/scratch/aduguid3/harmony_clustering/05_leuk_CNS_clean_scored.qs")
qs_save(leuk_all,
"/exports/eddie/scratch/aduguid3/harmony_clustering/05_leuk_all_scored.qs")
cat("Saved: 05_leuk_BM_scored.qs\n")
## Saved: 05_leuk_BM_scored.qs
cat("Saved: 05_leuk_CNS_clean_scored.qs\n")
## Saved: 05_leuk_CNS_clean_scored.qs
cat("Saved: 05_leuk_all_scored.qs\n")
## Saved: 05_leuk_all_scored.qs
# Confirm all expected columns are present
expected_cols <- c("LSK_IL7R_BM1", "population",
"CNS_sig1", "BM_sig1",
"bulk_CNS_sig1", "bulk_BM_sig1",
"Phase", "S.Score", "G2M.Score")
cat("\nColumn check — BM object:\n")
##
## Column check — BM object:
for (col in expected_cols) {
cat(" ", col, ":", col %in% colnames(leuk_BM@meta.data), "\n")
}
## LSK_IL7R_BM1 : TRUE
## population : TRUE
## CNS_sig1 : TRUE
## BM_sig1 : TRUE
## bulk_CNS_sig1 : TRUE
## bulk_BM_sig1 : TRUE
## Phase : TRUE
## S.Score : TRUE
## G2M.Score : TRUE
cat("\nColumn check — CNS object:\n")
##
## Column check — CNS object:
for (col in expected_cols) {
cat(" ", col, ":", col %in% colnames(leuk_CNS_clean@meta.data), "\n")
}
## LSK_IL7R_BM1 : TRUE
## population : TRUE
## CNS_sig1 : TRUE
## BM_sig1 : TRUE
## bulk_CNS_sig1 : TRUE
## bulk_BM_sig1 : TRUE
## Phase : TRUE
## S.Score : TRUE
## G2M.Score : TRUE
sessionInfo()
## R version 4.4.1 (2024-06-14)
## Platform: x86_64-pc-linux-gnu
## Running under: Rocky Linux 9.5 (Blue Onyx)
##
## Matrix products: default
## BLAS/LAPACK: /opt/intel/oneapi/mkl/2024.0/lib/libmkl_gf_lp64.so.2; LAPACK version 3.10.1
##
## locale:
## [1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_GB.UTF-8
## [5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
## [7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
##
## time zone: Europe/London
## tzcode source: system (glibc)
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] UCell_2.10.1 harmony_1.2.4 Rcpp_1.0.12 scales_1.4.0
## [5] ggrepel_0.9.6 tibble_3.3.1 tidyr_1.3.1 viridis_0.6.5
## [9] viridisLite_0.4.2 qs2_0.1.7 knitr_1.51 dplyr_1.1.4
## [13] patchwork_1.3.2 ggplot2_4.0.2 Seurat_5.4.0 SeuratObject_5.3.0
## [17] sp_2.1-4
##
## loaded via a namespace (and not attached):
## [1] RColorBrewer_1.1-3 rstudioapi_0.16.0
## [3] jsonlite_2.0.0 magrittr_2.0.3
## [5] ggbeeswarm_0.7.3 spatstat.utils_3.2-1
## [7] farver_2.1.2 rmarkdown_2.30
## [9] zlibbioc_1.52.0 vctrs_0.6.5
## [11] ROCR_1.0-12 spatstat.explore_3.7-0
## [13] S4Arrays_1.6.0 htmltools_0.5.8.1
## [15] BiocNeighbors_2.0.1 SparseArray_1.6.2
## [17] sass_0.4.9 sctransform_0.4.3
## [19] parallelly_1.37.1 KernSmooth_2.23-24
## [21] bslib_0.7.0 htmlwidgets_1.6.4
## [23] ica_1.0-3 plyr_1.8.9
## [25] plotly_4.12.0 zoo_1.8-15
## [27] cachem_1.1.0 igraph_2.2.2
## [29] mime_0.12 lifecycle_1.0.5
## [31] pkgconfig_2.0.3 Matrix_1.7-0
## [33] R6_2.6.1 fastmap_1.2.0
## [35] GenomeInfoDbData_1.2.13 MatrixGenerics_1.18.1
## [37] fitdistrplus_1.2-6 future_1.33.2
## [39] shiny_1.12.1 digest_0.6.35
## [41] S4Vectors_0.44.0 tensor_1.5.1
## [43] RSpectra_0.16-2 irlba_2.3.7
## [45] GenomicRanges_1.58.0 labeling_0.4.3
## [47] progressr_0.18.0 fansi_1.0.6
## [49] spatstat.sparse_3.1-0 httr_1.4.7
## [51] polyclip_1.10-7 abind_1.4-5
## [53] compiler_4.4.1 withr_3.0.2
## [55] S7_0.2.1 BiocParallel_1.40.2
## [57] fastDummies_1.7.5 MASS_7.3-60.2
## [59] DelayedArray_0.32.0 tools_4.4.1
## [61] vipor_0.4.7 lmtest_0.9-40
## [63] otel_0.2.0 beeswarm_0.4.0
## [65] httpuv_1.6.16 future.apply_1.11.2
## [67] goftest_1.2-3 glue_1.8.0
## [69] nlme_3.1-164 promises_1.5.0
## [71] grid_4.4.1 Rtsne_0.17
## [73] cluster_2.1.6 reshape2_1.4.4
## [75] generics_0.1.3 gtable_0.3.6
## [77] spatstat.data_3.1-9 data.table_1.15.4
## [79] XVector_0.46.0 stringfish_0.18.0
## [81] utf8_1.2.4 BiocGenerics_0.52.0
## [83] spatstat.geom_3.7-0 RcppAnnoy_0.0.23
## [85] RANN_2.6.2 pillar_1.9.0
## [87] stringr_1.5.1 spam_2.11-3
## [89] RcppHNSW_0.6.0 later_1.4.6
## [91] splines_4.4.1 lattice_0.22-6
## [93] survival_3.6-4 deldir_2.0-4
## [95] tidyselect_1.2.1 SingleCellExperiment_1.28.1
## [97] miniUI_0.1.2 pbapply_1.7-4
## [99] gridExtra_2.3 IRanges_2.40.1
## [101] SummarizedExperiment_1.36.0 scattermore_1.2
## [103] stats4_4.4.1 xfun_0.56
## [105] Biobase_2.66.0 matrixStats_1.5.0
## [107] UCSC.utils_1.2.0 stringi_1.8.4
## [109] lazyeval_0.2.2 yaml_2.3.12
## [111] evaluate_1.0.5 codetools_0.2-20
## [113] cli_3.6.5 uwot_0.2.4
## [115] RcppParallel_5.1.8 xtable_1.8-4
## [117] reticulate_1.45.0 jquerylib_0.1.4
## [119] GenomeInfoDb_1.42.3 dichromat_2.0-0.1
## [121] globals_0.16.3 spatstat.random_3.4-4
## [123] png_0.1-8 ggrastr_1.0.2
## [125] spatstat.univar_3.1-6 parallel_4.4.1
## [127] dotCall64_1.2 listenv_0.9.1
## [129] ggridges_0.5.6 crayon_1.5.2
## [131] purrr_1.0.2 rlang_1.1.7
## [133] cowplot_1.2.0