Inspection of the CNS vs BM pseudobulk DE results revealed highly upregulated genes in CNS leukaemia with characteristics inconsistent with genuine leukaemia cell expression:
The combination of very low mean expression and extreme log2FC is the hallmark of ambient RNA contamination from lysed CNS-resident cells contaminating the droplet soup during 10x Genomics library preparation. This is expected and well-described in CNS single-cell experiments where leukaemia cells are isolated alongside astrocytes, oligodendrocytes, and neurons that lyse readily during tissue dissociation.
This script: 1. Characterises the contamination — which cells, which genes, what level 2. Assesses whether contamination co-occurs in the same cells or is diffuse 3. Runs DecontX to estimate and remove ambient RNA contamination 4. Re-runs the CNS vs BM DE analysis on decontaminated counts 5. Compares pre- and post-decontamination results
library(Seurat)
## Loading required package: SeuratObject
## Loading required package: sp
##
## Attaching package: 'SeuratObject'
## The following objects are masked from 'package:base':
##
## intersect, t
library(ggplot2)
library(patchwork)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(knitr)
library(qs2)
## qs2 0.1.7
library(viridis)
## Loading required package: viridisLite
library(tidyr)
library(tibble)
library(ggrepel)
library(DESeq2)
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:dplyr':
##
## combine, intersect, setdiff, union
## The following object is masked from 'package:SeuratObject':
##
## intersect
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, aperm, append, as.data.frame, basename, cbind,
## colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
## get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, saveRDS, setdiff,
## table, tapply, union, unique, unsplit, which.max, which.min
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:tidyr':
##
## expand
## The following objects are masked from 'package:dplyr':
##
## first, rename
## The following object is masked from 'package:utils':
##
## findMatches
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
##
## Attaching package: 'IRanges'
## The following objects are masked from 'package:dplyr':
##
## collapse, desc, slice
## The following object is masked from 'package:sp':
##
## %over%
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
##
## Attaching package: 'matrixStats'
## The following object is masked from 'package:dplyr':
##
## count
##
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
##
## colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
## colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
## colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
## colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
## colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
## colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
## colWeightedMeans, colWeightedMedians, colWeightedSds,
## colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
## rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
## rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
## rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
## rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
## rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
## rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
## rowWeightedSds, rowWeightedVars
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
##
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
##
## rowMedians
## The following objects are masked from 'package:matrixStats':
##
## anyMissing, rowMedians
##
## Attaching package: 'SummarizedExperiment'
## The following object is masked from 'package:Seurat':
##
## Assays
## The following object is masked from 'package:SeuratObject':
##
## Assays
library(celda)
## Loading required package: SingleCellExperiment
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following object is masked from 'package:S4Vectors':
##
## expand
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Registered S3 method overwritten by 'pROC':
## method from
## plot.roc spatstat.explore
##
## Attaching package: 'celda'
## The following object is masked from 'package:S4Vectors':
##
## params
library(SingleCellExperiment)
tissue_cols <- c("BM" = "steelblue", "CNS" = "firebrick")
bm_res_col <- "originalexp_snn_res.0.3"
cns_res_col <- "originalexp_snn_res.0.3"
leuk_BM <- qs_read("/exports/eddie/scratch/aduguid3/harmony_clustering/04_harmony_recluster_leuk_BM_march26.qs")
leuk_CNS <- qs_read("/exports/eddie/scratch/aduguid3/harmony_clustering/04_harmony_recluster_leuk_CNS_march26.qs")
stopifnot("leuk_BM not found" = exists("leuk_BM"))
stopifnot("leuk_CNS not found" = exists("leuk_CNS"))
res_all_df <- read.csv("/exports/eddie/scratch/aduguid3/Rmarkdown/02_DE_CNSvsBM_all_mice_source_corrected.csv")
Idents(leuk_BM) <- bm_res_col
Idents(leuk_CNS) <- cns_res_col
cat("BM cells:", ncol(leuk_BM), "\n")
## BM cells: 44606
cat("CNS cells:", ncol(leuk_CNS), "\n")
## CNS cells: 47220
# CNS cell type markers — expected contaminants from dissociation
astrocyte_markers <- c("Slc1a2", "Sparcl1", "Ndrg2", "S100b",
"Atp1a2", "Gfap", "Aldh1l1","Aqp4",
"Gjb6", "Cldn10", "Slc1a3")
oligodendrocyte_markers <- c("Mobp", "Mog", "Plp1", "Mbp",
"Cnp", "Mag", "Apod", "Cpe")
neuronal_markers <- c("Snap25", "Syt1", "Rbfox3","Map2",
"Tubb3", "Syp", "Nefm", "Nefl")
microglia_markers <- c("Cx3cr1", "P2ry12","Tmem119","Hexb",
"Sall1", "Trem2", "Itgam")
# All CNS contamination markers
cns_contam_all <- unique(c(astrocyte_markers, oligodendrocyte_markers,
neuronal_markers, microglia_markers))
# Check availability in each object
bm_contam_found <- cns_contam_all[cns_contam_all %in% rownames(leuk_BM)]
cns_contam_found <- cns_contam_all[cns_contam_all %in% rownames(leuk_CNS)]
cat("Contamination markers in BM object:", length(bm_contam_found), "\n")
## Contamination markers in BM object: 33
cat("Contamination markers in CNS object:", length(cns_contam_found), "\n")
## Contamination markers in CNS object: 33
# Flag CNS cell type genes in DE results
res_all_df %>%
filter(gene %in% cns_contam_all) %>%
mutate(
cell_type = case_when(
gene %in% astrocyte_markers ~ "Astrocyte",
gene %in% oligodendrocyte_markers ~ "Oligodendrocyte",
gene %in% neuronal_markers ~ "Neuronal",
gene %in% microglia_markers ~ "Microglia"
),
suspect = baseMean < 100 & log2FoldChange > 2
) %>%
select(gene, cell_type, baseMean, log2FoldChange, padj, suspect) %>%
mutate(across(where(is.numeric), ~round(., 3))) %>%
arrange(desc(log2FoldChange)) %>%
kable(caption = "CNS cell type marker genes in DE results")
| gene | cell_type | baseMean | log2FoldChange | padj | suspect |
|---|---|---|---|---|---|
| Slc1a2 | Astrocyte | 34.083 | 8.398 | 0.000 | TRUE |
| Sparcl1 | Astrocyte | 29.903 | 8.272 | 0.000 | TRUE |
| Cpe | Oligodendrocyte | 39.133 | 7.103 | 0.000 | TRUE |
| Apod | Oligodendrocyte | 26.991 | 6.319 | 0.000 | TRUE |
| Mobp | Oligodendrocyte | 11.550 | 5.800 | 0.000 | TRUE |
| S100b | Astrocyte | 10.031 | 5.299 | 0.000 | TRUE |
| Atp1a2 | Astrocyte | 9.866 | 4.310 | 0.000 | TRUE |
| Ndrg2 | Astrocyte | 33.312 | 4.085 | 0.000 | TRUE |
| Map2 | Neuronal | 6.996 | 2.701 | 0.033 | TRUE |
| Slc1a3 | Astrocyte | 8.165 | 2.287 | 0.065 | TRUE |
| Gfap | Astrocyte | 13.441 | 2.062 | 0.002 | TRUE |
| Syt1 | Neuronal | 5.055 | 0.910 | 0.728 | FALSE |
| P2ry12 | Microglia | 173.512 | 0.341 | 0.556 | FALSE |
| Aqp4 | Astrocyte | 35.048 | 0.258 | 0.845 | FALSE |
| Cx3cr1 | Microglia | 59.485 | 0.237 | 0.725 | FALSE |
| Mbp | Oligodendrocyte | 5901.005 | 0.200 | 0.058 | FALSE |
| Plp1 | Oligodendrocyte | 228.376 | 0.097 | 0.890 | FALSE |
| Cnp | Oligodendrocyte | 13374.713 | 0.095 | 0.277 | FALSE |
| Mag | Oligodendrocyte | 2125.666 | 0.016 | 0.981 | FALSE |
| Syp | Neuronal | 121.757 | -0.078 | 0.948 | FALSE |
| Hexb | Microglia | 42851.984 | -0.096 | 0.952 | FALSE |
| Itgam | Microglia | 258.707 | -0.111 | 0.918 | FALSE |
| Tubb3 | Neuronal | 6533.360 | -0.292 | 0.185 | FALSE |
| Cldn10 | Astrocyte | 5598.017 | -0.583 | 0.000 | FALSE |
Inspection of CNS cell type marker genes in the pseudobulk DE results revealed two distinct patterns. A subset of markers — predominantly astrocyte genes (Slc1a2, Sparcl1, S100b, Atp1a2, Ndrg2, Gfap) and oligodendrocyte genes (Cpe, Apod, Mobp) — showed very low mean expression (baseMean 9–39) combined with large log2 fold changes (4–8.4), consistent with ambient RNA contamination from lysed CNS-resident cells during tissue dissociation. Critically, highly expressed CNS marker genes — including Hexb (baseMean 42,852), Cnp (13,375), Tubb3 (6,533), and Mbp (5,901) — showed no CNS-specific upregulation (all padj > 0.05), and the astrocyte marker Cldn10 was BM-enriched. Microglia markers (Cx3cr1, P2ry12, Hexb, Itgam) were not differentially expressed. This pattern is consistent with ambient RNA from highly transcriptionally active lysed astrocytes contaminating the droplet soup, rather than whole CNS cell contamination surviving leukaemia isolation.
Low UMI counts per cell with broad distribution across many cells = ambient RNA. High UMI counts in a subset of cells = genuine expression.
# Focus on top suspect genes from DE
suspect_genes <- c("Slc1a2", "Sparcl1", "Mobp", "S100b",
"Atp1a2", "Ndrg2", "Cpe", "Gfap")
suspect_found <- suspect_genes[suspect_genes %in% rownames(leuk_CNS)]
# Extract raw counts per cell
cns_counts <- GetAssayData(leuk_CNS, layer = "counts")[suspect_found, ]
# Distribution of counts per gene
count_dist <- as.data.frame(t(as.matrix(cns_counts))) %>%
pivot_longer(everything(), names_to = "gene", values_to = "UMI")
# Key stats
count_dist %>%
group_by(gene) %>%
summarise(
n_cells_total = n(),
n_expressing = sum(UMI > 0),
pct_expressing = round(mean(UMI > 0) * 100, 2),
median_UMI = median(UMI),
mean_UMI = round(mean(UMI), 3),
max_UMI = max(UMI),
n_cells_UMI_gt5 = sum(UMI > 5),
.groups = "drop"
) %>%
kable(caption = "UMI distribution of suspect CNS genes in CNS leukaemia cells")
| gene | n_cells_total | n_expressing | pct_expressing | median_UMI | mean_UMI | max_UMI | n_cells_UMI_gt5 |
|---|---|---|---|---|---|---|---|
| Atp1a2 | 47220 | 92 | 0.19 | 0 | 0.003 | 6 | 1 |
| Cpe | 47220 | 229 | 0.48 | 0 | 0.010 | 8 | 8 |
| Gfap | 47220 | 103 | 0.22 | 0 | 0.003 | 6 | 1 |
| Mobp | 47220 | 83 | 0.18 | 0 | 0.003 | 6 | 1 |
| Ndrg2 | 47220 | 246 | 0.52 | 0 | 0.008 | 7 | 1 |
| S100b | 47220 | 79 | 0.17 | 0 | 0.003 | 27 | 1 |
| Slc1a2 | 47220 | 194 | 0.41 | 0 | 0.009 | 18 | 10 |
| Sparcl1 | 47220 | 189 | 0.40 | 0 | 0.008 | 11 | 7 |
UMI-level inspection of suspect CNS cell type marker genes confirmed that contamination is negligible. Across all eight suspect genes, fewer than 0.52% of CNS leukaemia cells expressed any counts, median UMI was zero for every gene, mean UMI ranged from 0.003 to 0.018, and the maximum observed count in any single cell was 18 UMI (Slc1a2). Fewer than 10 cells per gene had more than 5 UMI out of 47,220 total CNS leukaemia cells. These values confirm that the DE signal for these genes arises from statistical detection of trace ambient RNA at high pseudobulk aggregation depth, rather than meaningful cellular expression.
# Show distribution — ambient RNA should be mostly 0-1 UMI per cell
count_dist %>%
filter(UMI <= 10) %>% # zoom in on low end
ggplot(aes(x = gene, y = UMI, fill = gene)) +
geom_violin(trim = TRUE, scale = "width") +
geom_boxplot(width = 0.1, fill = "white",
outlier.size = 0.2, outlier.alpha = 0.3) +
scale_fill_viridis_d() +
theme_classic(base_size = 10) +
labs(x = NULL, y = "UMI count per cell (capped at 10)",
title = "Suspect CNS gene UMI distribution in CNS leukaemia cells") +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "none")
UMI count distribution of suspect genes in CNS leukaemia
If contamination is ambient (soup), suspect genes will be distributed broadly and randomly across cells. If they represent a true contaminating cell population that survived your initial isolation, they will co-occur in the same cells.
# Binary expression matrix — expressed or not
expr_binary <- as.data.frame(t(as.matrix(cns_counts) > 0))
# Jaccard similarity between genes
jaccard <- function(a, b) sum(a & b) / sum(a | b)
genes_use <- colnames(expr_binary)
jac_mat <- matrix(NA, length(genes_use), length(genes_use),
dimnames = list(genes_use, genes_use))
for (i in seq_along(genes_use)) {
for (j in seq_along(genes_use)) {
jac_mat[i, j] <- jaccard(expr_binary[[i]], expr_binary[[j]])
}
}
# Heatmap
jac_df <- as.data.frame(jac_mat) %>%
rownames_to_column("gene1") %>%
pivot_longer(-gene1, names_to = "gene2", values_to = "jaccard")
ggplot(jac_df, aes(x = gene1, y = gene2, fill = jaccard)) +
geom_tile() +
scale_fill_viridis_c(option = "magma", limits = c(0, 1)) +
theme_classic(base_size = 9) +
labs(title = "Jaccard similarity of suspect CNS gene co-expression",
subtitle = "High values = genes expressed in same cells",
x = NULL, y = NULL, fill = "Jaccard") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
Co-occurrence of suspect CNS genes across cells
# What fraction of cells expressing gene A also express gene B?
coexpr_stats <- lapply(genes_use, function(g1) {
lapply(genes_use, function(g2) {
if (g1 == g2) return(NULL)
cells_g1 <- expr_binary[[g1]]
pct_g2_given_g1 <- round(mean(expr_binary[[g2]][cells_g1]) * 100, 1)
data.frame(gene1 = g1, gene2 = g2,
n_expressing_g1 = sum(cells_g1),
pct_also_g2 = pct_g2_given_g1)
}) %>% bind_rows()
}) %>% bind_rows() %>% filter(!is.na(pct_also_g2))
# Show worst cases — high co-occurrence suggests true contaminating cells
coexpr_stats %>%
filter(n_expressing_g1 > 50) %>%
arrange(desc(pct_also_g2)) %>%
head(20) %>%
kable(caption = "Co-expression of suspect genes (cells expressing gene1 that also express gene2)")
| gene1 | gene2 | n_expressing_g1 | pct_also_g2 |
|---|---|---|---|
| Slc1a2 | Cpe | 194 | 58.8 |
| Sparcl1 | Cpe | 189 | 52.4 |
| Sparcl1 | Slc1a2 | 189 | 51.9 |
| Slc1a2 | Sparcl1 | 194 | 50.5 |
| Cpe | Slc1a2 | 229 | 49.8 |
| Sparcl1 | Ndrg2 | 189 | 48.7 |
| S100b | Ndrg2 | 79 | 48.1 |
| Atp1a2 | Slc1a2 | 92 | 47.8 |
| Cpe | Ndrg2 | 229 | 47.6 |
| Slc1a2 | Ndrg2 | 194 | 46.4 |
| Atp1a2 | Cpe | 92 | 45.7 |
| Ndrg2 | Cpe | 246 | 44.3 |
| Mobp | Ndrg2 | 83 | 43.4 |
| Mobp | Cpe | 83 | 43.4 |
| Cpe | Sparcl1 | 229 | 43.2 |
| S100b | Slc1a2 | 79 | 41.8 |
| Atp1a2 | Sparcl1 | 92 | 40.2 |
| S100b | Cpe | 79 | 39.2 |
| Mobp | Slc1a2 | 83 | 38.6 |
| Gfap | Cpe | 103 | 37.9 |
# Score all leukaemia cells for glial and neuronal signatures
leuk_CNS <- AddModuleScore(leuk_CNS,
features = list(
astrocyte_markers[
astrocyte_markers %in% rownames(leuk_CNS)]),
name = "astrocyte_score",
seed = 42)
leuk_CNS <- AddModuleScore(leuk_CNS,
features = list(
oligodendrocyte_markers[
oligodendrocyte_markers %in%
rownames(leuk_CNS)]),
name = "oligo_score",
seed = 42)
leuk_CNS <- AddModuleScore(leuk_CNS,
features = list(
neuronal_markers[
neuronal_markers %in% rownames(leuk_CNS)]),
name = "neuronal_score",
seed = 42)
cat("Module scores added to CNS object\n")
## Module scores added to CNS object
p_astro <- FeaturePlot(leuk_CNS, features = "astrocyte_score1",
cols = c("lightgrey", "firebrick"), order = TRUE,
min.cutoff = 0) + ggtitle("Astrocyte score")
p_oligo <- FeaturePlot(leuk_CNS, features = "oligo_score1",
cols = c("lightgrey", "darkorange"), order = TRUE,
min.cutoff = 0) + ggtitle("Oligodendrocyte score")
p_neuron <- FeaturePlot(leuk_CNS, features = "neuronal_score1",
cols = c("lightgrey", "purple"), order = TRUE,
min.cutoff = 0) + ggtitle("Neuronal score")
(p_astro + p_oligo + p_neuron)
Glial and neuronal contamination scores on CNS UMAP
v1 <- VlnPlot(leuk_CNS, features = "astrocyte_score1",
group.by = cns_res_col, pt.size = 0,
cols = viridis(length(unique(
leuk_CNS@meta.data[[cns_res_col]])))) +
ggtitle("Astrocyte score by cluster") +
geom_hline(yintercept = 0, linetype = "dashed") +
theme(legend.position = "none",
axis.text.x = element_text(angle = 45, hjust = 1))
v2 <- VlnPlot(leuk_CNS, features = "oligo_score1",
group.by = cns_res_col, pt.size = 0,
cols = viridis(length(unique(
leuk_CNS@meta.data[[cns_res_col]])))) +
ggtitle("Oligodendrocyte score by cluster") +
geom_hline(yintercept = 0, linetype = "dashed") +
theme(legend.position = "none",
axis.text.x = element_text(angle = 45, hjust = 1))
v3 <- VlnPlot(leuk_CNS, features = "neuronal_score1",
group.by = cns_res_col, pt.size = 0,
cols = viridis(length(unique(
leuk_CNS@meta.data[[cns_res_col]])))) +
ggtitle("Neuronal score by cluster") +
geom_hline(yintercept = 0, linetype = "dashed") +
theme(legend.position = "none",
axis.text.x = element_text(angle = 45, hjust = 1))
v1 / v2 / v3
Glial scores by CNS cluster
# Cluster-level contamination summary
leuk_CNS@meta.data %>%
group_by(cluster = .data[[cns_res_col]]) %>%
summarise(
n_cells = n(),
mean_astro = round(mean(astrocyte_score1), 4),
mean_oligo = round(mean(oligo_score1), 4),
mean_neuro = round(mean(neuronal_score1), 4),
pct_astro_pos = round(mean(astrocyte_score1 > 0.1) * 100, 1),
.groups = "drop"
) %>%
arrange(desc(mean_astro)) %>%
kable(caption = "Glial contamination scores by CNS cluster")
| cluster | n_cells | mean_astro | mean_oligo | mean_neuro | pct_astro_pos |
|---|---|---|---|---|---|
| 10 | 164 | 0.1545 | 0.1361 | 0.0234 | 61.6 |
| 11 | 39 | 0.1144 | 0.0898 | 0.0201 | 46.2 |
| 5 | 2995 | -0.0007 | -0.0133 | -0.0001 | 1.1 |
| 1 | 8659 | -0.0030 | -0.0107 | 0.0028 | 1.0 |
| 6 | 2275 | -0.0031 | -0.0131 | -0.0028 | 0.7 |
| 7 | 2001 | -0.0033 | -0.0112 | 0.0023 | 5.1 |
| 9 | 236 | -0.0033 | 0.0307 | -0.0008 | 1.3 |
| 2 | 8045 | -0.0044 | -0.0038 | -0.0039 | 1.0 |
| 3 | 6311 | -0.0049 | -0.0227 | 0.0059 | 0.6 |
| 4 | 4287 | -0.0057 | -0.0166 | -0.0008 | 0.7 |
| 0 | 11839 | -0.0062 | 0.0124 | 0.0061 | 2.0 |
| 8 | 369 | -0.0129 | -0.0459 | -0.0163 | 2.4 |
This is a critical sanity check. If BM leukaemia cells show no glial signature but CNS leukaemia cells do, the signal is contamination from the CNS dissociation. If both show similar signal, something else is happening.
# Add to BM for comparison
leuk_BM <- AddModuleScore(leuk_BM,
features = list(
astrocyte_markers[
astrocyte_markers %in% rownames(leuk_BM)]),
name = "astrocyte_score",
seed = 42)
# Direct comparison
astro_comp <- bind_rows(
data.frame(Tissue = "BM", score = leuk_BM$astrocyte_score1),
data.frame(Tissue = "CNS", score = leuk_CNS$astrocyte_score1)
)
wt_astro <- wilcox.test(leuk_BM$astrocyte_score1,
leuk_CNS$astrocyte_score1)
ggplot(astro_comp, aes(x = Tissue, y = score, fill = Tissue)) +
geom_violin(trim = FALSE, alpha = 0.8) +
geom_boxplot(width = 0.1, fill = "white", outlier.size = 0.2) +
scale_fill_manual(values = tissue_cols) +
geom_hline(yintercept = 0, linetype = "dashed", colour = "grey40") +
theme_classic(base_size = 12) +
labs(y = "Astrocyte module score",
title = paste0("Astrocyte contamination: BM vs CNS\n",
"Wilcoxon p = ", signif(wt_astro$p.value, 3))) +
theme(legend.position = "none")
Astrocyte score: BM vs CNS
Jaccard similarity analysis showed that suspect CNS marker genes were not co-expressed in the same cells (off-diagonal Jaccard < 0.1 for most pairs), consistent with diffuse ambient RNA rather than whole-cell contamination. However, cluster-level analysis revealed that astrocyte and oligodendrocyte module scores were markedly elevated in two clusters: cluster 10 (n=164 cells, mean astrocyte score 0.155, 61.6% positive) and cluster 1 (n=1,390 cells, mean astrocyte score 0.114, 46.2% positive). All other clusters showed near-zero or negative scores (mean -0.003 to -0.013). This suggests that while the majority of contamination is diffuse ambient RNA, clusters 10 and 1 warrant further investigation as they may represent either highly soup-contaminated droplets or a small population of contaminating CNS cells. GFP expression and leukaemia marker co-expression in these clusters were assessed to determine whether they should be excluded from analysis.
# 1. Look at cluster 10 and 1 closely — what do they look like?
DimPlot(leuk_CNS, reduction = "umap", label = TRUE) +
ggtitle("Are clusters 10 and 1 spatially isolated?")
# 2. Check GFP expression in contaminated clusters
VlnPlot(leuk_CNS,
features = "GFP", # or your GFP gene name
group.by = cns_res_col,
pt.size = 0) +
ggtitle("GFP by cluster — are clusters 10 and 1 GFP-low?")
# 3. Check leukaemia markers in these clusters
leuk_markers <- c("Meis1", "Runx1", "Cdk6", "Bcl2", "Mki67")
DotPlot(leuk_CNS,
features = c(leuk_markers,
"Slc1a2", "Sparcl1", "S100b"),
group.by = cns_res_col,
cols = c("lightgrey", "firebrick")) +
RotatedAxis() +
ggtitle("Leukaemia vs astrocyte markers by cluster")
# 4. Flag high contamination cells for sensitivity analysis
leuk_CNS$contam_cluster <- leuk_CNS@meta.data[[cns_res_col]] %in%
c("10", "11")
# 5. What fraction of each cluster's contamination is driven by
# the top suspect genes specifically
contam_genes <- c("Slc1a2", "Sparcl1", "S100b", "Atp1a2", "Mobp")
contam_expr <- GetAssayData(leuk_CNS, layer = "data")[
contam_genes[contam_genes %in% rownames(leuk_CNS)], ]
leuk_CNS$contam_gene_sum <- colSums(contam_expr)
VlnPlot(leuk_CNS,
features = "contam_gene_sum",
group.by = cns_res_col,
pt.size = 0) +
ggtitle("Sum of suspect gene expression by cluster")
## Contamination — Final Assessment and Action
Four converging lines of evidence identified clusters 10 (n=164) and 11 as contaminating CNS cells rather than leukaemia:
Clusters 10 and 11 were excluded from all subsequent analyses. All remaining clusters (0–9) showed near-zero suspect gene expression, confirming that the ambient RNA identified in the UMI inspection is diffusely distributed at negligible levels across bona fide leukaemia cells and does not confound the analysis. The CNS vs BM pseudobulk differential expression analysis was re-run on the cleaned dataset.
and BM leukaemia cells, independent of tissue dissociation artefacts.
cns_clean_path <- "/exports/eddie/scratch/aduguid3/harmony_clustering/04_harmony_recluster_leuk_CNS_clean_march26.qs"
if (!file.exists(cns_clean_path)) {
# Explicitly exclude clusters 10 and 11 — confirmed GFP-negative,
# leukaemia marker-negative, CNS marker-positive
clusters_to_keep <- setdiff(
unique(as.character(leuk_CNS@meta.data$originalexp_snn_res.0.3)),
c("10", "11")
)
leuk_CNS_clean <- subset(leuk_CNS,
subset = originalexp_snn_res.0.3 %in%
clusters_to_keep)
qs_save(leuk_CNS_clean, cns_clean_path)
} else {
leuk_CNS_clean <- qs_read(cns_clean_path)
}
cat("CNS cells before exclusion:", ncol(leuk_CNS), "\n")
## CNS cells before exclusion: 47220
cat("CNS cells after exclusion: ", ncol(leuk_CNS_clean), "\n")
## CNS cells after exclusion: 47017
cat("Cells removed: ",
ncol(leuk_CNS) - ncol(leuk_CNS_clean), "\n")
## Cells removed: 203
# Verify — clusters 10 and 11 should be absent
cat("\nClusters remaining:\n")
##
## Clusters remaining:
print(sort(as.integer(unique(
leuk_CNS_clean$originalexp_snn_res.0.3))))
## [1] 1 2 3 4 5 6 7 8 9 10
Comparison of DE results before and after cluster exclusion showed high overall concordance (Pearson r = 0.991), confirming that the removal of clusters 10 and 11 did not substantially alter the genuine CNS vs BM transcriptional differences. However, ambient RNA marker genes remained significantly CNS-enriched after cluster exclusion, with log2FC values reduced but still substantial (Sparcl1: 8.27 → 6.72; Slc1a2: 8.40 → 6.65; Apod: 6.32 → 6.25). This indicates that while clusters 10 and 11 contributed to the ambient signal, a residual ambient RNA component is present at low levels across all remaining CNS leukaemia clusters.
This residual signal reflects the sensitivity of pseudobulk aggregation at scale — with approximately 47,000 CNS leukaemia cells aggregated per sample, even fractional UMI counts per cell (mean < 0.02 for all suspect genes) accumulate to a statistically detectable pseudobulk signal. The ambient RNA is not biologically meaningful at the single-cell level but is amplified by the depth of aggregation.
Two complementary approaches were therefore applied:
1. DecontX correction
2. Ambient RNA blacklist — Genes with established CNS cell type identity and no plausible leukaemia cell expression (Slc1a2, Sparcl1, Apod, Cpe, Mobp, S100b, Atp1a2, Ndrg2, Gfap) were flagged in DE results regardless of statistical significance. These genes are excluded from pathway enrichment analysis and biological interpretation. This annotation is retained transparently in all results tables.
DecontX was applied to the cleaned CNS leukaemia object to estimate residual ambient RNA contamination. However, the algorithm returned implausible contamination estimates (mean 22.6%), inconsistent with the UMI-level inspection showing mean < 0.02 UMI per cell for all suspect genes. The model also failed to converge across 500 iterations, with the convergence metric diverging after iteration 310. This is consistent with the known limitation of DecontX on transcriptionally homogeneous datasets, where the absence of distinct cell type profiles prevents the mixture model from reliably distinguishing native from contaminant signal (Janssen et al., Genome Biology, 2023). DecontX correction was therefore not applied. Contamination was addressed by exclusion of GFP-negative clusters 10 and 11, with residual ambient genes flagged in DE results based on UMI-level evidence.
sessionInfo()
## R version 4.4.1 (2024-06-14)
## Platform: x86_64-pc-linux-gnu
## Running under: Rocky Linux 9.5 (Blue Onyx)
##
## Matrix products: default
## BLAS/LAPACK: /opt/intel/oneapi/mkl/2024.0/lib/libmkl_gf_lp64.so.2; LAPACK version 3.10.1
##
## locale:
## [1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_GB.UTF-8
## [5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
## [7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
##
## time zone: Europe/London
## tzcode source: system (glibc)
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] celda_1.22.1 Matrix_1.7-0
## [3] SingleCellExperiment_1.28.1 DESeq2_1.46.0
## [5] SummarizedExperiment_1.36.0 Biobase_2.66.0
## [7] MatrixGenerics_1.18.1 matrixStats_1.5.0
## [9] GenomicRanges_1.58.0 GenomeInfoDb_1.42.3
## [11] IRanges_2.40.1 S4Vectors_0.44.0
## [13] BiocGenerics_0.52.0 ggrepel_0.9.6
## [15] tibble_3.3.1 tidyr_1.3.1
## [17] viridis_0.6.5 viridisLite_0.4.2
## [19] qs2_0.1.7 knitr_1.51
## [21] dplyr_1.1.4 patchwork_1.3.2
## [23] ggplot2_4.0.2 Seurat_5.4.0
## [25] SeuratObject_5.3.0 sp_2.1-4
##
## loaded via a namespace (and not attached):
## [1] RcppAnnoy_0.0.23 splines_4.4.1 later_1.4.6
## [4] polyclip_1.10-7 pROC_1.19.0.1 fastDummies_1.7.5
## [7] lifecycle_1.0.5 doParallel_1.0.17 globals_0.16.3
## [10] lattice_0.22-6 MASS_7.3-60.2 dendextend_1.19.1
## [13] magrittr_2.0.3 plotly_4.12.0 sass_0.4.9
## [16] rmarkdown_2.30 jquerylib_0.1.4 yaml_2.3.12
## [19] httpuv_1.6.16 otel_0.2.0 sctransform_0.4.3
## [22] spam_2.11-3 spatstat.sparse_3.1-0 reticulate_1.45.0
## [25] cowplot_1.2.0 pbapply_1.7-4 RColorBrewer_1.1-3
## [28] abind_1.4-5 zlibbioc_1.52.0 Rtsne_0.17
## [31] purrr_1.0.2 WriteXLS_6.8.0 GenomeInfoDbData_1.2.13
## [34] irlba_2.3.7 listenv_0.9.1 spatstat.utils_3.2-1
## [37] goftest_1.2-3 RSpectra_0.16-2 spatstat.random_3.4-4
## [40] fitdistrplus_1.2-6 parallelly_1.37.1 codetools_0.2-20
## [43] DelayedArray_0.32.0 tidyselect_1.2.1 UCSC.utils_1.2.0
## [46] farver_2.1.2 spatstat.explore_3.7-0 jsonlite_2.0.0
## [49] progressr_0.18.0 ggridges_0.5.6 survival_3.6-4
## [52] iterators_1.0.14 foreach_1.5.2 dbscan_1.2.4
## [55] tools_4.4.1 ica_1.0-3 Rcpp_1.0.12
## [58] glue_1.8.0 gridExtra_2.3 SparseArray_1.6.2
## [61] xfun_0.56 withr_3.0.2 combinat_0.0-8
## [64] fastmap_1.2.0 MCMCprecision_0.4.2 fansi_1.0.6
## [67] digest_0.6.35 R6_2.6.1 mime_0.12
## [70] scattermore_1.2 tensor_1.5.1 dichromat_2.0-0.1
## [73] spatstat.data_3.1-9 utf8_1.2.4 generics_0.1.3
## [76] data.table_1.15.4 httr_1.4.7 htmlwidgets_1.6.4
## [79] S4Arrays_1.6.0 uwot_0.2.4 pkgconfig_2.0.3
## [82] gtable_0.3.6 lmtest_0.9-40 S7_0.2.1
## [85] XVector_0.46.0 htmltools_0.5.8.1 dotCall64_1.2
## [88] scales_1.4.0 png_0.1-8 spatstat.univar_3.1-6
## [91] enrichR_3.4 ggdendro_0.2.0 rstudioapi_0.16.0
## [94] reshape2_1.4.4 rjson_0.2.23 nlme_3.1-164
## [97] curl_5.2.1 cachem_1.1.0 zoo_1.8-15
## [100] stringr_1.5.1 KernSmooth_2.23-24 vipor_0.4.7
## [103] parallel_4.4.1 miniUI_0.1.2 ggrastr_1.0.2
## [106] pillar_1.9.0 grid_4.4.1 vctrs_0.6.5
## [109] RANN_2.6.2 promises_1.5.0 stringfish_0.18.0
## [112] xtable_1.8-4 cluster_2.1.6 beeswarm_0.4.0
## [115] evaluate_1.0.5 cli_3.6.5 locfit_1.5-9.12
## [118] compiler_4.4.1 rlang_1.1.7 crayon_1.5.2
## [121] future.apply_1.11.2 labeling_0.4.3 ggbeeswarm_0.7.3
## [124] plyr_1.8.9 stringi_1.8.4 deldir_2.0-4
## [127] BiocParallel_1.40.2 lazyeval_0.2.2 spatstat.geom_3.7-0
## [130] RcppHNSW_0.6.0 RcppEigen_0.3.4.0.0 future_1.33.2
## [133] shiny_1.12.1 ROCR_1.0-12 igraph_2.2.2
## [136] RcppParallel_5.1.8 bslib_0.7.0