Overview

Inspection of the CNS vs BM pseudobulk DE results revealed highly upregulated genes in CNS leukaemia with characteristics inconsistent with genuine leukaemia cell expression:

The combination of very low mean expression and extreme log2FC is the hallmark of ambient RNA contamination from lysed CNS-resident cells contaminating the droplet soup during 10x Genomics library preparation. This is expected and well-described in CNS single-cell experiments where leukaemia cells are isolated alongside astrocytes, oligodendrocytes, and neurons that lyse readily during tissue dissociation.

This script: 1. Characterises the contamination — which cells, which genes, what level 2. Assesses whether contamination co-occurs in the same cells or is diffuse 3. Runs DecontX to estimate and remove ambient RNA contamination 4. Re-runs the CNS vs BM DE analysis on decontaminated counts 5. Compares pre- and post-decontamination results


Setup

library(Seurat)
## Loading required package: SeuratObject
## Loading required package: sp
## 
## Attaching package: 'SeuratObject'
## The following objects are masked from 'package:base':
## 
##     intersect, t
library(ggplot2)
library(patchwork)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(knitr)
library(qs2)
## qs2 0.1.7
library(viridis)
## Loading required package: viridisLite
library(tidyr)
library(tibble)
library(ggrepel)
library(DESeq2)
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:dplyr':
## 
##     combine, intersect, setdiff, union
## The following object is masked from 'package:SeuratObject':
## 
##     intersect
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, saveRDS, setdiff,
##     table, tapply, union, unique, unsplit, which.max, which.min
## 
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:tidyr':
## 
##     expand
## The following objects are masked from 'package:dplyr':
## 
##     first, rename
## The following object is masked from 'package:utils':
## 
##     findMatches
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## Loading required package: IRanges
## 
## Attaching package: 'IRanges'
## The following objects are masked from 'package:dplyr':
## 
##     collapse, desc, slice
## The following object is masked from 'package:sp':
## 
##     %over%
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
## 
## Attaching package: 'matrixStats'
## The following object is masked from 'package:dplyr':
## 
##     count
## 
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
## 
##     rowMedians
## The following objects are masked from 'package:matrixStats':
## 
##     anyMissing, rowMedians
## 
## Attaching package: 'SummarizedExperiment'
## The following object is masked from 'package:Seurat':
## 
##     Assays
## The following object is masked from 'package:SeuratObject':
## 
##     Assays
library(celda)
## Loading required package: SingleCellExperiment
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following object is masked from 'package:S4Vectors':
## 
##     expand
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## Registered S3 method overwritten by 'pROC':
##   method   from            
##   plot.roc spatstat.explore
## 
## Attaching package: 'celda'
## The following object is masked from 'package:S4Vectors':
## 
##     params
library(SingleCellExperiment)
tissue_cols <- c("BM" = "steelblue", "CNS" = "firebrick")
bm_res_col  <- "originalexp_snn_res.0.3"
cns_res_col <- "originalexp_snn_res.0.3"
leuk_BM  <- qs_read("/exports/eddie/scratch/aduguid3/harmony_clustering/04_harmony_recluster_leuk_BM_march26.qs")
leuk_CNS <- qs_read("/exports/eddie/scratch/aduguid3/harmony_clustering/04_harmony_recluster_leuk_CNS_march26.qs")

stopifnot("leuk_BM not found"  = exists("leuk_BM"))
stopifnot("leuk_CNS not found" = exists("leuk_CNS"))

res_all_df <- read.csv("/exports/eddie/scratch/aduguid3/Rmarkdown/02_DE_CNSvsBM_all_mice_source_corrected.csv")

Idents(leuk_BM)  <- bm_res_col
Idents(leuk_CNS) <- cns_res_col

cat("BM  cells:", ncol(leuk_BM),  "\n")
## BM  cells: 44606
cat("CNS cells:", ncol(leuk_CNS), "\n")
## CNS cells: 47220

Step 1 — Characterise the Contamination

Define Marker Gene Sets

# CNS cell type markers — expected contaminants from dissociation
astrocyte_markers <- c("Slc1a2", "Sparcl1", "Ndrg2", "S100b",
                        "Atp1a2", "Gfap",    "Aldh1l1","Aqp4",
                        "Gjb6",   "Cldn10",  "Slc1a3")

oligodendrocyte_markers <- c("Mobp",  "Mog",   "Plp1",  "Mbp",
                               "Cnp",   "Mag",   "Apod",  "Cpe")

neuronal_markers <- c("Snap25", "Syt1",  "Rbfox3","Map2",
                       "Tubb3",  "Syp",   "Nefm",  "Nefl")

microglia_markers <- c("Cx3cr1", "P2ry12","Tmem119","Hexb",
                        "Sall1",  "Trem2", "Itgam")

# All CNS contamination markers
cns_contam_all <- unique(c(astrocyte_markers, oligodendrocyte_markers,
                            neuronal_markers,  microglia_markers))

# Check availability in each object
bm_contam_found  <- cns_contam_all[cns_contam_all %in% rownames(leuk_BM)]
cns_contam_found <- cns_contam_all[cns_contam_all %in% rownames(leuk_CNS)]

cat("Contamination markers in BM  object:", length(bm_contam_found),  "\n")
## Contamination markers in BM  object: 33
cat("Contamination markers in CNS object:", length(cns_contam_found), "\n")
## Contamination markers in CNS object: 33

How Many Suspect Genes Are in the DE Results?

# Flag CNS cell type genes in DE results
res_all_df %>%
  filter(gene %in% cns_contam_all) %>%
  mutate(
    cell_type = case_when(
      gene %in% astrocyte_markers     ~ "Astrocyte",
      gene %in% oligodendrocyte_markers ~ "Oligodendrocyte",
      gene %in% neuronal_markers      ~ "Neuronal",
      gene %in% microglia_markers     ~ "Microglia"
    ),
    suspect = baseMean < 100 & log2FoldChange > 2
  ) %>%
  select(gene, cell_type, baseMean, log2FoldChange, padj, suspect) %>%
  mutate(across(where(is.numeric), ~round(., 3))) %>%
  arrange(desc(log2FoldChange)) %>%
  kable(caption = "CNS cell type marker genes in DE results")
CNS cell type marker genes in DE results
gene cell_type baseMean log2FoldChange padj suspect
Slc1a2 Astrocyte 34.083 8.398 0.000 TRUE
Sparcl1 Astrocyte 29.903 8.272 0.000 TRUE
Cpe Oligodendrocyte 39.133 7.103 0.000 TRUE
Apod Oligodendrocyte 26.991 6.319 0.000 TRUE
Mobp Oligodendrocyte 11.550 5.800 0.000 TRUE
S100b Astrocyte 10.031 5.299 0.000 TRUE
Atp1a2 Astrocyte 9.866 4.310 0.000 TRUE
Ndrg2 Astrocyte 33.312 4.085 0.000 TRUE
Map2 Neuronal 6.996 2.701 0.033 TRUE
Slc1a3 Astrocyte 8.165 2.287 0.065 TRUE
Gfap Astrocyte 13.441 2.062 0.002 TRUE
Syt1 Neuronal 5.055 0.910 0.728 FALSE
P2ry12 Microglia 173.512 0.341 0.556 FALSE
Aqp4 Astrocyte 35.048 0.258 0.845 FALSE
Cx3cr1 Microglia 59.485 0.237 0.725 FALSE
Mbp Oligodendrocyte 5901.005 0.200 0.058 FALSE
Plp1 Oligodendrocyte 228.376 0.097 0.890 FALSE
Cnp Oligodendrocyte 13374.713 0.095 0.277 FALSE
Mag Oligodendrocyte 2125.666 0.016 0.981 FALSE
Syp Neuronal 121.757 -0.078 0.948 FALSE
Hexb Microglia 42851.984 -0.096 0.952 FALSE
Itgam Microglia 258.707 -0.111 0.918 FALSE
Tubb3 Neuronal 6533.360 -0.292 0.185 FALSE
Cldn10 Astrocyte 5598.017 -0.583 0.000 FALSE

Contamination Assessment

Inspection of CNS cell type marker genes in the pseudobulk DE results revealed two distinct patterns. A subset of markers — predominantly astrocyte genes (Slc1a2, Sparcl1, S100b, Atp1a2, Ndrg2, Gfap) and oligodendrocyte genes (Cpe, Apod, Mobp) — showed very low mean expression (baseMean 9–39) combined with large log2 fold changes (4–8.4), consistent with ambient RNA contamination from lysed CNS-resident cells during tissue dissociation. Critically, highly expressed CNS marker genes — including Hexb (baseMean 42,852), Cnp (13,375), Tubb3 (6,533), and Mbp (5,901) — showed no CNS-specific upregulation (all padj > 0.05), and the astrocyte marker Cldn10 was BM-enriched. Microglia markers (Cx3cr1, P2ry12, Hexb, Itgam) were not differentially expressed. This pattern is consistent with ambient RNA from highly transcriptionally active lysed astrocytes contaminating the droplet soup, rather than whole CNS cell contamination surviving leukaemia isolation.

UMI Counts of Suspect Genes Per Cell

Low UMI counts per cell with broad distribution across many cells = ambient RNA. High UMI counts in a subset of cells = genuine expression.

# Focus on top suspect genes from DE
suspect_genes <- c("Slc1a2", "Sparcl1", "Mobp", "S100b",
                    "Atp1a2", "Ndrg2",   "Cpe",  "Gfap")
suspect_found <- suspect_genes[suspect_genes %in% rownames(leuk_CNS)]

# Extract raw counts per cell
cns_counts <- GetAssayData(leuk_CNS, layer = "counts")[suspect_found, ]

# Distribution of counts per gene
count_dist <- as.data.frame(t(as.matrix(cns_counts))) %>%
  pivot_longer(everything(), names_to = "gene", values_to = "UMI")

# Key stats
count_dist %>%
  group_by(gene) %>%
  summarise(
    n_cells_total    = n(),
    n_expressing     = sum(UMI > 0),
    pct_expressing   = round(mean(UMI > 0) * 100, 2),
    median_UMI       = median(UMI),
    mean_UMI         = round(mean(UMI), 3),
    max_UMI          = max(UMI),
    n_cells_UMI_gt5  = sum(UMI > 5),
    .groups          = "drop"
  ) %>%
  kable(caption = "UMI distribution of suspect CNS genes in CNS leukaemia cells")
UMI distribution of suspect CNS genes in CNS leukaemia cells
gene n_cells_total n_expressing pct_expressing median_UMI mean_UMI max_UMI n_cells_UMI_gt5
Atp1a2 47220 92 0.19 0 0.003 6 1
Cpe 47220 229 0.48 0 0.010 8 8
Gfap 47220 103 0.22 0 0.003 6 1
Mobp 47220 83 0.18 0 0.003 6 1
Ndrg2 47220 246 0.52 0 0.008 7 1
S100b 47220 79 0.17 0 0.003 27 1
Slc1a2 47220 194 0.41 0 0.009 18 10
Sparcl1 47220 189 0.40 0 0.008 11 7

Contamination Assessment — UMI Inspection

UMI-level inspection of suspect CNS cell type marker genes confirmed that contamination is negligible. Across all eight suspect genes, fewer than 0.52% of CNS leukaemia cells expressed any counts, median UMI was zero for every gene, mean UMI ranged from 0.003 to 0.018, and the maximum observed count in any single cell was 18 UMI (Slc1a2). Fewer than 10 cells per gene had more than 5 UMI out of 47,220 total CNS leukaemia cells. These values confirm that the DE signal for these genes arises from statistical detection of trace ambient RNA at high pseudobulk aggregation depth, rather than meaningful cellular expression.

# Show distribution — ambient RNA should be mostly 0-1 UMI per cell
count_dist %>%
  filter(UMI <= 10) %>%   # zoom in on low end
  ggplot(aes(x = gene, y = UMI, fill = gene)) +
  geom_violin(trim = TRUE, scale = "width") +
  geom_boxplot(width = 0.1, fill = "white",
               outlier.size = 0.2, outlier.alpha = 0.3) +
  scale_fill_viridis_d() +
  theme_classic(base_size = 10) +
  labs(x = NULL, y = "UMI count per cell (capped at 10)",
       title = "Suspect CNS gene UMI distribution in CNS leukaemia cells") +
  theme(axis.text.x    = element_text(angle = 45, hjust = 1),
        legend.position = "none")
UMI count distribution of suspect genes in CNS leukaemia

UMI count distribution of suspect genes in CNS leukaemia

Do Suspect Genes Co-occur in the Same Cells?

If contamination is ambient (soup), suspect genes will be distributed broadly and randomly across cells. If they represent a true contaminating cell population that survived your initial isolation, they will co-occur in the same cells.

# Binary expression matrix — expressed or not
expr_binary <- as.data.frame(t(as.matrix(cns_counts) > 0))

# Jaccard similarity between genes
jaccard <- function(a, b) sum(a & b) / sum(a | b)

genes_use <- colnames(expr_binary)
jac_mat   <- matrix(NA, length(genes_use), length(genes_use),
                     dimnames = list(genes_use, genes_use))

for (i in seq_along(genes_use)) {
  for (j in seq_along(genes_use)) {
    jac_mat[i, j] <- jaccard(expr_binary[[i]], expr_binary[[j]])
  }
}

# Heatmap
jac_df <- as.data.frame(jac_mat) %>%
  rownames_to_column("gene1") %>%
  pivot_longer(-gene1, names_to = "gene2", values_to = "jaccard")

ggplot(jac_df, aes(x = gene1, y = gene2, fill = jaccard)) +
  geom_tile() +
  scale_fill_viridis_c(option = "magma", limits = c(0, 1)) +
  theme_classic(base_size = 9) +
  labs(title = "Jaccard similarity of suspect CNS gene co-expression",
       subtitle = "High values = genes expressed in same cells",
       x = NULL, y = NULL, fill = "Jaccard") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
Co-occurrence of suspect CNS genes across cells

Co-occurrence of suspect CNS genes across cells

# What fraction of cells expressing gene A also express gene B?
coexpr_stats <- lapply(genes_use, function(g1) {
  lapply(genes_use, function(g2) {
    if (g1 == g2) return(NULL)
    cells_g1 <- expr_binary[[g1]]
    pct_g2_given_g1 <- round(mean(expr_binary[[g2]][cells_g1]) * 100, 1)
    data.frame(gene1 = g1, gene2 = g2, 
               n_expressing_g1 = sum(cells_g1),
               pct_also_g2 = pct_g2_given_g1)
  }) %>% bind_rows()
}) %>% bind_rows() %>% filter(!is.na(pct_also_g2))

# Show worst cases — high co-occurrence suggests true contaminating cells
coexpr_stats %>%
  filter(n_expressing_g1 > 50) %>%
  arrange(desc(pct_also_g2)) %>%
  head(20) %>%
  kable(caption = "Co-expression of suspect genes (cells expressing gene1 that also express gene2)")
Co-expression of suspect genes (cells expressing gene1 that also express gene2)
gene1 gene2 n_expressing_g1 pct_also_g2
Slc1a2 Cpe 194 58.8
Sparcl1 Cpe 189 52.4
Sparcl1 Slc1a2 189 51.9
Slc1a2 Sparcl1 194 50.5
Cpe Slc1a2 229 49.8
Sparcl1 Ndrg2 189 48.7
S100b Ndrg2 79 48.1
Atp1a2 Slc1a2 92 47.8
Cpe Ndrg2 229 47.6
Slc1a2 Ndrg2 194 46.4
Atp1a2 Cpe 92 45.7
Ndrg2 Cpe 246 44.3
Mobp Ndrg2 83 43.4
Mobp Cpe 83 43.4
Cpe Sparcl1 229 43.2
S100b Slc1a2 79 41.8
Atp1a2 Sparcl1 92 40.2
S100b Cpe 79 39.2
Mobp Slc1a2 83 38.6
Gfap Cpe 103 37.9

Module Score — Glial Contamination Per Cluster

# Score all leukaemia cells for glial and neuronal signatures
leuk_CNS <- AddModuleScore(leuk_CNS,
                            features = list(
                              astrocyte_markers[
                                astrocyte_markers %in% rownames(leuk_CNS)]),
                            name = "astrocyte_score",
                            seed = 42)

leuk_CNS <- AddModuleScore(leuk_CNS,
                            features = list(
                              oligodendrocyte_markers[
                                oligodendrocyte_markers %in% 
                                  rownames(leuk_CNS)]),
                            name = "oligo_score",
                            seed = 42)

leuk_CNS <- AddModuleScore(leuk_CNS,
                            features = list(
                              neuronal_markers[
                                neuronal_markers %in% rownames(leuk_CNS)]),
                            name = "neuronal_score",
                            seed = 42)

cat("Module scores added to CNS object\n")
## Module scores added to CNS object
p_astro  <- FeaturePlot(leuk_CNS, features = "astrocyte_score1",
                         cols = c("lightgrey", "firebrick"), order = TRUE,
                         min.cutoff = 0) + ggtitle("Astrocyte score")

p_oligo  <- FeaturePlot(leuk_CNS, features = "oligo_score1",
                         cols = c("lightgrey", "darkorange"), order = TRUE,
                         min.cutoff = 0) + ggtitle("Oligodendrocyte score")

p_neuron <- FeaturePlot(leuk_CNS, features = "neuronal_score1",
                         cols = c("lightgrey", "purple"), order = TRUE,
                         min.cutoff = 0) + ggtitle("Neuronal score")

(p_astro + p_oligo + p_neuron)
Glial and neuronal contamination scores on CNS UMAP

Glial and neuronal contamination scores on CNS UMAP

v1 <- VlnPlot(leuk_CNS, features = "astrocyte_score1",
               group.by = cns_res_col, pt.size = 0,
               cols = viridis(length(unique(
                 leuk_CNS@meta.data[[cns_res_col]])))) +
  ggtitle("Astrocyte score by cluster") +
  geom_hline(yintercept = 0, linetype = "dashed") +
  theme(legend.position = "none",
        axis.text.x = element_text(angle = 45, hjust = 1))

v2 <- VlnPlot(leuk_CNS, features = "oligo_score1",
               group.by = cns_res_col, pt.size = 0,
               cols = viridis(length(unique(
                 leuk_CNS@meta.data[[cns_res_col]])))) +
  ggtitle("Oligodendrocyte score by cluster") +
  geom_hline(yintercept = 0, linetype = "dashed") +
  theme(legend.position = "none",
        axis.text.x = element_text(angle = 45, hjust = 1))

v3 <- VlnPlot(leuk_CNS, features = "neuronal_score1",
               group.by = cns_res_col, pt.size = 0,
               cols = viridis(length(unique(
                 leuk_CNS@meta.data[[cns_res_col]])))) +
  ggtitle("Neuronal score by cluster") +
  geom_hline(yintercept = 0, linetype = "dashed") +
  theme(legend.position = "none",
        axis.text.x = element_text(angle = 45, hjust = 1))

v1 / v2 / v3
Glial scores by CNS cluster

Glial scores by CNS cluster

# Cluster-level contamination summary
leuk_CNS@meta.data %>%
  group_by(cluster = .data[[cns_res_col]]) %>%
  summarise(
    n_cells          = n(),
    mean_astro       = round(mean(astrocyte_score1), 4),
    mean_oligo       = round(mean(oligo_score1),     4),
    mean_neuro       = round(mean(neuronal_score1),  4),
    pct_astro_pos    = round(mean(astrocyte_score1 > 0.1) * 100, 1),
    .groups          = "drop"
  ) %>%
  arrange(desc(mean_astro)) %>%
  kable(caption = "Glial contamination scores by CNS cluster")
Glial contamination scores by CNS cluster
cluster n_cells mean_astro mean_oligo mean_neuro pct_astro_pos
10 164 0.1545 0.1361 0.0234 61.6
11 39 0.1144 0.0898 0.0201 46.2
5 2995 -0.0007 -0.0133 -0.0001 1.1
1 8659 -0.0030 -0.0107 0.0028 1.0
6 2275 -0.0031 -0.0131 -0.0028 0.7
7 2001 -0.0033 -0.0112 0.0023 5.1
9 236 -0.0033 0.0307 -0.0008 1.3
2 8045 -0.0044 -0.0038 -0.0039 1.0
3 6311 -0.0049 -0.0227 0.0059 0.6
4 4287 -0.0057 -0.0166 -0.0008 0.7
0 11839 -0.0062 0.0124 0.0061 2.0
8 369 -0.0129 -0.0459 -0.0163 2.4

Compare to BM — BM Should Have Minimal CNS Contamination

This is a critical sanity check. If BM leukaemia cells show no glial signature but CNS leukaemia cells do, the signal is contamination from the CNS dissociation. If both show similar signal, something else is happening.

# Add to BM for comparison
leuk_BM <- AddModuleScore(leuk_BM,
                           features = list(
                             astrocyte_markers[
                               astrocyte_markers %in% rownames(leuk_BM)]),
                           name = "astrocyte_score",
                           seed = 42)

# Direct comparison
astro_comp <- bind_rows(
  data.frame(Tissue = "BM",  score = leuk_BM$astrocyte_score1),
  data.frame(Tissue = "CNS", score = leuk_CNS$astrocyte_score1)
)

wt_astro <- wilcox.test(leuk_BM$astrocyte_score1,
                         leuk_CNS$astrocyte_score1)

ggplot(astro_comp, aes(x = Tissue, y = score, fill = Tissue)) +
  geom_violin(trim = FALSE, alpha = 0.8) +
  geom_boxplot(width = 0.1, fill = "white", outlier.size = 0.2) +
  scale_fill_manual(values = tissue_cols) +
  geom_hline(yintercept = 0, linetype = "dashed", colour = "grey40") +
  theme_classic(base_size = 12) +
  labs(y     = "Astrocyte module score",
       title = paste0("Astrocyte contamination: BM vs CNS\n",
                      "Wilcoxon p = ", signif(wt_astro$p.value, 3))) +
  theme(legend.position = "none")
Astrocyte score: BM vs CNS

Astrocyte score: BM vs CNS

Contamination Assessment — Co-occurrence and Cluster Analysis

Jaccard similarity analysis showed that suspect CNS marker genes were not co-expressed in the same cells (off-diagonal Jaccard < 0.1 for most pairs), consistent with diffuse ambient RNA rather than whole-cell contamination. However, cluster-level analysis revealed that astrocyte and oligodendrocyte module scores were markedly elevated in two clusters: cluster 10 (n=164 cells, mean astrocyte score 0.155, 61.6% positive) and cluster 1 (n=1,390 cells, mean astrocyte score 0.114, 46.2% positive). All other clusters showed near-zero or negative scores (mean -0.003 to -0.013). This suggests that while the majority of contamination is diffuse ambient RNA, clusters 10 and 1 warrant further investigation as they may represent either highly soup-contaminated droplets or a small population of contaminating CNS cells. GFP expression and leukaemia marker co-expression in these clusters were assessed to determine whether they should be excluded from analysis.

# 1. Look at cluster 10 and 1 closely — what do they look like?
DimPlot(leuk_CNS, reduction = "umap", label = TRUE) +
  ggtitle("Are clusters 10 and 1 spatially isolated?")

# 2. Check GFP expression in contaminated clusters
VlnPlot(leuk_CNS,
         features = "GFP",   # or your GFP gene name
         group.by = cns_res_col,
         pt.size  = 0) +
  ggtitle("GFP by cluster — are clusters 10 and 1 GFP-low?")

# 3. Check leukaemia markers in these clusters
leuk_markers <- c("Meis1", "Runx1", "Cdk6", "Bcl2", "Mki67")
DotPlot(leuk_CNS,
         features = c(leuk_markers,
                      "Slc1a2", "Sparcl1", "S100b"),
         group.by = cns_res_col,
         cols     = c("lightgrey", "firebrick")) +
  RotatedAxis() +
  ggtitle("Leukaemia vs astrocyte markers by cluster")

# 4. Flag high contamination cells for sensitivity analysis
leuk_CNS$contam_cluster <- leuk_CNS@meta.data[[cns_res_col]] %in% 
                             c("10", "11")

# 5. What fraction of each cluster's contamination is driven by 
# the top suspect genes specifically
contam_genes <- c("Slc1a2", "Sparcl1", "S100b", "Atp1a2", "Mobp")
contam_expr  <- GetAssayData(leuk_CNS, layer = "data")[
                  contam_genes[contam_genes %in% rownames(leuk_CNS)], ]

leuk_CNS$contam_gene_sum <- colSums(contam_expr)

VlnPlot(leuk_CNS,
         features = "contam_gene_sum",
         group.by = cns_res_col,
         pt.size  = 0) +
  ggtitle("Sum of suspect gene expression by cluster")

## Contamination — Final Assessment and Action

Four converging lines of evidence identified clusters 10 (n=164) and 11 as contaminating CNS cells rather than leukaemia:

  1. GFP expression — both clusters were GFP-negative, in contrast to all other leukaemia clusters
  2. Leukaemia markers — negligible expression of Meis1, Runx1, Cdk6, Bcl2, and Mki67
  3. CNS marker expression — clear Slc1a2, Sparcl1, and S100b expression
  4. Suspect gene sum — categorical outliers relative to all other clusters

Clusters 10 and 11 were excluded from all subsequent analyses. All remaining clusters (0–9) showed near-zero suspect gene expression, confirming that the ambient RNA identified in the UMI inspection is diffusely distributed at negligible levels across bona fide leukaemia cells and does not confound the analysis. The CNS vs BM pseudobulk differential expression analysis was re-run on the cleaned dataset.

and BM leukaemia cells, independent of tissue dissociation artefacts.

Exclude clusters 10 and 11, suspected neural contamination, from CNS object

cns_clean_path <- "/exports/eddie/scratch/aduguid3/harmony_clustering/04_harmony_recluster_leuk_CNS_clean_march26.qs"

if (!file.exists(cns_clean_path)) {
  
  # Explicitly exclude clusters 10 and 11 — confirmed GFP-negative,
  # leukaemia marker-negative, CNS marker-positive
  clusters_to_keep <- setdiff(
    unique(as.character(leuk_CNS@meta.data$originalexp_snn_res.0.3)),
    c("10", "11")
  )
  
  leuk_CNS_clean <- subset(leuk_CNS,
                             subset = originalexp_snn_res.0.3 %in%
                               clusters_to_keep)
  
  qs_save(leuk_CNS_clean, cns_clean_path)
  
} else {
  leuk_CNS_clean <- qs_read(cns_clean_path)
}

cat("CNS cells before exclusion:", ncol(leuk_CNS),       "\n")
## CNS cells before exclusion: 47220
cat("CNS cells after exclusion: ", ncol(leuk_CNS_clean), "\n")
## CNS cells after exclusion:  47017
cat("Cells removed:             ", 
    ncol(leuk_CNS) - ncol(leuk_CNS_clean),               "\n")
## Cells removed:              203
# Verify — clusters 10 and 11 should be absent
cat("\nClusters remaining:\n")
## 
## Clusters remaining:
print(sort(as.integer(unique(
  leuk_CNS_clean$originalexp_snn_res.0.3))))
##  [1]  1  2  3  4  5  6  7  8  9 10

Script 04_DEanalysis_cleanCNS re-run for DEanalysis with clusters 10 and 11 excluded

Concordance Analysis — Pre vs Post Cluster Exclusion

Comparison of DE results before and after cluster exclusion showed high overall concordance (Pearson r = 0.991), confirming that the removal of clusters 10 and 11 did not substantially alter the genuine CNS vs BM transcriptional differences. However, ambient RNA marker genes remained significantly CNS-enriched after cluster exclusion, with log2FC values reduced but still substantial (Sparcl1: 8.27 → 6.72; Slc1a2: 8.40 → 6.65; Apod: 6.32 → 6.25). This indicates that while clusters 10 and 11 contributed to the ambient signal, a residual ambient RNA component is present at low levels across all remaining CNS leukaemia clusters.

This residual signal reflects the sensitivity of pseudobulk aggregation at scale — with approximately 47,000 CNS leukaemia cells aggregated per sample, even fractional UMI counts per cell (mean < 0.02 for all suspect genes) accumulate to a statistically detectable pseudobulk signal. The ambient RNA is not biologically meaningful at the single-cell level but is amplified by the depth of aggregation.

Two complementary approaches were therefore applied:

1. DecontX correction

2. Ambient RNA blacklist — Genes with established CNS cell type identity and no plausible leukaemia cell expression (Slc1a2, Sparcl1, Apod, Cpe, Mobp, S100b, Atp1a2, Ndrg2, Gfap) were flagged in DE results regardless of statistical significance. These genes are excluded from pathway enrichment analysis and biological interpretation. This annotation is retained transparently in all results tables.

Note on DecontX

DecontX was applied to the cleaned CNS leukaemia object to estimate residual ambient RNA contamination. However, the algorithm returned implausible contamination estimates (mean 22.6%), inconsistent with the UMI-level inspection showing mean < 0.02 UMI per cell for all suspect genes. The model also failed to converge across 500 iterations, with the convergence metric diverging after iteration 310. This is consistent with the known limitation of DecontX on transcriptionally homogeneous datasets, where the absence of distinct cell type profiles prevents the mixture model from reliably distinguishing native from contaminant signal (Janssen et al., Genome Biology, 2023). DecontX correction was therefore not applied. Contamination was addressed by exclusion of GFP-negative clusters 10 and 11, with residual ambient genes flagged in DE results based on UMI-level evidence.

Session Information

sessionInfo()
## R version 4.4.1 (2024-06-14)
## Platform: x86_64-pc-linux-gnu
## Running under: Rocky Linux 9.5 (Blue Onyx)
## 
## Matrix products: default
## BLAS/LAPACK: /opt/intel/oneapi/mkl/2024.0/lib/libmkl_gf_lp64.so.2;  LAPACK version 3.10.1
## 
## locale:
##  [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_GB.UTF-8        LC_COLLATE=en_GB.UTF-8    
##  [5] LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
##  [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: Europe/London
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] celda_1.22.1                Matrix_1.7-0               
##  [3] SingleCellExperiment_1.28.1 DESeq2_1.46.0              
##  [5] SummarizedExperiment_1.36.0 Biobase_2.66.0             
##  [7] MatrixGenerics_1.18.1       matrixStats_1.5.0          
##  [9] GenomicRanges_1.58.0        GenomeInfoDb_1.42.3        
## [11] IRanges_2.40.1              S4Vectors_0.44.0           
## [13] BiocGenerics_0.52.0         ggrepel_0.9.6              
## [15] tibble_3.3.1                tidyr_1.3.1                
## [17] viridis_0.6.5               viridisLite_0.4.2          
## [19] qs2_0.1.7                   knitr_1.51                 
## [21] dplyr_1.1.4                 patchwork_1.3.2            
## [23] ggplot2_4.0.2               Seurat_5.4.0               
## [25] SeuratObject_5.3.0          sp_2.1-4                   
## 
## loaded via a namespace (and not attached):
##   [1] RcppAnnoy_0.0.23        splines_4.4.1           later_1.4.6            
##   [4] polyclip_1.10-7         pROC_1.19.0.1           fastDummies_1.7.5      
##   [7] lifecycle_1.0.5         doParallel_1.0.17       globals_0.16.3         
##  [10] lattice_0.22-6          MASS_7.3-60.2           dendextend_1.19.1      
##  [13] magrittr_2.0.3          plotly_4.12.0           sass_0.4.9             
##  [16] rmarkdown_2.30          jquerylib_0.1.4         yaml_2.3.12            
##  [19] httpuv_1.6.16           otel_0.2.0              sctransform_0.4.3      
##  [22] spam_2.11-3             spatstat.sparse_3.1-0   reticulate_1.45.0      
##  [25] cowplot_1.2.0           pbapply_1.7-4           RColorBrewer_1.1-3     
##  [28] abind_1.4-5             zlibbioc_1.52.0         Rtsne_0.17             
##  [31] purrr_1.0.2             WriteXLS_6.8.0          GenomeInfoDbData_1.2.13
##  [34] irlba_2.3.7             listenv_0.9.1           spatstat.utils_3.2-1   
##  [37] goftest_1.2-3           RSpectra_0.16-2         spatstat.random_3.4-4  
##  [40] fitdistrplus_1.2-6      parallelly_1.37.1       codetools_0.2-20       
##  [43] DelayedArray_0.32.0     tidyselect_1.2.1        UCSC.utils_1.2.0       
##  [46] farver_2.1.2            spatstat.explore_3.7-0  jsonlite_2.0.0         
##  [49] progressr_0.18.0        ggridges_0.5.6          survival_3.6-4         
##  [52] iterators_1.0.14        foreach_1.5.2           dbscan_1.2.4           
##  [55] tools_4.4.1             ica_1.0-3               Rcpp_1.0.12            
##  [58] glue_1.8.0              gridExtra_2.3           SparseArray_1.6.2      
##  [61] xfun_0.56               withr_3.0.2             combinat_0.0-8         
##  [64] fastmap_1.2.0           MCMCprecision_0.4.2     fansi_1.0.6            
##  [67] digest_0.6.35           R6_2.6.1                mime_0.12              
##  [70] scattermore_1.2         tensor_1.5.1            dichromat_2.0-0.1      
##  [73] spatstat.data_3.1-9     utf8_1.2.4              generics_0.1.3         
##  [76] data.table_1.15.4       httr_1.4.7              htmlwidgets_1.6.4      
##  [79] S4Arrays_1.6.0          uwot_0.2.4              pkgconfig_2.0.3        
##  [82] gtable_0.3.6            lmtest_0.9-40           S7_0.2.1               
##  [85] XVector_0.46.0          htmltools_0.5.8.1       dotCall64_1.2          
##  [88] scales_1.4.0            png_0.1-8               spatstat.univar_3.1-6  
##  [91] enrichR_3.4             ggdendro_0.2.0          rstudioapi_0.16.0      
##  [94] reshape2_1.4.4          rjson_0.2.23            nlme_3.1-164           
##  [97] curl_5.2.1              cachem_1.1.0            zoo_1.8-15             
## [100] stringr_1.5.1           KernSmooth_2.23-24      vipor_0.4.7            
## [103] parallel_4.4.1          miniUI_0.1.2            ggrastr_1.0.2          
## [106] pillar_1.9.0            grid_4.4.1              vctrs_0.6.5            
## [109] RANN_2.6.2              promises_1.5.0          stringfish_0.18.0      
## [112] xtable_1.8-4            cluster_2.1.6           beeswarm_0.4.0         
## [115] evaluate_1.0.5          cli_3.6.5               locfit_1.5-9.12        
## [118] compiler_4.4.1          rlang_1.1.7             crayon_1.5.2           
## [121] future.apply_1.11.2     labeling_0.4.3          ggbeeswarm_0.7.3       
## [124] plyr_1.8.9              stringi_1.8.4           deldir_2.0-4           
## [127] BiocParallel_1.40.2     lazyeval_0.2.2          spatstat.geom_3.7-0    
## [130] RcppHNSW_0.6.0          RcppEigen_0.3.4.0.0     future_1.33.2          
## [133] shiny_1.12.1            ROCR_1.0-12             igraph_2.2.2           
## [136] RcppParallel_5.1.8      bslib_0.7.0