Overview

Contamination assessment (script 08) identified clusters 10 and 11 in the CNS leukaemia object as non-leukaemia cells based on four converging lines of evidence: GFP negativity, absence of leukaemia markers (Meis1, Runx1, Cdk6, Bcl2, Mki67), expression of CNS cell type markers (Slc1a2, Sparcl1, S100b), and categorically elevated suspect gene expression sums relative to all other clusters. These clusters (n = 164 + cluster 11 cells) were excluded prior to re-running the pseudobulk differential expression analysis.

All remaining CNS leukaemia clusters (0–9) showed near-zero ambient RNA signal (mean suspect gene UMI < 0.02, median = 0), confirming that diffuse ambient contamination is negligible and does not require further correction.

Setup

library(Seurat)

## Loading required package: SeuratObject

## Loading required package: sp

## 
## Attaching package: 'SeuratObject'

## The following objects are masked from 'package:base':
## 
##     intersect, t

library(ggplot2)
library(patchwork)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(DESeq2)

## Loading required package: S4Vectors

## Loading required package: stats4

## Loading required package: BiocGenerics

## 
## Attaching package: 'BiocGenerics'

## The following objects are masked from 'package:dplyr':
## 
##     combine, intersect, setdiff, union

## The following object is masked from 'package:SeuratObject':
## 
##     intersect

## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs

## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, saveRDS, setdiff,
##     table, tapply, union, unique, unsplit, which.max, which.min

## 
## Attaching package: 'S4Vectors'

## The following objects are masked from 'package:dplyr':
## 
##     first, rename

## The following object is masked from 'package:utils':
## 
##     findMatches

## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname

## Loading required package: IRanges

## 
## Attaching package: 'IRanges'

## The following objects are masked from 'package:dplyr':
## 
##     collapse, desc, slice

## The following object is masked from 'package:sp':
## 
##     %over%

## Loading required package: GenomicRanges

## Loading required package: GenomeInfoDb

## Loading required package: SummarizedExperiment

## Loading required package: MatrixGenerics

## Loading required package: matrixStats

## 
## Attaching package: 'matrixStats'

## The following object is masked from 'package:dplyr':
## 
##     count

## 
## Attaching package: 'MatrixGenerics'

## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars

## Loading required package: Biobase

## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.

## 
## Attaching package: 'Biobase'

## The following object is masked from 'package:MatrixGenerics':
## 
##     rowMedians

## The following objects are masked from 'package:matrixStats':
## 
##     anyMissing, rowMedians

## 
## Attaching package: 'SummarizedExperiment'

## The following object is masked from 'package:Seurat':
## 
##     Assays

## The following object is masked from 'package:SeuratObject':
## 
##     Assays

library(knitr)
library(qs2)

## qs2 0.1.7

library(ggrepel)
library(tibble)
library(tidyr)

## 
## Attaching package: 'tidyr'

## The following object is masked from 'package:S4Vectors':
## 
##     expand

library(scales)

tissue_cols <- c("BM" = "steelblue", "CNS" = "firebrick")
source_cols <- c("AD4_EM3"   = "#E69F00",
                 "AD2.2_EM4" = "#56B4E9",
                 "AD5_EM4"   = "#CC79A7")

bm_res_col  <- "originalexp_snn_res.0.3"
cns_res_col <- "originalexp_snn_res.0.3"

# Clusters confirmed as contaminating CNS cells
contam_clusters <- c("10", "11")

# Ambient RNA genes to flag in results
ambient_genes <- c("Slc1a2", "Sparcl1", "Cpe",   "Apod",
                    "Mobp",   "S100b",   "Atp1a2","Ndrg2",
                    "Map2",   "Gfap",    "Slc1a3")

leuk_BM  <- qs_read("/exports/eddie/scratch/aduguid3/harmony_clustering/04_harmony_recluster_leuk_BM_march26.qs")
leuk_CNS <- qs_read("/exports/eddie/scratch/aduguid3/harmony_clustering/04_harmony_recluster_leuk_CNS_march26.qs")  # pre-cleaning
leuk_CNS_clean <- qs_read("/exports/eddie/scratch/aduguid3/harmony_clustering/04_harmony_recluster_leuk_CNS_clean_march26.qs")

stopifnot("leuk_BM not found"  = exists("leuk_BM"))
stopifnot("leuk_CNS not found" = exists("leuk_CNS"))

cat("BM  cells (input):", ncol(leuk_BM),  "\n")

## BM  cells (input): 44606

cat("CNS cells (input):", ncol(leuk_CNS), "\n")

## CNS cells (input): 47220

Verify Exclusion on UMAP

p_before <- DimPlot(leuk_CNS,
                     reduction  = "umap",
                     group.by   = cns_res_col,
                     label      = TRUE,
                     label.size = 3) +
  ggtitle(paste("Before exclusion —", ncol(leuk_CNS), "cells")) +
  NoLegend()

p_after <- DimPlot(leuk_CNS_clean,
                    reduction  = "umap",
                    group.by   = cns_res_col,
                    label      = TRUE,
                    label.size = 3) +
  ggtitle(paste("After exclusion —", ncol(leuk_CNS_clean), "cells")) +
  NoLegend()

p_before + p_after

CNS leukaemia before and after cluster exclusion

Confirm Suspect Gene Expression Is Resolved

suspect_genes <- c("Slc1a2", "Sparcl1", "S100b", "Atp1a2", "Mobp")
suspect_found <- suspect_genes[suspect_genes %in% rownames(leuk_CNS_clean)]

suspect_expr <- GetAssayData(leuk_CNS_clean, layer = "data")[suspect_found, ]
leuk_CNS_clean$contam_gene_sum <- colSums(suspect_expr)

VlnPlot(leuk_CNS_clean,
         features  = "contam_gene_sum",
         group.by  = cns_res_col,
         pt.size   = 0,
         cols      = scales::hue_pal()(length(unique(
                       leuk_CNS_clean@meta.data[[cns_res_col]])))) +
  ggtitle("Suspect gene sum after exclusion — all clusters should be flat") +
  theme(legend.position = "none",
        axis.text.x = element_text(angle = 45, hjust = 1))

Suspect gene sum after cluster exclusion

Pseudobulk Differential Expression — Cleaned Dataset

Combine BM and Cleaned CNS

leuk_all_clean <- merge(leuk_BM, leuk_CNS_clean,
                         add.cell.ids = c("BM", "CNS"))

cat("Combined object:\n")

## Combined object:

cat("Total cells:", ncol(leuk_all_clean), "\n")

## Total cells: 91623

print(table(leuk_all_clean$Tissue))

## 
##    BM   CNS 
## 44606 47017

print(table(leuk_all_clean$Tissue, leuk_all_clean$transplant_source))

##      
##       AD2.2_EM4 AD4_EM3 AD5_EM4
##   BM      12162   12453   19991
##   CNS     13857   17640   15520

Aggregate to Pseudobulk

pseudo_clean_path <- "/exports/eddie/scratch/aduguid3/harmony_clustering/pseudo_clean.qs"

file_valid <- file.exists(pseudo_clean_path) &&
              tryCatch({ qs_read(pseudo_clean_path); TRUE },
                       error = function(e) FALSE)

if (!file_valid) {
  pseudo_clean <- AggregateExpression(
    leuk_all_clean,
    group.by      = c("Mouse_ID", "Tissue", "transplant_source"),
    return.seurat = TRUE
  )
  qs_save(pseudo_clean, pseudo_clean_path)
} else {
  pseudo_clean <- qs_read(pseudo_clean_path)
}

## Names of identity class contain underscores ('_'), replacing with dashes ('-')
## First group.by variable `Mouse_ID` starts with a number, appending `g` to ensure valid variable names
## Centering and scaling data matrix
## 
## This message is displayed once every 8 hours.

counts_clean <- GetAssayData(pseudo_clean, layer = "counts")
meta_clean   <- pseudo_clean@meta.data
meta_clean$Tissue            <- factor(meta_clean$Tissue, levels = c("BM", "CNS"))
meta_clean$transplant_source <- factor(meta_clean$transplant_source)
meta_clean$Mouse_ID          <- factor(meta_clean$Mouse_ID)

cat("Pseudobulk matrix:", nrow(counts_clean), "genes x",
    ncol(counts_clean), "samples\n")

## Pseudobulk matrix: 26694 genes x 12 samples

print(table(meta_clean$Tissue, meta_clean$transplant_source))

##      
##       AD2.2-EM4 AD4-EM3 AD5-EM4
##   BM          2       2       2
##   CNS         2       2       2

DESeq2 — Transplant Source as Covariate

dds_clean_path <- "/exports/eddie/scratch/aduguid3/harmony_clustering/dds_clean.qs"

if (!file.exists(dds_clean_path)) {
  dds_clean <- DESeqDataSetFromMatrix(
    countData = counts_clean,
    colData   = meta_clean,
    design    = ~ transplant_source + Tissue
  )
  keep      <- rowSums(counts(dds_clean) >= 10) >= 3
  dds_clean <- dds_clean[keep, ]
  dds_clean <- DESeq(dds_clean)
  qs_save(dds_clean, dds_clean_path)
} else {
  dds_clean <- qs_read(dds_clean_path)
}

## converting counts to integer mode

##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]

## estimating size factors

##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]

## estimating dispersions

## gene-wise dispersion estimates

## mean-dispersion relationship

##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]

## final dispersion estimates

## fitting model and testing

# These always run — fast operations
res_clean <- results(dds_clean,
                      contrast = c("Tissue", "CNS", "BM"),
                      alpha    = 0.05)

cat("\n--- CNS vs BM DE Results (cleaned dataset) ---\n")

## 
## --- CNS vs BM DE Results (cleaned dataset) ---

summary(res_clean)

## 
## out of 16195 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 168, 1%
## LFC < 0 (down)     : 170, 1%
## outliers [1]       : 0, 0%
## low counts [2]     : 0, 0%
## (mean count < 4)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results

res_clean_df <- as.data.frame(res_clean) %>%
  rownames_to_column("gene") %>%
  filter(!is.na(padj)) %>%
  arrange(padj) %>%
  mutate(ambient_flag = gene %in% ambient_genes)

Confirm Suspect Genes Are Resolved

cat("--- Suspect gene status after cleaning ---\n")

## --- Suspect gene status after cleaning ---

res_clean_df %>%
  filter(gene %in% ambient_genes) %>%
  select(gene, log2FoldChange, baseMean, padj, ambient_flag) %>%
  mutate(across(where(is.numeric), ~round(., 3))) %>%
  arrange(desc(log2FoldChange)) %>%
  kable(caption = "Ambient RNA genes in cleaned DE results")

Ambient RNA genes in cleaned DE results
gene	log2FoldChange	baseMean	padj	ambient_flag
Sparcl1	6.716	10.022	0.000	TRUE
Slc1a2	6.647	9.545	0.000	TRUE
Apod	6.248	25.797	0.000	TRUE
Cpe	4.998	10.576	0.000	TRUE
Mobp	4.943	6.459	0.003	TRUE
S100b	4.460	5.747	0.003	TRUE
Atp1a2	3.315	4.863	0.006	TRUE
Ndrg2	2.957	15.332	0.000	TRUE
Slc1a3	1.385	4.810	0.565	TRUE
Gfap	1.194	8.840	0.327	TRUE

Compare to Pre-cleaning Results

# Load original results
res_orig_df <- read.csv("02_DE_CNSvsBM_all_mice_source_corrected.csv")

# Concordance
shared <- inner_join(
  res_orig_df  %>% select(gene, lfc_orig  = log2FoldChange),
  res_clean_df %>% select(gene, lfc_clean = log2FoldChange),
  by = "gene"
) %>%
  mutate(
    suspect  = gene %in% ambient_genes,
    resolved = suspect & abs(lfc_clean) < abs(lfc_orig)
  )

cor_val <- round(cor(shared$lfc_orig, shared$lfc_clean,
                      method = "pearson"), 3)

cat("Pearson correlation (pre vs post cleaning):", cor_val, "\n")

## Pearson correlation (pre vs post cleaning): 0.991

cat("Significant genes pre-cleaning (padj<0.05):",
    sum(res_orig_df$padj  < 0.05, na.rm = TRUE), "\n")

## Significant genes pre-cleaning (padj<0.05): 346

cat("Significant genes post-cleaning (padj<0.05):",
    sum(res_clean_df$padj < 0.05, na.rm = TRUE), "\n")

## Significant genes post-cleaning (padj<0.05): 338

ggplot(shared, aes(x = lfc_orig, y = lfc_clean, colour = suspect)) +
  geom_point(size = 0.5, alpha = 0.3) +
  geom_point(data = filter(shared, suspect), size = 2.5, alpha = 0.9) +
  ggrepel::geom_text_repel(data    = filter(shared, suspect),
                            aes(label = gene),
                            size    = 2.5,
                            max.overlaps = 20) +
  scale_colour_manual(values = c("FALSE" = "grey60",
                                  "TRUE"  = "firebrick")) +
  geom_abline(slope = 1, intercept = 0,
              linetype = "dashed", colour = "steelblue") +
  theme_classic(base_size = 12) +
  labs(x      = "log2FC — pre-cleaning",
       y      = "log2FC — post-cleaning",
       title  = paste0("DE concordance pre vs post cluster exclusion\n",
                       "Pearson r = ", cor_val),
       colour = "Ambient gene") +
  theme(legend.position = "bottom")

DE results before and after cluster exclusion

Black list genes

ambient_blacklist <- c("Slc1a2",  "Sparcl1", "Apod",  "Cpe",
                        "Mobp",    "S100b",   "Atp1a2","Ndrg2")

# Apply to clean DE results
res_clean_df <- res_clean_df %>%
  mutate(ambient_flag = gene %in% ambient_blacklist)

# For pathway enrichment — exclude flagged genes
res_for_enrichment <- res_clean_df %>%
  filter(!ambient_flag)

DE Results — Summary

Top CNS-Enriched Genes

res_clean_df %>%
  filter(padj < 0.05, log2FoldChange > 0,
         !ambient_flag) %>%
  head(30) %>%
  select(gene, log2FoldChange, baseMean, lfcSE, padj) %>%
  mutate(across(where(is.numeric), ~round(., 4))) %>%
  kable(caption = "Top 30 CNS-enriched genes — cleaned dataset (padj < 0.05)")

Top 30 CNS-enriched genes — cleaned dataset (padj < 0.05)
gene	log2FoldChange	baseMean	lfcSE	padj
Soat1	0.6900	2687.4740	0.0879	0e+00
Gramd3	0.7522	785.7219	0.1005	0e+00
Plaat3	0.5200	3933.6808	0.0695	0e+00
Dnm3os	0.6839	1861.8634	0.0940	0e+00
Skil	0.5328	856.8290	0.0773	0e+00
Rnf125	0.2241	9220.7788	0.0335	0e+00
Tst	0.4258	555.0761	0.0658	0e+00
Ubb	0.1738	229500.6801	0.0271	0e+00
Snn	0.5038	529.5151	0.0836	0e+00
Abhd8	0.3369	4517.2409	0.0566	0e+00
2010110G14Rik	0.4061	2218.8930	0.0691	0e+00
Gm12958	0.4700	575.9567	0.0806	0e+00
Lef1os1	0.3475	4077.7245	0.0606	0e+00
Tiam1	0.3501	4520.4763	0.0620	0e+00
Stc1	0.8787	317.4775	0.1608	0e+00
Cd96	0.5849	1682.5851	0.1078	0e+00
Fgd1	0.7965	327.1907	0.1493	0e+00
Septin5	0.6392	226.4566	0.1213	0e+00
Cdr1os	6.2049	11.0269	1.1776	0e+00
Lef1	0.3490	76435.6625	0.0665	0e+00
Pgghg	0.2682	738.3383	0.0518	1e-04
Egfem1	0.8184	987.9560	0.1597	1e-04
Jak1	0.1906	28213.1886	0.0380	1e-04
Cmtm3	0.3654	337.3029	0.0734	2e-04
Nfkbie	0.4794	1633.4494	0.0988	3e-04
Tor3a	0.3259	480.9925	0.0682	4e-04
Greb1	0.7967	415.6434	0.1666	4e-04
Sla2	0.2994	4458.1527	0.0628	4e-04
Amz2	0.1891	5936.1767	0.0397	4e-04
Gpr83	0.8085	1918.6277	0.1708	5e-04

Top BM-Enriched Genes

res_clean_df %>%
  filter(padj < 0.05, log2FoldChange < 0,
         !ambient_flag) %>%
  arrange(log2FoldChange) %>%
  head(30) %>%
  select(gene, log2FoldChange, baseMean, lfcSE, padj) %>%
  mutate(across(where(is.numeric), ~round(., 4))) %>%
  kable(caption = "Top 30 BM-enriched genes — cleaned dataset (padj < 0.05)")

Top 30 BM-enriched genes — cleaned dataset (padj < 0.05)
gene	log2FoldChange	baseMean	lfcSE	padj
Ighv1-55	-6.6472	156.2144	1.2691	0.0000
Igkv8-27	-6.3627	309.3203	1.1093	0.0000
Igkv10-96	-4.9884	58.2785	1.3255	0.0142
Igkv6-23	-4.4524	58.5085	1.3419	0.0452
Iglc1	-4.3387	12.2966	0.9317	0.0007
Iglv1	-4.2280	77.8822	1.0427	0.0058
Gm2682	-4.2260	44.9843	1.1035	0.0119
Gm57200	-3.6044	9.9410	1.0016	0.0220
Lpl	-3.5327	10.2122	0.6106	0.0000
Cd5l	-3.4785	24.8795	0.6469	0.0000
Clec7a	-3.3886	21.8621	0.5261	0.0000
Jchain	-3.1459	254.8241	0.6442	0.0003
Igha	-3.0918	292.1313	0.6310	0.0002
Spic	-2.9823	12.6075	0.6281	0.0005
Atp6v0d2	-2.8764	20.3445	0.4227	0.0000
Mgst1	-2.8658	7.7314	0.6057	0.0005
Gm30211	-2.8449	38.8918	0.5129	0.0000
Igkc	-2.8245	186.8446	0.5675	0.0002
Epb41l3	-2.8117	17.9392	0.4122	0.0000
Tgm2	-2.7874	6.4121	0.6300	0.0016
Fcna	-2.6913	15.3077	0.4695	0.0000
Pparg	-2.6005	6.8941	0.7700	0.0393
Iglc2	-2.5899	8.2540	0.6778	0.0120
Il6ra	-2.5841	6.5994	0.7705	0.0419
Vcam1	-2.5787	26.3177	0.3959	0.0000
Slc40a1	-2.5401	13.8461	0.4579	0.0000
Hebp1	-2.4878	9.5473	0.5872	0.0031
Tnfaip2	-2.4509	15.3431	0.5626	0.0021
Il18	-2.4213	11.0189	0.4332	0.0000
Hpgd	-2.4197	9.1420	0.6143	0.0083

Volcano Plot

res_clean_df %>%
  filter(!ambient_flag) %>%
  mutate(
    direction = case_when(
      log2FoldChange >  0.5 & padj < 0.05 ~ "CNS-enriched",
      log2FoldChange < -0.5 & padj < 0.05 ~ "BM-enriched",
      TRUE                                 ~ "NS"
    ),
    label = ifelse(abs(log2FoldChange) > 1 & padj < 0.01 &
                     !ambient_flag, gene, NA)
  ) %>%
  ggplot(aes(x = log2FoldChange, y = -log10(padj),
             colour = direction)) +
  geom_point(size = 0.6, alpha = 0.6) +
  geom_text_repel(aes(label = label), size = 2.5, max.overlaps = 20) +
  scale_colour_manual(
    values = c("CNS-enriched" = "firebrick",
               "BM-enriched"  = "steelblue",
               "NS"           = "grey70")) +
  geom_vline(xintercept = c(-0.5, 0.5),
             linetype = "dashed", colour = "grey40") +
  geom_hline(yintercept = -log10(0.05),
             linetype = "dashed", colour = "grey40") +
  theme_classic(base_size = 12) +
  labs(x      = "log2FC (CNS vs BM)",
       y      = "-log10 adjusted p-value",
       title  = "CNS vs BM — cleaned dataset\n(ambient RNA clusters excluded)",
       colour = NULL) +
  theme(legend.position = "bottom")

## Warning: Removed 16144 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).

Volcano plot — CNS vs BM, cleaned dataset

PCA of Pseudobulk Samples

vsd_clean <- vst(dds_clean, blind = FALSE)

pca_data <- plotPCA(vsd_clean,
                     intgroup = c("Tissue", "transplant_source"),
                     returnData = TRUE)

## using ntop=500 top features by variance

pct_var  <- round(100 * attr(pca_data, "percentVar"), 1)

ggplot(pca_data,
       aes(x = PC1, y = PC2,
           colour = transplant_source,
           shape  = Tissue,
           label  = name)) +
  geom_point(size = 4) +
  geom_text_repel(size = 2.8) +
  scale_colour_manual(values = source_cols) +
  scale_shape_manual(values = c("BM" = 16, "CNS" = 17)) +
  theme_classic(base_size = 12) +
  labs(x      = paste0("PC1: ", pct_var[1], "% variance"),
       y      = paste0("PC2: ", pct_var[2], "% variance"),
       title  = "Pseudobulk PCA — cleaned dataset",
       colour = "Transplant source",
       shape  = "Tissue")

## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's colour values.
## No shared levels found between `names(values)` of the manual scale and the
## data's colour values.
## No shared levels found between `names(values)` of the manual scale and the
## data's colour values.

Pseudobulk PCA after cluster exclusion

Save Outputs

# Save DE results — full and filtered
write.csv(res_clean_df,
           "04_DE_CNSvsBM_clean.csv",
           row.names = FALSE)

# Save CNS signature (clean, no ambient genes)
res_clean_df %>%
  filter(padj < 0.05, log2FoldChange > 0.5, !ambient_flag) %>%
  write.csv("04_CNS_signature_clean.csv", row.names = FALSE)

# Save BM signature (clean)
res_clean_df %>%
  filter(padj < 0.05, log2FoldChange < -0.5, !ambient_flag) %>%
  write.csv("04_BM_signature_clean.csv", row.names = FALSE)

cat("Saved:\n")

## Saved:

cat("04_leuk_CNS_clean.qs          —", ncol(leuk_CNS_clean), "cells\n")

## 04_leuk_CNS_clean.qs          — 47017 cells

cat("04_DE_CNSvsBM_clean.csv\n")

## 04_DE_CNSvsBM_clean.csv

cat("04_CNS_signature_clean.csv\n")

## 04_CNS_signature_clean.csv

cat("04_BM_signature_clean.csv\n")

## 04_BM_signature_clean.csv

Session Information

sessionInfo()

## R version 4.4.1 (2024-06-14)
## Platform: x86_64-pc-linux-gnu
## Running under: Rocky Linux 9.5 (Blue Onyx)
## 
## Matrix products: default
## BLAS/LAPACK: /opt/intel/oneapi/mkl/2024.0/lib/libmkl_gf_lp64.so.2;  LAPACK version 3.10.1
## 
## locale:
##  [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_GB.UTF-8        LC_COLLATE=en_GB.UTF-8    
##  [5] LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
##  [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: Europe/London
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] scales_1.4.0                tidyr_1.3.1                
##  [3] tibble_3.3.1                ggrepel_0.9.6              
##  [5] qs2_0.1.7                   knitr_1.51                 
##  [7] DESeq2_1.46.0               SummarizedExperiment_1.36.0
##  [9] Biobase_2.66.0              MatrixGenerics_1.18.1      
## [11] matrixStats_1.5.0           GenomicRanges_1.58.0       
## [13] GenomeInfoDb_1.42.3         IRanges_2.40.1             
## [15] S4Vectors_0.44.0            BiocGenerics_0.52.0        
## [17] dplyr_1.1.4                 patchwork_1.3.2            
## [19] ggplot2_4.0.2               Seurat_5.4.0               
## [21] SeuratObject_5.3.0          sp_2.1-4                   
## 
## loaded via a namespace (and not attached):
##   [1] RColorBrewer_1.1-3      rstudioapi_0.16.0       jsonlite_2.0.0         
##   [4] magrittr_2.0.3          ggbeeswarm_0.7.3        spatstat.utils_3.2-1   
##   [7] farver_2.1.2            rmarkdown_2.30          zlibbioc_1.52.0        
##  [10] vctrs_0.6.5             ROCR_1.0-12             spatstat.explore_3.7-0 
##  [13] S4Arrays_1.6.0          htmltools_0.5.8.1       SparseArray_1.6.2      
##  [16] sass_0.4.9              sctransform_0.4.3       parallelly_1.37.1      
##  [19] KernSmooth_2.23-24      bslib_0.7.0             htmlwidgets_1.6.4      
##  [22] ica_1.0-3               plyr_1.8.9              plotly_4.12.0          
##  [25] zoo_1.8-15              cachem_1.1.0            igraph_2.2.2           
##  [28] mime_0.12               lifecycle_1.0.5         pkgconfig_2.0.3        
##  [31] Matrix_1.7-0            R6_2.6.1                fastmap_1.2.0          
##  [34] GenomeInfoDbData_1.2.13 fitdistrplus_1.2-6      future_1.33.2          
##  [37] shiny_1.12.1            digest_0.6.35           tensor_1.5.1           
##  [40] RSpectra_0.16-2         irlba_2.3.7             labeling_0.4.3         
##  [43] progressr_0.18.0        fansi_1.0.6             spatstat.sparse_3.1-0  
##  [46] httr_1.4.7              polyclip_1.10-7         abind_1.4-5            
##  [49] compiler_4.4.1          withr_3.0.2             S7_0.2.1               
##  [52] BiocParallel_1.40.2     fastDummies_1.7.5       MASS_7.3-60.2          
##  [55] DelayedArray_0.32.0     tools_4.4.1             vipor_0.4.7            
##  [58] lmtest_0.9-40           otel_0.2.0              beeswarm_0.4.0         
##  [61] httpuv_1.6.16           future.apply_1.11.2     goftest_1.2-3          
##  [64] glue_1.8.0              nlme_3.1-164            promises_1.5.0         
##  [67] grid_4.4.1              Rtsne_0.17              cluster_2.1.6          
##  [70] reshape2_1.4.4          generics_0.1.3          gtable_0.3.6           
##  [73] spatstat.data_3.1-9     data.table_1.15.4       stringfish_0.18.0      
##  [76] utf8_1.2.4              XVector_0.46.0          spatstat.geom_3.7-0    
##  [79] RcppAnnoy_0.0.23        RANN_2.6.2              pillar_1.9.0           
##  [82] stringr_1.5.1           spam_2.11-3             RcppHNSW_0.6.0         
##  [85] later_1.4.6             splines_4.4.1           lattice_0.22-6         
##  [88] survival_3.6-4          deldir_2.0-4            tidyselect_1.2.1       
##  [91] locfit_1.5-9.12         miniUI_0.1.2            pbapply_1.7-4          
##  [94] gridExtra_2.3           scattermore_1.2         xfun_0.56              
##  [97] stringi_1.8.4           UCSC.utils_1.2.0        lazyeval_0.2.2         
## [100] yaml_2.3.12             evaluate_1.0.5          codetools_0.2-20       
## [103] cli_3.6.5               RcppParallel_5.1.8      uwot_0.2.4             
## [106] xtable_1.8-4            reticulate_1.45.0       jquerylib_0.1.4        
## [109] dichromat_2.0-0.1       Rcpp_1.0.12             globals_0.16.3         
## [112] spatstat.random_3.4-4   png_0.1-8               ggrastr_1.0.2          
## [115] spatstat.univar_3.1-6   parallel_4.4.1          dotCall64_1.2          
## [118] listenv_0.9.1           viridisLite_0.4.2       ggridges_0.5.6         
## [121] crayon_1.5.2            purrr_1.0.2             rlang_1.1.7            
## [124] cowplot_1.2.0

B-ALL CNS vs BM Pseudobulk DE — Cleaned Dataset

2026-02-19