Overview

Contamination assessment (script 08) identified clusters 10 and 11 in the CNS leukaemia object as non-leukaemia cells based on four converging lines of evidence: GFP negativity, absence of leukaemia markers (Meis1, Runx1, Cdk6, Bcl2, Mki67), expression of CNS cell type markers (Slc1a2, Sparcl1, S100b), and categorically elevated suspect gene expression sums relative to all other clusters. These clusters (n = 164 + cluster 11 cells) were excluded prior to re-running the pseudobulk differential expression analysis.

All remaining CNS leukaemia clusters (0–9) showed near-zero ambient RNA signal (mean suspect gene UMI < 0.02, median = 0), confirming that diffuse ambient contamination is negligible and does not require further correction.


Setup

library(Seurat)
## Loading required package: SeuratObject
## Loading required package: sp
## 
## Attaching package: 'SeuratObject'
## The following objects are masked from 'package:base':
## 
##     intersect, t
library(ggplot2)
library(patchwork)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(DESeq2)
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:dplyr':
## 
##     combine, intersect, setdiff, union
## The following object is masked from 'package:SeuratObject':
## 
##     intersect
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, saveRDS, setdiff,
##     table, tapply, union, unique, unsplit, which.max, which.min
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:dplyr':
## 
##     first, rename
## The following object is masked from 'package:utils':
## 
##     findMatches
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## Loading required package: IRanges
## 
## Attaching package: 'IRanges'
## The following objects are masked from 'package:dplyr':
## 
##     collapse, desc, slice
## The following object is masked from 'package:sp':
## 
##     %over%
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
## 
## Attaching package: 'matrixStats'
## The following object is masked from 'package:dplyr':
## 
##     count
## 
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
## 
##     rowMedians
## The following objects are masked from 'package:matrixStats':
## 
##     anyMissing, rowMedians
## 
## Attaching package: 'SummarizedExperiment'
## The following object is masked from 'package:Seurat':
## 
##     Assays
## The following object is masked from 'package:SeuratObject':
## 
##     Assays
library(knitr)
library(qs2)
## qs2 0.1.7
library(ggrepel)
library(tibble)
library(tidyr)
## 
## Attaching package: 'tidyr'
## The following object is masked from 'package:S4Vectors':
## 
##     expand
library(scales)
tissue_cols <- c("BM" = "steelblue", "CNS" = "firebrick")
source_cols <- c("AD4_EM3"   = "#E69F00",
                 "AD2.2_EM4" = "#56B4E9",
                 "AD5_EM4"   = "#CC79A7")

bm_res_col  <- "originalexp_snn_res.0.3"
cns_res_col <- "originalexp_snn_res.0.3"

# Clusters confirmed as contaminating CNS cells
contam_clusters <- c("10", "11")

# Ambient RNA genes to flag in results
ambient_genes <- c("Slc1a2", "Sparcl1", "Cpe",   "Apod",
                    "Mobp",   "S100b",   "Atp1a2","Ndrg2",
                    "Map2",   "Gfap",    "Slc1a3")
leuk_BM  <- qs_read("/exports/eddie/scratch/aduguid3/harmony_clustering/04_harmony_recluster_leuk_BM_march26.qs")
leuk_CNS <- qs_read("/exports/eddie/scratch/aduguid3/harmony_clustering/04_harmony_recluster_leuk_CNS_march26.qs")  # pre-cleaning
leuk_CNS_clean <- qs_read("/exports/eddie/scratch/aduguid3/harmony_clustering/04_harmony_recluster_leuk_CNS_clean_march26.qs")

stopifnot("leuk_BM not found"  = exists("leuk_BM"))
stopifnot("leuk_CNS not found" = exists("leuk_CNS"))

cat("BM  cells (input):", ncol(leuk_BM),  "\n")
## BM  cells (input): 44606
cat("CNS cells (input):", ncol(leuk_CNS), "\n")
## CNS cells (input): 47220

Verify Exclusion on UMAP

p_before <- DimPlot(leuk_CNS,
                     reduction  = "umap",
                     group.by   = cns_res_col,
                     label      = TRUE,
                     label.size = 3) +
  ggtitle(paste("Before exclusion —", ncol(leuk_CNS), "cells")) +
  NoLegend()

p_after <- DimPlot(leuk_CNS_clean,
                    reduction  = "umap",
                    group.by   = cns_res_col,
                    label      = TRUE,
                    label.size = 3) +
  ggtitle(paste("After exclusion —", ncol(leuk_CNS_clean), "cells")) +
  NoLegend()

p_before + p_after
CNS leukaemia before and after cluster exclusion

CNS leukaemia before and after cluster exclusion

Confirm Suspect Gene Expression Is Resolved

suspect_genes <- c("Slc1a2", "Sparcl1", "S100b", "Atp1a2", "Mobp")
suspect_found <- suspect_genes[suspect_genes %in% rownames(leuk_CNS_clean)]

suspect_expr <- GetAssayData(leuk_CNS_clean, layer = "data")[suspect_found, ]
leuk_CNS_clean$contam_gene_sum <- colSums(suspect_expr)

VlnPlot(leuk_CNS_clean,
         features  = "contam_gene_sum",
         group.by  = cns_res_col,
         pt.size   = 0,
         cols      = scales::hue_pal()(length(unique(
                       leuk_CNS_clean@meta.data[[cns_res_col]])))) +
  ggtitle("Suspect gene sum after exclusion — all clusters should be flat") +
  theme(legend.position = "none",
        axis.text.x = element_text(angle = 45, hjust = 1))
Suspect gene sum after cluster exclusion

Suspect gene sum after cluster exclusion


Pseudobulk Differential Expression — Cleaned Dataset

Combine BM and Cleaned CNS

leuk_all_clean <- merge(leuk_BM, leuk_CNS_clean,
                         add.cell.ids = c("BM", "CNS"))

cat("Combined object:\n")
## Combined object:
cat("Total cells:", ncol(leuk_all_clean), "\n")
## Total cells: 91623
print(table(leuk_all_clean$Tissue))
## 
##    BM   CNS 
## 44606 47017
print(table(leuk_all_clean$Tissue, leuk_all_clean$transplant_source))
##      
##       AD2.2_EM4 AD4_EM3 AD5_EM4
##   BM      12162   12453   19991
##   CNS     13857   17640   15520

Aggregate to Pseudobulk

pseudo_clean_path <- "/exports/eddie/scratch/aduguid3/harmony_clustering/pseudo_clean.qs"

file_valid <- file.exists(pseudo_clean_path) &&
              tryCatch({ qs_read(pseudo_clean_path); TRUE },
                       error = function(e) FALSE)

if (!file_valid) {
  pseudo_clean <- AggregateExpression(
    leuk_all_clean,
    group.by      = c("Mouse_ID", "Tissue", "transplant_source"),
    return.seurat = TRUE
  )
  qs_save(pseudo_clean, pseudo_clean_path)
} else {
  pseudo_clean <- qs_read(pseudo_clean_path)
}
## Names of identity class contain underscores ('_'), replacing with dashes ('-')
## First group.by variable `Mouse_ID` starts with a number, appending `g` to ensure valid variable names
## Centering and scaling data matrix
## 
## This message is displayed once every 8 hours.
counts_clean <- GetAssayData(pseudo_clean, layer = "counts")
meta_clean   <- pseudo_clean@meta.data
meta_clean$Tissue            <- factor(meta_clean$Tissue, levels = c("BM", "CNS"))
meta_clean$transplant_source <- factor(meta_clean$transplant_source)
meta_clean$Mouse_ID          <- factor(meta_clean$Mouse_ID)

cat("Pseudobulk matrix:", nrow(counts_clean), "genes x",
    ncol(counts_clean), "samples\n")
## Pseudobulk matrix: 26694 genes x 12 samples
print(table(meta_clean$Tissue, meta_clean$transplant_source))
##      
##       AD2.2-EM4 AD4-EM3 AD5-EM4
##   BM          2       2       2
##   CNS         2       2       2

DESeq2 — Transplant Source as Covariate

dds_clean_path <- "/exports/eddie/scratch/aduguid3/harmony_clustering/dds_clean.qs"

if (!file.exists(dds_clean_path)) {
  dds_clean <- DESeqDataSetFromMatrix(
    countData = counts_clean,
    colData   = meta_clean,
    design    = ~ transplant_source + Tissue
  )
  keep      <- rowSums(counts(dds_clean) >= 10) >= 3
  dds_clean <- dds_clean[keep, ]
  dds_clean <- DESeq(dds_clean)
  qs_save(dds_clean, dds_clean_path)
} else {
  dds_clean <- qs_read(dds_clean_path)
}
## converting counts to integer mode
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## estimating size factors
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## final dispersion estimates
## fitting model and testing
# These always run — fast operations
res_clean <- results(dds_clean,
                      contrast = c("Tissue", "CNS", "BM"),
                      alpha    = 0.05)

cat("\n--- CNS vs BM DE Results (cleaned dataset) ---\n")
## 
## --- CNS vs BM DE Results (cleaned dataset) ---
summary(res_clean)
## 
## out of 16195 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 168, 1%
## LFC < 0 (down)     : 170, 1%
## outliers [1]       : 0, 0%
## low counts [2]     : 0, 0%
## (mean count < 4)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
res_clean_df <- as.data.frame(res_clean) %>%
  rownames_to_column("gene") %>%
  filter(!is.na(padj)) %>%
  arrange(padj) %>%
  mutate(ambient_flag = gene %in% ambient_genes)

Confirm Suspect Genes Are Resolved

cat("--- Suspect gene status after cleaning ---\n")
## --- Suspect gene status after cleaning ---
res_clean_df %>%
  filter(gene %in% ambient_genes) %>%
  select(gene, log2FoldChange, baseMean, padj, ambient_flag) %>%
  mutate(across(where(is.numeric), ~round(., 3))) %>%
  arrange(desc(log2FoldChange)) %>%
  kable(caption = "Ambient RNA genes in cleaned DE results")
Ambient RNA genes in cleaned DE results
gene log2FoldChange baseMean padj ambient_flag
Sparcl1 6.716 10.022 0.000 TRUE
Slc1a2 6.647 9.545 0.000 TRUE
Apod 6.248 25.797 0.000 TRUE
Cpe 4.998 10.576 0.000 TRUE
Mobp 4.943 6.459 0.003 TRUE
S100b 4.460 5.747 0.003 TRUE
Atp1a2 3.315 4.863 0.006 TRUE
Ndrg2 2.957 15.332 0.000 TRUE
Slc1a3 1.385 4.810 0.565 TRUE
Gfap 1.194 8.840 0.327 TRUE

Compare to Pre-cleaning Results

# Load original results
res_orig_df <- read.csv("02_DE_CNSvsBM_all_mice_source_corrected.csv")

# Concordance
shared <- inner_join(
  res_orig_df  %>% select(gene, lfc_orig  = log2FoldChange),
  res_clean_df %>% select(gene, lfc_clean = log2FoldChange),
  by = "gene"
) %>%
  mutate(
    suspect  = gene %in% ambient_genes,
    resolved = suspect & abs(lfc_clean) < abs(lfc_orig)
  )

cor_val <- round(cor(shared$lfc_orig, shared$lfc_clean,
                      method = "pearson"), 3)

cat("Pearson correlation (pre vs post cleaning):", cor_val, "\n")
## Pearson correlation (pre vs post cleaning): 0.991
cat("Significant genes pre-cleaning (padj<0.05):",
    sum(res_orig_df$padj  < 0.05, na.rm = TRUE), "\n")
## Significant genes pre-cleaning (padj<0.05): 346
cat("Significant genes post-cleaning (padj<0.05):",
    sum(res_clean_df$padj < 0.05, na.rm = TRUE), "\n")
## Significant genes post-cleaning (padj<0.05): 338
ggplot(shared, aes(x = lfc_orig, y = lfc_clean, colour = suspect)) +
  geom_point(size = 0.5, alpha = 0.3) +
  geom_point(data = filter(shared, suspect), size = 2.5, alpha = 0.9) +
  ggrepel::geom_text_repel(data    = filter(shared, suspect),
                            aes(label = gene),
                            size    = 2.5,
                            max.overlaps = 20) +
  scale_colour_manual(values = c("FALSE" = "grey60",
                                  "TRUE"  = "firebrick")) +
  geom_abline(slope = 1, intercept = 0,
              linetype = "dashed", colour = "steelblue") +
  theme_classic(base_size = 12) +
  labs(x      = "log2FC — pre-cleaning",
       y      = "log2FC — post-cleaning",
       title  = paste0("DE concordance pre vs post cluster exclusion\n",
                       "Pearson r = ", cor_val),
       colour = "Ambient gene") +
  theme(legend.position = "bottom")
DE results before and after cluster exclusion

DE results before and after cluster exclusion


Black list genes

ambient_blacklist <- c("Slc1a2",  "Sparcl1", "Apod",  "Cpe",
                        "Mobp",    "S100b",   "Atp1a2","Ndrg2")

# Apply to clean DE results
res_clean_df <- res_clean_df %>%
  mutate(ambient_flag = gene %in% ambient_blacklist)

# For pathway enrichment — exclude flagged genes
res_for_enrichment <- res_clean_df %>%
  filter(!ambient_flag)

DE Results — Summary

Top CNS-Enriched Genes

res_clean_df %>%
  filter(padj < 0.05, log2FoldChange > 0,
         !ambient_flag) %>%
  head(30) %>%
  select(gene, log2FoldChange, baseMean, lfcSE, padj) %>%
  mutate(across(where(is.numeric), ~round(., 4))) %>%
  kable(caption = "Top 30 CNS-enriched genes — cleaned dataset (padj < 0.05)")
Top 30 CNS-enriched genes — cleaned dataset (padj < 0.05)
gene log2FoldChange baseMean lfcSE padj
Soat1 0.6900 2687.4740 0.0879 0e+00
Gramd3 0.7522 785.7219 0.1005 0e+00
Plaat3 0.5200 3933.6808 0.0695 0e+00
Dnm3os 0.6839 1861.8634 0.0940 0e+00
Skil 0.5328 856.8290 0.0773 0e+00
Rnf125 0.2241 9220.7788 0.0335 0e+00
Tst 0.4258 555.0761 0.0658 0e+00
Ubb 0.1738 229500.6801 0.0271 0e+00
Snn 0.5038 529.5151 0.0836 0e+00
Abhd8 0.3369 4517.2409 0.0566 0e+00
2010110G14Rik 0.4061 2218.8930 0.0691 0e+00
Gm12958 0.4700 575.9567 0.0806 0e+00
Lef1os1 0.3475 4077.7245 0.0606 0e+00
Tiam1 0.3501 4520.4763 0.0620 0e+00
Stc1 0.8787 317.4775 0.1608 0e+00
Cd96 0.5849 1682.5851 0.1078 0e+00
Fgd1 0.7965 327.1907 0.1493 0e+00
Septin5 0.6392 226.4566 0.1213 0e+00
Cdr1os 6.2049 11.0269 1.1776 0e+00
Lef1 0.3490 76435.6625 0.0665 0e+00
Pgghg 0.2682 738.3383 0.0518 1e-04
Egfem1 0.8184 987.9560 0.1597 1e-04
Jak1 0.1906 28213.1886 0.0380 1e-04
Cmtm3 0.3654 337.3029 0.0734 2e-04
Nfkbie 0.4794 1633.4494 0.0988 3e-04
Tor3a 0.3259 480.9925 0.0682 4e-04
Greb1 0.7967 415.6434 0.1666 4e-04
Sla2 0.2994 4458.1527 0.0628 4e-04
Amz2 0.1891 5936.1767 0.0397 4e-04
Gpr83 0.8085 1918.6277 0.1708 5e-04

Top BM-Enriched Genes

res_clean_df %>%
  filter(padj < 0.05, log2FoldChange < 0,
         !ambient_flag) %>%
  arrange(log2FoldChange) %>%
  head(30) %>%
  select(gene, log2FoldChange, baseMean, lfcSE, padj) %>%
  mutate(across(where(is.numeric), ~round(., 4))) %>%
  kable(caption = "Top 30 BM-enriched genes — cleaned dataset (padj < 0.05)")
Top 30 BM-enriched genes — cleaned dataset (padj < 0.05)
gene log2FoldChange baseMean lfcSE padj
Ighv1-55 -6.6472 156.2144 1.2691 0.0000
Igkv8-27 -6.3627 309.3203 1.1093 0.0000
Igkv10-96 -4.9884 58.2785 1.3255 0.0142
Igkv6-23 -4.4524 58.5085 1.3419 0.0452
Iglc1 -4.3387 12.2966 0.9317 0.0007
Iglv1 -4.2280 77.8822 1.0427 0.0058
Gm2682 -4.2260 44.9843 1.1035 0.0119
Gm57200 -3.6044 9.9410 1.0016 0.0220
Lpl -3.5327 10.2122 0.6106 0.0000
Cd5l -3.4785 24.8795 0.6469 0.0000
Clec7a -3.3886 21.8621 0.5261 0.0000
Jchain -3.1459 254.8241 0.6442 0.0003
Igha -3.0918 292.1313 0.6310 0.0002
Spic -2.9823 12.6075 0.6281 0.0005
Atp6v0d2 -2.8764 20.3445 0.4227 0.0000
Mgst1 -2.8658 7.7314 0.6057 0.0005
Gm30211 -2.8449 38.8918 0.5129 0.0000
Igkc -2.8245 186.8446 0.5675 0.0002
Epb41l3 -2.8117 17.9392 0.4122 0.0000
Tgm2 -2.7874 6.4121 0.6300 0.0016
Fcna -2.6913 15.3077 0.4695 0.0000
Pparg -2.6005 6.8941 0.7700 0.0393
Iglc2 -2.5899 8.2540 0.6778 0.0120
Il6ra -2.5841 6.5994 0.7705 0.0419
Vcam1 -2.5787 26.3177 0.3959 0.0000
Slc40a1 -2.5401 13.8461 0.4579 0.0000
Hebp1 -2.4878 9.5473 0.5872 0.0031
Tnfaip2 -2.4509 15.3431 0.5626 0.0021
Il18 -2.4213 11.0189 0.4332 0.0000
Hpgd -2.4197 9.1420 0.6143 0.0083

Volcano Plot

res_clean_df %>%
  filter(!ambient_flag) %>%
  mutate(
    direction = case_when(
      log2FoldChange >  0.5 & padj < 0.05 ~ "CNS-enriched",
      log2FoldChange < -0.5 & padj < 0.05 ~ "BM-enriched",
      TRUE                                 ~ "NS"
    ),
    label = ifelse(abs(log2FoldChange) > 1 & padj < 0.01 &
                     !ambient_flag, gene, NA)
  ) %>%
  ggplot(aes(x = log2FoldChange, y = -log10(padj),
             colour = direction)) +
  geom_point(size = 0.6, alpha = 0.6) +
  geom_text_repel(aes(label = label), size = 2.5, max.overlaps = 20) +
  scale_colour_manual(
    values = c("CNS-enriched" = "firebrick",
               "BM-enriched"  = "steelblue",
               "NS"           = "grey70")) +
  geom_vline(xintercept = c(-0.5, 0.5),
             linetype = "dashed", colour = "grey40") +
  geom_hline(yintercept = -log10(0.05),
             linetype = "dashed", colour = "grey40") +
  theme_classic(base_size = 12) +
  labs(x      = "log2FC (CNS vs BM)",
       y      = "-log10 adjusted p-value",
       title  = "CNS vs BM — cleaned dataset\n(ambient RNA clusters excluded)",
       colour = NULL) +
  theme(legend.position = "bottom")
## Warning: Removed 16144 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
Volcano plot — CNS vs BM, cleaned dataset

Volcano plot — CNS vs BM, cleaned dataset

PCA of Pseudobulk Samples

vsd_clean <- vst(dds_clean, blind = FALSE)

pca_data <- plotPCA(vsd_clean,
                     intgroup = c("Tissue", "transplant_source"),
                     returnData = TRUE)
## using ntop=500 top features by variance
pct_var  <- round(100 * attr(pca_data, "percentVar"), 1)

ggplot(pca_data,
       aes(x = PC1, y = PC2,
           colour = transplant_source,
           shape  = Tissue,
           label  = name)) +
  geom_point(size = 4) +
  geom_text_repel(size = 2.8) +
  scale_colour_manual(values = source_cols) +
  scale_shape_manual(values = c("BM" = 16, "CNS" = 17)) +
  theme_classic(base_size = 12) +
  labs(x      = paste0("PC1: ", pct_var[1], "% variance"),
       y      = paste0("PC2: ", pct_var[2], "% variance"),
       title  = "Pseudobulk PCA — cleaned dataset",
       colour = "Transplant source",
       shape  = "Tissue")
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's colour values.
## No shared levels found between `names(values)` of the manual scale and the
## data's colour values.
## No shared levels found between `names(values)` of the manual scale and the
## data's colour values.
Pseudobulk PCA after cluster exclusion

Pseudobulk PCA after cluster exclusion


Save Outputs

# Save DE results — full and filtered
write.csv(res_clean_df,
           "04_DE_CNSvsBM_clean.csv",
           row.names = FALSE)

# Save CNS signature (clean, no ambient genes)
res_clean_df %>%
  filter(padj < 0.05, log2FoldChange > 0.5, !ambient_flag) %>%
  write.csv("04_CNS_signature_clean.csv", row.names = FALSE)

# Save BM signature (clean)
res_clean_df %>%
  filter(padj < 0.05, log2FoldChange < -0.5, !ambient_flag) %>%
  write.csv("04_BM_signature_clean.csv", row.names = FALSE)

cat("Saved:\n")
## Saved:
cat("04_leuk_CNS_clean.qs          —", ncol(leuk_CNS_clean), "cells\n")
## 04_leuk_CNS_clean.qs          — 47017 cells
cat("04_DE_CNSvsBM_clean.csv\n")
## 04_DE_CNSvsBM_clean.csv
cat("04_CNS_signature_clean.csv\n")
## 04_CNS_signature_clean.csv
cat("04_BM_signature_clean.csv\n")
## 04_BM_signature_clean.csv

Session Information

sessionInfo()
## R version 4.4.1 (2024-06-14)
## Platform: x86_64-pc-linux-gnu
## Running under: Rocky Linux 9.5 (Blue Onyx)
## 
## Matrix products: default
## BLAS/LAPACK: /opt/intel/oneapi/mkl/2024.0/lib/libmkl_gf_lp64.so.2;  LAPACK version 3.10.1
## 
## locale:
##  [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_GB.UTF-8        LC_COLLATE=en_GB.UTF-8    
##  [5] LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
##  [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: Europe/London
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] scales_1.4.0                tidyr_1.3.1                
##  [3] tibble_3.3.1                ggrepel_0.9.6              
##  [5] qs2_0.1.7                   knitr_1.51                 
##  [7] DESeq2_1.46.0               SummarizedExperiment_1.36.0
##  [9] Biobase_2.66.0              MatrixGenerics_1.18.1      
## [11] matrixStats_1.5.0           GenomicRanges_1.58.0       
## [13] GenomeInfoDb_1.42.3         IRanges_2.40.1             
## [15] S4Vectors_0.44.0            BiocGenerics_0.52.0        
## [17] dplyr_1.1.4                 patchwork_1.3.2            
## [19] ggplot2_4.0.2               Seurat_5.4.0               
## [21] SeuratObject_5.3.0          sp_2.1-4                   
## 
## loaded via a namespace (and not attached):
##   [1] RColorBrewer_1.1-3      rstudioapi_0.16.0       jsonlite_2.0.0         
##   [4] magrittr_2.0.3          ggbeeswarm_0.7.3        spatstat.utils_3.2-1   
##   [7] farver_2.1.2            rmarkdown_2.30          zlibbioc_1.52.0        
##  [10] vctrs_0.6.5             ROCR_1.0-12             spatstat.explore_3.7-0 
##  [13] S4Arrays_1.6.0          htmltools_0.5.8.1       SparseArray_1.6.2      
##  [16] sass_0.4.9              sctransform_0.4.3       parallelly_1.37.1      
##  [19] KernSmooth_2.23-24      bslib_0.7.0             htmlwidgets_1.6.4      
##  [22] ica_1.0-3               plyr_1.8.9              plotly_4.12.0          
##  [25] zoo_1.8-15              cachem_1.1.0            igraph_2.2.2           
##  [28] mime_0.12               lifecycle_1.0.5         pkgconfig_2.0.3        
##  [31] Matrix_1.7-0            R6_2.6.1                fastmap_1.2.0          
##  [34] GenomeInfoDbData_1.2.13 fitdistrplus_1.2-6      future_1.33.2          
##  [37] shiny_1.12.1            digest_0.6.35           tensor_1.5.1           
##  [40] RSpectra_0.16-2         irlba_2.3.7             labeling_0.4.3         
##  [43] progressr_0.18.0        fansi_1.0.6             spatstat.sparse_3.1-0  
##  [46] httr_1.4.7              polyclip_1.10-7         abind_1.4-5            
##  [49] compiler_4.4.1          withr_3.0.2             S7_0.2.1               
##  [52] BiocParallel_1.40.2     fastDummies_1.7.5       MASS_7.3-60.2          
##  [55] DelayedArray_0.32.0     tools_4.4.1             vipor_0.4.7            
##  [58] lmtest_0.9-40           otel_0.2.0              beeswarm_0.4.0         
##  [61] httpuv_1.6.16           future.apply_1.11.2     goftest_1.2-3          
##  [64] glue_1.8.0              nlme_3.1-164            promises_1.5.0         
##  [67] grid_4.4.1              Rtsne_0.17              cluster_2.1.6          
##  [70] reshape2_1.4.4          generics_0.1.3          gtable_0.3.6           
##  [73] spatstat.data_3.1-9     data.table_1.15.4       stringfish_0.18.0      
##  [76] utf8_1.2.4              XVector_0.46.0          spatstat.geom_3.7-0    
##  [79] RcppAnnoy_0.0.23        RANN_2.6.2              pillar_1.9.0           
##  [82] stringr_1.5.1           spam_2.11-3             RcppHNSW_0.6.0         
##  [85] later_1.4.6             splines_4.4.1           lattice_0.22-6         
##  [88] survival_3.6-4          deldir_2.0-4            tidyselect_1.2.1       
##  [91] locfit_1.5-9.12         miniUI_0.1.2            pbapply_1.7-4          
##  [94] gridExtra_2.3           scattermore_1.2         xfun_0.56              
##  [97] stringi_1.8.4           UCSC.utils_1.2.0        lazyeval_0.2.2         
## [100] yaml_2.3.12             evaluate_1.0.5          codetools_0.2-20       
## [103] cli_3.6.5               RcppParallel_5.1.8      uwot_0.2.4             
## [106] xtable_1.8-4            reticulate_1.45.0       jquerylib_0.1.4        
## [109] dichromat_2.0-0.1       Rcpp_1.0.12             globals_0.16.3         
## [112] spatstat.random_3.4-4   png_0.1-8               ggrastr_1.0.2          
## [115] spatstat.univar_3.1-6   parallel_4.4.1          dotCall64_1.2          
## [118] listenv_0.9.1           viridisLite_0.4.2       ggridges_0.5.6         
## [121] crayon_1.5.2            purrr_1.0.2             rlang_1.1.7            
## [124] cowplot_1.2.0