Overview

Clustree and UMAP analysis of the CNS and BM leukaemia subsets revealed two transcriptionally distinct populations forming completely separate cluster trees from as early as res 0.04. Investigation of sample distribution showed that mice 1838279 and 1838280 consistently segregated from the other four mice in both BM and CNS compartments.

QC metrics confirmed this was not a technical artefact — mitochondrial percentage was consistent across all samples (2.33–2.56%), with no evidence of differential cell quality. Metadata interrogation revealed that this separation was attributable to transplant source: mice 1838279 and 1838280 received leukaemia from donor AD5_EM4, while the remaining four mice received leukaemia from AD4_EM3 (n=2) or AD2.2_EM4 (n=2).

Transplant Source Mouse IDs n BM cells n CNS cells
AD4_EM3 1838161, 1838162 ~12,494 ~17,701
AD2.2_EM4 1838163, 1838171 ~11,122 ~11,968
AD5_EM4 1838279, 1838280 ~19,991 ~15,551

To address this variability, two complementary analytical approaches are taken:

Results from both approaches are compared to identify findings that are robust regardless of transplant source inclusion.


Setup

library(Seurat)
## Loading required package: SeuratObject
## Loading required package: sp
## 
## Attaching package: 'SeuratObject'
## The following objects are masked from 'package:base':
## 
##     intersect, t
library(ggplot2)
library(patchwork)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(DESeq2)
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:dplyr':
## 
##     combine, intersect, setdiff, union
## The following object is masked from 'package:SeuratObject':
## 
##     intersect
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, saveRDS, setdiff,
##     table, tapply, union, unique, unsplit, which.max, which.min
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:dplyr':
## 
##     first, rename
## The following object is masked from 'package:utils':
## 
##     findMatches
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## Loading required package: IRanges
## 
## Attaching package: 'IRanges'
## The following objects are masked from 'package:dplyr':
## 
##     collapse, desc, slice
## The following object is masked from 'package:sp':
## 
##     %over%
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
## 
## Attaching package: 'matrixStats'
## The following object is masked from 'package:dplyr':
## 
##     count
## 
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
## 
##     rowMedians
## The following objects are masked from 'package:matrixStats':
## 
##     anyMissing, rowMedians
## 
## Attaching package: 'SummarizedExperiment'
## The following object is masked from 'package:Seurat':
## 
##     Assays
## The following object is masked from 'package:SeuratObject':
## 
##     Assays
library(knitr)
library(qs2)
## qs2 0.1.7
library(viridis)
## Loading required package: viridisLite
library(clustree)
## Loading required package: ggraph
## 
## Attaching package: 'ggraph'
## The following object is masked from 'package:sp':
## 
##     geometry
library(tidyr)
## 
## Attaching package: 'tidyr'
## The following object is masked from 'package:S4Vectors':
## 
##     expand
library(ggrepel)
library(tibble)
tissue_cols <- c("BM" = "steelblue", "CNS" = "firebrick")

source_cols <- c("AD4_EM3"   = "#E69F00",
                 "AD2.2_EM4" = "#56B4E9",
                 "AD5_EM4"   = "#CC79A7")

# Resolution columns — update if different
bm_res_col  <- "originalexp_snn_res.0.3"
cns_res_col <- "originalexp_snn_res.0.3"
# Load objects if starting fresh session
leuk_BM  <- qs_read("/exports/eddie/scratch/aduguid3/harmony_clustering/04_harmony_recluster_leuk_BM_march26.qs")
leuk_CNS <- qs_read("/exports/eddie/scratch/aduguid3/harmony_clustering/04_harmony_recluster_leuk_CNS_march26.qs")

stopifnot("leuk_BM not found"  = exists("leuk_BM"))
stopifnot("leuk_CNS not found" = exists("leuk_CNS"))

cat("BM  cells:", ncol(leuk_BM),  "\n")
## BM  cells: 44606
cat("CNS cells:", ncol(leuk_CNS), "\n")
## CNS cells: 47220

Characterise the Transplant Source Effect

Before subsetting, the transcriptional differences between transplant sources are characterised to determine whether AD5_EM4 represents a distinct disease state, a more advanced passage, or simply stochastic inter-clonal variation.

Visualise Transplant Source on UMAP

p1 <- DimPlot(leuk_BM,
               reduction = "umap",
               group.by  = "transplant_source",
               cols      = source_cols,
               alpha     = 0.5) +
  ggtitle("BM — by transplant source")

p2 <- DimPlot(leuk_CNS,
               reduction = "umap",
               group.by  = "transplant_source",
               cols      = source_cols,
               alpha     = 0.5) +
  ggtitle("CNS — by transplant source")

p1 + p2
Transplant source distribution on BM and CNS UMAPs

Transplant source distribution on BM and CNS UMAPs

# Quantify how well sources mix in BM
# If one source dominates specific clusters, mixing is incomplete
leuk_BM@meta.data %>%
  group_by(.data[[bm_res_col]], transplant_source) %>%
  summarise(n = n(), .groups = "drop") %>%
  group_by(.data[[bm_res_col]]) %>%
  mutate(prop = round(n / sum(n), 3)) %>%
  tidyr::pivot_wider(names_from  = transplant_source,
                     values_from = prop,
                     values_fill = 0) %>%
  knitr::kable(caption = "Transplant source proportion per BM cluster")
Transplant source proportion per BM cluster
originalexp_snn_res.0.3 n AD2.2_EM4 AD4_EM3 AD5_EM4
0 76 0.007 0.000 0.000
0 15 0.000 0.001 0.000
0 10427 0.000 0.000 0.991
1 1568 0.288 0.000 0.000
1 3861 0.000 0.709 0.000
1 17 0.000 0.000 0.003
2 1712 0.320 0.000 0.000
2 3493 0.000 0.652 0.000
2 153 0.000 0.000 0.029
3 4727 0.974 0.000 0.000
3 28 0.000 0.006 0.000
3 98 0.000 0.000 0.020
4 44 0.009 0.000 0.000
4 12 0.000 0.003 0.000
4 4628 0.000 0.000 0.988
5 1134 0.268 0.000 0.000
5 3080 0.000 0.729 0.000
5 11 0.000 0.000 0.003
6 319 0.087 0.000 0.000
6 5 0.000 0.001 0.000
6 3346 0.000 0.000 0.912
7 537 0.251 0.000 0.000
7 520 0.000 0.243 0.000
7 1083 0.000 0.000 0.506
8 560 0.303 0.000 0.000
8 1275 0.000 0.689 0.000
8 15 0.000 0.000 0.008
9 1175 0.974 0.000 0.000
9 31 0.000 0.000 0.026
10 267 0.471 0.000 0.000
10 162 0.000 0.286 0.000
10 138 0.000 0.000 0.243
11 43 0.483 0.000 0.000
11 2 0.000 0.022 0.000
11 44 0.000 0.000 0.494

Transplant Source Effect

UMAP visualisation coloured by transplant source confirmed that AD4_EM3 and AD2.2_EM4 cells intermix freely in both BM and CNS compartments, validating their use as a matched analytical cohort. AD5_EM4 cells form a completely separate transcriptional island in both tissues, consistent with a fundamentally distinct leukaemia biology that is independent of tissue niche. This separation is therefore not attributable to CNS-specific adaptation but reflects intrinsic properties of the AD5_EM4 leukaemia. All primary CNS vs BM analyses are performed on the matched cohort (AD4_EM3 + AD2.2_EM4, n=4 mice), with AD5_EM4 characterised separately. Results are validated using Option 2 (all six mice, transplant_source as DESeq2 covariate) to confirm robustness.

QC Metrics by Transplant Source

# BM QC
bm_qc <- leuk_BM@meta.data %>%
  group_by(Mouse_ID, transplant_source) %>%
  summarise(
    n_cells       = n(),
    mean_nFeature = round(mean(nFeature_originalexp), 0),
    mean_nCount   = round(mean(nCount_originalexp),   0),
    mean_mt       = round(mean(subsets_mt_percent),   2),
    mean_ribo     = round(mean(subsets_ribo_pct),     2),
    .groups       = "drop"
  )

kable(bm_qc, caption = "BM QC metrics by mouse and transplant source")
BM QC metrics by mouse and transplant source
Mouse_ID transplant_source n_cells mean_nFeature mean_nCount mean_mt mean_ribo
1838161 AD4_EM3 5063 4618 20197 2.56 24.18
1838162 AD4_EM3 7390 4326 18343 2.34 26.72
1838163 AD2.2_EM4 5084 4338 18092 2.55 26.21
1838171 AD2.2_EM4 7078 4511 18854 2.36 24.17
1838279 AD5_EM4 9927 4910 21618 2.44 23.12
1838280 AD5_EM4 10064 4597 19361 2.44 25.12
# CNS QC
cns_qc <- leuk_CNS@meta.data %>%
  group_by(Mouse_ID, transplant_source) %>%
  summarise(
    n_cells       = n(),
    mean_nFeature = round(mean(nFeature_originalexp), 0),
    mean_nCount   = round(mean(nCount_originalexp),   0),
    mean_mt       = round(mean(subsets_mt_percent),   2),
    mean_ribo     = round(mean(subsets_ribo_pct),     2),
    .groups       = "drop"
  )

kable(cns_qc, caption = "CNS QC metrics by mouse and transplant source")
CNS QC metrics by mouse and transplant source
Mouse_ID transplant_source n_cells mean_nFeature mean_nCount mean_mt mean_ribo
1838161 AD4_EM3 8680 4223 17478 2.52 26.06
1838162 AD4_EM3 9021 4247 17640 2.36 26.35
1838163 AD2.2_EM4 6542 4397 18775 2.42 26.39
1838171 AD2.2_EM4 7426 4415 18102 2.33 25.06
1838279 AD5_EM4 8497 4880 21859 2.39 23.83
1838280 AD5_EM4 7054 4885 22172 2.33 24.82

Revised Analytical Strategy

Quantification of transplant source composition per cluster revealed that BM leukaemia clusters are almost entirely source-specific, with individual clusters dominated (>90%) by cells from a single transplant source. This pattern indicates that all three transplant sources — AD2.2_EM4, AD4_EM3, and AD5_EM4 — are transcriptionally distinct in BM, precluding the use of any subset as a truly matched cohort (Option 1).

Consequently, all primary analyses use Option 2 — retaining all six mice with transplant_source included as a covariate in the DESeq2 pseudobulk design. This approach explicitly models inter-source variability while preserving maximum statistical power for detecting genuine CNS vs BM differences:

design = ~ transplant_source + Tissue

Findings from this analysis represent CNS vs BM differences that are consistent across all three transplant sources, making them robust to inter-animal leukaemia variability.

load data

leuk_all <- qs_read("/exports/eddie/scratch/aduguid3/harmony_clustering/03_harmony_recluster_leuk_only_cellcycleanno_march26.qs")
leuk_BM <- qs_read("/exports/eddie/scratch/aduguid3/harmony_clustering/04_harmony_recluster_leuk_BM_march26.qs")
leuk_CNS <- qs_read("/exports/eddie/scratch/aduguid3/harmony_clustering/04_harmony_recluster_leuk_CNS_march26.qs")

cat("Total cells (all mice):", ncol(leuk_all), "\n")
## Total cells (all mice): 91826
cat("By tissue:\n")
## By tissue:
print(table(leuk_all$Tissue))
## 
##    BM   CNS 
## 44606 47220
cat("By source:\n")
## By source:
print(table(leuk_all$transplant_source))
## 
## AD2.2_EM4   AD4_EM3   AD5_EM4 
##     26130     30154     35542

Primary Analysis — Pseudobulk DE: CNS vs BM (All Mice)

Pseudobulk differential expression analysis is performed by aggregating counts per sample per tissue, then applying DESeq2 with transplant_source as a covariate. This design identifies genes differentially expressed between CNS and BM leukaemia that are consistent across all three transplant sources.

pseudo_qs_path <- "/exports/eddie/scratch/aduguid3/harmony_clustering/pseudo_all.qs"

if (!file.exists(pseudo_qs_path)) {
  pseudo_all <- AggregateExpression(
    leuk_all,
    group.by      = c("Mouse_ID", "Tissue", "transplant_source"),
    return.seurat = TRUE
  )
  qs_save(pseudo_all, pseudo_qs_path)
} else {
  pseudo_all <- qs_read(pseudo_qs_path)
}
## Names of identity class contain underscores ('_'), replacing with dashes ('-')
## First group.by variable `Mouse_ID` starts with a number, appending `g` to ensure valid variable names
## Centering and scaling data matrix
## 
## This message is displayed once every 8 hours.
counts_all <- GetAssayData(pseudo_all, layer = "counts")
meta_all   <- pseudo_all@meta.data
meta_all$Tissue            <- factor(meta_all$Tissue, levels = c("BM", "CNS"))
meta_all$transplant_source <- factor(meta_all$transplant_source)
meta_all$Mouse_ID          <- factor(meta_all$Mouse_ID)

cat("Pseudobulk matrix dimensions:", 
    nrow(counts_all), "genes x", ncol(counts_all), "samples\n")
## Pseudobulk matrix dimensions: 26694 genes x 12 samples
cat("\nSamples per group:\n")
## 
## Samples per group:
print(table(meta_all$Tissue, meta_all$transplant_source))
##      
##       AD2.2-EM4 AD4-EM3 AD5-EM4
##   BM          2       2       2
##   CNS         2       2       2

DESeq2 — Transplant Source as Covariate

dds_all <- DESeqDataSetFromMatrix(
  countData = counts_all,
  colData   = meta_all,
  design    = ~ transplant_source + Tissue
)
## converting counts to integer mode
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
# Filter lowly expressed genes — detected in at least 3 samples
keep_all <- rowSums(counts(dds_all) >= 10) >= 3
dds_all  <- dds_all[keep_all, ]
cat("Genes retained after filtering:", nrow(dds_all), "\n")
## Genes retained after filtering: 16228
dds_all <- DESeq(dds_all)
## estimating size factors
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## final dispersion estimates
## fitting model and testing
# Extract CNS vs BM results
res_all <- results(dds_all,
                    contrast = c("Tissue", "CNS", "BM"),
                    alpha    = 0.05)

cat("\n--- CNS vs BM DE Results (all mice, source-corrected) ---\n")
## 
## --- CNS vs BM DE Results (all mice, source-corrected) ---
summary(res_all)
## 
## out of 16228 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 182, 1.1%
## LFC < 0 (down)     : 164, 1%
## outliers [1]       : 0, 0%
## low counts [2]     : 0, 0%
## (mean count < 4)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
# Convert to dataframe
res_all_df <- as.data.frame(res_all) %>%
  tibble::rownames_to_column("gene") %>%
  filter(!is.na(padj)) %>%
  arrange(padj)

vsd <- vst(dds_all, blind = FALSE)

Top CNS-Enriched Genes

res_all_df %>%
  filter(padj < 0.05, log2FoldChange > 0) %>%
  head(30) %>%
  select(gene, log2FoldChange, lfcSE, pvalue, padj) %>%
  mutate(across(where(is.numeric), ~round(., 4))) %>%
  kable(caption = "Top 30 CNS-enriched genes (padj < 0.05)")
Top 30 CNS-enriched genes (padj < 0.05)
gene log2FoldChange lfcSE pvalue padj
Sparcl1 8.2725 0.9369 0 0
Slc1a2 8.3980 0.9819 0 0
Soat1 0.6893 0.0878 0 0
Plaat3 0.5193 0.0691 0 0
Gramd3 0.7510 0.1003 0 0
Dnm3os 0.6837 0.0940 0 0
Ndrg2 4.0850 0.5661 0 0
Skil 0.5322 0.0771 0 0
Rnf125 0.2226 0.0334 0 0
Tst 0.4295 0.0660 0 0
Apod 6.3194 0.9709 0 0
Ubb 0.1730 0.0273 0 0
Mt3 2.8308 0.4657 0 0
Snn 0.5055 0.0834 0 0
Cpe 7.1025 1.1862 0 0
Abhd8 0.3362 0.0564 0 0
2010110G14Rik 0.4054 0.0692 0 0
Gm12958 0.4700 0.0803 0 0
Lef1os1 0.3453 0.0599 0 0
Tspan7 2.8200 0.4950 0 0
Tiam1 0.3494 0.0619 0 0
Mobp 5.7997 1.0519 0 0
S100b 5.2989 0.9640 0 0
Stc1 0.8773 0.1611 0 0
Atp1a2 4.3096 0.7947 0 0
Cd96 0.5839 0.1078 0 0
Septin5 0.6515 0.1205 0 0
Fgd1 0.8003 0.1489 0 0
Lef1 0.3481 0.0664 0 0
Cdr1os 7.8196 1.4906 0 0

Top BM-Enriched Genes

res_all_df %>%
  filter(padj < 0.05, log2FoldChange < 0) %>%
  arrange(log2FoldChange) %>%
  head(30) %>%
  select(gene, log2FoldChange, lfcSE, pvalue, padj) %>%
  mutate(across(where(is.numeric), ~round(., 4))) %>%
  kable(caption = "Top 30 BM-enriched genes (padj < 0.05)")
Top 30 BM-enriched genes (padj < 0.05)
gene log2FoldChange lfcSE pvalue padj
Ighv1-55 -6.6518 1.2694 0e+00 0.0000
Igkv8-27 -6.3693 1.1094 0e+00 0.0000
Igkv10-96 -4.9947 1.3257 2e-04 0.0139
Igkv6-23 -4.4578 1.3423 9e-04 0.0444
Iglc1 -4.3471 0.9336 0e+00 0.0006
Iglv1 -4.2328 1.0433 0e+00 0.0055
Gm2682 -4.1528 1.1632 4e-04 0.0232
Gm57200 -3.6115 1.0042 3e-04 0.0218
Lpl -3.5416 0.6125 0e+00 0.0000
Cd5l -3.4865 0.6491 0e+00 0.0000
Clec7a -3.3970 0.5272 0e+00 0.0000
Jchain -3.1490 0.6450 0e+00 0.0002
Igha -3.0752 0.6227 0e+00 0.0002
Spic -2.9891 0.6304 0e+00 0.0004
Gm30211 -2.8528 0.5141 0e+00 0.0000
Igkc -2.8307 0.5685 0e+00 0.0002
Tgm2 -2.7946 0.6324 0e+00 0.0015
Atp6v0d2 -2.7865 0.4278 0e+00 0.0000
Epb41l3 -2.6724 0.4021 0e+00 0.0000
Fcna -2.6587 0.4735 0e+00 0.0000
Mgst1 -2.6402 0.5852 0e+00 0.0011
Pparg -2.6087 0.7735 7e-04 0.0395
Il6ra -2.5915 0.7730 8e-04 0.0415
Vcam1 -2.5860 0.3969 0e+00 0.0000
Slc40a1 -2.4988 0.4524 0e+00 0.0000
Hebp1 -2.4970 0.5896 0e+00 0.0030
Iglc2 -2.4824 0.6247 1e-04 0.0074
Tnfaip2 -2.4580 0.5641 0e+00 0.0019
Ccl6 -2.3027 0.4677 0e+00 0.0002
Rhob -2.3022 0.6038 1e-04 0.0121

Volcano Plot

res_all_df %>%
  mutate(
    direction = case_when(
      log2FoldChange >  0.5 & padj < 0.05 ~ "CNS-enriched",
      log2FoldChange < -0.5 & padj < 0.05 ~ "BM-enriched",
      TRUE                                 ~ "NS"
    ),
    label = ifelse(abs(log2FoldChange) > 1 & padj < 0.01, gene, NA)
  ) %>%
  ggplot(aes(x      = log2FoldChange,
             y      = -log10(padj),
             colour = direction)) +
  geom_point(size = 0.6, alpha = 0.6) +
  ggrepel::geom_text_repel(aes(label = label),
                            size = 2.5, max.overlaps = 20) +
  scale_colour_manual(
    values = c("CNS-enriched" = "firebrick",
               "BM-enriched"  = "steelblue",
               "NS"           = "grey70")) +
  geom_vline(xintercept = c(-0.5, 0.5),
             linetype = "dashed", colour = "grey40") +
  geom_hline(yintercept = -log10(0.05),
             linetype = "dashed", colour = "grey40") +
  theme_classic(base_size = 12) +
  labs(x      = "log2FC (CNS vs BM)",
       y      = "-log10 adjusted p-value",
       title  = "CNS vs BM — all mice, transplant source corrected",
       colour = NULL) +
  theme(legend.position = "bottom")
## Warning: Removed 16171 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
CNS vs BM differential expression — all mice, source-corrected

CNS vs BM differential expression — all mice, source-corrected

MA Plot

plotMA(res_all, 
       alpha = 0.05,
       main  = "MA plot — CNS vs BM (source-corrected)",
       xlab  = "Mean expression",
       ylab  = "log2FC (CNS vs BM)")
MA plot — CNS vs BM

MA plot — CNS vs BM

PCA of Pseudobulk Samples

Confirms that samples separate by tissue after accounting for source effects.

p_pca1 <- plotPCA(vsd, intgroup = "Tissue") +
  scale_colour_manual(values = c("BM" = "steelblue", "CNS" = "firebrick")) +
  theme_classic(base_size = 12) +
  ggtitle("PCA — by tissue")
## using ntop=500 top features by variance
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## ℹ The deprecated feature was likely used in the DESeq2 package.
##   Please report the issue to the authors.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Extract PCA data manually for more control
pca_data <- plotPCA(vsd, intgroup = c("Tissue", "transplant_source"),
                     returnData = TRUE)
## using ntop=500 top features by variance
# Check the data and identify the outlier
print(pca_data)
##                               PC1        PC2         group Tissue
## g1838161_BM_AD4-EM3     -7.711282 -6.9929883    BM:AD4-EM3     BM
## g1838161_CNS_AD4-EM3    -7.328604 -6.5420863   CNS:AD4-EM3    CNS
## g1838162_BM_AD4-EM3     -8.818700 -6.8091830    BM:AD4-EM3     BM
## g1838162_CNS_AD4-EM3    -9.216987 -7.2053986   CNS:AD4-EM3    CNS
## g1838163_BM_AD2.2-EM4  -11.938870  4.5645561  BM:AD2.2-EM4     BM
## g1838163_CNS_AD2.2-EM4  -8.929551  4.4899354 CNS:AD2.2-EM4    CNS
## g1838171_BM_AD2.2-EM4    0.711890 14.2837789  BM:AD2.2-EM4     BM
## g1838171_CNS_AD2.2-EM4 -11.490799  7.4016225 CNS:AD2.2-EM4    CNS
## g1838279_BM_AD5-EM4     17.344695 -1.3927350    BM:AD5-EM4     BM
## g1838279_CNS_AD5-EM4    17.868032 -2.4647185   CNS:AD5-EM4    CNS
## g1838280_BM_AD5-EM4     14.214978  0.8031972    BM:AD5-EM4     BM
## g1838280_CNS_AD5-EM4    15.295197 -0.1359804   CNS:AD5-EM4    CNS
##                        transplant_source                   name
## g1838161_BM_AD4-EM3              AD4-EM3    g1838161_BM_AD4-EM3
## g1838161_CNS_AD4-EM3             AD4-EM3   g1838161_CNS_AD4-EM3
## g1838162_BM_AD4-EM3              AD4-EM3    g1838162_BM_AD4-EM3
## g1838162_CNS_AD4-EM3             AD4-EM3   g1838162_CNS_AD4-EM3
## g1838163_BM_AD2.2-EM4          AD2.2-EM4  g1838163_BM_AD2.2-EM4
## g1838163_CNS_AD2.2-EM4         AD2.2-EM4 g1838163_CNS_AD2.2-EM4
## g1838171_BM_AD2.2-EM4          AD2.2-EM4  g1838171_BM_AD2.2-EM4
## g1838171_CNS_AD2.2-EM4         AD2.2-EM4 g1838171_CNS_AD2.2-EM4
## g1838279_BM_AD5-EM4              AD5-EM4    g1838279_BM_AD5-EM4
## g1838279_CNS_AD5-EM4             AD5-EM4   g1838279_CNS_AD5-EM4
## g1838280_BM_AD5-EM4              AD5-EM4    g1838280_BM_AD5-EM4
## g1838280_CNS_AD5-EM4             AD5-EM4   g1838280_CNS_AD5-EM4
# Percentage variance
pct_var <- round(100 * attr(pca_data, "percentVar"), 1)

# Plot with correct colour mapping
p_pca2 <- ggplot(pca_data, 
                 aes(x      = PC1, 
                     y      = PC2,
                     colour = transplant_source,
                     shape  = Tissue,
                     label  = name)) +
  geom_point(size = 4) +
  ggrepel::geom_text_repel(size = 3) +
  scale_colour_manual(values = c("AD4_EM3"   = "#E69F00",
                                  "AD2.2_EM4" = "#56B4E9",
                                  "AD5_EM4"   = "#CC79A7")) +
  scale_shape_manual(values = c("BM" = 16, "CNS" = 17)) +
  theme_classic(base_size = 12) +
  labs(x      = paste0("PC1: ", pct_var[1], "% variance"),
       y      = paste0("PC2: ", pct_var[2], "% variance"),
       title  = "Pseudobulk PCA — tissue and transplant source",
       colour = "Transplant source",
       shape  = "Tissue")

p_pca2
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's colour values.
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's colour values.
## No shared levels found between `names(values)` of the manual scale and the
## data's colour values.
PCA of pseudobulk samples

PCA of pseudobulk samples

p_pca1 + p_pca2
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's colour values.
## No shared levels found between `names(values)` of the manual scale and the
## data's colour values.
## No shared levels found between `names(values)` of the manual scale and the
## data's colour values.
PCA of pseudobulk samples

PCA of pseudobulk samples

Pseudobulk PCA

Principal component analysis of pseudobulk samples (aggregated per mouse per tissue) revealed that PC1 (58% variance) separates samples by transplant source, confirming that inter-source variability is the dominant transcriptional signal in this dataset. PC2 (17% variance) separates BM from CNS within each transplant source group, demonstrating that a consistent tissue-specific transcriptional difference is detectable across all three sources. This separation justifies the DESeq2 design including transplant_source as a covariate, which removes the PC1 source effect and allows detection of the CNS vs BM signal captured in PC2. One BM sample (1838171, AD2.2_EM4) showed notable separation from its paired CNS sample on PC2 and is flagged for sensitivity analysis.

Sensitivity check

# Re-run DESeq2 excluding 1838171 to confirm results are not driven by outlier
samples_no_outlier <- colnames(counts_all)[
  !grepl("1838171", colnames(counts_all))
]

dds_sensitivity <- DESeqDataSetFromMatrix(
  countData = counts_all[, samples_no_outlier],
  colData   = meta_all[samples_no_outlier, ],
  design    = ~ transplant_source + Tissue
)
## converting counts to integer mode
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
keep_s <- rowSums(counts(dds_sensitivity) >= 10) >= 3
dds_sensitivity <- dds_sensitivity[keep_s, ]
dds_sensitivity <- DESeq(dds_sensitivity)
## estimating size factors
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## final dispersion estimates
## fitting model and testing
res_sensitivity <- results(dds_sensitivity,
                            contrast = c("Tissue", "CNS", "BM"),
                            alpha    = 0.05)

# Compare with main results
sens_df <- as.data.frame(res_sensitivity) %>%
  tibble::rownames_to_column("gene") %>%
  filter(!is.na(padj))

# Correlation of log2FC with and without outlier
shared <- inner_join(
  res_all_df  %>% select(gene, lfc_main = log2FoldChange),
  sens_df     %>% select(gene, lfc_sens = log2FoldChange),
  by = "gene"
)

cor_sens <- round(cor(shared$lfc_main, shared$lfc_sens), 3)
cat("Correlation of log2FC with vs without 1838171:", cor_sens, "\n")
## Correlation of log2FC with vs without 1838171: 0.954

Sensitivity Analysis — Outlier Sample

Mouse 1838171 (AD2.2_EM4, BM) showed notable separation from its paired CNS sample on PC2 of the pseudobulk PCA. To confirm that the primary CNS vs BM DE results were not driven by this sample, DESeq2 was re-run excluding 1838171 from both tissue compartments. The log2 fold changes from the full analysis (n=6 mice) and the sensitivity analysis (n=5 mice) showed a Pearson correlation of r = 0.954, confirming that the CNS vs BM transcriptional differences identified are robust to the exclusion of this sample. All subsequent analyses use the full six-mouse dataset.


CNS Signature Summary

cns_sig <- res_all_df %>%
  filter(padj < 0.05, log2FoldChange > 0) %>%
  arrange(padj)

bm_sig <- res_all_df %>%
  filter(padj < 0.05, log2FoldChange < 0) %>%
  arrange(log2FoldChange)

cat("Total DE genes (padj < 0.05):", 
    sum(res_all_df$padj < 0.05, na.rm = TRUE), "\n")
## Total DE genes (padj < 0.05): 346
cat("CNS-enriched:", nrow(cns_sig), "\n")
## CNS-enriched: 182
cat("BM-enriched :", nrow(bm_sig),  "\n")
## BM-enriched : 164
# Save full DE results
write.csv(res_all_df, 
           "02_DE_CNSvsBM_all_mice_source_corrected.csv",
           row.names = FALSE)

# Save CNS signature separately for downstream use
write.csv(cns_sig,
           "02_CNS_leukaemia_signature.csv",
           row.names = FALSE)

cat("Results saved:\n")
## Results saved:
cat("02_DE_CNSvsBM_all_mice_source_corrected.csv\n")
## 02_DE_CNSvsBM_all_mice_source_corrected.csv
cat("02_CNS_leukaemia_signature.csv\n")
## 02_CNS_leukaemia_signature.csv

Session Information

sessionInfo()
## R version 4.4.1 (2024-06-14)
## Platform: x86_64-pc-linux-gnu
## Running under: Rocky Linux 9.5 (Blue Onyx)
## 
## Matrix products: default
## BLAS/LAPACK: /opt/intel/oneapi/mkl/2024.0/lib/libmkl_gf_lp64.so.2;  LAPACK version 3.10.1
## 
## locale:
##  [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_GB.UTF-8        LC_COLLATE=en_GB.UTF-8    
##  [5] LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
##  [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: Europe/London
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] tibble_3.3.1                ggrepel_0.9.6              
##  [3] tidyr_1.3.1                 clustree_0.5.1             
##  [5] ggraph_2.2.2                viridis_0.6.5              
##  [7] viridisLite_0.4.2           qs2_0.1.7                  
##  [9] knitr_1.51                  DESeq2_1.46.0              
## [11] SummarizedExperiment_1.36.0 Biobase_2.66.0             
## [13] MatrixGenerics_1.18.1       matrixStats_1.5.0          
## [15] GenomicRanges_1.58.0        GenomeInfoDb_1.42.3        
## [17] IRanges_2.40.1              S4Vectors_0.44.0           
## [19] BiocGenerics_0.52.0         dplyr_1.1.4                
## [21] patchwork_1.3.2             ggplot2_4.0.2              
## [23] Seurat_5.4.0                SeuratObject_5.3.0         
## [25] sp_2.1-4                   
## 
## loaded via a namespace (and not attached):
##   [1] RColorBrewer_1.1-3      rstudioapi_0.16.0       jsonlite_2.0.0         
##   [4] magrittr_2.0.3          spatstat.utils_3.2-1    farver_2.1.2           
##   [7] rmarkdown_2.30          zlibbioc_1.52.0         vctrs_0.6.5            
##  [10] ROCR_1.0-12             memoise_2.0.1           spatstat.explore_3.7-0 
##  [13] S4Arrays_1.6.0          htmltools_0.5.8.1       SparseArray_1.6.2      
##  [16] sass_0.4.9              sctransform_0.4.3       parallelly_1.37.1      
##  [19] KernSmooth_2.23-24      bslib_0.7.0             htmlwidgets_1.6.4      
##  [22] ica_1.0-3               plyr_1.8.9              plotly_4.12.0          
##  [25] zoo_1.8-15              cachem_1.1.0            igraph_2.2.2           
##  [28] mime_0.12               lifecycle_1.0.5         pkgconfig_2.0.3        
##  [31] Matrix_1.7-0            R6_2.6.1                fastmap_1.2.0          
##  [34] GenomeInfoDbData_1.2.13 fitdistrplus_1.2-6      future_1.33.2          
##  [37] shiny_1.12.1            digest_0.6.35           tensor_1.5.1           
##  [40] RSpectra_0.16-2         irlba_2.3.7             labeling_0.4.3         
##  [43] progressr_0.18.0        fansi_1.0.6             spatstat.sparse_3.1-0  
##  [46] httr_1.4.7              polyclip_1.10-7         abind_1.4-5            
##  [49] compiler_4.4.1          withr_3.0.2             S7_0.2.1               
##  [52] BiocParallel_1.40.2     fastDummies_1.7.5       ggforce_0.5.0          
##  [55] MASS_7.3-60.2           DelayedArray_0.32.0     tools_4.4.1            
##  [58] lmtest_0.9-40           otel_0.2.0              httpuv_1.6.16          
##  [61] future.apply_1.11.2     goftest_1.2-3           glue_1.8.0             
##  [64] nlme_3.1-164            promises_1.5.0          grid_4.4.1             
##  [67] Rtsne_0.17              cluster_2.1.6           reshape2_1.4.4         
##  [70] generics_0.1.3          gtable_0.3.6            spatstat.data_3.1-9    
##  [73] data.table_1.15.4       tidygraph_1.3.1         stringfish_0.18.0      
##  [76] utf8_1.2.4              XVector_0.46.0          spatstat.geom_3.7-0    
##  [79] RcppAnnoy_0.0.23        RANN_2.6.2              pillar_1.9.0           
##  [82] stringr_1.5.1           spam_2.11-3             RcppHNSW_0.6.0         
##  [85] later_1.4.6             splines_4.4.1           tweenr_2.0.3           
##  [88] lattice_0.22-6          survival_3.6-4          deldir_2.0-4           
##  [91] tidyselect_1.2.1        locfit_1.5-9.12         miniUI_0.1.2           
##  [94] pbapply_1.7-4           gridExtra_2.3           scattermore_1.2        
##  [97] xfun_0.56               graphlayouts_1.2.2      stringi_1.8.4          
## [100] UCSC.utils_1.2.0        lazyeval_0.2.2          yaml_2.3.12            
## [103] evaluate_1.0.5          codetools_0.2-20        cli_3.6.5              
## [106] RcppParallel_5.1.8      uwot_0.2.4              xtable_1.8-4           
## [109] reticulate_1.45.0       jquerylib_0.1.4         dichromat_2.0-0.1      
## [112] Rcpp_1.0.12             globals_0.16.3          spatstat.random_3.4-4  
## [115] png_0.1-8               spatstat.univar_3.1-6   parallel_4.4.1         
## [118] dotCall64_1.2           listenv_0.9.1           scales_1.4.0           
## [121] ggridges_0.5.6          crayon_1.5.2            purrr_1.0.2            
## [124] rlang_1.1.7             cowplot_1.2.0