Primary Analysis — Pseudobulk DE: CNS vs BM (All Mice)
Pseudobulk differential expression analysis is performed by
aggregating counts per sample per tissue, then applying DESeq2 with
transplant_source as a covariate. This design identifies
genes differentially expressed between CNS and BM leukaemia that are
consistent across all three transplant sources.
pseudo_qs_path <- "/exports/eddie/scratch/aduguid3/harmony_clustering/pseudo_all.qs"
if (!file.exists(pseudo_qs_path)) {
pseudo_all <- AggregateExpression(
leuk_all,
group.by = c("Mouse_ID", "Tissue", "transplant_source"),
return.seurat = TRUE
)
qs_save(pseudo_all, pseudo_qs_path)
} else {
pseudo_all <- qs_read(pseudo_qs_path)
}
## Names of identity class contain underscores ('_'), replacing with dashes ('-')
## First group.by variable `Mouse_ID` starts with a number, appending `g` to ensure valid variable names
## Centering and scaling data matrix
##
## This message is displayed once every 8 hours.
counts_all <- GetAssayData(pseudo_all, layer = "counts")
meta_all <- pseudo_all@meta.data
meta_all$Tissue <- factor(meta_all$Tissue, levels = c("BM", "CNS"))
meta_all$transplant_source <- factor(meta_all$transplant_source)
meta_all$Mouse_ID <- factor(meta_all$Mouse_ID)
cat("Pseudobulk matrix dimensions:",
nrow(counts_all), "genes x", ncol(counts_all), "samples\n")
## Pseudobulk matrix dimensions: 26694 genes x 12 samples
cat("\nSamples per group:\n")
##
## Samples per group:
print(table(meta_all$Tissue, meta_all$transplant_source))
##
## AD2.2-EM4 AD4-EM3 AD5-EM4
## BM 2 2 2
## CNS 2 2 2
DESeq2 — Transplant Source as Covariate
dds_all <- DESeqDataSetFromMatrix(
countData = counts_all,
colData = meta_all,
design = ~ transplant_source + Tissue
)
## converting counts to integer mode
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
# Filter lowly expressed genes — detected in at least 3 samples
keep_all <- rowSums(counts(dds_all) >= 10) >= 3
dds_all <- dds_all[keep_all, ]
cat("Genes retained after filtering:", nrow(dds_all), "\n")
## Genes retained after filtering: 16228
dds_all <- DESeq(dds_all)
## estimating size factors
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## final dispersion estimates
## fitting model and testing
# Extract CNS vs BM results
res_all <- results(dds_all,
contrast = c("Tissue", "CNS", "BM"),
alpha = 0.05)
cat("\n--- CNS vs BM DE Results (all mice, source-corrected) ---\n")
##
## --- CNS vs BM DE Results (all mice, source-corrected) ---
summary(res_all)
##
## out of 16228 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 182, 1.1%
## LFC < 0 (down) : 164, 1%
## outliers [1] : 0, 0%
## low counts [2] : 0, 0%
## (mean count < 4)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
# Convert to dataframe
res_all_df <- as.data.frame(res_all) %>%
tibble::rownames_to_column("gene") %>%
filter(!is.na(padj)) %>%
arrange(padj)
vsd <- vst(dds_all, blind = FALSE)
Top CNS-Enriched Genes
res_all_df %>%
filter(padj < 0.05, log2FoldChange > 0) %>%
head(30) %>%
select(gene, log2FoldChange, lfcSE, pvalue, padj) %>%
mutate(across(where(is.numeric), ~round(., 4))) %>%
kable(caption = "Top 30 CNS-enriched genes (padj < 0.05)")
| gene | log2FoldChange | lfcSE | pvalue | padj |
|---|---|---|---|---|
| Sparcl1 | 8.2725 | 0.9369 | 0 | 0 |
| Slc1a2 | 8.3980 | 0.9819 | 0 | 0 |
| Soat1 | 0.6893 | 0.0878 | 0 | 0 |
| Plaat3 | 0.5193 | 0.0691 | 0 | 0 |
| Gramd3 | 0.7510 | 0.1003 | 0 | 0 |
| Dnm3os | 0.6837 | 0.0940 | 0 | 0 |
| Ndrg2 | 4.0850 | 0.5661 | 0 | 0 |
| Skil | 0.5322 | 0.0771 | 0 | 0 |
| Rnf125 | 0.2226 | 0.0334 | 0 | 0 |
| Tst | 0.4295 | 0.0660 | 0 | 0 |
| Apod | 6.3194 | 0.9709 | 0 | 0 |
| Ubb | 0.1730 | 0.0273 | 0 | 0 |
| Mt3 | 2.8308 | 0.4657 | 0 | 0 |
| Snn | 0.5055 | 0.0834 | 0 | 0 |
| Cpe | 7.1025 | 1.1862 | 0 | 0 |
| Abhd8 | 0.3362 | 0.0564 | 0 | 0 |
| 2010110G14Rik | 0.4054 | 0.0692 | 0 | 0 |
| Gm12958 | 0.4700 | 0.0803 | 0 | 0 |
| Lef1os1 | 0.3453 | 0.0599 | 0 | 0 |
| Tspan7 | 2.8200 | 0.4950 | 0 | 0 |
| Tiam1 | 0.3494 | 0.0619 | 0 | 0 |
| Mobp | 5.7997 | 1.0519 | 0 | 0 |
| S100b | 5.2989 | 0.9640 | 0 | 0 |
| Stc1 | 0.8773 | 0.1611 | 0 | 0 |
| Atp1a2 | 4.3096 | 0.7947 | 0 | 0 |
| Cd96 | 0.5839 | 0.1078 | 0 | 0 |
| Septin5 | 0.6515 | 0.1205 | 0 | 0 |
| Fgd1 | 0.8003 | 0.1489 | 0 | 0 |
| Lef1 | 0.3481 | 0.0664 | 0 | 0 |
| Cdr1os | 7.8196 | 1.4906 | 0 | 0 |
Top BM-Enriched Genes
res_all_df %>%
filter(padj < 0.05, log2FoldChange < 0) %>%
arrange(log2FoldChange) %>%
head(30) %>%
select(gene, log2FoldChange, lfcSE, pvalue, padj) %>%
mutate(across(where(is.numeric), ~round(., 4))) %>%
kable(caption = "Top 30 BM-enriched genes (padj < 0.05)")
| gene | log2FoldChange | lfcSE | pvalue | padj |
|---|---|---|---|---|
| Ighv1-55 | -6.6518 | 1.2694 | 0e+00 | 0.0000 |
| Igkv8-27 | -6.3693 | 1.1094 | 0e+00 | 0.0000 |
| Igkv10-96 | -4.9947 | 1.3257 | 2e-04 | 0.0139 |
| Igkv6-23 | -4.4578 | 1.3423 | 9e-04 | 0.0444 |
| Iglc1 | -4.3471 | 0.9336 | 0e+00 | 0.0006 |
| Iglv1 | -4.2328 | 1.0433 | 0e+00 | 0.0055 |
| Gm2682 | -4.1528 | 1.1632 | 4e-04 | 0.0232 |
| Gm57200 | -3.6115 | 1.0042 | 3e-04 | 0.0218 |
| Lpl | -3.5416 | 0.6125 | 0e+00 | 0.0000 |
| Cd5l | -3.4865 | 0.6491 | 0e+00 | 0.0000 |
| Clec7a | -3.3970 | 0.5272 | 0e+00 | 0.0000 |
| Jchain | -3.1490 | 0.6450 | 0e+00 | 0.0002 |
| Igha | -3.0752 | 0.6227 | 0e+00 | 0.0002 |
| Spic | -2.9891 | 0.6304 | 0e+00 | 0.0004 |
| Gm30211 | -2.8528 | 0.5141 | 0e+00 | 0.0000 |
| Igkc | -2.8307 | 0.5685 | 0e+00 | 0.0002 |
| Tgm2 | -2.7946 | 0.6324 | 0e+00 | 0.0015 |
| Atp6v0d2 | -2.7865 | 0.4278 | 0e+00 | 0.0000 |
| Epb41l3 | -2.6724 | 0.4021 | 0e+00 | 0.0000 |
| Fcna | -2.6587 | 0.4735 | 0e+00 | 0.0000 |
| Mgst1 | -2.6402 | 0.5852 | 0e+00 | 0.0011 |
| Pparg | -2.6087 | 0.7735 | 7e-04 | 0.0395 |
| Il6ra | -2.5915 | 0.7730 | 8e-04 | 0.0415 |
| Vcam1 | -2.5860 | 0.3969 | 0e+00 | 0.0000 |
| Slc40a1 | -2.4988 | 0.4524 | 0e+00 | 0.0000 |
| Hebp1 | -2.4970 | 0.5896 | 0e+00 | 0.0030 |
| Iglc2 | -2.4824 | 0.6247 | 1e-04 | 0.0074 |
| Tnfaip2 | -2.4580 | 0.5641 | 0e+00 | 0.0019 |
| Ccl6 | -2.3027 | 0.4677 | 0e+00 | 0.0002 |
| Rhob | -2.3022 | 0.6038 | 1e-04 | 0.0121 |
Volcano Plot
res_all_df %>%
mutate(
direction = case_when(
log2FoldChange > 0.5 & padj < 0.05 ~ "CNS-enriched",
log2FoldChange < -0.5 & padj < 0.05 ~ "BM-enriched",
TRUE ~ "NS"
),
label = ifelse(abs(log2FoldChange) > 1 & padj < 0.01, gene, NA)
) %>%
ggplot(aes(x = log2FoldChange,
y = -log10(padj),
colour = direction)) +
geom_point(size = 0.6, alpha = 0.6) +
ggrepel::geom_text_repel(aes(label = label),
size = 2.5, max.overlaps = 20) +
scale_colour_manual(
values = c("CNS-enriched" = "firebrick",
"BM-enriched" = "steelblue",
"NS" = "grey70")) +
geom_vline(xintercept = c(-0.5, 0.5),
linetype = "dashed", colour = "grey40") +
geom_hline(yintercept = -log10(0.05),
linetype = "dashed", colour = "grey40") +
theme_classic(base_size = 12) +
labs(x = "log2FC (CNS vs BM)",
y = "-log10 adjusted p-value",
title = "CNS vs BM — all mice, transplant source corrected",
colour = NULL) +
theme(legend.position = "bottom")
## Warning: Removed 16171 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
CNS vs BM differential expression — all mice, source-corrected
MA Plot
plotMA(res_all,
alpha = 0.05,
main = "MA plot — CNS vs BM (source-corrected)",
xlab = "Mean expression",
ylab = "log2FC (CNS vs BM)")
MA plot — CNS vs BM
PCA of Pseudobulk Samples
Confirms that samples separate by tissue after accounting for source effects.
p_pca1 <- plotPCA(vsd, intgroup = "Tissue") +
scale_colour_manual(values = c("BM" = "steelblue", "CNS" = "firebrick")) +
theme_classic(base_size = 12) +
ggtitle("PCA — by tissue")
## using ntop=500 top features by variance
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## ℹ The deprecated feature was likely used in the DESeq2 package.
## Please report the issue to the authors.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Extract PCA data manually for more control
pca_data <- plotPCA(vsd, intgroup = c("Tissue", "transplant_source"),
returnData = TRUE)
## using ntop=500 top features by variance
# Check the data and identify the outlier
print(pca_data)
## PC1 PC2 group Tissue
## g1838161_BM_AD4-EM3 -7.711282 -6.9929883 BM:AD4-EM3 BM
## g1838161_CNS_AD4-EM3 -7.328604 -6.5420863 CNS:AD4-EM3 CNS
## g1838162_BM_AD4-EM3 -8.818700 -6.8091830 BM:AD4-EM3 BM
## g1838162_CNS_AD4-EM3 -9.216987 -7.2053986 CNS:AD4-EM3 CNS
## g1838163_BM_AD2.2-EM4 -11.938870 4.5645561 BM:AD2.2-EM4 BM
## g1838163_CNS_AD2.2-EM4 -8.929551 4.4899354 CNS:AD2.2-EM4 CNS
## g1838171_BM_AD2.2-EM4 0.711890 14.2837789 BM:AD2.2-EM4 BM
## g1838171_CNS_AD2.2-EM4 -11.490799 7.4016225 CNS:AD2.2-EM4 CNS
## g1838279_BM_AD5-EM4 17.344695 -1.3927350 BM:AD5-EM4 BM
## g1838279_CNS_AD5-EM4 17.868032 -2.4647185 CNS:AD5-EM4 CNS
## g1838280_BM_AD5-EM4 14.214978 0.8031972 BM:AD5-EM4 BM
## g1838280_CNS_AD5-EM4 15.295197 -0.1359804 CNS:AD5-EM4 CNS
## transplant_source name
## g1838161_BM_AD4-EM3 AD4-EM3 g1838161_BM_AD4-EM3
## g1838161_CNS_AD4-EM3 AD4-EM3 g1838161_CNS_AD4-EM3
## g1838162_BM_AD4-EM3 AD4-EM3 g1838162_BM_AD4-EM3
## g1838162_CNS_AD4-EM3 AD4-EM3 g1838162_CNS_AD4-EM3
## g1838163_BM_AD2.2-EM4 AD2.2-EM4 g1838163_BM_AD2.2-EM4
## g1838163_CNS_AD2.2-EM4 AD2.2-EM4 g1838163_CNS_AD2.2-EM4
## g1838171_BM_AD2.2-EM4 AD2.2-EM4 g1838171_BM_AD2.2-EM4
## g1838171_CNS_AD2.2-EM4 AD2.2-EM4 g1838171_CNS_AD2.2-EM4
## g1838279_BM_AD5-EM4 AD5-EM4 g1838279_BM_AD5-EM4
## g1838279_CNS_AD5-EM4 AD5-EM4 g1838279_CNS_AD5-EM4
## g1838280_BM_AD5-EM4 AD5-EM4 g1838280_BM_AD5-EM4
## g1838280_CNS_AD5-EM4 AD5-EM4 g1838280_CNS_AD5-EM4
# Percentage variance
pct_var <- round(100 * attr(pca_data, "percentVar"), 1)
# Plot with correct colour mapping
p_pca2 <- ggplot(pca_data,
aes(x = PC1,
y = PC2,
colour = transplant_source,
shape = Tissue,
label = name)) +
geom_point(size = 4) +
ggrepel::geom_text_repel(size = 3) +
scale_colour_manual(values = c("AD4_EM3" = "#E69F00",
"AD2.2_EM4" = "#56B4E9",
"AD5_EM4" = "#CC79A7")) +
scale_shape_manual(values = c("BM" = 16, "CNS" = 17)) +
theme_classic(base_size = 12) +
labs(x = paste0("PC1: ", pct_var[1], "% variance"),
y = paste0("PC2: ", pct_var[2], "% variance"),
title = "Pseudobulk PCA — tissue and transplant source",
colour = "Transplant source",
shape = "Tissue")
p_pca2
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's colour values.
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's colour values.
## No shared levels found between `names(values)` of the manual scale and the
## data's colour values.
PCA of pseudobulk samples
p_pca1 + p_pca2
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's colour values.
## No shared levels found between `names(values)` of the manual scale and the
## data's colour values.
## No shared levels found between `names(values)` of the manual scale and the
## data's colour values.
PCA of pseudobulk samples
Pseudobulk PCA
Principal component analysis of pseudobulk samples (aggregated per mouse per tissue) revealed that PC1 (58% variance) separates samples by transplant source, confirming that inter-source variability is the dominant transcriptional signal in this dataset. PC2 (17% variance) separates BM from CNS within each transplant source group, demonstrating that a consistent tissue-specific transcriptional difference is detectable across all three sources. This separation justifies the DESeq2 design including transplant_source as a covariate, which removes the PC1 source effect and allows detection of the CNS vs BM signal captured in PC2. One BM sample (1838171, AD2.2_EM4) showed notable separation from its paired CNS sample on PC2 and is flagged for sensitivity analysis.
Sensitivity check
# Re-run DESeq2 excluding 1838171 to confirm results are not driven by outlier
samples_no_outlier <- colnames(counts_all)[
!grepl("1838171", colnames(counts_all))
]
dds_sensitivity <- DESeqDataSetFromMatrix(
countData = counts_all[, samples_no_outlier],
colData = meta_all[samples_no_outlier, ],
design = ~ transplant_source + Tissue
)
## converting counts to integer mode
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
keep_s <- rowSums(counts(dds_sensitivity) >= 10) >= 3
dds_sensitivity <- dds_sensitivity[keep_s, ]
dds_sensitivity <- DESeq(dds_sensitivity)
## estimating size factors
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## final dispersion estimates
## fitting model and testing
res_sensitivity <- results(dds_sensitivity,
contrast = c("Tissue", "CNS", "BM"),
alpha = 0.05)
# Compare with main results
sens_df <- as.data.frame(res_sensitivity) %>%
tibble::rownames_to_column("gene") %>%
filter(!is.na(padj))
# Correlation of log2FC with and without outlier
shared <- inner_join(
res_all_df %>% select(gene, lfc_main = log2FoldChange),
sens_df %>% select(gene, lfc_sens = log2FoldChange),
by = "gene"
)
cor_sens <- round(cor(shared$lfc_main, shared$lfc_sens), 3)
cat("Correlation of log2FC with vs without 1838171:", cor_sens, "\n")
## Correlation of log2FC with vs without 1838171: 0.954
Sensitivity Analysis — Outlier Sample
Mouse 1838171 (AD2.2_EM4, BM) showed notable separation from its paired CNS sample on PC2 of the pseudobulk PCA. To confirm that the primary CNS vs BM DE results were not driven by this sample, DESeq2 was re-run excluding 1838171 from both tissue compartments. The log2 fold changes from the full analysis (n=6 mice) and the sensitivity analysis (n=5 mice) showed a Pearson correlation of r = 0.954, confirming that the CNS vs BM transcriptional differences identified are robust to the exclusion of this sample. All subsequent analyses use the full six-mouse dataset.