suppressPackageStartupMessages({
library(Seurat)
library(monocle3)
library(SeuratWrappers)
library(dplyr)
library(tidyr)
library(tibble)
library(ggplot2)
library(patchwork)
library(RColorBrewer)
library(viridis)
library(ggridges)
library(ggrepel)
library(Matrix)
library(scales)
library(pheatmap)
library(cowplot)
library(igraph)
})
set.seed(1234)
options(future.globals.maxSize = 8e9)
# ── Consistent colour palettes used throughout ──────────────────────────────
azimuth_l2_colors <- c(
"CD4 Naive" = "#2166AC",
"CD4 TCM" = "#74ADD1",
"CD4 TEM" = "#FEE090",
"CD4 Temra/CTL" = "#D73027",
"Treg" = "#762A83"
)
bin_colors <- c(
"Naive-like" = "#2166AC",
"TCM-like" = "#74ADD1",
"Treg-like" = "#762A83",
"TEM-like" = "#FEE090",
"Temra-like" = "#D73027"
)
line_colors <- setNames(
colorRampPalette(brewer.pal(8, "Dark2"))(7),
paste0("L", 1:7)
)
# ── Output directories ───────────────────────────────────────────────────────
out_dir <- "results/Mapping_Pipeline_v3"
fig_dir_qc <- file.path(out_dir, "QC_Figures")
fig_dir_ms <- file.path(out_dir, "Manuscript_Figures")
fig_dir_pdf <- file.path(out_dir, "Manuscript_Figures/PDF")
for (d in c(out_dir, fig_dir_qc, fig_dir_ms, fig_dir_pdf))
dir.create(d, recursive = TRUE, showWarnings = FALSE)
# ── Save helper ──────────────────────────────────────────────────────────────
save_fig <- function(p, name, subdir = fig_dir_qc, w = 14, h = 9) {
png_path <- file.path(subdir, paste0(name, ".png"))
pdf_path <- file.path(fig_dir_pdf, paste0(name, ".pdf"))
ggsave(png_path, plot = p, width = w, height = h, dpi = 300, bg = "white")
if (subdir == fig_dir_ms)
ggsave(pdf_path, plot = p, width = w, height = h, device = cairo_pdf)
invisible(p)
}
cat("=== Environment ready ===\n")=== Environment ready ===
Seurat : 5.4.0
Monocle3: 1.4.26
Output : results/Mapping_Pipeline_v3
# ════════════════════════════════════════════════════════════════════════════
# Load the Slingshot-ready healthy reference object.
# This object was confirmed to have:
# ✅ UMAP model (intact — will NOT be replaced)
# ✅ predicted.celltype.l2 (Azimuth l2)
# ✅ cell_type, seurat_clusters, percent.mt, S.Score, G2M.Score
# ✅ SCT assay (HVGs=2902, per-sample models)
# ✅ integrated PCA 50 dims (intact — will NOT be replaced)
# ✅ No proliferating cells
# ════════════════════════════════════════════════════════════════════════════
reference_integrated <- readRDS(
"../../1-Final_Custom_MST_Monocle3_Trajectory_and_mapping/CD4_reference_clean_Azimuth_ready_for_Slingshot.rds"
)
cat("=== Reference object loaded ===\n")=== Reference object loaded ===
Cells : 11466
Assays : RNA, ADT, prediction.score.celltype.l1, prediction.score.celltype.l2, prediction.score.celltype.l3, SCT, integrated
Reductions: pca, umap, integrated_dr, ref.umap
# ── Hard stops: essential metadata must be present ───────────────────────────
stopifnot(
"predicted.celltype.l2 missing" =
"predicted.celltype.l2" %in% colnames(reference_integrated@meta.data),
"cell_type missing" =
"cell_type" %in% colnames(reference_integrated@meta.data),
"umap reduction missing" =
"umap" %in% names(reference_integrated@reductions),
"pca reduction missing" =
"pca" %in% names(reference_integrated@reductions)
)
# Proliferating cell check
prolif_check <- any(grepl("Prolif|prolif|cycling",
reference_integrated$predicted.celltype.l2,
ignore.case = TRUE))
if (prolif_check) stop("Proliferating cells detected — remove before proceeding.")
cat("\n✅ No proliferating cells\n")
✅ No proliferating cells
# ── Extend palette for any extra labels ─────────────────────────────────────
ref_l2_labels <- unique(as.character(reference_integrated$predicted.celltype.l2))
extra <- setdiff(ref_l2_labels, names(azimuth_l2_colors))
if (length(extra) > 0) {
extra_colors <- setNames(
colorRampPalette(brewer.pal(8, "Set2"))(length(extra)), extra)
azimuth_l2_colors <- c(azimuth_l2_colors, extra_colors)
}
# ── QC Figure 1: Incoming reference UMAP (Azimuth l2 + cell_type) ───────────
p_in_l2 <- DimPlot(
reference_integrated,
group.by = "predicted.celltype.l2",
reduction = "umap",
label = TRUE, repel = TRUE, label.size = 3.5
) +
scale_color_manual(values = azimuth_l2_colors, na.value = "grey70") +
ggtitle("Incoming reference — Azimuth l2") +
theme_classic() + NoLegend()
p_in_ct <- DimPlot(
reference_integrated,
group.by = "cell_type",
reduction = "umap",
label = TRUE, repel = TRUE, label.size = 3
) +
ggtitle("Incoming reference — cell_type (original labels)") +
theme_classic() + NoLegend()
qc1 <- p_in_l2 | p_in_ct
qc1
Azimuth l2 distribution:
CD4 Naive CD4 TCM CD4 TEM CD4 Temra/CTL Treg
2037 9067 145 10 207
cell_type distribution:
CD4 Tnaive (CCR7+SELL+TCF7+) CD4 TCM (CD161+/IL7R+) CD4 TCM (CCR4+/Th2-like) CD4 CTL/Temra (GZMK+GZMA+CCL5+)
5479 3994 522 490
CD4 TEM (NF-kB activated) CD4 Treg (FOXP3+Helios+CD25+) CD4 Tnaive-RTE (IGF1R+)
412 336 233
Design decision: The reference object was already integrated across 3 donors using Seurat RPCA (
integratedassay, 50-dim PCA, UMAP with frozen model). We do not re-run SCTransform or rebuild the UMAP. Doing so destroys the biology — cells scatter into 18+ clusters because SCT on a pre-integrated object re-introduces batch variation that CCA already removed.What we use instead: -
reference.reduction = "pca"— the existing integrated PCA (50 dims) -reduction.model = "umap"— the existing frozen UMAP model - Malignant cells are processed withnpcs = 50to match this dimensionalityThe SCT assay on the reference (HVGs = 2902) is used only for feature intersection with the query — not for rebuilding the PCA.
# Keep integrated assay active (PCA was built on this)
DefaultAssay(reference_integrated) <- "integrated"
cat("=== Reference integration summary ===\n")=== Reference integration summary ===
Active assay : integrated
PCA assay used : integrated
PCA dims : 50
SCT HVGs : 0
cat("SCT models :", length(reference_integrated@assays$SCT@SCTModel.list),
"(per-sample = correct)\n")SCT models : 3 (per-sample = correct)
# HARD STOP: UMAP model must be intact — never re-run RunUMAP
stopifnot(
"UMAP model missing — MapQuery will fail" =
!is.null(reference_integrated@reductions$umap@misc$model)
)
cat("UMAP model : intact\n")UMAP model : intact
Donors : 3
CD4T_10x_S1 CD4T_10x_S2 CD4T_lab
3379 3221 4866
# Define junk gene pattern (used later in §5 for query HVG filtering)
junk_pattern <- paste0(
"^MT-|^RPL|^RPS|",
"^HSP|^HSPA|^HSPB|^HSPD|^HSPE|^HSPH|",
"^SNHG|MALAT1|NEAT1|XIST|^HIST"
)
# Recompute clusters on existing integrated PCA — for Monocle3 CDS slot ONLY
# Does NOT touch the UMAP or PCA
reference_integrated <- FindNeighbors(
reference_integrated,
reduction = "pca",
dims = 1:50,
graph.name = "integrated_snn",
verbose = FALSE
)
reference_integrated <- FindClusters(
reference_integrated,
resolution = 0.3,
graph.name = "integrated_snn",
verbose = FALSE
)
cat("\nClusters (res=0.3):", nlevels(reference_integrated$seurat_clusters), "\n")
Clusters (res=0.3): 7
# QC Figure 2: confirm biology is preserved on the intact UMAP
p_umap_l2 <- DimPlot(
reference_integrated,
group.by = "predicted.celltype.l2",
reduction = "umap",
label = TRUE, repel = TRUE, label.size = 4
) +
scale_color_manual(values = azimuth_l2_colors, na.value = "grey70") +
ggtitle("Reference UMAP (integrated) — Azimuth l2") +
theme_classic() + NoLegend()
p_umap_cl <- DimPlot(
reference_integrated,
group.by = "seurat_clusters",
reduction = "umap",
label = TRUE, label.size = 4
) +
ggtitle("Reference UMAP — Clusters (res=0.3) for Monocle3 CDS") +
theme_classic() + NoLegend()
p_umap_ct <- DimPlot(
reference_integrated,
group.by = "cell_type",
reduction = "umap",
label = TRUE, repel = TRUE, label.size = 3
) +
ggtitle("Reference UMAP — cell_type") +
theme_classic() + NoLegend()
qc2 <- (p_umap_l2 | p_umap_cl) / p_umap_ct
qc2save_fig(qc2, "QC2_reference_UMAP_verified", w = 18, h = 14)
# QC: Key marker feature plots on intact UMAP
DefaultAssay(reference_integrated) <- "SCT"
marker_genes <- c("CCR7", "SELL", "TCF7", "IL7R",
"GZMK", "GZMA", "GZMB", "PRF1",
"FOXP3", "IL2RA", "IKZF2",
"GNLY", "NKG7")
available_markers <- intersect(marker_genes, rownames(reference_integrated))
p_markers <- FeaturePlot(
reference_integrated,
features = available_markers,
reduction = "umap",
ncol = 5,
cols = c("lightgrey", "#D73027"),
order = TRUE
) &
theme_classic() &
theme(plot.title = element_text(size = 9, face = "bold"))
p_markerssave_fig(p_markers, "QC3_reference_marker_features", w = 20, h = 10)
DefaultAssay(reference_integrated) <- "integrated"
cat("\nReference verified — UMAP and PCA intact, biology preserved\n")
Reference verified — UMAP and PCA intact, biology preserved
DefaultAssay(reference_integrated) <- "RNA"
# Order matches trajectory: Naive → TCM → Treg branch / TEM → Temra
azimuth_l2_order <- c(
"CD4 Naive",
"CD4 TCM",
"Treg",
"CD4 TEM",
"CD4 Temra/CTL"
)
# Only keep l2 labels present in the object
azimuth_l2_order <- intersect(
azimuth_l2_order,
unique(reference_integrated$predicted.celltype.l2)
)
reference_integrated@meta.data$l2_factor <- factor(
reference_integrated$predicted.celltype.l2,
levels = azimuth_l2_order
)
Idents(reference_integrated) <- "l2_factor"
panel_genes <- c(
# Naive
"CCR7","LEF1","TCF7","SELL","KLF2","SATB1","IL7R","CD27","MAL",
# TCM
"S100A4","AQP3","LTB","ITGB1","CD44","CCR4",
# Shared activation
"CD69","LMNA",
# TEM
"GZMK","CCL5","EOMES","CXCR3","HOPX","CXCR4","IFNG","TNF","CCR5",
# Temra/CTL
"GZMB","GZMA","PRF1","NKG7","CX3CR1","FGFBP2","GNLY","TBX21","ZEB2",
"FCGR3A","KLRG1","NR4A2",
# Treg
"FOXP3","IL2RA","IKZF2","IKZF4","TIGIT","RTKN2","TNFRSF18","CTLA4",
# Co-inhibitory — Treg suppressive machinery
"PDCD1","HAVCR2","LAG3"
)
panel_genes <- unique(intersect(panel_genes, rownames(reference_integrated)))
cat("Genes found:", length(panel_genes), "/", length(unique(c(
"CCR7","LEF1","TCF7","SELL","KLF2","SATB1","IL7R","CD27","MAL",
"S100A4","AQP3","LTB","ITGB1","CD44","CCR4","CD69","LMNA",
"GZMK","CCL5","EOMES","CXCR3","HOPX","CXCR4","IFNG","TNF","CCR5",
"GZMB","GZMA","PRF1","NKG7","CX3CR1","FGFBP2","GNLY","TBX21","ZEB2",
"FCGR3A","KLRG1","NR4A2",
"FOXP3","IL2RA","IKZF2","IKZF4","TIGIT","RTKN2","TNFRSF18","CTLA4",
"PDCD1","HAVCR2","LAG3"
))), "\n")Genes found: 49 / 49
p_dotplot_l2 <- DotPlot(
reference_integrated,
features = panel_genes,
group.by = "l2_factor",
cols = c("lightgrey", "#d62728"),
dot.scale = 5,
scale = TRUE,
col.min = -1.5,
col.max = 2.5
) +
RotatedAxis() +
coord_flip() +
scale_color_gradient2(
low = "lightgrey",
mid = "#fee090",
high = "#d62728",
midpoint = 0.5,
name = "Avg Expression"
) +
theme(
axis.text.x = element_text(size = 9, face = "bold"),
axis.text.y = element_text(size = 7.5),
plot.title = element_text(size = 13, face = "bold"),
plot.subtitle = element_text(size = 9, colour = "grey40"),
legend.position = "bottom"
) +
labs(
title = "Marker gene expression — Azimuth l2 cell states",
subtitle = ""
)
print(p_dotplot_l2)save_fig(p_dotplot_l2, "QC_DotPlot_AzimuthL2_markers", w = 10, h = 12)
# Restore assay and idents
DefaultAssay(reference_integrated) <- "integrated"
Idents(reference_integrated) <- "predicted.celltype.l2"Critical:
learn_graphandorder_cellsrun once. The resultingmonocle3_pseudotimecolumn is never overwritten. This is the exact value MapQuery transfers to Sézary cells.
# Metadata
colData(cds)$predicted.celltype.l2 <- reference_integrated$predicted.celltype.l2
if ("cell_type" %in% colnames(reference_integrated@meta.data))
colData(cds)$cell_type <- reference_integrated$cell_type
cat("CDS built:", ncol(cds), "cells\n")CDS built: 11466 cells
Partitions: 1 (should be 1)
# ── Learn principal graph ────────────────────────────────────────────────────
set.seed(1234)
cds <- learn_graph(
cds,
use_partition = FALSE,
close_loop = FALSE,
learn_graph_control = list(
minimal_branch_len = 10,
ncenter = 900,
orthogonal_proj_tip = FALSE
),
verbose = FALSE
)
n_nodes <- length(igraph::V(principal_graph(cds)$UMAP))
n_branch <- sum(igraph::degree(principal_graph(cds)$UMAP) > 2)
cat("\n✅ Principal graph learned\n")
✅ Principal graph learned
Nodes : 403
Branch points: 20 (expected 1-3 for effector axis + Treg branch)
if (n_branch == 0) {
stop("No branch point found — Treg lineage not separated.\n",
"Fix: re-run with minimal_branch_len = 5")
} else if (n_branch > 5) {
warning(paste("Many branch points:", n_branch,
"— consider minimal_branch_len = 15"))
}
# ── QC: Graph coloured by cell type ─────────────────────────────────────────
p_graph_l2 <- plot_cells(
cds,
color_cells_by = "predicted.celltype.l2",
label_cell_groups = FALSE,
show_trajectory_graph = TRUE,
cell_size = 0.7,
trajectory_graph_color = "black",
trajectory_graph_segment_size = 1.2
) +
scale_color_manual(values = azimuth_l2_colors, na.value = "grey70") +
ggtitle(sprintf("Principal graph — %d nodes, %d branch points", n_nodes, n_branch)) +
theme(plot.title = element_text(hjust = 0.5, face = "bold"))
p_graph_l2# ── Root selection: centroid of CD4 Naive cells ──────────────────────────────
naive_cells <- colnames(cds)[
grepl("naive|Naive|TN$", colData(cds)$predicted.celltype.l2, ignore.case = TRUE)]
cat("Naive cells for root centroid:", length(naive_cells), "\n")Naive cells for root centroid: 2037
stopifnot("No Naive cells found" = length(naive_cells) > 0)
naive_umap <- Embeddings(reference_integrated, "umap")[naive_cells, ]
naive_centroid <- colMeans(naive_umap)
cat(sprintf("Naive centroid: UMAP1=%.3f, UMAP2=%.3f\n",
naive_centroid[1], naive_centroid[2]))Naive centroid: UMAP1=-3.437, UMAP2=-0.644
pr_nodes <- t(cds@principal_graph_aux[["UMAP"]]$dp_mst)
node_dists <- rowSums(
(pr_nodes - matrix(naive_centroid, nrow = nrow(pr_nodes), ncol = 2, byrow = TRUE))^2
)
root_node <- names(which.min(node_dists))
cat("Root node selected:", root_node, "\n")Root node selected: Y_19
# ════════════════════════════════════════════════════════════════════════════
# ORDER CELLS — monocle3_pseudotime computed here, NEVER overwritten
# ════════════════════════════════════════════════════════════════════════════
cds <- order_cells(cds, root_pr_nodes = root_node)
# Store in reference object
reference_integrated$monocle3_pseudotime <- pseudotime(cds)
reference_integrated$monocle3_pseudotime[
!is.finite(reference_integrated$monocle3_pseudotime)] <- NA
cat("\n✅ monocle3_pseudotime stored in reference_integrated (FROZEN)\n")
✅ monocle3_pseudotime stored in reference_integrated (FROZEN)
print(summary(reference_integrated$monocle3_pseudotime[
is.finite(reference_integrated$monocle3_pseudotime)])) Min. 1st Qu. Median Mean 3rd Qu. Max.
0.000 4.545 11.180 12.310 18.214 29.034
# ── Topology validation — HARD STOPS ────────────────────────────────────────
pt_order <- reference_integrated@meta.data %>%
filter(is.finite(monocle3_pseudotime)) %>%
group_by(predicted.celltype.l2) %>%
summarise(
n = n(),
med_pt = round(median(monocle3_pseudotime, na.rm = TRUE), 3),
.groups = "drop"
) %>%
arrange(med_pt)
cat("\n=== State median pseudotimes (topology check) ===\n")
=== State median pseudotimes (topology check) ===
get_med <- function(state) {
v <- pt_order$med_pt[pt_order$predicted.celltype.l2 == state]
if (length(v) == 0) stop(paste("State not found in topology:", state))
v
}
naive_med <- get_med("CD4 Naive")
tcm_med <- get_med("CD4 TCM")
treg_med <- get_med("Treg")
tem_med <- get_med("CD4 TEM")
temra_med <- get_med("CD4 Temra/CTL")
if (naive_med >= tcm_med)
stop(sprintf("TOPOLOGY ERROR: Naive(%.2f) >= TCM(%.2f)", naive_med, tcm_med))
if (tcm_med >= tem_med)
stop(sprintf("TOPOLOGY ERROR: TCM(%.2f) >= TEM(%.2f)", tcm_med, tem_med))
if (tem_med >= temra_med)
stop(sprintf("TOPOLOGY ERROR: TEM(%.2f) >= Temra(%.2f)", tem_med, temra_med))
if (treg_med >= tem_med)
stop(sprintf(
"TOPOLOGY ERROR: Treg(%.2f) >= TEM(%.2f) — Treg must branch from TCM\n",
treg_med, tem_med))
cat(sprintf(
"\n✅ Topology confirmed: Naive(%.3f) < TCM(%.3f) < Treg(%.3f) < TEM(%.3f) < Temra(%.3f)\n",
naive_med, tcm_med, treg_med, tem_med, temra_med
))
✅ Topology confirmed: Naive(3.389) < TCM(13.027) < Treg(18.114) < TEM(26.890) < Temra(27.350)
# ── QC Figure: Pseudotime on UMAP ────────────────────────────────────────────
p_pt_graph <- plot_cells(
cds,
color_cells_by = "pseudotime",
label_cell_groups = FALSE,
show_trajectory_graph = TRUE,
cell_size = 0.7,
trajectory_graph_color = "black",
trajectory_graph_segment_size = 1.2
) +
scale_color_viridis_c(option = "plasma", name = "Pseudotime") +
ggtitle("Monocle3 Pseudotime — Root = CD4 Naive") +
theme(plot.title = element_text(hjust = 0.5, face = "bold"))
p_pt_seurat <- FeaturePlot(
reference_integrated,
features = "monocle3_pseudotime",
reduction = "umap"
) +
scale_color_viridis_c(option = "plasma", name = "Pseudotime") +
ggtitle("Pseudotime on Reference UMAP (Seurat)") +
theme_classic()
qc3 <- p_pt_graph | p_pt_seurat
qc3save_fig(qc3, "QC5_monocle3_pseudotime_UMAP", w = 18, h = 8)
# ── QC: Pseudotime distribution by state ─────────────────────────────────────
pt_meta <- reference_integrated@meta.data %>%
filter(is.finite(monocle3_pseudotime)) %>%
mutate(predicted.celltype.l2 = factor(
predicted.celltype.l2,
levels = c("CD4 Naive","CD4 TCM","Treg","CD4 TEM","CD4 Temra/CTL")))
p_pt_violin <- ggplot(pt_meta,
aes(x = predicted.celltype.l2,
y = monocle3_pseudotime,
fill = predicted.celltype.l2)) +
geom_violin(scale = "width", trim = TRUE, alpha = 0.85) +
geom_boxplot(width = 0.12, fill = "white", outlier.size = 0.5) +
scale_fill_manual(values = azimuth_l2_colors) +
geom_hline(yintercept = c(naive_med, tcm_med, treg_med, tem_med, temra_med),
linetype = "dashed", colour = "black", linewidth = 0.4, alpha = 0.5) +
theme_classic() +
theme(axis.text.x = element_text(angle = 30, hjust = 1),
legend.position = "none") +
labs(
title = "Reference Pseudotime by State (Topology Confirmed)",
subtitle = sprintf("Naive=%.3f | TCM=%.3f | Treg=%.3f | TEM=%.3f | Temra=%.3f",
naive_med, tcm_med, treg_med, tem_med, temra_med),
x = NULL, y = "monocle3_pseudotime"
)
p_pt_violinAll_samples_Merged <- readRDS(
"../../../../../1-Seurat_RDS_OBJECT_FINAL/All_samples_Merged_with_Renamed_Clusters_Cell_state-03-12-2025.rds.rds"
)
All_samples_Merged$Group <- ifelse(
All_samples_Merged$cell_line %in% paste0("L", 1:7),
"MalignantCD4T", "Other"
)
MalignantCD4T_raw <- subset(All_samples_Merged, subset = Group == "MalignantCD4T")
cat("Sézary cells loaded:", ncol(MalignantCD4T_raw), "\n")Sézary cells loaded: 40695
L1 L2 L3 L4 L5 L6 L7 CD4Tcells_lab CD4Tcells_10x
5825 5935 6428 6006 6022 5148 5331 0 0
used (Mb) gc trigger (Mb) max used (Mb)
Ncells 10262648 548.1 15749107 841.1 15749107 841.1
Vcells 1465774711 11183.0 3187287748 24317.1 2707109753 20653.7
# ── QC Figure: Cell count per line ────────────────────────────────────────────
line_df <- as.data.frame(table(MalignantCD4T_raw$cell_line))
colnames(line_df) <- c("Line", "Cells")
p_cell_count <- ggplot(line_df, aes(x = Line, y = Cells, fill = Line)) +
geom_bar(stat = "identity", width = 0.7) +
geom_text(aes(label = comma(Cells)), vjust = -0.4, size = 3.5, fontface = "bold") +
scale_fill_manual(values = line_colors) +
scale_y_continuous(labels = comma, expand = expansion(mult = c(0, 0.15))) +
theme_classic() +
theme(legend.position = "none",
axis.text = element_text(size = 11)) +
labs(title = "Sézary Cell Lines — Cell Counts",
x = "Cell Line", y = "Number of Cells")
p_cell_count# Must specify assay="SCT" — DefaultAssay is "integrated" at this point
ref_hvgs <- VariableFeatures(reference_integrated, assay = "integrated")
final_features <- intersect(ref_hvgs, shared_hvgs_clean)
cat(sprintf("\nShared HVGs : %d\n", length(shared_hvgs)))
Shared HVGs : 3000
After junk : 2866
Ref HVGs : 2902
Final (ref ∩ query): 1356 genes
if (length(final_features) < 1500)
warning("Fewer than 1500 shared features — consider nfeatures = 4000")
# ── Merge, scale, PCA ────────────────────────────────────────────────────────
MalignantCD4T <- merge(cell_line_list[[1]], y = cell_line_list[-1],
merge.data = TRUE)
VariableFeatures(MalignantCD4T) <- final_features
MalignantCD4T <- ScaleData(MalignantCD4T, features = final_features,
assay = "SCT", verbose = FALSE)
# npcs = 50 matches the reference integrated PCA (50 dims)
# FindTransferAnchors takes min(ref_dims, query_dims) automatically
MalignantCD4T <- RunPCA(MalignantCD4T, features = final_features,
assay = "SCT", npcs = 50, verbose = FALSE)
cat("\n✅ Query ready\n")
✅ Query ready
Cells : 40695
Features: 1356
PCA dims: 50
# ── QC Figure: HVG overlap ───────────────────────────────────────────────────
hvg_overlap_df <- data.frame(
Set = c("Reference HVGs", "Query shared HVGs (clean)", "Final intersection"),
Genes = c(length(ref_hvgs), length(shared_hvgs_clean), length(final_features))
)
p_hvg <- ggplot(hvg_overlap_df, aes(x = Set, y = Genes, fill = Set)) +
geom_bar(stat = "identity", width = 0.6) +
geom_text(aes(label = Genes), vjust = -0.4, size = 4, fontface = "bold") +
scale_fill_manual(values = c("#2166AC","#74ADD1","#D73027")) +
scale_y_continuous(expand = expansion(mult = c(0, 0.15))) +
theme_classic() +
theme(legend.position = "none",
axis.text.x = element_text(angle = 20, hjust = 1)) +
labs(title = "HVG Overlap: Reference vs Query", x = NULL, y = "Gene Count")
p_hvg used (Mb) gc trigger (Mb) max used (Mb)
Ncells 10612439 566.8 15749107 841.1 15749107 841.1
Vcells 1392620214 10624.9 4407706339 33628.2 5509632923 42035.2
# Genes variable in ALL 7 lines
hvg_all7 <- names(hvg_freq[hvg_freq == 7])
cat("Genes variable in all 7 lines:", length(hvg_all7), "\n")Genes variable in all 7 lines: 689
[1] "AARS" "ABCC1" "ABHD3" "AC004687.1" "AC006064.4" "AC008105.3" "AC011603.2" "AC020916.1" "ACAT2" "ACLY"
[11] "ACTB" "ACTG1" "ACTN4" "ADA" "ADAM19" "ADGRE5" "AHNAK" "AHR" "AKAP13" "AL133415.1"
[21] "AL138963.4" "AL662797.1" "ALOX5AP" "ANKRD17" "ANKRD44" "ANLN" "ANXA1" "ANXA2" "ANXA6" "APP"
[31] "ARGLU1" "ARHGAP11A" "ARHGAP15" "ARHGDIB" "ARHGEF6" "ARL4C" "ARL6IP1" "ASF1B" "ASPM" "ATAD2"
[41] "ATP1A1" "ATP2A2" "ATP2B1" "ATP5MC1" "ATXN1" "AURKA" "AURKB" "B2M" "BACH2" "BCAT1"
[51] "BCL2" "BHLHE40" "BIRC3" "BIRC5" "BIRC6" "BLVRA" "BRCA1" "BRIP1" "BTG1" "BTG2"
[61] "BUB1" "BUB1B" "C12orf75" "C21orf58" "CALM1" "CALM2" "CALR" "CAMK4" "CANX" "CAPN2"
[71] "CARHSP1" "CASK" "CASP8" "CAST" "CBR3" "CCDC28B" "CCDC86" "CCL5" "CCNA2" "CCNB1"
[81] "CCNB2" "CCND2" "CCND3" "CCNE1" "CCNE2" "CCNF" "CCR7" "CCT5" "CD151" "CD3D"
[91] "CD48" "CD52" "CD55" "CD59" "CD69" "CD70" "CD74" "CD96" "CDC20" "CDC6"
[101] "CDCA2" "CDCA3" "CDCA5" "CDCA7" "CDCA8" "CDK1" "CDK2AP2" "CDK6" "CDKAL1" "CDKN1A"
[111] "CDKN2D" "CDKN3" "CDT1" "CEBPB" "CELF2" "CENPA" "CENPE" "CENPF" "CENPU" "CEP128"
[121] "CEP55" "CHAC1" "CHAF1A" "CHCHD10" "CHST11" "CISH" "CKAP2" "CKAP2L" "CKAP5" "CKS1B"
[131] "CKS2" "CLDND1" "CLSPN" "CLTC" "CNN2" "CORO1A" "CORO1B" "COTL1" "CRIP1" "CTSC"
[141] "CTSD" "CXCR4" "CYBA" "CYLD" "CYP51A1" "CYTH1" "CYTIP" "CYTOR" "DANCR" "DCTN1"
[151] "DDIT3" "DDIT4" "DDX21" "DDX3X" "DDX6" "DENND1B" "DENND4C" "DEPDC1" "DEPDC1B" "DIAPH3"
[161] "DLG1" "DLGAP5" "DNAJA1" "DNAJB1" "DOCK10" "DOCK2" "DOCK8" "DSCC1" "DTL" "DUSP2"
[171] "DYNLL1" "E2F1" "ECT2" "EEF1A1" "EEF2" "EFHD2" "EIF1" "EIF4G1" "ELMO1" "EMP3"
[181] "ENO1" "ERC1" "ERN1" "ESCO2" "ESYT1" "ESYT2" "ETS1" "EVL" "EXOC4" "EZH2"
[191] "FABP5" "FAM107B" "FAM111A" "FAM111B" "FAM83D" "FASN" "FBXL17" "FBXL20" "FBXO5" "FDFT1"
[201] "FEN1" "FKBP11" "FKBP4" "FKBP5" "FLI1" "FLNA" "FLOT1" "FNDC3A" "FOS" "FOXN3"
[211] "FOXP2" "FTL" "FTX" "FUS" "FXYD5" "FYN" "GABPB1-AS1" "GAPDH" "GARS" "GAS5"
[221] "GATA3" "GINS2" "GNG2" "GOLGB1" "GPHN" "GPR15" "GPR171" "GPRIN3" "GRN" "GSTP1"
[231] "GTSE1" "H1FX" "H2AFX" "H2AFZ" "H3F3B" "HCST" "HDAC9" "HELLS" "HERPUD1" "HIPK2"
[241] "HIST1H1A" "HIST1H1B" "HIST1H1C" "HIST1H1D" "HIST1H1E" "HIST1H2AC" "HIST1H2AE" "HIST1H2AG" "HIST1H2AL" "HIST1H2BC"
[251] "HIST1H2BJ" "HIST1H3B" "HIST1H3D" "HIST1H3H" "HIST1H3I" "HIST1H4C" "HIST2H2AB" "HIST2H2AC" "HIST2H2BF" "HJURP"
[261] "HLA-A" "HLA-B" "HLA-C" "HLA-E" "HMGB2" "HMGCR" "HMGCS1" "HMGN2" "HMMR" "HNRNPA3"
[271] "HNRNPAB" "HNRNPH1" "HNRNPU" "HNRNPUL2" "HP1BP3" "HPGD" "HSP90AA1" "HSP90AB1" "HSP90B1" "HSPA1A"
[281] "HSPA1B" "HSPA5" "HSPA8" "HSPA9" "HSPB1" "HSPD1" "HSPE1" "HSPH1" "HUWE1" "IARS"
[291] "ID2" "IDH2" "IDI1" "IER2" "IER3" "IFITM1" "IGF2R" "IKZF1" "IKZF2" "IL10RA"
[301] "IL2RB" "IL32" "IL4R" "IL9R" "ILF3-DT" "IMMP2L" "INCENP" "INPP4B" "INSIG1" "IPO5"
[311] "IQGAP1" "IRF1" "ISG20" "ITGA4" "ITGAL" "ITGB2" "ITGB7" "ITK" "ITM2B" "ITM2C"
[321] "JPT1" "JUN" "JUNB" "JUND" "KCNQ5" "KIF11" "KIF14" "KIF15" "KIF18B" "KIF20A"
[331] "KIF20B" "KIF21B" "KIF23" "KIF2C" "KIF4A" "KIFC1" "KLF6" "KMT2C" "KNL1" "KNSTRN"
[341] "KPNA2" "LARP1" "LASP1" "LAT" "LBR" "LCP1" "LCP2" "LDHA" "LDLR" "LDLRAD4"
[351] "LGALS1" "LIME1" "LINC00892" "LINC01572" "LMNA" "LMNB1" "LMO4" "LRBA" "LRPPRC" "LSP1"
[361] "LTA" "LTB" "LY6E" "LYST" "MACF1" "MALAT1" "MANF" "MAP3K8" "MARCKSL1" "MAT2A"
[371] "MBD5" "MBNL1" "MBP" "MCL1" "MCM10" "MCM2" "MCM3" "MCM4" "MCM5" "MCM6"
[381] "MCM7" "MDFIC" "MDN1" "MGST3" "MIB1" "MIF" "MIR4435-2HG" "MIS18BP1" "MKI67" "MKNK2"
[391] "MMP25" "MSH6" "MSI2" "MSMO1" "MSN" "MT-ATP6" "MT-ATP8" "MT-CO1" "MT-CO2" "MT-CO3"
[401] "MT-CYB" "MT-ND1" "MT-ND2" "MT-ND3" "MT-ND4" "MT-ND4L" "MT-ND5" "MT-ND6" "MT1X" "MT2A"
[411] "MTHFD2" "MTRNR2L12" "MXD3" "MYB" "MYC" "MYH9" "MYL6" "MYO1F" "MYO1G" "MZB1"
[421] "NAMPT" "NCAPD2" "NCAPG" "NCAPG2" "NCL" "NCOA3" "NDC80" "NEAT1" "NEIL3" "NEK2"
[431] "NEK7" "NFAT5" "NFATC2" "NFE2L3" "NFKB1" "NFKBIA" "NIBAN1" "NINJ1" "NKTR" "NME1"
[441] "NOLC1" "NOP16" "NORAD" "NPM1" "NQO1" "NR3C1" "NSD2" "NSMCE2" "NUCB2" "NUDT8"
[451] "NUF2" "NUFIP2" "NUMA1" "NUSAP1" "ODC1" "OPTN" "OSBPL3" "OSTF1" "P2RY8" "PALM2-AKAP2"
[461] "PARP14" "PCLAF" "PCNA" "PDE3B" "PDE4D" "PDE7A" "PGAM1" "PGK1" "PHACTR2" "PHF19"
[471] "PHLDA1" "PIK3CD" "PIK3R1" "PIM1" "PIM2" "PIM3" "PKM" "PKMYT1" "PLAAT4" "PLEC"
[481] "PLK1" "PLP2" "PLPP1" "PMAIP1" "PNN" "PNRC1" "POLR2A" "PPDPF" "PPP1R15A" "PPP3CA"
[491] "PRC1" "PRDX1" "PREX1" "PRKCA" "PRKDC" "PRNP" "PRR11" "PSAT1" "PTMA" "PTMS"
[501] "PTPN6" "PTPN7" "PTPRC" "PTTG1" "PUM3" "PUS7" "PVT1" "PYCARD" "RAB11FIP1" "RAB37"
[511] "RABGAP1L" "RACGAP1" "RAD21" "RAD51B" "RASGRP1" "RBL1" "RBM38" "RBPJ" "RCC2" "RCSD1"
[521] "REEP5" "RELB" "RERE" "RHBDD2" "RHOC" "RNF213" "RORA" "RPL10" "RPL11" "RPL12"
[531] "RPL13" "RPL19" "RPL22L1" "RPL32" "RPL41" "RPLP0" "RPLP1" "RPS12" "RPS14" "RPS18"
[541] "RPS2" "RPS23" "RPS3" "RPS3A" "RPS6KA5" "RPS8" "RRM2" "RSRP1" "RUNX3" "S100A10"
[551] "S100A11" "S100A4" "S100A6" "S100P" "SAC3D1" "SAMD9" "SASH3" "SCLT1" "SCPEP1" "SDCBP"
[561] "SEC14L1" "SELENOW" "SELPLG" "SEMA4D" "SEPTIN9" "SERPINB1" "SETX" "SFPQ" "SGO2" "SH2D2A"
[571] "SH3BGRL3" "SH3BP1" "SIK3" "SIT1" "SKAP1" "SLBP" "SLC16A1-AS1" "SLC1A5" "SLC20A1" "SLC25A32"
[581] "SLC2A3" "SLC38A2" "SLC3A2" "SLC43A3" "SLC4A7" "SLC7A5" "SLC9A3R1" "SLFN12L" "SMARCA2" "SMC1A"
[591] "SMC4" "SMG1" "SMYD3" "SNHG12" "SNHG15" "SNHG3" "SNHG7" "SNHG8" "SNRNP200" "SOCS1"
[601] "SORL1" "SOS1" "SP140" "SPAG5" "SPIDR" "SPOCK2" "SPTAN1" "SPTBN1" "SQLE" "SQSTM1"
[611] "SREBF2" "SRGN" "SRM" "SRRT" "SRSF7" "SSBP2" "ST8SIA4" "STAT1" "STAT3" "STAT4"
[621] "STK10" "STK17B" "STMN1" "SUN2" "SYNE2" "SYTL3" "TACC3" "TAF15" "TAGLN2" "TBC1D5"
[631] "TBL1X" "TCF12" "TCP1" "TFRC" "TIMP1" "TK1" "TMBIM1" "TMEM173" "TMPO" "TMSB10"
[641] "TMSB4X" "TMTC2" "TMX4" "TNFRSF1B" "TNFSF10" "TNIK" "TOP2A" "TPM4" "TPX2" "TRAF1"
[651] "TRAF3IP3" "TRBV20-1" "TRG-AS1" "TRIM44" "TRIM56" "TRIM59" "TRIO" "TROAP" "TSC22D3" "TTK"
[661] "TUBA1A" "TUBA1B" "TUBA1C" "TUBA4A" "TUBB" "TUBB4B" "TYMS" "UBALD2" "UBC" "UBE2C"
[671] "UBE2S" "UBR4" "UCP2" "UHRF1" "UNG" "VIM" "WARS" "WDR76" "WWOX" "XBP1"
[681] "YWHAG" "ZC3HAV1" "ZEB1" "ZFAND3" "ZFAS1" "ZFP36" "ZFP36L1" "ZFP36L2" "ZYX"
# Check which canonical T cell markers are in the all-7 set
t_cell_markers <- c(
# Naive/memory
"CCR7", "SELL", "TCF7", "IL7R", "LEF1", "KLF2",
# Activation/exhaustion
"TOX", "PDCD1", "LAG3", "TIGIT", "CTLA4", "HAVCR2",
# Effector
"GZMB", "GZMK", "GZMA", "PRF1", "IFNG", "TNF",
# Treg
"FOXP3", "IL2RA", "IKZF2", "CTLA4",
# Sézary specific
"KIR3DL2", "PLS3", "TWIST1", "EPHA4", "CD164",
# Proliferation
"MKI67", "TOP2A", "CDK1"
)
found_in_all7 <- intersect(t_cell_markers, hvg_all7)
cat("\nCanonical markers in all-7 HVG set:\n")
Canonical markers in all-7 HVG set:
[1] "CCR7" "IKZF2" "MKI67" "TOP2A" "CDK1"
Markers NOT in all-7 set:
[1] "SELL" "TCF7" "IL7R" "LEF1" "KLF2" "TOX" "PDCD1" "LAG3" "TIGIT" "CTLA4" "HAVCR2" "GZMB" "GZMK" "GZMA"
[15] "PRF1" "IFNG" "TNF" "FOXP3" "IL2RA" "KIR3DL2" "PLS3" "TWIST1" "EPHA4" "CD164"
# Check what junk genes are in your current HVG set
# before find-anchors runs
cat("=== Junk gene check on shared_hvgs ===\n")=== Junk gene check on shared_hvgs ===
mt_in_hvgs <- shared_hvgs[grepl("^MT-", shared_hvgs)]
cat("MT genes in shared_hvgs:", length(mt_in_hvgs), "\n")MT genes in shared_hvgs: 13
[1] "MT-ATP6" "MT-ATP8" "MT-CO1" "MT-CO2" "MT-CO3" "MT-CYB" "MT-ND1" "MT-ND2" "MT-ND3" "MT-ND4" "MT-ND4L" "MT-ND5" "MT-ND6"
ribo_in_hvgs <- shared_hvgs[grepl("^RPL|^RPS", shared_hvgs)]
cat("\nRibosomal genes in shared_hvgs:", length(ribo_in_hvgs), "\n")
Ribosomal genes in shared_hvgs: 53
[1] "RPL10" "RPL11" "RPL12" "RPL13" "RPL19" "RPL22L1" "RPL32" "RPL41" "RPLP0" "RPLP1" "RPS12" "RPS14" "RPS18" "RPS2"
[15] "RPS23" "RPS3" "RPS3A" "RPS6KA5" "RPS8" "RPL13A" "RPL18A" "RPL28" "RPL29" "RPL35A" "RPL5" "RPL7A" "RPL8" "RPS13"
[29] "RPS6" "RPS6KA3" "RPS7" "RPL17" "RPL23" "RPL30" "RPL35" "RPL6" "RPL7" "RPS15A" "RPS27L" "RPS4X" "RPL14" "RPL23A"
[43] "RPL26" "RPL27A" "RPL3" "RPL37" "RPL4" "RPS17" "RPS20" "RPS24" "RPS27A" "RPS6KA1" "RPS6KC1"
hsp_in_hvgs <- shared_hvgs[grepl("^HSP|^HSPA|^HSPB", shared_hvgs)]
cat("\nHeat shock genes in shared_hvgs:", length(hsp_in_hvgs), "\n")
Heat shock genes in shared_hvgs: 13
[1] "HSP90AA1" "HSP90AB1" "HSP90B1" "HSPA1A" "HSPA1B" "HSPA5" "HSPA8" "HSPA9" "HSPB1" "HSPD1" "HSPE1" "HSPH1"
[13] "HSPA4"
snhg_in_hvgs <- shared_hvgs[grepl("^SNHG|MALAT1|NEAT1", shared_hvgs)]
cat("\nlncRNA genes in shared_hvgs:", length(snhg_in_hvgs), "\n")
lncRNA genes in shared_hvgs: 15
[1] "MALAT1" "NEAT1" "SNHG12" "SNHG15" "SNHG3" "SNHG7" "SNHG8" "SNHG1" "SNHG29" "SNHG16" "SNHG25" "SNHG17" "SNHG30" "SNHG32" "SNHG5"
cat("\nTotal junk genes to be filtered:",
length(mt_in_hvgs) + length(ribo_in_hvgs) +
length(hsp_in_hvgs) + length(snhg_in_hvgs), "\n")
Total junk genes to be filtered: 94
DefaultAssay(reference_integrated) <- "SCT"
DefaultAssay(MalignantCD4T) <- "SCT"
dims_to_use <- min(50,
ncol(Embeddings(reference_integrated, "pca")),
ncol(Embeddings(MalignantCD4T, "pca")))
cat("Finding anchors: dims 1:", dims_to_use, "| features:", length(final_features), "\n\n")Finding anchors: dims 1: 50 | features: 1356
# ── HARD STOP: verify PCA rotation features overlap with final_features ──────
# reference.reduction="pca" projects the query into the reference PCA space.
# The PCA rotation matrix was built on "integrated" assay CCA features.
# final_features are SCT HVGs. Seurat uses only the intersection.
# If overlap is small, the projection is meaningless.
pca_features <- rownames(reference_integrated[["pca"]]@feature.loadings)
pca_overlap <- intersect(pca_features, final_features)
cat(sprintf("Reference PCA rotation features : %d\n", length(pca_features)))Reference PCA rotation features : 2902
final_features (SCT HVGs) : 1356
Overlap (used for projection) : 1356
if (length(pca_overlap) < 200)
stop(sprintf(
"CRITICAL: PCA–SCT feature overlap is only %d genes.\n",
length(pca_overlap),
"Query projection onto reference PCA will be unreliable.\n",
"Check that the reference object has SCT HVGs in its integrated PCA."
))
if (length(pca_overlap) < 500)
warning(sprintf("Low PCA–SCT feature overlap: %d genes. Projection quality may be reduced.", length(pca_overlap)))
anchors <- FindTransferAnchors(
reference = reference_integrated,
query = MalignantCD4T,
features = final_features,
normalization.method = "SCT",
reference.assay = "SCT", # use SCT expression for feature matching
query.assay = "SCT", # use SCT expression for feature matching
reference.reduction = "pca", # integrated PCA (50 dims) — biology intact
dims = 1:dims_to_use,
k.anchor = 10,
k.filter = 500,
k.score = 30,
verbose = TRUE
)[1] "Given reference assay has multiple sct models, selecting model with most cells for finding transfer anchors"
# ── Anchor QC ────────────────────────────────────────────────────────────────
anchor_df <- as.data.frame(slot(anchors, "anchors"))
n_anchors <- nrow(anchor_df)
anchor_ratio <- ncol(MalignantCD4T) / n_anchors
mean_score <- mean(anchor_df$score)
med_score <- median(anchor_df$score)
cat(sprintf("\n=== Anchor summary ===\n"))
=== Anchor summary ===
Anchors : 8022
Cells per anchor : 5.1:1 (ideal ≤ 8:1)
Score: mean=0.553 | median=0.538
if (anchor_ratio > 8)
warning("Low anchor density — check junk gene removal and k.anchor = 10")
# QC figure: anchor score distribution
p_anchor <- ggplot(anchor_df, aes(x = score)) +
geom_histogram(bins = 50, fill = "#2166AC", colour = "white", alpha = 0.85) +
geom_vline(xintercept = mean_score, linetype = "dashed", colour = "red",
linewidth = 0.8) +
geom_vline(xintercept = med_score, linetype = "dotted", colour = "darkgreen",
linewidth = 0.8) +
annotate("text", x = mean_score + 0.02, y = Inf, vjust = 1.5,
label = sprintf("mean=%.3f", mean_score), colour = "red", size = 3.5) +
annotate("text", x = med_score - 0.02, y = Inf, vjust = 3,
label = sprintf("median=%.3f", med_score), colour = "darkgreen",
size = 3.5, hjust = 1) +
theme_classic() +
labs(
title = sprintf("Transfer Anchor Score Distribution (n=%d anchors)", n_anchors),
subtitle = sprintf("Cells:anchors = %.1f:1", anchor_ratio),
x = "Anchor Score", y = "Count"
)
p_anchor# anchor_df has columns: cell1 (reference index), cell2 (query index), score
# cell2 indexes into the QUERY object (MalignantCD4T) — map back to cell_line
query_cell_names <- colnames(MalignantCD4T)
anchor_lines <- anchor_df %>%
mutate(
query_cell = query_cell_names[cell2],
cell_line = MalignantCD4T@meta.data[query_cell, "cell_line"]
) %>%
count(cell_line, name = "n_anchors") %>%
arrange(cell_line) %>%
mutate(
n_cells = as.integer(table(MalignantCD4T$cell_line)[cell_line]),
cells_per_anchor = round(n_cells / n_anchors, 2),
pct_anchors = round(100 * n_anchors / sum(n_anchors), 1)
)
cat("=== Anchors per cell line ===\n")=== Anchors per cell line ===
Total anchors : 8022
cat(sprintf("Overall ratio : %.1f cells per anchor\n",
sum(anchor_lines$n_cells) / sum(anchor_lines$n_anchors)))Overall ratio : 5.1 cells per anchor
# Flag any line with poor anchor density
poor_lines <- anchor_lines %>% filter(cells_per_anchor > 8)
if (nrow(poor_lines) > 0) {
warning(sprintf("Poor anchor density (>8:1) in: %s",
paste(poor_lines$cell_line, collapse = ", ")))
}
# ── Plot: anchors and cells:anchor ratio per line ─────────────────────────────
p_anch_n <- ggplot(anchor_lines, aes(x = cell_line, y = n_anchors, fill = cell_line)) +
geom_bar(stat = "identity", width = 0.7) +
geom_text(aes(label = sprintf("%d\n(%.1f%%)", n_anchors, pct_anchors)),
vjust = -0.3, size = 3.2, fontface = "bold") +
scale_fill_manual(values = line_colors) +
scale_y_continuous(expand = expansion(mult = c(0, 0.18))) +
theme_classic(base_size = 12) +
theme(legend.position = "none",
plot.title = element_text(face = "bold")) +
labs(title = "A Anchors per Cell Line",
x = "Cell Line", y = "Number of Anchors")
p_anch_ratio <- ggplot(anchor_lines, aes(x = cell_line, y = cells_per_anchor,
fill = cell_line)) +
geom_bar(stat = "identity", width = 0.7) +
geom_text(aes(label = sprintf("%.1f:1", cells_per_anchor)),
vjust = -0.3, size = 3.2, fontface = "bold") +
geom_hline(yintercept = 8, linetype = "dashed", colour = "red",
linewidth = 0.7) +
annotate("text", x = 0.6, y = 8.4, label = "8:1 threshold",
colour = "red", size = 3, hjust = 0) +
scale_fill_manual(values = line_colors) +
scale_y_continuous(expand = expansion(mult = c(0, 0.18))) +
theme_classic(base_size = 12) +
theme(legend.position = "none",
plot.title = element_text(face = "bold")) +
labs(title = "B Cells per Anchor (lower = better coverage)",
x = "Cell Line", y = "Cells : Anchor ratio")
p_anchor_lines <- p_anch_n | p_anch_ratio
p_anchor_lines# anchor_df columns:
# cell1 = index into REFERENCE (reference_integrated)
# cell2 = index into QUERY (MalignantCD4T)
# score = anchor score
ref_cell_names <- colnames(reference_integrated)
query_cell_names <- colnames(MalignantCD4T)
anchor_full <- anchor_df %>%
mutate(
ref_cell = ref_cell_names[cell1],
query_cell = query_cell_names[cell2],
ref_celltype = reference_integrated@meta.data[ref_cell, "predicted.celltype.l2"],
query_line = MalignantCD4T@meta.data[query_cell, "cell_line"]
)
# ── Table 1: anchors per reference cell type ─────────────────────────────────
by_reftype <- anchor_full %>%
count(ref_celltype, name = "n_anchors") %>%
arrange(desc(n_anchors)) %>%
mutate(
pct_anchors = round(100 * n_anchors / sum(n_anchors), 1),
# how many reference cells of this type exist?
n_ref_cells = as.integer(table(reference_integrated$predicted.celltype.l2)[ref_celltype]),
anchors_per_ref_cell = round(n_anchors / n_ref_cells, 2)
)
cat("=== Anchors by reference Azimuth l2 cell type ===\n")=== Anchors by reference Azimuth l2 cell type ===
# ── Table 2: cross-table — query line × reference cell type ──────────────────
cross_tab <- anchor_full %>%
count(query_line, ref_celltype) %>%
group_by(query_line) %>%
mutate(pct = round(100 * n / sum(n), 1)) %>%
ungroup()
cat("\n=== Anchor source (ref cell type) per query cell line ===\n")
=== Anchor source (ref cell type) per query cell line ===
cross_wide <- cross_tab %>%
dplyr::select(query_line, ref_celltype, pct) %>%
tidyr::pivot_wider(names_from = ref_celltype, values_from = pct,
values_fill = 0)
print(cross_wide)
# ── Plot A: bar chart — anchors per reference cell type ──────────────────────
p_by_type <- ggplot(by_reftype,
aes(x = reorder(ref_celltype, n_anchors),
y = n_anchors,
fill = ref_celltype)) +
geom_bar(stat = "identity", width = 0.7) +
geom_text(aes(label = sprintf("%d (%.1f%%)", n_anchors, pct_anchors)),
hjust = -0.08, size = 3.5, fontface = "bold") +
scale_fill_manual(values = azimuth_l2_colors, na.value = "grey70") +
scale_y_continuous(expand = expansion(mult = c(0, 0.25))) +
coord_flip() +
theme_classic(base_size = 13) +
theme(legend.position = "none",
plot.title = element_text(face = "bold")) +
labs(title = "A Anchors per Reference Cell Type (Azimuth l2)",
x = NULL, y = "Number of Anchors")
# ── Plot B: anchors per reference cell in that type (density of use) ─────────
p_density <- ggplot(by_reftype,
aes(x = reorder(ref_celltype, anchors_per_ref_cell),
y = anchors_per_ref_cell,
fill = ref_celltype)) +
geom_bar(stat = "identity", width = 0.7) +
geom_text(aes(label = sprintf("%.2f", anchors_per_ref_cell)),
hjust = -0.1, size = 3.5, fontface = "bold") +
scale_fill_manual(values = azimuth_l2_colors, na.value = "grey70") +
scale_y_continuous(expand = expansion(mult = c(0, 0.25))) +
coord_flip() +
theme_classic(base_size = 13) +
theme(legend.position = "none",
plot.title = element_text(face = "bold")) +
labs(title = "B Anchors per Reference Cell\n(how heavily each state is used)",
x = NULL, y = "Anchors / Reference Cell")
# ── Plot C: stacked bar — which ref cell type anchors each query line ─────────
cross_tab <- cross_tab %>%
mutate(ref_celltype = factor(ref_celltype, levels = names(azimuth_l2_colors))) %>%
arrange(query_line, ref_celltype) %>%
group_by(query_line) %>%
mutate(
label_y = cumsum(pct) - 0.5 * pct,
label = ifelse(pct >= 3, sprintf("%.1f%%", pct), "")
) %>%
ungroup()
p_cross <- ggplot(cross_tab,
aes(x = query_line, y = pct, fill = ref_celltype)) +
geom_bar(stat = "identity", width = 0.78) +
geom_text(aes(y = label_y, label = label),
size = 3.0, fontface = "bold", colour = "white", vjust = 0.5) +
scale_fill_manual(values = azimuth_l2_colors, na.value = "grey70",
name = "Reference\nCell Type") +
scale_y_continuous(labels = function(x) paste0(x, "%"),
expand = expansion(mult = c(0, 0.02))) +
theme_classic(base_size = 13) +
theme(plot.title = element_text(face = "bold"),
legend.position = "right") +
labs(title = "C Reference Cell Type Composition of Anchors per Query Line",
x = "Query Cell Line", y = "% of Anchors")
# ── Assemble ──────────────────────────────────────────────────────────────────
p_anchor_reftype <- (p_by_type | p_density) / p_cross +
plot_layout(heights = c(1, 1.2)) +
plot_annotation(
title = "Anchor Quality — Reference Cell Type Breakdown",
subtitle = sprintf("Total anchors: %d | Reference cells: %d | Query cells: %d",
nrow(anchor_df),
ncol(reference_integrated),
ncol(MalignantCD4T)),
theme = theme(
plot.title = element_text(size = 15, face = "bold"),
plot.subtitle = element_text(size = 10, colour = "grey40")
)
)
p_anchor_reftype✅ MapQuery complete
Mapped cells: 40695
# Label transfer confidence
score_col <- "predicted.predicted.celltype.l2.score"
if (score_col %in% colnames(mapped_MalignantCD4T@meta.data)) {
scores <- mapped_MalignantCD4T@meta.data[[score_col]]
cat(sprintf("Label transfer confidence: mean=%.3f | median=%.3f\n",
mean(scores, na.rm = TRUE), median(scores, na.rm = TRUE)))
}Label transfer confidence: mean=0.958 | median=1.000
Transferred label distribution:
CD4 Naive CD4 TCM CD4 TEM Treg
37 40643 14 1
Transferred pseudotime summary:
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.2367 15.1562 19.5877 20.4392 25.7830 28.4802
# ── QC Figure: Projection overview ───────────────────────────────────────────
ref_bg <- data.frame(
Embeddings(reference_integrated, "umap"),
stringsAsFactors = FALSE
)
colnames(ref_bg)[1:2] <- c("UMAP_1","UMAP_2")
query_coords <- data.frame(
Embeddings(mapped_MalignantCD4T, "ref.umap"),
pseudotime = mapped_MalignantCD4T$predicted.pseudotime,
celltype = mapped_MalignantCD4T$predicted.predicted.celltype.l2,
stringsAsFactors = FALSE
)
colnames(query_coords)[1:2] <- c("UMAP_1","UMAP_2")
p_proj_pt <- ggplot() +
geom_point(data = ref_bg, aes(UMAP_1, UMAP_2),
colour = "grey87", size = 0.3, alpha = 0.6) +
geom_point(data = query_coords %>% filter(is.finite(pseudotime)),
aes(UMAP_1, UMAP_2, colour = pseudotime),
size = 0.5, alpha = 0.8) +
scale_colour_viridis_c(option = "plasma", name = "Pseudotime") +
theme_classic() +
ggtitle("Sézary cells — transferred pseudotime") +
theme(plot.title = element_text(hjust = 0.5, face = "bold"))
p_proj_ct <- ggplot() +
geom_point(data = ref_bg, aes(UMAP_1, UMAP_2),
colour = "grey87", size = 0.3, alpha = 0.6) +
geom_point(data = query_coords,
aes(UMAP_1, UMAP_2, colour = celltype),
size = 0.5, alpha = 0.8) +
scale_colour_manual(values = azimuth_l2_colors, na.value = "grey60",
name = "State") +
guides(colour = guide_legend(override.aes = list(size = 3, alpha = 1))) +
theme_classic() +
ggtitle("Sézary cells — transferred cell state labels") +
theme(plot.title = element_text(hjust = 0.5, face = "bold"))
qc4 <- p_proj_pt | p_proj_ct
qc4save_fig(qc4, "QC11_sezary_projection_overview", w = 18, h = 8)
# Score distribution
if (score_col %in% colnames(mapped_MalignantCD4T@meta.data)) {
score_df <- data.frame(score = mapped_MalignantCD4T@meta.data[[score_col]])
p_score <- ggplot(score_df, aes(x = score)) +
geom_histogram(bins = 60, fill = "#2166AC", colour = "white", alpha = 0.85) +
geom_vline(xintercept = median(score_df$score, na.rm = TRUE),
linetype = "dashed", colour = "red", linewidth = 0.8) +
annotate("text", x = median(score_df$score, na.rm = TRUE) + 0.01,
y = Inf, vjust = 1.5, hjust = 0,
label = sprintf("median=%.3f", median(score_df$score, na.rm = TRUE)),
colour = "red", size = 3.5) +
theme_classic() +
labs(title = "Label Transfer Confidence Score — Sézary Cells",
x = "Score", y = "Cell Count")
p_score
save_fig(p_score, "QC12_label_transfer_confidence", w = 10, h = 6)
}# ════════════════════════════════════════════════════════════════════════════
# Bin boundaries are midpoints between adjacent reference state medians.
# Computed from reference_integrated$monocle3_pseudotime — the SAME column
# that was passed to MapQuery. The pseudotime scale of predicted.pseudotime
# in Sézary cells inherits this scale directly.
# ════════════════════════════════════════════════════════════════════════════
naive_tcm_cut <- round((naive_med + tcm_med) / 2, 3)
tcm_tem_cut <- round((tcm_med + tem_med) / 2, 3)
tem_temra_cut <- round((tem_med + temra_med) / 2, 3)
cat("=== Bin boundaries ===\n")=== Bin boundaries ===
Naive | TCM = (3.389 + 13.027) / 2 = 8.208
TCM | TEM = (13.027 + 26.890) / 2 = 19.959
TEM | Temra = (26.890 + 27.350) / 2 = 27.120
Treg = label-based (branch — not linear axis)
# ── Assign bins ───────────────────────────────────────────────────────────────
mapped_MalignantCD4T$state_azimuth_l2 <-
mapped_MalignantCD4T$predicted.predicted.celltype.l2
mapped_MalignantCD4T$pseudotime_value <-
as.numeric(mapped_MalignantCD4T$predicted.pseudotime)
mapped_MalignantCD4T$pseudotime_bin <- factor(
dplyr::case_when(
mapped_MalignantCD4T$state_azimuth_l2 == "Treg" ~ "Treg-like",
mapped_MalignantCD4T$pseudotime_value < naive_tcm_cut ~ "Naive-like",
mapped_MalignantCD4T$pseudotime_value >= naive_tcm_cut &
mapped_MalignantCD4T$pseudotime_value < tcm_tem_cut ~ "TCM-like",
mapped_MalignantCD4T$pseudotime_value >= tcm_tem_cut &
mapped_MalignantCD4T$pseudotime_value < tem_temra_cut ~ "TEM-like",
mapped_MalignantCD4T$pseudotime_value >= tem_temra_cut ~ "Temra-like",
TRUE ~ NA_character_
),
levels = c("Naive-like","TCM-like","Treg-like","TEM-like","Temra-like")
)
cat("\n=== Bin distribution ===\n")
=== Bin distribution ===
bin_tab <- table(mapped_MalignantCD4T$pseudotime_bin, useNA = "ifany")
bin_pct <- round(100 * prop.table(bin_tab), 2)
print(data.frame(n = as.integer(bin_tab), pct = as.numeric(bin_pct),
row.names = names(bin_tab)))
=== Per cell line ===
Naive-like TCM-like Treg-like TEM-like Temra-like
L1 927 2099 1 2467 331
L2 38 1842 0 645 3410
L3 4 3881 0 1597 946
L4 20 3308 0 2161 517
L5 9 3867 0 1330 816
L6 0 2196 0 891 2061
L7 1 3005 0 1651 674
# ── QC Figure: Bin distribution ──────────────────────────────────────────────
bin_df <- mapped_MalignantCD4T@meta.data %>%
filter(!is.na(pseudotime_bin)) %>%
count(pseudotime_bin) %>%
mutate(pct = round(100 * n / sum(n), 1))
p_bin_bar <- ggplot(bin_df, aes(x = pseudotime_bin, y = n, fill = pseudotime_bin)) +
geom_bar(stat = "identity", width = 0.7) +
geom_text(aes(label = sprintf("%s\n(%.1f%%)", comma(n), pct)),
vjust = -0.3, size = 3.5, fontface = "bold") +
scale_fill_manual(values = bin_colors) +
scale_y_continuous(labels = comma, expand = expansion(mult = c(0, 0.18))) +
theme_classic() +
theme(legend.position = "none",
axis.text = element_text(size = 11)) +
labs(title = "Sézary Cells — Pseudotime Bin Distribution",
x = NULL, y = "Cell Count")
p_bin_barsave_fig(p_bin_bar, "QC13_pseudotime_bin_distribution", w = 9, h = 6)
# ── QC Figure: Bins on UMAP ───────────────────────────────────────────────────
query_bins <- data.frame(
Embeddings(mapped_MalignantCD4T, "ref.umap"),
pseudotime_bin = mapped_MalignantCD4T$pseudotime_bin,
cell_line = mapped_MalignantCD4T$cell_line
)
colnames(query_bins)[1:2] <- c("UMAP_1","UMAP_2")
p_bin_umap <- ggplot() +
geom_point(data = ref_bg, aes(UMAP_1, UMAP_2),
colour = "grey67", size = 0.3, alpha = 0.5) +
geom_point(data = query_bins %>% filter(!is.na(pseudotime_bin)),
aes(UMAP_1, UMAP_2, colour = pseudotime_bin),
size = 0.5, alpha = 0.8) +
scale_colour_manual(values = bin_colors, name = "Bin") +
guides(colour = guide_legend(override.aes = list(size = 3, alpha = 1))) +
theme_classic() +
ggtitle("Sézary Cells — Pseudotime Bins on Reference UMAP") +
theme(plot.title = element_text(hjust = 0.5, face = "bold"))
p_bin_umap# Reference
saveRDS(reference_integrated,
file.path(out_dir, "reference_integrated_RPCA_monocle3.Rds"))
cat("✅ reference_integrated saved (CCA integration intact, monocle3 pseudotime added)\n")✅ reference_integrated saved (CCA integration intact, monocle3 pseudotime added)
# Mapped Sézary
saveRDS(mapped_MalignantCD4T,
file.path(out_dir, "MalignantCD4T_mapped_pseudotime.Rds"))Error: C stack usage 7971300 is too close to the limit
✅ mapped_MalignantCD4T saved
# Bin boundaries — single source of truth
bin_boundaries <- data.frame(
boundary = c("Naive_TCM", "TCM_TEM", "TEM_Temra"),
pseudotime_cut = c(naive_tcm_cut, tcm_tem_cut, tem_temra_cut),
naive_med = naive_med,
tcm_med = tcm_med,
treg_med = treg_med,
tem_med = tem_med,
temra_med = temra_med
)
write.csv(bin_boundaries,
file.path(out_dir, "bin_boundaries.csv"),
row.names = FALSE)
cat("✅ bin_boundaries.csv saved\n")✅ bin_boundaries.csv saved
# Full metadata
write.csv(mapped_MalignantCD4T@meta.data,
file.path(out_dir, "metadata_full.csv"),
row.names = TRUE)
cat("✅ metadata_full.csv saved\n")✅ metadata_full.csv saved
Saved files:
[1] "bin_boundaries.csv" "Manuscript_Figures" "metadata_full.csv"
[4] "QC_Figures" "reference_integrated_RPCA_monocle3.Rds"
All figures below are saved as high-resolution PNG (300 dpi) and PDF.
# Panel A: UMAP coloured by Azimuth l2
p_f1a <- DimPlot(
reference_integrated,
group.by = "predicted.celltype.l2",
reduction = "umap",
label = TRUE, repel = TRUE, label.size = 4.5, label.box = TRUE
) +
scale_color_manual(values = azimuth_l2_colors, na.value = "grey70") +
ggtitle("A Healthy CD4\u207a T Cell Reference\n(Azimuth l2 States)") +
theme_classic(base_size = 13) +
theme(
plot.title = element_text(face = "bold", hjust = 0),
legend.position = "bottom",
legend.text = element_text(size = 10)
) +
guides(colour = guide_legend(nrow = 2, override.aes = list(size = 4)))
# Panel B: Pseudotime on reference UMAP
p_f1b <- FeaturePlot(
reference_integrated,
features = "monocle3_pseudotime",
reduction = "umap",
order = TRUE
) +
scale_color_viridis_c(option = "plasma", name = "Pseudotime") +
ggtitle("B Monocle3 Pseudotime\n(Root = CD4 Naive)") +
theme_classic(base_size = 13) +
theme(plot.title = element_text(face = "bold", hjust = 0))
# Panel C: Pseudotime violin by state
state_order <- c("CD4 Naive","CD4 TCM","Treg","CD4 TEM","CD4 Temra/CTL")
state_labels <- c("Naive","TCM","Treg","TEM","Temra/CTL")
pt_meta2 <- reference_integrated@meta.data %>%
filter(is.finite(monocle3_pseudotime),
predicted.celltype.l2 %in% state_order) %>%
mutate(State = factor(predicted.celltype.l2,
levels = state_order,
labels = state_labels))
# ── Define colour vector BEFORE the ggplot call — never inside the + chain ───
azimuth_5 <- azimuth_l2_colors[state_order]
names(azimuth_5) <- state_labels
p_f1c <- ggplot(pt_meta2, aes(x = State, y = monocle3_pseudotime, fill = State)) +
geom_violin(scale = "width", trim = TRUE, alpha = 0.85) +
geom_boxplot(width = 0.12, fill = "white",
outlier.size = 0.5, outlier.alpha = 0.3) +
scale_fill_manual(values = azimuth_5) +
geom_hline(
yintercept = c(naive_med, tcm_med, treg_med, tem_med, temra_med),
linetype = "dashed", colour = "black", linewidth = 0.4, alpha = 0.5
) +
theme_classic(base_size = 13) +
theme(
legend.position = "none",
axis.text.x = element_text(angle = 30, hjust = 1),
plot.title = element_text(face = "bold", hjust = 0)
) +
labs(
title = "C Pseudotime by T Cell State",
subtitle = sprintf("Naive=%.2f | TCM=%.2f | Treg=%.2f | TEM=%.2f | Temra=%.2f",
naive_med, tcm_med, treg_med, tem_med, temra_med),
x = NULL, y = "Pseudotime"
)
# ── Assemble figure ───────────────────────────────────────────────────────────
fig1 <- (p_f1a | p_f1b) / p_f1c +
plot_layout(heights = c(1.2, 1)) +
plot_annotation(
title = "Figure 1 — Healthy CD4\u207a T Cell Reference Atlas",
subtitle = sprintf("n=%d cells | Monocle3 trajectory | Root: CD4 Naive",
ncol(reference_integrated)),
theme = theme(
plot.title = element_text(size = 16, face = "bold"),
plot.subtitle = element_text(size = 11, colour = "grey40")
)
)
fig1# Panel A: All Sézary cells on reference UMAP coloured by pseudotime
p_f2a <- ggplot() +
geom_point(data = ref_bg, aes(UMAP_1, UMAP_2),
colour = "grey67", size = 0.3, alpha = 0.5) +
geom_point(data = query_coords %>% filter(is.finite(pseudotime)),
aes(UMAP_1, UMAP_2, colour = pseudotime),
size = 0.4, alpha = 0.8) +
scale_colour_viridis_c(option = "plasma", name = "Transferred\nPseudotime") +
theme_classic(base_size = 13) +
ggtitle("A Sézary Cells Projected onto\nHealthy CD4⁺ T Cell Reference") +
theme(plot.title = element_text(face = "bold", hjust = 0))
# Panel B: Bins on UMAP
p_f2b <- ggplot() +
geom_point(data = ref_bg, aes(UMAP_1, UMAP_2),
colour = "grey67", size = 0.3, alpha = 0.5) +
geom_point(data = query_bins %>% filter(!is.na(pseudotime_bin)),
aes(UMAP_1, UMAP_2, colour = pseudotime_bin),
size = 0.4, alpha = 0.8) +
scale_colour_manual(values = bin_colors, name = "State Bin") +
guides(colour = guide_legend(override.aes = list(size = 3.5, alpha = 1))) +
theme_classic(base_size = 13) +
ggtitle("B Differentiation State Bins\n(Pseudotime-anchored)") +
theme(plot.title = element_text(face = "bold", hjust = 0))
# Panel C: Bin % stacked by line
line_bin_df <- mapped_MalignantCD4T@meta.data %>%
filter(!is.na(pseudotime_bin)) %>%
count(cell_line, pseudotime_bin) %>%
group_by(cell_line) %>%
mutate(pct = 100 * n / sum(n)) %>%
ungroup()
p_f2c <- ggplot(line_bin_df,
aes(x = cell_line, y = pct, fill = pseudotime_bin)) +
geom_bar(stat = "identity", width = 0.78) +
scale_fill_manual(values = bin_colors, name = "State Bin") +
scale_y_continuous(labels = function(x) paste0(x, "%")) +
theme_classic(base_size = 13) +
theme(
legend.position = "right",
plot.title = element_text(face = "bold", hjust = 0)
) +
labs(title = "C State Bin Composition per Cell Line",
x = "Cell Line", y = "% Cells")
fig2 <- (p_f2a | p_f2b) / p_f2c +
plot_layout(heights = c(1.3, 1)) +
plot_annotation(
title = "Figure 2 — Sézary CD4⁺ T Cell Projection and State Assignment",
subtitle = sprintf("n=%d cells across %d cell lines",
ncol(mapped_MalignantCD4T),
length(unique(mapped_MalignantCD4T$cell_line))),
theme = theme(
plot.title = element_text(size = 16, face = "bold"),
plot.subtitle = element_text(size = 11, colour = "grey40")
)
)
fig2
# Panel A
p_f2a <- ggplot() +
geom_point(data = ref_bg, aes(UMAP_1, UMAP_2),
colour = "grey67", size = 0.3, alpha = 0.5) +
geom_point(data = query_coords %>% filter(is.finite(pseudotime)),
aes(UMAP_1, UMAP_2, colour = pseudotime),
size = 0.4, alpha = 0.8) +
scale_colour_viridis_c(option = "plasma", name = "Transferred\nPseudotime") +
theme_classic(base_size = 13) +
ggtitle("A Sézary Cells Projected onto\nHealthy CD4\u207a T Cell Reference") +
theme(plot.title = element_text(face = "bold", hjust = 0))
# Panel B
p_f2b <- ggplot() +
geom_point(data = ref_bg, aes(UMAP_1, UMAP_2),
colour = "grey67", size = 0.3, alpha = 0.5) +
geom_point(data = query_bins %>% filter(!is.na(pseudotime_bin)),
aes(UMAP_1, UMAP_2, colour = pseudotime_bin),
size = 0.4, alpha = 0.8) +
scale_colour_manual(values = bin_colors, name = "State Bin") +
guides(colour = guide_legend(override.aes = list(size = 3.5, alpha = 1))) +
theme_classic(base_size = 13) +
ggtitle("B Differentiation State Bins\n(Pseudotime-anchored)") +
theme(plot.title = element_text(face = "bold", hjust = 0))
# Panel C: stacked bar with correctly positioned % labels
# ── KEY FIX: sort within each line by the SAME factor order ggplot uses,
# then cumsum so the midpoints match the actual rendered stack position ──
bin_level_order <- c("Naive-like","TCM-like","Treg-like","TEM-like","Temra-like")
line_bin_df <- mapped_MalignantCD4T@meta.data %>%
filter(!is.na(pseudotime_bin)) %>%
count(cell_line, pseudotime_bin) %>%
group_by(cell_line) %>%
mutate(pct = 100 * n / sum(n)) %>%
ungroup() %>%
mutate(pseudotime_bin = factor(pseudotime_bin, levels = bin_level_order)) %>%
# sort within each cell_line in REVERSE factor order — ggplot stacks
# bottom-to-top in factor order, so cumsum must go bottom-to-top too
arrange(cell_line, pseudotime_bin) %>%
group_by(cell_line) %>%
mutate(
# position = midpoint of this segment in the stacked bar
label_y = cumsum(pct) - 0.5 * pct,
label = ifelse(pct >= 3, sprintf("%.1f%%", pct), "")
) %>%
ungroup()
p_f2c <- ggplot(line_bin_df,
aes(x = cell_line, y = pct, fill = pseudotime_bin)) +
geom_bar(stat = "identity", width = 0.78) +
geom_text(
aes(y = label_y, label = label),
size = 3.2,
fontface = "bold",
colour = "white",
vjust = 0.5
) +
scale_fill_manual(values = bin_colors, name = "State Bin") +
scale_y_continuous(labels = function(x) paste0(x, "%"),
expand = expansion(mult = c(0, 0.02))) +
theme_classic(base_size = 13) +
theme(
legend.position = "right",
plot.title = element_text(face = "bold", hjust = 0)
) +
labs(title = "C State Bin Composition per Cell Line",
x = "Cell Line", y = "% Cells")
fig2_v2 <- (p_f2a | p_f2b) / p_f2c +
plot_layout(heights = c(1.3, 1)) +
plot_annotation(
title = "Figure 2 — Sézary CD4\u207a T Cell Projection and State Assignment",
subtitle = sprintf("n=%d cells across %d cell lines",
ncol(mapped_MalignantCD4T),
length(unique(mapped_MalignantCD4T$cell_line))),
theme = theme(
plot.title = element_text(size = 16, face = "bold"),
plot.subtitle = element_text(size = 11, colour = "grey40")
)
)
fig2_v2pt_sezary <- mapped_MalignantCD4T@meta.data %>%
filter(is.finite(pseudotime_value), !is.na(pseudotime_bin)) %>%
mutate(
cell_line = factor(cell_line, levels = paste0("L", 1:7)),
pseudotime_bin = factor(pseudotime_bin,
levels = c("Naive-like","TCM-like","Treg-like",
"TEM-like","Temra-like"))
)
# Panel A: Ridge plot of pseudotime per line
p_f3a <- ggplot(pt_sezary,
aes(x = pseudotime_value, y = cell_line, fill = cell_line)) +
geom_density_ridges(scale = 1.2, alpha = 0.85, colour = "white",
quantile_lines = TRUE, quantiles = 2) +
geom_vline(xintercept = c(naive_tcm_cut, tcm_tem_cut, tem_temra_cut),
linetype = "dashed", colour = "black", linewidth = 0.6, alpha = 0.7) +
annotate("text",
x = c(naive_tcm_cut, tcm_tem_cut, tem_temra_cut),
y = 1.2, vjust = -0.2, hjust = -0.05,
label = c("Naive|TCM","TCM|TEM","TEM|Temra"),
size = 3, colour = "black") +
scale_fill_manual(values = line_colors) +
theme_classic(base_size = 13) +
theme(legend.position = "none",
plot.title = element_text(face = "bold", hjust = 0)) +
labs(title = "A Pseudotime Distribution per Cell Line",
x = "Transferred Pseudotime", y = "Cell Line")
# Panel B: Bin proportions — overall bubble/bar
bin_total_df <- mapped_MalignantCD4T@meta.data %>%
filter(!is.na(pseudotime_bin)) %>%
count(pseudotime_bin) %>%
mutate(pct = round(100 * n / sum(n), 1),
pseudotime_bin = factor(pseudotime_bin,
levels = c("Naive-like","TCM-like","Treg-like",
"TEM-like","Temra-like")))
p_f3b <- ggplot(bin_total_df, aes(x = pseudotime_bin, y = pct,
fill = pseudotime_bin)) +
geom_bar(stat = "identity", width = 0.7) +
geom_text(aes(label = sprintf("%.1f%%\n(n=%s)", pct, comma(n))),
vjust = -0.3, size = 3.5, fontface = "bold") +
scale_fill_manual(values = bin_colors) +
scale_y_continuous(expand = expansion(mult = c(0, 0.2)),
labels = function(x) paste0(x, "%")) +
theme_classic(base_size = 13) +
theme(legend.position = "none",
axis.text.x = element_text(angle = 25, hjust = 1),
plot.title = element_text(face = "bold", hjust = 0)) +
labs(title = "B State Bin Composition — All Sézary Cells",
x = NULL, y = "% Cells")
# Panel C: Heatmap of bin % by line
line_bin_wide <- line_bin_df %>%
dplyr::select(cell_line, pseudotime_bin, pct) %>%
tidyr::pivot_wider(names_from = pseudotime_bin, values_from = pct,
values_fill = 0) %>%
tibble::column_to_rownames("cell_line")
annot_colors <- list(State = bin_colors)
p_f3c_grob <- pheatmap(
as.matrix(line_bin_wide),
color = colorRampPalette(c("white","#FEE090","#D73027"))(100),
display_numbers = TRUE,
number_format = "%.1f",
number_color = "black",
fontsize_number = 9,
cluster_rows = FALSE,
cluster_cols = FALSE,
border_color = "white",
main = "C State Bin % by Cell Line",
angle_col = 45,
silent = TRUE
)
fig3 <- (p_f3a | p_f3b) /
wrap_elements(p_f3c_grob$gtable) +
plot_layout(heights = c(1.2, 1)) +
plot_annotation(
title = "Figure 3 — Pseudotime Distribution and State Composition",
theme = theme(plot.title = element_text(size = 16, face = "bold"))
)
fig3DefaultAssay(mapped_MalignantCD4T) <- "SCT"
# Panels A–D: violin/dot plots of key state markers
state_markers <- list(
"Naive/TCM" = c("CCR7","SELL","TCF7","IL7R","LEF1"),
"Treg" = c("FOXP3","IL2RA","IKZF2","CTLA4","TNFRSF4"),
"TEM" = c("GZMK","GZMA","CCL5","NKG7","CX3CR1"),
"Temra/CTL" = c("GZMB","PRF1","GNLY","FGFBP2","FCGR3A")
)
all_ms <- unique(unlist(state_markers))
avail_ms <- intersect(all_ms, rownames(mapped_MalignantCD4T))
dp <- DotPlot(
mapped_MalignantCD4T,
features = avail_ms,
group.by = "pseudotime_bin",
assay = "SCT",
dot.scale = 7
) +
scale_color_gradient2(low = "#2166AC", mid = "white", high = "#D73027",
midpoint = 0, name = "Avg Expr") +
theme_classic(base_size = 12) +
theme(
axis.text.x = element_text(angle = 45, hjust = 1, size = 9),
plot.title = element_text(face = "bold", hjust = 0)
) +
labs(title = "Figure 4 — Key T Cell Marker Expression by Pseudotime Bin",
x = NULL, y = "State Bin")
dpline_plot_list <- lapply(paste0("L", 1:7), function(ln) {
df_ln <- query_bins %>% filter(cell_line == ln)
ggplot() +
geom_point(data = ref_bg, aes(UMAP_1, UMAP_2),
colour = "grey90", size = 0.25, alpha = 0.4) +
geom_point(data = df_ln %>% filter(!is.na(pseudotime_bin)),
aes(UMAP_1, UMAP_2, colour = pseudotime_bin),
size = 0.6, alpha = 0.9) +
scale_colour_manual(values = bin_colors, name = "Bin") +
theme_classic(base_size = 11) +
theme(legend.position = "none",
plot.title = element_text(face = "bold", hjust = 0.5, size = 10)) +
ggtitle(sprintf("%s (n=%s)", ln, comma(nrow(df_ln))))
})
# Shared legend
legend_plot <- ggplot(
data.frame(x=1:5, y=1, bin=names(bin_colors)),
aes(x, y, colour = bin)
) +
geom_point(size = 4) +
scale_colour_manual(values = bin_colors, name = "State Bin") +
guides(colour = guide_legend(override.aes = list(size = 5))) +
theme_void() +
theme(legend.position = "right")
shared_legend <- cowplot::get_legend(legend_plot)
s1_grid <- wrap_plots(line_plot_list, ncol = 4) +
plot_annotation(
title = "Supplementary Figure S1 — Pseudotime Bin Distribution per Cell Line",
subtitle = "Grey background = healthy reference cells",
theme = theme(
plot.title = element_text(size = 15, face = "bold"),
plot.subtitle = element_text(size = 10, colour = "grey40")
)
)
# Attach shared legend to right side using cowplot
s1 <- cowplot::plot_grid(s1_grid, shared_legend,
ncol = 2, rel_widths = c(1, 0.08))
s1# cowplot::plot_grid() returns a gtable, not a ggplot — use save_plot not ggsave
cowplot::save_plot(
filename = file.path(fig_dir_ms, "SuppFig_S1_per_line_UMAP.png"),
plot = s1,
base_width = 22, base_height = 14, dpi = 300, bg = "white"
)
cowplot::save_plot(
filename = file.path(fig_dir_pdf, "SuppFig_S1_per_line_UMAP.pdf"),
plot = s1,
base_width = 22, base_height = 14
)# Anchor scores
p_s2a <- ggplot(anchor_df, aes(x = score)) +
geom_histogram(bins = 60, fill = "#2166AC", colour = "white", alpha = 0.85) +
geom_vline(xintercept = median(anchor_df$score),
linetype = "dashed", colour = "red", linewidth = 0.8) +
annotate("text", x = median(anchor_df$score) + 0.02, y = Inf, vjust = 1.5, hjust = 0,
label = sprintf("median=%.3f", median(anchor_df$score)),
colour = "red", size = 3.5) +
theme_classic(base_size = 12) +
labs(title = "A Transfer Anchor Score Distribution",
subtitle = sprintf("n=%d anchors | cells:anchors = %.1f:1",
n_anchors, anchor_ratio),
x = "Anchor Score", y = "Count")
# Label transfer confidence
if (score_col %in% colnames(mapped_MalignantCD4T@meta.data)) {
sc_df <- data.frame(score = mapped_MalignantCD4T@meta.data[[score_col]])
p_s2b <- ggplot(sc_df, aes(x = score)) +
geom_histogram(bins = 60, fill = "#D73027", colour = "white", alpha = 0.85) +
geom_vline(xintercept = median(sc_df$score, na.rm = TRUE),
linetype = "dashed", colour = "black", linewidth = 0.8) +
annotate("text", x = median(sc_df$score, na.rm = TRUE) - 0.02,
y = Inf, vjust = 1.5, hjust = 1,
label = sprintf("median=%.3f", median(sc_df$score, na.rm = TRUE)),
colour = "black", size = 3.5) +
theme_classic(base_size = 12) +
labs(title = "B Label Transfer Confidence Score — Sézary Cells",
x = "Score", y = "Cell Count")
} else {
p_s2b <- ggplot() + theme_void() + ggtitle("Score column not available")
}
s2 <- p_s2a | p_s2b
s2 <- s2 + plot_annotation(
title = "Supplementary Figure S2 — Transfer Quality Metrics",
theme = theme(plot.title = element_text(size = 15, face = "bold"))
)
s2=== QC Figures saved ===
QC_DotPlot_AzimuthL2_markers.png
QC1_incoming_reference_UMAP.png
QC10_anchor_score_distribution.png
QC11_sezary_projection_overview.png
QC12_label_transfer_confidence.png
QC13_pseudotime_bin_distribution.png
QC14_bins_on_UMAP.png
QC2_reference_UMAP_verified.png
QC3_reference_marker_features.png
QC4_principal_graph_cell_type.png
QC5_monocle3_pseudotime_UMAP.png
QC6_pseudotime_by_state_violin.png
QC8_sezary_cell_counts.png
QC9_HVG_overlap.png
=== Manuscript Figures saved ===
Figure1_Reference_Atlas.png
Figure2_Sezary_Projection.png
Figure3_Pseudotime_Distribution.png
Figure4_Marker_DotPlot.png
SuppFig_S1_per_line_UMAP.png
SuppFig_S2_transfer_quality.png
=== PDF Versions ===
Figure1_Reference_Atlas.pdf
Figure2_Sezary_Projection.pdf
Figure3_Pseudotime_Distribution.pdf
Figure4_Marker_DotPlot.pdf
SuppFig_S1_per_line_UMAP.pdf
SuppFig_S2_transfer_quality.pdf
R version 4.5.2 (2025-10-31)
Platform: x86_64-redhat-linux-gnu
Running under: Rocky Linux 9.7 (Blue Onyx)
Matrix products: default
BLAS/LAPACK: FlexiBLAS OPENBLAS-OPENMP; LAPACK version 3.9.0
locale:
[1] LC_CTYPE=C.UTF-8 LC_NUMERIC=C LC_TIME=C.UTF-8 LC_COLLATE=C.UTF-8 LC_MONETARY=C.UTF-8 LC_MESSAGES=C.UTF-8 LC_PAPER=C.UTF-8 LC_NAME=C LC_ADDRESS=C
[10] LC_TELEPHONE=C LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C
time zone: Europe/Paris
tzcode source: system (glibc)
attached base packages:
[1] stats4 stats graphics grDevices utils datasets methods base
other attached packages:
[1] future_1.69.0 igraph_2.2.2 cowplot_1.2.0 pheatmap_1.0.13 scales_1.4.0 Matrix_1.7-4 ggrepel_0.9.6
[8] ggridges_0.5.7 viridis_0.6.5 viridisLite_0.4.3 RColorBrewer_1.1-3 patchwork_1.3.2 ggplot2_4.0.2 tibble_3.3.1
[15] tidyr_1.3.2 dplyr_1.2.0 SeuratWrappers_0.4.0 monocle3_1.4.26 SingleCellExperiment_1.32.0 SummarizedExperiment_1.40.0 GenomicRanges_1.62.1
[22] Seqinfo_1.0.0 IRanges_2.44.0 S4Vectors_0.48.0 MatrixGenerics_1.22.0 matrixStats_1.5.0 Biobase_2.70.0 BiocGenerics_0.56.0
[29] generics_0.1.4 Seurat_5.4.0 SeuratObject_5.3.0 sp_2.2-1
loaded via a namespace (and not attached):
[1] rstudioapi_0.18.0 jsonlite_2.0.0 magrittr_2.0.4 spatstat.utils_3.2-1 nloptr_2.2.1 farver_2.1.2 ragg_1.5.0
[8] vctrs_0.7.1 ROCR_1.0-12 DelayedMatrixStats_1.32.0 minqa_1.2.8 spatstat.explore_3.7-0 htmltools_0.5.9 S4Arrays_1.10.1
[15] SparseArray_1.10.8 sctransform_0.4.3 parallelly_1.46.1 KernSmooth_2.23-26 htmlwidgets_1.6.4 ica_1.0-3 plyr_1.8.9
[22] plotly_4.12.0 zoo_1.8-15 mime_0.13 lifecycle_1.0.5 pkgconfig_2.0.3 rsvd_1.0.5 R6_2.6.1
[29] fastmap_1.2.0 rbibutils_2.4.1 fitdistrplus_1.2-6 shiny_1.12.1 digest_0.6.39 tensor_1.5.1 RSpectra_0.16-2
[36] irlba_2.3.7 textshaping_1.0.4 beachmat_2.26.0 labeling_0.4.3 progressr_0.18.0 spatstat.sparse_3.1-0 httr_1.4.8
[43] polyclip_1.10-7 abind_1.4-8 compiler_4.5.2 proxy_0.4-29 remotes_2.5.0 withr_3.0.2 S7_0.2.1
[50] fastDummies_1.7.5 R.utils_2.13.0 MASS_7.3-65 DelayedArray_0.36.0 tools_4.5.2 lmtest_0.9-40 otel_0.2.0
[57] httpuv_1.6.16 future.apply_1.20.1 goftest_1.2-3 glmGamPoi_1.22.0 R.oo_1.27.1 glue_1.8.0 nlme_3.1-168
[64] promises_1.5.0 grid_4.5.2 Rtsne_0.17 cluster_2.1.8.2 reshape2_1.4.5 gtable_0.3.6 spatstat.data_3.1-9
[71] R.methodsS3_1.8.2 data.table_1.18.2.1 XVector_0.50.0 spatstat.geom_3.7-0 RcppAnnoy_0.0.23 RANN_2.6.2 pillar_1.11.1
[78] stringr_1.6.0 spam_2.11-3 RcppHNSW_0.6.0 later_1.4.6 splines_4.5.2 lattice_0.22-9 survival_3.8-6
[85] deldir_2.0-4 tidyselect_1.2.1 miniUI_0.1.2 pbapply_1.7-4 knitr_1.51 reformulas_0.4.4 gridExtra_2.3
[92] scattermore_1.2 xfun_0.56 stringi_1.8.7 lazyeval_0.2.2 boot_1.3-32 evaluate_1.0.5 codetools_0.2-20
[99] BiocManager_1.30.27 cli_3.6.5 uwot_0.2.4 systemfonts_1.3.1 Rdpack_2.6.6 xtable_1.8-4 reticulate_1.45.0
[106] dichromat_2.0-0.1 Rcpp_1.1.1 globals_0.19.0 spatstat.random_3.4-4 png_0.1-8 spatstat.univar_3.1-6 parallel_4.5.2
[113] assertthat_0.2.1 dotCall64_1.2 sparseMatrixStats_1.22.0 lme4_1.1-38 listenv_0.10.0 purrr_1.2.1 rlang_1.1.7