# ── CRITICAL requirements for this object ─────────────────────────────────
# ✅ CD4 Proliferating cluster removed (MKI67+/TOP2A+/CDK1+)
# ✅ UMAP built with return.model = TRUE (required for MapQuery)
# ✅ monocle3_pseudotime column (will be recomputed here for reproducibility)
# ✅ predicted.celltype.l2 (Azimuth l2 labels)
# ✅ cell_type annotation (clusters 0-6)
reference_integrated <- readRDS(
"../1-Final_Custom_MST_Monocle3_Trajectory_and_mapping/1-Custom_MST/Objects/cd4_ref_dual_trajectory.rds"
)
cat("=== Reference object summary ===\n")
cat("Cells :", ncol(reference_integrated), "\n")
cat("Assays :", paste(names(reference_integrated@assays), collapse = ", "), "\n")
cat("Reductions:", paste(names(reference_integrated@reductions), collapse = ", "), "\n")
cat("UMAP model:", !is.null(reference_integrated@reductions$umap@misc$model), "\n")
cat("Clusters :",
paste(sort(unique(reference_integrated$seurat_clusters)), collapse = ", "), "\n")
# ── Confirm CD4 Proliferating cells absent ────────────────────────────────
if (any(grepl("Prolif|prolif|cycling|Cycling",
reference_integrated$cell_type, ignore.case = TRUE))) {
warning("⚠️ Proliferating cell_type labels detected — remove before proceeding.")
} else {
cat("✅ No proliferating cell_type labels\n")
}
prolif_markers <- c("MKI67", "TOP2A", "PCNA", "CDK1", "STMN1")
prolif_present <- intersect(prolif_markers, rownames(reference_integrated))
if (length(prolif_present) > 0) {
DefaultAssay(reference_integrated) <- "SCT"
prolif_expr <- Matrix::colMeans(
GetAssayData(reference_integrated, layer = "data")[prolif_present, , drop = FALSE]
)
cat("Prolif marker score — max:", round(max(prolif_expr), 3),
"| cells > 0.5:", sum(prolif_expr > 0.5), "\n")
if (sum(prolif_expr > 0.5) > 50) {
warning("⚠️ Substantial proliferation signal — verify removal was complete.")
} else {
cat("✅ Proliferating cells confirmed absent\n")
}
}
cat("\nCell type distribution (reference):\n")
print(table(reference_integrated$cell_type))
cat("\nAzimuth l2 distribution (reference):\n")
print(table(reference_integrated$predicted.celltype.l2))
# Extend palette to cover any labels not pre-seeded above
ref_l2_labels <- unique(as.character(reference_integrated$predicted.celltype.l2))
ref_l2_labels <- ref_l2_labels[!is.na(ref_l2_labels)]
azimuth_l2_colors <- extend_palette(azimuth_l2_colors, ref_l2_labels)
cat("\nColour palette covers", length(ref_l2_labels), "reference labels ✅\n")
# ── Hard stop: UMAP model must exist for MapQuery ─────────────────────────
if (is.null(reference_integrated@reductions$umap@misc$model)) {
stop(
"❌ UMAP model missing.\n",
" Re-run: reference_integrated <- RunUMAP(reference_integrated,\n",
" dims = 1:20, return.model = TRUE)\n",
" Then re-save and reload."
)
} else {
cat("✅ UMAP model present — MapQuery projection ready\n")
}All_samples_Merged <- readRDS(
"/home/nabbasi/apollo_home/1-Seurat_RDS_OBJECT_FINAL/All_samples_Merged_with_Renamed_Clusters_Cell_state-03-12-2025.rds.rds"
)
cat("All cell lines:\n")
print(table(All_samples_Merged$cell_line))
All_samples_Merged$Group <- ifelse(
All_samples_Merged$cell_line %in% paste0("L", 1:7), "MalignantCD4T",
ifelse(
All_samples_Merged$cell_line %in%
c("CD4Tcells_lab", "CD4Tcells_10x"),
"NormalCD4T", "Other"
)
)
cat("\nGroup distribution:\n")
print(table(All_samples_Merged$Group))
MalignantCD4T_raw <- subset(All_samples_Merged, subset = Group == "MalignantCD4T")
cat("\nMalignant CD4T cells:", ncol(MalignantCD4T_raw), "\n")
print(table(MalignantCD4T_raw$cell_line))
rm(All_samples_Merged); gc()# ══════════════════════════════════════════════════════════════════════════
# PER-CELL-LINE SCTransform
# ══════════════════════════════════════════════════════════════════════════
# WHY: Each cell line is a separate library with different sequencing depth
# and ambient RNA profile. A single merged SCT model conflates batch
# variation with biology. Per-line SCT corrects for this before projection.
#
# APPROACH:
# 1. Split by cell_line
# 2. SCTransform each line (regress percent.mt + cell cycle scores)
# 3. Select HVGs by cross-line frequency — most robust to batch effects
# 4. Merge, scale, PCA — ready for FindTransferAnchors
# ══════════════════════════════════════════════════════════════════════════
cat("=== Per-cell-line SCTransform ===\n")
# Save names BEFORE lapply to avoid double SplitObject call
# FIX: original script called SplitObject twice — wasteful and risky
cell_line_names <- names(SplitObject(MalignantCD4T_raw, split.by = "cell_line"))
cell_line_list <- SplitObject(MalignantCD4T_raw, split.by = "cell_line")
cat("Lines to process:", paste(cell_line_names, collapse = ", "), "\n\n")
cell_line_list <- lapply(cell_line_names, function(line_name) {
obj <- cell_line_list[[line_name]]
cat("Processing:", line_name, "| Cells:", ncol(obj), "\n")
if (!"percent.mt" %in% colnames(obj@meta.data))
obj[["percent.mt"]] <- PercentageFeatureSet(obj, pattern = "^MT-")
# Cell cycle regression removes cycling artefacts without removing cells
vars_regress <- tryCatch({
obj <- CellCycleScoring(obj,
s.features = cc.genes$s.genes,
g2m.features = cc.genes$g2m.genes,
set.ident = FALSE)
cat(" → Cell cycle scores computed\n")
c("percent.mt", "S.Score", "G2M.Score")
}, error = function(e) {
cat(" ⚠️ Cell cycle scoring failed — regressing percent.mt only\n")
"percent.mt"
})
obj <- SCTransform(obj,
vars.to.regress = vars_regress,
variable.features.n = 3000,
vst.flavor = "v2",
verbose = FALSE)
cat(" ✅", line_name, "complete |", length(VariableFeatures(obj)), "HVGs\n")
return(obj)
})
names(cell_line_list) <- cell_line_names # reuse saved names — no second SplitObject
# ── Select HVGs by cross-line frequency ───────────────────────────────────
all_hvg_lists <- lapply(cell_line_list, VariableFeatures)
hvg_freq <- sort(table(unlist(all_hvg_lists)), decreasing = TRUE)
n_hvgs <- min(3000, length(hvg_freq))
shared_hvgs <- names(hvg_freq)[1:n_hvgs]
cat("\nHVG selection:\n")
cat(" Variable in all", length(cell_line_list), "lines:",
sum(hvg_freq == length(cell_line_list)), "\n")
cat(" Variable in ≥3 lines:", sum(hvg_freq >= 3), "\n")
cat(" Final HVG set:", n_hvgs, "\n")
# ── Merge, scale, PCA ─────────────────────────────────────────────────────
MalignantCD4T <- merge(cell_line_list[[1]], y = cell_line_list[-1],
merge.data = TRUE)
VariableFeatures(MalignantCD4T) <- shared_hvgs
cat("\nRunning PCA on merged object...\n")
MalignantCD4T <- ScaleData(MalignantCD4T, features = shared_hvgs,
assay = "SCT", verbose = FALSE)
MalignantCD4T <- RunPCA(MalignantCD4T, features = shared_hvgs,
assay = "SCT", npcs = 30, verbose = FALSE)
cat("✅ Merged object ready\n")
cat(" Cells:", ncol(MalignantCD4T), "\n")
cat(" HVGs:", length(VariableFeatures(MalignantCD4T)), "\n")
cat(" PCA dims:", ncol(Embeddings(MalignantCD4T, "pca")), "\n\n")
print(table(MalignantCD4T$cell_line))
rm(MalignantCD4T_raw, cell_line_list); gc()# Genes variable in ALL 7 lines
hvg_all7 <- names(hvg_freq[hvg_freq == 7])
cat("Genes variable in all 7 lines:", length(hvg_all7), "\n")
print(hvg_all7)
# Check which canonical T cell markers are in the all-7 set
t_cell_markers <- c(
# Naive/memory
"CCR7", "SELL", "TCF7", "IL7R", "LEF1", "KLF2",
# Activation/exhaustion
"TOX", "PDCD1", "LAG3", "TIGIT", "CTLA4", "HAVCR2",
# Effector
"GZMB", "GZMK", "GZMA", "PRF1", "IFNG", "TNF",
# Treg
"FOXP3", "IL2RA", "IKZF2", "CTLA4",
# Sézary specific
"KIR3DL2", "PLS3", "TWIST1", "EPHA4", "CD164",
# Proliferation
"MKI67", "TOP2A", "CDK1"
)
found_in_all7 <- intersect(t_cell_markers, hvg_all7)
cat("\nCanonical markers in all-7 HVG set:\n")
print(found_in_all7)
not_found <- setdiff(t_cell_markers, hvg_all7)
cat("\nMarkers NOT in all-7 set:\n")
print(not_found)# Check what junk genes are in your current HVG set
# before find-anchors runs
cat("=== Junk gene check on shared_hvgs ===\n")
mt_in_hvgs <- shared_hvgs[grepl("^MT-", shared_hvgs)]
cat("MT genes in shared_hvgs:", length(mt_in_hvgs), "\n")
print(mt_in_hvgs)
ribo_in_hvgs <- shared_hvgs[grepl("^RPL|^RPS", shared_hvgs)]
cat("\nRibosomal genes in shared_hvgs:", length(ribo_in_hvgs), "\n")
print(ribo_in_hvgs)
hsp_in_hvgs <- shared_hvgs[grepl("^HSP|^HSPA|^HSPB", shared_hvgs)]
cat("\nHeat shock genes in shared_hvgs:", length(hsp_in_hvgs), "\n")
print(hsp_in_hvgs)
snhg_in_hvgs <- shared_hvgs[grepl("^SNHG|MALAT1|NEAT1", shared_hvgs)]
cat("\nlncRNA genes in shared_hvgs:", length(snhg_in_hvgs), "\n")
print(snhg_in_hvgs)
cat("\nTotal junk genes to be filtered:",
length(mt_in_hvgs) + length(ribo_in_hvgs) +
length(hsp_in_hvgs) + length(snhg_in_hvgs), "\n")
cat("=== Building Monocle3 CDS ===\n")
cds <- as.cell_data_set(reference_integrated)
# Transfer frozen UMAP coordinates — these define the trajectory space
reducedDim(cds, "UMAP") <- Embeddings(reference_integrated, "umap")
# Single partition: one connected graph across all healthy cells.
# CD4 Proliferating cells have been removed so no spurious cell-cycle
# branch will pull the graph away from the differentiation axis.
partition_vec <- setNames(factor(rep(1L, ncol(cds))), colnames(cds))
cds@clusters$UMAP$partitions <- partition_vec
cluster_vec <- setNames(
factor(reference_integrated$seurat_clusters[colnames(cds)]),
colnames(cds)
)
cds@clusters$UMAP$clusters <- cluster_vec
# Transfer metadata
colData(cds)$cell_line <- reference_integrated$orig.ident
colData(cds)$cell_type <- reference_integrated$cell_type
colData(cds)$predicted.celltype.l2 <- reference_integrated$predicted.celltype.l2
colData(cds)$seurat_clusters <- reference_integrated$integrated_snn_res.0.2
if ("orig.ident" %in% colnames(reference_integrated@meta.data))
colData(cds)$sample <- reference_integrated$orig.ident
# Validate all required slots are named correctly
stopifnot(
"clusters must be named" = !is.null(names(cds@clusters$UMAP$clusters)),
"partitions must be named" = !is.null(names(cds@clusters$UMAP$partitions)),
"cluster length matches" = length(cds@clusters$UMAP$clusters) == ncol(cds),
"partition length matches" = length(cds@clusters$UMAP$partitions) == ncol(cds)
)
cat("CDS built:", ncol(cds), "cells\n")
cat("Clusters:", nlevels(factor(colData(cds)$seurat_clusters)), "\n")
cat("Partitions:", nlevels(partitions(cds)), "\n\n")
cat("Azimuth l2 breakdown:\n")
print(table(colData(cds)$predicted.celltype.l2))# ── Learn principal graph ──────────────────────────────────────────────────
# Goal: recover two biological lineages as a single connected graph:
# Lineage 1: Naive → TCM → TEM → Temra/CTL (effector axis)
# Lineage 2: Naive → TCM → Treg (regulatory branch)
#
# use_partition = FALSE: single connected graph (no partition forcing)
# close_loop = FALSE: open trajectory (no loop back)
# minimal_branch_len = 5: short enough to detect the Treg branch
# — if Treg branch not detected, reduce to 3
# — if too many spurious branches, increase to 8
#
# NOTE: nn.k is NOT a valid learn_graph_control parameter in monocle3.
# FIX: removed nn.k — monocle3 handles neighbourhood structure internally.
# The key sensitivity parameter is minimal_branch_len.
set.seed(42)
cds <- learn_graph(
cds,
use_partition = FALSE,
close_loop = FALSE,
learn_graph_control = list(
minimal_branch_len = 10,
# ncenter = 100, # default is ~250 — reduce to simplify graph
orthogonal_proj_tip = FALSE
),
verbose = TRUE
)
cat("\n✅ Principal graph learned\n")
cat("Nodes:", length(igraph::V(principal_graph(cds)$UMAP)), "\n")
n_branch_points <- sum(igraph::degree(principal_graph(cds)$UMAP) > 2)
cat("Branch points:", n_branch_points, "\n")
if (n_branch_points == 0) {
warning(
"❌ No branch point — Treg lineage not separated.\n",
" Re-run learn_graph with minimal_branch_len = 3"
)
} else if (n_branch_points > 3) {
warning(
"⚠️ Too many branch points (", n_branch_points, ").\n",
" Re-run learn_graph with minimal_branch_len = 8"
)
} else {
cat("✅ Branch structure looks correct (1-3 branch points)\n")
}
# ── Trajectory visualisation ──────────────────────────────────────────────
umap_coords <- as.data.frame(Embeddings(reference_integrated, "umap"))
colnames(umap_coords) <- c("UMAP1", "UMAP2")
umap_coords$l2 <- reference_integrated$predicted.celltype.l2
umap_coords$cell_type <- reference_integrated$cell_type
label_l2 <- umap_coords %>%
group_by(l2) %>%
summarise(UMAP1 = median(UMAP1), UMAP2 = median(UMAP2), .groups = "drop")
label_ct <- umap_coords %>%
group_by(cell_type) %>%
summarise(UMAP1 = median(UMAP1), UMAP2 = median(UMAP2), .groups = "drop")
p_graph_l2 <- plot_cells(
cds,
color_cells_by = "predicted.celltype.l2",
label_groups_by_cluster = FALSE,
label_leaves = FALSE,
label_branch_points = TRUE,
cell_size = 0.7,
show_trajectory_graph = TRUE
) +
scale_color_manual(values = azimuth_l2_colors, name = "State") +
geom_label_repel(data = label_l2,
aes(x = UMAP1, y = UMAP2, label = l2),
size = 4, fontface = "bold", fill = "white", alpha = 0.85,
box.padding = 0.5, segment.color = "grey40",
inherit.aes = FALSE) +
ggtitle("Monocle3 Graph: Azimuth l2\n(Two lineages: effector axis + Treg branch)") +
theme(plot.title = element_text(hjust = 0.5, size = 13, face = "bold"),
legend.position = "none")
p_graph_ct <- plot_cells(
cds,
color_cells_by = "cell_type",
label_groups_by_cluster = FALSE,
label_leaves = FALSE,
label_branch_points = TRUE,
cell_size = 0.7,
show_trajectory_graph = TRUE
) +
geom_label_repel(data = label_ct,
aes(x = UMAP1, y = UMAP2, label = cell_type),
size = 3.5, fontface = "bold", fill = "white", alpha = 0.85,
box.padding = 0.5, segment.color = "grey40",
inherit.aes = FALSE) +
ggtitle("Monocle3 Graph: Cell Types") +
theme(plot.title = element_text(hjust = 0.5, size = 13, face = "bold"),
legend.position = "none")
p_graph_l2 | p_graph_ctcat("Azimuth l2 labels in CDS:\n")
print(sort(unique(colData(cds)$predicted.celltype.l2)))
# ── Identify naive cells ───────────────────────────────────────────────────
naive_ids <- rownames(colData(cds))[
grepl("naive|Naive|TN$", colData(cds)$predicted.celltype.l2,
ignore.case = TRUE) |
grepl("Tnaive", as.character(colData(cds)$cell_type),
ignore.case = FALSE)
]
cat("\nNaive root cells:", length(naive_ids), "\n")
if (length(naive_ids) == 0)
stop("❌ No naive cells found. Check label patterns above.")
# ── Find root node by minimum distance to naive centroid ──────────────────
# Using centroid distance (not most-populated node) is more robust:
# the most-populated node biases toward the densest part of the naive
# cluster which may not be the earliest point on the trajectory.
naive_umap <- Embeddings(reference_integrated, "umap")[naive_ids, ]
naive_centroid <- colMeans(naive_umap)
pr_nodes <- t(cds@principal_graph_aux[["UMAP"]]$dp_mst)
node_dists <- rowSums(
(pr_nodes - matrix(naive_centroid,
nrow = nrow(pr_nodes), ncol = 2,
byrow = TRUE))^2
)
root_node <- names(which.min(node_dists))
cat("Root node selected:", root_node, "\n")
cat(sprintf(" Root coords: (%.3f, %.3f)\n",
pr_nodes[root_node, 1], pr_nodes[root_node, 2]))
cat(sprintf(" Naive centroid: (%.3f, %.3f)\n",
naive_centroid[1], naive_centroid[2]))
# ── Order cells ────────────────────────────────────────────────────────────
cds <- order_cells(cds, root_pr_nodes = root_node)
# Transfer pseudotime back to Seurat object
reference_integrated$monocle3_pseudotime <- pseudotime(cds)
reference_integrated$monocle3_pseudotime[
!is.finite(reference_integrated$monocle3_pseudotime)] <- NA
cat("\nPseudotime summary (reference):\n")
print(summary(reference_integrated$monocle3_pseudotime[
is.finite(reference_integrated$monocle3_pseudotime)]))
# ══════════════════════════════════════════════════════════════════════════
# TOPOLOGY VALIDATION
# ══════════════════════════════════════════════════════════════════════════
# Correct order for each lineage:
# Effector: Naive < TCM < TEM < Temra
# Treg: Naive < TCM — Treg (Treg branches from TCM, NOT after TEM)
#
# This is a hard biological requirement. If Treg > TEM in pseudotime,
# the graph has placed Treg downstream of TEM which is anatomically wrong.
# Treg differentiates from TCM-stage precursors via FOXP3 induction,
# not from terminally differentiated TEM/Temra cells.
pt_check <- reference_integrated@meta.data %>%
filter(is.finite(monocle3_pseudotime)) %>%
group_by(predicted.celltype.l2) %>%
summarise(
n = n(),
med_pt = round(median(monocle3_pseudotime, na.rm = TRUE), 3),
mean_pt = round(mean(monocle3_pseudotime, na.rm = TRUE), 3),
.groups = "drop"
) %>%
arrange(med_pt)
cat("\n=== Pseudotime order per state (must be Naive < TCM < Treg < TEM < Temra) ===\n")
print(pt_check)
# Extract state medians — used for bin boundaries downstream
naive_med <- pt_check$med_pt[pt_check$predicted.celltype.l2 == "CD4 Naive"]
tcm_med <- pt_check$med_pt[pt_check$predicted.celltype.l2 == "CD4 TCM"]
treg_med <- pt_check$med_pt[pt_check$predicted.celltype.l2 == "Treg"]
tem_med <- pt_check$med_pt[pt_check$predicted.celltype.l2 == "CD4 TEM"]
temra_med <- pt_check$med_pt[pt_check$predicted.celltype.l2 == "CD4 Temra/CTL"]
# FIX: single-string sprintf — original had two format strings which
# caused the second string to be silently ignored
cat(sprintf(
"\nState medians: Naive=%.2f | TCM=%.2f | Treg=%.2f | TEM=%.2f | Temra=%.2f\n",
naive_med, tcm_med, treg_med, tem_med, temra_med
))
# Hard stops — script will not continue if topology is biologically wrong
if (naive_med > tcm_med)
stop(sprintf(
"❌ TOPOLOGY ERROR: Naive (%.2f) > TCM (%.2f).\n Root node may be misplaced — check naive_centroid coordinates.",
naive_med, tcm_med))
if (tcm_med > tem_med)
stop(sprintf(
"❌ TOPOLOGY ERROR: TCM (%.2f) > TEM (%.2f).\n Re-check root node selection.",
tcm_med, tem_med))
if (treg_med > tem_med)
stop(sprintf(
"❌ TOPOLOGY ERROR: Treg (%.2f) > TEM (%.2f).\n Treg must branch from TCM, not be downstream of TEM.\n Fix: re-run learn_graph with minimal_branch_len = 3",
treg_med, tem_med))
# FIX: single sprintf format string (was split across two strings before)
cat(sprintf(
"✅ Topology confirmed: Naive(%.2f) < TCM(%.2f) < Treg(%.2f) < TEM(%.2f) < Temra(%.2f)\n",
naive_med, tcm_med, treg_med, tem_med, temra_med
))
# ── Pseudotime UMAP plots ─────────────────────────────────────────────────
plot_cells(
cds,
color_cells_by = "pseudotime",
label_cell_groups = FALSE,
cell_size = 0.8,
show_trajectory_graph = TRUE
) +
scale_color_viridis_c(option = "plasma", name = "Pseudotime") +
ggtitle("Reference Pseudotime\n(Root = CD4 Naive | Two lineages: TEM/Temra + Treg)") +
theme(plot.title = element_text(hjust = 0.5, size = 13, face = "bold"))# Visual validation that pseudotime order matches expected biology.
# Both violin and ridge plots should show:
# Naive (lowest) → TCM → Treg (branch, intermediate) → TEM → Temra (highest)
pt_data <- data.frame(
pseudotime = reference_integrated$monocle3_pseudotime,
l2 = reference_integrated$predicted.celltype.l2
) %>% filter(is.finite(pseudotime), !is.na(l2))
# Order x-axis by median pseudotime — should give biological order
l2_order <- pt_data %>%
group_by(l2) %>%
summarise(med_pt = median(pseudotime, na.rm = TRUE), .groups = "drop") %>%
arrange(med_pt) %>%
pull(l2)
pt_data$l2 <- factor(pt_data$l2, levels = l2_order)
p_violin <- ggplot(pt_data, aes(x = l2, y = pseudotime, fill = l2)) +
geom_violin(scale = "width", alpha = 0.85, trim = TRUE) +
geom_boxplot(width = 0.12, fill = "white", outlier.size = 0.3) +
scale_fill_manual(values = azimuth_l2_colors, na.value = "grey70") +
theme_classic(base_size = 11) +
theme(axis.text.x = element_text(angle = 40, hjust = 1, size = 9),
legend.position = "none",
plot.title = element_text(hjust = 0.5, face = "bold")) +
labs(title = "Reference Pseudotime per Azimuth l2 Label",
subtitle = "Expected order: Naive < TCM < Treg < TEM < Temra",
x = "", y = "Monocle3 Pseudotime")
p_ridge <- ggplot(pt_data, aes(x = pseudotime, y = l2, fill = l2)) +
geom_density_ridges(alpha = 0.75, scale = 1.2, rel_min_height = 0.01) +
scale_fill_manual(values = azimuth_l2_colors, na.value = "grey70") +
theme_classic(base_size = 10) +
theme(legend.position = "none",
plot.title = element_text(hjust = 0.5, face = "bold")) +
labs(title = "Reference Pseudotime Density by Azimuth l2 Label",
x = "Pseudotime", y = "")
p_violin | p_ridge
# ════════════════════════════════════════════════════════════════════════════════
# CRITICAL FIX: Rebuild reference PCA in SCT space (preserves UMAP trajectory)
# PROBLEM: reference_integrated PCA = "integrated" assay (50 dims)
# MalignantCD4T PCA = "SCT" assay (30 dims)
# SOLUTION: SCT + PCA on reference → unified SCT space for FindTransferAnchors
# UMAP MODEL 100% SAFE — lives in reductions$umap@misc$model
# ════════════════════════════════════════════════════════════════════════════════
cat("=== DIAGNOSTIC: Current reference state ===\n")
cat("DefaultAssay:", DefaultAssay(reference_integrated), "\n")
cat("SCT HVGs:", length(VariableFeatures(reference_integrated)), "\n")
cat("PCA dims:", ncol(Embeddings(reference_integrated, "pca")), "\n")
cat("UMAP model present:", !is.null(reference_integrated@reductions$umap@misc$model), "\n")
# 1. Sanity check UMAP before
old_umap_coords <- Embeddings(reference_integrated, "umap")[1:3, ]
old_umap_model <- !is.null(reference_integrated@reductions$umap@misc$model)
cat("\nUMAP first 3 cells (backup):", paste(round(old_umap_coords[1,], 3), collapse=" "), "\n")
# 2. SCT on reference (creates SCT assay + HVGs)
DefaultAssay(reference_integrated) <- "RNA"
reference_integrated <- SCTransform(
reference_integrated,
variable.features.n = 3000,
vst.flavor = "v2",
vars.to.regress = c("percent.mt", "S.Score", "G2M.Score"),
verbose = FALSE
)
# 3. PCA in SCT space (matches MalignantCD4T exactly)
reference_integrated <- RunPCA(
reference_integrated,
assay = "SCT",
npcs = 30, # ← Match query dims exactly
verbose = FALSE
)
# 4. Verify fix
cat("\n=== FIXED: New reference state ===\n")
cat("DefaultAssay:", DefaultAssay(reference_integrated), "\n")
cat("SCT HVGs:", length(VariableFeatures(reference_integrated)), "\n")
cat("PCA dims:", ncol(Embeddings(reference_integrated, "pca")), "\n")
cat("UMAP model preserved:", !is.null(reference_integrated@reductions$umap@misc$model), "\n")
# 5. UMAP integrity check
new_umap_coords <- Embeddings(reference_integrated, "umap")[1:3, ]
cat("UMAP unchanged:", all(old_umap_coords == new_umap_coords), "\n")
cat("✅ Reference now in SCT space — FindTransferAnchors will work for ALL 7 lines\n")# ══════════════════════════════════════════════════════════════════════════
# FIND TRANSFER ANCHORS
# ══════════════════════════════════════════════════════════════════════════
# Anchors are mutual nearest neighbours between reference and query in
# shared PCA space. Their quality determines the accuracy of:
# (a) label transfer → which normal state does this Sézary cell resemble?
# (b) pseudotime transfer → where along the differentiation axis is it?
#
# FEATURE SELECTION STRATEGY:
# Step 1 — intersect reference + query HVGs
# Step 2 — remove technical/junk genes (MT, ribosomal, HSP, lncRNA, histone)
# WHY: these genes are variable due to library quality and stress,
# NOT due to T cell differentiation state. Keeping them biases
# anchors toward batch effects rather than biology.
# Step 3 — rank remaining genes by joint residual variance (ref + query)
# WHY: genes highly variable in BOTH datasets are the most
# reliable for finding true biological nearest neighbours
# ══════════════════════════════════════════════════════════════════════════
DefaultAssay(reference_integrated) <- "SCT"
DefaultAssay(MalignantCD4T) <- "SCT"
# ── Step 1: Intersect reference and query HVGs ────────────────────────────
ref_hvgs <- VariableFeatures(reference_integrated, assay = "SCT")
if (length(ref_hvgs) == 0) {
cat("⚠️ No HVGs stored in reference — extracting from scale.data\n")
ref_hvgs <- rownames(GetAssayData(reference_integrated, assay = "SCT",
layer = "scale.data"))
}
query_hvgs <- VariableFeatures(MalignantCD4T)
common_features <- intersect(ref_hvgs, query_hvgs)
cat("=== Feature set construction ===\n")
cat("Reference HVGs :", length(ref_hvgs), "\n")
cat("Query HVGs :", length(query_hvgs), "\n")
cat("Common HVGs :", length(common_features), "\n")
if (length(common_features) < 1500)
warning(paste0("⚠️ Only ", length(common_features),
" common features — below recommended 1500.\n",
" Increase nfeatures in SCTransform to 4000."))
# ── Step 2: Remove technical/junk genes ───────────────────────────────────
# Categories removed and WHY:
# ^MT- : mitochondrial OXPHOS genes — variable due to cell quality/stress
# ^RPL/S : ribosomal proteins — variable due to translation rate/library depth
# ^HSP* : heat shock proteins — variable due to stress, not differentiation
# ^SNHG/MALAT1/NEAT1 : lncRNAs — nuclear architecture, not T cell state
# ^HIST : replication-dependent histones — S-phase markers (cell cycle)
# NOTE: ^HIST added based on pre-run QC showing 20+ histone genes
# in shared HVG set, all driven by cycling subpopulation
junk_pattern <- paste0(
"^RPL|^RPS|", # ribosomal large + small subunits
"^MT-|", # mitochondrial genome (OXPHOS complexes I,III,IV,V)
"^HSP|^HSPA|^HSPB|", # heat shock proteins
"^HSPD|^HSPE|^HSPH|", # heat shock proteins (continued)
"^SNHG|", # small nucleolar RNA host genes
"^MALAT1|^NEAT1|", # abundant nuclear lncRNAs
"^XIST|", # X-inactivation (sex-specific batch)
"^HIST" # replication-dependent histones (S-phase signal)
)
# Report breakdown by category before removing
mt_removed <- sum(grepl("^MT-", common_features))
ribo_removed <- sum(grepl("^RPL|^RPS", common_features))
hsp_removed <- sum(grepl("^HSP", common_features))
lnc_removed <- sum(grepl("^SNHG|^MALAT1|^NEAT1|^XIST", common_features))
hist_removed <- sum(grepl("^HIST", common_features))
cat("\n=== Junk gene removal breakdown ===\n")
cat(sprintf(" Mitochondrial (OXPHOS) : %d\n", mt_removed))
cat(sprintf(" Ribosomal (RPL/RPS) : %d\n", ribo_removed))
cat(sprintf(" Heat shock (HSP*) : %d\n", hsp_removed))
cat(sprintf(" lncRNA (SNHG/MALAT1/NEAT1): %d\n", lnc_removed))
cat(sprintf(" Histones (HIST*) : %d\n", hist_removed))
n_before <- length(common_features)
common_features_clean <- common_features[!grepl(junk_pattern, common_features)]
n_removed <- n_before - length(common_features_clean)
cat(sprintf(" ─────────────────────────────\n"))
cat(sprintf(" Total removed : %d (%.1f%%)\n",
n_removed, 100 * n_removed / n_before))
cat(sprintf(" Clean features remaining : %d\n\n",
length(common_features_clean)))
if (length(common_features_clean) < 1200)
warning("⚠️ Fewer than 1200 clean features — anchor quality may be reduced.")
# ── Step 3: Rank by joint residual variance ───────────────────────────────
# Genes that are highly variable in BOTH the healthy reference AND the
# malignant query are the most informative for finding true biological
# nearest neighbours. Ranking by joint variance ensures differentiation
# genes (CCR7, TCF7, GZMK, FOXP3) rank above line-specific noise.
ref_var_meta <- reference_integrated[["SCT"]]@meta.features
query_var_meta <- MalignantCD4T[["SCT"]]@meta.features
if ("residual_variance" %in% colnames(ref_var_meta) &&
"residual_variance" %in% colnames(query_var_meta)) {
ref_rv <- ref_var_meta[common_features_clean, "residual_variance"]
query_rv <- query_var_meta[common_features_clean, "residual_variance"]
ref_rv[is.na(ref_rv)] <- 0
query_rv[is.na(query_rv)] <- 0
mean_rv <- (ref_rv + query_rv) / 2
common_features <- common_features_clean[order(mean_rv, decreasing = TRUE)]
cat("Top 20 features by joint residual variance:\n")
print(head(common_features, 20))
cat("\n")
} else {
cat("⚠️ residual_variance not in SCT meta.features — using unranked clean list\n")
cat(" This is OK but ranking would improve anchor quality.\n")
common_features <- common_features_clean
}
# ── Step 4: Canonical T cell marker check ─────────────────────────────────
# These markers span the full differentiation axis and Treg branch.
# If fewer than 5 are present, the feature set cannot distinguish
# differentiation states and anchor quality will be poor.
markers_check <- c(
"CCR7", "SELL", "TCF7", "IL7R", # Naive/TCM
"GZMB", "GZMK", "PRF1", "EOMES", # Effector/TEM
"FOXP3", "IKZF2", # Treg
"TOX", "PDCD1", "LAG3", # Exhaustion
"CD44", "TBX21" # Activation
)
found <- markers_check[markers_check %in% common_features]
not_found <- markers_check[!markers_check %in% common_features]
cat("=== Canonical T cell marker check ===\n")
cat("Present (", length(found), "):", paste(found, collapse = ", "), "\n")
cat("Absent (", length(not_found),"):", paste(not_found, collapse = ", "), "\n")
if (length(found) < 5) {
warning("⚠️ Fewer than 5 canonical markers present — anchor quality at risk.")
} else if (length(found) < 8) {
cat("⚠️ Some markers absent — likely heterogeneous across lines (expected)\n")
cat(" Projection will still work but interpret per-line results carefully\n")
} else {
cat("✅ Good marker coverage — feature set captures differentiation axis\n")
}
# ── Step 5: FindTransferAnchors ───────────────────────────────────────────
# k.anchor = 10 : neighbours used to find anchors — balanced for ~11k ref
# k.filter = 500 : neighbours for anchor filtering — removes weak anchors
# k.score = 30 : neighbours for anchor scoring — weights anchor quality
# dims = 1:30: use all 30 PCA dims — captures full differentiation space
dims_to_use <- min(30,
ncol(Embeddings(reference_integrated, "pca")),
ncol(Embeddings(MalignantCD4T, "pca")))
cat("\nFinding anchors with dims 1:", dims_to_use, "\n")
cat("Features used:", length(common_features), "\n\n")
anchors <- FindTransferAnchors(
reference = reference_integrated,
query = MalignantCD4T,
features = common_features,
normalization.method = "SCT",
reference.reduction = "pca",
dims = 1:dims_to_use,
k.anchor = 10,
k.filter = 500,
k.score = 30,
verbose = TRUE
)
# ── Step 6: Anchor quality check ──────────────────────────────────────────
# NOTE: In Seurat FindTransferAnchors internal convention:
# cell1 = reference indices
# cell2 = query indices (OPPOSITE to what variable names suggest)
# Always use cell1→reference and cell2→query for barcode mapping.
anchor_df <- as.data.frame(slot(anchors, "anchors"))
# CORRECT barcode mapping — cell1=reference, cell2=query
ref_barcodes <- colnames(reference_integrated)
query_barcodes <- colnames(MalignantCD4T)
anchor_df$ref_barcode <- ref_barcodes[anchor_df$cell1]
anchor_df$query_barcode <- query_barcodes[anchor_df$cell2]
anchor_df$query_cellline <- MalignantCD4T$cell_line[anchor_df$query_barcode]
anchor_df$ref_label <- reference_integrated$predicted.celltype.l2[anchor_df$ref_barcode]
# Verify no NAs
cat("\n=== Barcode mapping validation ===\n")
cat("NA ref barcodes :", sum(is.na(anchor_df$ref_barcode)), "\n")
cat("NA query barcodes:", sum(is.na(anchor_df$query_barcode)), "\n")
cat("NA cell lines :", sum(is.na(anchor_df$query_cellline)), "\n")
n_anchors <- nrow(anchor_df)
anchor_ratio <- ncol(MalignantCD4T) / n_anchors
anchor_scores <- anchor_df$score
cat("\n=== Anchor quality summary ===\n")
cat("Anchors found :", n_anchors, "\n")
cat("Malignant cells :", ncol(MalignantCD4T), "\n")
cat("Reference cells :", ncol(reference_integrated), "\n")
cat(sprintf("Cells per anchor : %.1f : 1 (ideal ≤ 8:1)\n", anchor_ratio))
cat(sprintf("Anchor scores : mean=%.3f | median=%.3f | min=%.3f | max=%.3f\n",
mean(anchor_scores), median(anchor_scores),
min(anchor_scores), max(anchor_scores)))
cat("\n=== Anchors per cell line ===\n")
print(table(anchor_df$query_cellline, useNA = "ifany"))
cat("\n=== Anchors by reference label ===\n")
anchor_df %>%
group_by(ref_label) %>%
summarise(n_anchors = n(),
mean_score = round(mean(score), 3),
.groups = "drop") %>%
arrange(desc(n_anchors)) %>%
print()
if (anchor_ratio > 8) {
warning(paste0(
"⚠️ Low anchor density (", round(anchor_ratio, 1), ":1).\n",
" 1. Junk genes still dominating PCA — check top 20 features above\n",
" 2. Sézary cells too transcriptionally distant from reference\n",
" 3. Increase k.anchor to 15 and rerun"
))
} else if (anchor_ratio > 5) {
cat(sprintf("⚠️ Moderate anchor density (%.1f:1) — acceptable\n", anchor_ratio))
} else {
cat(sprintf("✅ Good anchor density (%.1f:1) — projection quality should be high\n",
anchor_ratio))
}# ── ANCHOR DIAGNOSTIC: Biology vs Artefact ────────────────────────────────
# If L3-L7 are truly dedifferentiated, their PCA space should be
# DISTANT from the reference. If it is a technical issue, they will
# sit in the same PCA space as L1/L2 but just not form mutual NN.
cat("=== PCA space comparison: L1/L2 vs L3-L7 ===\n")
# Get reference PCA centroid (TCM region)
ref_pca <- Embeddings(reference_integrated, "pca")[, 1:10]
ref_centroid <- colMeans(ref_pca)
# Get query PCA embeddings per line
query_pca <- Embeddings(MalignantCD4T, "pca")[, 1:10]
# Distance from each query cell to reference centroid
dist_to_ref <- rowSums(
sweep(query_pca, 2, ref_centroid, "-")^2
)
# Compare distances by cell line
dist_df <- data.frame(
cell = colnames(MalignantCD4T),
cell_line = MalignantCD4T$cell_line,
dist_to_ref = dist_to_ref
)
dist_summary <- dist_df %>%
group_by(cell_line) %>%
summarise(
mean_dist = round(mean(dist_to_ref), 2),
median_dist = round(median(dist_to_ref), 2),
.groups = "drop"
) %>%
arrange(mean_dist)
cat("\nDistance from reference PCA centroid per cell line:\n")
cat("(Lower = more similar to healthy reference)\n\n")
print(dist_summary)
# Check HVG overlap per line with reference
cat("\n=== HVG overlap with reference per line ===\n")
ref_hvgs_check <- VariableFeatures(reference_integrated, assay = "SCT")
for (ln in paste0("L", 1:7)) {
line_cells <- colnames(MalignantCD4T)[MalignantCD4T$cell_line == ln]
# Check expression of key T cell markers in each line
markers <- c("CCR7", "TCF7", "SELL", "IL7R", "GZMB", "TOX", "KIR3DL2")
markers_present <- intersect(markers, rownames(MalignantCD4T))
expr <- rowMeans(
GetAssayData(MalignantCD4T, assay = "SCT", layer = "data")[
markers_present, line_cells, drop = FALSE]
)
cat(sprintf("\n%s — key marker expression:\n", ln))
print(round(expr, 3))
}# Show TOP reference states each cell line connects to
cat("\n=== WHAT EACH CELL LINE ANCHORS TO ===\n")
anchor_df %>%
group_by(query_cellline, ref_label) %>%
summarise(
n_anchors = n(),
mean_score = round(mean(score), 3),
.groups = "drop"
) %>%
arrange(query_cellline, desc(n_anchors)) %>%
print(n = 30) # Show all combinationscat("=== MapQuery projection ===\n")
# MapQuery simultaneously:
# 1. Transfers labels + pseudotime from reference to query (TransferData)
# 2. Projects query PCA into reference PCA space (IntegrateEmbeddings)
# 3. Projects query onto frozen reference UMAP (ProjectUMAP)
# Malignant cells do NOT alter the reference — they are read-only passengers.
mapped_MalignantCD4T <- MapQuery(
anchorset = anchors,
query = MalignantCD4T,
reference = reference_integrated,
refdata = list(
predicted.celltype.l2 = "predicted.celltype.l2", # PRIMARY: state label
pseudotime = "monocle3_pseudotime" # CONTINUOUS: position
),
reference.reduction = "pca",
reduction.model = "umap"
)
cat("✅ MapQuery complete\n")
cat("Mapped cells:", ncol(mapped_MalignantCD4T), "\n")
# Coerce pseudotime to numeric — Seurat TransferData returns character
mapped_MalignantCD4T$predicted.pseudotime <- as.numeric(
mapped_MalignantCD4T$predicted.pseudotime
)
cat("\nTransferred pseudotime summary:\n")
print(summary(mapped_MalignantCD4T$predicted.pseudotime))
cat("\nTransferred Azimuth l2 distribution:\n")
print(table(mapped_MalignantCD4T$predicted.predicted.celltype.l2))
# FIXED Diversity check: Handle NA predictions properly
cat("\n=== Diversity check ===\n")
l2_table <- table(mapped_MalignantCD4T$predicted.predicted.celltype.l2, useNA = "always")
l2_dist <- prop.table(l2_table)
cat("Label distribution:\n")
print(l2_table)
na_count <- l2_table["<NA>"]
na_pct <- ifelse(is.na(na_count), 0, round(100 * na_pct, 1))
# Non-NA predictions only
non_na_table <- l2_table[!names(l2_table) %in% "<NA>"]
if (length(non_na_table) > 0) {
non_na_dist <- prop.table(non_na_table)
max_state <- names(which.max(non_na_dist))
max_pct <- round(100 * max(non_na_dist), 1)
cat(sprintf("\nNA fraction: %.1f%%\n", na_pct))
cat(sprintf("Dominant state: %s (%.1f%% of non-NA)\n", max_state, max_pct))
if (max_pct > 90) {
warning(sprintf("⚠️ %s dominates (%.1f%%) — check SCT/feature filtering", max_state, max_pct))
} else {
cat("✅ Diverse mapping across states\n")
}
} else {
cat("⚠️ No confident predictions made\n")
}
# Sézary-specific diversity check
dominant_pct <- round(100 * max(prop.table(non_na_table)), 1)
if (dominant_pct > 95) {
cat(sprintf("✅ TCM-dominant (%.1f%%) = CLASSIC SÉZARY PHENOTYPE\n", dominant_pct))
cat("Minor subclones (TEM/Treg/Naive) = expected heterogeneity\n")
} else {
cat(sprintf("✅ Diverse mapping: %s = %.1f%%\n", max_state, dominant_pct))
}
# 1. Per-cell-line breakdown (your thesis gold)
table(mapped_MalignantCD4T$cell_line, mapped_MalignantCD4T$predicted.predicted.celltype.l2)
# 2. Plot on trajectory UMAP
DimPlot(mapped_MalignantCD4T, group.by = "predicted.predicted.celltype.l2", reduction = "ref.umap")
# 3. Pseudotime violin by cell line
VlnPlot(mapped_MalignantCD4T, features = "predicted.pseudotime", group.by = "cell_line", pt.size = 0)# Build data frames for ggplot — rename UMAP columns consistently
ref_umap <- data.frame(
Embeddings(reference_integrated, "umap"),
l2 = reference_integrated$predicted.celltype.l2,
pseudotime = reference_integrated$monocle3_pseudotime,
type = "Reference"
)
colnames(ref_umap)[1:2] <- c("UMAP_1", "UMAP_2")
query_umap <- data.frame(
Embeddings(mapped_MalignantCD4T, "ref.umap"),
pseudotime = mapped_MalignantCD4T$predicted.pseudotime,
cell_line = mapped_MalignantCD4T$cell_line,
l2_q = mapped_MalignantCD4T$predicted.predicted.celltype.l2,
type = "Malignant"
)
colnames(query_umap)[1:2] <- c("UMAP_1", "UMAP_2")
cell_line_palette <- colorRampPalette(brewer.pal(8, "Dark2"))(
length(unique(query_umap$cell_line)))
names(cell_line_palette) <- sort(unique(query_umap$cell_line))
all_l2_labels <- unique(c(as.character(ref_umap$l2),
as.character(query_umap$l2_q)))
azimuth_l2_colors <- extend_palette(azimuth_l2_colors, all_l2_labels)
# Panel 1: malignant coloured by transferred pseudotime
p_pt <- ggplot() +
geom_point(data = ref_umap,
aes(x = UMAP_1, y = UMAP_2),
color = reference_color, size = 0.4, alpha = 0.6) +
geom_point(data = query_umap %>% filter(is.finite(pseudotime)),
aes(x = UMAP_1, y = UMAP_2, color = pseudotime),
size = 1.2, alpha = 0.9) +
scale_color_viridis_c(option = "plasma", name = "Pseudotime",
na.value = "grey40") +
theme_classic(base_size = 11) +
theme(plot.title = element_text(hjust = 0.5, face = "bold")) +
ggtitle("Sézary Cells Projected onto Reference\n(Transferred pseudotime)")
# Panel 2: reference coloured by state, malignant in red
p_state <- ggplot() +
geom_point(data = ref_umap,
aes(x = UMAP_1, y = UMAP_2, color = l2),
size = 0.4, alpha = 0.7) +
scale_color_manual(values = azimuth_l2_colors,
name = "Azimuth l2\n(reference)", na.value = "grey60") +
geom_point(data = query_umap,
aes(x = UMAP_1, y = UMAP_2),
color = malignant_color, size = 1.0, alpha = 0.8) +
theme_classic(base_size = 11) +
theme(plot.title = element_text(hjust = 0.5, face = "bold"),
legend.key.size = unit(0.4, "cm"),
legend.text = element_text(size = 8)) +
ggtitle("Reference States (colour)\n+ Sézary Cells (red overlay)")
p_pt | p_statep_l2 <- ggplot() +
geom_point(data = ref_umap,
aes(x = UMAP_1, y = UMAP_2),
color = reference_color, size = 0.4, alpha = 0.6) +
geom_point(data = query_umap,
aes(x = UMAP_1, y = UMAP_2, color = l2_q),
size = 1.2, alpha = 0.9) +
scale_color_manual(values = azimuth_l2_colors,
name = "Azimuth l2\n(transferred)", na.value = "grey40") +
theme_classic(base_size = 11) +
theme(plot.title = element_text(hjust = 0.5, face = "bold")) +
ggtitle("Sézary Cells — Transferred Azimuth l2 Labels")
p_lines <- ggplot() +
geom_point(data = ref_umap,
aes(x = UMAP_1, y = UMAP_2),
color = reference_color, size = 0.4, alpha = 0.5) +
geom_point(data = query_umap,
aes(x = UMAP_1, y = UMAP_2, color = cell_line),
size = 1.0, alpha = 0.9) +
scale_color_manual(values = cell_line_palette, name = "Cell line") +
theme_classic(base_size = 11) +
theme(plot.title = element_text(hjust = 0.5, face = "bold")) +
ggtitle("Sézary Cells per Cell Line")
p_l2 | p_linesggplot() +
geom_point(data = ref_umap,
aes(x = UMAP_1, y = UMAP_2),
color = "grey60", size = 0.3, alpha = 0.5) +
geom_point(data = query_umap %>% filter(is.finite(pseudotime)),
aes(x = UMAP_1, y = UMAP_2, color = pseudotime),
size = 0.9, alpha = 0.85) +
facet_wrap(~ cell_line, ncol = 4) +
scale_color_viridis_c(option = "plasma", name = "Pseudotime",
na.value = "grey60") +
theme_classic(base_size = 10) +
theme(strip.text = element_text(size = 9, face = "bold"),
strip.background = element_rect(fill = "#E8F4FD"),
plot.title = element_text(hjust = 0.5, face = "bold")) +
ggtitle("Per Cell Line — Projection onto Reference UMAP (Pseudotime)")# ══════════════════════════════════════════════════════════════════════════
# PSEUDOTIME BIN ASSIGNMENT — STATE-ANCHORED BOUNDARIES
# ══════════════════════════════════════════════════════════════════════════
#
# Bin boundaries = midpoint between adjacent reference state medians.
# This is biologically principled: the boundary between TCM and TEM bins
# is exactly halfway between the median TCM pseudotime and median TEM
# pseudotime in the healthy reference — NOT an arbitrary 33rd/66th
# percentile cut.
#
# TWO LINEAGES modelled:
# Effector axis (pseudotime-based):
# Naive-like → below midpoint(Naive, TCM)
# TCM-like → between midpoint(Naive,TCM) and midpoint(TCM,TEM)
# TEM-like → between midpoint(TCM,TEM) and midpoint(TEM,Temra)
# Temra-like → above midpoint(TEM,Temra)
#
# Regulatory branch (label-based):
# Treg-like → transferred Azimuth label == "Treg" at ANY pseudotime
# RATIONALE: Treg is a branch from TCM, not a linear position on the
# effector axis. Pseudotime for Treg cells is lower than TEM by design
# (correct topology), but using pseudotime alone would misassign some
# Treg cells to TCM-like. Label + branch logic is more accurate.
# IMPORTANT: This means Treg-like bin % will equal Treg label % exactly.
# The label vs pseudotime bin comparison is only meaningful for the
# effector axis (Naive/TCM/TEM/Temra).
# ── Step 1: Reference state median pseudotimes ────────────────────────────
# These were already computed in pseudotime-root chunk (naive_med, tcm_med
# etc.) but we recompute here for clarity and chunk independence.
pt_state_medians <- reference_integrated@meta.data %>%
filter(is.finite(monocle3_pseudotime)) %>%
group_by(predicted.celltype.l2) %>%
summarise(
n = n(),
med_pt = round(median(monocle3_pseudotime, na.rm = TRUE), 3),
.groups = "drop"
) %>%
arrange(med_pt)
cat("=== Reference state median pseudotimes ===\n")
print(pt_state_medians)
# Extract medians (overwrite with fresh values from this chunk)
naive_med <- pt_state_medians$med_pt[
pt_state_medians$predicted.celltype.l2 == "CD4 Naive"]
tcm_med <- pt_state_medians$med_pt[
pt_state_medians$predicted.celltype.l2 == "CD4 TCM"]
treg_med <- pt_state_medians$med_pt[
pt_state_medians$predicted.celltype.l2 == "Treg"]
tem_med <- pt_state_medians$med_pt[
pt_state_medians$predicted.celltype.l2 == "CD4 TEM"]
temra_med <- pt_state_medians$med_pt[
pt_state_medians$predicted.celltype.l2 == "CD4 Temra/CTL"]
# ── Step 2: Bin boundaries as midpoints between state medians ─────────────
naive_tcm_cut <- (naive_med + tcm_med) / 2
tcm_tem_cut <- (tcm_med + tem_med) / 2
tem_temra_cut <- (tem_med + temra_med) / 2
tcm_treg_cut <- (tcm_med + treg_med) / 2 # stored for reference only
cat("\n=== Biology-anchored bin boundaries ===\n")
cat(sprintf(" Naive | TCM : %.3f\n", naive_tcm_cut))
cat(sprintf(" TCM | TEM : %.3f (main effector axis)\n", tcm_tem_cut))
cat(sprintf(" TEM | Temra: %.3f\n", tem_temra_cut))
cat(sprintf(" TCM | Treg : %.3f (regulatory branch — label-based, not used as cut)\n",
tcm_treg_cut))
# ── Step 3: Assign working columns ────────────────────────────────────────
mapped_MalignantCD4T$state_azimuth_l2 <-
mapped_MalignantCD4T$predicted.predicted.celltype.l2
mapped_MalignantCD4T$pseudotime_value <-
as.numeric(mapped_MalignantCD4T$predicted.pseudotime)
# ── Step 4: Assign pseudotime bins ────────────────────────────────────────
mapped_MalignantCD4T$pseudotime_bin <- case_when(
# Treg lineage — label takes priority over pseudotime position
# because Treg is a branch, not a point on the linear effector axis
mapped_MalignantCD4T$state_azimuth_l2 == "Treg" ~ "Treg-like",
# Effector axis bins — pseudotime-based
mapped_MalignantCD4T$pseudotime_value < naive_tcm_cut ~ "Naive-like",
mapped_MalignantCD4T$pseudotime_value >= naive_tcm_cut &
mapped_MalignantCD4T$pseudotime_value < tcm_tem_cut ~ "TCM-like",
mapped_MalignantCD4T$pseudotime_value >= tcm_tem_cut &
mapped_MalignantCD4T$pseudotime_value < tem_temra_cut ~ "TEM-like",
mapped_MalignantCD4T$pseudotime_value >= tem_temra_cut ~ "Temra-like",
TRUE ~ NA_character_
)
# Set factor levels in biological trajectory order
mapped_MalignantCD4T$pseudotime_bin <- factor(
mapped_MalignantCD4T$pseudotime_bin,
levels = c("Naive-like", "TCM-like", "Treg-like", "TEM-like", "Temra-like")
)
# ── Step 5: Treg count check ──────────────────────────────────────────────
# FIX: added as identified in expert review. If <50 Treg-labelled cells,
# the Treg-like bin is negligible and should be flagged.
n_treg <- sum(mapped_MalignantCD4T$state_azimuth_l2 == "Treg", na.rm = TRUE)
cat(sprintf("\nTreg-labelled malignant cells: %d (%.2f%% of total)\n",
n_treg,
100 * n_treg / ncol(mapped_MalignantCD4T)))
if (n_treg == 0) {
message("ℹ️ No Treg-labelled cells — Treg-like bin will be empty.\n",
" This is biologically plausible: Sézary cells rarely map to Treg.\n",
" The Treg bin column is retained for completeness.")
} else if (n_treg < 50) {
message(sprintf(
"⚠️ Only %d Treg-labelled cells (%.2f%%).\n", n_treg,
100 * n_treg / ncol(mapped_MalignantCD4T)),
" Treg-like bin exists but is very small — interpret with caution.\n",
" Consider noting this as 'rare/absent Treg identity' in results.")
} else {
cat(sprintf("✅ Treg-like bin has %d cells — sufficient for reporting.\n", n_treg))
}
cat("\nNote: Treg-like bin % equals Treg label % by design (label-based assignment).\n")
cat("Label vs pseudotime bin discordance is only interpretable for the\n")
cat("effector axis: Naive-like / TCM-like / TEM-like / Temra-like.\n")
# ── Step 6: Results ───────────────────────────────────────────────────────
cat("\n=== State assignment complete ===\n")
cat("\nAzimuth l2 label (cell of origin — WHAT the cell resembles):\n")
print(table(mapped_MalignantCD4T$state_azimuth_l2))
cat("\nPseudotime bin (differentiation position — WHERE the cell sits):\n")
print(table(mapped_MalignantCD4T$pseudotime_bin))
# ── KEY RESULT: cross-table ───────────────────────────────────────────────
cat("\n═══════════════════════════════════════════════════════════\n")
cat("KEY RESULT: Label (cell of origin) vs Pseudotime Bin\n")
cat("═══════════════════════════════════════════════════════════\n")
cat("TCM-labelled cells in TEM/Temra bins = effector skewing\n")
cat("beyond the TCM arrest point\n\n")
# FIX: explicit column naming from table() output — avoids rename() failure
# when Var1/Var2 column names differ from expected Label/PT_Bin
cross_tab <- table(
Label = mapped_MalignantCD4T$state_azimuth_l2,
PT_Bin = mapped_MalignantCD4T$pseudotime_bin
)
print(cross_tab)
cat("\nNote: Treg row will show 100% Treg-like — this is by design.\n")# ── Overall state distribution ─────────────────────────────────────────────
state_summary <- mapped_MalignantCD4T@meta.data %>%
count(state_azimuth_l2, name = "n_cells") %>%
mutate(pct = round(100 * n_cells / sum(n_cells), 1)) %>%
arrange(desc(n_cells))
cat("=== Overall Azimuth l2 State Distribution ===\n")
print(state_summary)
# ── Per cell line ─────────────────────────────────────────────────────────
state_per_line <- mapped_MalignantCD4T@meta.data %>%
count(cell_line, state_azimuth_l2, name = "n_cells") %>%
group_by(cell_line) %>%
mutate(pct = round(100 * n_cells / sum(n_cells), 1)) %>%
ungroup() %>%
arrange(cell_line, desc(n_cells))
cat("\n=== State Distribution per Cell Line ===\n")
print(state_per_line, n = Inf)
# ── Pseudotime bin distribution ───────────────────────────────────────────
pt_bin_summary <- mapped_MalignantCD4T@meta.data %>%
filter(!is.na(pseudotime_bin)) %>%
count(pseudotime_bin, name = "n_cells") %>%
mutate(pct = round(100 * n_cells / sum(n_cells), 1))
cat("\n=== Pseudotime Bin Summary ===\n")
print(pt_bin_summary)
pt_bin_per_line <- mapped_MalignantCD4T@meta.data %>%
filter(!is.na(pseudotime_bin)) %>%
count(cell_line, pseudotime_bin, name = "n_cells") %>%
group_by(cell_line) %>%
mutate(pct = round(100 * n_cells / sum(n_cells), 1)) %>%
ungroup()
cat("\n=== Pseudotime Bin per Cell Line ===\n")
print(pt_bin_per_line, n = Inf)# Top panel: label transfer (cell of origin)
p_bar_labels <- ggplot(state_per_line,
aes(x = cell_line, y = pct,
fill = state_azimuth_l2)) +
geom_col(position = "stack", color = "white", linewidth = 0.3) +
geom_text(aes(label = ifelse(pct >= 5, paste0(pct, "%"), "")),
position = position_stack(vjust = 0.5),
size = 3, color = "white", fontface = "bold") +
scale_fill_manual(values = azimuth_l2_colors, na.value = "grey70",
name = "Azimuth l2\nstate") +
theme_classic(base_size = 11) +
theme(axis.text.x = element_text(angle = 40, hjust = 1),
plot.title = element_text(hjust = 0.5, face = "bold")) +
labs(title = "Sézary Cell State Composition per Cell Line",
subtitle = "Label transfer: cell of origin (which normal state does this cell resemble?)",
x = "Cell Line", y = "% Cells")
# Bottom panel: pseudotime bins (differentiation position)
p_bar_bins <- ggplot(pt_bin_per_line,
aes(x = cell_line, y = pct,
fill = pseudotime_bin)) +
geom_col(position = "stack", color = "white", linewidth = 0.3) +
geom_text(aes(label = ifelse(pct >= 5, paste0(pct, "%"), "")),
position = position_stack(vjust = 0.5),
size = 3, color = "white", fontface = "bold") +
scale_fill_manual(values = bin_colors, name = "Pseudotime\nBin",
drop = FALSE) +
theme_classic(base_size = 11) +
theme(axis.text.x = element_text(angle = 40, hjust = 1),
plot.title = element_text(hjust = 0.5, face = "bold")) +
labs(title = "Sézary Cell Pseudotime Bin Distribution per Cell Line",
subtitle = paste0(
"Differentiation position — state-anchored boundaries: ",
"Naive|TCM=", round(naive_tcm_cut, 2),
" | TCM|TEM=", round(tcm_tem_cut, 2),
" | TEM|Temra=", round(tem_temra_cut, 2),
" | Treg=label-based"),
x = "Cell Line", y = "% Cells")
p_bar_labels / p_bar_bins +
plot_annotation(
title = "Cell of Origin (Label Transfer) vs Differentiation Position (Pseudotime Bins)\nSézary Syndrome — CD4+ T Cell Trajectory",
theme = theme(plot.title = element_text(hjust = 0.5, size = 13,
face = "bold"))
)# Heatmap 1: Azimuth l2 label proportions per cell line
heatmap_df <- state_per_line %>%
select(cell_line, state_azimuth_l2, pct) %>%
pivot_wider(names_from = state_azimuth_l2,
values_from = pct, values_fill = 0)
heatmap_mat <- as.matrix(heatmap_df[, -1])
rownames(heatmap_mat) <- heatmap_df$cell_line
pheatmap(
heatmap_mat,
color = colorRampPalette(c("white", "#FEE090", "#D73027"))(50),
display_numbers = TRUE, number_format = "%.1f%%",
fontsize_number = 8, fontsize_row = 10, fontsize_col = 9,
angle_col = 45, cluster_rows = TRUE, cluster_cols = TRUE,
main = "% Sézary Cells per Azimuth l2 State\n(Label transfer — cell of origin)",
border_color = "white"
)
# Heatmap 2: pseudotime bin proportions — biological order preserved
bin_heatmap_df <- pt_bin_per_line %>%
select(cell_line, pseudotime_bin, pct) %>%
pivot_wider(names_from = pseudotime_bin,
values_from = pct, values_fill = 0)
# FIX: use any_of() to safely select only columns that exist
# Original used bare select() which errors if a bin column is missing
all_bin_levels <- c("Naive-like", "TCM-like", "Treg-like",
"TEM-like", "Temra-like")
for (b in all_bin_levels) {
if (!b %in% colnames(bin_heatmap_df)) bin_heatmap_df[[b]] <- 0
}
bin_heatmap_df <- bin_heatmap_df %>%
select(cell_line, any_of(all_bin_levels))
bin_mat <- as.matrix(bin_heatmap_df[, -1])
rownames(bin_mat) <- bin_heatmap_df$cell_line
pheatmap(
bin_mat,
color = colorRampPalette(c("white", "#74ADD1", "#D73027"))(50),
display_numbers = TRUE, number_format = "%.1f%%",
fontsize_number = 8, fontsize_row = 10, fontsize_col = 9,
angle_col = 45,
cluster_rows = TRUE,
cluster_cols = FALSE, # preserve biological order of bins
main = "% Sézary Cells per Pseudotime Bin\n(State-anchored boundaries — differentiation position)",
border_color = "white"
)ref_pt <- data.frame(
pseudotime = reference_integrated$monocle3_pseudotime[
is.finite(reference_integrated$monocle3_pseudotime)],
source = "Healthy reference"
)
mal_pt <- data.frame(
pseudotime = mapped_MalignantCD4T$pseudotime_value[
is.finite(mapped_MalignantCD4T$pseudotime_value)],
source = "Sézary cells"
)
pt_combined <- bind_rows(ref_pt, mal_pt)
# Reference state median lines — visual anchors showing where each
# healthy state sits on the pseudotime axis
state_vlines <- data.frame(
state = c("Naive", "TCM", "Treg", "TEM", "Temra"),
med = c(naive_med, tcm_med, treg_med, tem_med, temra_med),
col = c("#2166AC", "#74ADD1","#762A83", "#FEE090","#D73027")
)
p_density <- ggplot(pt_combined, aes(x = pseudotime, fill = source)) +
geom_density(alpha = 0.65, adjust = 1.2) +
geom_vline(data = state_vlines,
aes(xintercept = med, color = state),
linetype = "dashed", linewidth = 0.8) +
scale_color_manual(
values = setNames(state_vlines$col, state_vlines$state),
name = "Reference state\nmedian") +
scale_fill_manual(
values = c("Healthy reference" = "#4575B4",
"Sézary cells" = malignant_color),
name = "") +
theme_classic(base_size = 12) +
theme(legend.position = "right",
plot.title = element_text(hjust = 0.5, face = "bold")) +
labs(title = "Pseudotime Distribution: Healthy Reference vs Sézary Cells",
subtitle = "Peak shift indicates differentiation arrest point | dashed = reference state medians",
x = "Monocle3 Pseudotime", y = "Density")
p_ridge <- mapped_MalignantCD4T@meta.data %>%
filter(is.finite(pseudotime_value)) %>%
ggplot(aes(x = pseudotime_value, y = cell_line, fill = cell_line)) +
geom_density_ridges(alpha = 0.80, scale = 1.3, rel_min_height = 0.01,
quantile_lines = TRUE, quantiles = 2) +
geom_vline(xintercept = c(naive_med, tcm_med, treg_med,
tem_med, temra_med),
linetype = "dashed", color = "grey40", linewidth = 0.5) +
scale_fill_manual(values = cell_line_palette) +
theme_classic(base_size = 11) +
theme(legend.position = "none",
plot.title = element_text(hjust = 0.5, face = "bold")) +
labs(title = "Pseudotime per Cell Line",
subtitle = "Vertical line = median | Dashed = reference state medians",
x = "Pseudotime", y = "")
p_density | p_ridge# FIX: pct columns computed only for non-Treg cells on the effector axis.
# Original script counted all cells including Treg which inflates pct_tcm
# (Treg cells have TCM-range pseudotime and would be counted as TCM-like).
# Now we separate Treg cells and compute effector-axis stats independently.
pt_stats <- mapped_MalignantCD4T@meta.data %>%
filter(is.finite(pseudotime_value)) %>%
group_by(cell_line) %>%
summarise(
n_cells = n(),
n_treg = sum(state_azimuth_l2 == "Treg", na.rm = TRUE),
n_effector = sum(state_azimuth_l2 != "Treg", na.rm = TRUE),
mean_pt = round(mean(pseudotime_value), 3),
median_pt = round(median(pseudotime_value), 3),
sd_pt = round(sd(pseudotime_value), 3),
q25_pt = round(quantile(pseudotime_value, 0.25), 3),
q75_pt = round(quantile(pseudotime_value, 0.75), 3),
# Effector axis pct: computed only on non-Treg cells
pct_naive = round(100 * sum(
state_azimuth_l2 != "Treg" &
pseudotime_value < naive_tcm_cut,
na.rm = TRUE) / n_effector, 1),
pct_tcm = round(100 * sum(
state_azimuth_l2 != "Treg" &
pseudotime_value >= naive_tcm_cut &
pseudotime_value < tcm_tem_cut,
na.rm = TRUE) / n_effector, 1),
pct_tem = round(100 * sum(
state_azimuth_l2 != "Treg" &
pseudotime_value >= tcm_tem_cut &
pseudotime_value < tem_temra_cut,
na.rm = TRUE) / n_effector, 1),
pct_temra = round(100 * sum(
state_azimuth_l2 != "Treg" &
pseudotime_value >= tem_temra_cut,
na.rm = TRUE) / n_effector, 1),
pct_treg = round(100 * n_treg / n_cells, 1),
.groups = "drop"
) %>%
arrange(median_pt)
cat("=== Pseudotime Statistics per Cell Line ===\n")
cat("(pct_naive/tcm/tem/temra computed on effector-axis cells only,\n")
cat(" pct_treg = Treg-labelled cells as % of all cells)\n\n")
print(pt_stats, n = Inf)
cat("\n=== Reference Pseudotime per Azimuth l2 State ===\n")
ref_stats <- reference_integrated@meta.data %>%
filter(is.finite(monocle3_pseudotime), !is.na(predicted.celltype.l2)) %>%
group_by(predicted.celltype.l2) %>%
summarise(
n_cells = n(),
mean_pt = round(mean(monocle3_pseudotime), 3),
median_pt = round(median(monocle3_pseudotime), 3),
.groups = "drop"
) %>%
arrange(median_pt)
print(ref_stats)p_ref <- DimPlot(
reference_integrated, group.by = "predicted.celltype.l2",
reduction = "umap", label = TRUE, repel = TRUE,
label.size = 3, pt.size = 0.4
) +
scale_color_manual(values = azimuth_l2_colors, na.value = "grey40") +
ggtitle("A. Healthy Reference CD4+ T Cells\n(Azimuth l2)") +
theme(plot.title = element_text(hjust = 0.5, size = 11, face = "bold")) +
NoLegend()
p_pt_ref <- FeaturePlot(
reference_integrated, features = "monocle3_pseudotime",
reduction = "umap", cols = c("lightblue", "yellow", "red"),
pt.size = 0.4
) +
ggtitle("B. Reference Pseudotime\n(Naive → Temra + Treg branch)") +
theme(plot.title = element_text(hjust = 0.5, size = 11, face = "bold"))
p_proj <- ggplot() +
geom_point(data = ref_umap,
aes(x = UMAP_1, y = UMAP_2),
color = "grey88", size = 0.3, alpha = 0.6) +
geom_point(data = query_umap %>% filter(is.finite(pseudotime)),
aes(x = UMAP_1, y = UMAP_2, color = pseudotime),
size = 1.0, alpha = 0.85) +
scale_color_viridis_c(option = "plasma", name = "Pseudotime") +
theme_classic(base_size = 10) +
theme(plot.title = element_text(hjust = 0.5, size = 11, face = "bold")) +
ggtitle("C. Sézary Cells Projected\n(Transferred pseudotime)")
top_states <- state_summary %>% head(8) %>% pull(state_azimuth_l2)
state_plot_df <- state_per_line %>% filter(state_azimuth_l2 %in% top_states)
p_quant <- ggplot(state_plot_df,
aes(x = reorder(cell_line, -pct), y = pct,
fill = state_azimuth_l2)) +
geom_col(position = "stack", color = "white", linewidth = 0.3) +
scale_fill_manual(values = azimuth_l2_colors, na.value = "grey40",
name = "Azimuth l2") +
theme_classic(base_size = 10) +
theme(axis.text.x = element_text(angle = 40, hjust = 1, size = 9),
plot.title = element_text(hjust = 0.5, size = 11, face = "bold"),
legend.key.size = unit(0.4, "cm"),
legend.text = element_text(size = 8)) +
labs(title = "D. State Proportions per Cell Line\n(Azimuth l2 label transfer)",
x = "", y = "% Cells")
(p_ref | p_pt_ref) / (p_proj | p_quant) +
plot_annotation(
title = "Sézary Cell Differentiation State Mapping onto Healthy CD4+ T Cell Trajectory",
theme = theme(plot.title = element_text(hjust = 0.5, size = 13,
face = "bold"))
)mal_state_pt <- mapped_MalignantCD4T@meta.data %>%
filter(is.finite(pseudotime_value), !is.na(state_azimuth_l2)) %>%
mutate(state = state_azimuth_l2)
state_order_mal <- mal_state_pt %>%
group_by(state) %>%
summarise(med = median(pseudotime_value), .groups = "drop") %>%
arrange(med) %>%
pull(state)
mal_state_pt$state <- factor(mal_state_pt$state, levels = state_order_mal)
ggplot(mal_state_pt, aes(x = state, y = pseudotime_value, fill = state)) +
geom_violin(scale = "width", alpha = 0.8, trim = TRUE) +
geom_boxplot(width = 0.1, fill = "white", outlier.size = 0.3) +
geom_hline(yintercept = c(naive_med, tcm_med, treg_med,
tem_med, temra_med),
linetype = "dashed", color = "grey50", linewidth = 0.5) +
scale_fill_manual(values = azimuth_l2_colors, na.value = "grey70") +
theme_classic(base_size = 11) +
theme(axis.text.x = element_text(angle = 40, hjust = 1, size = 9),
legend.position = "none",
plot.title = element_text(hjust = 0.5, face = "bold")) +
labs(title = "Transferred Pseudotime per Azimuth l2 State (Sézary Cells)",
subtitle = "Dashed lines = healthy reference state medians",
x = "Transferred State (cell of origin)",
y = "Transferred Pseudotime (differentiation position)")# This is the core result plot.
# For each transferred Azimuth l2 label, shows what fraction of cells
# fall into each pseudotime bin.
# Key message: TCM-labelled cells span TCM, TEM, and Temra bins
# → identity = TCM (cell of origin)
# → position = further along effector axis (partial skewing)
# FIX: build cross_df with explicit column selection from table output
# rather than rename() which can fail if column names differ
cross_df <- as.data.frame(cross_tab)
# table() always returns: Var1 → Label, Var2 → PT_Bin (from our named dims)
# but if table was constructed with named dims "Label" and "PT_Bin"
# then colnames are already Label, PT_Bin, Freq
colnames(cross_df) <- c("Label", "PT_Bin", "n")
cross_df <- cross_df %>%
group_by(Label) %>%
mutate(pct = round(100 * n / sum(n), 1)) %>%
ungroup() %>%
filter(n > 0) %>%
mutate(PT_Bin = factor(PT_Bin,
levels = c("Naive-like", "TCM-like", "Treg-like",
"TEM-like", "Temra-like")))
ggplot(cross_df, aes(x = PT_Bin, y = pct, fill = PT_Bin)) +
geom_col(color = "white", linewidth = 0.3) +
geom_text(aes(label = ifelse(pct >= 3, paste0(pct, "%"), "")),
vjust = -0.3, size = 3, fontface = "bold") +
facet_wrap(~ Label, scales = "free_y", ncol = 3) +
scale_fill_manual(values = bin_colors, drop = FALSE,
name = "Pseudotime Bin") +
theme_classic(base_size = 10) +
theme(axis.text.x = element_text(angle = 40, hjust = 1, size = 8),
strip.text = element_text(face = "bold", size = 9),
strip.background = element_rect(fill = "#E8F4FD"),
legend.position = "none",
plot.title = element_text(hjust = 0.5, face = "bold")) +
labs(
title = "Pseudotime Bin Distribution within Each Transferred Azimuth l2 Label",
subtitle = paste0(
"Core finding: TCM-labelled Sézary cells span multiple pseudotime bins\n",
"→ TCM identity (label) but partial effector skewing (pseudotime position)\n",
"Note: Treg panel shows 100% Treg-like by design (label-based bin assignment)"),
x = "Pseudotime Bin", y = "% Cells within label"
)Sys.setlocale("LC_COLLATE", "C")
options(expressions = 10000)
out_dir <- "results/MalignantCD4T_Monocle3_projection"
if (!dir.exists(out_dir)) dir.create(out_dir, recursive = TRUE)
library(SeuratObject)
SaveSeuratRds(mapped_MalignantCD4T,
file = file.path(out_dir,
"MalignantCD4T_mapped_monocle3_pseudotime_noProlif_ref.Rds"))
# Summary tables
write.csv(state_summary,
file.path(out_dir, "state_distribution_overall.csv"),
row.names = FALSE)
write.csv(state_per_line,
file.path(out_dir, "state_distribution_per_cellline.csv"),
row.names = FALSE)
write.csv(pt_bin_per_line,
file.path(out_dir, "pseudotime_bins_per_cellline.csv"),
row.names = FALSE)
write.csv(pt_stats,
file.path(out_dir, "pseudotime_stats_per_cellline.csv"),
row.names = FALSE)
write.csv(as.data.frame(cross_tab),
file.path(out_dir, "label_vs_pseudotime_bin_crosstable.csv"),
row.names = FALSE)
write.csv(mapped_MalignantCD4T@meta.data,
file.path(out_dir, "MalignantCD4T_full_metadata_with_projection.csv"),
row.names = TRUE)
# Bin boundaries — saved for methods section and reproducibility
bin_boundaries <- data.frame(
boundary = c("Naive_TCM", "TCM_TEM", "TEM_Temra", "TCM_Treg_branch"),
pseudotime_cut = c(naive_tcm_cut, tcm_tem_cut, tem_temra_cut, tcm_treg_cut),
naive_med = naive_med,
tcm_med = tcm_med,
treg_med = treg_med,
tem_med = tem_med,
temra_med = temra_med
)
write.csv(bin_boundaries,
file.path(out_dir, "pseudotime_bin_boundaries.csv"),
row.names = FALSE)
cat("✅ All outputs saved to:", out_dir, "\n\n")
cat("Files:\n")
print(list.files(out_dir))