This script projects Sézary syndrome malignant CD4 T cells (cell lines L1–L7) onto the pre-built normal CD4 T cell reference trajectory.
Requires outputs from
cd4_ref_dual_trajectory.Rmd: -
cd4_ref_dual_trajectory.rds — reference with milestone +
pseudotime columns - monocle3_cds.rds — Monocle3
cell_data_set with principal graph - mst_graph.rds — igraph
MST object - milestone_centroids.rds — milestone centroid
coordinates and metadata
suppressPackageStartupMessages({
library(Seurat)
library(SeuratWrappers)
library(monocle3)
library(igraph)
library(RANN)
library(ggplot2)
library(ggrepel)
library(dplyr)
library(tidyr)
library(patchwork)
library(pheatmap)
library(viridis)
library(scales)
library(RColorBrewer)
library(knitr)
library(kableExtra)
})
STATE_COLORS <- c(
"CD4 Naive" = "#4472C4",
"CD4 TCM" = "#70AD47",
"CD4 TEM" = "#ED7D31",
"CD4 Temra/CTL" = "#C00000",
"Treg" = "#7030A0"
)
METHOD_COLORS <- c(
"Monocle3" = "#e74c3c",
"Custom MST" = "#2980b9"
)
STATE_ORDER <- c("CD4 Naive","CD4 TCM","CD4 TEM","CD4 Temra/CTL","Treg")
UMAP_NAME <- "umap"
STATE_COL <- "predicted.celltype.l2"
cat("✅ Setup complete\n")
✅ Setup complete
# ── Load reference and trajectory objects ─────────────────────────────────
cd4_ref <- readRDS("../1-Custom_MST/Objects/cd4_ref_dual_trajectory.rds")
cds <- readRDS("../1-Custom_MST/Objects/monocle3_cds.rds")
mst_graph <- readRDS("../1-Custom_MST/Objects/mst_graph.rds")
milestone_centroids <- readRDS("../1-Custom_MST/Objects/milestone_centroids.rds")
cat("Reference loaded:", ncol(cd4_ref), "cells\n")
Reference loaded: 11466 cells
cat("Milestone labels present:", "milestone" %in% colnames(cd4_ref@meta.data), "\n")
Milestone labels present: TRUE
cat("MST pseudotime present: ", "mst_pseudotime_norm" %in% colnames(cd4_ref@meta.data), "\n")
MST pseudotime present: TRUE
cat("M3 pseudotime present: ", "monocle3_pseudotime_norm" %in% colnames(cd4_ref@meta.data), "\n")
M3 pseudotime present: TRUE
# Validate required columns
stopifnot(
"milestone missing from cd4_ref" = "milestone" %in% colnames(cd4_ref@meta.data),
"mst_pseudotime_norm missing from cd4_ref" = "mst_pseudotime_norm" %in% colnames(cd4_ref@meta.data),
"monocle3_pseudotime_norm missing from cd4_ref" = "monocle3_pseudotime_norm" %in% colnames(cd4_ref@meta.data),
"integrated assay missing" = "integrated" %in% names(cd4_ref@assays),
"PCA loadings missing" = ncol(cd4_ref[["pca"]]@feature.loadings) > 0,
"UMAP model missing" = !is.null(cd4_ref[["umap"]]@misc$model)
)
cat("✅ Reference validation passed\n")
✅ Reference validation passed
cat("\nMilestone distribution in reference:\n")
Milestone distribution in reference:
print(table(cd4_ref$milestone))
M00 M01 M02 M03 M04 M05 M06
2037 4350 4717 145 10 146 61
cat("\nAzimuth l2 labels:\n")
Azimuth l2 labels:
print(table(cd4_ref@meta.data[[STATE_COL]]))
CD4 Naive CD4 TCM CD4 TEM CD4 Temra/CTL Treg
2037 9067 145 10 207
# ── These objects were created during MST computation in Script 1.
# ── Reconstruct from saved RDS files so this script is fully standalone.
# Milestone order
milestone_order <- sprintf("M%02d", seq_len(nrow(milestone_centroids)) - 1L)
# Graph distances from root M00 (unweighted hop count)
graph_dists <- distances(mst_graph, v = "M00", weights = NA)
graph_dists[is.infinite(graph_dists)] <-
max(graph_dists[is.finite(graph_dists)]) + 1
g_dists_vec <- setNames(as.numeric(graph_dists), colnames(graph_dists))
# MST pseudotime range (needed for Sézary normalisation)
mst_pt_range <- range(cd4_ref$mst_pseudotime, na.rm = TRUE)
# MST edge data frame for plotting
mst_edges <- as_edgelist(mst_graph)
mst_edge_df <- do.call(rbind, lapply(seq_len(nrow(mst_edges)), function(i) {
m1 <- mst_edges[i,1]; m2 <- mst_edges[i,2]
r1 <- milestone_centroids[milestone_centroids$milestone == m1, ]
r2 <- milestone_centroids[milestone_centroids$milestone == m2, ]
if (nrow(r1)==0 || nrow(r2)==0) return(NULL)
data.frame(x1=r1$UMAP_1, y1=r1$UMAP_2, x2=r2$UMAP_1, y2=r2$UMAP_2)
}))
# Monocle3 edge data frame for plotting
pr_graph_obj <- principal_graph(cds)[["UMAP"]]
pr_layout <- t(cds@principal_graph_aux[["UMAP"]]$dp_mst)
m3_edge_list <- igraph::as_edgelist(pr_graph_obj)
m3_edge_df <- do.call(rbind, lapply(seq_len(nrow(m3_edge_list)), function(i) {
n1 <- m3_edge_list[i,1]; n2 <- m3_edge_list[i,2]
data.frame(x1=pr_layout[n1,1], y1=pr_layout[n1,2],
x2=pr_layout[n2,1], y2=pr_layout[n2,2])
}))
# UMAP coordinates data frame for reference background in plots
umap_df <- as.data.frame(Embeddings(cd4_ref, UMAP_NAME))
colnames(umap_df) <- c("UMAP_1","UMAP_2")
umap_df$cell <- rownames(umap_df)
umap_df$state <- cd4_ref@meta.data[[STATE_COL]]
umap_df$milestone <- cd4_ref@meta.data$milestone
# MST projection function (needed for sezary-pseudotime chunk)
project_onto_mst <- function(cell_coords, centroid_mat, mst_graph, graph_dists) {
edges <- as_edgelist(mst_graph)
n_cells <- nrow(cell_coords)
pseudotime <- numeric(n_cells)
nearest_ms <- character(n_cells)
for (i in seq_len(n_cells)) {
cx <- cell_coords[i,1]; cy <- cell_coords[i,2]
best_dist <- Inf; best_pt <- 0; best_ms <- NA_character_
for (e in seq_len(nrow(edges))) {
m1 <- edges[e,1]; m2 <- edges[e,2]
p1 <- centroid_mat[m1,]; p2 <- centroid_mat[m2,]
dx <- p2[1]-p1[1]; dy <- p2[2]-p1[2]
len2 <- dx^2 + dy^2
if (len2 < 1e-10) next
t <- max(0, min(1, ((cx-p1[1])*dx + (cy-p1[2])*dy) / len2))
proj <- c(p1[1]+t*dx, p1[2]+t*dy)
d <- sqrt((cx-proj[1])^2 + (cy-proj[2])^2)
if (d < best_dist) {
best_dist <- d
pt_m1 <- graph_dists[m1]
pt_m2 <- graph_dists[m2]
best_pt <- pt_m1 + t * (pt_m2 - pt_m1)
best_ms <- if (t < 0.5) m1 else m2
}
}
pseudotime[i] <- best_pt
nearest_ms[i] <- best_ms
}
list(pseudotime = pseudotime, nearest_ms = nearest_ms)
}
# centroid_mat needed for project_onto_mst
centroid_mat <- as.matrix(milestone_centroids[, c("UMAP_1","UMAP_2")])
rownames(centroid_mat) <- milestone_centroids$milestone
cat("✅ All projection objects reconstructed\n")
✅ All projection objects reconstructed
cat("Milestone order:", paste(milestone_order, collapse=" → "), "\n")
Milestone order: M00 → M01 → M02 → M03 → M04 → M05 → M06
cat("MST pseudotime range:", round(mst_pt_range[1],2), "–", round(mst_pt_range[2],2), "\n")
MST pseudotime range: 0 – 4
sezary_obj <- readRDS("/home/nabbasi/apollo_home/1-Seurat_RDS_OBJECT_FINAL/All_samples_Merged_with_Renamed_Clusters_Cell_state-03-12-2025.rds.rds")
cat("Sézary object loaded:", ncol(sezary_obj), "cells\n")
Sézary object loaded: 49305 cells
cat("\nAll orig.ident values:\n")
All orig.ident values:
print(table(sezary_obj$orig.ident))
L1 L2 L3 L4 L5 L6 L7 CD4T_lab CD4T_10x
5825 5935 6428 6006 6022 5148 5331 5106 3504
stopifnot(
"RNA assay missing from sezary_obj" = "RNA" %in% names(sezary_obj@assays)
)
cat("✅ Sézary object validated\n")
✅ Sézary object validated
# ── Subset to malignant Sézary cell lines only ────────────────────────────
malignant_idents <- c("L1","L2","L3","L4","L5","L6","L7")
sezary_obj <- subset(sezary_obj, subset = orig.ident %in% malignant_idents)
cat("Cell counts after subsetting to L1-L7:\n")
Cell counts after subsetting to L1-L7:
print(table(sezary_obj$orig.ident))
L1 L2 L3 L4 L5 L6 L7 CD4T_lab CD4T_10x
5825 5935 6428 6006 6022 5148 5331 0 0
cat("\nTotal malignant cells:", ncol(sezary_obj), "\n")
Total malignant cells: 40695
stopifnot(
"No cells after subset — check orig.ident values" = ncol(sezary_obj) > 0,
"Expected 7 cell lines" = length(unique(sezary_obj$orig.ident)) == 7
)
cat("✅ Malignant subset validated\n")
✅ Malignant subset validated
# Just for this step
old_plan <- plan("sequential")
options(future.globals.maxSize = 50 * 1024^3) # 50 GB
# ══════════════════════════════════════════════════════════════════════════
# SÉZARY CELL PROJECTION ONTO REFERENCE TRAJECTORY
# ══════════════════════════════════════════════════════════════════════════
# Reference was built with:
# SCTransform (per dataset, vst.flavor="v2") →
# PrepSCTIntegration → FindIntegrationAnchors (RPCA) →
# IntegrateData (normalization.method="SCT") →
# RunPCA (integrated assay, return.model=TRUE) →
# RunUMAP (return.model=TRUE)
#
# Therefore:
# normalization.method = "SCT" (matches reference integration)
# reference.reduction = "pca" (PCA lives on integrated assay)
# reduction = "pcaproject" (uses saved PCA rotation matrix)
# Query must be SCT-normalised per sample to match reference feature space
# ══════════════════════════════════════════════════════════════════════════
# ── Step 1: Set reference default assay ───────────────────────────────────
DefaultAssay(cd4_ref) <- "integrated"
cat("Reference default assay:", DefaultAssay(cd4_ref), "\n")
Reference default assay: integrated
cat("Reference cells:", ncol(cd4_ref), "\n")
Reference cells: 11466
cat("Reference PCA dims:", ncol(Embeddings(cd4_ref, "pca")), "\n")
Reference PCA dims: 50
cat("PCA model present:", ncol(cd4_ref[["pca"]]@feature.loadings) > 0, "\n")
PCA model present: TRUE
cat("UMAP model present:", !is.null(cd4_ref[["umap"]]@misc$model), "\n")
UMAP model present: TRUE
# ── Step 2: Cell cycle scoring on query ───────────────────────────────────
DefaultAssay(sezary_obj) <- "RNA"
sezary_obj <- NormalizeData(sezary_obj, verbose = FALSE)
sezary_obj <- CellCycleScoring(
sezary_obj,
s.features = cc.genes.updated.2019$s.genes,
g2m.features = cc.genes.updated.2019$g2m.genes,
set.ident = FALSE
)
cat("\nCell cycle scoring complete\n")
Cell cycle scoring complete
print(table(sezary_obj$Phase))
G1 G2M S
15379 13760 11556
# ── Step 3: Per-sample SCT on query ───────────────────────────────────────
# Reference was SCT'd per dataset (3 models). Query must match.
# Each cell line is a separate library — per-line SCT prevents batch effects
# from dominating HVG selection and anchor finding.
split_col <- "orig.ident"
cat("\nSplitting query by:", split_col, "\n")
Splitting query by: orig.ident
print(table(sezary_obj@meta.data[[split_col]]))
L1 L2 L3 L4 L5 L6 L7
5825 5935 6428 6006 6022 5148 5331
sezary_list <- SplitObject(sezary_obj, split.by = split_col)
cat("Running SCT per sample (", length(sezary_list), "samples)...\n")
Running SCT per sample ( 7 samples)...
sezary_list <- lapply(seq_along(sezary_list), function(i) {
cat(" SCT sample", i, "/", length(sezary_list),
"-", names(sezary_list)[i],
"(", ncol(sezary_list[[i]]), "cells)\n")
SCTransform(
sezary_list[[i]],
vst.flavor = "v2",
vars.to.regress = c("S.Score", "G2M.Score", "percent.mt"),
verbose = FALSE
)
})
SCT sample 1 / 7 - L1 ( 5825 cells)
SCT sample 2 / 7 - L2 ( 5935 cells)
SCT sample 3 / 7 - L3 ( 6428 cells)
SCT sample 4 / 7 - L4 ( 6006 cells)
SCT sample 5 / 7 - L5 ( 6022 cells)
SCT sample 6 / 7 - L6 ( 5148 cells)
SCT sample 7 / 7 - L7 ( 5331 cells)
sezary_obj <- merge(sezary_list[[1]], sezary_list[-1], merge.data = TRUE)
cat("✅ SCT complete — merged query:", ncol(sezary_obj), "cells\n")
✅ SCT complete — merged query: 40695 cells
# ── Step 4: Set shared variable features ──────────────────────────────────
# Use reference integration features as the anchor feature set.
# These are the 2902 genes used to build the reference PCA — using the
# same genes ensures the pcaproject rotation is valid.
ref_features <- rownames(cd4_ref[["integrated"]]@scale.data)
query_genes <- rownames(sezary_obj[["SCT"]])
shared_features <- intersect(ref_features, query_genes)
cat("\nReference integration features:", length(ref_features), "\n")
Reference integration features: 2902
cat("Query SCT genes: ", length(query_genes), "\n")
Query SCT genes: 23460
cat("Shared features: ", length(shared_features), "\n")
Shared features: 2893
if (length(shared_features) < 1500) {
warning(paste0(
"⚠️ Only ", length(shared_features), " shared features.\n",
" Minimum recommended is 1500.\n",
" Check that sezary_obj RNA assay contains the same gene set as reference."
))
}
# ── Step 5: Clean shared features ─────────────────────────────────────────
# Remove ribosomal, mitochondrial, TCR, HLA and other batch-driven genes.
junk_pattern <- "^RPL|^RPS|^MT-|^HSP|^HSPA|^HSPB|^HSPD|^HSPE|^HSPH|^SNHG|^MALAT1|^NEAT1|^XIST|^TRBV|^TRAV|^TRGV|^TRDV|^HLA-"
shared_features_clean <- shared_features[!grepl(junk_pattern, shared_features)]
cat("\nFeatures before cleaning:", length(shared_features), "\n")
Features before cleaning: 2893
cat("Features after cleaning: ", length(shared_features_clean), "\n")
Features after cleaning: 2823
cat("Junk genes removed: ", length(shared_features) - length(shared_features_clean), "\n")
Junk genes removed: 70
# Sanity check: key T cell markers present?
markers_check <- c("CCR7","SELL","TCF7","IL7R","GZMB","PRF1",
"FOXP3","TOX","PDCD1","CD44","CD69","TNF",
"TIGIT","HAVCR2","EOMES","TBX21","CXCR3","GZMK")
found_markers <- markers_check[markers_check %in% shared_features_clean]
cat("\nKey T cell markers in feature set:",
length(found_markers), "/", length(markers_check), "\n")
Key T cell markers in feature set: 13 / 18
print(found_markers)
[1] "CCR7" "SELL" "TCF7" "IL7R" "PRF1" "FOXP3" "TOX" "CD44" "CD69" "TNF" "TIGIT" "CXCR3" "GZMK"
if (length(found_markers) < 8) {
warning(paste0(
"⚠️ Fewer than 8 T cell markers in shared features.\n",
" Feature set may be dominated by non-specific genes.\n",
" Inspect head(shared_features_clean, 30)."
))
}
VariableFeatures(sezary_obj) <- shared_features_clean
DefaultAssay(sezary_obj) <- "SCT"
# ── Step 6: Find transfer anchors ─────────────────────────────────────────
cat("\n=== Finding transfer anchors ===\n")
=== Finding transfer anchors ===
cat("Reference cells:", ncol(cd4_ref), "\n")
Reference cells: 11466
cat("Query cells: ", ncol(sezary_obj), "\n")
Query cells: 40695
cat("Features: ", length(shared_features_clean), "\n")
Features: 2823
# dims: use reference PCA only — pcaproject does not require query PCA
dims_to_use <- min(30, ncol(Embeddings(cd4_ref, "pca")))
cat("Using dims 1:", dims_to_use, "\n")
Using dims 1: 30
anchors <- FindTransferAnchors(
reference = cd4_ref,
query = sezary_obj,
features = shared_features_clean,
normalization.method = "SCT", # ✅ matches SCT-RPCA reference
reference.reduction = "pca", # ✅ PCA on integrated assay
reduction = "pcaproject", # ✅ project into reference PCA space
dims = 1:dims_to_use,
k.anchor = 10, # default 5 → more anchor candidates
k.filter = 500, # default 200 → less pruning on large query
k.score = 30, # default 30 → unchanged
verbose = TRUE
)
n_anchors <- nrow(anchors@anchors)
anchor_ratio <- ncol(sezary_obj) / n_anchors
cat("\nAnchors found: ", n_anchors, "\n")
Anchors found: 8661
cat("Cells per anchor: ", round(anchor_ratio, 1), "\n")
Cells per anchor: 4.7
if (anchor_ratio > 8) {
warning(paste0(
"⚠️ Low anchor density (1:", round(anchor_ratio, 1), ").\n",
" Ideal is 1:5 or better.\n",
" Check T cell markers found above — want ≥ 8/18.\n",
" Inspect head(shared_features_clean, 30) for batch genes."
))
} else {
cat("✅ Anchor density acceptable\n")
}
✅ Anchor density acceptable
# ── Step 7: MapQuery ──────────────────────────────────────────────────────
cat("\n=== Running MapQuery ===\n")
=== Running MapQuery ===
sezary_obj <- MapQuery(
anchorset = anchors,
query = sezary_obj,
reference = cd4_ref,
refdata = list(
predicted_state = STATE_COL, # predicted.celltype.l2
predicted_milestone = "milestone" # MST milestone labels (M00-M06)
),
reference.reduction = "pca", # basis for UMAP projection
reduction.model = "umap", # uses saved UMAP model
verbose = FALSE
)
Using method 'umap'
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
cat("✅ MapQuery complete\n")
✅ MapQuery complete
cat("Mapped cells:", ncol(sezary_obj), "\n")
Mapped cells: 40695
# ── Step 8: Coerce numeric transfers from character ───────────────────────
for (col in colnames(sezary_obj@meta.data)) {
if (grepl("score$", col, ignore.case = TRUE)) {
sezary_obj@meta.data[[col]] <- as.numeric(sezary_obj@meta.data[[col]])
}
}
# ── Step 9: Validate results ──────────────────────────────────────────────
cat("\nPredicted state distribution:\n")
Predicted state distribution:
print(table(sezary_obj$predicted.predicted_state))
CD4 TCM CD4 TEM Treg
40628 66 1
cat("\nPrediction score summary:\n")
Prediction score summary:
print(summary(sezary_obj$predicted.predicted_state.score))
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.4108 1.0000 1.0000 0.9837 1.0000 1.0000
cat("\nPredicted milestone distribution:\n")
Predicted milestone distribution:
print(table(sezary_obj$predicted.predicted_milestone))
M01 M02 M03 M05
1713 38893 86 3
state_pct <- prop.table(table(sezary_obj$predicted.predicted_state)) * 100
dominant <- names(which(state_pct > 60))
if (length(dominant) > 0) {
warning(paste0(
"⚠️ State collapse detected: ", dominant,
" contains ", round(max(state_pct), 1), "% of cells.\n",
" Check anchor density and shared feature quality above."
))
} else {
cat("✅ State distribution is diverse — no collapse detected\n")
}
sez_umap_coords <- as.matrix(Embeddings(sezary_obj, "ref.umap"))
colnames(sez_umap_coords) <- c("UMAP_1","UMAP_2")
cat("Projecting Sézary cells onto MST...\n")
Projecting Sézary cells onto MST...
sez_mst <- project_onto_mst(sez_umap_coords, centroid_mat, mst_graph, g_dists_vec)
ref_umap_mat <- Embeddings(cd4_ref, UMAP_NAME)
ref_m3_pt <- cd4_ref@meta.data$monocle3_pseudotime_norm
nn_result <- nn2(data=ref_umap_mat, query=sez_umap_coords, k=15)
inv_dist <- 1 / (nn_result$nn.dists + 1e-6)
weights <- inv_dist / rowSums(inv_dist)
meta_sez <- sezary_obj@meta.data
meta_sez$mst_pseudotime <- pmax(0, pmin(100,
100 * (sez_mst$pseudotime - mst_pt_range[1]) /
(mst_pt_range[2] - mst_pt_range[1])
))
meta_sez$nearest_milestone_mst <- sez_mst$nearest_ms
meta_sez$monocle3_pseudotime <- rowSums(
weights * matrix(ref_m3_pt[nn_result$nn.idx], nrow=nrow(nn_result$nn.idx))
)
sezary_obj@meta.data <- meta_sez
cat("MST pseudotime:", round(range(sezary_obj@meta.data$mst_pseudotime, na.rm=TRUE), 1), "\n")
MST pseudotime: 0 100
cat("Monocle3 pseudotime:", round(range(sezary_obj@meta.data$monocle3_pseudotime, na.rm=TRUE), 1), "\n")
Monocle3 pseudotime: 1.9 99.3
p_sez_m3_state <- ggplot() +
geom_point(data=ref_bg[sample(nrow(ref_bg)),],
aes(x=UMAP_1,y=UMAP_2), colour="grey85", size=.3, alpha=.4) +
geom_segment(data=m3_edge_df, aes(x=x1,y=y1,xend=x2,yend=y2),
colour="grey30", linewidth=.6, alpha=.7) +
geom_point(data=sez_plot_df,
aes(x=UMAP_1,y=UMAP_2,colour=predicted_state), size=1.8, alpha=.85) +
scale_colour_manual(values=STATE_COLORS, name="Predicted state") +
theme_classic() +
labs(x="UMAP-1", y="UMAP-2",
title="Sézary cells — Monocle3 projection (predicted state)") +
theme(plot.title=element_text(size=13, face="bold"))
p_sez_m3_pt <- ggplot() +
geom_point(data=ref_bg[sample(nrow(ref_bg)),],
aes(x=UMAP_1,y=UMAP_2), colour="grey85", size=.3, alpha=.4) +
geom_segment(data=m3_edge_df, aes(x=x1,y=y1,xend=x2,yend=y2),
colour="grey30", linewidth=.5, alpha=.6) +
geom_point(data=sez_plot_df,
aes(x=UMAP_1,y=UMAP_2,colour=m3_pt), size=1.8, alpha=.9) +
scale_colour_gradientn(
colours=c("#0D0887","#6A00A8","#B12A90","#E16462","#FCA636","#F0F921"),
name="Monocle3\npseudotime", limits=c(0,100)) +
theme_classic() +
labs(x="UMAP-1", y="UMAP-2", title="Sézary — Monocle3 projected pseudotime") +
theme(plot.title=element_text(size=13, face="bold"))
print(p_sez_m3_state / p_sez_m3_pt)
sez_cor <- cor(sez_plot_df$mst_pt, sez_plot_df$m3_pt,
method="spearman", use="complete.obs")
p_sez_scatter <- ggplot(sez_plot_df, aes(x=mst_pt, y=m3_pt, colour=predicted_state)) +
geom_point(size=1.2, alpha=.7) +
geom_abline(slope=1, intercept=0, linetype="dashed", colour="grey40") +
geom_smooth(method="lm", se=FALSE, colour="black", linewidth=.8) +
scale_colour_manual(values=STATE_COLORS, name="Predicted state") +
annotate("text", x=5, y=90, label=sprintf("Spearman ρ = %.3f", sez_cor),
size=4.5, fontface="bold", hjust=0) +
theme_classic() +
labs(x="Custom MST pseudotime", y="Monocle3 pseudotime",
title="Sézary cells: pseudotime method agreement") +
theme(plot.title=element_text(size=12, face="bold"))
p_sez_violin <- ggplot(
sez_plot_df %>%
pivot_longer(c(mst_pt,m3_pt), names_to="method", values_to="pt") %>%
mutate(method=recode(method, mst_pt="Custom MST", m3_pt="Monocle3")),
aes(x=predicted_state, y=pt, fill=method)
) +
geom_violin(scale="width", trim=FALSE, alpha=.7, position=position_dodge(.8)) +
geom_boxplot(width=.08, fill="white", outlier.size=.3, position=position_dodge(.8)) +
scale_fill_manual(values=METHOD_COLORS, name="Method") +
theme_classic() +
labs(x="Predicted state", y="Pseudotime (0–100)",
title="Sézary pseudotime per state: method comparison") +
theme(plot.title=element_text(size=12, face="bold"),
axis.text.x=element_text(angle=30, hjust=1))
print(p_sez_scatter | p_sez_violin)
# State distribution per cell line
cell_line_state <- sez_plot_df %>%
count(cell_line, predicted_state) %>%
group_by(cell_line) %>%
mutate(pct = round(100 * n / sum(n), 1)) %>%
ungroup()
p_cell_line <- ggplot(cell_line_state,
aes(x=cell_line, y=pct, fill=predicted_state)) +
geom_col(width=.7) +
scale_fill_manual(values=STATE_COLORS, name="Predicted state") +
theme_classic() +
labs(x="Cell line", y="% cells",
title="Predicted state distribution per cell line") +
theme(plot.title=element_text(size=12, face="bold"))
# Pseudotime per cell line
p_pt_cell_line <- ggplot(sez_plot_df,
aes(x=cell_line, y=mst_pt, fill=cell_line)) +
geom_violin(scale="width", trim=FALSE, alpha=.8) +
geom_boxplot(width=.08, fill="white", outlier.size=.3) +
theme_classic() +
labs(x="Cell line", y="MST pseudotime (0–100)",
title="Pseudotime distribution per cell line") +
theme(plot.title=element_text(size=12, face="bold"),
legend.position="none")
print(p_cell_line | p_pt_cell_line)
sez_summary <- sez_plot_df %>%
group_by(predicted_state) %>%
summarise(
n_cells = n(),
pct = round(100*n()/nrow(sez_plot_df), 1),
mean_pred_score = round(mean(pred_score), 3),
mst_mean_pt = round(mean(mst_pt, na.rm=TRUE), 1),
mst_sd_pt = round(sd(mst_pt, na.rm=TRUE), 1),
m3_mean_pt = round(mean(m3_pt, na.rm=TRUE), 1),
m3_sd_pt = round(sd(m3_pt, na.rm=TRUE), 1),
.groups = "drop"
) %>% arrange(mst_mean_pt)
kable(sez_summary,
col.names = c("State","N","% of Sézary","Pred. score",
"MST mean pt","MST SD","M3 mean pt","M3 SD"),
caption = "Sézary cell trajectory summary — both methods") %>%
kable_styling(bootstrap_options=c("striped","hover"), full_width=FALSE) %>%
column_spec(5:6, color="white", background=METHOD_COLORS["Custom MST"]) %>%
column_spec(7:8, color="white", background=METHOD_COLORS["Monocle3"])
| State | N | % of Sézary | Pred. score | MST mean pt | MST SD | M3 mean pt | M3 SD |
|---|---|---|---|---|---|---|---|
| CD4 TCM | 40628 | 99.8 | 0.984 | 68.4 | 20.1 | 64.4 | 15.8 |
| Treg | 1 | 0.0 | 0.480 | 96.0 | NA | 44.0 | NA |
| CD4 TEM | 66 | 0.2 | 0.526 | 99.4 | 2.4 | 98.0 | 0.0 |
ms_dist <- sez_plot_df %>%
count(milestone_ref) %>%
left_join(milestone_centroids[,c("milestone","state","order_pos")],
by=c("milestone_ref"="milestone")) %>%
mutate(pct=round(100*n/sum(n),1),
milestone_ref=factor(milestone_ref, levels=milestone_order)) %>%
arrange(order_pos)
p_ms_bar <- ggplot(ms_dist, aes(x=milestone_ref, y=pct, fill=state)) +
geom_col(width=.7) +
geom_text(aes(label=sprintf("%.0f%%",pct)), vjust=-.3, size=3) +
scale_fill_manual(values=STATE_COLORS, name="State") +
scale_x_discrete(limits=milestone_order) +
theme_classic() +
labs(x="Nearest reference milestone", y="% Sézary cells",
title="Sézary cell distribution across reference milestones") +
theme(axis.text.x=element_text(angle=45,hjust=1),
plot.title=element_text(size=12,face="bold"))
print(p_ms_bar)
# ── RDS ───────────────────────────────────────────────────────────────────
saveRDS(sezary_obj, "Objects/sezary_projected_dual.rds")
Error: C stack usage 7969460 is too close to the limit
# ── CSV tables ────────────────────────────────────────────────────────────
write.csv(sez_summary, "Tables/sezary_trajectory_summary.csv", row.names=FALSE)
write.csv(ms_dist, "Tables/sezary_milestone_distribution.csv", row.names=FALSE)
# ── Figures ───────────────────────────────────────────────────────────────
ggsave("Figures/fig_sezary_mst_state.pdf", p_sez_mst_state, width=12, height=8)
ggsave("Figures/fig_sezary_mst_pt.pdf", p_sez_mst_pt, width=12, height=8)
ggsave("Figures/fig_sezary_m3_state.pdf", p_sez_m3_state, width=12, height=8)
ggsave("Figures/fig_sezary_m3_pt.pdf", p_sez_m3_pt, width=12, height=8)
ggsave("Figures/fig_sezary_agreement.pdf", p_sez_scatter, width=12, height=5)
`geom_smooth()` using formula = 'y ~ x'
ggsave("Figures/fig_sezary_cell_lines.pdf", p_cell_line, width=12, height=5)
ggsave("Figures/fig_milestone_bar.pdf", p_ms_bar, width=12, height=5)
cat("✅ All outputs saved\n")
✅ All outputs saved
sessionInfo()
R version 4.5.2 (2025-10-31)
Platform: x86_64-redhat-linux-gnu
Running under: Rocky Linux 9.7 (Blue Onyx)
Matrix products: default
BLAS/LAPACK: FlexiBLAS OPENBLAS-OPENMP; LAPACK version 3.9.0
locale:
[1] LC_CTYPE=C.UTF-8 LC_NUMERIC=C LC_TIME=C.UTF-8 LC_COLLATE=C.UTF-8 LC_MONETARY=C.UTF-8
[6] LC_MESSAGES=C.UTF-8 LC_PAPER=C.UTF-8 LC_NAME=C LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C
time zone: Europe/Paris
tzcode source: system (glibc)
attached base packages:
[1] stats4 stats graphics grDevices utils datasets methods base
other attached packages:
[1] future_1.69.0 kableExtra_1.4.0 knitr_1.51 RColorBrewer_1.1-3
[5] scales_1.4.0 viridis_0.6.5 viridisLite_0.4.3 pheatmap_1.0.13
[9] patchwork_1.3.2 tidyr_1.3.2 dplyr_1.2.0 ggrepel_0.9.6
[13] ggplot2_4.0.2 RANN_2.6.2 igraph_2.2.2 monocle3_1.4.26
[17] SingleCellExperiment_1.32.0 SummarizedExperiment_1.40.0 GenomicRanges_1.62.1 Seqinfo_1.0.0
[21] IRanges_2.44.0 S4Vectors_0.48.0 MatrixGenerics_1.22.0 matrixStats_1.5.0
[25] Biobase_2.70.0 BiocGenerics_0.56.0 generics_0.1.4 SeuratWrappers_0.4.0
[29] Seurat_5.4.0 SeuratObject_5.3.0 sp_2.2-1
loaded via a namespace (and not attached):
[1] RcppAnnoy_0.0.23 splines_4.5.2 later_1.4.6 tibble_3.3.1 R.oo_1.27.1
[6] polyclip_1.10-7 fastDummies_1.7.5 lifecycle_1.0.5 Rdpack_2.6.6 globals_0.19.0
[11] lattice_0.22-9 MASS_7.3-65 magrittr_2.0.4 plotly_4.12.0 sass_0.4.10
[16] rmarkdown_2.30 jquerylib_0.1.4 yaml_2.3.12 remotes_2.5.0 httpuv_1.6.16
[21] otel_0.2.0 glmGamPoi_1.22.0 sctransform_0.4.3 spam_2.11-3 spatstat.sparse_3.1-0
[26] reticulate_1.45.0 cowplot_1.2.0 pbapply_1.7-4 minqa_1.2.8 abind_1.4-8
[31] Rtsne_0.17 purrr_1.2.1 R.utils_2.13.0 irlba_2.3.7 listenv_0.10.0
[36] spatstat.utils_3.2-1 goftest_1.2-3 RSpectra_0.16-2 spatstat.random_3.4-4 fitdistrplus_1.2-6
[41] parallelly_1.46.1 svglite_2.2.2 DelayedMatrixStats_1.32.0 codetools_0.2-20 DelayedArray_0.36.0
[46] xml2_1.5.2 tidyselect_1.2.1 farver_2.1.2 lme4_1.1-38 spatstat.explore_3.7-0
[51] jsonlite_2.0.0 progressr_0.18.0 ggridges_0.5.7 survival_3.8-6 systemfonts_1.3.1
[56] tools_4.5.2 ragg_1.5.0 ica_1.0-3 Rcpp_1.1.1 glue_1.8.0
[61] gridExtra_2.3 SparseArray_1.10.8 mgcv_1.9-4 xfun_0.56 withr_3.0.2
[66] BiocManager_1.30.27 fastmap_1.2.0 boot_1.3-32 digest_0.6.39 rsvd_1.0.5
[71] R6_2.6.1 mime_0.13 textshaping_1.0.4 scattermore_1.2 tensor_1.5.1
[76] dichromat_2.0-0.1 spatstat.data_3.1-9 R.methodsS3_1.8.2 data.table_1.18.2.1 httr_1.4.8
[81] htmlwidgets_1.6.4 S4Arrays_1.10.1 uwot_0.2.4 pkgconfig_2.0.3 gtable_0.3.6
[86] rsconnect_1.7.0 lmtest_0.9-40 S7_0.2.1 XVector_0.50.0 htmltools_0.5.9
[91] dotCall64_1.2 png_0.1-8 spatstat.univar_3.1-6 reformulas_0.4.4 rstudioapi_0.18.0
[96] reshape2_1.4.5 nlme_3.1-168 nloptr_2.2.1 cachem_1.1.0 zoo_1.8-15
[101] stringr_1.6.0 KernSmooth_2.23-26 parallel_4.5.2 miniUI_0.1.2 pillar_1.11.1
[106] grid_4.5.2 vctrs_0.7.1 promises_1.5.0 beachmat_2.26.0 xtable_1.8-4
[111] cluster_2.1.8.2 evaluate_1.0.5 cli_3.6.5 compiler_4.5.2 rlang_1.1.7
[116] future.apply_1.20.1 labeling_0.4.3 plyr_1.8.9 stringi_1.8.7 deldir_2.0-4
[121] lazyeval_0.2.2 spatstat.geom_3.7-0 Matrix_1.7-4 RcppHNSW_0.6.0 sparseMatrixStats_1.22.0
[126] shiny_1.12.1 rbibutils_2.4.1 ROCR_1.0-12 bslib_0.10.0
| Column | Description |
|---|---|
| predicted.predicted_state | MapQuery label transfer: predicted.celltype.l2 from reference (Naive/TCM/TEM/Temra/Treg) |
| predicted.predicted_state.score | MapQuery prediction confidence score (0–1) |
| predicted.predicted_milestone | MapQuery label transfer: milestone label from reference (M00–M06) |
| nearest_milestone_mst | Nearest MST milestone after edge projection in ref.umap space |
| mst_pseudotime | MST pseudotime on reference 0–100 scale (capped 0–100) |
| monocle3_pseudotime | KNN-weighted Monocle3 pseudotime from 15 nearest reference neighbours |