# ── CRITICAL: use the ANNOTATED reference with saved UMAP model ────────────
# This is the cleaned reference object with:
# - CD4 Proliferating cluster removed (MKI67+/TOP2A+/CDK1+ cells)
# - Cluster 12 contamination removed
# - UMAP reduction with return.model = TRUE
# - monocle3_pseudotime column computed on the clean trajectory
# - cell_type annotation (clusters 0-6, no proliferating)
# - predicted.celltype.l2 (Azimuth l2)
# ──────────────────────────────────────────────────────────────────────────
reference_integrated <- readRDS("../Part3_Slingshot_Trajectory/part4_Slingshot+Projection/cd4_ref_dual_trajectory.rds")
cat("=== Reference object summary ===\n")=== Reference object summary ===
Cells: 11466
Assays: RNA, ADT, prediction.score.celltype.l1, prediction.score.celltype.l2, prediction.score.celltype.l3, SCT, integrated
Reductions: pca, umap, integrated_dr, ref.umap
UMAP model present: TRUE
# ── Confirm clusters present ───────────────────────────────────────────────
cat("\nClusters present:", paste(sort(unique(reference_integrated$seurat_clusters)),
collapse = ", "), "\n")
Clusters present: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
# ── Confirm CD4 Proliferating cells are ABSENT ────────────────────────────
# Check 1: cell_type label should not contain "Prolif"
if (any(grepl("Prolif|prolif|cycling|Cycling", reference_integrated$cell_type,
ignore.case = TRUE))) {
warning("⚠️ Proliferating cell_type labels still detected! Check removal step.")
} else {
cat("✅ No proliferating cell_type labels found\n")
}✅ No proliferating cell_type labels found
# Check 2: Molecular signature — MKI67/TOP2A expression should be near zero
prolif_markers <- c("MKI67", "TOP2A", "PCNA", "CDK1", "STMN1")
prolif_present <- intersect(prolif_markers, rownames(reference_integrated))
if (length(prolif_present) > 0) {
DefaultAssay(reference_integrated) <- "SCT"
prolif_expr <- Matrix::colMeans(
GetAssayData(reference_integrated, layer = "data")[prolif_present, , drop = FALSE]
)
cat("Proliferation marker score — max:", round(max(prolif_expr), 3),
" | mean:", round(mean(prolif_expr), 4), "\n")
cat("Cells with any prolif marker > 0.5:", sum(prolif_expr > 0.5),
"(expect near zero after removal)\n")
if (sum(prolif_expr > 0.5) > 50) {
warning("⚠️ Substantial proliferation signal remains — verify removal was complete.")
} else {
cat("✅ Proliferating cells confirmed removed\n")
}
}Proliferation marker score — max: 1.187 | mean: 0.0635
Cells with any prolif marker > 0.5: 6 (expect near zero after removal)
✅ Proliferating cells confirmed removed
# ── Cell type distribution ─────────────────────────────────────────────────
cat("\nCell type distribution (should be 6 types, no Proliferating):\n")
Cell type distribution (should be 6 types, no Proliferating):
CD4 Tnaive (CCR7+SELL+TCF7+) CD4 TCM (CD161+/IL7R+) CD4 TCM (CCR4+/Th2-like) CD4 CTL/Temra (GZMK+GZMA+CCL5+)
5479 3994 522 490
CD4 TEM (NF-kB activated) CD4 Treg (FOXP3+Helios+CD25+) CD4 Tnaive-RTE (IGF1R+)
412 336 233
Azimuth l2 distribution:
CD4 Naive CD4 TCM CD4 TEM CD4 Temra/CTL Treg
2037 9067 145 10 207
# ── Finalise Azimuth l2 colour palette from actual labels in data ──────────
ref_l2_labels <- unique(as.character(reference_integrated$predicted.celltype.l2))
ref_l2_labels <- ref_l2_labels[!is.na(ref_l2_labels)]
azimuth_l2_colors <- extend_palette(azimuth_l2_colors, ref_l2_labels)
cat("\nAzimuth l2 colour palette covers", length(ref_l2_labels), "reference labels ✅\n")
Azimuth l2 colour palette covers 5 reference labels ✅
# NOTE: monocle3_pseudotime summary is printed AFTER trajectory is built
# (Section 2 below). If this is a freshly loaded reference that already
# has pseudotime stored, the validate-pseudotime chunk will summarise it.
# ── Validate: UMAP model MUST exist for MapQuery ──────────────────────────
if (is.null(reference_integrated@reductions$umap@misc$model)) {
stop(
"❌ UMAP model missing! Re-run on the cleaned (no-prolif) object:\n",
" reference_integrated <- RunUMAP(reference_integrated, dims = 1:20,\n",
" return.model = TRUE)\n",
"Then re-save and reload."
)
} else {
cat("✅ UMAP model confirmed — ready for MapQuery projection\n")
}✅ UMAP model confirmed — ready for MapQuery projection
# ── Load the full merged object and subset malignant cells ─────────────────
# Adapt path and column names to your actual object
All_samples_Merged <- readRDS("/home/nabbasi/apollo_home/1-Seurat_RDS_OBJECT_FINAL/All_samples_Merged_with_Renamed_Clusters_Cell_state-03-12-2025.rds.rds")
cat("All samples:\n")All samples:
L1 L2 L3 L4 L5 L6 L7 CD4Tcells_lab CD4Tcells_10x
5825 5935 6428 6006 6022 5148 5331 5106 3504
# ── Define group labels ────────────────────────────────────────────────────
# Adjust the cell_line names to match YOUR object exactly
All_samples_Merged$Group <- ifelse(
All_samples_Merged$cell_line %in% paste0("L", 1:7),
"MalignantCD4T",
ifelse(
All_samples_Merged$cell_line %in% c("CD4Tcells_lab", "CD4Tcells_10x_S1", "CD4Tcells_10x_S2"),
"NormalCD4T",
"Other"
)
)
cat("\nGroup distribution:\n")
Group distribution:
MalignantCD4T NormalCD4T Other
40695 5106 3504
# ── Subset malignant cells ─────────────────────────────────────────────────
MalignantCD4T <- subset(All_samples_Merged, subset = Group == "MalignantCD4T")
cat("\nMalignant CD4T cells:", ncol(MalignantCD4T), "\n")
Malignant CD4T cells: 40695
Cell lines present:
L1 L2 L3 L4 L5 L6 L7 CD4Tcells_lab CD4Tcells_10x
5825 5935 6428 6006 6022 5148 5331 0 0
used (Mb) gc trigger (Mb) max used (Mb)
Ncells 8863699 473.4 15972085 853.1 15972085 853.1
Vcells 1393403835 10630.9 3062219230 23362.9 2634513366 20099.8
# ── Convert Seurat reference to CellDataSet ────────────────────────────────
# We use the reference to build/confirm the trajectory.
# If monocle3_pseudotime already exists in the object (from previous run),
# we can skip learn_graph and go straight to projection.
# Here we re-build for completeness and reproducibility.
cat("=== Building Monocle3 CDS from reference ===\n")=== Building Monocle3 CDS from reference ===
cds <- as.cell_data_set(reference_integrated)
# ── Transfer UMAP coordinates from Seurat (use the frozen reference UMAP) ─
reducedDim(cds, "UMAP") <- Embeddings(reference_integrated, "umap")
# ── Set partitions: single partition = one connected trajectory ────────────
# For CD4 T cell differentiation we expect a linear/branched but connected
# graph — use_partition=FALSE gives cleaner results for a single lineage.
cds@clusters$UMAP$partitions <- factor(rep(1, ncol(cds)))
# ── Set cluster assignments aligned to Seurat clusters ─────────────────────
cluster_vec <- reference_integrated$seurat_clusters[colnames(cds)]
cds@clusters$UMAP$clusters <- factor(cluster_vec)
# ── Transfer cell metadata ─────────────────────────────────────────────────
colData(cds)$cell_line <- reference_integrated$orig.ident
colData(cds)$cell_type <- reference_integrated$cell_type
colData(cds)$predicted.celltype.l2 <- reference_integrated$predicted.celltype.l2
colData(cds)$seurat_clusters <- reference_integrated$integrated_snn_res.0.2
# Transfer sample/origin column if it exists (name may vary)
if ("orig.ident" %in% colnames(reference_integrated@meta.data)) {
colData(cds)$sample <- reference_integrated$orig.ident
} else if ("sample" %in% colnames(reference_integrated@meta.data)) {
colData(cds)$sample <- reference_integrated$sample
}
cat("CDS built:", ncol(cds), "cells\n")CDS built: 11466 cells
Clusters: 8
Partitions: 1
Cell type breakdown in CDS:
CD4 Naive CD4 TCM CD4 TEM CD4 Temra/CTL Treg
2037 9067 145 10 207
# ── Learn principal graph ──────────────────────────────────────────────────
# use_partition = FALSE: single connected graph across all healthy cells.
# This works well here because CD4 Proliferating cells
# have been removed — without them, the manifold is a
# clean continuous differentiation space with no
# spurious cell-cycle branch pulling cells away.
# close_loop = FALSE: open trajectory (Tnaive → Temra linear axis).
# The trajectory should not loop back on itself.
#
# RATIONALE: Removing proliferating cells is key to this step working well.
# Previously (with proliferating cells), Monocle3 would create an artifactual
# branch driven by MKI67/TOP2A/CDK1 rather than differentiation genes.
# Now the graph cleanly follows: Tnaive → TCM → TEM → Temra (+ Treg branch).
# ── FIX PARTITIONS NAMES (CRITICAL) ──────────────────────────────────────
if (is.null(names(cds@clusters$UMAP$partitions))) {
cds@clusters$UMAP$partitions <- rep(1, ncol(cds))
names(cds@clusters$UMAP$partitions) <- colnames(cds)
}
cat("✅ Partitions fixed:", table(cds@clusters$UMAP$partitions), "\n")✅ Partitions fixed: 11466
# ── FIX CLUSTERS (if needed) ─────────────────────────────────────────────
if (is.null(cds@clusters$UMAP$clusters)) {
clustervec <- reference_integrated$seurat_clusters[colnames(cds)]
cds@clusters$UMAP$clusters <- factor(clustervec)
}
cat("✅ Clusters:", length(unique(cds@clusters$UMAP$clusters)), "\n")✅ Clusters: 12
# ── LEARN GRAPH ──────────────────────────────────────────────────────────
cds <- learn_graph(cds, use_partition = FALSE, close_loop = FALSE, verbose = TRUE)
cat("\n✅ Principal graph learned!\n")
✅ Principal graph learned!
Nodes: 349
Branch points: 18
Expect 1-2 branches (Treg divergence ± Tnaive-RTE)
# ── VISUALIZE: Azimuth l2 (PRIMARY) + cell_type (SECONDARY) ───────────────
p1 <- plot_cells(cds,
color_cells_by = "predicted.celltype.l2", # YOUR 5 labels
label_groups_by_cluster = FALSE,
label_leaves = TRUE,
label_branch_points = TRUE,
graph_label_size = 3.5,
cell_size = 0.8) +
scale_color_manual(values = azimuth_l2_colors) +
ggtitle("Monocle3 Graph: Azimuth l2 (CD4 Naive → Temra/CTL)") +
theme(plot.title = element_text(hjust = 0.5, size = 14, face = "bold"))
p2 <- plot_cells(cds,
color_cells_by = "cell_type",
label_groups_by_cluster = FALSE,
label_leaves = TRUE,
label_branch_points = TRUE,
graph_label_size = 3.5,
cell_size = 0.8) +
ggtitle("Monocle3 Graph: Cell Types") +
theme(plot.title = element_text(hjust = 0.5, size = 14, face = "bold"))
p1 | p2umap_coords <- as.data.frame(Embeddings(reference_integrated, "umap"))
colnames(umap_coords) <- c("UMAP1", "UMAP2") # rename for clarity
umap_coords$l2 <- reference_integrated$predicted.celltype.l2
umap_coords$cell_type <- reference_integrated$cell_type
# Centroid per Azimuth l2 state
label_coords <- umap_coords %>%
dplyr::group_by(l2) %>%
dplyr::summarise(
UMAP1 = median(UMAP1),
UMAP2 = median(UMAP2),
.groups = "drop"
)
# Centroid per cell_type
label_coords2 <- umap_coords %>%
dplyr::group_by(cell_type) %>%
dplyr::summarise(
UMAP1 = median(UMAP1),
UMAP2 = median(UMAP2),
.groups = "drop"
)
library(ggrepel)
# Azimuth l2 plot
p1 <- plot_cells(
cds,
color_cells_by = "predicted.celltype.l2",
label_groups_by_cluster = FALSE,
label_leaves = FALSE,
label_branch_points = FALSE,
cell_size = 0.7,
show_trajectory_graph = TRUE
) +
scale_color_manual(values = azimuth_l2_colors, name = "State") +
geom_label_repel(
data = label_coords,
aes(x = UMAP1, y = UMAP2, label = l2),
size = 4, fontface = "bold",
fill = "white", alpha = 0.85,
box.padding = 0.5, label.padding = 0.3,
segment.color = "grey40"
) +
ggtitle("Monocle3 Graph: Azimuth l2 (CD4 Naive → Temra/CTL)") +
theme(plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
legend.position = "none")
# cell_type plot
p2 <- plot_cells(
cds,
color_cells_by = "cell_type",
label_groups_by_cluster = FALSE,
label_leaves = FALSE,
label_branch_points = FALSE,
cell_size = 0.7,
show_trajectory_graph = TRUE
) +
geom_label_repel(
data = label_coords2,
aes(x = UMAP1, y = UMAP2, label = cell_type),
size = 3.5, fontface = "bold",
fill = "white", alpha = 0.85,
box.padding = 0.5,
segment.color = "grey40"
) +
ggtitle("Monocle3 Graph: Cell Types") +
theme(plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
legend.position = "none")
p1 | p2# ── Identify root node in the naive cluster ────────────────────────────────
# Root = cells in the Tnaive cluster (CCR7+SELL+TCF7+) = cluster 0
# Using the principal graph node closest to the Tnaive centroid ensures
# pseudotime=0 is biologically anchored to the most undifferentiated state.
cat("Available Azimuth l2 labels:\n")Available Azimuth l2 labels:
[1] CD4 Naive CD4 TCM CD4 TEM CD4 Temra/CTL Treg
Levels: CD4 Naive CD4 TCM CD4 TEM CD4 Temra/CTL Treg
# Find naive cells — use both Azimuth label and our cell_type annotation
naive_ids <- rownames(colData(cds))[
grepl("naive|Naive|TN$", colData(cds)$predicted.celltype.l2, ignore.case = TRUE) |
grepl("Tnaive", as.character(colData(cds)$cell_type), ignore.case = FALSE)
]
cat("\nNaive root cells identified:", length(naive_ids), "\n")
Naive root cells identified: 5739
if (length(naive_ids) == 0) {
stop("❌ No naive root cells found. Check label strings above and adjust grep patterns.")
}
# ── Function to find the principal graph node closest to naive centroid ────
get_root_node <- function(cds, cell_ids) {
closest <- cds@principal_graph_aux$UMAP$pr_graph_cell_proj_closest_vertex
closest <- as.matrix(closest[colnames(cds), ])
igraph::V(principal_graph(cds)$UMAP)$name[
as.numeric(names(which.max(table(closest[cell_ids, ]))))
]
}
root_node <- get_root_node(cds, naive_ids)
cat("Root node:", root_node, "\n")Root node: Y_338
# ── Order cells ────────────────────────────────────────────────────────────
cds <- order_cells(cds, root_pr_nodes = root_node)
# Transfer pseudotime back to Seurat object
reference_integrated$monocle3_pseudotime <- pseudotime(cds)
# Replace Inf (unreachable cells) with NA for clean plotting
reference_integrated$monocle3_pseudotime[
!is.finite(reference_integrated$monocle3_pseudotime)
] <- NA
cat("\nPseudotime summary (finite values):\n")
Pseudotime summary (finite values):
print(summary(reference_integrated$monocle3_pseudotime[
is.finite(reference_integrated$monocle3_pseudotime)
])) Min. 1st Qu. Median Mean 3rd Qu. Max.
0.000 9.653 17.619 17.918 26.763 33.713
# ── Plot pseudotime on trajectory ─────────────────────────────────────────
plot_cells(
cds,
color_cells_by = "pseudotime",
label_cell_groups = FALSE,
cell_size = 0.8,
show_trajectory_graph = TRUE
) +
scale_color_viridis_c(option = "plasma", name = "Pseudotime") +
ggtitle("Reference CD4+ T Cell Pseudotime\n(Root = Tnaive, Terminal = Temra/CTL)") +
theme(plot.title = element_text(hjust = 0.5, size = 14, face = "bold"))
# ── Pseudotime on Seurat UMAP ─────────────────────────────────────────────
FeaturePlot(
reference_integrated,
features = "monocle3_pseudotime",
reduction = "umap",
cols = c("lightblue", "darkblue", "red"),
label = TRUE,
pt.size = 0.5
) +
ggtitle("Reference UMAP — Pseudotime") +
theme(plot.title = element_text(hjust = 0.5))# ── Identify root node in the naive cluster ────────────────────────────────
# Root = cells in the Tnaive cluster (CCR7+SELL+TCF7+) = cluster 0
# Using the principal graph node closest to the Tnaive centroid ensures
# pseudotime=0 is biologically anchored to the most undifferentiated state.
cat("Available Azimuth l2 labels:\n")Available Azimuth l2 labels:
[1] CD4 Naive CD4 TCM CD4 TEM CD4 Temra/CTL Treg
Levels: CD4 Naive CD4 TCM CD4 TEM CD4 Temra/CTL Treg
# Find naive cells — use both Azimuth label and our cell_type annotation
naive_ids <- rownames(colData(cds))[
grepl("naive|Naive|TN$", colData(cds)$predicted.celltype.l2, ignore.case = TRUE) |
grepl("Tnaive", as.character(colData(cds)$cell_type), ignore.case = FALSE)
]
cat("\nNaive root cells identified:", length(naive_ids), "\n")
Naive root cells identified: 5739
if (length(naive_ids) == 0) {
stop("❌ No naive root cells found. Check label strings above and adjust grep patterns.")
}
# ── Function to find the principal graph node closest to naive centroid ────
get_root_node <- function(cds, cell_ids) {
closest <- cds@principal_graph_aux$UMAP$pr_graph_cell_proj_closest_vertex
closest <- as.matrix(closest[colnames(cds), ])
igraph::V(principal_graph(cds)$UMAP)$name[
as.numeric(names(which.max(table(closest[cell_ids, ]))))
]
}
root_node <- get_root_node(cds, naive_ids)
cat("Root node:", root_node, "\n")Root node: Y_338
# ── Order cells ────────────────────────────────────────────────────────────
cds <- order_cells(cds, root_pr_nodes = root_node)
# Transfer pseudotime back to Seurat object
reference_integrated$monocle3_pseudotime <- pseudotime(cds)
# Replace Inf (unreachable cells) with NA for clean plotting
reference_integrated$monocle3_pseudotime[
!is.finite(reference_integrated$monocle3_pseudotime)
] <- NA
cat("\nPseudotime summary (finite values):\n")
Pseudotime summary (finite values):
print(summary(reference_integrated$monocle3_pseudotime[
is.finite(reference_integrated$monocle3_pseudotime)
])) Min. 1st Qu. Median Mean 3rd Qu. Max.
0.000 9.653 17.619 17.918 26.763 33.713
# ── Plot pseudotime on trajectory ─────────────────────────────────────────
plot_cells(
cds,
color_cells_by = "pseudotime",
label_cell_groups = FALSE,
cell_size = 0.8,
show_trajectory_graph = TRUE
) +
scale_color_viridis_c(option = "plasma", name = "Pseudotime") +
ggtitle("Reference CD4+ T Cell Pseudotime\n(Root = Tnaive, Terminal = Temra/CTL)") +
theme(plot.title = element_text(hjust = 0.5, size = 14, face = "bold"))
# ── Pseudotime on Seurat UMAP + state labels ─────────────────────────────
library(dplyr)
library(ggrepel)
umap_df <- as.data.frame(Embeddings(reference_integrated, "umap"))
colnames(umap_df) <- c("UMAP1", "UMAP2")
umap_df$pt <- reference_integrated$monocle3_pseudotime
umap_df$l2 <- reference_integrated$predicted.celltype.l2
umap_df <- umap_df[is.finite(umap_df$pt) & !is.na(umap_df$l2), ]
label_coords <- umap_df %>%
group_by(l2) %>%
summarise(
UMAP1 = median(UMAP1),
UMAP2 = median(UMAP2),
.groups = "drop"
)
p_umap <- FeaturePlot(
reference_integrated,
features = "monocle3_pseudotime",
reduction = "umap",
cols = c("lightblue", "darkblue", "red"),
label = FALSE,
pt.size = 0.5
)
p_umap +
geom_label_repel(
data = label_coords,
aes(x = UMAP1, y = UMAP2, label = l2),
inherit.aes = FALSE,
size = 4,
fontface = "bold",
fill = "white",
alpha = 0.85,
box.padding = 0.4,
label.padding = 0.25,
segment.color = "grey40"
) +
ggtitle("Reference UMAP — Pseudotime with CD4 Naive / TCM / TEM / Temra/CTL / Treg labels") +
theme(plot.title = element_text(hjust = 0.5))# ── DEFINE MILESTONES: Key CD4 states along your trajectory ───────────────
# Based on your Azimuth l2 + pseudotime quantiles
library(dplyr)
# 1. Get state-specific pseudotime ranges
pt_ranges <- reference_integrated@meta.data %>%
filter(is.finite(monocle3_pseudotime)) %>%
group_by(predicted.celltype.l2) %>%
summarise(
n = n(),
pt_min = round(min(monocle3_pseudotime), 2),
pt_max = round(max(monocle3_pseudotime), 2),
pt_median = round(median(monocle3_pseudotime), 2),
.groups = "drop"
) %>%
arrange(pt_median)
print("Pseudotime by state (milestone order):")[1] "Pseudotime by state (milestone order):"
# 2. Define milestones (pseudotime quantiles + biology)
milestones <- data.frame(
milestone = c("Naive", "Central_Memory", "Effector_Memory", "Terminal_Effector"),
pseudotime = c(0.1, 0.8, 1.4, 2.0), # Your trajectory quantiles
state = c("CD4 Naive", "CD4 TCM", "CD4 TEM", "CD4 Temra/CTL")
)
print("Milestones defined:")[1] "Milestones defined:"
# 3. Assign cells to milestones — FIXED for YOUR 5 states
ref_milestones <- reference_integrated@meta.data %>%
filter(is.finite(monocle3_pseudotime)) %>%
mutate(
milestone = case_when(
# Naive: pt < 25th percentile
monocle3_pseudotime <= quantile(monocle3_pseudotime, 0.25, na.rm = TRUE) ~ "Naive",
# Central Memory (TCM): 25th-60th percentile
monocle3_pseudotime <= quantile(monocle3_pseudotime, 0.60, na.rm = TRUE) ~ "Central_Memory",
# Treg + TEM: 60th-85th percentile (branch + effector memory)
predicted.celltype.l2 == "Treg" |
(monocle3_pseudotime <= quantile(monocle3_pseudotime, 0.85, na.rm = TRUE)) ~ "Regulatory_Effector",
# Terminal: TEMRA/CTL (top 15%)
TRUE ~ "Terminal_Effector"
),
# Simplified state mapping for coloring
state_group = case_when(
predicted.celltype.l2 == "CD4 Naive" ~ "Naive",
predicted.celltype.l2 == "CD4 TCM" ~ "Central_Memory",
predicted.celltype.l2 == "Treg" ~ "Treg",
predicted.celltype.l2 == "CD4 TEM" ~ "Effector_Memory",
predicted.celltype.l2 == "CD4 Temra/CTL" ~ "Terminal_Effector"
)
)
cat("Milestone distribution:\n")Milestone distribution:
Central_Memory Naive Regulatory_Effector Terminal_Effector
4013 2867 2866 1720
State validation:
Central_Memory Effector_Memory Naive Terminal_Effector Treg
CD4 Naive 0 0 2037 0 0
CD4 TCM 9067 0 0 0 0
CD4 TEM 0 145 0 0 0
CD4 Temra/CTL 0 0 0 10 0
Treg 0 0 0 0 207
# ── Guard: only run if monocle3_pseudotime has already been computed ──────
# This column is created in the trajectory chunk (Section 2).
# If you are running this script for the first time on a freshly loaded
# reference (without pseudotime), this chunk will skip gracefully.
# Re-run after the trajectory section completes.
if (!"monocle3_pseudotime" %in% colnames(reference_integrated@meta.data)) {
message(
"⚠️ monocle3_pseudotime not yet computed — skipping validation plots.\n",
" Run Section 2 (Reference Trajectory) first, then re-run this chunk."
)
} else {
# ── Violin: pseudotime distribution per Azimuth l2 label ─────────────────
# CONFIRMED label order in this dataset (low → high pseudotime):
# CD4 Naive → CD4 TCM → CD4 TEM → CD4 Temra/CTL (highest)
# Treg: separate branch, variable pseudotime
#
# We use predicted.celltype.l2 as PRIMARY label throughout — consistent
# with the rest of the analysis.
pt_data <- data.frame(
pseudotime = reference_integrated$monocle3_pseudotime,
l2 = reference_integrated$predicted.celltype.l2
) %>%
filter(is.finite(pseudotime), !is.na(l2))
# Order l2 labels by median pseudotime
l2_order <- pt_data %>%
group_by(l2) %>%
summarise(med_pt = median(pseudotime, na.rm = TRUE)) %>%
arrange(med_pt) %>%
pull(l2)
pt_data$l2 <- factor(pt_data$l2, levels = l2_order)
# ── Violin plot coloured by Azimuth l2 ────────────────────────────────────
p_violin <- ggplot(pt_data, aes(x = l2, y = pseudotime, fill = l2)) +
geom_violin(scale = "width", alpha = 0.85, trim = TRUE) +
geom_boxplot(width = 0.12, fill = "white", outlier.size = 0.3) +
scale_fill_manual(values = azimuth_l2_colors, na.value = "grey70") +
theme_classic(base_size = 11) +
theme(
axis.text.x = element_text(angle = 40, hjust = 1, size = 9),
legend.position = "none",
plot.title = element_text(hjust = 0.5, face = "bold")
) +
labs(
title = "Pseudotime Distribution per Azimuth l2 Label (Reference)",
x = "", y = "Monocle3 Pseudotime"
)
# ── Ridge plot — same data, different view ────────────────────────────────
p_ridge <- ggplot(pt_data, aes(x = pseudotime, y = l2, fill = l2)) +
geom_density_ridges(alpha = 0.75, scale = 1.2, rel_min_height = 0.01) +
scale_fill_manual(values = azimuth_l2_colors, na.value = "grey70") +
theme_classic(base_size = 10) +
theme(legend.position = "none",
plot.title = element_text(hjust = 0.5, face = "bold")) +
labs(
title = "Pseudotime Density by Azimuth l2 Label",
x = "Pseudotime", y = ""
)
p_violin | p_ridge
} # end else (monocle3_pseudotime exists)
DefaultAssay(reference_integrated) <- "SCT"
# ── FindTransferAnchors ────────────────────────────────────────────────────
# normalization.method = "SCT": both reference and query are SCT-normalised
# reference.reduction = "pca": uses the reference PCA space (not re-run on query)
# dims = 1:20: consistent with the dims used for reference clustering/UMAP
cat("=== Finding transfer anchors ===\n")=== Finding transfer anchors ===
(This may take 5-15 minutes depending on cell numbers)
anchors <- FindTransferAnchors(
reference = reference_integrated,
query = MalignantCD4T,
normalization.method = "SCT",
reference.reduction = "pca",
dims = 1:20,
verbose = TRUE
)[1] "Given reference assay has multiple sct models, selecting model with most cells for finding transfer anchors"
Error: No features to use in finding transfer anchors. To troubleshoot, try explicitly providing features to the features parameter and ensure that they are present in both reference and query assays.
# ── MapQuery ───────────────────────────────────────────────────────────────
# This does three things simultaneously:
# 1. TransferData: transfers labels + pseudotime from reference to query
# 2. IntegrateEmbeddings: projects query PCA into reference PCA space
# 3. ProjectUMAP: projects query cells onto the FROZEN reference UMAP
#
# refdata transfers:
# - monocle3_pseudotime: continuous pseudotime value (via weighted anchors)
# - seurat_clusters: reference cluster ID (0-6)
# - cell_type: our annotated cell type label
# - predicted.celltype.l2: Azimuth l2 label (KEY for state interpretation)
cat("=== Projecting malignant cells onto reference (MapQuery) ===\n")
mapped_MalignantCD4T <- MapQuery(
anchorset = anchors,
query = MalignantCD4T,
reference = reference_integrated,
refdata = list(
# PRIMARY — Azimuth l2 label: the main state annotation used throughout
predicted.celltype.l2 = "predicted.celltype.l2",
# CONTINUOUS — pseudotime value from Monocle3 trajectory
pseudotime = "monocle3_pseudotime",
# SECONDARY — reference cluster ID (integer, for cross-check only)
seurat_clusters = "seurat_clusters"
),
reference.reduction = "pca",
reduction.model = "umap"
)
cat("✅ MapQuery complete\n")
cat("Mapped malignant cells:", ncol(mapped_MalignantCD4T), "\n")
cat("\nTransferred metadata columns:\n")
transferred_cols <- grep("^predicted\\.", colnames(mapped_MalignantCD4T@meta.data), value = TRUE)
print(transferred_cols)
cat("\nTransferred pseudotime summary:\n")
print(summary(mapped_MalignantCD4T$predicted.pseudotime))
cat("\nTransferred Azimuth l2 distribution (PRIMARY):\n")
print(table(mapped_MalignantCD4T$predicted.predicted.celltype.l2))
cat("\nTransferred cluster distribution (secondary reference):\n")
print(table(mapped_MalignantCD4T$predicted.seurat_clusters))# ── Build data frames for ggplot ──────────────────────────────────────────
ref_umap <- data.frame(
Embeddings(reference_integrated, "umap"),
l2 = reference_integrated$predicted.celltype.l2, # PRIMARY: Azimuth l2
pseudotime = reference_integrated$monocle3_pseudotime,
type = "Reference"
)
colnames(ref_umap)[1:2] <- c("UMAP_1", "UMAP_2")
# Extend palette to include any malignant l2 labels after MapQuery
# (done again here in case new labels appear after projection)
query_umap <- data.frame(
Embeddings(mapped_MalignantCD4T, "ref.umap"),
pseudotime = mapped_MalignantCD4T$predicted.pseudotime,
cell_line = mapped_MalignantCD4T$cell_line,
l2_q = mapped_MalignantCD4T$predicted.predicted.celltype.l2,
type = "Malignant"
)
colnames(query_umap)[1:2] <- c("UMAP_1", "UMAP_2")
# Extend palette to cover any new l2 labels from malignant cells
all_l2_labels <- unique(c(as.character(ref_umap$l2),
as.character(query_umap$l2_q)))
azimuth_l2_colors <- extend_palette(azimuth_l2_colors, all_l2_labels)
# ── Plot 1: Reference grey, malignant coloured by pseudotime ──────────────
p1 <- ggplot() +
geom_point(data = ref_umap,
aes(x = UMAP_1, y = UMAP_2),
color = reference_color, size = 0.4, alpha = 0.6) +
geom_point(data = query_umap,
aes(x = UMAP_1, y = UMAP_2, color = pseudotime),
size = 1.2, alpha = 0.9) +
scale_color_viridis_c(option = "plasma", name = "Pseudotime",
na.value = "grey60") +
theme_classic(base_size = 11) +
theme(plot.title = element_text(hjust = 0.5, face = "bold")) +
ggtitle("Malignant CD4T Projected onto Reference\n(Coloured by transferred pseudotime)")
# ── Plot 2: Reference coloured by Azimuth l2, malignant cells in red ──────
p2 <- ggplot() +
geom_point(data = ref_umap,
aes(x = UMAP_1, y = UMAP_2, color = l2),
size = 0.4, alpha = 0.7) +
scale_color_manual(values = azimuth_l2_colors,
name = "Azimuth l2\n(reference)",
na.value = "grey70") +
geom_point(data = query_umap,
aes(x = UMAP_1, y = UMAP_2),
color = malignant_color, size = 1.0, alpha = 0.8, shape = 16) +
theme_classic(base_size = 11) +
theme(
plot.title = element_text(hjust = 0.5, face = "bold"),
legend.key.size = unit(0.4, "cm"),
legend.text = element_text(size = 8)
) +
ggtitle("Reference Azimuth l2 (colour)\n+ Malignant Cells (red overlay)")
p1 | p2# ── Plot 3: Malignant coloured by transferred Azimuth l2 label ────────────
# azimuth_l2_colors already extended to cover all labels in both ref + query
p3 <- ggplot() +
geom_point(data = ref_umap,
aes(x = UMAP_1, y = UMAP_2),
color = reference_color, size = 0.4, alpha = 0.6) +
geom_point(data = query_umap,
aes(x = UMAP_1, y = UMAP_2, color = l2_q),
size = 1.2, alpha = 0.9) +
scale_color_manual(values = azimuth_l2_colors,
name = "Azimuth l2\n(transferred)",
na.value = "grey70") +
theme_classic(base_size = 11) +
theme(plot.title = element_text(hjust = 0.5, face = "bold")) +
ggtitle("Malignant CD4T — Transferred Azimuth l2 Labels")
# ── Plot 4: Per cell line ─────────────────────────────────────────────────
cell_line_palette <- colorRampPalette(brewer.pal(8, "Dark2"))(
length(unique(query_umap$cell_line))
)
names(cell_line_palette) <- sort(unique(query_umap$cell_line))
p4 <- ggplot() +
geom_point(data = ref_umap,
aes(x = UMAP_1, y = UMAP_2),
color = reference_color, size = 0.4, alpha = 0.5) +
geom_point(data = query_umap,
aes(x = UMAP_1, y = UMAP_2, color = cell_line),
size = 1.0, alpha = 0.9) +
scale_color_manual(values = cell_line_palette, name = "Cell line") +
theme_classic(base_size = 11) +
theme(plot.title = element_text(hjust = 0.5, face = "bold")) +
ggtitle("Malignant CD4T per Cell Line")
p3 | p4# ── Facet by cell line to see line-specific projection patterns ───────────
ggplot() +
geom_point(data = ref_umap,
aes(x = UMAP_1, y = UMAP_2),
color = "grey90", size = 0.3, alpha = 0.5) +
geom_point(data = query_umap,
aes(x = UMAP_1, y = UMAP_2, color = pseudotime),
size = 0.9, alpha = 0.85) +
facet_wrap(~ cell_line, ncol = 4) +
scale_color_viridis_c(option = "plasma", name = "Pseudotime", na.value = "grey60") +
theme_classic(base_size = 10) +
theme(
strip.text = element_text(size = 9, face = "bold"),
strip.background = element_rect(fill = "#E8F4FD"),
plot.title = element_text(hjust = 0.5, face = "bold")
) +
ggtitle("Per Cell Line: Projection onto Reference UMAP (Pseudotime)")# ── State assignment strategy ─────────────────────────────────────────────
# For each malignant cell we have TWO sources of state information:
# 1. predicted.predicted.celltype.l2 → Azimuth l2 label transferred via
# MapQuery anchors — PRIMARY label for all plots and quantification
# 2. predicted.seurat_clusters → reference cluster integer ID —
# SECONDARY, for cross-validation only
#
# Azimuth l2 is used exclusively for colour coding, bar charts, heatmaps,
# and pseudotime plots. It is unbiased, automated, and consistent with the
# analysis framework used throughout this notebook.
# ── State assignment ──────────────────────────────────────────────────────
# PRIMARY label: predicted.predicted.celltype.l2 — Azimuth l2 transferred
# via MapQuery anchors. This is the main state for all plots
# and quantification tables.
# SECONDARY: predicted.seurat_clusters — reference cluster integer ID,
# kept only for cross-reference / sanity checking.
mapped_MalignantCD4T$state_azimuth_l2 <- mapped_MalignantCD4T$predicted.predicted.celltype.l2
mapped_MalignantCD4T$state_ref_cluster <- mapped_MalignantCD4T$predicted.seurat_clusters
mapped_MalignantCD4T$pseudotime_value <- mapped_MalignantCD4T$predicted.pseudotime
# ── Pseudotime bins ────────────────────────────────────────────────────────
# Bin into Early (Tnaive-like), Mid (TCM/TEM-like), Late (Temra-like)
# based on reference pseudotime range tertiles
pt_vals <- reference_integrated$monocle3_pseudotime
pt_vals <- pt_vals[is.finite(pt_vals)]
pt_breaks <- quantile(pt_vals, probs = c(0, 1/3, 2/3, 1))
mapped_MalignantCD4T$pseudotime_bin <- cut(
mapped_MalignantCD4T$pseudotime_value,
breaks = pt_breaks,
labels = c("Early (Tnaive-like)", "Mid (TCM/TEM-like)", "Late (Temra-like)"),
include.lowest = TRUE
)
cat("=== State assignment complete ===\n")
cat("\nAzimuth l2 state (PRIMARY — used for all quantification):\n")
print(table(mapped_MalignantCD4T$state_azimuth_l2))
cat("\nReference cluster (secondary cross-check):\n")
print(table(mapped_MalignantCD4T$state_ref_cluster))
cat("\nPseudotime bin:\n")
print(table(mapped_MalignantCD4T$pseudotime_bin))# ── Table 1: Overall state distribution ───────────────────────────────────
state_summary <- mapped_MalignantCD4T@meta.data %>%
count(state_azimuth_l2, name = "n_cells") %>%
mutate(
pct = round(100 * n_cells / sum(n_cells), 1),
label = paste0(state_azimuth_l2, "\n(", pct, "%)")
) %>%
arrange(desc(n_cells))
cat("=== Overall Malignant CD4T State Distribution ===\n")
print(state_summary)
# ── Table 2: Per cell line ─────────────────────────────────────────────────
state_per_line <- mapped_MalignantCD4T@meta.data %>%
count(cell_line, state_azimuth_l2, name = "n_cells") %>%
group_by(cell_line) %>%
mutate(pct = round(100 * n_cells / sum(n_cells), 1)) %>%
ungroup() %>%
arrange(cell_line, desc(n_cells))
cat("\n=== State Distribution per Cell Line ===\n")
print(state_per_line, n = Inf)
# ── Table 3: Reference cluster cross-check ────────────────────────────────
# Which reference clusters do malignant cells map to? For cross-validation
# against the Azimuth l2 label above — they should be concordant.
cluster_summary <- mapped_MalignantCD4T@meta.data %>%
count(state_ref_cluster, name = "n_cells") %>%
mutate(pct = round(100 * n_cells / sum(n_cells), 1)) %>%
arrange(as.numeric(as.character(state_ref_cluster)))
cat("\n=== Reference Cluster Distribution (cross-check) ===\n")
print(cluster_summary)
# ── Table 4: Pseudotime bin summary ───────────────────────────────────────
pt_bin_summary <- mapped_MalignantCD4T@meta.data %>%
count(pseudotime_bin, name = "n_cells") %>%
mutate(pct = round(100 * n_cells / sum(n_cells), 1))
cat("\n=== Pseudotime Bin Summary ===\n")
print(pt_bin_summary)# ── Plot 1: Overall stacked bar per cell line ─────────────────────────────
p_bar_overall <- ggplot(state_per_line,
aes(x = cell_line, y = pct, fill = state_azimuth_l2)) +
geom_col(position = "stack", color = "white", linewidth = 0.3) +
geom_text(aes(label = ifelse(pct >= 5, paste0(pct, "%"), "")),
position = position_stack(vjust = 0.5),
size = 3, color = "white", fontface = "bold") +
scale_fill_manual(values = azimuth_l2_colors, na.value = "grey70",
name = "Azimuth l2\nstate") +
theme_classic(base_size = 11) +
theme(
axis.text.x = element_text(angle = 40, hjust = 1),
plot.title = element_text(hjust = 0.5, face = "bold"),
legend.key.size = unit(0.5, "cm")
) +
labs(
title = "Malignant CD4T — State Composition per Cell Line\n(Azimuth l2 transferred labels)",
x = "Cell Line", y = "% Cells"
)
# ── Plot 2: Grouped bar — pseudotime bins per cell line ───────────────────
pt_bin_line <- mapped_MalignantCD4T@meta.data %>%
filter(!is.na(pseudotime_bin)) %>%
count(cell_line, pseudotime_bin) %>%
group_by(cell_line) %>%
mutate(pct = round(100 * n / sum(n), 1)) %>%
ungroup()
bin_colors <- c(
"Early (Tnaive-like)" = "#4575B4",
"Mid (TCM/TEM-like)" = "#FEE090",
"Late (Temra-like)" = "#D73027"
)
p_bar_bins <- ggplot(pt_bin_line,
aes(x = cell_line, y = pct, fill = pseudotime_bin)) +
geom_col(position = "stack", color = "white", linewidth = 0.3) +
geom_text(aes(label = ifelse(pct >= 8, paste0(pct, "%"), "")),
position = position_stack(vjust = 0.5),
size = 3, color = "white", fontface = "bold") +
scale_fill_manual(values = bin_colors, name = "Pseudotime\nBin") +
theme_classic(base_size = 11) +
theme(
axis.text.x = element_text(angle = 40, hjust = 1),
plot.title = element_text(hjust = 0.5, face = "bold")
) +
labs(
title = "Malignant CD4T — Pseudotime Bin Distribution per Cell Line",
x = "Cell Line", y = "% Cells"
)
p_bar_overall / p_bar_bins# ── Proportion heatmap: cell line × Azimuth l2 state ─────────────────────
# This is the most compact quantification figure — publication-ready.
heatmap_df <- state_per_line %>%
select(cell_line, state_azimuth_l2, pct) %>%
pivot_wider(names_from = state_azimuth_l2, values_from = pct, values_fill = 0)
heatmap_mat <- as.matrix(heatmap_df[, -1])
rownames(heatmap_mat) <- heatmap_df$cell_line
# Use pheatmap for clean heatmap
library(pheatmap)
pheatmap(
heatmap_mat,
color = colorRampPalette(c("white", "#FEE090", "#D73027"))(50),
display_numbers = TRUE,
number_format = "%.1f%%",
fontsize_number = 8,
fontsize_row = 10,
fontsize_col = 9,
angle_col = 45,
cluster_rows = TRUE,
cluster_cols = TRUE,
main = "% Malignant Cells per State (Azimuth l2)\nClustered by cell line similarity",
border_color = "white"
)# ── Density plot: pseudotime distribution of malignant vs reference ───────
# Reference pseudotime
ref_pt <- data.frame(
pseudotime = reference_integrated$monocle3_pseudotime[
is.finite(reference_integrated$monocle3_pseudotime)
],
source = "Healthy reference"
)
# Malignant pseudotime
mal_pt <- data.frame(
pseudotime = mapped_MalignantCD4T$pseudotime_value[
is.finite(mapped_MalignantCD4T$pseudotime_value)
],
source = "Malignant CD4T"
)
pt_combined <- bind_rows(ref_pt, mal_pt)
p_density <- ggplot(pt_combined, aes(x = pseudotime, fill = source)) +
geom_density(alpha = 0.65, adjust = 1.2) +
scale_fill_manual(
values = c("Healthy reference" = "#4575B4", "Malignant CD4T" = malignant_color),
name = ""
) +
theme_classic(base_size = 12) +
theme(
legend.position = "top",
plot.title = element_text(hjust = 0.5, face = "bold")
) +
labs(
title = "Pseudotime Distribution: Healthy Reference vs Malignant CD4T",
subtitle = "Peak shift indicates differentiation arrest point",
x = "Monocle3 Pseudotime", y = "Density"
)
# ── Per cell line ridge plot ──────────────────────────────────────────────
mal_pt_line <- mapped_MalignantCD4T@meta.data %>%
filter(is.finite(pseudotime_value)) %>%
select(cell_line, pseudotime_value)
p_ridge_lines <- ggplot(mal_pt_line,
aes(x = pseudotime_value, y = cell_line, fill = cell_line)) +
geom_density_ridges(alpha = 0.80, scale = 1.3, rel_min_height = 0.01,
quantile_lines = TRUE, quantiles = 2) +
scale_fill_manual(values = cell_line_palette) +
theme_classic(base_size = 11) +
theme(
legend.position = "none",
plot.title = element_text(hjust = 0.5, face = "bold")
) +
labs(
title = "Pseudotime Distribution per Cell Line",
subtitle = "Vertical line = median",
x = "Pseudotime", y = ""
)
p_density | p_ridge_lines# ── Detailed pseudotime statistics per cell line ──────────────────────────
pt_stats <- mapped_MalignantCD4T@meta.data %>%
filter(is.finite(pseudotime_value)) %>%
group_by(cell_line) %>%
summarise(
n_cells = n(),
mean_pt = round(mean(pseudotime_value), 3),
median_pt = round(median(pseudotime_value), 3),
sd_pt = round(sd(pseudotime_value), 3),
q25_pt = round(quantile(pseudotime_value, 0.25), 3),
q75_pt = round(quantile(pseudotime_value, 0.75), 3),
pct_early = round(100 * mean(pseudotime_value < pt_breaks[2], na.rm = TRUE), 1),
pct_mid = round(100 * mean(pseudotime_value >= pt_breaks[2] &
pseudotime_value < pt_breaks[3], na.rm = TRUE), 1),
pct_late = round(100 * mean(pseudotime_value >= pt_breaks[3], na.rm = TRUE), 1)
) %>%
arrange(median_pt)
cat("=== Pseudotime Statistics per Cell Line ===\n")
print(pt_stats, n = Inf)
# ── Add reference l2 pseudotime for comparison ────────────────────────────
ref_stats <- reference_integrated@meta.data %>%
filter(is.finite(monocle3_pseudotime), !is.na(predicted.celltype.l2)) %>%
group_by(predicted.celltype.l2) %>%
summarise(
n_cells = n(),
mean_pt = round(mean(monocle3_pseudotime), 3),
median_pt = round(median(monocle3_pseudotime), 3)
) %>%
arrange(median_pt)
cat("\n=== Reference Pseudotime per Azimuth l2 State (for comparison) ===\n")
print(ref_stats)# ── Panel 1: Reference UMAP coloured by Azimuth l2 ───────────────────────
p_ref <- DimPlot(
reference_integrated, group.by = "predicted.celltype.l2",
reduction = "umap", label = TRUE, repel = TRUE,
label.size = 3, pt.size = 0.4
) +
scale_color_manual(values = azimuth_l2_colors, na.value = "grey70") +
ggtitle("A. Reference CD4+ T Cells\n(Azimuth l2 — Proliferating removed)") +
theme(
plot.title = element_text(hjust = 0.5, size = 11, face = "bold"),
legend.position = "none"
) + NoLegend()
# ── Panel 2: Reference pseudotime ────────────────────────────────────────
p_pt_ref <- FeaturePlot(
reference_integrated, features = "monocle3_pseudotime",
reduction = "umap", cols = c("lightblue","yellow","red"),
pt.size = 0.4
) +
ggtitle("B. Reference Pseudotime\n(Naive → Temra axis)") +
theme(plot.title = element_text(hjust = 0.5, size = 11, face = "bold"))
# ── Panel 3: Malignant projection coloured by pseudotime ──────────────────
p_proj <- ggplot() +
geom_point(data = ref_umap, aes(x = UMAP_1, y = UMAP_2),
color = "grey88", size = 0.3, alpha = 0.6) +
geom_point(data = query_umap, aes(x = UMAP_1, y = UMAP_2, color = pseudotime),
size = 1.0, alpha = 0.85) +
scale_color_viridis_c(option = "plasma", name = "Pseudotime") +
theme_classic(base_size = 10) +
theme(plot.title = element_text(hjust = 0.5, size = 11, face = "bold")) +
ggtitle("C. Malignant CD4T Projected\n(pseudotime transferred)")
# ── Panel 4: State proportion bar chart ──────────────────────────────────
top_states <- state_summary %>% head(8) %>% pull(state_azimuth_l2)
state_plot_df <- state_per_line %>%
filter(state_azimuth_l2 %in% top_states)
p_quant <- ggplot(state_plot_df,
aes(x = reorder(cell_line, -pct), y = pct, fill = state_azimuth_l2)) +
geom_col(position = "stack", color = "white", linewidth = 0.3) +
scale_fill_manual(values = azimuth_l2_colors, na.value = "grey70", name = "Azimuth l2") +
theme_classic(base_size = 10) +
theme(
axis.text.x = element_text(angle = 40, hjust = 1, size = 9),
plot.title = element_text(hjust = 0.5, size = 11, face = "bold"),
legend.key.size = unit(0.4, "cm"),
legend.text = element_text(size = 8)
) +
labs(
title = "D. State Proportions per Cell Line\n(Azimuth l2 transferred)",
x = "", y = "% Cells"
)
(p_ref | p_pt_ref) / (p_proj | p_quant) +
plot_annotation(
title = "Malignant CD4+ T Cell Differentiation State Mapping\n(Sézary Syndrome — Monocle3 Reference Trajectory)",
theme = theme(plot.title = element_text(hjust = 0.5, size = 14, face = "bold"))
)# ── Dot/boxplot: pseudotime of malignant cells grouped by transferred state ─
mal_state_pt <- mapped_MalignantCD4T@meta.data %>%
filter(is.finite(pseudotime_value), !is.na(state_azimuth_l2)) %>%
mutate(state = state_azimuth_l2)
state_order_mal <- mal_state_pt %>%
group_by(state) %>%
summarise(med = median(pseudotime_value)) %>%
arrange(med) %>%
pull(state)
mal_state_pt$state <- factor(mal_state_pt$state, levels = state_order_mal)
ggplot(mal_state_pt, aes(x = state, y = pseudotime_value, fill = state)) +
geom_violin(scale = "width", alpha = 0.8, trim = TRUE) +
geom_boxplot(width = 0.1, fill = "white", outlier.size = 0.3) +
scale_fill_manual(values = azimuth_l2_colors, na.value = "grey70") +
theme_classic(base_size = 11) +
theme(
axis.text.x = element_text(angle = 40, hjust = 1, size = 9),
legend.position = "none",
plot.title = element_text(hjust = 0.5, face = "bold")
) +
labs(
title = "Pseudotime of Malignant CD4T per Transferred State\n(Azimuth l2 labels)",
x = "Transferred Differentiation State",
y = "Transferred Pseudotime"
)# ── Save mapped malignant object ──────────────────────────────────────────
out_dir <- "results/MalignantCD4T_Monocle3_projection"
if (!dir.exists(out_dir)) dir.create(out_dir, recursive = TRUE)
saveRDS(
mapped_MalignantCD4T,
file.path(out_dir, "MalignantCD4T_mapped_monocle3_pseudotime_noProlif_ref.rds")
)
# ── Save quantification tables as CSV ─────────────────────────────────────
write.csv(
state_summary,
file.path(out_dir, "state_distribution_overall.csv"),
row.names = FALSE
)
write.csv(
state_per_line,
file.path(out_dir, "state_distribution_per_cellline.csv"),
row.names = FALSE
)
write.csv(
pt_stats,
file.path(out_dir, "pseudotime_stats_per_cellline.csv"),
row.names = FALSE
)
# ── Save full metadata for downstream analysis ─────────────────────────────
write.csv(
mapped_MalignantCD4T@meta.data,
file.path(out_dir, "MalignantCD4T_full_metadata_with_projection.csv"),
row.names = TRUE
)
cat("✅ All outputs saved to:", out_dir, "\n")
cat("\nFiles saved:\n")
print(list.files(out_dir))