1 Overview

What this script does — nothing is re-computed from scratch.

  • Loads the already-saved reference trajectory objects from Step 1
  • Loads Sézary cell lines L1–L7
  • Runs per-cell-line SCT normalisation before FindTransferAnchors → each line gets its own SCT model → more biologically meaningful anchors
  • Projects each line individually onto the frozen reference UMAP
  • Transfers both MST pseudotime and Monocle3 pseudotime from reference
  • Transfers milestone labels (M00–M06) so positions can be compared with the 7-milestone structure confirmed in Step 1
  • Produces visualisations identical in style to previous results so you can directly compare this run (7 milestones) with the previous run

Reference milestone structure (already confirmed in Step 1):

M00 = CD4 Naive      (2037)
M01 = CD4 TCM early  (4717)
M02 = CD4 TCM late   (4350)
M03 = CD4 TEM        (145)
M04 = CD4 Temra/CTL  (10)
M05 = Treg resting   (146)
M06 = Treg effector  (61)

2 Setup

suppressPackageStartupMessages({
  library(Seurat)
  library(SeuratWrappers)
  library(monocle3)
  library(igraph)
  library(ggplot2)
  library(ggrepel)
  library(dplyr)
  library(tidyr)
  library(patchwork)
  library(viridis)
  library(scales)
  library(RColorBrewer)
  library(knitr)
  library(kableExtra)
})

# ── Must match Step 1 exactly ──────────────────────────────────────────────
STATE_COLORS <- c(
  "CD4 Naive"     = "#4472C4",
  "CD4 TCM"       = "#70AD47",
  "CD4 TEM"       = "#ED7D31",
  "CD4 Temra/CTL" = "#C00000",
  "Treg"          = "#7030A0"
)

CELL_LINE_COLORS <- setNames(
  colorRampPalette(brewer.pal(8,"Dark2"))(7),
  paste0("L", 1:7)
)

MILESTONE_COLORS <- setNames(
  c("#4472C4",            # M00 Naive
    "#70AD47","#A9D18E",  # M01/M02 TCM early/late
    "#ED7D31",            # M03 TEM
    "#C00000",            # M04 Temra
    "#7030A0","#B4A0D4"), # M05/M06 Treg
  sprintf("M%02d", 0:6)
)

MILESTONE_LABELS <- c(
  M00 = "M00 Naive",
  M01 = "M01 TCM(early)",
  M02 = "M02 TCM(late)",
  M03 = "M03 TEM",
  M04 = "M04 Temra",
  M05 = "M05 Treg(rest)",
  M06 = "M06 Treg(eff)"
)

STATE_COL  <- "predicted.celltype.l2"
UMAP_NAME  <- "umap"
CELL_LINES <- paste0("L", 1:7)

MILESTONE_ORDER <- sprintf("M%02d", 0:6)

cat("✓ Setup complete\n")
✓ Setup complete

3 Load Reference Objects (Step 1 outputs — not re-run)

# ── Load the reference Seurat object with frozen UMAP model ───────────────
cd4_ref <- readRDS("../../1-Custom_MST/Objects/cd4_ref_dual_trajectory.rds")

# ── Load MST graph and centroids ──────────────────────────────────────────
mst_graph           <- readRDS("../../1-Custom_MST/Objects/mst_graph.rds")
milestone_centroids <- readRDS("../../1-Custom_MST/Objects/milestone_centroids.rds")

# ── Validate milestone structure ──────────────────────────────────────────
expected_ms <- sprintf("M%02d", 0:6)
actual_ms   <- sort(unique(na.omit(cd4_ref@meta.data$milestone)))
stopifnot(
  "Milestone mismatch — re-run Step 1 first" = identical(actual_ms, expected_ms),
  "mst_pseudotime_norm missing"              = "mst_pseudotime_norm" %in% colnames(cd4_ref@meta.data),
  "monocle3_pseudotime_norm missing"         = "monocle3_pseudotime_norm" %in% colnames(cd4_ref@meta.data),
  "UMAP model missing from cd4_ref"          = !is.null(cd4_ref@reductions[[UMAP_NAME]]@misc$model)
)

cat("✅ Reference objects loaded and validated\n")
✅ Reference objects loaded and validated
cat(sprintf("   %d reference cells | %d milestones\n", ncol(cd4_ref), length(actual_ms)))
   11466 reference cells | 7 milestones
cat(sprintf("   MST: %d nodes, %d edges\n", vcount(mst_graph), ecount(mst_graph)))
   MST: 7 nodes, 6 edges
cat("\nMilestone structure from Step 1:\n")

Milestone structure from Step 1:
print(milestone_centroids[, c("milestone","state","n_cells")])

# ── Build reference UMAP data frame for background plotting ───────────────
ref_umap_df           <- as.data.frame(Embeddings(cd4_ref, UMAP_NAME))
colnames(ref_umap_df) <- c("UMAP_1","UMAP_2")
ref_umap_df$state     <- cd4_ref@meta.data[[STATE_COL]]
ref_umap_df$milestone <- cd4_ref@meta.data$milestone
ref_umap_df$mst_pt    <- cd4_ref@meta.data$mst_pseudotime_norm
ref_umap_df$m3_pt     <- cd4_ref@meta.data$monocle3_pseudotime_norm

# ── MST edge data frame for overlay ───────────────────────────────────────
mst_edges   <- as_edgelist(mst_graph)
mst_edge_df <- do.call(rbind, lapply(seq_len(nrow(mst_edges)), function(i) {
  m1 <- mst_edges[i,1]; m2 <- mst_edges[i,2]
  r1 <- milestone_centroids[milestone_centroids$milestone == m1, ]
  r2 <- milestone_centroids[milestone_centroids$milestone == m2, ]
  data.frame(x1=r1$UMAP_1, y1=r1$UMAP_2, x2=r2$UMAP_1, y2=r2$UMAP_2)
}))

4 Load Sézary Cell Lines

all_obj <- readRDS("/home/nabbasi/apollo_home/1-Seurat_RDS_OBJECT_FINAL/All_samples_Merged_with_Renamed_Clusters_Cell_state-03-12-2025.rds.rds")

# ── cell_line column confirmed present (40,695 cells, L1–L7) ─────────────
all_obj$cell_line <- as.character(all_obj$cell_line)
sezary_obj <- subset(all_obj, subset = cell_line %in% CELL_LINES)
rm(all_obj); gc()
             used    (Mb) gc trigger    (Mb)   max used    (Mb)
Ncells    9046606   483.2   16164308   863.3   16164308   863.3
Vcells 1394581656 10639.9 3064212682 23378.1 2635168938 20104.8
cat(sprintf("Sézary cells loaded: %d total\n", ncol(sezary_obj)))
Sézary cells loaded: 40695 total
cat("Per cell line:\n")
Per cell line:
print(table(sezary_obj$cell_line))

  L1   L2   L3   L4   L5   L6   L7 
5825 5935 6428 6006 6022 5148 5331 
DefaultAssay(sezary_obj) <- "RNA"

# Shared genes with reference (confirmed ~36,601)
shared_genes <- intersect(rownames(sezary_obj), rownames(cd4_ref))
cat(sprintf("\nShared genes (query ∩ reference): %d\n", length(shared_genes)))

Shared genes (query ∩ reference): 36601
stopifnot("Fewer than 2000 shared genes — check objects" = length(shared_genes) >= 2000)

# ── Check percent.mt exists — needed for SCTransform vars.to.regress ──────
# If percent.mt is absent, SCT will run without regressing it (see per-line chunk)
HAS_PCT_MT <- "percent.mt" %in% colnames(sezary_obj@meta.data)
cat(sprintf("\npercent.mt in metadata: %s\n", HAS_PCT_MT))

percent.mt in metadata: TRUE

5 Per-Cell-Line SCT + Projection

Rationale for per-cell-line SCT:
Each Sézary cell line has its own library size distribution and technical variation. Running SCTransform on each line independently fits a line-specific regression model, which produces better-calibrated residuals for anchor finding. This gives FindTransferAnchors more reliable anchors compared to normalising all lines together, at the cost of one SCT run per line (~30–60 s each).

# ── Step 7: Summary ───────────────────────────────────────────────────────
cat("\nPredicted state distribution:\n")

Predicted state distribution:
print(table(sezary_obj$predicted.state))

CD4 TCM CD4 TEM    Treg 
  40628      66       1 
cat("\nPredicted milestone distribution:\n")

Predicted milestone distribution:
print(table(sezary_obj$predicted.milestone))

  M01   M02   M03   M05 
 1713 38893    86     3 
cat("\nMST pseudotime range:", round(range(sezary_obj$mst_pseudotime_norm, na.rm=TRUE), 1), "\n")

MST pseudotime range: 8.3 99.6 
cat("\n✅ Projection complete\n")

✅ Projection complete
cat(sprintf("✅ Anchors found: %d anchor pairs\n", nrow(anchors@anchors)))
✅ Anchors found: 8661 anchor pairs
cat(sprintf("Cells projected: %d\n", ncol(sezary_obj)))
Cells projected: 40695
cat(sprintf("Mean prediction score: %.3f (>0.5 = reliable, >0.7 = confident)\n",
            mean(sezary_obj$predicted.state.score, na.rm=TRUE)))
Mean prediction score: 0.984 (>0.5 = reliable, >0.7 = confident)
cat(sprintf("Cells with score >0.5: %d (%.1f%%)\n",
            sum(sezary_obj$predicted.state.score > 0.5, na.rm=TRUE),
            100*mean(sezary_obj$predicted.state.score > 0.5, na.rm=TRUE)))
Cells with score >0.5: 40643 (99.9%)
cat(sprintf("Cells with score >0.7: %d (%.1f%%)\n",
            sum(sezary_obj$predicted.state.score > 0.7, na.rm=TRUE),
            100*mean(sezary_obj$predicted.state.score > 0.7, na.rm=TRUE)))
Cells with score >0.7: 40207 (98.8%)
cat(sprintf("Cells with score >0.5: %d (%.1f%%)\n",
            sum(sezary_obj$predicted.milestone.score > 0.5, na.rm=TRUE),
            100*mean(sezary_obj$predicted.milestone.score > 0.5, na.rm=TRUE)))
Cells with score >0.5: 40388 (99.2%)
cat(sprintf("Cells with score >0.7: %d (%.1f%%)\n",
            sum(sezary_obj$predicted.milestone.score > 0.7, na.rm=TRUE),
            100*mean(sezary_obj$predicted.milestone.score > 0.7, na.rm=TRUE)))
Cells with score >0.7: 34183 (84.0%)
cat(sprintf("Total cells: %d\n", nrow(query_umap_df)))
Total cells: 40695
cat("Per cell line:\n")
Per cell line:
print(table(query_umap_df$cell_line))

  L1   L2   L3   L4   L5   L6   L7 
5825 5935 6428 6006 6022 5148 5331 
cat("\nMilestone distribution:\n")

Milestone distribution:
print(table(query_umap_df$predicted_milestone, useNA="ifany"))

  M01   M02   M03   M05 
 1713 38893    86     3 

transfer_summary <- query_umap_df %>%
  group_by(cell_line) %>%
  summarise(
    n_cells       = n(),
    mst_pt_mean   = round(mean(mst_pt, na.rm=TRUE), 1),
    mst_pt_med    = round(median(mst_pt, na.rm=TRUE), 1),
    m3_pt_mean    = round(mean(m3_pt, na.rm=TRUE), 1),
    m3_pt_med     = round(median(m3_pt, na.rm=TRUE), 1),
    top_milestone = names(sort(table(predicted_milestone), decreasing=TRUE))[1],
    .groups       = "drop"
  )

ms_wide <- query_umap_df %>%
  filter(!is.na(predicted_milestone)) %>%
  count(cell_line, predicted_milestone) %>%
  group_by(cell_line) %>%
  mutate(pct = round(100 * n / sum(n), 1)) %>%
  select(cell_line, predicted_milestone, pct) %>%
  pivot_wider(names_from=predicted_milestone, values_from=pct, values_fill=0)

for (m in MILESTONE_ORDER) {
  if (!m %in% colnames(ms_wide)) ms_wide[[m]] <- 0
}
ms_wide      <- ms_wide[, c("cell_line", MILESTONE_ORDER)]
full_summary <- left_join(transfer_summary, ms_wide, by="cell_line")

kable(full_summary,
      caption = "Sézary projection summary — pseudotime and milestone composition",
      digits  = 1) %>%
  kable_styling(bootstrap_options=c("striped","hover","condensed"),
                full_width=FALSE) %>%
  add_header_above(c(" "=ncol(transfer_summary),
                     "% cells per milestone"=length(MILESTONE_ORDER)))
Sézary projection summary — pseudotime and milestone composition
% cells per milestone
cell_line n_cells mst_pt_mean mst_pt_med m3_pt_mean m3_pt_med top_milestone M00 M01 M02 M03 M04 M05 M06
L1 5825 57.5 55.7 68.6 66.1 M02 0 6.4 92.1 1.5 0 0.1 0
L2 5935 52.7 54.5 57.2 58.9 M02 0 4.7 95.3 0.0 0 0.0 0
L3 6428 52.1 53.5 61.0 60.7 M02 0 6.9 93.1 0.0 0 0.0 0
L4 6006 56.7 56.7 63.7 60.6 M02 0 1.9 98.1 0.0 0 0.0 0
L5 6022 51.8 52.8 57.4 58.4 M02 0 5.5 94.5 0.0 0 0.0 0
L6 5148 58.3 57.1 62.8 60.7 M02 0 2.1 97.9 0.0 0 0.0 0
L7 5331 55.5 55.8 62.8 60.0 M02 0 1.3 98.7 0.0 0 0.0 0

6 Visualisations

6.1 Reference UMAP — milestones

p_ref_ms <- ggplot() +
  geom_point(data = ref_umap_df[sample(nrow(ref_umap_df)),],
             aes(x=UMAP_1, y=UMAP_2, colour=state), size=.4, alpha=.5) +
  geom_segment(data=mst_edge_df,
               aes(x=x1,y=y1,xend=x2,yend=y2),
               colour="black", linewidth=1, alpha=.85, lineend="round") +
  geom_point(data=milestone_centroids,
             aes(x=UMAP_1,y=UMAP_2,fill=state),
             shape=21, size=8, colour="white", stroke=2) +
  geom_text_repel(data=milestone_centroids,
                  aes(x=UMAP_1,y=UMAP_2,
                      label=paste0(milestone,"\n(",MILESTONE_LABELS[milestone],")")),
                  size=2.8, fontface="bold",
                  bg.color="grey80", bg.r=.15, max.overlaps=20) +
  scale_colour_manual(values=STATE_COLORS, name="State") +
  scale_fill_manual(values=STATE_COLORS, guide="none") +
  theme_classic() +
  labs(x="UMAP-1", y="UMAP-2",
       title="Reference CD4 T cells — 7-milestone structure",
       subtitle="M00=Naive | M01/M02=TCM | M03=TEM | M04=Temra | M05/M06=Treg") +
  theme(plot.title=element_text(size=13,face="bold"))

print(p_ref_ms)

p_ref_ms <- ggplot() +
  geom_point(data = ref_umap_df[sample(nrow(ref_umap_df)),],
             aes(x=UMAP_1, y=UMAP_2, colour=state), size=.4, alpha=.5) +
  geom_segment(data=mst_edge_df,
               aes(x=x1,y=y1,xend=x2,yend=y2),
               colour="black", linewidth=1, alpha=.85, lineend="round") +
  geom_point(data=milestone_centroids,
             aes(x=UMAP_1,y=UMAP_2,fill=state),
             shape=21, size=8, colour="white", stroke=2) +
  geom_text_repel(data=milestone_centroids,
                  aes(x=UMAP_1,y=UMAP_2,
                      label=paste0(milestone,"\n(",MILESTONE_LABELS[milestone],")")),
                  size=2.8, fontface="bold",
                  bg.color="grey80", bg.r=.15, max.overlaps=20) +
  scale_colour_manual(values=STATE_COLORS, name="State") +
  scale_fill_manual(values=STATE_COLORS, guide="none") +
  theme_classic() +
  labs(x="UMAP-1", y="UMAP-2",
       title="Reference CD4 T cells — 7-milestone structure",
       subtitle="M00=Naive | M01/M02=TCM | M03=TEM | M04=Temra | M05/M06=Treg") +
  theme(plot.title=element_text(size=13,face="bold"))

print(p_ref_ms)

6.2 All Lines — Position on Reference UMAP

p_all_lines <- ggplot() +
  # Reference background (grey)
  geom_point(data=ref_umap_df,
             aes(x=UMAP_1,y=UMAP_2), colour="grey68", size=.25, alpha=.6) +
  # MST overlay
  geom_segment(data=mst_edge_df,
               aes(x=x1,y=y1,xend=x2,yend=y2),
               colour="grey30", linewidth=.7, alpha=.7, lineend="round") +
  # Query cells coloured by cell line
  geom_point(data=query_umap_df[sample(nrow(query_umap_df)),],
             aes(x=UMAP_1,y=UMAP_2,colour=cell_line), size=1.2, alpha=.75) +
  scale_colour_manual(values=CELL_LINE_COLORS, name="Cell line") +
  # Milestone labels
  geom_text_repel(data=milestone_centroids,
                  aes(x=UMAP_1,y=UMAP_2,label=milestone),
                  size=3, fontface="bold", colour="grey20",
                  bg.color="white", bg.r=.12, max.overlaps=15) +
  theme_classic() +
  labs(x="UMAP-1",y="UMAP-2",
       title="Sézary L1–L7 projected onto reference UMAP",
       subtitle="Grey = reference | Coloured = query cell lines") +
  theme(plot.title=element_text(size=13,face="bold")) +
  facet_wrap(~cell_line, ncol=4)

print(p_all_lines)

6.3 All Lines — MST Pseudotime on Reference UMAP

6.4 Per-Line MST Pseudotime Violin

# Milestone median lines for reference
ms_medians <- cd4_ref@meta.data %>%
  group_by(milestone) %>%
  summarise(med = median(mst_pseudotime_norm, na.rm=TRUE), .groups="drop") %>%
  filter(milestone %in% c("M00","M01","M02","M03","M04","M05","M06"))

violin_df <- data.frame(
  cell_line = sezary_obj$cell_line,
  mst_pt    = sezary_obj$mst_pseudotime_norm,
  m3_pt     = sezary_obj$monocle3_pseudotime_norm,
  milestone = sezary_obj$predicted.milestone
) %>% filter(!is.na(mst_pt))

p_violin_mst <- ggplot(violin_df, aes(x=cell_line, y=mst_pt, fill=cell_line)) +
  geom_violin(scale="width", trim=FALSE, alpha=.85) +
  geom_boxplot(width=.08, fill="white", outlier.size=.3, outlier.alpha=.4) +
  # Reference milestone median lines
  geom_hline(data=ms_medians,
             aes(yintercept=med, linetype=milestone), colour="grey40", linewidth=.5) +
  geom_text(data=ms_medians,
          aes(x=7.6, y=med, label=milestone),
          size=2.8, hjust=0, colour="grey25",
          inherit.aes=FALSE) +
  scale_fill_manual(values=CELL_LINE_COLORS, guide="none") +
  scale_linetype_manual(values=rep("dashed",7), guide="none") +
  coord_cartesian(xlim=c(0.5,8.5)) +
  theme_classic() +
  labs(x="Cell line", y="MST pseudotime (0–100)",
       title="MST pseudotime per Sézary cell line",
       subtitle="Dashed lines = reference milestone medians (M00–M06)") +
  theme(plot.title=element_text(size=13,face="bold"))

print(p_violin_mst)

6.5 Per-Line Monocle3 Pseudotime Violin

# Reference milestone medians — Monocle3
ms_medians_m3 <- cd4_ref@meta.data %>%
  group_by(milestone) %>%
  summarise(med = median(monocle3_pseudotime_norm, na.rm=TRUE), .groups="drop")

p_violin_m3 <- ggplot(violin_df, aes(x=cell_line, y=m3_pt, fill=cell_line)) +
  geom_violin(scale="width", trim=FALSE, alpha=.85) +
  geom_boxplot(width=.08, fill="white", outlier.size=.3, outlier.alpha=.4) +
  geom_hline(data=ms_medians_m3,
             aes(yintercept=med, linetype=milestone), colour="grey40", linewidth=.5) +
  geom_text(data=ms_medians,
          aes(x=7.6, y=med, label=milestone),
          size=2.8, hjust=0, colour="grey25",
          inherit.aes=FALSE) +
  scale_fill_manual(values=CELL_LINE_COLORS, guide="none") +
  scale_linetype_manual(values=rep("dashed",7), guide="none") +
  coord_cartesian(xlim=c(0.5,8.5)) +
  theme_classic() +
  labs(x="Cell line", y="Monocle3 pseudotime (0–100)",
       title="Monocle3 pseudotime per Sézary cell line",
       subtitle="Dashed lines = reference milestone medians (M00–M06)") +
  theme(plot.title=element_text(size=13,face="bold"))

print(p_violin_m3)

6.6 Milestone Composition per Cell Line

ms_comp <- violin_df %>%
  count(cell_line, milestone) %>%
  group_by(cell_line) %>%
  mutate(pct = 100 * n / sum(n)) %>%
  ungroup() %>%
  filter(!is.na(milestone)) %>%
  mutate(milestone = factor(milestone, levels=sprintf("M%02d",0:6)))

p_ms_bar <- ggplot(ms_comp, aes(x=cell_line, y=pct, fill=milestone)) +
  geom_col(width=.75) +
  scale_fill_manual(values=MILESTONE_COLORS,
                    labels=MILESTONE_LABELS, name="Milestone") +
  theme_classic() +
  labs(x="Cell line", y="% cells",
       title="Milestone composition per Sézary cell line",
       subtitle="Based on transferred milestone labels from reference") +
  theme(plot.title=element_text(size=13,face="bold"),
        legend.text=element_text(size=9))

print(p_ms_bar)


# Milestone labels with biological annotation
MILESTONE_LABELS_FULL <- c(
  "M00" = "M00\nNaive",
  "M01" = "M01\nTCM early",
  "M02" = "M02\nTCM late\n(branch)",
  "M03" = "M03\nTEM",
  "M04" = "M04\nTemra",
  "M05" = "M05\nTreg(rest)",
  "M06" = "M06\nTreg(eff)"
)

# Compute milestone distribution per cell line
ms_per_line <- query_umap_df %>%
  filter(!is.na(predicted_milestone)) %>%
  count(cell_line, predicted_milestone) %>%
  group_by(cell_line) %>%
  mutate(pct = round(100 * n / sum(n), 1)) %>%
  ungroup() %>%
  mutate(predicted_milestone = factor(predicted_milestone,
                                      levels = MILESTONE_ORDER))

# ── Heatmap tile plot: cell line × milestone ──────────────────────────────
p_ms_heatmap <- ggplot(ms_per_line,
                        aes(x=predicted_milestone, y=cell_line, fill=pct)) +
  geom_tile(colour="white", linewidth=.5) +
  geom_text(aes(label=ifelse(pct >= 2, paste0(pct, "%"), "")),
            size=3.2, fontface="bold",
            colour=ifelse(ms_per_line$pct > 40, "white", "grey20")) +
  scale_fill_gradientn(
    colours = c("#F7FBFF","#DEEBF7","#9ECAE1","#4292C6","#08519C","#08306B"),
    name    = "% of cell\nline cells",
    limits  = c(0, 100)
  ) +
  scale_x_discrete(labels=MILESTONE_LABELS_FULL) +
  theme_classic() +
  labs(x="Reference milestone", y="Sézary cell line",
       title="Milestone distribution per Sézary cell line",
       subtitle="Darker = more cells | M02 = branch point between effector and regulatory arms") +
  theme(plot.title   = element_text(size=13, face="bold"),
        plot.subtitle= element_text(size=9,  colour="grey40"),
        axis.text.x  = element_text(size=9),
        axis.text.y  = element_text(size=11, face="bold"),
        legend.title = element_text(size=9))

print(p_ms_heatmap)


ggsave("Figures/fig_milestone_heatmap.pdf", p_ms_heatmap, width=10, height=5)

# Colours per milestone — derived from state colours
ms_state_colors <- setNames(
  STATE_COLORS[milestone_centroids$state[
    match(MILESTONE_ORDER, milestone_centroids$milestone)]],
  MILESTONE_ORDER
)

p_ms_stack <- ggplot(ms_per_line,
                      aes(x=cell_line, y=pct, fill=predicted_milestone)) +
  geom_col(width=.75, colour="white", linewidth=.3) +
  geom_text(aes(label=ifelse(pct >= 5,
                             paste0(predicted_milestone, "\n", pct, "%"), "")),
            position=position_stack(vjust=0.5),
            size=2.8, colour="white", fontface="bold") +
  scale_fill_manual(values=ms_state_colors,
                    labels=MILESTONE_LABELS_FULL,
                    name="Milestone") +
  scale_y_continuous(expand=c(0,0), limits=c(0,105)) +
  theme_classic() +
  labs(x="Cell line", y="% of cells",
       title="Milestone composition per Sézary cell line",
       subtitle="Colour = CD4 state | Label shown if ≥ 5%") +
  theme(plot.title  = element_text(size=13, face="bold"),
        plot.subtitle = element_text(size=9, colour="grey40"),
        axis.text.x = element_text(size=12, face="bold"))

print(p_ms_stack)


ggsave("Figures/fig_milestone_stack.pdf", p_ms_stack, width=8, height=5)

Note: this chunk must come after the milestone-heatmap chunk since it uses ms_per_line and MILESTONE_LABELS_FULL defined there.

6.7 MST vs Monocle3 Pseudotime per Line

# Scatter: MST vs Monocle3 per cell line — method agreement check
p_pt_cor <- ggplot(violin_df[sample(nrow(violin_df)),],
                    aes(x=mst_pt, y=m3_pt, colour=milestone)) +
  geom_point(size=.8, alpha=.6) +
  geom_abline(slope=1, intercept=0, linetype="dashed", colour="grey40") +
  scale_colour_manual(values=MILESTONE_COLORS,
                      labels=MILESTONE_LABELS, name="Milestone") +
  facet_wrap(~cell_line, ncol=4) +
  theme_classic() +
  labs(x="MST pseudotime (0–100)", y="Monocle3 pseudotime (0–100)",
       title="MST vs Monocle3 pseudotime per Sézary cell line",
       subtitle="Dashed = perfect agreement") +
  theme(plot.title=element_text(size=13,face="bold"),
        strip.text=element_text(face="bold"))

# Compute per-line Spearman rho
pt_cors <- violin_df %>%
  group_by(cell_line) %>%
  summarise(
    spearman = round(cor(mst_pt, m3_pt, method="spearman", use="complete.obs"), 3),
    n        = n(),
    .groups  = "drop"
  )

print(p_pt_cor)

cat("\nPer-line MST vs Monocle3 Spearman ρ:\n")

Per-line MST vs Monocle3 Spearman ρ:
print(pt_cors)
NA

6.8 Per-Cell-Line Individual UMAPs

# Individual UMAP + pseudotime for each line — same style as previous results
pt_gradient <- scale_colour_gradientn(
  colours = c("#0D0887","#6A00A8","#B12A90","#E16462","#FCA636","#F0F921"),
  name    = "MST pseudotime\n(0–100)", limits = c(0,100), na.value = "grey80"
)

for (ln in CELL_LINES) {
  q_df <- query_umap_df %>% filter(cell_line == ln)

  p <- ggplot() +
    geom_point(data=ref_umap_df, aes(x=UMAP_1,y=UMAP_2),
               colour="grey78", size=.25, alpha=.5) +
    geom_segment(data=mst_edge_df,
                 aes(x=x1,y=y1,xend=x2,yend=y2),
                 colour="grey30", linewidth=.7, alpha=.7, lineend="round") +
    geom_point(data=q_df[sample(nrow(q_df)),],
               aes(x=UMAP_1,y=UMAP_2,colour=mst_pt), size=1.5, alpha=.85) +
    pt_gradient +
    # Milestone labels
    geom_text_repel(data=milestone_centroids,
                    aes(x=UMAP_1,y=UMAP_2,label=milestone),
                    size=2.8, fontface="bold", colour="grey20",
                    bg.color="white", bg.r=.1, max.overlaps=15) +
    theme_classic() +
    labs(x="UMAP-1",y="UMAP-2",
         title=sprintf("%s — MST pseudotime on reference UMAP", ln),
         subtitle=sprintf("n=%d cells | MST median=%.1f",
                 nrow(q_df),
                 median(q_df$mst_pt, na.rm=TRUE)))  +
    theme(plot.title=element_text(size=12,face="bold"))

  print(p)
}


7 Pseudotime Summary Table

# Full per-line summary including milestone breakdown
ms_wide <- ms_comp %>%
  select(cell_line, milestone, pct) %>%
  pivot_wider(names_from=milestone, values_from=pct, values_fill=0) %>%
  mutate(across(where(is.numeric), ~round(., 1)))

full_summary <- transfer_summary %>%
  left_join(ms_wide, by="cell_line")

kable(full_summary,
      caption = "Sézary cell line projection summary — pseudotime and milestone composition (%)",
      digits  = 1) %>%
  kable_styling(bootstrap_options=c("striped","hover","condensed"), full_width=FALSE) %>%
  add_header_above(c(" "=ncol(transfer_summary),
                     "% cells per milestone"=ncol(ms_wide)-1))
Sézary cell line projection summary — pseudotime and milestone composition (%)
% cells per milestone
cell_line n_cells mst_pt_mean mst_pt_med m3_pt_mean m3_pt_med top_milestone M01 M02 M03 M05
L1 5825 57.5 55.7 68.6 66.1 M02 6.4 92.1 1.5 0.1
L2 5935 52.7 54.5 57.2 58.9 M02 4.7 95.3 0.0 0.0
L3 6428 52.1 53.5 61.0 60.7 M02 6.9 93.1 0.0 0.0
L4 6006 56.7 56.7 63.7 60.6 M02 1.9 98.1 0.0 0.0
L5 6022 51.8 52.8 57.4 58.4 M02 5.5 94.5 0.0 0.0
L6 5148 58.3 57.1 62.8 60.7 M02 2.1 97.9 0.0 0.0
L7 5331 55.5 55.8 62.8 60.0 M02 1.3 98.7 0.0 0.0

8 Save Outputs

dir.create("Objects", showWarnings=FALSE)
dir.create("Figures", showWarnings=FALSE)
dir.create("Tables",  showWarnings=FALSE)

SaveSeuratRds(sezary_obj, "Objects/sezary_projected.rds")

write.csv(full_summary, "Tables/projection_summary_per_line.csv",   row.names=FALSE)
write.csv(ms_comp,      "Tables/milestone_composition_per_line.csv", row.names=FALSE)

ggsave("Figures/fig_all_lines_umap.pdf",        p_all_lines,  width=14, height=10)
ggsave("Figures/fig_violin_mst.pdf",            p_violin_mst, width=10, height=5)
ggsave("Figures/fig_violin_m3.pdf",             p_violin_m3,  width=10, height=5)
ggsave("Figures/fig_milestone_composition.pdf", p_ms_bar,     width=11, height=5)
ggsave("Figures/fig_pt_correlation.pdf",        p_pt_cor,     width=12, height=8)

cat("✅ All outputs saved\n")

sessionInfo()
R version 4.5.2 (2025-10-31)
Platform: x86_64-redhat-linux-gnu
Running under: Rocky Linux 9.7 (Blue Onyx)

Matrix products: default
BLAS/LAPACK: FlexiBLAS OPENBLAS-OPENMP;  LAPACK version 3.9.0

locale:
 [1] LC_CTYPE=C.UTF-8       LC_NUMERIC=C           LC_TIME=C.UTF-8        LC_COLLATE=C.UTF-8     LC_MONETARY=C.UTF-8    LC_MESSAGES=C.UTF-8   
 [7] LC_PAPER=C.UTF-8       LC_NAME=C              LC_ADDRESS=C           LC_TELEPHONE=C         LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C   

time zone: Europe/Paris
tzcode source: system (glibc)

attached base packages:
[1] stats4    stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] future_1.69.0               kableExtra_1.4.0            knitr_1.51                  RColorBrewer_1.1-3          scales_1.4.0               
 [6] viridis_0.6.5               viridisLite_0.4.3           patchwork_1.3.2             tidyr_1.3.2                 dplyr_1.2.0                
[11] ggrepel_0.9.6               ggplot2_4.0.2               igraph_2.2.2                monocle3_1.4.26             SingleCellExperiment_1.32.0
[16] SummarizedExperiment_1.40.0 GenomicRanges_1.62.1        Seqinfo_1.0.0               IRanges_2.44.0              S4Vectors_0.48.0           
[21] MatrixGenerics_1.22.0       matrixStats_1.5.0           Biobase_2.70.0              BiocGenerics_0.56.0         generics_0.1.4             
[26] SeuratWrappers_0.4.0        Seurat_5.4.0                SeuratObject_5.3.0          sp_2.2-1                   

loaded via a namespace (and not attached):
  [1] RcppAnnoy_0.0.23          splines_4.5.2             later_1.4.6               tibble_3.3.1              R.oo_1.27.1               polyclip_1.10-7          
  [7] fastDummies_1.7.5         lifecycle_1.0.5           Rdpack_2.6.6              globals_0.19.0            lattice_0.22-9            MASS_7.3-65              
 [13] magrittr_2.0.4            plotly_4.12.0             sass_0.4.10               rmarkdown_2.30            jquerylib_0.1.4           yaml_2.3.12              
 [19] remotes_2.5.0             httpuv_1.6.16             otel_0.2.0                glmGamPoi_1.22.0          sctransform_0.4.3         spam_2.11-3              
 [25] spatstat.sparse_3.1-0     reticulate_1.45.0         cowplot_1.2.0             pbapply_1.7-4             minqa_1.2.8               abind_1.4-8              
 [31] Rtsne_0.17                purrr_1.2.1               R.utils_2.13.0            irlba_2.3.7               listenv_0.10.0            spatstat.utils_3.2-1     
 [37] goftest_1.2-3             RSpectra_0.16-2           spatstat.random_3.4-4     fitdistrplus_1.2-6        parallelly_1.46.1         DelayedMatrixStats_1.32.0
 [43] svglite_2.2.2             codetools_0.2-20          DelayedArray_0.36.0       xml2_1.5.2                tidyselect_1.2.1          farver_2.1.2             
 [49] lme4_1.1-38               spatstat.explore_3.7-0    jsonlite_2.0.0            progressr_0.18.0          ggridges_0.5.7            survival_3.8-6           
 [55] systemfonts_1.3.1         tools_4.5.2               ragg_1.5.0                ica_1.0-3                 Rcpp_1.1.1                glue_1.8.0               
 [61] gridExtra_2.3             SparseArray_1.10.8        xfun_0.56                 withr_3.0.2               BiocManager_1.30.27       fastmap_1.2.0            
 [67] boot_1.3-32               digest_0.6.39             rsvd_1.0.5                R6_2.6.1                  mime_0.13                 textshaping_1.0.4        
 [73] scattermore_1.2           tensor_1.5.1              dichromat_2.0-0.1         spatstat.data_3.1-9       R.methodsS3_1.8.2         utf8_1.2.6               
 [79] data.table_1.18.2.1       httr_1.4.8                htmlwidgets_1.6.4         S4Arrays_1.10.1           uwot_0.2.4                pkgconfig_2.0.3          
 [85] gtable_0.3.6              rsconnect_1.7.0           lmtest_0.9-40             S7_0.2.1                  XVector_0.50.0            htmltools_0.5.9          
 [91] dotCall64_1.2             png_0.1-8                 spatstat.univar_3.1-6     reformulas_0.4.4          rstudioapi_0.18.0         tzdb_0.5.0               
 [97] reshape2_1.4.5            nlme_3.1-168              nloptr_2.2.1              zoo_1.8-15                cachem_1.1.0              stringr_1.6.0            
[103] KernSmooth_2.23-26        parallel_4.5.2            miniUI_0.1.2              pillar_1.11.1             grid_4.5.2                vctrs_0.7.1              
[109] RANN_2.6.2                promises_1.5.0            beachmat_2.26.0           xtable_1.8-4              cluster_2.1.8.2           evaluate_1.0.5           
[115] readr_2.1.6               cli_3.6.5                 compiler_4.5.2            rlang_1.1.7               future.apply_1.20.1       labeling_0.4.3           
[121] fs_1.6.6                  plyr_1.8.9                stringi_1.8.7             deldir_2.0-4              lazyeval_0.2.2            spatstat.geom_3.7-0      
[127] Matrix_1.7-4              RcppHNSW_0.6.0            hms_1.1.4                 sparseMatrixStats_1.22.0  shiny_1.12.1              rbibutils_2.4.1          
[133] ROCR_1.0-12               bslib_0.10.0             
---
title: "Step 2: Sézary Cell Line Projection onto CD4 Reference Trajectory"
subtitle: "Per-cell-line SCT | MST + Monocle3 pseudotime transfer | 7-milestone alignment"
author: "NASIR MAHMOOD ABBASI"
date: "`r Sys.Date()`"
output:
  html_notebook:
    number_sections: true
    toc: true
    toc_float:
      collapsed: false
    theme: journal
    highlight: tango
  html_document:
    toc: true
    df_print: paged
    number_sections: true
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo=TRUE, message=FALSE, warning=FALSE,
                      fig.align="center", dpi=150)
```

------------------------------------------------------------------------

# Overview

**What this script does — nothing is re-computed from scratch.**

- Loads the already-saved reference trajectory objects from Step 1
- Loads Sézary cell lines L1–L7
- Runs **per-cell-line SCT normalisation** before `FindTransferAnchors`
  → each line gets its own SCT model → more biologically meaningful anchors
- Projects each line individually onto the frozen reference UMAP
- Transfers both **MST pseudotime** and **Monocle3 pseudotime** from reference
- Transfers **milestone labels** (M00–M06) so positions can be compared with
  the 7-milestone structure confirmed in Step 1
- Produces visualisations identical in style to previous results so you can
  directly compare this run (7 milestones) with the previous run

**Reference milestone structure (already confirmed in Step 1):**
```
M00 = CD4 Naive      (2037)
M01 = CD4 TCM early  (4717)
M02 = CD4 TCM late   (4350)
M03 = CD4 TEM        (145)
M04 = CD4 Temra/CTL  (10)
M05 = Treg resting   (146)
M06 = Treg effector  (61)
```

------------------------------------------------------------------------

# Setup

```{r libraries}
suppressPackageStartupMessages({
  library(Seurat)
  library(SeuratWrappers)
  library(monocle3)
  library(igraph)
  library(ggplot2)
  library(ggrepel)
  library(dplyr)
  library(tidyr)
  library(patchwork)
  library(viridis)
  library(scales)
  library(RColorBrewer)
  library(knitr)
  library(kableExtra)
})

# ── Must match Step 1 exactly ──────────────────────────────────────────────
STATE_COLORS <- c(
  "CD4 Naive"     = "#4472C4",
  "CD4 TCM"       = "#70AD47",
  "CD4 TEM"       = "#ED7D31",
  "CD4 Temra/CTL" = "#C00000",
  "Treg"          = "#7030A0"
)

CELL_LINE_COLORS <- setNames(
  colorRampPalette(brewer.pal(8,"Dark2"))(7),
  paste0("L", 1:7)
)

MILESTONE_COLORS <- setNames(
  c("#4472C4",            # M00 Naive
    "#70AD47","#A9D18E",  # M01/M02 TCM early/late
    "#ED7D31",            # M03 TEM
    "#C00000",            # M04 Temra
    "#7030A0","#B4A0D4"), # M05/M06 Treg
  sprintf("M%02d", 0:6)
)

MILESTONE_LABELS <- c(
  M00 = "M00 Naive",
  M01 = "M01 TCM(early)",
  M02 = "M02 TCM(late)",
  M03 = "M03 TEM",
  M04 = "M04 Temra",
  M05 = "M05 Treg(rest)",
  M06 = "M06 Treg(eff)"
)

STATE_COL  <- "predicted.celltype.l2"
UMAP_NAME  <- "umap"
CELL_LINES <- paste0("L", 1:7)

MILESTONE_ORDER <- sprintf("M%02d", 0:6)

cat("✓ Setup complete\n")
```

------------------------------------------------------------------------

# Load Reference Objects (Step 1 outputs — not re-run)

```{r load-reference}
# ── Load the reference Seurat object with frozen UMAP model ───────────────
cd4_ref <- readRDS("../../1-Custom_MST/Objects/cd4_ref_dual_trajectory.rds")

# ── Load MST graph and centroids ──────────────────────────────────────────
mst_graph           <- readRDS("../../1-Custom_MST/Objects/mst_graph.rds")
milestone_centroids <- readRDS("../../1-Custom_MST/Objects/milestone_centroids.rds")

# ── Validate milestone structure ──────────────────────────────────────────
expected_ms <- sprintf("M%02d", 0:6)
actual_ms   <- sort(unique(na.omit(cd4_ref@meta.data$milestone)))
stopifnot(
  "Milestone mismatch — re-run Step 1 first" = identical(actual_ms, expected_ms),
  "mst_pseudotime_norm missing"              = "mst_pseudotime_norm" %in% colnames(cd4_ref@meta.data),
  "monocle3_pseudotime_norm missing"         = "monocle3_pseudotime_norm" %in% colnames(cd4_ref@meta.data),
  "UMAP model missing from cd4_ref"          = !is.null(cd4_ref@reductions[[UMAP_NAME]]@misc$model)
)

cat("✅ Reference objects loaded and validated\n")
cat(sprintf("   %d reference cells | %d milestones\n", ncol(cd4_ref), length(actual_ms)))
cat(sprintf("   MST: %d nodes, %d edges\n", vcount(mst_graph), ecount(mst_graph)))
cat("\nMilestone structure from Step 1:\n")
print(milestone_centroids[, c("milestone","state","n_cells")])

# ── Build reference UMAP data frame for background plotting ───────────────
ref_umap_df           <- as.data.frame(Embeddings(cd4_ref, UMAP_NAME))
colnames(ref_umap_df) <- c("UMAP_1","UMAP_2")
ref_umap_df$state     <- cd4_ref@meta.data[[STATE_COL]]
ref_umap_df$milestone <- cd4_ref@meta.data$milestone
ref_umap_df$mst_pt    <- cd4_ref@meta.data$mst_pseudotime_norm
ref_umap_df$m3_pt     <- cd4_ref@meta.data$monocle3_pseudotime_norm

# ── MST edge data frame for overlay ───────────────────────────────────────
mst_edges   <- as_edgelist(mst_graph)
mst_edge_df <- do.call(rbind, lapply(seq_len(nrow(mst_edges)), function(i) {
  m1 <- mst_edges[i,1]; m2 <- mst_edges[i,2]
  r1 <- milestone_centroids[milestone_centroids$milestone == m1, ]
  r2 <- milestone_centroids[milestone_centroids$milestone == m2, ]
  data.frame(x1=r1$UMAP_1, y1=r1$UMAP_2, x2=r2$UMAP_1, y2=r2$UMAP_2)
}))
```

------------------------------------------------------------------------

# Load Sézary Cell Lines

```{r load-sezary}
all_obj <- readRDS("/home/nabbasi/apollo_home/1-Seurat_RDS_OBJECT_FINAL/All_samples_Merged_with_Renamed_Clusters_Cell_state-03-12-2025.rds.rds")

# ── cell_line column confirmed present (40,695 cells, L1–L7) ─────────────
all_obj$cell_line <- as.character(all_obj$cell_line)
sezary_obj <- subset(all_obj, subset = cell_line %in% CELL_LINES)
rm(all_obj); gc()

cat(sprintf("Sézary cells loaded: %d total\n", ncol(sezary_obj)))
cat("Per cell line:\n")
print(table(sezary_obj$cell_line))

DefaultAssay(sezary_obj) <- "RNA"

# Shared genes with reference (confirmed ~36,601)
shared_genes <- intersect(rownames(sezary_obj), rownames(cd4_ref))
cat(sprintf("\nShared genes (query ∩ reference): %d\n", length(shared_genes)))
stopifnot("Fewer than 2000 shared genes — check objects" = length(shared_genes) >= 2000)

# ── Check percent.mt exists — needed for SCTransform vars.to.regress ──────
# If percent.mt is absent, SCT will run without regressing it (see per-line chunk)
HAS_PCT_MT <- "percent.mt" %in% colnames(sezary_obj@meta.data)
cat(sprintf("\npercent.mt in metadata: %s\n", HAS_PCT_MT))
```

------------------------------------------------------------------------

# Per-Cell-Line SCT + Projection

**Rationale for per-cell-line SCT:**  
Each Sézary cell line has its own library size distribution and technical
variation. Running SCTransform on each line independently fits a line-specific
regression model, which produces better-calibrated residuals for anchor finding.
This gives `FindTransferAnchors` more reliable anchors compared to normalising
all lines together, at the cost of one SCT run per line (~30–60 s each).

```{r per-line-projection, results='hold'}

plan("sequential")
options(future.globals.maxSize = 200000 * 1024^2)  # 200 GB — effectively unlimited

# ── Step 1: Cell cycle scoring ────────────────────────────────────────────
DefaultAssay(sezary_obj) <- "RNA"
sezary_obj <- NormalizeData(sezary_obj, verbose=FALSE)
sezary_obj <- CellCycleScoring(
  sezary_obj,
  s.features   = cc.genes.updated.2019$s.genes,
  g2m.features = cc.genes.updated.2019$g2m.genes,
  set.ident    = FALSE
)
cat("Cell cycle phases:\n"); print(table(sezary_obj$Phase))

# ── Step 2: Per-line SCT ──────────────────────────────────────────────────
HAS_PCT_MT  <- "percent.mt" %in% colnames(sezary_obj@meta.data)
sezary_list <- SplitObject(sezary_obj, split.by = "cell_line")

sezary_list <- lapply(seq_along(sezary_list), function(i) {
  cat(sprintf("  SCT: %s (%d cells)\n",
              names(sezary_list)[i], ncol(sezary_list[[i]])))
  SCTransform(
    sezary_list[[i]],
    vst.flavor      = "v2",
    vars.to.regress = c("S.Score", "G2M.Score",
                        if (HAS_PCT_MT) "percent.mt" else NULL),
    verbose         = FALSE
  )
})

sezary_obj <- merge(sezary_list[[1]], sezary_list[-1], merge.data=TRUE)
cat(sprintf("Merged query: %d cells\n", ncol(sezary_obj)))

# ── Step 3: Shared features from reference integrated assay ───────────────
DefaultAssay(cd4_ref) <- "integrated"
ref_features          <- rownames(cd4_ref[["integrated"]]@scale.data)
query_genes           <- rownames(sezary_obj[["SCT"]])
shared_features       <- intersect(ref_features, query_genes)

junk_pattern          <- "^RPL|^RPS|^MT-|^HSP|^SNHG|^MALAT1|^NEAT1|^XIST|^TRBV|^TRAV|^TRGV|^TRDV|^HLA-"
shared_features_clean <- shared_features[!grepl(junk_pattern, shared_features)]
cat(sprintf("Shared features after cleaning: %d\n", length(shared_features_clean)))

VariableFeatures(sezary_obj) <- shared_features_clean
DefaultAssay(sezary_obj)     <- "SCT"

# ── Step 4: FindTransferAnchors ───────────────────────────────────────────
anchors <- FindTransferAnchors(
  reference            = cd4_ref,
  query                = sezary_obj,
  features             = shared_features_clean,
  normalization.method = "SCT",
  reference.reduction  = "pca",
  reduction            = "pcaproject",
  dims                 = 1:30,
  k.anchor             = 10,
  k.filter             = 500,
  k.score              = 30,
  verbose              = TRUE
)
cat(sprintf("Anchors found: %d\n", nrow(anchors@anchors)))

# ── Step 5: MapQuery ──────────────────────────────────────────────────────
sezary_obj <- MapQuery(
  anchorset           = anchors,
  query               = sezary_obj,
  reference           = cd4_ref,
  refdata             = list(
    milestone = cd4_ref@meta.data$milestone,
    state     = cd4_ref@meta.data[[STATE_COL]]
  ),
  reference.reduction = "pca",
  reduction.model     = UMAP_NAME,
  verbose             = FALSE
)
cat(sprintf("MapQuery complete: %d cells\n", ncol(sezary_obj)))

# ── Step 6: Transfer pseudotime ───────────────────────────────────────────
pt_mat           <- rbind(
  mst_pt = cd4_ref@meta.data$mst_pseudotime_norm,
  m3_pt  = cd4_ref@meta.data$monocle3_pseudotime_norm
)
colnames(pt_mat) <- colnames(cd4_ref)

wt_red <- if ("ref.pca" %in% names(sezary_obj@reductions)) "ref.pca" else "pcaproject"
pt_transfer <- TransferData(
  anchorset        = anchors,
  refdata          = pt_mat,
  weight.reduction = sezary_obj[[wt_red]],
  dims             = 1:30
)
pt_data <- GetAssayData(pt_transfer, layer="data")
cat("pt rownames:", rownames(pt_data), "\n")
sezary_obj$mst_pseudotime_norm      <- as.numeric(pt_data["mst-pt", ])
sezary_obj$monocle3_pseudotime_norm <- as.numeric(pt_data["m3-pt",  ])

# ── Step 7: Summary ───────────────────────────────────────────────────────
cat("\nPredicted state distribution:\n")
print(table(sezary_obj$predicted.state))
cat("\nPredicted milestone distribution:\n")
print(table(sezary_obj$predicted.milestone))
cat("\nMST pseudotime range:", round(range(sezary_obj$mst_pseudotime_norm, na.rm=TRUE), 1), "\n")
cat("\n✅ Projection complete\n")


cat(sprintf("✅ Anchors found: %d anchor pairs\n", nrow(anchors@anchors)))
cat(sprintf("Cells projected: %d\n", ncol(sezary_obj)))
cat(sprintf("Mean prediction score: %.3f (>0.5 = reliable, >0.7 = confident)\n",
            mean(sezary_obj$predicted.state.score, na.rm=TRUE)))
cat(sprintf("Cells with score >0.5: %d (%.1f%%)\n",
            sum(sezary_obj$predicted.state.score > 0.5, na.rm=TRUE),
            100*mean(sezary_obj$predicted.state.score > 0.5, na.rm=TRUE)))

cat(sprintf("Cells with score >0.7: %d (%.1f%%)\n",
            sum(sezary_obj$predicted.state.score > 0.7, na.rm=TRUE),
            100*mean(sezary_obj$predicted.state.score > 0.7, na.rm=TRUE)))


cat(sprintf("Cells with score >0.5: %d (%.1f%%)\n",
            sum(sezary_obj$predicted.milestone.score > 0.5, na.rm=TRUE),
            100*mean(sezary_obj$predicted.milestone.score > 0.5, na.rm=TRUE)))

cat(sprintf("Cells with score >0.7: %d (%.1f%%)\n",
            sum(sezary_obj$predicted.milestone.score > 0.7, na.rm=TRUE),
            100*mean(sezary_obj$predicted.milestone.score > 0.7, na.rm=TRUE)))

```



```{r merge-projected}
query_umap_df <- as.data.frame(Embeddings(sezary_obj, "ref.umap"))
colnames(query_umap_df) <- c("UMAP_1","UMAP_2")
query_umap_df$cell_line           <- sezary_obj$cell_line
query_umap_df$predicted_state     <- sezary_obj$predicted.state
query_umap_df$predicted_milestone <- sezary_obj$predicted.milestone
query_umap_df$mst_pt              <- sezary_obj$mst_pseudotime_norm
query_umap_df$m3_pt               <- sezary_obj$monocle3_pseudotime_norm

cat(sprintf("Total cells: %d\n", nrow(query_umap_df)))
cat("Per cell line:\n")
print(table(query_umap_df$cell_line))
cat("\nMilestone distribution:\n")
print(table(query_umap_df$predicted_milestone, useNA="ifany"))

```

---

```{r summary-table, fig.width=20, fig.height=8}
transfer_summary <- query_umap_df %>%
  group_by(cell_line) %>%
  summarise(
    n_cells       = n(),
    mst_pt_mean   = round(mean(mst_pt, na.rm=TRUE), 1),
    mst_pt_med    = round(median(mst_pt, na.rm=TRUE), 1),
    m3_pt_mean    = round(mean(m3_pt, na.rm=TRUE), 1),
    m3_pt_med     = round(median(m3_pt, na.rm=TRUE), 1),
    top_milestone = names(sort(table(predicted_milestone), decreasing=TRUE))[1],
    .groups       = "drop"
  )

ms_wide <- query_umap_df %>%
  filter(!is.na(predicted_milestone)) %>%
  count(cell_line, predicted_milestone) %>%
  group_by(cell_line) %>%
  mutate(pct = round(100 * n / sum(n), 1)) %>%
  select(cell_line, predicted_milestone, pct) %>%
  pivot_wider(names_from=predicted_milestone, values_from=pct, values_fill=0)

for (m in MILESTONE_ORDER) {
  if (!m %in% colnames(ms_wide)) ms_wide[[m]] <- 0
}
ms_wide      <- ms_wide[, c("cell_line", MILESTONE_ORDER)]
full_summary <- left_join(transfer_summary, ms_wide, by="cell_line")

kable(full_summary,
      caption = "Sézary projection summary — pseudotime and milestone composition",
      digits  = 1) %>%
  kable_styling(bootstrap_options=c("striped","hover","condensed"),
                full_width=FALSE) %>%
  add_header_above(c(" "=ncol(transfer_summary),
                     "% cells per milestone"=length(MILESTONE_ORDER)))
```


------------------------------------------------------------------------

# Visualisations {.tabset}

## Reference UMAP — milestones

```{r vis-ref-milestones, fig.width=10, fig.height=8}
p_ref_ms <- ggplot() +
  geom_point(data = ref_umap_df[sample(nrow(ref_umap_df)),],
             aes(x=UMAP_1, y=UMAP_2, colour=state), size=.4, alpha=.5) +
  geom_segment(data=mst_edge_df,
               aes(x=x1,y=y1,xend=x2,yend=y2),
               colour="black", linewidth=1, alpha=.85, lineend="round") +
  geom_point(data=milestone_centroids,
             aes(x=UMAP_1,y=UMAP_2,fill=state),
             shape=21, size=8, colour="white", stroke=2) +
  geom_text_repel(data=milestone_centroids,
                  aes(x=UMAP_1,y=UMAP_2,
                      label=paste0(milestone,"\n(",MILESTONE_LABELS[milestone],")")),
                  size=2.8, fontface="bold",
                  bg.color="grey80", bg.r=.15, max.overlaps=20) +
  scale_colour_manual(values=STATE_COLORS, name="State") +
  scale_fill_manual(values=STATE_COLORS, guide="none") +
  theme_classic() +
  labs(x="UMAP-1", y="UMAP-2",
       title="Reference CD4 T cells — 7-milestone structure",
       subtitle="M00=Naive | M01/M02=TCM | M03=TEM | M04=Temra | M05/M06=Treg") +
  theme(plot.title=element_text(size=13,face="bold"))

print(p_ref_ms)

```

```{r , fig.width=8, fig.height=6}
p_ref_ms <- ggplot() +
  geom_point(data = ref_umap_df[sample(nrow(ref_umap_df)),],
             aes(x=UMAP_1, y=UMAP_2, colour=state), size=.4, alpha=.5) +
  geom_segment(data=mst_edge_df,
               aes(x=x1,y=y1,xend=x2,yend=y2),
               colour="black", linewidth=1, alpha=.85, lineend="round") +
  geom_point(data=milestone_centroids,
             aes(x=UMAP_1,y=UMAP_2,fill=state),
             shape=21, size=8, colour="white", stroke=2) +
  geom_text_repel(data=milestone_centroids,
                  aes(x=UMAP_1,y=UMAP_2,
                      label=paste0(milestone,"\n(",MILESTONE_LABELS[milestone],")")),
                  size=2.8, fontface="bold",
                  bg.color="grey80", bg.r=.15, max.overlaps=20) +
  scale_colour_manual(values=STATE_COLORS, name="State") +
  scale_fill_manual(values=STATE_COLORS, guide="none") +
  theme_classic() +
  labs(x="UMAP-1", y="UMAP-2",
       title="Reference CD4 T cells — 7-milestone structure",
       subtitle="M00=Naive | M01/M02=TCM | M03=TEM | M04=Temra | M05/M06=Treg") +
  theme(plot.title=element_text(size=13,face="bold"))

print(p_ref_ms)

```




```{r, fig.width=8, fig.height=6}
p_all_lines <- ggplot() +
  # Reference background
  geom_point(data = ref_umap_df[sample(nrow(ref_umap_df)),],
             aes(x=UMAP_1, y=UMAP_2),
             colour="grey68", size=.25, alpha=.4) +
  # MST trajectory lines
  geom_segment(data = mst_edge_df,
               aes(x=x1, y=y1, xend=x2, yend=y2),
               colour="grey30", linewidth=1, alpha=.8, lineend="round") +
  # Sézary cells coloured by cell line
  geom_point(data = query_umap_df[sample(nrow(query_umap_df)),],
             aes(x=UMAP_1, y=UMAP_2, colour=cell_line),
             size=1.5, alpha=.75) +
  # Milestone labels using confirmed 7-milestone names
  geom_label_repel(data = milestone_centroids,
                   aes(x=UMAP_1, y=UMAP_2,
                       label=paste0(milestone, "\n",
                                    MILESTONE_LABELS[milestone])),
                   size=2.5, fontface="bold",
                   fill="white", colour="grey15",
                   label.size=0.2, max.overlaps=20,
                   box.padding=0.5) +
  scale_colour_manual(values=CELL_LINE_COLORS, name="Cell line") +
  guides(colour=guide_legend(override.aes=list(size=4, alpha=1))) +
  theme_classic() +
  labs(x="UMAP-1", y="UMAP-2",
       title="Sézary cell lines — projection onto healthy CD4 T cell reference",
       subtitle=sprintf("Grey = healthy reference (%d cells) | Coloured = Sézary L1–L7 (%d cells) | Lines = MST trajectory",
                        nrow(ref_umap_df), nrow(query_umap_df))) +
  theme(plot.title    = element_text(size=13, face="bold"),
        plot.subtitle = element_text(size=9,  colour="grey40"),
        legend.title  = element_text(size=11, face="bold"),
        legend.text   = element_text(size=10))

print(p_all_lines)

```

## All Lines — Position on Reference UMAP

```{r vis-all-lines-umap, fig.width=14, fig.height=9}
p_all_lines <- ggplot() +
  # Reference background (grey)
  geom_point(data=ref_umap_df,
             aes(x=UMAP_1,y=UMAP_2), colour="grey68", size=.25, alpha=.6) +
  # MST overlay
  geom_segment(data=mst_edge_df,
               aes(x=x1,y=y1,xend=x2,yend=y2),
               colour="grey30", linewidth=.7, alpha=.7, lineend="round") +
  # Query cells coloured by cell line
  geom_point(data=query_umap_df[sample(nrow(query_umap_df)),],
             aes(x=UMAP_1,y=UMAP_2,colour=cell_line), size=1.2, alpha=.75) +
  scale_colour_manual(values=CELL_LINE_COLORS, name="Cell line") +
  # Milestone labels
  geom_text_repel(data=milestone_centroids,
                  aes(x=UMAP_1,y=UMAP_2,label=milestone),
                  size=3, fontface="bold", colour="grey20",
                  bg.color="white", bg.r=.12, max.overlaps=15) +
  theme_classic() +
  labs(x="UMAP-1",y="UMAP-2",
       title="Sézary L1–L7 projected onto reference UMAP",
       subtitle="Grey = reference | Coloured = query cell lines") +
  theme(plot.title=element_text(size=13,face="bold")) +
  facet_wrap(~cell_line, ncol=4)

print(p_all_lines)

```

## All Lines — MST Pseudotime on Reference UMAP

```{r vis-pseudotime-mst, fig.width=14, fig.height=6}
# Panel 1: reference pseudotime
p_ref_pt <- ggplot(ref_umap_df[sample(nrow(ref_umap_df)),],
                    aes(x=UMAP_1,y=UMAP_2,colour=mst_pt)) +
  geom_point(size=.3, alpha=.7) +
  scale_colour_gradientn(
    colours=c("#0D0887","#6A00A8","#B12A90","#E16462","#FCA636","#F0F921"),
    name="MST pseudotime\n(0–100)", limits=c(0,100), na.value="grey80") +
  geom_segment(data=mst_edge_df,aes(x=x1,y=y1,xend=x2,yend=y2),
               colour="black",linewidth=.6,alpha=.6,inherit.aes=FALSE) +
  theme_classic() +
  labs(x="UMAP-1",y="UMAP-2",title="Reference — MST pseudotime") +
  theme(plot.title=element_text(size=12,face="bold"))

# Panel 2: query pseudotime (all lines combined)
p_query_pt <- ggplot() +
  geom_point(data=ref_umap_df, aes(x=UMAP_1,y=UMAP_2),
             colour="grey78", size=.2, alpha=.5) +
  geom_point(data=query_umap_df[sample(nrow(query_umap_df)),],
             aes(x=UMAP_1,y=UMAP_2,colour=mst_pt), size=1, alpha=.8) +
  scale_colour_gradientn(
    colours=c("#0D0887","#6A00A8","#B12A90","#E16462","#FCA636","#F0F921"),
    name="MST pseudotime\n(0–100)", limits=c(0,100), na.value="grey80") +
  geom_segment(data=mst_edge_df,aes(x=x1,y=y1,xend=x2,yend=y2),
               colour="grey30",linewidth=.6,alpha=.6,inherit.aes=FALSE) +
  theme_classic() +
  labs(x="UMAP-1",y="UMAP-2",title="Sézary L1–L7 — transferred MST pseudotime") +
  theme(plot.title=element_text(size=12,face="bold"))

print(p_ref_pt | p_query_pt)

```

## Per-Line MST Pseudotime Violin

```{r vis-violin-mst, fig.width=7, fig.height=6}
# Milestone median lines for reference
ms_medians <- cd4_ref@meta.data %>%
  group_by(milestone) %>%
  summarise(med = median(mst_pseudotime_norm, na.rm=TRUE), .groups="drop") %>%
  filter(milestone %in% c("M00","M01","M02","M03","M04","M05","M06"))

violin_df <- data.frame(
  cell_line = sezary_obj$cell_line,
  mst_pt    = sezary_obj$mst_pseudotime_norm,
  m3_pt     = sezary_obj$monocle3_pseudotime_norm,
  milestone = sezary_obj$predicted.milestone
) %>% filter(!is.na(mst_pt))

p_violin_mst <- ggplot(violin_df, aes(x=cell_line, y=mst_pt, fill=cell_line)) +
  geom_violin(scale="width", trim=FALSE, alpha=.85) +
  geom_boxplot(width=.08, fill="white", outlier.size=.3, outlier.alpha=.4) +
  # Reference milestone median lines
  geom_hline(data=ms_medians,
             aes(yintercept=med, linetype=milestone), colour="grey40", linewidth=.5) +
  geom_text(data=ms_medians,
          aes(x=7.6, y=med, label=milestone),
          size=2.8, hjust=0, colour="grey25",
          inherit.aes=FALSE) +
  scale_fill_manual(values=CELL_LINE_COLORS, guide="none") +
  scale_linetype_manual(values=rep("dashed",7), guide="none") +
  coord_cartesian(xlim=c(0.5,8.5)) +
  theme_classic() +
  labs(x="Cell line", y="MST pseudotime (0–100)",
       title="MST pseudotime per Sézary cell line",
       subtitle="Dashed lines = reference milestone medians (M00–M06)") +
  theme(plot.title=element_text(size=13,face="bold"))

print(p_violin_mst)
```

## Per-Line Monocle3 Pseudotime Violin

```{r vis-violin-m3, fig.width=7, fig.height=6}
# Reference milestone medians — Monocle3
ms_medians_m3 <- cd4_ref@meta.data %>%
  group_by(milestone) %>%
  summarise(med = median(monocle3_pseudotime_norm, na.rm=TRUE), .groups="drop")

p_violin_m3 <- ggplot(violin_df, aes(x=cell_line, y=m3_pt, fill=cell_line)) +
  geom_violin(scale="width", trim=FALSE, alpha=.85) +
  geom_boxplot(width=.08, fill="white", outlier.size=.3, outlier.alpha=.4) +
  geom_hline(data=ms_medians_m3,
             aes(yintercept=med, linetype=milestone), colour="grey40", linewidth=.5) +
  geom_text(data=ms_medians,
          aes(x=7.6, y=med, label=milestone),
          size=2.8, hjust=0, colour="grey25",
          inherit.aes=FALSE) +
  scale_fill_manual(values=CELL_LINE_COLORS, guide="none") +
  scale_linetype_manual(values=rep("dashed",7), guide="none") +
  coord_cartesian(xlim=c(0.5,8.5)) +
  theme_classic() +
  labs(x="Cell line", y="Monocle3 pseudotime (0–100)",
       title="Monocle3 pseudotime per Sézary cell line",
       subtitle="Dashed lines = reference milestone medians (M00–M06)") +
  theme(plot.title=element_text(size=13,face="bold"))

print(p_violin_m3)
```

## Milestone Composition per Cell Line

```{r vis-milestone-bar, fig.width=7, fig.height=4}
ms_comp <- violin_df %>%
  count(cell_line, milestone) %>%
  group_by(cell_line) %>%
  mutate(pct = 100 * n / sum(n)) %>%
  ungroup() %>%
  filter(!is.na(milestone)) %>%
  mutate(milestone = factor(milestone, levels=sprintf("M%02d",0:6)))

p_ms_bar <- ggplot(ms_comp, aes(x=cell_line, y=pct, fill=milestone)) +
  geom_col(width=.75) +
  scale_fill_manual(values=MILESTONE_COLORS,
                    labels=MILESTONE_LABELS, name="Milestone") +
  theme_classic() +
  labs(x="Cell line", y="% cells",
       title="Milestone composition per Sézary cell line",
       subtitle="Based on transferred milestone labels from reference") +
  theme(plot.title=element_text(size=13,face="bold"),
        legend.text=element_text(size=9))

print(p_ms_bar)
```

```{r milestone-heatmap, fig.width=10, fig.height=5}

# Milestone labels with biological annotation
MILESTONE_LABELS_FULL <- c(
  "M00" = "M00\nNaive",
  "M01" = "M01\nTCM early",
  "M02" = "M02\nTCM late\n(branch)",
  "M03" = "M03\nTEM",
  "M04" = "M04\nTemra",
  "M05" = "M05\nTreg(rest)",
  "M06" = "M06\nTreg(eff)"
)

# Compute milestone distribution per cell line
ms_per_line <- query_umap_df %>%
  filter(!is.na(predicted_milestone)) %>%
  count(cell_line, predicted_milestone) %>%
  group_by(cell_line) %>%
  mutate(pct = round(100 * n / sum(n), 1)) %>%
  ungroup() %>%
  mutate(predicted_milestone = factor(predicted_milestone,
                                      levels = MILESTONE_ORDER))

# ── Heatmap tile plot: cell line × milestone ──────────────────────────────
p_ms_heatmap <- ggplot(ms_per_line,
                        aes(x=predicted_milestone, y=cell_line, fill=pct)) +
  geom_tile(colour="white", linewidth=.5) +
  geom_text(aes(label=ifelse(pct >= 2, paste0(pct, "%"), "")),
            size=3.2, fontface="bold",
            colour=ifelse(ms_per_line$pct > 40, "white", "grey20")) +
  scale_fill_gradientn(
    colours = c("#F7FBFF","#DEEBF7","#9ECAE1","#4292C6","#08519C","#08306B"),
    name    = "% of cell\nline cells",
    limits  = c(0, 100)
  ) +
  scale_x_discrete(labels=MILESTONE_LABELS_FULL) +
  theme_classic() +
  labs(x="Reference milestone", y="Sézary cell line",
       title="Milestone distribution per Sézary cell line",
       subtitle="Darker = more cells | M02 = branch point between effector and regulatory arms") +
  theme(plot.title   = element_text(size=13, face="bold"),
        plot.subtitle= element_text(size=9,  colour="grey40"),
        axis.text.x  = element_text(size=9),
        axis.text.y  = element_text(size=11, face="bold"),
        legend.title = element_text(size=9))

print(p_ms_heatmap)

ggsave("Figures/fig_milestone_heatmap.pdf", p_ms_heatmap, width=10, height=5)
```




```{r milestone-stack, fig.width=8, fig.height=5}

# Colours per milestone — derived from state colours
ms_state_colors <- setNames(
  STATE_COLORS[milestone_centroids$state[
    match(MILESTONE_ORDER, milestone_centroids$milestone)]],
  MILESTONE_ORDER
)

p_ms_stack <- ggplot(ms_per_line,
                      aes(x=cell_line, y=pct, fill=predicted_milestone)) +
  geom_col(width=.75, colour="white", linewidth=.3) +
  geom_text(aes(label=ifelse(pct >= 5,
                             paste0(predicted_milestone, "\n", pct, "%"), "")),
            position=position_stack(vjust=0.5),
            size=2.8, colour="white", fontface="bold") +
  scale_fill_manual(values=ms_state_colors,
                    labels=MILESTONE_LABELS_FULL,
                    name="Milestone") +
  scale_y_continuous(expand=c(0,0), limits=c(0,105)) +
  theme_classic() +
  labs(x="Cell line", y="% of cells",
       title="Milestone composition per Sézary cell line",
       subtitle="Colour = CD4 state | Label shown if ≥ 5%") +
  theme(plot.title  = element_text(size=13, face="bold"),
        plot.subtitle = element_text(size=9, colour="grey40"),
        axis.text.x = element_text(size=12, face="bold"))

print(p_ms_stack)

ggsave("Figures/fig_milestone_stack.pdf", p_ms_stack, width=8, height=5)

```



Note: this chunk must come **after** the `milestone-heatmap` chunk since it uses `ms_per_line` and `MILESTONE_LABELS_FULL` defined there.



## MST vs Monocle3 Pseudotime per Line

```{r vis-pt-correlation, fig.width=12, fig.height=8}
# Scatter: MST vs Monocle3 per cell line — method agreement check
p_pt_cor <- ggplot(violin_df[sample(nrow(violin_df)),],
                    aes(x=mst_pt, y=m3_pt, colour=milestone)) +
  geom_point(size=.8, alpha=.6) +
  geom_abline(slope=1, intercept=0, linetype="dashed", colour="grey40") +
  scale_colour_manual(values=MILESTONE_COLORS,
                      labels=MILESTONE_LABELS, name="Milestone") +
  facet_wrap(~cell_line, ncol=4) +
  theme_classic() +
  labs(x="MST pseudotime (0–100)", y="Monocle3 pseudotime (0–100)",
       title="MST vs Monocle3 pseudotime per Sézary cell line",
       subtitle="Dashed = perfect agreement") +
  theme(plot.title=element_text(size=13,face="bold"),
        strip.text=element_text(face="bold"))

# Compute per-line Spearman rho
pt_cors <- violin_df %>%
  group_by(cell_line) %>%
  summarise(
    spearman = round(cor(mst_pt, m3_pt, method="spearman", use="complete.obs"), 3),
    n        = n(),
    .groups  = "drop"
  )

print(p_pt_cor)
cat("\nPer-line MST vs Monocle3 Spearman ρ:\n")
print(pt_cors)

```

## Per-Cell-Line Individual UMAPs

```{r vis-per-line-individual, fig.width=6, fig.height=4}
# Individual UMAP + pseudotime for each line — same style as previous results
pt_gradient <- scale_colour_gradientn(
  colours = c("#0D0887","#6A00A8","#B12A90","#E16462","#FCA636","#F0F921"),
  name    = "MST pseudotime\n(0–100)", limits = c(0,100), na.value = "grey80"
)

for (ln in CELL_LINES) {
  q_df <- query_umap_df %>% filter(cell_line == ln)

  p <- ggplot() +
    geom_point(data=ref_umap_df, aes(x=UMAP_1,y=UMAP_2),
               colour="grey78", size=.25, alpha=.5) +
    geom_segment(data=mst_edge_df,
                 aes(x=x1,y=y1,xend=x2,yend=y2),
                 colour="grey30", linewidth=.7, alpha=.7, lineend="round") +
    geom_point(data=q_df[sample(nrow(q_df)),],
               aes(x=UMAP_1,y=UMAP_2,colour=mst_pt), size=1.5, alpha=.85) +
    pt_gradient +
    # Milestone labels
    geom_text_repel(data=milestone_centroids,
                    aes(x=UMAP_1,y=UMAP_2,label=milestone),
                    size=2.8, fontface="bold", colour="grey20",
                    bg.color="white", bg.r=.1, max.overlaps=15) +
    theme_classic() +
    labs(x="UMAP-1",y="UMAP-2",
         title=sprintf("%s — MST pseudotime on reference UMAP", ln),
         subtitle=sprintf("n=%d cells | MST median=%.1f",
                 nrow(q_df),
                 median(q_df$mst_pt, na.rm=TRUE)))  +
    theme(plot.title=element_text(size=12,face="bold"))

  print(p)
}

```

------------------------------------------------------------------------

# Pseudotime Summary Table

```{r}
# Full per-line summary including milestone breakdown
ms_wide <- ms_comp %>%
  select(cell_line, milestone, pct) %>%
  pivot_wider(names_from=milestone, values_from=pct, values_fill=0) %>%
  mutate(across(where(is.numeric), ~round(., 1)))

full_summary <- transfer_summary %>%
  left_join(ms_wide, by="cell_line")

kable(full_summary,
      caption = "Sézary cell line projection summary — pseudotime and milestone composition (%)",
      digits  = 1) %>%
  kable_styling(bootstrap_options=c("striped","hover","condensed"), full_width=FALSE) %>%
  add_header_above(c(" "=ncol(transfer_summary),
                     "% cells per milestone"=ncol(ms_wide)-1))
```

------------------------------------------------------------------------

# Save Outputs

```{r save-outputs}
dir.create("Objects", showWarnings=FALSE)
dir.create("Figures", showWarnings=FALSE)
dir.create("Tables",  showWarnings=FALSE)

SaveSeuratRds(sezary_obj, "Objects/sezary_projected.rds")

write.csv(full_summary, "Tables/projection_summary_per_line.csv",   row.names=FALSE)
write.csv(ms_comp,      "Tables/milestone_composition_per_line.csv", row.names=FALSE)

ggsave("Figures/fig_all_lines_umap.pdf",        p_all_lines,  width=14, height=10)
ggsave("Figures/fig_violin_mst.pdf",            p_violin_mst, width=10, height=5)
ggsave("Figures/fig_violin_m3.pdf",             p_violin_m3,  width=10, height=5)
ggsave("Figures/fig_milestone_composition.pdf", p_ms_bar,     width=11, height=5)
ggsave("Figures/fig_pt_correlation.pdf",        p_pt_cor,     width=12, height=8)

cat("✅ All outputs saved\n")
```

```{r session}

sessionInfo()

```
