Load Reference Objects
(Step 1 outputs — not re-run)
# ── Load the reference Seurat object with frozen UMAP model ───────────────
cd4_ref <- readRDS("../../1-Custom_MST/Objects/cd4_ref_dual_trajectory.rds")
# ── Load MST graph and centroids ──────────────────────────────────────────
mst_graph <- readRDS("../../1-Custom_MST/Objects/mst_graph.rds")
milestone_centroids <- readRDS("../../1-Custom_MST/Objects/milestone_centroids.rds")
# ── Validate milestone structure ──────────────────────────────────────────
expected_ms <- sprintf("M%02d", 0:6)
actual_ms <- sort(unique(na.omit(cd4_ref@meta.data$milestone)))
stopifnot(
"Milestone mismatch — re-run Step 1 first" = identical(actual_ms, expected_ms),
"mst_pseudotime_norm missing" = "mst_pseudotime_norm" %in% colnames(cd4_ref@meta.data),
"monocle3_pseudotime_norm missing" = "monocle3_pseudotime_norm" %in% colnames(cd4_ref@meta.data),
"UMAP model missing from cd4_ref" = !is.null(cd4_ref@reductions[[UMAP_NAME]]@misc$model)
)
cat("✅ Reference objects loaded and validated\n")
✅ Reference objects loaded and validated
cat(sprintf(" %d reference cells | %d milestones\n", ncol(cd4_ref), length(actual_ms)))
11466 reference cells | 7 milestones
cat(sprintf(" MST: %d nodes, %d edges\n", vcount(mst_graph), ecount(mst_graph)))
MST: 7 nodes, 6 edges
cat("\nMilestone structure from Step 1:\n")
Milestone structure from Step 1:
print(milestone_centroids[, c("milestone","state","n_cells")])
# ── Build reference UMAP data frame for background plotting ───────────────
ref_umap_df <- as.data.frame(Embeddings(cd4_ref, UMAP_NAME))
colnames(ref_umap_df) <- c("UMAP_1","UMAP_2")
ref_umap_df$state <- cd4_ref@meta.data[[STATE_COL]]
ref_umap_df$milestone <- cd4_ref@meta.data$milestone
ref_umap_df$mst_pt <- cd4_ref@meta.data$mst_pseudotime_norm
ref_umap_df$m3_pt <- cd4_ref@meta.data$monocle3_pseudotime_norm
# ── MST edge data frame for overlay ───────────────────────────────────────
mst_edges <- as_edgelist(mst_graph)
mst_edge_df <- do.call(rbind, lapply(seq_len(nrow(mst_edges)), function(i) {
m1 <- mst_edges[i,1]; m2 <- mst_edges[i,2]
r1 <- milestone_centroids[milestone_centroids$milestone == m1, ]
r2 <- milestone_centroids[milestone_centroids$milestone == m2, ]
data.frame(x1=r1$UMAP_1, y1=r1$UMAP_2, x2=r2$UMAP_1, y2=r2$UMAP_2)
}))
Load Sézary Cell
Lines
all_obj <- readRDS("/home/nabbasi/apollo_home/1-Seurat_RDS_OBJECT_FINAL/All_samples_Merged_with_Renamed_Clusters_Cell_state-03-12-2025.rds.rds")
# ── cell_line column confirmed present (40,695 cells, L1–L7) ─────────────
all_obj$cell_line <- as.character(all_obj$cell_line)
sezary_obj <- subset(all_obj, subset = cell_line %in% CELL_LINES)
rm(all_obj); gc()
used (Mb) gc trigger (Mb) max used (Mb)
Ncells 9046606 483.2 16164308 863.3 16164308 863.3
Vcells 1394581656 10639.9 3064212682 23378.1 2635168938 20104.8
cat(sprintf("Sézary cells loaded: %d total\n", ncol(sezary_obj)))
Sézary cells loaded: 40695 total
Per cell line:
print(table(sezary_obj$cell_line))
L1 L2 L3 L4 L5 L6 L7
5825 5935 6428 6006 6022 5148 5331
DefaultAssay(sezary_obj) <- "RNA"
# Shared genes with reference (confirmed ~36,601)
shared_genes <- intersect(rownames(sezary_obj), rownames(cd4_ref))
cat(sprintf("\nShared genes (query ∩ reference): %d\n", length(shared_genes)))
Shared genes (query ∩ reference): 36601
stopifnot("Fewer than 2000 shared genes — check objects" = length(shared_genes) >= 2000)
# ── Check percent.mt exists — needed for SCTransform vars.to.regress ──────
# If percent.mt is absent, SCT will run without regressing it (see per-line chunk)
HAS_PCT_MT <- "percent.mt" %in% colnames(sezary_obj@meta.data)
cat(sprintf("\npercent.mt in metadata: %s\n", HAS_PCT_MT))
percent.mt in metadata: TRUE
Per-Cell-Line SCT +
Projection
Rationale for per-cell-line SCT:
Each Sézary cell line has its own library size distribution and
technical variation. Running SCTransform on each line independently fits
a line-specific regression model, which produces better-calibrated
residuals for anchor finding. This gives
FindTransferAnchors more reliable anchors compared to
normalising all lines together, at the cost of one SCT run per line
(~30–60 s each).
# ── Step 7: Summary ───────────────────────────────────────────────────────
cat("\nPredicted state distribution:\n")
Predicted state distribution:
print(table(sezary_obj$predicted.state))
CD4 TCM CD4 TEM Treg
40628 66 1
cat("\nPredicted milestone distribution:\n")
Predicted milestone distribution:
print(table(sezary_obj$predicted.milestone))
M01 M02 M03 M05
1713 38893 86 3
cat("\nMST pseudotime range:", round(range(sezary_obj$mst_pseudotime_norm, na.rm=TRUE), 1), "\n")
MST pseudotime range: 8.3 99.6
cat("\n✅ Projection complete\n")
✅ Projection complete
cat(sprintf("✅ Anchors found: %d anchor pairs\n", nrow(anchors@anchors)))
✅ Anchors found: 8661 anchor pairs
cat(sprintf("Cells projected: %d\n", ncol(sezary_obj)))
Cells projected: 40695
cat(sprintf("Mean prediction score: %.3f (>0.5 = reliable, >0.7 = confident)\n",
mean(sezary_obj$predicted.state.score, na.rm=TRUE)))
Mean prediction score: 0.984 (>0.5 = reliable, >0.7 = confident)
cat(sprintf("Cells with score >0.5: %d (%.1f%%)\n",
sum(sezary_obj$predicted.state.score > 0.5, na.rm=TRUE),
100*mean(sezary_obj$predicted.state.score > 0.5, na.rm=TRUE)))
Cells with score >0.5: 40643 (99.9%)
cat(sprintf("Cells with score >0.7: %d (%.1f%%)\n",
sum(sezary_obj$predicted.state.score > 0.7, na.rm=TRUE),
100*mean(sezary_obj$predicted.state.score > 0.7, na.rm=TRUE)))
Cells with score >0.7: 40207 (98.8%)
cat(sprintf("Cells with score >0.5: %d (%.1f%%)\n",
sum(sezary_obj$predicted.milestone.score > 0.5, na.rm=TRUE),
100*mean(sezary_obj$predicted.milestone.score > 0.5, na.rm=TRUE)))
Cells with score >0.5: 40388 (99.2%)
cat(sprintf("Cells with score >0.7: %d (%.1f%%)\n",
sum(sezary_obj$predicted.milestone.score > 0.7, na.rm=TRUE),
100*mean(sezary_obj$predicted.milestone.score > 0.7, na.rm=TRUE)))
Cells with score >0.7: 34183 (84.0%)
cat(sprintf("Total cells: %d\n", nrow(query_umap_df)))
Total cells: 40695
Per cell line:
print(table(query_umap_df$cell_line))
L1 L2 L3 L4 L5 L6 L7
5825 5935 6428 6006 6022 5148 5331
cat("\nMilestone distribution:\n")
Milestone distribution:
print(table(query_umap_df$predicted_milestone, useNA="ifany"))
M01 M02 M03 M05
1713 38893 86 3
transfer_summary <- query_umap_df %>%
group_by(cell_line) %>%
summarise(
n_cells = n(),
mst_pt_mean = round(mean(mst_pt, na.rm=TRUE), 1),
mst_pt_med = round(median(mst_pt, na.rm=TRUE), 1),
m3_pt_mean = round(mean(m3_pt, na.rm=TRUE), 1),
m3_pt_med = round(median(m3_pt, na.rm=TRUE), 1),
top_milestone = names(sort(table(predicted_milestone), decreasing=TRUE))[1],
.groups = "drop"
)
ms_wide <- query_umap_df %>%
filter(!is.na(predicted_milestone)) %>%
count(cell_line, predicted_milestone) %>%
group_by(cell_line) %>%
mutate(pct = round(100 * n / sum(n), 1)) %>%
select(cell_line, predicted_milestone, pct) %>%
pivot_wider(names_from=predicted_milestone, values_from=pct, values_fill=0)
for (m in MILESTONE_ORDER) {
if (!m %in% colnames(ms_wide)) ms_wide[[m]] <- 0
}
ms_wide <- ms_wide[, c("cell_line", MILESTONE_ORDER)]
full_summary <- left_join(transfer_summary, ms_wide, by="cell_line")
kable(full_summary,
caption = "Sézary projection summary — pseudotime and milestone composition",
digits = 1) %>%
kable_styling(bootstrap_options=c("striped","hover","condensed"),
full_width=FALSE) %>%
add_header_above(c(" "=ncol(transfer_summary),
"% cells per milestone"=length(MILESTONE_ORDER)))
Sézary projection summary — pseudotime and milestone composition
|
% cells per milestone |
| cell_line |
n_cells |
mst_pt_mean |
mst_pt_med |
m3_pt_mean |
m3_pt_med |
top_milestone |
M00 |
M01 |
M02 |
M03 |
M04 |
M05 |
M06 |
| L1 |
5825 |
57.5 |
55.7 |
68.6 |
66.1 |
M02 |
0 |
6.4 |
92.1 |
1.5 |
0 |
0.1 |
0 |
| L2 |
5935 |
52.7 |
54.5 |
57.2 |
58.9 |
M02 |
0 |
4.7 |
95.3 |
0.0 |
0 |
0.0 |
0 |
| L3 |
6428 |
52.1 |
53.5 |
61.0 |
60.7 |
M02 |
0 |
6.9 |
93.1 |
0.0 |
0 |
0.0 |
0 |
| L4 |
6006 |
56.7 |
56.7 |
63.7 |
60.6 |
M02 |
0 |
1.9 |
98.1 |
0.0 |
0 |
0.0 |
0 |
| L5 |
6022 |
51.8 |
52.8 |
57.4 |
58.4 |
M02 |
0 |
5.5 |
94.5 |
0.0 |
0 |
0.0 |
0 |
| L6 |
5148 |
58.3 |
57.1 |
62.8 |
60.7 |
M02 |
0 |
2.1 |
97.9 |
0.0 |
0 |
0.0 |
0 |
| L7 |
5331 |
55.5 |
55.8 |
62.8 |
60.0 |
M02 |
0 |
1.3 |
98.7 |
0.0 |
0 |
0.0 |
0 |
Save Outputs
dir.create("Objects", showWarnings=FALSE)
dir.create("Figures", showWarnings=FALSE)
dir.create("Tables", showWarnings=FALSE)
SaveSeuratRds(sezary_obj, "Objects/sezary_projected.rds")
write.csv(full_summary, "Tables/projection_summary_per_line.csv", row.names=FALSE)
write.csv(ms_comp, "Tables/milestone_composition_per_line.csv", row.names=FALSE)
ggsave("Figures/fig_all_lines_umap.pdf", p_all_lines, width=14, height=10)
ggsave("Figures/fig_violin_mst.pdf", p_violin_mst, width=10, height=5)
ggsave("Figures/fig_violin_m3.pdf", p_violin_m3, width=10, height=5)
ggsave("Figures/fig_milestone_composition.pdf", p_ms_bar, width=11, height=5)
ggsave("Figures/fig_pt_correlation.pdf", p_pt_cor, width=12, height=8)
cat("✅ All outputs saved\n")
R version 4.5.2 (2025-10-31)
Platform: x86_64-redhat-linux-gnu
Running under: Rocky Linux 9.7 (Blue Onyx)
Matrix products: default
BLAS/LAPACK: FlexiBLAS OPENBLAS-OPENMP; LAPACK version 3.9.0
locale:
[1] LC_CTYPE=C.UTF-8 LC_NUMERIC=C LC_TIME=C.UTF-8 LC_COLLATE=C.UTF-8 LC_MONETARY=C.UTF-8 LC_MESSAGES=C.UTF-8
[7] LC_PAPER=C.UTF-8 LC_NAME=C LC_ADDRESS=C LC_TELEPHONE=C LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C
time zone: Europe/Paris
tzcode source: system (glibc)
attached base packages:
[1] stats4 stats graphics grDevices utils datasets methods base
other attached packages:
[1] future_1.69.0 kableExtra_1.4.0 knitr_1.51 RColorBrewer_1.1-3 scales_1.4.0
[6] viridis_0.6.5 viridisLite_0.4.3 patchwork_1.3.2 tidyr_1.3.2 dplyr_1.2.0
[11] ggrepel_0.9.6 ggplot2_4.0.2 igraph_2.2.2 monocle3_1.4.26 SingleCellExperiment_1.32.0
[16] SummarizedExperiment_1.40.0 GenomicRanges_1.62.1 Seqinfo_1.0.0 IRanges_2.44.0 S4Vectors_0.48.0
[21] MatrixGenerics_1.22.0 matrixStats_1.5.0 Biobase_2.70.0 BiocGenerics_0.56.0 generics_0.1.4
[26] SeuratWrappers_0.4.0 Seurat_5.4.0 SeuratObject_5.3.0 sp_2.2-1
loaded via a namespace (and not attached):
[1] RcppAnnoy_0.0.23 splines_4.5.2 later_1.4.6 tibble_3.3.1 R.oo_1.27.1 polyclip_1.10-7
[7] fastDummies_1.7.5 lifecycle_1.0.5 Rdpack_2.6.6 globals_0.19.0 lattice_0.22-9 MASS_7.3-65
[13] magrittr_2.0.4 plotly_4.12.0 sass_0.4.10 rmarkdown_2.30 jquerylib_0.1.4 yaml_2.3.12
[19] remotes_2.5.0 httpuv_1.6.16 otel_0.2.0 glmGamPoi_1.22.0 sctransform_0.4.3 spam_2.11-3
[25] spatstat.sparse_3.1-0 reticulate_1.45.0 cowplot_1.2.0 pbapply_1.7-4 minqa_1.2.8 abind_1.4-8
[31] Rtsne_0.17 purrr_1.2.1 R.utils_2.13.0 irlba_2.3.7 listenv_0.10.0 spatstat.utils_3.2-1
[37] goftest_1.2-3 RSpectra_0.16-2 spatstat.random_3.4-4 fitdistrplus_1.2-6 parallelly_1.46.1 DelayedMatrixStats_1.32.0
[43] svglite_2.2.2 codetools_0.2-20 DelayedArray_0.36.0 xml2_1.5.2 tidyselect_1.2.1 farver_2.1.2
[49] lme4_1.1-38 spatstat.explore_3.7-0 jsonlite_2.0.0 progressr_0.18.0 ggridges_0.5.7 survival_3.8-6
[55] systemfonts_1.3.1 tools_4.5.2 ragg_1.5.0 ica_1.0-3 Rcpp_1.1.1 glue_1.8.0
[61] gridExtra_2.3 SparseArray_1.10.8 xfun_0.56 withr_3.0.2 BiocManager_1.30.27 fastmap_1.2.0
[67] boot_1.3-32 digest_0.6.39 rsvd_1.0.5 R6_2.6.1 mime_0.13 textshaping_1.0.4
[73] scattermore_1.2 tensor_1.5.1 dichromat_2.0-0.1 spatstat.data_3.1-9 R.methodsS3_1.8.2 utf8_1.2.6
[79] data.table_1.18.2.1 httr_1.4.8 htmlwidgets_1.6.4 S4Arrays_1.10.1 uwot_0.2.4 pkgconfig_2.0.3
[85] gtable_0.3.6 rsconnect_1.7.0 lmtest_0.9-40 S7_0.2.1 XVector_0.50.0 htmltools_0.5.9
[91] dotCall64_1.2 png_0.1-8 spatstat.univar_3.1-6 reformulas_0.4.4 rstudioapi_0.18.0 tzdb_0.5.0
[97] reshape2_1.4.5 nlme_3.1-168 nloptr_2.2.1 zoo_1.8-15 cachem_1.1.0 stringr_1.6.0
[103] KernSmooth_2.23-26 parallel_4.5.2 miniUI_0.1.2 pillar_1.11.1 grid_4.5.2 vctrs_0.7.1
[109] RANN_2.6.2 promises_1.5.0 beachmat_2.26.0 xtable_1.8-4 cluster_2.1.8.2 evaluate_1.0.5
[115] readr_2.1.6 cli_3.6.5 compiler_4.5.2 rlang_1.1.7 future.apply_1.20.1 labeling_0.4.3
[121] fs_1.6.6 plyr_1.8.9 stringi_1.8.7 deldir_2.0-4 lazyeval_0.2.2 spatstat.geom_3.7-0
[127] Matrix_1.7-4 RcppHNSW_0.6.0 hms_1.1.4 sparseMatrixStats_1.22.0 shiny_1.12.1 rbibutils_2.4.1
[133] ROCR_1.0-12 bslib_0.10.0
---
title: "Step 2: Sézary Cell Line Projection onto CD4 Reference Trajectory"
subtitle: "Per-cell-line SCT | MST + Monocle3 pseudotime transfer | 7-milestone alignment"
author: "NASIR MAHMOOD ABBASI"
date: "`r Sys.Date()`"
output:
  html_notebook:
    number_sections: true
    toc: true
    toc_float:
      collapsed: false
    theme: journal
    highlight: tango
  html_document:
    toc: true
    df_print: paged
    number_sections: true
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo=TRUE, message=FALSE, warning=FALSE,
                      fig.align="center", dpi=150)
```

------------------------------------------------------------------------

# Overview

**What this script does — nothing is re-computed from scratch.**

- Loads the already-saved reference trajectory objects from Step 1
- Loads Sézary cell lines L1–L7
- Runs **per-cell-line SCT normalisation** before `FindTransferAnchors`
  → each line gets its own SCT model → more biologically meaningful anchors
- Projects each line individually onto the frozen reference UMAP
- Transfers both **MST pseudotime** and **Monocle3 pseudotime** from reference
- Transfers **milestone labels** (M00–M06) so positions can be compared with
  the 7-milestone structure confirmed in Step 1
- Produces visualisations identical in style to previous results so you can
  directly compare this run (7 milestones) with the previous run

**Reference milestone structure (already confirmed in Step 1):**
```
M00 = CD4 Naive      (2037)
M01 = CD4 TCM early  (4717)
M02 = CD4 TCM late   (4350)
M03 = CD4 TEM        (145)
M04 = CD4 Temra/CTL  (10)
M05 = Treg resting   (146)
M06 = Treg effector  (61)
```

------------------------------------------------------------------------

# Setup

```{r libraries}
suppressPackageStartupMessages({
  library(Seurat)
  library(SeuratWrappers)
  library(monocle3)
  library(igraph)
  library(ggplot2)
  library(ggrepel)
  library(dplyr)
  library(tidyr)
  library(patchwork)
  library(viridis)
  library(scales)
  library(RColorBrewer)
  library(knitr)
  library(kableExtra)
})

# ── Must match Step 1 exactly ──────────────────────────────────────────────
STATE_COLORS <- c(
  "CD4 Naive"     = "#4472C4",
  "CD4 TCM"       = "#70AD47",
  "CD4 TEM"       = "#ED7D31",
  "CD4 Temra/CTL" = "#C00000",
  "Treg"          = "#7030A0"
)

CELL_LINE_COLORS <- setNames(
  colorRampPalette(brewer.pal(8,"Dark2"))(7),
  paste0("L", 1:7)
)

MILESTONE_COLORS <- setNames(
  c("#4472C4",            # M00 Naive
    "#70AD47","#A9D18E",  # M01/M02 TCM early/late
    "#ED7D31",            # M03 TEM
    "#C00000",            # M04 Temra
    "#7030A0","#B4A0D4"), # M05/M06 Treg
  sprintf("M%02d", 0:6)
)

MILESTONE_LABELS <- c(
  M00 = "M00 Naive",
  M01 = "M01 TCM(early)",
  M02 = "M02 TCM(late)",
  M03 = "M03 TEM",
  M04 = "M04 Temra",
  M05 = "M05 Treg(rest)",
  M06 = "M06 Treg(eff)"
)

STATE_COL  <- "predicted.celltype.l2"
UMAP_NAME  <- "umap"
CELL_LINES <- paste0("L", 1:7)

MILESTONE_ORDER <- sprintf("M%02d", 0:6)

cat("✓ Setup complete\n")
```

------------------------------------------------------------------------

# Load Reference Objects (Step 1 outputs — not re-run)

```{r load-reference}
# ── Load the reference Seurat object with frozen UMAP model ───────────────
cd4_ref <- readRDS("../../1-Custom_MST/Objects/cd4_ref_dual_trajectory.rds")

# ── Load MST graph and centroids ──────────────────────────────────────────
mst_graph           <- readRDS("../../1-Custom_MST/Objects/mst_graph.rds")
milestone_centroids <- readRDS("../../1-Custom_MST/Objects/milestone_centroids.rds")

# ── Validate milestone structure ──────────────────────────────────────────
expected_ms <- sprintf("M%02d", 0:6)
actual_ms   <- sort(unique(na.omit(cd4_ref@meta.data$milestone)))
stopifnot(
  "Milestone mismatch — re-run Step 1 first" = identical(actual_ms, expected_ms),
  "mst_pseudotime_norm missing"              = "mst_pseudotime_norm" %in% colnames(cd4_ref@meta.data),
  "monocle3_pseudotime_norm missing"         = "monocle3_pseudotime_norm" %in% colnames(cd4_ref@meta.data),
  "UMAP model missing from cd4_ref"          = !is.null(cd4_ref@reductions[[UMAP_NAME]]@misc$model)
)

cat("✅ Reference objects loaded and validated\n")
cat(sprintf("   %d reference cells | %d milestones\n", ncol(cd4_ref), length(actual_ms)))
cat(sprintf("   MST: %d nodes, %d edges\n", vcount(mst_graph), ecount(mst_graph)))
cat("\nMilestone structure from Step 1:\n")
print(milestone_centroids[, c("milestone","state","n_cells")])

# ── Build reference UMAP data frame for background plotting ───────────────
ref_umap_df           <- as.data.frame(Embeddings(cd4_ref, UMAP_NAME))
colnames(ref_umap_df) <- c("UMAP_1","UMAP_2")
ref_umap_df$state     <- cd4_ref@meta.data[[STATE_COL]]
ref_umap_df$milestone <- cd4_ref@meta.data$milestone
ref_umap_df$mst_pt    <- cd4_ref@meta.data$mst_pseudotime_norm
ref_umap_df$m3_pt     <- cd4_ref@meta.data$monocle3_pseudotime_norm

# ── MST edge data frame for overlay ───────────────────────────────────────
mst_edges   <- as_edgelist(mst_graph)
mst_edge_df <- do.call(rbind, lapply(seq_len(nrow(mst_edges)), function(i) {
  m1 <- mst_edges[i,1]; m2 <- mst_edges[i,2]
  r1 <- milestone_centroids[milestone_centroids$milestone == m1, ]
  r2 <- milestone_centroids[milestone_centroids$milestone == m2, ]
  data.frame(x1=r1$UMAP_1, y1=r1$UMAP_2, x2=r2$UMAP_1, y2=r2$UMAP_2)
}))
```

------------------------------------------------------------------------

# Load Sézary Cell Lines

```{r load-sezary}
all_obj <- readRDS("/home/nabbasi/apollo_home/1-Seurat_RDS_OBJECT_FINAL/All_samples_Merged_with_Renamed_Clusters_Cell_state-03-12-2025.rds.rds")

# ── cell_line column confirmed present (40,695 cells, L1–L7) ─────────────
all_obj$cell_line <- as.character(all_obj$cell_line)
sezary_obj <- subset(all_obj, subset = cell_line %in% CELL_LINES)
rm(all_obj); gc()

cat(sprintf("Sézary cells loaded: %d total\n", ncol(sezary_obj)))
cat("Per cell line:\n")
print(table(sezary_obj$cell_line))

DefaultAssay(sezary_obj) <- "RNA"

# Shared genes with reference (confirmed ~36,601)
shared_genes <- intersect(rownames(sezary_obj), rownames(cd4_ref))
cat(sprintf("\nShared genes (query ∩ reference): %d\n", length(shared_genes)))
stopifnot("Fewer than 2000 shared genes — check objects" = length(shared_genes) >= 2000)

# ── Check percent.mt exists — needed for SCTransform vars.to.regress ──────
# If percent.mt is absent, SCT will run without regressing it (see per-line chunk)
HAS_PCT_MT <- "percent.mt" %in% colnames(sezary_obj@meta.data)
cat(sprintf("\npercent.mt in metadata: %s\n", HAS_PCT_MT))
```

------------------------------------------------------------------------

# Per-Cell-Line SCT + Projection

**Rationale for per-cell-line SCT:**  
Each Sézary cell line has its own library size distribution and technical
variation. Running SCTransform on each line independently fits a line-specific
regression model, which produces better-calibrated residuals for anchor finding.
This gives `FindTransferAnchors` more reliable anchors compared to normalising
all lines together, at the cost of one SCT run per line (~30–60 s each).

```{r per-line-projection, results='hold'}

plan("sequential")
options(future.globals.maxSize = 200000 * 1024^2)  # 200 GB — effectively unlimited

# ── Step 1: Cell cycle scoring ────────────────────────────────────────────
DefaultAssay(sezary_obj) <- "RNA"
sezary_obj <- NormalizeData(sezary_obj, verbose=FALSE)
sezary_obj <- CellCycleScoring(
  sezary_obj,
  s.features   = cc.genes.updated.2019$s.genes,
  g2m.features = cc.genes.updated.2019$g2m.genes,
  set.ident    = FALSE
)
cat("Cell cycle phases:\n"); print(table(sezary_obj$Phase))

# ── Step 2: Per-line SCT ──────────────────────────────────────────────────
HAS_PCT_MT  <- "percent.mt" %in% colnames(sezary_obj@meta.data)
sezary_list <- SplitObject(sezary_obj, split.by = "cell_line")

sezary_list <- lapply(seq_along(sezary_list), function(i) {
  cat(sprintf("  SCT: %s (%d cells)\n",
              names(sezary_list)[i], ncol(sezary_list[[i]])))
  SCTransform(
    sezary_list[[i]],
    vst.flavor      = "v2",
    vars.to.regress = c("S.Score", "G2M.Score",
                        if (HAS_PCT_MT) "percent.mt" else NULL),
    verbose         = FALSE
  )
})

sezary_obj <- merge(sezary_list[[1]], sezary_list[-1], merge.data=TRUE)
cat(sprintf("Merged query: %d cells\n", ncol(sezary_obj)))

# ── Step 3: Shared features from reference integrated assay ───────────────
DefaultAssay(cd4_ref) <- "integrated"
ref_features          <- rownames(cd4_ref[["integrated"]]@scale.data)
query_genes           <- rownames(sezary_obj[["SCT"]])
shared_features       <- intersect(ref_features, query_genes)

junk_pattern          <- "^RPL|^RPS|^MT-|^HSP|^SNHG|^MALAT1|^NEAT1|^XIST|^TRBV|^TRAV|^TRGV|^TRDV|^HLA-"
shared_features_clean <- shared_features[!grepl(junk_pattern, shared_features)]
cat(sprintf("Shared features after cleaning: %d\n", length(shared_features_clean)))

VariableFeatures(sezary_obj) <- shared_features_clean
DefaultAssay(sezary_obj)     <- "SCT"

# ── Step 4: FindTransferAnchors ───────────────────────────────────────────
anchors <- FindTransferAnchors(
  reference            = cd4_ref,
  query                = sezary_obj,
  features             = shared_features_clean,
  normalization.method = "SCT",
  reference.reduction  = "pca",
  reduction            = "pcaproject",
  dims                 = 1:30,
  k.anchor             = 10,
  k.filter             = 500,
  k.score              = 30,
  verbose              = TRUE
)
cat(sprintf("Anchors found: %d\n", nrow(anchors@anchors)))

# ── Step 5: MapQuery ──────────────────────────────────────────────────────
sezary_obj <- MapQuery(
  anchorset           = anchors,
  query               = sezary_obj,
  reference           = cd4_ref,
  refdata             = list(
    milestone = cd4_ref@meta.data$milestone,
    state     = cd4_ref@meta.data[[STATE_COL]]
  ),
  reference.reduction = "pca",
  reduction.model     = UMAP_NAME,
  verbose             = FALSE
)
cat(sprintf("MapQuery complete: %d cells\n", ncol(sezary_obj)))

# ── Step 6: Transfer pseudotime ───────────────────────────────────────────
pt_mat           <- rbind(
  mst_pt = cd4_ref@meta.data$mst_pseudotime_norm,
  m3_pt  = cd4_ref@meta.data$monocle3_pseudotime_norm
)
colnames(pt_mat) <- colnames(cd4_ref)

wt_red <- if ("ref.pca" %in% names(sezary_obj@reductions)) "ref.pca" else "pcaproject"
pt_transfer <- TransferData(
  anchorset        = anchors,
  refdata          = pt_mat,
  weight.reduction = sezary_obj[[wt_red]],
  dims             = 1:30
)
pt_data <- GetAssayData(pt_transfer, layer="data")
cat("pt rownames:", rownames(pt_data), "\n")
sezary_obj$mst_pseudotime_norm      <- as.numeric(pt_data["mst-pt", ])
sezary_obj$monocle3_pseudotime_norm <- as.numeric(pt_data["m3-pt",  ])

# ── Step 7: Summary ───────────────────────────────────────────────────────
cat("\nPredicted state distribution:\n")
print(table(sezary_obj$predicted.state))
cat("\nPredicted milestone distribution:\n")
print(table(sezary_obj$predicted.milestone))
cat("\nMST pseudotime range:", round(range(sezary_obj$mst_pseudotime_norm, na.rm=TRUE), 1), "\n")
cat("\n✅ Projection complete\n")


cat(sprintf("✅ Anchors found: %d anchor pairs\n", nrow(anchors@anchors)))
cat(sprintf("Cells projected: %d\n", ncol(sezary_obj)))
cat(sprintf("Mean prediction score: %.3f (>0.5 = reliable, >0.7 = confident)\n",
            mean(sezary_obj$predicted.state.score, na.rm=TRUE)))
cat(sprintf("Cells with score >0.5: %d (%.1f%%)\n",
            sum(sezary_obj$predicted.state.score > 0.5, na.rm=TRUE),
            100*mean(sezary_obj$predicted.state.score > 0.5, na.rm=TRUE)))

cat(sprintf("Cells with score >0.7: %d (%.1f%%)\n",
            sum(sezary_obj$predicted.state.score > 0.7, na.rm=TRUE),
            100*mean(sezary_obj$predicted.state.score > 0.7, na.rm=TRUE)))


cat(sprintf("Cells with score >0.5: %d (%.1f%%)\n",
            sum(sezary_obj$predicted.milestone.score > 0.5, na.rm=TRUE),
            100*mean(sezary_obj$predicted.milestone.score > 0.5, na.rm=TRUE)))

cat(sprintf("Cells with score >0.7: %d (%.1f%%)\n",
            sum(sezary_obj$predicted.milestone.score > 0.7, na.rm=TRUE),
            100*mean(sezary_obj$predicted.milestone.score > 0.7, na.rm=TRUE)))

```



```{r merge-projected}
query_umap_df <- as.data.frame(Embeddings(sezary_obj, "ref.umap"))
colnames(query_umap_df) <- c("UMAP_1","UMAP_2")
query_umap_df$cell_line           <- sezary_obj$cell_line
query_umap_df$predicted_state     <- sezary_obj$predicted.state
query_umap_df$predicted_milestone <- sezary_obj$predicted.milestone
query_umap_df$mst_pt              <- sezary_obj$mst_pseudotime_norm
query_umap_df$m3_pt               <- sezary_obj$monocle3_pseudotime_norm

cat(sprintf("Total cells: %d\n", nrow(query_umap_df)))
cat("Per cell line:\n")
print(table(query_umap_df$cell_line))
cat("\nMilestone distribution:\n")
print(table(query_umap_df$predicted_milestone, useNA="ifany"))

```

---

```{r summary-table, fig.width=20, fig.height=8}
transfer_summary <- query_umap_df %>%
  group_by(cell_line) %>%
  summarise(
    n_cells       = n(),
    mst_pt_mean   = round(mean(mst_pt, na.rm=TRUE), 1),
    mst_pt_med    = round(median(mst_pt, na.rm=TRUE), 1),
    m3_pt_mean    = round(mean(m3_pt, na.rm=TRUE), 1),
    m3_pt_med     = round(median(m3_pt, na.rm=TRUE), 1),
    top_milestone = names(sort(table(predicted_milestone), decreasing=TRUE))[1],
    .groups       = "drop"
  )

ms_wide <- query_umap_df %>%
  filter(!is.na(predicted_milestone)) %>%
  count(cell_line, predicted_milestone) %>%
  group_by(cell_line) %>%
  mutate(pct = round(100 * n / sum(n), 1)) %>%
  select(cell_line, predicted_milestone, pct) %>%
  pivot_wider(names_from=predicted_milestone, values_from=pct, values_fill=0)

for (m in MILESTONE_ORDER) {
  if (!m %in% colnames(ms_wide)) ms_wide[[m]] <- 0
}
ms_wide      <- ms_wide[, c("cell_line", MILESTONE_ORDER)]
full_summary <- left_join(transfer_summary, ms_wide, by="cell_line")

kable(full_summary,
      caption = "Sézary projection summary — pseudotime and milestone composition",
      digits  = 1) %>%
  kable_styling(bootstrap_options=c("striped","hover","condensed"),
                full_width=FALSE) %>%
  add_header_above(c(" "=ncol(transfer_summary),
                     "% cells per milestone"=length(MILESTONE_ORDER)))
```


------------------------------------------------------------------------

# Visualisations {.tabset}

## Reference UMAP — milestones

```{r vis-ref-milestones, fig.width=10, fig.height=8}
p_ref_ms <- ggplot() +
  geom_point(data = ref_umap_df[sample(nrow(ref_umap_df)),],
             aes(x=UMAP_1, y=UMAP_2, colour=state), size=.4, alpha=.5) +
  geom_segment(data=mst_edge_df,
               aes(x=x1,y=y1,xend=x2,yend=y2),
               colour="black", linewidth=1, alpha=.85, lineend="round") +
  geom_point(data=milestone_centroids,
             aes(x=UMAP_1,y=UMAP_2,fill=state),
             shape=21, size=8, colour="white", stroke=2) +
  geom_text_repel(data=milestone_centroids,
                  aes(x=UMAP_1,y=UMAP_2,
                      label=paste0(milestone,"\n(",MILESTONE_LABELS[milestone],")")),
                  size=2.8, fontface="bold",
                  bg.color="grey80", bg.r=.15, max.overlaps=20) +
  scale_colour_manual(values=STATE_COLORS, name="State") +
  scale_fill_manual(values=STATE_COLORS, guide="none") +
  theme_classic() +
  labs(x="UMAP-1", y="UMAP-2",
       title="Reference CD4 T cells — 7-milestone structure",
       subtitle="M00=Naive | M01/M02=TCM | M03=TEM | M04=Temra | M05/M06=Treg") +
  theme(plot.title=element_text(size=13,face="bold"))

print(p_ref_ms)

```

```{r , fig.width=8, fig.height=6}
p_ref_ms <- ggplot() +
  geom_point(data = ref_umap_df[sample(nrow(ref_umap_df)),],
             aes(x=UMAP_1, y=UMAP_2, colour=state), size=.4, alpha=.5) +
  geom_segment(data=mst_edge_df,
               aes(x=x1,y=y1,xend=x2,yend=y2),
               colour="black", linewidth=1, alpha=.85, lineend="round") +
  geom_point(data=milestone_centroids,
             aes(x=UMAP_1,y=UMAP_2,fill=state),
             shape=21, size=8, colour="white", stroke=2) +
  geom_text_repel(data=milestone_centroids,
                  aes(x=UMAP_1,y=UMAP_2,
                      label=paste0(milestone,"\n(",MILESTONE_LABELS[milestone],")")),
                  size=2.8, fontface="bold",
                  bg.color="grey80", bg.r=.15, max.overlaps=20) +
  scale_colour_manual(values=STATE_COLORS, name="State") +
  scale_fill_manual(values=STATE_COLORS, guide="none") +
  theme_classic() +
  labs(x="UMAP-1", y="UMAP-2",
       title="Reference CD4 T cells — 7-milestone structure",
       subtitle="M00=Naive | M01/M02=TCM | M03=TEM | M04=Temra | M05/M06=Treg") +
  theme(plot.title=element_text(size=13,face="bold"))

print(p_ref_ms)

```




```{r, fig.width=8, fig.height=6}
p_all_lines <- ggplot() +
  # Reference background
  geom_point(data = ref_umap_df[sample(nrow(ref_umap_df)),],
             aes(x=UMAP_1, y=UMAP_2),
             colour="grey68", size=.25, alpha=.4) +
  # MST trajectory lines
  geom_segment(data = mst_edge_df,
               aes(x=x1, y=y1, xend=x2, yend=y2),
               colour="grey30", linewidth=1, alpha=.8, lineend="round") +
  # Sézary cells coloured by cell line
  geom_point(data = query_umap_df[sample(nrow(query_umap_df)),],
             aes(x=UMAP_1, y=UMAP_2, colour=cell_line),
             size=1.5, alpha=.75) +
  # Milestone labels using confirmed 7-milestone names
  geom_label_repel(data = milestone_centroids,
                   aes(x=UMAP_1, y=UMAP_2,
                       label=paste0(milestone, "\n",
                                    MILESTONE_LABELS[milestone])),
                   size=2.5, fontface="bold",
                   fill="white", colour="grey15",
                   label.size=0.2, max.overlaps=20,
                   box.padding=0.5) +
  scale_colour_manual(values=CELL_LINE_COLORS, name="Cell line") +
  guides(colour=guide_legend(override.aes=list(size=4, alpha=1))) +
  theme_classic() +
  labs(x="UMAP-1", y="UMAP-2",
       title="Sézary cell lines — projection onto healthy CD4 T cell reference",
       subtitle=sprintf("Grey = healthy reference (%d cells) | Coloured = Sézary L1–L7 (%d cells) | Lines = MST trajectory",
                        nrow(ref_umap_df), nrow(query_umap_df))) +
  theme(plot.title    = element_text(size=13, face="bold"),
        plot.subtitle = element_text(size=9,  colour="grey40"),
        legend.title  = element_text(size=11, face="bold"),
        legend.text   = element_text(size=10))

print(p_all_lines)

```

## All Lines — Position on Reference UMAP

```{r vis-all-lines-umap, fig.width=14, fig.height=9}
p_all_lines <- ggplot() +
  # Reference background (grey)
  geom_point(data=ref_umap_df,
             aes(x=UMAP_1,y=UMAP_2), colour="grey68", size=.25, alpha=.6) +
  # MST overlay
  geom_segment(data=mst_edge_df,
               aes(x=x1,y=y1,xend=x2,yend=y2),
               colour="grey30", linewidth=.7, alpha=.7, lineend="round") +
  # Query cells coloured by cell line
  geom_point(data=query_umap_df[sample(nrow(query_umap_df)),],
             aes(x=UMAP_1,y=UMAP_2,colour=cell_line), size=1.2, alpha=.75) +
  scale_colour_manual(values=CELL_LINE_COLORS, name="Cell line") +
  # Milestone labels
  geom_text_repel(data=milestone_centroids,
                  aes(x=UMAP_1,y=UMAP_2,label=milestone),
                  size=3, fontface="bold", colour="grey20",
                  bg.color="white", bg.r=.12, max.overlaps=15) +
  theme_classic() +
  labs(x="UMAP-1",y="UMAP-2",
       title="Sézary L1–L7 projected onto reference UMAP",
       subtitle="Grey = reference | Coloured = query cell lines") +
  theme(plot.title=element_text(size=13,face="bold")) +
  facet_wrap(~cell_line, ncol=4)

print(p_all_lines)

```

## All Lines — MST Pseudotime on Reference UMAP

```{r vis-pseudotime-mst, fig.width=14, fig.height=6}
# Panel 1: reference pseudotime
p_ref_pt <- ggplot(ref_umap_df[sample(nrow(ref_umap_df)),],
                    aes(x=UMAP_1,y=UMAP_2,colour=mst_pt)) +
  geom_point(size=.3, alpha=.7) +
  scale_colour_gradientn(
    colours=c("#0D0887","#6A00A8","#B12A90","#E16462","#FCA636","#F0F921"),
    name="MST pseudotime\n(0–100)", limits=c(0,100), na.value="grey80") +
  geom_segment(data=mst_edge_df,aes(x=x1,y=y1,xend=x2,yend=y2),
               colour="black",linewidth=.6,alpha=.6,inherit.aes=FALSE) +
  theme_classic() +
  labs(x="UMAP-1",y="UMAP-2",title="Reference — MST pseudotime") +
  theme(plot.title=element_text(size=12,face="bold"))

# Panel 2: query pseudotime (all lines combined)
p_query_pt <- ggplot() +
  geom_point(data=ref_umap_df, aes(x=UMAP_1,y=UMAP_2),
             colour="grey78", size=.2, alpha=.5) +
  geom_point(data=query_umap_df[sample(nrow(query_umap_df)),],
             aes(x=UMAP_1,y=UMAP_2,colour=mst_pt), size=1, alpha=.8) +
  scale_colour_gradientn(
    colours=c("#0D0887","#6A00A8","#B12A90","#E16462","#FCA636","#F0F921"),
    name="MST pseudotime\n(0–100)", limits=c(0,100), na.value="grey80") +
  geom_segment(data=mst_edge_df,aes(x=x1,y=y1,xend=x2,yend=y2),
               colour="grey30",linewidth=.6,alpha=.6,inherit.aes=FALSE) +
  theme_classic() +
  labs(x="UMAP-1",y="UMAP-2",title="Sézary L1–L7 — transferred MST pseudotime") +
  theme(plot.title=element_text(size=12,face="bold"))

print(p_ref_pt | p_query_pt)

```

## Per-Line MST Pseudotime Violin

```{r vis-violin-mst, fig.width=7, fig.height=6}
# Milestone median lines for reference
ms_medians <- cd4_ref@meta.data %>%
  group_by(milestone) %>%
  summarise(med = median(mst_pseudotime_norm, na.rm=TRUE), .groups="drop") %>%
  filter(milestone %in% c("M00","M01","M02","M03","M04","M05","M06"))

violin_df <- data.frame(
  cell_line = sezary_obj$cell_line,
  mst_pt    = sezary_obj$mst_pseudotime_norm,
  m3_pt     = sezary_obj$monocle3_pseudotime_norm,
  milestone = sezary_obj$predicted.milestone
) %>% filter(!is.na(mst_pt))

p_violin_mst <- ggplot(violin_df, aes(x=cell_line, y=mst_pt, fill=cell_line)) +
  geom_violin(scale="width", trim=FALSE, alpha=.85) +
  geom_boxplot(width=.08, fill="white", outlier.size=.3, outlier.alpha=.4) +
  # Reference milestone median lines
  geom_hline(data=ms_medians,
             aes(yintercept=med, linetype=milestone), colour="grey40", linewidth=.5) +
  geom_text(data=ms_medians,
          aes(x=7.6, y=med, label=milestone),
          size=2.8, hjust=0, colour="grey25",
          inherit.aes=FALSE) +
  scale_fill_manual(values=CELL_LINE_COLORS, guide="none") +
  scale_linetype_manual(values=rep("dashed",7), guide="none") +
  coord_cartesian(xlim=c(0.5,8.5)) +
  theme_classic() +
  labs(x="Cell line", y="MST pseudotime (0–100)",
       title="MST pseudotime per Sézary cell line",
       subtitle="Dashed lines = reference milestone medians (M00–M06)") +
  theme(plot.title=element_text(size=13,face="bold"))

print(p_violin_mst)
```

## Per-Line Monocle3 Pseudotime Violin

```{r vis-violin-m3, fig.width=7, fig.height=6}
# Reference milestone medians — Monocle3
ms_medians_m3 <- cd4_ref@meta.data %>%
  group_by(milestone) %>%
  summarise(med = median(monocle3_pseudotime_norm, na.rm=TRUE), .groups="drop")

p_violin_m3 <- ggplot(violin_df, aes(x=cell_line, y=m3_pt, fill=cell_line)) +
  geom_violin(scale="width", trim=FALSE, alpha=.85) +
  geom_boxplot(width=.08, fill="white", outlier.size=.3, outlier.alpha=.4) +
  geom_hline(data=ms_medians_m3,
             aes(yintercept=med, linetype=milestone), colour="grey40", linewidth=.5) +
  geom_text(data=ms_medians,
          aes(x=7.6, y=med, label=milestone),
          size=2.8, hjust=0, colour="grey25",
          inherit.aes=FALSE) +
  scale_fill_manual(values=CELL_LINE_COLORS, guide="none") +
  scale_linetype_manual(values=rep("dashed",7), guide="none") +
  coord_cartesian(xlim=c(0.5,8.5)) +
  theme_classic() +
  labs(x="Cell line", y="Monocle3 pseudotime (0–100)",
       title="Monocle3 pseudotime per Sézary cell line",
       subtitle="Dashed lines = reference milestone medians (M00–M06)") +
  theme(plot.title=element_text(size=13,face="bold"))

print(p_violin_m3)
```

## Milestone Composition per Cell Line

```{r vis-milestone-bar, fig.width=7, fig.height=4}
ms_comp <- violin_df %>%
  count(cell_line, milestone) %>%
  group_by(cell_line) %>%
  mutate(pct = 100 * n / sum(n)) %>%
  ungroup() %>%
  filter(!is.na(milestone)) %>%
  mutate(milestone = factor(milestone, levels=sprintf("M%02d",0:6)))

p_ms_bar <- ggplot(ms_comp, aes(x=cell_line, y=pct, fill=milestone)) +
  geom_col(width=.75) +
  scale_fill_manual(values=MILESTONE_COLORS,
                    labels=MILESTONE_LABELS, name="Milestone") +
  theme_classic() +
  labs(x="Cell line", y="% cells",
       title="Milestone composition per Sézary cell line",
       subtitle="Based on transferred milestone labels from reference") +
  theme(plot.title=element_text(size=13,face="bold"),
        legend.text=element_text(size=9))

print(p_ms_bar)
```

```{r milestone-heatmap, fig.width=10, fig.height=5}

# Milestone labels with biological annotation
MILESTONE_LABELS_FULL <- c(
  "M00" = "M00\nNaive",
  "M01" = "M01\nTCM early",
  "M02" = "M02\nTCM late\n(branch)",
  "M03" = "M03\nTEM",
  "M04" = "M04\nTemra",
  "M05" = "M05\nTreg(rest)",
  "M06" = "M06\nTreg(eff)"
)

# Compute milestone distribution per cell line
ms_per_line <- query_umap_df %>%
  filter(!is.na(predicted_milestone)) %>%
  count(cell_line, predicted_milestone) %>%
  group_by(cell_line) %>%
  mutate(pct = round(100 * n / sum(n), 1)) %>%
  ungroup() %>%
  mutate(predicted_milestone = factor(predicted_milestone,
                                      levels = MILESTONE_ORDER))

# ── Heatmap tile plot: cell line × milestone ──────────────────────────────
p_ms_heatmap <- ggplot(ms_per_line,
                        aes(x=predicted_milestone, y=cell_line, fill=pct)) +
  geom_tile(colour="white", linewidth=.5) +
  geom_text(aes(label=ifelse(pct >= 2, paste0(pct, "%"), "")),
            size=3.2, fontface="bold",
            colour=ifelse(ms_per_line$pct > 40, "white", "grey20")) +
  scale_fill_gradientn(
    colours = c("#F7FBFF","#DEEBF7","#9ECAE1","#4292C6","#08519C","#08306B"),
    name    = "% of cell\nline cells",
    limits  = c(0, 100)
  ) +
  scale_x_discrete(labels=MILESTONE_LABELS_FULL) +
  theme_classic() +
  labs(x="Reference milestone", y="Sézary cell line",
       title="Milestone distribution per Sézary cell line",
       subtitle="Darker = more cells | M02 = branch point between effector and regulatory arms") +
  theme(plot.title   = element_text(size=13, face="bold"),
        plot.subtitle= element_text(size=9,  colour="grey40"),
        axis.text.x  = element_text(size=9),
        axis.text.y  = element_text(size=11, face="bold"),
        legend.title = element_text(size=9))

print(p_ms_heatmap)

ggsave("Figures/fig_milestone_heatmap.pdf", p_ms_heatmap, width=10, height=5)
```




```{r milestone-stack, fig.width=8, fig.height=5}

# Colours per milestone — derived from state colours
ms_state_colors <- setNames(
  STATE_COLORS[milestone_centroids$state[
    match(MILESTONE_ORDER, milestone_centroids$milestone)]],
  MILESTONE_ORDER
)

p_ms_stack <- ggplot(ms_per_line,
                      aes(x=cell_line, y=pct, fill=predicted_milestone)) +
  geom_col(width=.75, colour="white", linewidth=.3) +
  geom_text(aes(label=ifelse(pct >= 5,
                             paste0(predicted_milestone, "\n", pct, "%"), "")),
            position=position_stack(vjust=0.5),
            size=2.8, colour="white", fontface="bold") +
  scale_fill_manual(values=ms_state_colors,
                    labels=MILESTONE_LABELS_FULL,
                    name="Milestone") +
  scale_y_continuous(expand=c(0,0), limits=c(0,105)) +
  theme_classic() +
  labs(x="Cell line", y="% of cells",
       title="Milestone composition per Sézary cell line",
       subtitle="Colour = CD4 state | Label shown if ≥ 5%") +
  theme(plot.title  = element_text(size=13, face="bold"),
        plot.subtitle = element_text(size=9, colour="grey40"),
        axis.text.x = element_text(size=12, face="bold"))

print(p_ms_stack)

ggsave("Figures/fig_milestone_stack.pdf", p_ms_stack, width=8, height=5)

```



Note: this chunk must come **after** the `milestone-heatmap` chunk since it uses `ms_per_line` and `MILESTONE_LABELS_FULL` defined there.



## MST vs Monocle3 Pseudotime per Line

```{r vis-pt-correlation, fig.width=12, fig.height=8}
# Scatter: MST vs Monocle3 per cell line — method agreement check
p_pt_cor <- ggplot(violin_df[sample(nrow(violin_df)),],
                    aes(x=mst_pt, y=m3_pt, colour=milestone)) +
  geom_point(size=.8, alpha=.6) +
  geom_abline(slope=1, intercept=0, linetype="dashed", colour="grey40") +
  scale_colour_manual(values=MILESTONE_COLORS,
                      labels=MILESTONE_LABELS, name="Milestone") +
  facet_wrap(~cell_line, ncol=4) +
  theme_classic() +
  labs(x="MST pseudotime (0–100)", y="Monocle3 pseudotime (0–100)",
       title="MST vs Monocle3 pseudotime per Sézary cell line",
       subtitle="Dashed = perfect agreement") +
  theme(plot.title=element_text(size=13,face="bold"),
        strip.text=element_text(face="bold"))

# Compute per-line Spearman rho
pt_cors <- violin_df %>%
  group_by(cell_line) %>%
  summarise(
    spearman = round(cor(mst_pt, m3_pt, method="spearman", use="complete.obs"), 3),
    n        = n(),
    .groups  = "drop"
  )

print(p_pt_cor)
cat("\nPer-line MST vs Monocle3 Spearman ρ:\n")
print(pt_cors)

```

## Per-Cell-Line Individual UMAPs

```{r vis-per-line-individual, fig.width=6, fig.height=4}
# Individual UMAP + pseudotime for each line — same style as previous results
pt_gradient <- scale_colour_gradientn(
  colours = c("#0D0887","#6A00A8","#B12A90","#E16462","#FCA636","#F0F921"),
  name    = "MST pseudotime\n(0–100)", limits = c(0,100), na.value = "grey80"
)

for (ln in CELL_LINES) {
  q_df <- query_umap_df %>% filter(cell_line == ln)

  p <- ggplot() +
    geom_point(data=ref_umap_df, aes(x=UMAP_1,y=UMAP_2),
               colour="grey78", size=.25, alpha=.5) +
    geom_segment(data=mst_edge_df,
                 aes(x=x1,y=y1,xend=x2,yend=y2),
                 colour="grey30", linewidth=.7, alpha=.7, lineend="round") +
    geom_point(data=q_df[sample(nrow(q_df)),],
               aes(x=UMAP_1,y=UMAP_2,colour=mst_pt), size=1.5, alpha=.85) +
    pt_gradient +
    # Milestone labels
    geom_text_repel(data=milestone_centroids,
                    aes(x=UMAP_1,y=UMAP_2,label=milestone),
                    size=2.8, fontface="bold", colour="grey20",
                    bg.color="white", bg.r=.1, max.overlaps=15) +
    theme_classic() +
    labs(x="UMAP-1",y="UMAP-2",
         title=sprintf("%s — MST pseudotime on reference UMAP", ln),
         subtitle=sprintf("n=%d cells | MST median=%.1f",
                 nrow(q_df),
                 median(q_df$mst_pt, na.rm=TRUE)))  +
    theme(plot.title=element_text(size=12,face="bold"))

  print(p)
}

```

------------------------------------------------------------------------

# Pseudotime Summary Table

```{r}
# Full per-line summary including milestone breakdown
ms_wide <- ms_comp %>%
  select(cell_line, milestone, pct) %>%
  pivot_wider(names_from=milestone, values_from=pct, values_fill=0) %>%
  mutate(across(where(is.numeric), ~round(., 1)))

full_summary <- transfer_summary %>%
  left_join(ms_wide, by="cell_line")

kable(full_summary,
      caption = "Sézary cell line projection summary — pseudotime and milestone composition (%)",
      digits  = 1) %>%
  kable_styling(bootstrap_options=c("striped","hover","condensed"), full_width=FALSE) %>%
  add_header_above(c(" "=ncol(transfer_summary),
                     "% cells per milestone"=ncol(ms_wide)-1))
```

------------------------------------------------------------------------

# Save Outputs

```{r save-outputs}
dir.create("Objects", showWarnings=FALSE)
dir.create("Figures", showWarnings=FALSE)
dir.create("Tables",  showWarnings=FALSE)

SaveSeuratRds(sezary_obj, "Objects/sezary_projected.rds")

write.csv(full_summary, "Tables/projection_summary_per_line.csv",   row.names=FALSE)
write.csv(ms_comp,      "Tables/milestone_composition_per_line.csv", row.names=FALSE)

ggsave("Figures/fig_all_lines_umap.pdf",        p_all_lines,  width=14, height=10)
ggsave("Figures/fig_violin_mst.pdf",            p_violin_mst, width=10, height=5)
ggsave("Figures/fig_violin_m3.pdf",             p_violin_m3,  width=10, height=5)
ggsave("Figures/fig_milestone_composition.pdf", p_ms_bar,     width=11, height=5)
ggsave("Figures/fig_pt_correlation.pdf",        p_pt_cor,     width=12, height=8)

cat("✅ All outputs saved\n")
```

```{r session}

sessionInfo()

```
