1 Load Libraries

2 Load Objects

2.1 Load Reference (Healthy CD4+ T Cells with Trajectory)

# ── CRITICAL: use the ANNOTATED reference with saved UMAP model ────────────
# This is the cleaned reference object with:
#   - CD4 Proliferating cluster removed (MKI67+/TOP2A+/CDK1+ cells)
#   - Cluster 12 contamination removed
#   - UMAP reduction with return.model = TRUE
#   - monocle3_pseudotime column computed on the clean trajectory
#   - cell_type annotation (clusters 0-6, no proliferating)
#   - predicted.celltype.l2 (Azimuth l2)
# ──────────────────────────────────────────────────────────────────────────


reference_integrated <- readRDS("../Part3_Slingshot_Trajectory/part4_Slingshot+Projection/cd4_ref_dual_trajectory.rds")

cat("=== Reference object summary ===\n")
cat("Cells:", ncol(reference_integrated), "\n")
cat("Assays:", paste(names(reference_integrated@assays), collapse = ", "), "\n")
cat("Reductions:", paste(names(reference_integrated@reductions), collapse = ", "), "\n")
cat("UMAP model present:", !is.null(reference_integrated@reductions$umap@misc$model), "\n")

# ── Confirm clusters present ───────────────────────────────────────────────
cat("\nClusters present:", paste(sort(unique(reference_integrated$seurat_clusters)),
                                 collapse = ", "), "\n")

# ── Confirm CD4 Proliferating cells are ABSENT ────────────────────────────
# Check 1: cell_type label should not contain "Prolif"
if (any(grepl("Prolif|prolif|cycling|Cycling", reference_integrated$cell_type,
              ignore.case = TRUE))) {
  warning("⚠️  Proliferating cell_type labels still detected! Check removal step.")
} else {
  cat("✅ No proliferating cell_type labels found\n")
}

# Check 2: Molecular signature — MKI67/TOP2A expression should be near zero
prolif_markers <- c("MKI67", "TOP2A", "PCNA", "CDK1", "STMN1")
prolif_present <- intersect(prolif_markers, rownames(reference_integrated))

if (length(prolif_present) > 0) {
  DefaultAssay(reference_integrated) <- "SCT"
  prolif_expr <- Matrix::colMeans(
    GetAssayData(reference_integrated, layer = "data")[prolif_present, , drop = FALSE]
  )
  cat("Proliferation marker score — max:", round(max(prolif_expr), 3),
      " | mean:", round(mean(prolif_expr), 4), "\n")
  cat("Cells with any prolif marker > 0.5:", sum(prolif_expr > 0.5),
      "(expect near zero after removal)\n")
  if (sum(prolif_expr > 0.5) > 50) {
    warning("⚠️  Substantial proliferation signal remains — verify removal was complete.")
  } else {
    cat("✅ Proliferating cells confirmed removed\n")
  }
}

# ── Cell type distribution ─────────────────────────────────────────────────
cat("\nCell type distribution (should be 6 types, no Proliferating):\n")
print(table(reference_integrated$cell_type))

cat("\nAzimuth l2 distribution:\n")
print(table(reference_integrated$predicted.celltype.l2))

# ── Finalise Azimuth l2 colour palette from actual labels in data ──────────
ref_l2_labels <- unique(as.character(reference_integrated$predicted.celltype.l2))
ref_l2_labels <- ref_l2_labels[!is.na(ref_l2_labels)]
azimuth_l2_colors <- extend_palette(azimuth_l2_colors, ref_l2_labels)
cat("\nAzimuth l2 colour palette covers", length(ref_l2_labels), "reference labels ✅\n")

# NOTE: monocle3_pseudotime summary is printed AFTER trajectory is built
# (Section 2 below). If this is a freshly loaded reference that already
# has pseudotime stored, the validate-pseudotime chunk will summarise it.

# ── Validate: UMAP model MUST exist for MapQuery ──────────────────────────
if (is.null(reference_integrated@reductions$umap@misc$model)) {
  stop(
    "❌ UMAP model missing! Re-run on the cleaned (no-prolif) object:\n",
    "  reference_integrated <- RunUMAP(reference_integrated, dims = 1:20,\n",
    "                                   return.model = TRUE)\n",
    "Then re-save and reload."
  )
} else {
  cat("✅ UMAP model confirmed — ready for MapQuery projection\n")
}

2.2 Load & Prepare Malignant CD4+ T Cells — Per-Cell-Line SCTransform

# ── Load merged object ─────────────────────────────────────────────────────
All_samples_Merged <- readRDS(
  "/home/nabbasi/apollo_home/1-Seurat_RDS_OBJECT_FINAL/All_samples_Merged_with_Renamed_Clusters_Cell_state-03-12-2025.rds.rds"
)

cat("All samples:\n")
print(table(All_samples_Merged$cell_line))

# ── Define group labels ────────────────────────────────────────────────────
All_samples_Merged$Group <- ifelse(
  All_samples_Merged$cell_line %in% paste0("L", 1:7),
  "MalignantCD4T",
  ifelse(
    All_samples_Merged$cell_line %in% c("CD4Tcells_lab", "CD4Tcells_10x_S1", "CD4Tcells_10x_S2"),
    "NormalCD4T",
    "Other"
  )
)

cat("\nGroup distribution:\n")
print(table(All_samples_Merged$Group))

# ── Subset malignant cells ─────────────────────────────────────────────────
MalignantCD4T_raw <- subset(All_samples_Merged, subset = Group == "MalignantCD4T")
cat("\nMalignant CD4T cells:", ncol(MalignantCD4T_raw), "\n")
cat("Cell lines present:\n")
print(table(MalignantCD4T_raw$cell_line))

rm(All_samples_Merged)
gc()
# ══════════════════════════════════════════════════════════════════════════
# PER-CELL-LINE SCTransform
# ══════════════════════════════════════════════════════════════════════════
# WHY: Running SCTransform on the merged object creates a single shared
#      size-factor model that collapses batch variation with biology.
#      Each cell line is a separate library (different sequencing depth,
#      different ambient RNA profile). Per-line SCT corrects for this.
#
# APPROACH:
#   1. Split the malignant object by cell_line
#   2. Run SCTransform on each line independently
#   3. Merge back into one object (NOT integrate — integration is not
#      needed here because MapQuery uses reference anchors anyway)
#   4. Set VariableFeatures to the union of all per-line HVGs
#      (capped at 3000, ranked by cross-line frequency)
# ══════════════════════════════════════════════════════════════════════════

cat("=== Running per-cell-line SCTransform ===\n")
cat("This may take 5-15 minutes depending on cell count per line.\n\n")

# ── Step 1: Split by cell line ────────────────────────────────────────────
cell_line_list <- SplitObject(MalignantCD4T_raw, split.by = "cell_line")
cat("Cell lines to process:", paste(names(cell_line_list), collapse = ", "), "\n\n")

# ── Step 2: SCTransform each line independently ───────────────────────────
# vars.to.regress: remove cell-cycle and mitochondrial confounders
# These are biological covariates that would otherwise distort the
# differentiation signal in the trajectory projection.

cell_line_list <- lapply(names(cell_line_list), function(line_name) {
  obj <- cell_line_list[[line_name]]
  cat("Processing cell line:", line_name,
      "| Cells:", ncol(obj), "\n")

  # ── Compute % mitochondrial if not present ──────────────────────────────
  if (!"percent.mt" %in% colnames(obj@meta.data)) {
    obj[["percent.mt"]] <- PercentageFeatureSet(obj, pattern = "^MT-")
  }

  # ── Cell cycle scoring (for regression) ────────────────────────────────
  # Uses Seurat's built-in cc.genes (Tirosh et al.)
  # Regressing S.Score and G2M.Score removes cycling artefacts from the
  # SCT residuals without removing cycling cells from the object.
  tryCatch({
    obj <- CellCycleScoring(
      obj,
      s.features   = cc.genes$s.genes,
      g2m.features = cc.genes$g2m.genes,
      set.ident    = FALSE
    )
    vars_regress <- c("percent.mt", "S.Score", "G2M.Score")
    cat("  → Cell cycle scores computed for", line_name, "\n")
  }, error = function(e) {
    # If cell cycle genes not found (very small object), skip regression
    cat("  ⚠️  Cell cycle scoring failed for", line_name,
        "— regressing percent.mt only\n")
    vars_regress <<- "percent.mt"
  })

  # ── SCTransform ─────────────────────────────────────────────────────────
  obj <- SCTransform(
    obj,
    vars.to.regress  = vars_regress,
    variable.features.n = 3000,
    vst.flavor       = "v2",     # Pearson residuals v2 (faster, more stable)
    verbose          = FALSE
  )

  cat("  ✅", line_name, "SCT complete |",
      length(VariableFeatures(obj)), "HVGs\n")
  return(obj)
})
names(cell_line_list) <- names(SplitObject(MalignantCD4T_raw, split.by = "cell_line"))

# ── Step 3: Collect per-line HVGs ─────────────────────────────────────────
# Strategy: rank genes by how many cell lines they are variable in.
# Genes variable across multiple lines are robust HVGs not dominated
# by any single line's batch effect.

all_hvg_lists <- lapply(cell_line_list, VariableFeatures)
hvg_freq <- sort(table(unlist(all_hvg_lists)), decreasing = TRUE)

# Select top 3000 genes by cross-line frequency
n_hvgs      <- min(3000, length(hvg_freq))
shared_hvgs <- names(hvg_freq)[1:n_hvgs]

cat("\nTop 3000 HVGs selected by cross-line frequency\n")
cat("Genes variable in all", length(cell_line_list), "lines:",
    sum(hvg_freq == length(cell_line_list)), "\n")
cat("Genes variable in ≥3 lines:",
    sum(hvg_freq >= 3), "\n")

# ── Step 4: Merge SCT-normalised cell line objects ─────────────────────────
# merge() on SCT objects concatenates the corrected count matrices.
# We do NOT RunIntegration here — MapQuery will handle cross-dataset
# correction via anchor-based transfer.
cat("\nMerging per-line SCT objects...\n")

MalignantCD4T <- merge(
  x    = cell_line_list[[1]],
  y    = cell_line_list[-1],
  merge.data = TRUE    # concatenate SCT data layers
)

# ── Step 5: Set unified VariableFeatures on merged object ─────────────────
VariableFeatures(MalignantCD4T) <- shared_hvgs

# ── Step 6: Run PCA on merged SCT object ──────────────────────────────────
# PCA is needed for FindTransferAnchors (reference.reduction = "pca")
# We run it on the merged data so each cell has a PCA embedding consistent
# with the shared HVG space.
cat("Running PCA on merged malignant object...\n")

# Note: JoinLayers() is Seurat v5 only and not needed here.
# In Seurat v4, merge() with merge.data = TRUE already concatenates
# the SCT data slot directly — no layer joining required.

MalignantCD4T <- ScaleData(
  MalignantCD4T,
  features = shared_hvgs,
  assay    = "SCT",
  verbose  = FALSE
)
MalignantCD4T <- RunPCA(
  MalignantCD4T,
  features = shared_hvgs,
  assay    = "SCT",
  npcs     = 30,
  verbose  = FALSE
)

cat("✅ Per-cell-line SCT + merge complete\n")
cat("Final merged object:\n")
cat("  Cells:", ncol(MalignantCD4T), "\n")
cat("  VariableFeatures:", length(VariableFeatures(MalignantCD4T)), "\n")
cat("  PCA dims:", ncol(Embeddings(MalignantCD4T, "pca")), "\n")
cat("Cell line distribution:\n")
print(table(MalignantCD4T$cell_line))

# Clean up per-line list to free memory
rm(MalignantCD4T_raw, cell_line_list)
gc()

3 Reference Trajectory (Monocle3)

3.1 Build CDS from Reference

cat("CDS built:", ncol(cds), "cells\n")
CDS built: 11466 cells
cat("Clusters:", nlevels(factor(colData(cds)$seurat_clusters)), "\n")
Clusters: 8 
cat("Partitions:", nlevels(partitions(cds)), "\n")
Partitions: 0 
cat("\nCell type breakdown in CDS:\n")

Cell type breakdown in CDS:
print(table(colData(cds)$predicted.celltype.l2))

    CD4 Naive       CD4 TCM       CD4 TEM CD4 Temra/CTL          Treg 
         2037          9067           145            10           207 

3.2 Learn Trajectory Graph

cat("\n✅ Principal graph learned!\n")

✅ Principal graph learned!
cat("Nodes:", length(igraph::V(principal_graph(cds)$UMAP)), "\n")
Nodes: 349 
cat("Branch points:", sum(igraph::degree(principal_graph(cds)$UMAP) > 2), "\n")
Branch points: 18 
cat("Expect 1-2 branches (Treg divergence ± Tnaive-RTE)\n")
Expect 1-2 branches (Treg divergence ± Tnaive-RTE)
# ── VISUALIZE: Azimuth l2 (PRIMARY) + cell_type (SECONDARY) ───────────────
p1 <- plot_cells(cds,
                 color_cells_by = "predicted.celltype.l2",  # YOUR 5 labels
                 label_groups_by_cluster = FALSE,
                 label_leaves = TRUE,
                 label_branch_points = TRUE,
                 graph_label_size = 3.5,
                 cell_size = 0.8) +
  scale_color_manual(values = azimuth_l2_colors) +
  ggtitle("Monocle3 Graph: Azimuth l2 (CD4 Naive → Temra/CTL)") +
  theme(plot.title = element_text(hjust = 0.5, size = 14, face = "bold"))

p2 <- plot_cells(cds,
                 color_cells_by = "cell_type",
                 label_groups_by_cluster = FALSE,
                 label_leaves = TRUE,
                 label_branch_points = TRUE,
                 graph_label_size = 3.5,
                 cell_size = 0.8) +
  ggtitle("Monocle3 Graph: Cell Types") +
  theme(plot.title = element_text(hjust = 0.5, size = 14, face = "bold"))

p1 | p2

4 Fixed learn-graph Visualization

umap_coords <- as.data.frame(Embeddings(reference_integrated, "umap"))
colnames(umap_coords) <- c("UMAP1", "UMAP2")   # rename for clarity

umap_coords$l2        <- reference_integrated$predicted.celltype.l2
umap_coords$cell_type <- reference_integrated$cell_type

# Centroid per Azimuth l2 state
label_coords <- umap_coords %>%
  dplyr::group_by(l2) %>%
  dplyr::summarise(
    UMAP1 = median(UMAP1),
    UMAP2 = median(UMAP2),
    .groups = "drop"
  )

# Centroid per cell_type
label_coords2 <- umap_coords %>%
  dplyr::group_by(cell_type) %>%
  dplyr::summarise(
    UMAP1 = median(UMAP1),
    UMAP2 = median(UMAP2),
    .groups = "drop"
  )

library(ggrepel)

# Azimuth l2 plot
p1 <- plot_cells(
        cds,
        color_cells_by = "predicted.celltype.l2",
        label_groups_by_cluster = FALSE,
        label_leaves        = FALSE,
        label_branch_points = FALSE,
        cell_size           = 0.7,
        show_trajectory_graph = TRUE
      ) +
  scale_color_manual(values = azimuth_l2_colors, name = "State") +
  geom_label_repel(
    data = label_coords,
    aes(x = UMAP1, y = UMAP2, label = l2),
    size = 4, fontface = "bold",
    fill = "white", alpha = 0.85,
    box.padding = 0.5, label.padding = 0.3,
    segment.color = "grey40"
  ) +
  ggtitle("Monocle3 Graph: Azimuth l2 (CD4 Naive → Temra/CTL)") +
  theme(plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
        legend.position = "none")

# cell_type plot
p2 <- plot_cells(
        cds,
        color_cells_by = "cell_type",
        label_groups_by_cluster = FALSE,
        label_leaves        = FALSE,
        label_branch_points = FALSE,
        cell_size           = 0.7,
        show_trajectory_graph = TRUE
      ) +
  geom_label_repel(
    data = label_coords2,
    aes(x = UMAP1, y = UMAP2, label = cell_type),
    size = 3.5, fontface = "bold",
    fill = "white", alpha = 0.85,
    box.padding = 0.5,
    segment.color = "grey40"
  ) +
  ggtitle("Monocle3 Graph: Cell Types") +
  theme(plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
        legend.position = "none")

p1 | p2


p1 


p2

4.1 Set Root & Order Cells by Pseudotime

cat("\nPseudotime summary (finite values):\n")

Pseudotime summary (finite values):
print(summary(reference_integrated$monocle3_pseudotime[
  is.finite(reference_integrated$monocle3_pseudotime)
]))
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  0.000   9.653  17.619  17.918  26.763  33.713 
# ── Plot pseudotime on trajectory ─────────────────────────────────────────
plot_cells(
  cds,
  color_cells_by    = "pseudotime",
  label_cell_groups = FALSE,
  cell_size         = 0.8,
  show_trajectory_graph = TRUE
) +
  scale_color_viridis_c(option = "plasma", name = "Pseudotime") +
  ggtitle("Reference CD4+ T Cell Pseudotime\n(Root = Tnaive, Terminal = Temra/CTL)") +
  theme(plot.title = element_text(hjust = 0.5, size = 14, face = "bold"))


# ── Pseudotime on Seurat UMAP ─────────────────────────────────────────────
FeaturePlot(
  reference_integrated,
  features  = "monocle3_pseudotime",
  reduction = "umap",
  cols      = c("lightblue", "darkblue", "red"),
  label     = TRUE,
  pt.size   = 0.5
) +
  ggtitle("Reference UMAP — Pseudotime") +
  theme(plot.title = element_text(hjust = 0.5))

5 Visualization

# ── Identify root node in the naive cluster ────────────────────────────────
# Root = cells in the Tnaive cluster (CCR7+SELL+TCF7+) = cluster 0
# Using the principal graph node closest to the Tnaive centroid ensures
# pseudotime=0 is biologically anchored to the most undifferentiated state.

cat("Available Azimuth l2 labels:\n")
Available Azimuth l2 labels:
print(sort(unique(colData(cds)$predicted.celltype.l2)))
[1] CD4 Naive     CD4 TCM       CD4 TEM       CD4 Temra/CTL Treg         
Levels: CD4 Naive CD4 TCM CD4 TEM CD4 Temra/CTL Treg
# Find naive cells — use both Azimuth label and our cell_type annotation
naive_ids <- rownames(colData(cds))[
  grepl("naive|Naive|TN$", colData(cds)$predicted.celltype.l2, ignore.case = TRUE) |
  grepl("Tnaive", as.character(colData(cds)$cell_type), ignore.case = FALSE)
]

cat("\nNaive root cells identified:", length(naive_ids), "\n")

Naive root cells identified: 5739 
if (length(naive_ids) == 0) {
  stop("❌ No naive root cells found. Check label strings above and adjust grep patterns.")
}

# ── Function to find the principal graph node closest to naive centroid ────
get_root_node <- function(cds, cell_ids) {
  closest <- cds@principal_graph_aux$UMAP$pr_graph_cell_proj_closest_vertex
  closest <- as.matrix(closest[colnames(cds), ])
  igraph::V(principal_graph(cds)$UMAP)$name[
    as.numeric(names(which.max(table(closest[cell_ids, ]))))
  ]
}

root_node <- get_root_node(cds, naive_ids)
cat("Root node:", root_node, "\n")
Root node: Y_338 
# ── Order cells ────────────────────────────────────────────────────────────
cds <- order_cells(cds, root_pr_nodes = root_node)

# Transfer pseudotime back to Seurat object
reference_integrated$monocle3_pseudotime <- pseudotime(cds)

# Replace Inf (unreachable cells) with NA for clean plotting
reference_integrated$monocle3_pseudotime[
  !is.finite(reference_integrated$monocle3_pseudotime)
] <- NA

cat("\nPseudotime summary (finite values):\n")

Pseudotime summary (finite values):
print(summary(reference_integrated$monocle3_pseudotime[
  is.finite(reference_integrated$monocle3_pseudotime)
]))
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  0.000   9.653  17.619  17.918  26.763  33.713 
# ── Plot pseudotime on trajectory ─────────────────────────────────────────
plot_cells(
  cds,
  color_cells_by    = "pseudotime",
  label_cell_groups = FALSE,
  cell_size         = 0.8,
  show_trajectory_graph = TRUE
) +
  scale_color_viridis_c(option = "plasma", name = "Pseudotime") +
  ggtitle("Reference CD4+ T Cell Pseudotime\n(Root = Tnaive, Terminal = Temra/CTL)") +
  theme(plot.title = element_text(hjust = 0.5, size = 14, face = "bold"))


# ── Pseudotime on Seurat UMAP + state labels ─────────────────────────────
library(dplyr)
library(ggrepel)

umap_df <- as.data.frame(Embeddings(reference_integrated, "umap"))
colnames(umap_df) <- c("UMAP1", "UMAP2")
umap_df$pt <- reference_integrated$monocle3_pseudotime
umap_df$l2 <- reference_integrated$predicted.celltype.l2

umap_df <- umap_df[is.finite(umap_df$pt) & !is.na(umap_df$l2), ]

label_coords <- umap_df %>%
  group_by(l2) %>%
  summarise(
    UMAP1 = median(UMAP1),
    UMAP2 = median(UMAP2),
    .groups = "drop"
  )

p_umap <- FeaturePlot(
  reference_integrated,
  features  = "monocle3_pseudotime",
  reduction = "umap",
  cols      = c("lightblue", "green", "red"),
  label     = FALSE,
  pt.size   = 0.5
)

p_umap +
  geom_label_repel(
    data = label_coords,
    aes(x = UMAP1, y = UMAP2, label = l2),
    inherit.aes = FALSE,
    size = 4,
    fontface = "bold",
    fill = "white",
    alpha = 0.85,
    box.padding = 0.4,
    label.padding = 0.25,
    segment.color = "grey40"
  ) +
  ggtitle("Reference UMAP — Pseudotime with CD4 Naive / TCM / TEM / Temra/CTL / Treg labels") +
  theme(plot.title = element_text(hjust = 0.5))

# ── DEFINE MILESTONES: Key CD4 states along your trajectory ───────────────
# Based on your Azimuth l2 + pseudotime quantiles

library(dplyr)

# 1. Get state-specific pseudotime ranges
pt_ranges <- reference_integrated@meta.data %>%
  filter(is.finite(monocle3_pseudotime)) %>%
  group_by(predicted.celltype.l2) %>%
  summarise(
    n = n(),
    pt_min = round(min(monocle3_pseudotime), 2),
    pt_max = round(max(monocle3_pseudotime), 2),
    pt_median = round(median(monocle3_pseudotime), 2),
    .groups = "drop"
  ) %>%
  arrange(pt_median)

print("Pseudotime by state (milestone order):")
[1] "Pseudotime by state (milestone order):"
print(pt_ranges)

# 2. Define milestones (pseudotime quantiles + biology)
milestones <- data.frame(
  milestone = c("Naive", "Central_Memory", "Effector_Memory", "Terminal_Effector"),
  pseudotime = c(0.1, 0.8, 1.4, 2.0),  # Your trajectory quantiles
  state = c("CD4 Naive", "CD4 TCM", "CD4 TEM", "CD4 Temra/CTL")
)

print("Milestones defined:")
[1] "Milestones defined:"
print(milestones)

# 3. Assign cells to milestones — FIXED for YOUR 5 states
ref_milestones <- reference_integrated@meta.data %>%
  filter(is.finite(monocle3_pseudotime)) %>%
  mutate(
    milestone = case_when(
      # Naive: pt < 25th percentile
      monocle3_pseudotime <= quantile(monocle3_pseudotime, 0.25, na.rm = TRUE) ~ "Naive",
      
      # Central Memory (TCM): 25th-60th percentile  
      monocle3_pseudotime <= quantile(monocle3_pseudotime, 0.60, na.rm = TRUE) ~ "Central_Memory",
      
      # Treg + TEM: 60th-85th percentile (branch + effector memory)
      predicted.celltype.l2 == "Treg" | 
      (monocle3_pseudotime <= quantile(monocle3_pseudotime, 0.85, na.rm = TRUE)) ~ "Regulatory_Effector",
      
      # Terminal: TEMRA/CTL (top 15%)
      TRUE ~ "Terminal_Effector"
    ),
    
    # Simplified state mapping for coloring
    state_group = case_when(
      predicted.celltype.l2 == "CD4 Naive" ~ "Naive",
      predicted.celltype.l2 == "CD4 TCM" ~ "Central_Memory", 
      predicted.celltype.l2 == "Treg" ~ "Treg",
      predicted.celltype.l2 == "CD4 TEM" ~ "Effector_Memory",
      predicted.celltype.l2 == "CD4 Temra/CTL" ~ "Terminal_Effector"
    )
  )

cat("Milestone distribution:\n")
Milestone distribution:
print(table(ref_milestones$milestone))

     Central_Memory               Naive Regulatory_Effector   Terminal_Effector 
               4013                2867                2866                1720 
cat("\nState validation:\n") 

State validation:
print(table(ref_milestones$predicted.celltype.l2, ref_milestones$state_group))
               
                Central_Memory Effector_Memory Naive Terminal_Effector Treg
  CD4 Naive                  0               0  2037                 0    0
  CD4 TCM                 9067               0     0                 0    0
  CD4 TEM                    0             145     0                 0    0
  CD4 Temra/CTL              0               0     0                10    0
  Treg                       0               0     0                 0  207

5.1 Validate Pseudotime Against Cell Type

# ── Guard: only run if monocle3_pseudotime has already been computed ──────
# This column is created in the trajectory chunk (Section 2).
# If you are running this script for the first time on a freshly loaded
# reference (without pseudotime), this chunk will skip gracefully.
# Re-run after the trajectory section completes.

if (!"monocle3_pseudotime" %in% colnames(reference_integrated@meta.data)) {
  message(
    "⚠️  monocle3_pseudotime not yet computed — skipping validation plots.\n",
    "   Run Section 2 (Reference Trajectory) first, then re-run this chunk."
  )
} else {

# ── Violin: pseudotime distribution per Azimuth l2 label ─────────────────
# CONFIRMED label order in this dataset (low → high pseudotime):
#   CD4 Naive → CD4 TCM → CD4 TEM → CD4 Temra/CTL (highest)
#   Treg: separate branch, variable pseudotime
#
# We use predicted.celltype.l2 as PRIMARY label throughout — consistent
# with the rest of the analysis.

pt_data <- data.frame(
  pseudotime = reference_integrated$monocle3_pseudotime,
  l2         = reference_integrated$predicted.celltype.l2
) %>%
  filter(is.finite(pseudotime), !is.na(l2))

# Order l2 labels by median pseudotime
l2_order <- pt_data %>%
  group_by(l2) %>%
  summarise(med_pt = median(pseudotime, na.rm = TRUE)) %>%
  arrange(med_pt) %>%
  pull(l2)

pt_data$l2 <- factor(pt_data$l2, levels = l2_order)

# ── Violin plot coloured by Azimuth l2 ────────────────────────────────────
p_violin <- ggplot(pt_data, aes(x = l2, y = pseudotime, fill = l2)) +
  geom_violin(scale = "width", alpha = 0.85, trim = TRUE) +
  geom_boxplot(width = 0.12, fill = "white", outlier.size = 0.3) +
  scale_fill_manual(values = azimuth_l2_colors, na.value = "grey70") +
  theme_classic(base_size = 11) +
  theme(
    axis.text.x  = element_text(angle = 40, hjust = 1, size = 9),
    legend.position = "none",
    plot.title   = element_text(hjust = 0.5, face = "bold")
  ) +
  labs(
    title = "Pseudotime Distribution per Azimuth l2 Label (Reference)",
    x = "", y = "Monocle3 Pseudotime"
  )

# ── Ridge plot — same data, different view ────────────────────────────────
p_ridge <- ggplot(pt_data, aes(x = pseudotime, y = l2, fill = l2)) +
  geom_density_ridges(alpha = 0.75, scale = 1.2, rel_min_height = 0.01) +
  scale_fill_manual(values = azimuth_l2_colors, na.value = "grey70") +
  theme_classic(base_size = 10) +
  theme(legend.position = "none",
        plot.title = element_text(hjust = 0.5, face = "bold")) +
  labs(
    title = "Pseudotime Density by Azimuth l2 Label",
    x = "Pseudotime", y = ""
  )

p_violin | p_ridge

} # end else (monocle3_pseudotime exists)

6 Prepare Reference for MapQuery Projection

cat("Reference HVGs available:", length(ref_hvgs), "\n")
Reference HVGs available: 2902 
cat("PCA dims available:", ncol(Embeddings(reference_integrated, "pca")), "\n")
PCA dims available: 50 
cat("✅ Reference ready for MapQuery\n")
✅ Reference ready for MapQuery

7 Map Malignant CD4+ T Cells onto Reference

7.1 Find Transfer Anchors

# ══════════════════════════════════════════════════════════════════════════
# FindTransferAnchors
# ══════════════════════════════════════════════════════════════════════════
# normalization.method = "SCT": both objects are SCT-normalised
# reference.reduction  = "pca": anchors in the reference PCA space
# dims = 1:20: consistent with reference clustering
#
# KEY FIX: common_features now computed from per-line SCT HVGs (shared_hvgs)
# intersected with reference HVGs. This ensures anchors are found on genes
# that are variable in BOTH the reference biology AND each malignant line.
# Previously (merged SCT), common_features were dominated by batch-driven
# HVGs from one large SCT model → biased anchors → all cells → TCM.
# ══════════════════════════════════════════════════════════════════════════

DefaultAssay(reference_integrated) <- "SCT"
DefaultAssay(MalignantCD4T)        <- "SCT"

ref_hvgs <- VariableFeatures(reference_integrated, assay = "SCT")
if (length(ref_hvgs) == 0) {
  ref_hvgs <- rownames(GetAssayData(reference_integrated, assay = "SCT",
                                     layer = "scale.data"))
}

# VariableFeatures on merged malignant = shared_hvgs set in per-line SCT block
query_hvgs <- VariableFeatures(MalignantCD4T)

common_features <- intersect(ref_hvgs, query_hvgs)
cat("Reference HVGs:", length(ref_hvgs), "\n")
cat("Query HVGs (per-line consensus):", length(query_hvgs), "\n")
cat("Common HVGs for anchor finding:", length(common_features), "\n")

if (length(common_features) < 1500) {
  warning(paste0(
    "⚠️  Only ", length(common_features), " common features found.\n",
    "   This is below the recommended minimum of 1500.\n",
    "   Check that both objects used SCTransform with similar gene sets.\n",
    "   Consider increasing nfeatures in SCTransform to 4000."
  ))
} else {
  cat("✅ Sufficient common features for robust anchor finding\n")
}


# ── Clean and rank common_features ──────────────────────────────────────────

# 1. Remove ribosomal, mitochondrial, heat shock, and other known batch genes
#    These are highly variable due to library quality, not biology.
junk_pattern <- "^RPL|^RPS|^MT-|^HSP|^HSPA|^HSPB|^HSPD|^HSPE|^HSPH|^SNHG|^MALAT1|^NEAT1|^XIST"
n_before <- length(common_features)
common_features_clean <- common_features[!grepl(junk_pattern, common_features)]
cat("Removed junk genes:", n_before - length(common_features_clean), "\n")
cat("Clean common features:", length(common_features_clean), "\n")

# 2. Rank by mean HVG rank across reference and query
#    SCTransform stores residual variance per gene — higher = more variable.
#    Genes ranked highly in BOTH objects are robustly variable (biology-driven).
ref_var_meta  <- reference_integrated[["SCT"]]@meta.features
query_var_meta <- MalignantCD4T[["SCT"]]@meta.features

# residual_variance column is set by SCTransform
if ("residual_variance" %in% colnames(ref_var_meta) &&
    "residual_variance" %in% colnames(query_var_meta)) {

  ref_ranks   <- ref_var_meta[common_features_clean, "residual_variance"]
  query_ranks <- query_var_meta[common_features_clean, "residual_variance"]

  # Replace NA with 0 (gene not in that object's variance table)
  ref_ranks[is.na(ref_ranks)]     <- 0
  query_ranks[is.na(query_ranks)] <- 0

  mean_var <- (ref_ranks + query_ranks) / 2
  common_features <- common_features_clean[order(mean_var, decreasing = TRUE)]

  cat("Top 20 ranked features (should be T cell biology):\n")
  print(head(common_features, 20))

} else {
  # Fallback: just use cleaned list without ranking
  cat("⚠️  residual_variance not found in SCT meta.features, using unranked clean list\n")
  common_features <- common_features_clean
}

# 3. Sanity check: key T cell marker genes present?
markers_to_check <- c("CCR7", "SELL", "TCF7", "IL7R", "GZMB", "PRF1",
                      "FOXP3", "CXCR5", "TOX", "PDCD1", "HAVCR2", "TIGIT",
                      "CD44", "CD69", "IFNG", "TNF", "EOMES", "TBX21")
found_markers <- markers_to_check[markers_to_check %in% common_features]
cat("\nKey T cell markers in feature set:", length(found_markers), "/", length(markers_to_check), "\n")
print(found_markers)

if (length(found_markers) < 5) {
  warning(paste0(
    "⚠️  Fewer than 5 canonical T cell markers in common_features.\n",
    "   The feature set may still be dominated by non-specific genes.\n",
    "   Consider: did SCTransform use sufficient nfeatures (3000-4000)?\n",
    "   Run: length(VariableFeatures(reference_integrated)) to check."
  ))
}


cat("\n=== Finding transfer anchors ===\n")

# Diagnostics
cat("Reference cells:", ncol(reference_integrated), "\n")
cat("Query cells:    ", ncol(MalignantCD4T), "\n")
cat("Common features:", length(common_features), "\n")

# Use as many dims as both objects support (up to 30)
dims_to_use <- min(30,
                   ncol(Embeddings(reference_integrated, "pca")),
                   ncol(Embeddings(MalignantCD4T, "pca")))
cat("Using dims 1:", dims_to_use, "\n")

# KEY CHANGES vs previous run:
#   k.anchor = 10   (was default 5)  → more initial anchor candidates
#   k.filter = 500  (was default 200) → less aggressive pruning on 40k query
#   dims = 1:30     (was 1:20)        → capture more variance in tumour cells
anchors <- FindTransferAnchors(
  reference            = reference_integrated,
  query                = MalignantCD4T,
  features             = common_features,
  normalization.method = "SCT",
  reference.reduction  = "pca",
  dims                 = 1:dims_to_use,
  k.anchor             = 10,
  k.filter             = 500,
  k.score              = 30,
  verbose              = TRUE
)

n_anchors    <- nrow(anchors@anchors)
anchor_ratio <- ncol(MalignantCD4T) / n_anchors
cat("Anchors found:", n_anchors, "\n")
cat("Cells per anchor:", round(anchor_ratio, 1), "\n")

if (anchor_ratio > 8) {
  warning(paste0(
    "⚠️  Low anchor density (1:", round(anchor_ratio, 1), ").\n",
    "   Ideal is 1:5 or better. Inspect head(common_features) for batch genes,\n",
    "   and check query UMAP projection before trusting pseudotime values."
  ))
} else {
  cat("✅ Anchor density acceptable\n")
}

7.2 MapQuery — Project onto Reference UMAP

cat("✅ MapQuery complete\n")
✅ MapQuery complete
cat("Mapped malignant cells:", ncol(mapped_MalignantCD4T), "\n")
Mapped malignant cells: 40695 
cat("\nTransferred metadata columns:\n")

Transferred metadata columns:
transferred_cols <- grep("^predicted\\.", colnames(mapped_MalignantCD4T@meta.data), value = TRUE)
print(transferred_cols)
 [1] "predicted.celltype.l1.score"           "predicted.celltype.l1"                 "predicted.celltype.l2.score"          
 [4] "predicted.celltype.l2"                 "predicted.celltype.l3.score"           "predicted.celltype.l3"                
 [7] "predicted.celltype.l1_backup"          "predicted.celltype.l2_backup"          "predicted.celltype.l3_backup"         
[10] "predicted.predicted.celltype.l2.score" "predicted.predicted.celltype.l2"       "predicted.pseudotime.score"           
[13] "predicted.pseudotime"                  "predicted.seurat_clusters.score"       "predicted.seurat_clusters"            
cat("\nTransferred pseudotime summary:\n")

Transferred pseudotime summary:
print(summary(mapped_MalignantCD4T$predicted.pseudotime))
    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
 0.01186 24.06833 24.84493 23.88351 27.13900 33.71313 
cat("\nTransferred Azimuth l2 distribution (PRIMARY):\n")

Transferred Azimuth l2 distribution (PRIMARY):
print(table(mapped_MalignantCD4T$predicted.predicted.celltype.l2))

CD4 Naive   CD4 TCM   CD4 TEM      Treg 
      108     40562        20         5 
cat("\nTransferred cluster distribution (secondary reference):\n")

Transferred cluster distribution (secondary reference):
print(table(mapped_MalignantCD4T$predicted.seurat_clusters))

    0     1    11     2     3     4     5     8     9 
 2188 27830    37  3779  1933  3107  1605    66   150 
# ── Sanity check: distribution should NOT be 99%+ in one state ────────────
l2_dist   <- prop.table(table(mapped_MalignantCD4T$predicted.predicted.celltype.l2))
max_state <- names(which.max(l2_dist))
max_pct   <- round(100 * max(l2_dist), 1)

if (max_pct > 90) {
  warning(paste0(
    "⚠️  ", max_pct, "% of cells mapped to '", max_state, "'.\n",
    "   This suggests a batch/SCT issue. Verify per-cell-line SCT ran correctly\n",
    "   and that common_features contain differentiation genes, not batch genes."
  ))
} else {
  cat("\n✅ State distribution looks diverse — projection appears unbiased\n")
  cat("   Most common state:", max_state, "(", max_pct, "% of cells)\n")
}

8 Visualise Projection on Reference UMAP

8.1 Reference + Malignant Overlay (Pseudotime)


# Coerce pseudotime from character to numeric (Seurat transfer quirk)
mapped_MalignantCD4T$predicted.pseudotime <- as.numeric(
  mapped_MalignantCD4T$predicted.pseudotime
)

# Verify
cat("Class:", class(mapped_MalignantCD4T$predicted.pseudotime), "\n")
Class: numeric 
cat("Range:", range(mapped_MalignantCD4T$predicted.pseudotime, na.rm = TRUE), "\n")
Range: 0.01186257 33.71313 
ref_umap <- data.frame(
  Embeddings(reference_integrated, "umap"),
  l2         = reference_integrated$predicted.celltype.l2,
  pseudotime = reference_integrated$monocle3_pseudotime,
  type       = "Reference"
)
colnames(ref_umap)[1:2] <- c("UMAP_1", "UMAP_2")

query_umap <- data.frame(
  Embeddings(mapped_MalignantCD4T, "ref.umap"),
  pseudotime = mapped_MalignantCD4T$predicted.pseudotime,   # numeric after fix
  cell_line  = mapped_MalignantCD4T$cell_line,
  l2_q       = mapped_MalignantCD4T$predicted.predicted.celltype.l2,
  type       = "Malignant"
)
colnames(query_umap)[1:2] <- c("UMAP_1", "UMAP_2")

# ── Cell line palette ─────────────────────────────────────────────────────
cell_line_palette <- colorRampPalette(brewer.pal(8, "Dark2"))(
  length(unique(query_umap$cell_line))
)
names(cell_line_palette) <- sort(unique(query_umap$cell_line))

all_l2_labels  <- unique(c(as.character(ref_umap$l2),
                             as.character(query_umap$l2_q)))
azimuth_l2_colors <- extend_palette(azimuth_l2_colors, all_l2_labels)

# ── Plot 1: Reference grey, malignant coloured by pseudotime ──────────────
p1 <- ggplot() +
  geom_point(data = ref_umap,
             aes(x = UMAP_1, y = UMAP_2),
             color = reference_color, size = 0.4, alpha = 0.6) +
  geom_point(data = query_umap %>% filter(is.finite(pseudotime)),
             aes(x = UMAP_1, y = UMAP_2, color = pseudotime),
             size = 1.2, alpha = 0.9) +
  scale_color_viridis_c(option = "plasma", name = "Pseudotime",
                        na.value = "grey40") +
  theme_classic(base_size = 11) +
  theme(plot.title = element_text(hjust = 0.5, face = "bold")) +
  ggtitle("Malignant CD4T Projected onto Reference\n(Coloured by transferred pseudotime)")

# ── Plot 2: Reference coloured by Azimuth l2, malignant cells in red ──────
p2 <- ggplot() +
  geom_point(data = ref_umap,
             aes(x = UMAP_1, y = UMAP_2, color = l2),
             size = 0.4, alpha = 0.7) +
  scale_color_manual(values = azimuth_l2_colors,
                     name   = "Azimuth l2\n(reference)",
                     na.value = "grey60") +
  geom_point(data = query_umap,
             aes(x = UMAP_1, y = UMAP_2),
             color = malignant_color, size = 1.0, alpha = 0.8, shape = 16) +
  theme_classic(base_size = 11) +
  theme(plot.title = element_text(hjust = 0.5, face = "bold"),
        legend.key.size = unit(0.4, "cm"),
        legend.text = element_text(size = 8)) +
  ggtitle("Reference Azimuth l2 (colour)\n+ Malignant Cells (red overlay)")

p1 | p2

8.2 Azimuth l2 Annotation Overlay

p3 <- ggplot() +
  geom_point(data = ref_umap,
             aes(x = UMAP_1, y = UMAP_2),
             color = reference_color, size = 0.4, alpha = 0.6) +
  geom_point(data = query_umap,
             aes(x = UMAP_1, y = UMAP_2, color = l2_q),
             size = 1.2, alpha = 0.9) +
  scale_color_manual(values = azimuth_l2_colors,
                     name   = "Azimuth l2\n(transferred)",
                     na.value = "grey40") +
  theme_classic(base_size = 11) +
  theme(plot.title = element_text(hjust = 0.5, face = "bold")) +
  ggtitle("Malignant CD4T — Transferred Azimuth l2 Labels")

p4 <- ggplot() +
  geom_point(data = ref_umap,
             aes(x = UMAP_1, y = UMAP_2),
             color = reference_color, size = 0.4, alpha = 0.5) +
  geom_point(data = query_umap,
             aes(x = UMAP_1, y = UMAP_2, color = cell_line),
             size = 1.0, alpha = 0.9) +
  scale_color_manual(values = cell_line_palette, name = "Cell line") +
  theme_classic(base_size = 11) +
  theme(plot.title = element_text(hjust = 0.5, face = "bold")) +
  ggtitle("Malignant CD4T per Cell Line")

p3 | p4

8.3 Per Cell Line Facet — State Mapping

ggplot() +
  geom_point(data = ref_umap,
             aes(x = UMAP_1, y = UMAP_2),
             color = "grey60", size = 0.3, alpha = 0.5) +
  geom_point(data = query_umap %>% filter(is.finite(pseudotime)),
             aes(x = UMAP_1, y = UMAP_2, color = pseudotime),
             size = 0.9, alpha = 0.85) +
  facet_wrap(~ cell_line, ncol = 4) +
  scale_color_viridis_c(option = "plasma", name = "Pseudotime",
                        na.value = "grey60") +
  theme_classic(base_size = 10) +
  theme(strip.text = element_text(size = 9, face = "bold"),
        strip.background = element_rect(fill = "#E8F4FD"),
        plot.title = element_text(hjust = 0.5, face = "bold")) +
  ggtitle("Per Cell Line: Projection onto Reference UMAP (Pseudotime)")

9 Quantification — State Assignment

9.1 Assign Each Malignant Cell to a Reference State

# ── pseudotime bins based on reference quantiles ───────────────────────────
pt_vals   <- reference_integrated$monocle3_pseudotime
pt_vals   <- pt_vals[is.finite(pt_vals)]
pt_breaks <- quantile(pt_vals, probs = c(0, 1/3, 2/3, 1))

mapped_MalignantCD4T$state_azimuth_l2  <- mapped_MalignantCD4T$predicted.predicted.celltype.l2
mapped_MalignantCD4T$state_ref_cluster <- mapped_MalignantCD4T$predicted.seurat_clusters
mapped_MalignantCD4T$pseudotime_value  <- mapped_MalignantCD4T$predicted.pseudotime  # numeric

mapped_MalignantCD4T$pseudotime_bin <- cut(
  mapped_MalignantCD4T$pseudotime_value,
  breaks         = pt_breaks,
  labels         = c("Early (Tnaive-like)", "Mid (TCM/TEM-like)", "Late (Temra-like)"),
  include.lowest = TRUE
)

cat("=== State assignment complete ===\n")
=== State assignment complete ===
cat("\nAzimuth l2 state (PRIMARY):\n")

Azimuth l2 state (PRIMARY):
print(table(mapped_MalignantCD4T$state_azimuth_l2))

CD4 Naive   CD4 TCM   CD4 TEM      Treg 
      108     40562        20         5 
cat("\nReference cluster (cross-check):\n")

Reference cluster (cross-check):
print(table(mapped_MalignantCD4T$state_ref_cluster))

    0     1    11     2     3     4     5     8     9 
 2188 27830    37  3779  1933  3107  1605    66   150 
cat("\nPseudotime bin:\n")

Pseudotime bin:
print(table(mapped_MalignantCD4T$pseudotime_bin))

Early (Tnaive-like)  Mid (TCM/TEM-like)   Late (Temra-like) 
               3892               20081               16722 

9.2 Quantification Tables

state_summary <- mapped_MalignantCD4T@meta.data %>%
  count(state_azimuth_l2, name = "n_cells") %>%
  mutate(pct = round(100 * n_cells / sum(n_cells), 1)) %>%
  arrange(desc(n_cells))

cat("=== Overall Malignant CD4T State Distribution ===\n")
=== Overall Malignant CD4T State Distribution ===
print(state_summary)

state_per_line <- mapped_MalignantCD4T@meta.data %>%
  count(cell_line, state_azimuth_l2, name = "n_cells") %>%
  group_by(cell_line) %>%
  mutate(pct = round(100 * n_cells / sum(n_cells), 1)) %>%
  ungroup() %>%
  arrange(cell_line, desc(n_cells))

cat("\n=== State Distribution per Cell Line ===\n")

=== State Distribution per Cell Line ===
print(state_per_line, n = Inf)

cluster_summary <- mapped_MalignantCD4T@meta.data %>%
  count(state_ref_cluster, name = "n_cells") %>%
  mutate(pct = round(100 * n_cells / sum(n_cells), 1)) %>%
  arrange(as.numeric(as.character(state_ref_cluster)))

cat("\n=== Reference Cluster Distribution (cross-check) ===\n")

=== Reference Cluster Distribution (cross-check) ===
print(cluster_summary)

pt_bin_summary <- mapped_MalignantCD4T@meta.data %>%
  count(pseudotime_bin, name = "n_cells") %>%
  mutate(pct = round(100 * n_cells / sum(n_cells), 1))

cat("\n=== Pseudotime Bin Summary ===\n")

=== Pseudotime Bin Summary ===
print(pt_bin_summary)

9.3 Bar Chart — State Proportions

p_bar_overall <- ggplot(state_per_line,
                         aes(x = cell_line, y = pct, fill = state_azimuth_l2)) +
  geom_col(position = "stack", color = "white", linewidth = 0.3) +
  geom_text(aes(label = ifelse(pct >= 5, paste0(pct, "%"), "")),
            position = position_stack(vjust = 0.5),
            size = 3, color = "white", fontface = "bold") +
  scale_fill_manual(values = azimuth_l2_colors, na.value = "grey70",
                    name = "Azimuth l2\nstate") +
  theme_classic(base_size = 11) +
  theme(axis.text.x = element_text(angle = 40, hjust = 1),
        plot.title  = element_text(hjust = 0.5, face = "bold"),
        legend.key.size = unit(0.5, "cm")) +
  labs(title = "Malignant CD4T — State Composition per Cell Line\n(Azimuth l2 transferred labels)",
       x = "Cell Line", y = "% Cells")

pt_bin_line <- mapped_MalignantCD4T@meta.data %>%
  filter(!is.na(pseudotime_bin)) %>%
  count(cell_line, pseudotime_bin) %>%
  group_by(cell_line) %>%
  mutate(pct = round(100 * n / sum(n), 1)) %>%
  ungroup()

bin_colors <- c(
  "Early (Tnaive-like)" = "#4575B4",
  "Mid (TCM/TEM-like)"  = "#FEE090",
  "Late (Temra-like)"   = "#D73027"
)

p_bar_bins <- ggplot(pt_bin_line,
                      aes(x = cell_line, y = pct, fill = pseudotime_bin)) +
  geom_col(position = "stack", color = "white", linewidth = 0.3) +
  geom_text(aes(label = ifelse(pct >= 8, paste0(pct, "%"), "")),
            position = position_stack(vjust = 0.5),
            size = 3, color = "white", fontface = "bold") +
  scale_fill_manual(values = bin_colors, name = "Pseudotime\nBin") +
  theme_classic(base_size = 11) +
  theme(axis.text.x = element_text(angle = 40, hjust = 1),
        plot.title  = element_text(hjust = 0.5, face = "bold")) +
  labs(title = "Malignant CD4T — Pseudotime Bin Distribution per Cell Line",
       x = "Cell Line", y = "% Cells")

p_bar_overall / p_bar_bins

9.4 Heatmap — State Proportions Across Cell Lines

library(pheatmap)

heatmap_df <- state_per_line %>%
  select(cell_line, state_azimuth_l2, pct) %>%
  pivot_wider(names_from  = state_azimuth_l2,
              values_from = pct,
              values_fill = 0)

heatmap_mat             <- as.matrix(heatmap_df[, -1])
rownames(heatmap_mat)   <- heatmap_df$cell_line

pheatmap(
  heatmap_mat,
  color            = colorRampPalette(c("white", "#FEE090", "#D73027"))(50),
  display_numbers  = TRUE,
  number_format    = "%.1f%%",
  fontsize_number  = 8,
  fontsize_row     = 10,
  fontsize_col     = 9,
  angle_col        = 45,
  cluster_rows     = TRUE,
  cluster_cols     = TRUE,
  main             = "% Malignant Cells per State (Azimuth l2)\nClustered by cell line similarity",
  border_color     = "white"
)

10 Pseudotime Analysis of Malignant Cells

10.1 Pseudotime Distribution

ref_pt <- data.frame(
  pseudotime = reference_integrated$monocle3_pseudotime[
    is.finite(reference_integrated$monocle3_pseudotime)
  ],
  source = "Healthy reference"
)

mal_pt <- data.frame(
  pseudotime = mapped_MalignantCD4T$pseudotime_value[
    is.finite(mapped_MalignantCD4T$pseudotime_value)
  ],
  source = "Malignant CD4T"
)

pt_combined <- bind_rows(ref_pt, mal_pt)

p_density <- ggplot(pt_combined, aes(x = pseudotime, fill = source)) +
  geom_density(alpha = 0.65, adjust = 1.2) +
  scale_fill_manual(
    values = c("Healthy reference" = "#4575B4", "Malignant CD4T" = malignant_color),
    name = ""
  ) +
  theme_classic(base_size = 12) +
  theme(legend.position = "top",
        plot.title = element_text(hjust = 0.5, face = "bold")) +
  labs(title = "Pseudotime Distribution: Healthy Reference vs Malignant CD4T",
       subtitle = "Peak shift indicates differentiation arrest point",
       x = "Monocle3 Pseudotime", y = "Density")

mal_pt_line <- mapped_MalignantCD4T@meta.data %>%
  filter(is.finite(pseudotime_value)) %>%
  select(cell_line, pseudotime_value)

p_ridge_lines <- ggplot(mal_pt_line,
                         aes(x = pseudotime_value, y = cell_line, fill = cell_line)) +
  geom_density_ridges(alpha = 0.80, scale = 1.3, rel_min_height = 0.01,
                      quantile_lines = TRUE, quantiles = 2) +
  scale_fill_manual(values = cell_line_palette) +
  theme_classic(base_size = 11) +
  theme(legend.position = "none",
        plot.title = element_text(hjust = 0.5, face = "bold")) +
  labs(title = "Pseudotime Distribution per Cell Line",
       subtitle = "Vertical line = median",
       x = "Pseudotime", y = "")

p_density | p_ridge_lines

10.2 Pseudotime Summary Statistics Table

pt_stats <- mapped_MalignantCD4T@meta.data %>%
  filter(is.finite(pseudotime_value)) %>%
  group_by(cell_line) %>%
  summarise(
    n_cells   = n(),
    mean_pt   = round(mean(pseudotime_value), 3),
    median_pt = round(median(pseudotime_value), 3),
    sd_pt     = round(sd(pseudotime_value), 3),
    q25_pt    = round(quantile(pseudotime_value, 0.25), 3),
    q75_pt    = round(quantile(pseudotime_value, 0.75), 3),
    pct_early = round(100 * mean(pseudotime_value < pt_breaks[2], na.rm = TRUE), 1),
    pct_mid   = round(100 * mean(pseudotime_value >= pt_breaks[2] &
                                  pseudotime_value < pt_breaks[3], na.rm = TRUE), 1),
    pct_late  = round(100 * mean(pseudotime_value >= pt_breaks[3], na.rm = TRUE), 1)
  ) %>%
  arrange(median_pt)

cat("=== Pseudotime Statistics per Cell Line ===\n")
=== Pseudotime Statistics per Cell Line ===
print(pt_stats, n = Inf)

ref_stats <- reference_integrated@meta.data %>%
  filter(is.finite(monocle3_pseudotime), !is.na(predicted.celltype.l2)) %>%
  group_by(predicted.celltype.l2) %>%
  summarise(
    n_cells   = n(),
    mean_pt   = round(mean(monocle3_pseudotime), 3),
    median_pt = round(median(monocle3_pseudotime), 3)
  ) %>%
  arrange(median_pt)

cat("\n=== Reference Pseudotime per Azimuth l2 State (for comparison) ===\n")

=== Reference Pseudotime per Azimuth l2 State (for comparison) ===
print(ref_stats)

11 Integrated Summary Visualisation

11.1 Four-Panel Summary Figure

p_ref <- DimPlot(
  reference_integrated, group.by = "predicted.celltype.l2",
  reduction = "umap", label = TRUE, repel = TRUE,
  label.size = 3, pt.size = 0.4
) +
  scale_color_manual(values = azimuth_l2_colors, na.value = "grey40") +
  ggtitle("A. Reference CD4+ T Cells\n(Azimuth l2 — Proliferating removed)") +
  theme(plot.title = element_text(hjust = 0.5, size = 11, face = "bold")) +
  NoLegend()

p_pt_ref <- FeaturePlot(
  reference_integrated, features = "monocle3_pseudotime",
  reduction = "umap", cols = c("lightblue","yellow","red"), pt.size = 0.4
) +
  ggtitle("B. Reference Pseudotime\n(Naive → Temra axis)") +
  theme(plot.title = element_text(hjust = 0.5, size = 11, face = "bold"))

p_proj <- ggplot() +
  geom_point(data = ref_umap, aes(x = UMAP_1, y = UMAP_2),
             color = "grey88", size = 0.3, alpha = 0.6) +
  geom_point(data = query_umap %>% filter(is.finite(pseudotime)),
             aes(x = UMAP_1, y = UMAP_2, color = pseudotime),
             size = 1.0, alpha = 0.85) +
  scale_color_viridis_c(option = "plasma", name = "Pseudotime") +
  theme_classic(base_size = 10) +
  theme(plot.title = element_text(hjust = 0.5, size = 11, face = "bold")) +
  ggtitle("C. Malignant CD4T Projected\n(pseudotime transferred)")

top_states   <- state_summary %>% head(8) %>% pull(state_azimuth_l2)
state_plot_df <- state_per_line %>% filter(state_azimuth_l2 %in% top_states)

p_quant <- ggplot(state_plot_df,
                   aes(x = reorder(cell_line, -pct), y = pct,
                       fill = state_azimuth_l2)) +
  geom_col(position = "stack", color = "white", linewidth = 0.3) +
  scale_fill_manual(values = azimuth_l2_colors, na.value = "grey40",
                    name = "Azimuth l2") +
  theme_classic(base_size = 10) +
  theme(axis.text.x = element_text(angle = 40, hjust = 1, size = 9),
        plot.title  = element_text(hjust = 0.5, size = 11, face = "bold"),
        legend.key.size = unit(0.4, "cm"),
        legend.text = element_text(size = 8)) +
  labs(title = "D. State Proportions per Cell Line\n(Azimuth l2 transferred)",
       x = "", y = "% Cells")

(p_ref | p_pt_ref) / (p_proj | p_quant) +
  plot_annotation(
    title = "Malignant CD4+ T Cell Differentiation State Mapping\n(Sézary Syndrome — Monocle3 Reference Trajectory)",
    theme = theme(plot.title = element_text(hjust = 0.5, size = 14, face = "bold"))
  )

11.2 Pseudotime vs State Violin (publication-ready)

mal_state_pt <- mapped_MalignantCD4T@meta.data %>%
  filter(is.finite(pseudotime_value), !is.na(state_azimuth_l2)) %>%
  mutate(state = state_azimuth_l2)

state_order_mal <- mal_state_pt %>%
  group_by(state) %>%
  summarise(med = median(pseudotime_value)) %>%
  arrange(med) %>%
  pull(state)

mal_state_pt$state <- factor(mal_state_pt$state, levels = state_order_mal)

ggplot(mal_state_pt, aes(x = state, y = pseudotime_value, fill = state)) +
  geom_violin(scale = "width", alpha = 0.8, trim = TRUE) +
  geom_boxplot(width = 0.1, fill = "white", outlier.size = 0.3) +
  scale_fill_manual(values = azimuth_l2_colors, na.value = "grey70") +
  theme_classic(base_size = 11) +
  theme(axis.text.x = element_text(angle = 40, hjust = 1, size = 9),
        legend.position = "none",
        plot.title = element_text(hjust = 0.5, face = "bold")) +
  labs(title = "Pseudotime of Malignant CD4T per Transferred State\n(Azimuth l2 labels)",
       x = "Transferred Differentiation State",
       y = "Transferred Pseudotime")

12 Save Outputs

saveRDS(mapped_MalignantCD4T,file.path(out_dir, "MalignantCD4T_mapped_monocle3_pseudotime_noProlif_ref.rds"))
Error: C stack usage  7970484 is too close to the limit

13 Session Info


sessionInfo()
R version 4.5.2 (2025-10-31)
Platform: x86_64-redhat-linux-gnu
Running under: Rocky Linux 9.7 (Blue Onyx)

Matrix products: default
BLAS/LAPACK: FlexiBLAS OPENBLAS-OPENMP;  LAPACK version 3.9.0

locale:
 [1] LC_CTYPE=C.UTF-8       LC_NUMERIC=C           LC_TIME=C.UTF-8        LC_COLLATE=C.UTF-8     LC_MONETARY=C.UTF-8    LC_MESSAGES=C.UTF-8    LC_PAPER=C.UTF-8      
 [8] LC_NAME=C              LC_ADDRESS=C           LC_TELEPHONE=C         LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C   

time zone: Europe/Paris
tzcode source: system (glibc)

attached base packages:
[1] stats4    stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] pheatmap_1.0.13             future_1.69.0               scales_1.4.0                Matrix_1.7-4                ggrepel_0.9.6              
 [6] ggridges_0.5.7              viridis_0.6.5               viridisLite_0.4.3           RColorBrewer_1.1-3          patchwork_1.3.2            
[11] ggplot2_4.0.2               tibble_3.3.1                tidyr_1.3.2                 dplyr_1.2.0                 SeuratWrappers_0.4.0       
[16] monocle3_1.4.26             SingleCellExperiment_1.32.0 SummarizedExperiment_1.40.0 GenomicRanges_1.62.1        Seqinfo_1.0.0              
[21] IRanges_2.44.0              S4Vectors_0.48.0            MatrixGenerics_1.22.0       matrixStats_1.5.0           Biobase_2.70.0             
[26] BiocGenerics_0.56.0         generics_0.1.4              Seurat_5.4.0                SeuratObject_5.3.0          sp_2.2-1                   

loaded via a namespace (and not attached):
  [1] RcppAnnoy_0.0.23          splines_4.5.2             later_1.4.6               R.oo_1.27.1               polyclip_1.10-7           fastDummies_1.7.5        
  [7] lifecycle_1.0.5           Rdpack_2.6.6              globals_0.19.0            lattice_0.22-9            MASS_7.3-65               magrittr_2.0.4           
 [13] plotly_4.12.0             sass_0.4.10               rmarkdown_2.30            jquerylib_0.1.4           yaml_2.3.12               remotes_2.5.0            
 [19] httpuv_1.6.16             otel_0.2.0                glmGamPoi_1.22.0          sctransform_0.4.3         spam_2.11-3               spatstat.sparse_3.1-0    
 [25] reticulate_1.45.0         cowplot_1.2.0             pbapply_1.7-4             minqa_1.2.8               abind_1.4-8               Rtsne_0.17               
 [31] purrr_1.2.1               R.utils_2.13.0            irlba_2.3.7               listenv_0.10.0            spatstat.utils_3.2-1      goftest_1.2-3            
 [37] RSpectra_0.16-2           spatstat.random_3.4-4     fitdistrplus_1.2-6        parallelly_1.46.1         DelayedMatrixStats_1.32.0 codetools_0.2-20         
 [43] DelayedArray_0.36.0       tidyselect_1.2.1          farver_2.1.2              lme4_1.1-38               spatstat.explore_3.7-0    jsonlite_2.0.0           
 [49] progressr_0.18.0          survival_3.8-6            tools_4.5.2               ica_1.0-3                 Rcpp_1.1.1                glue_1.8.0               
 [55] gridExtra_2.3             SparseArray_1.10.8        xfun_0.56                 withr_3.0.2               BiocManager_1.30.27       fastmap_1.2.0            
 [61] boot_1.3-32               digest_0.6.39             rsvd_1.0.5                R6_2.6.1                  mime_0.13                 scattermore_1.2          
 [67] tensor_1.5.1              dichromat_2.0-0.1         spatstat.data_3.1-9       R.methodsS3_1.8.2         data.table_1.18.2.1       httr_1.4.8               
 [73] htmlwidgets_1.6.4         S4Arrays_1.10.1           uwot_0.2.4                pkgconfig_2.0.3           gtable_0.3.6              rsconnect_1.7.0          
 [79] lmtest_0.9-40             S7_0.2.1                  XVector_0.50.0            htmltools_0.5.9           dotCall64_1.2             png_0.1-8                
 [85] spatstat.univar_3.1-6     reformulas_0.4.4          knitr_1.51                rstudioapi_0.18.0         reshape2_1.4.5            nlme_3.1-168             
 [91] nloptr_2.2.1              proxy_0.4-29              zoo_1.8-15                cachem_1.1.0              stringr_1.6.0             KernSmooth_2.23-26       
 [97] parallel_4.5.2            miniUI_0.1.2              pillar_1.11.1             grid_4.5.2                vctrs_0.7.1               RANN_2.6.2               
[103] promises_1.5.0            beachmat_2.26.0           xtable_1.8-4              cluster_2.1.8.2           evaluate_1.0.5            cli_3.6.5                
[109] query_ranks rlang_1.1.7               future.apply_1.20.1       labeling_0.4.3            plyr_1.8.9                stringi_1.8.7            
[115] deldir_2.0-4              assertthat_0.2.1          lazyeval_0.2.2            spatstat.geom_3.7-0       RcppHNSW_0.6.0            sparseMatrixStats_1.22.0 
[121] shiny_1.12.1              rbibutils_2.4.1           ROCR_1.0-12               igraph_2.2.2              bslib_0.10.0             
---
title: "Malignant CD4 T Cell Trajectory Analysis"
subtitle: "Monocle3 pseudotime mapping"
author: "Nasir Mahmood Abbasi"
date: "`r Sys.Date()`"
output:
  html_notebook:
    number_sections: true
    toc: true
    toc_float:
      collapsed: true
    theme: journal
    highlight: tango
  html_document:
    toc: true
    df_print: paged
    number_sections: true
---


    

<!--    ═══════════════════════════════════════════════════════════════════════ -->
<!--      SCIENTIFIC RATIONALE & APPROACH COMMENTARY -->
<!--   ═══════════════════════════════════════════════════════════════════════ -->

<!--   WHY THIS APPROACH? -->
<!--   ────────────────── -->
<!--   Sézary Syndrome (SS) is a leukemic form of cutaneous T-cell lymphoma in which -->
<!--   malignant CD4+ T cells (Sézary cells) accumulate in blood, skin, and lymph nodes. -->
<!--   A central unresolved question is: WHERE along the normal CD4+ T cell differentiation -->
<!--   continuum do these malignant cells arrest or originate? -->

<!--   Classical bulk approaches (microarray, bulk RNA-seq) cannot resolve this because they -->
<!--   average across millions of cells. Single-cell RNA-seq + pseudotime trajectory analysis -->
<!--   allows us to: -->

<!--   1. BUILD a reference differentiation map from HEALTHY CD4+ T cells spanning all -->
<!--      known states: Tnaive → TCM → TEM → Temra, with Treg as a side branch. -->
<!--      This reference is the "normal atlas" of CD4+ T cell biology. -->

<!--   2. PROJECT malignant Sézary cells onto this healthy atlas using Seurat MapQuery -->
<!--      (anchored RPCA-based transfer). Each malignant cell gets coordinates in the -->
<!--      reference UMAP space — meaning we can see exactly WHERE it falls relative -->
<!--      to healthy differentiation states. -->

<!--   3. TRANSFER pseudotime from reference to query. Because the healthy trajectory -->
<!--      has a continuous pseudotime axis (Naive=0 → Temra=max), each projected -->
<!--      malignant cell inherits a pseudotime value. This tells us: -->
<!--        - Low pseudotime → malignant cell resembles naïve/early memory state -->
<!--        - High pseudotime → malignant cell resembles late effector/Temra state -->
<!--        - Intermediate → malignant cell is arrested at a specific memory checkpoint -->

<!--   4. QUANTIFY state distribution: By combining projected coordinates with Azimuth -->
<!--      predicted.celltype.l2 labels (transferred from reference), we can count what -->
<!--      fraction of malignant cells map to each healthy state (Tnaive, TCM, TEM, Temra, -->
<!--      Treg). This provides a molecular "fingerprint" of the malignant population. -->

<!--   WHY MONOCLE3 ON THE REFERENCE (NOT ON MALIGNANT CELLS)? -->
<!--   ──────────────────────────────────────────────────────── -->
<!--   Running Monocle3 on malignant cells alone would produce a trajectory reflecting -->
<!--   tumour heterogeneity — not the biological differentiation axis we want to measure -->
<!--   against. Instead: -->

<!--     Step 1: Learn the trajectory on HEALTHY reference cells only. -->
<!--             Monocle3 builds a principal graph through the healthy manifold. -->
<!--             Root = CD4 Tnaive cluster (CCR7+SELL+TCF7+). -->

<!--     Step 2: Freeze the reference UMAP model (return.model = TRUE in RunUMAP). -->

<!--     Step 3: Project malignant cells into this frozen space via MapQuery. -->
<!--             Malignant cells DO NOT alter the reference trajectory — -->
<!--             they are overlaid as "passengers" on the healthy topology. -->

<!--     Step 4: Transfer pseudotime and Azimuth l2 labels to projected cells. -->

<!--   This is methodologically equivalent to what Cerapio et al. did: -->
<!--   building a reference CD4+ trajectory and projecting tumour cells to identify -->
<!--   the differentiation state of arrest in Sézary cells. -->

<!--   WHICH OBJECT TO USE? -->
<!--   ──────────────────── -->
<!--   Use the ANNOTATED reference with Monocle3 trajectory already computed -->
<!--   AND with CD4 Proliferating cells removed. This object has: -->
<!--     - Integrated UMAP with return.model = TRUE          ✅ -->
<!--     - Seurat clusters 0-6 with cell_type annotation     ✅ -->
<!--     - monocle3_pseudotime column already computed       ✅ -->
<!--     - predicted.celltype.l2 (Azimuth) labels            ✅ -->
<!--     - CD4 Proliferating cluster removed                 ✅ -->
<!--     - Cluster 12 contamination removed                  ✅ -->

<!--   WHY REMOVE CD4 PROLIFERATING CELLS FROM THE REFERENCE? -->
<!--   ──────────────────────────────────────────────────────── -->
<!--   Proliferating CD4+ T cells (MKI67+, TOP2A+, CDK1+) represent cells -->
<!--   actively cycling through the cell cycle — not a stable differentiation -->
<!--   state. Including them in the trajectory would: -->
<!--     1. Create a spurious branch pulled by cell cycle signature (S/G2M genes) -->
<!--        rather than differentiation genes (CCR7, GZMK, FOXP3 etc.) -->
<!--     2. Distort the pseudotime axis — proliferating cells get intermediate -->
<!--        pseudotime values that don't reflect their true differentiation state -->
<!--     3. Confound the malignant cell projection — Sézary cells that happen -->
<!--        to be proliferating would map to the wrong reference state -->
<!--   Removing them produces a clean Tnaive → TCM → TEM → Temra axis. -->

<!--   DO NOT re-run SCTransform/integration on the reference at this step. -->
<!--   DO NOT use the raw pre-trajectory object — it lacks the UMAP model needed -->
<!--   for MapQuery projection. -->
<!--   DO NOT use the slingshot-ready object — same issue, no frozen UMAP model. -->

<!-- ══════════════════════════════════════════════════════════════════════════  -->


# Load Libraries
```{r setup, include=FALSE}
knitr::opts_chunk$set(
  echo    = TRUE,
  warning = FALSE,
  message = FALSE,
  fig.width  = 12,
  fig.height = 8
)

# Core single-cell
library(Seurat)
library(monocle3)
library(SeuratWrappers)

# Data wrangling
library(dplyr)
library(tidyr)
library(tibble)

# Visualisation
library(ggplot2)
library(patchwork)
library(RColorBrewer)
library(viridis)
library(ggridges)    # for pseudotime density ridge plots
library(ggrepel)     # for label repulsion

# Statistics & tables
library(Matrix)
library(scales)      # for percent formatting

options(future.globals.maxSize = 8e9)
set.seed(1234)

# ── Colour palettes ────────────────────────────────────────────────────────
# PRIMARY: Azimuth predicted.celltype.l2 colours — built DYNAMICALLY from
# actual labels in the data (after loading the reference object).
# These known Azimuth l2 CD4 T cell labels are pre-seeded here so colours
# are consistent and biologically ordered (blue=naive → red=effector).
# Any extra labels not listed here will be assigned colours automatically.

azimuth_l2_colors <- c(
  # ── Naive ──────────────────────────────────────────────────
  "CD4 Naive"          = "#2166AC",   # dark blue
  "CD4 TN"             = "#2166AC",   # alias
  # ── Central Memory ─────────────────────────────────────────
  "CD4 TCM"            = "#74ADD1",   # sky blue
  # ── Effector Memory ────────────────────────────────────────
  "CD4 TEM"            = "#FEE090",   # pale yellow
  # ⚠️  CONFIRMED label in this dataset: "CD4 Temra/CTL" (Azimuth v1.1+)
  "CD4 Temra/CTL"      = "#D73027",   # red  ← exact match to data
  "CD4 TEMRA"          = "#D73027",   # alias — older Azimuth (all caps)
  "CD4 TEMRA/CTL"      = "#D73027",   # alias — uppercase variant
  "CD4 CTL"            = "#D73027",   # alias — some reference builds
  # ── Regulatory ─────────────────────────────────────────────
  "Treg"               = "#762A83",   # dark purple  ← confirmed label in this data
  "Treg Naive"         = "#9970AB",   # medium purple — alias
  "Treg Memory"        = "#762A83",   # alias
  # ── Other CD4 subsets that Azimuth l2 may return ───────────
  "CD4 Proliferating"  = "#A6A6A6",   # grey  — NOTE: present in query cells only
  "Th1"                = "#F46D43",
  "Th17"               = "#FDAE61",
  "Th1/Th17"           = "#FEE08B",
  "Tfh"                = "#66C2A5",
  "CD4 TCM_1"          = "#9ECAE1",
  "CD4 TCM_2"          = "#6BAED6",
  "CD4 TCM_3"          = "#4292C6",
  "CD4 TEM_1"          = "#FDD0A2",
  "CD4 TEM_2"          = "#FDAE6B",
  "CD4 TEM_3"          = "#FD8D3C",
  "CD4 TEM_4"          = "#D94801"
)
# NOTE: CD4 Proliferating is kept in this palette (grey) because it may
# appear in the MALIGNANT cells after projection — malignant Sézary cells
# can be cycling. It is absent from the REFERENCE only.

malignant_color <- "#E31A1C"   # vivid red for malignant cell overlay
reference_color <- "grey85"    # reference background in overlay plots

# ── Helper: extend palette for any unexpected l2 labels ───────────────────
# Called after loading data to fill in any labels not in azimuth_l2_colors
extend_palette <- function(palette, labels) {
  missing_labels <- setdiff(labels, names(palette))
  if (length(missing_labels) > 0) {
    extra_cols <- colorRampPalette(RColorBrewer::brewer.pal(8, "Dark2"))(length(missing_labels))
    names(extra_cols) <- missing_labels
    palette <- c(palette, extra_cols)
  }
  palette
}
```


# Load Objects

## Load Reference (Healthy CD4+ T Cells with Trajectory)

```{r load-reference}
# ── CRITICAL: use the ANNOTATED reference with saved UMAP model ────────────
# This is the cleaned reference object with:
#   - CD4 Proliferating cluster removed (MKI67+/TOP2A+/CDK1+ cells)
#   - Cluster 12 contamination removed
#   - UMAP reduction with return.model = TRUE
#   - monocle3_pseudotime column computed on the clean trajectory
#   - cell_type annotation (clusters 0-6, no proliferating)
#   - predicted.celltype.l2 (Azimuth l2)
# ──────────────────────────────────────────────────────────────────────────


reference_integrated <- readRDS("../Part3_Slingshot_Trajectory/part4_Slingshot+Projection/cd4_ref_dual_trajectory.rds")

cat("=== Reference object summary ===\n")
cat("Cells:", ncol(reference_integrated), "\n")
cat("Assays:", paste(names(reference_integrated@assays), collapse = ", "), "\n")
cat("Reductions:", paste(names(reference_integrated@reductions), collapse = ", "), "\n")
cat("UMAP model present:", !is.null(reference_integrated@reductions$umap@misc$model), "\n")

# ── Confirm clusters present ───────────────────────────────────────────────
cat("\nClusters present:", paste(sort(unique(reference_integrated$seurat_clusters)),
                                 collapse = ", "), "\n")

# ── Confirm CD4 Proliferating cells are ABSENT ────────────────────────────
# Check 1: cell_type label should not contain "Prolif"
if (any(grepl("Prolif|prolif|cycling|Cycling", reference_integrated$cell_type,
              ignore.case = TRUE))) {
  warning("⚠️  Proliferating cell_type labels still detected! Check removal step.")
} else {
  cat("✅ No proliferating cell_type labels found\n")
}

# Check 2: Molecular signature — MKI67/TOP2A expression should be near zero
prolif_markers <- c("MKI67", "TOP2A", "PCNA", "CDK1", "STMN1")
prolif_present <- intersect(prolif_markers, rownames(reference_integrated))

if (length(prolif_present) > 0) {
  DefaultAssay(reference_integrated) <- "SCT"
  prolif_expr <- Matrix::colMeans(
    GetAssayData(reference_integrated, layer = "data")[prolif_present, , drop = FALSE]
  )
  cat("Proliferation marker score — max:", round(max(prolif_expr), 3),
      " | mean:", round(mean(prolif_expr), 4), "\n")
  cat("Cells with any prolif marker > 0.5:", sum(prolif_expr > 0.5),
      "(expect near zero after removal)\n")
  if (sum(prolif_expr > 0.5) > 50) {
    warning("⚠️  Substantial proliferation signal remains — verify removal was complete.")
  } else {
    cat("✅ Proliferating cells confirmed removed\n")
  }
}

# ── Cell type distribution ─────────────────────────────────────────────────
cat("\nCell type distribution (should be 6 types, no Proliferating):\n")
print(table(reference_integrated$cell_type))

cat("\nAzimuth l2 distribution:\n")
print(table(reference_integrated$predicted.celltype.l2))

# ── Finalise Azimuth l2 colour palette from actual labels in data ──────────
ref_l2_labels <- unique(as.character(reference_integrated$predicted.celltype.l2))
ref_l2_labels <- ref_l2_labels[!is.na(ref_l2_labels)]
azimuth_l2_colors <- extend_palette(azimuth_l2_colors, ref_l2_labels)
cat("\nAzimuth l2 colour palette covers", length(ref_l2_labels), "reference labels ✅\n")

# NOTE: monocle3_pseudotime summary is printed AFTER trajectory is built
# (Section 2 below). If this is a freshly loaded reference that already
# has pseudotime stored, the validate-pseudotime chunk will summarise it.

# ── Validate: UMAP model MUST exist for MapQuery ──────────────────────────
if (is.null(reference_integrated@reductions$umap@misc$model)) {
  stop(
    "❌ UMAP model missing! Re-run on the cleaned (no-prolif) object:\n",
    "  reference_integrated <- RunUMAP(reference_integrated, dims = 1:20,\n",
    "                                   return.model = TRUE)\n",
    "Then re-save and reload."
  )
} else {
  cat("✅ UMAP model confirmed — ready for MapQuery projection\n")
}
```

## Load & Prepare Malignant CD4+ T Cells — Per-Cell-Line SCTransform

```{r load-malignant}
# ── Load merged object ─────────────────────────────────────────────────────
All_samples_Merged <- readRDS(
  "/home/nabbasi/apollo_home/1-Seurat_RDS_OBJECT_FINAL/All_samples_Merged_with_Renamed_Clusters_Cell_state-03-12-2025.rds.rds"
)

cat("All samples:\n")
print(table(All_samples_Merged$cell_line))

# ── Define group labels ────────────────────────────────────────────────────
All_samples_Merged$Group <- ifelse(
  All_samples_Merged$cell_line %in% paste0("L", 1:7),
  "MalignantCD4T",
  ifelse(
    All_samples_Merged$cell_line %in% c("CD4Tcells_lab", "CD4Tcells_10x_S1", "CD4Tcells_10x_S2"),
    "NormalCD4T",
    "Other"
  )
)

cat("\nGroup distribution:\n")
print(table(All_samples_Merged$Group))

# ── Subset malignant cells ─────────────────────────────────────────────────
MalignantCD4T_raw <- subset(All_samples_Merged, subset = Group == "MalignantCD4T")
cat("\nMalignant CD4T cells:", ncol(MalignantCD4T_raw), "\n")
cat("Cell lines present:\n")
print(table(MalignantCD4T_raw$cell_line))

rm(All_samples_Merged)
gc()
```

```{r per-line-sct}
# ══════════════════════════════════════════════════════════════════════════
# PER-CELL-LINE SCTransform
# ══════════════════════════════════════════════════════════════════════════
# WHY: Running SCTransform on the merged object creates a single shared
#      size-factor model that collapses batch variation with biology.
#      Each cell line is a separate library (different sequencing depth,
#      different ambient RNA profile). Per-line SCT corrects for this.
#
# APPROACH:
#   1. Split the malignant object by cell_line
#   2. Run SCTransform on each line independently
#   3. Merge back into one object (NOT integrate — integration is not
#      needed here because MapQuery uses reference anchors anyway)
#   4. Set VariableFeatures to the union of all per-line HVGs
#      (capped at 3000, ranked by cross-line frequency)
# ══════════════════════════════════════════════════════════════════════════

cat("=== Running per-cell-line SCTransform ===\n")
cat("This may take 5-15 minutes depending on cell count per line.\n\n")

# ── Step 1: Split by cell line ────────────────────────────────────────────
cell_line_list <- SplitObject(MalignantCD4T_raw, split.by = "cell_line")
cat("Cell lines to process:", paste(names(cell_line_list), collapse = ", "), "\n\n")

# ── Step 2: SCTransform each line independently ───────────────────────────
# vars.to.regress: remove cell-cycle and mitochondrial confounders
# These are biological covariates that would otherwise distort the
# differentiation signal in the trajectory projection.

cell_line_list <- lapply(names(cell_line_list), function(line_name) {
  obj <- cell_line_list[[line_name]]
  cat("Processing cell line:", line_name,
      "| Cells:", ncol(obj), "\n")

  # ── Compute % mitochondrial if not present ──────────────────────────────
  if (!"percent.mt" %in% colnames(obj@meta.data)) {
    obj[["percent.mt"]] <- PercentageFeatureSet(obj, pattern = "^MT-")
  }

  # ── Cell cycle scoring (for regression) ────────────────────────────────
  # Uses Seurat's built-in cc.genes (Tirosh et al.)
  # Regressing S.Score and G2M.Score removes cycling artefacts from the
  # SCT residuals without removing cycling cells from the object.
  tryCatch({
    obj <- CellCycleScoring(
      obj,
      s.features   = cc.genes$s.genes,
      g2m.features = cc.genes$g2m.genes,
      set.ident    = FALSE
    )
    vars_regress <- c("percent.mt", "S.Score", "G2M.Score")
    cat("  → Cell cycle scores computed for", line_name, "\n")
  }, error = function(e) {
    # If cell cycle genes not found (very small object), skip regression
    cat("  ⚠️  Cell cycle scoring failed for", line_name,
        "— regressing percent.mt only\n")
    vars_regress <<- "percent.mt"
  })

  # ── SCTransform ─────────────────────────────────────────────────────────
  obj <- SCTransform(
    obj,
    vars.to.regress  = vars_regress,
    variable.features.n = 3000,
    vst.flavor       = "v2",     # Pearson residuals v2 (faster, more stable)
    verbose          = FALSE
  )

  cat("  ✅", line_name, "SCT complete |",
      length(VariableFeatures(obj)), "HVGs\n")
  return(obj)
})
names(cell_line_list) <- names(SplitObject(MalignantCD4T_raw, split.by = "cell_line"))

# ── Step 3: Collect per-line HVGs ─────────────────────────────────────────
# Strategy: rank genes by how many cell lines they are variable in.
# Genes variable across multiple lines are robust HVGs not dominated
# by any single line's batch effect.

all_hvg_lists <- lapply(cell_line_list, VariableFeatures)
hvg_freq <- sort(table(unlist(all_hvg_lists)), decreasing = TRUE)

# Select top 3000 genes by cross-line frequency
n_hvgs      <- min(3000, length(hvg_freq))
shared_hvgs <- names(hvg_freq)[1:n_hvgs]

cat("\nTop 3000 HVGs selected by cross-line frequency\n")
cat("Genes variable in all", length(cell_line_list), "lines:",
    sum(hvg_freq == length(cell_line_list)), "\n")
cat("Genes variable in ≥3 lines:",
    sum(hvg_freq >= 3), "\n")

# ── Step 4: Merge SCT-normalised cell line objects ─────────────────────────
# merge() on SCT objects concatenates the corrected count matrices.
# We do NOT RunIntegration here — MapQuery will handle cross-dataset
# correction via anchor-based transfer.
cat("\nMerging per-line SCT objects...\n")

MalignantCD4T <- merge(
  x    = cell_line_list[[1]],
  y    = cell_line_list[-1],
  merge.data = TRUE    # concatenate SCT data layers
)

# ── Step 5: Set unified VariableFeatures on merged object ─────────────────
VariableFeatures(MalignantCD4T) <- shared_hvgs

# ── Step 6: Run PCA on merged SCT object ──────────────────────────────────
# PCA is needed for FindTransferAnchors (reference.reduction = "pca")
# We run it on the merged data so each cell has a PCA embedding consistent
# with the shared HVG space.
cat("Running PCA on merged malignant object...\n")

# Note: JoinLayers() is Seurat v5 only and not needed here.
# In Seurat v4, merge() with merge.data = TRUE already concatenates
# the SCT data slot directly — no layer joining required.

MalignantCD4T <- ScaleData(
  MalignantCD4T,
  features = shared_hvgs,
  assay    = "SCT",
  verbose  = FALSE
)
MalignantCD4T <- RunPCA(
  MalignantCD4T,
  features = shared_hvgs,
  assay    = "SCT",
  npcs     = 30,
  verbose  = FALSE
)

cat("✅ Per-cell-line SCT + merge complete\n")
cat("Final merged object:\n")
cat("  Cells:", ncol(MalignantCD4T), "\n")
cat("  VariableFeatures:", length(VariableFeatures(MalignantCD4T)), "\n")
cat("  PCA dims:", ncol(Embeddings(MalignantCD4T, "pca")), "\n")
cat("Cell line distribution:\n")
print(table(MalignantCD4T$cell_line))

# Clean up per-line list to free memory
rm(MalignantCD4T_raw, cell_line_list)
gc()
```



# Reference Trajectory (Monocle3)

## Build CDS from Reference

```{r build-cds}
# ── Convert Seurat reference to CellDataSet ────────────────────────────────
# We use the reference to build/confirm the trajectory.
# If monocle3_pseudotime already exists in the object (from previous run),
# we can skip learn_graph and go straight to projection.
# Here we re-build for completeness and reproducibility.

cat("=== Building Monocle3 CDS from reference ===\n")

cds <- as.cell_data_set(reference_integrated)

# ── Transfer UMAP coordinates from Seurat (use the frozen reference UMAP) ─
reducedDim(cds, "UMAP") <- Embeddings(reference_integrated, "umap")

# ── Set partitions: single partition = one connected trajectory ────────────
# For CD4 T cell differentiation we expect a linear/branched but connected
# graph — use_partition=FALSE gives cleaner results for a single lineage.
cds@clusters$UMAP$partitions <- factor(rep(1, ncol(cds)))

# ── Set cluster assignments aligned to Seurat clusters ─────────────────────
cluster_vec <- reference_integrated$seurat_clusters[colnames(cds)]
cds@clusters$UMAP$clusters <- factor(cluster_vec)

# ── Transfer cell metadata ─────────────────────────────────────────────────
colData(cds)$cell_line             <- reference_integrated$orig.ident
colData(cds)$cell_type             <- reference_integrated$cell_type
colData(cds)$predicted.celltype.l2 <- reference_integrated$predicted.celltype.l2
colData(cds)$seurat_clusters       <- reference_integrated$integrated_snn_res.0.2

# Transfer sample/origin column if it exists (name may vary)
if ("orig.ident" %in% colnames(reference_integrated@meta.data)) {
  colData(cds)$sample <- reference_integrated$orig.ident
} else if ("sample" %in% colnames(reference_integrated@meta.data)) {
  colData(cds)$sample <- reference_integrated$sample
}

cat("CDS built:", ncol(cds), "cells\n")
cat("Clusters:", nlevels(factor(colData(cds)$seurat_clusters)), "\n")
cat("Partitions:", nlevels(partitions(cds)), "\n")
cat("\nCell type breakdown in CDS:\n")
print(table(colData(cds)$predicted.celltype.l2))

```

## Learn Trajectory Graph

```{r learn-graph, fig.width=14, fig.height=8}
# ── Learn principal graph ──────────────────────────────────────────────────
# use_partition = FALSE: single connected graph across all healthy cells.
#                        This works well here because CD4 Proliferating cells
#                        have been removed — without them, the manifold is a
#                        clean continuous differentiation space with no
#                        spurious cell-cycle branch pulling cells away.
# close_loop    = FALSE: open trajectory (Tnaive → Temra linear axis).
#                        The trajectory should not loop back on itself.
#
# RATIONALE: Removing proliferating cells is key to this step working well.
# Previously (with proliferating cells), Monocle3 would create an artifactual
# branch driven by MKI67/TOP2A/CDK1 rather than differentiation genes.
# Now the graph cleanly follows: Tnaive → TCM → TEM → Temra (+ Treg branch).

# ── FIX PARTITIONS NAMES (CRITICAL) ──────────────────────────────────────
if (is.null(names(cds@clusters$UMAP$partitions))) {
  cds@clusters$UMAP$partitions <- rep(1, ncol(cds))
  names(cds@clusters$UMAP$partitions) <- colnames(cds)
}
cat("✅ Partitions fixed:", table(cds@clusters$UMAP$partitions), "\n")

# ── FIX CLUSTERS (if needed) ─────────────────────────────────────────────
if (is.null(cds@clusters$UMAP$clusters)) {
  clustervec <- reference_integrated$seurat_clusters[colnames(cds)]
  cds@clusters$UMAP$clusters <- factor(clustervec)
}
cat("✅ Clusters:", length(unique(cds@clusters$UMAP$clusters)), "\n")

# ── LEARN GRAPH ──────────────────────────────────────────────────────────
cds <- learn_graph(cds, use_partition = FALSE, close_loop = FALSE, verbose = TRUE)

cat("\n✅ Principal graph learned!\n")
cat("Nodes:", length(igraph::V(principal_graph(cds)$UMAP)), "\n")
cat("Branch points:", sum(igraph::degree(principal_graph(cds)$UMAP) > 2), "\n")
cat("Expect 1-2 branches (Treg divergence ± Tnaive-RTE)\n")

# ── VISUALIZE: Azimuth l2 (PRIMARY) + cell_type (SECONDARY) ───────────────
p1 <- plot_cells(cds,
                 color_cells_by = "predicted.celltype.l2",  # YOUR 5 labels
                 label_groups_by_cluster = FALSE,
                 label_leaves = TRUE,
                 label_branch_points = TRUE,
                 graph_label_size = 3.5,
                 cell_size = 0.8) +
  scale_color_manual(values = azimuth_l2_colors) +
  ggtitle("Monocle3 Graph: Azimuth l2 (CD4 Naive → Temra/CTL)") +
  theme(plot.title = element_text(hjust = 0.5, size = 14, face = "bold"))

p2 <- plot_cells(cds,
                 color_cells_by = "cell_type",
                 label_groups_by_cluster = FALSE,
                 label_leaves = TRUE,
                 label_branch_points = TRUE,
                 graph_label_size = 3.5,
                 cell_size = 0.8) +
  ggtitle("Monocle3 Graph: Cell Types") +
  theme(plot.title = element_text(hjust = 0.5, size = 14, face = "bold"))

p1 | p2

```

# Fixed learn-graph Visualization
```{r Visualizations, fig.width=14, fig.height=8}
umap_coords <- as.data.frame(Embeddings(reference_integrated, "umap"))
colnames(umap_coords) <- c("UMAP1", "UMAP2")   # rename for clarity

umap_coords$l2        <- reference_integrated$predicted.celltype.l2
umap_coords$cell_type <- reference_integrated$cell_type

# Centroid per Azimuth l2 state
label_coords <- umap_coords %>%
  dplyr::group_by(l2) %>%
  dplyr::summarise(
    UMAP1 = median(UMAP1),
    UMAP2 = median(UMAP2),
    .groups = "drop"
  )

# Centroid per cell_type
label_coords2 <- umap_coords %>%
  dplyr::group_by(cell_type) %>%
  dplyr::summarise(
    UMAP1 = median(UMAP1),
    UMAP2 = median(UMAP2),
    .groups = "drop"
  )

library(ggrepel)

# Azimuth l2 plot
p1 <- plot_cells(
        cds,
        color_cells_by = "predicted.celltype.l2",
        label_groups_by_cluster = FALSE,
        label_leaves        = FALSE,
        label_branch_points = FALSE,
        cell_size           = 0.7,
        show_trajectory_graph = TRUE
      ) +
  scale_color_manual(values = azimuth_l2_colors, name = "State") +
  geom_label_repel(
    data = label_coords,
    aes(x = UMAP1, y = UMAP2, label = l2),
    size = 4, fontface = "bold",
    fill = "white", alpha = 0.85,
    box.padding = 0.5, label.padding = 0.3,
    segment.color = "grey40"
  ) +
  ggtitle("Monocle3 Graph: Azimuth l2 (CD4 Naive → Temra/CTL)") +
  theme(plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
        legend.position = "none")

# cell_type plot
p2 <- plot_cells(
        cds,
        color_cells_by = "cell_type",
        label_groups_by_cluster = FALSE,
        label_leaves        = FALSE,
        label_branch_points = FALSE,
        cell_size           = 0.7,
        show_trajectory_graph = TRUE
      ) +
  geom_label_repel(
    data = label_coords2,
    aes(x = UMAP1, y = UMAP2, label = cell_type),
    size = 3.5, fontface = "bold",
    fill = "white", alpha = 0.85,
    box.padding = 0.5,
    segment.color = "grey40"
  ) +
  ggtitle("Monocle3 Graph: Cell Types") +
  theme(plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
        legend.position = "none")

p1 | p2

p1 

p2
```


## Set Root & Order Cells by Pseudotime

```{r pseudotime-root, fig.width=12, fig.height=8}
# ── Identify root node in the naive cluster ────────────────────────────────
# Root = cells in the Tnaive cluster (CCR7+SELL+TCF7+) = cluster 0
# Using the principal graph node closest to the Tnaive centroid ensures
# pseudotime=0 is biologically anchored to the most undifferentiated state.

cat("Available Azimuth l2 labels:\n")
print(sort(unique(colData(cds)$predicted.celltype.l2)))

# Find naive cells — use both Azimuth label and our cell_type annotation
naive_ids <- rownames(colData(cds))[
  grepl("naive|Naive|TN$", colData(cds)$predicted.celltype.l2, ignore.case = TRUE) |
  grepl("Tnaive", as.character(colData(cds)$cell_type), ignore.case = FALSE)
]

cat("\nNaive root cells identified:", length(naive_ids), "\n")
if (length(naive_ids) == 0) {
  stop("❌ No naive root cells found. Check label strings above and adjust grep patterns.")
}

# ── Function to find the principal graph node closest to naive centroid ────
get_root_node <- function(cds, cell_ids) {
  closest <- cds@principal_graph_aux$UMAP$pr_graph_cell_proj_closest_vertex
  closest <- as.matrix(closest[colnames(cds), ])
  igraph::V(principal_graph(cds)$UMAP)$name[
    as.numeric(names(which.max(table(closest[cell_ids, ]))))
  ]
}

root_node <- get_root_node(cds, naive_ids)
cat("Root node:", root_node, "\n")

# ── Order cells ────────────────────────────────────────────────────────────
cds <- order_cells(cds, root_pr_nodes = root_node)

# Transfer pseudotime back to Seurat object
reference_integrated$monocle3_pseudotime <- pseudotime(cds)

# Replace Inf (unreachable cells) with NA for clean plotting
reference_integrated$monocle3_pseudotime[
  !is.finite(reference_integrated$monocle3_pseudotime)
] <- NA

cat("\nPseudotime summary (finite values):\n")
print(summary(reference_integrated$monocle3_pseudotime[
  is.finite(reference_integrated$monocle3_pseudotime)
]))

# ── Plot pseudotime on trajectory ─────────────────────────────────────────
plot_cells(
  cds,
  color_cells_by    = "pseudotime",
  label_cell_groups = FALSE,
  cell_size         = 0.8,
  show_trajectory_graph = TRUE
) +
  scale_color_viridis_c(option = "plasma", name = "Pseudotime") +
  ggtitle("Reference CD4+ T Cell Pseudotime\n(Root = Tnaive, Terminal = Temra/CTL)") +
  theme(plot.title = element_text(hjust = 0.5, size = 14, face = "bold"))

# ── Pseudotime on Seurat UMAP ─────────────────────────────────────────────
FeaturePlot(
  reference_integrated,
  features  = "monocle3_pseudotime",
  reduction = "umap",
  cols      = c("lightblue", "darkblue", "red"),
  label     = TRUE,
  pt.size   = 0.5
) +
  ggtitle("Reference UMAP — Pseudotime") +
  theme(plot.title = element_text(hjust = 0.5))

```

# Visualization
```{r , fig.width=14, fig.height=8}
# ── Identify root node in the naive cluster ────────────────────────────────
# Root = cells in the Tnaive cluster (CCR7+SELL+TCF7+) = cluster 0
# Using the principal graph node closest to the Tnaive centroid ensures
# pseudotime=0 is biologically anchored to the most undifferentiated state.

cat("Available Azimuth l2 labels:\n")
print(sort(unique(colData(cds)$predicted.celltype.l2)))

# Find naive cells — use both Azimuth label and our cell_type annotation
naive_ids <- rownames(colData(cds))[
  grepl("naive|Naive|TN$", colData(cds)$predicted.celltype.l2, ignore.case = TRUE) |
  grepl("Tnaive", as.character(colData(cds)$cell_type), ignore.case = FALSE)
]

cat("\nNaive root cells identified:", length(naive_ids), "\n")
if (length(naive_ids) == 0) {
  stop("❌ No naive root cells found. Check label strings above and adjust grep patterns.")
}

# ── Function to find the principal graph node closest to naive centroid ────
get_root_node <- function(cds, cell_ids) {
  closest <- cds@principal_graph_aux$UMAP$pr_graph_cell_proj_closest_vertex
  closest <- as.matrix(closest[colnames(cds), ])
  igraph::V(principal_graph(cds)$UMAP)$name[
    as.numeric(names(which.max(table(closest[cell_ids, ]))))
  ]
}

root_node <- get_root_node(cds, naive_ids)
cat("Root node:", root_node, "\n")

# ── Order cells ────────────────────────────────────────────────────────────
cds <- order_cells(cds, root_pr_nodes = root_node)

# Transfer pseudotime back to Seurat object
reference_integrated$monocle3_pseudotime <- pseudotime(cds)

# Replace Inf (unreachable cells) with NA for clean plotting
reference_integrated$monocle3_pseudotime[
  !is.finite(reference_integrated$monocle3_pseudotime)
] <- NA

cat("\nPseudotime summary (finite values):\n")
print(summary(reference_integrated$monocle3_pseudotime[
  is.finite(reference_integrated$monocle3_pseudotime)
]))

# ── Plot pseudotime on trajectory ─────────────────────────────────────────
plot_cells(
  cds,
  color_cells_by    = "pseudotime",
  label_cell_groups = FALSE,
  cell_size         = 0.8,
  show_trajectory_graph = TRUE
) +
  scale_color_viridis_c(option = "plasma", name = "Pseudotime") +
  ggtitle("Reference CD4+ T Cell Pseudotime\n(Root = Tnaive, Terminal = Temra/CTL)") +
  theme(plot.title = element_text(hjust = 0.5, size = 14, face = "bold"))

# ── Pseudotime on Seurat UMAP + state labels ─────────────────────────────
library(dplyr)
library(ggrepel)

umap_df <- as.data.frame(Embeddings(reference_integrated, "umap"))
colnames(umap_df) <- c("UMAP1", "UMAP2")
umap_df$pt <- reference_integrated$monocle3_pseudotime
umap_df$l2 <- reference_integrated$predicted.celltype.l2

umap_df <- umap_df[is.finite(umap_df$pt) & !is.na(umap_df$l2), ]

label_coords <- umap_df %>%
  group_by(l2) %>%
  summarise(
    UMAP1 = median(UMAP1),
    UMAP2 = median(UMAP2),
    .groups = "drop"
  )

p_umap <- FeaturePlot(
  reference_integrated,
  features  = "monocle3_pseudotime",
  reduction = "umap",
  cols      = c("lightblue", "green", "red"),
  label     = FALSE,
  pt.size   = 0.5
)

p_umap +
  geom_label_repel(
    data = label_coords,
    aes(x = UMAP1, y = UMAP2, label = l2),
    inherit.aes = FALSE,
    size = 4,
    fontface = "bold",
    fill = "white",
    alpha = 0.85,
    box.padding = 0.4,
    label.padding = 0.25,
    segment.color = "grey40"
  ) +
  ggtitle("Reference UMAP — Pseudotime with CD4 Naive / TCM / TEM / Temra/CTL / Treg labels") +
  theme(plot.title = element_text(hjust = 0.5))

```

```{r milestones, fig.width=14, fig.height=6}
# ── DEFINE MILESTONES: Key CD4 states along your trajectory ───────────────
# Based on your Azimuth l2 + pseudotime quantiles

library(dplyr)

# 1. Get state-specific pseudotime ranges
pt_ranges <- reference_integrated@meta.data %>%
  filter(is.finite(monocle3_pseudotime)) %>%
  group_by(predicted.celltype.l2) %>%
  summarise(
    n = n(),
    pt_min = round(min(monocle3_pseudotime), 2),
    pt_max = round(max(monocle3_pseudotime), 2),
    pt_median = round(median(monocle3_pseudotime), 2),
    .groups = "drop"
  ) %>%
  arrange(pt_median)

print("Pseudotime by state (milestone order):")
print(pt_ranges)

# 2. Define milestones (pseudotime quantiles + biology)
milestones <- data.frame(
  milestone = c("Naive", "Central_Memory", "Effector_Memory", "Terminal_Effector"),
  pseudotime = c(0.1, 0.8, 1.4, 2.0),  # Your trajectory quantiles
  state = c("CD4 Naive", "CD4 TCM", "CD4 TEM", "CD4 Temra/CTL")
)

print("Milestones defined:")
print(milestones)

# 3. Assign cells to milestones — FIXED for YOUR 5 states
ref_milestones <- reference_integrated@meta.data %>%
  filter(is.finite(monocle3_pseudotime)) %>%
  mutate(
    milestone = case_when(
      # Naive: pt < 25th percentile
      monocle3_pseudotime <= quantile(monocle3_pseudotime, 0.25, na.rm = TRUE) ~ "Naive",
      
      # Central Memory (TCM): 25th-60th percentile  
      monocle3_pseudotime <= quantile(monocle3_pseudotime, 0.60, na.rm = TRUE) ~ "Central_Memory",
      
      # Treg + TEM: 60th-85th percentile (branch + effector memory)
      predicted.celltype.l2 == "Treg" | 
      (monocle3_pseudotime <= quantile(monocle3_pseudotime, 0.85, na.rm = TRUE)) ~ "Regulatory_Effector",
      
      # Terminal: TEMRA/CTL (top 15%)
      TRUE ~ "Terminal_Effector"
    ),
    
    # Simplified state mapping for coloring
    state_group = case_when(
      predicted.celltype.l2 == "CD4 Naive" ~ "Naive",
      predicted.celltype.l2 == "CD4 TCM" ~ "Central_Memory", 
      predicted.celltype.l2 == "Treg" ~ "Treg",
      predicted.celltype.l2 == "CD4 TEM" ~ "Effector_Memory",
      predicted.celltype.l2 == "CD4 Temra/CTL" ~ "Terminal_Effector"
    )
  )

cat("Milestone distribution:\n")
print(table(ref_milestones$milestone))
cat("\nState validation:\n") 
print(table(ref_milestones$predicted.celltype.l2, ref_milestones$state_group))

```


## Validate Pseudotime Against Cell Type

```{r validate-pseudotime, fig.width=10, fig.height=5}
# ── Guard: only run if monocle3_pseudotime has already been computed ──────
# This column is created in the trajectory chunk (Section 2).
# If you are running this script for the first time on a freshly loaded
# reference (without pseudotime), this chunk will skip gracefully.
# Re-run after the trajectory section completes.

if (!"monocle3_pseudotime" %in% colnames(reference_integrated@meta.data)) {
  message(
    "⚠️  monocle3_pseudotime not yet computed — skipping validation plots.\n",
    "   Run Section 2 (Reference Trajectory) first, then re-run this chunk."
  )
} else {

# ── Violin: pseudotime distribution per Azimuth l2 label ─────────────────
# CONFIRMED label order in this dataset (low → high pseudotime):
#   CD4 Naive → CD4 TCM → CD4 TEM → CD4 Temra/CTL (highest)
#   Treg: separate branch, variable pseudotime
#
# We use predicted.celltype.l2 as PRIMARY label throughout — consistent
# with the rest of the analysis.

pt_data <- data.frame(
  pseudotime = reference_integrated$monocle3_pseudotime,
  l2         = reference_integrated$predicted.celltype.l2
) %>%
  filter(is.finite(pseudotime), !is.na(l2))

# Order l2 labels by median pseudotime
l2_order <- pt_data %>%
  group_by(l2) %>%
  summarise(med_pt = median(pseudotime, na.rm = TRUE)) %>%
  arrange(med_pt) %>%
  pull(l2)

pt_data$l2 <- factor(pt_data$l2, levels = l2_order)

# ── Violin plot coloured by Azimuth l2 ────────────────────────────────────
p_violin <- ggplot(pt_data, aes(x = l2, y = pseudotime, fill = l2)) +
  geom_violin(scale = "width", alpha = 0.85, trim = TRUE) +
  geom_boxplot(width = 0.12, fill = "white", outlier.size = 0.3) +
  scale_fill_manual(values = azimuth_l2_colors, na.value = "grey70") +
  theme_classic(base_size = 11) +
  theme(
    axis.text.x  = element_text(angle = 40, hjust = 1, size = 9),
    legend.position = "none",
    plot.title   = element_text(hjust = 0.5, face = "bold")
  ) +
  labs(
    title = "Pseudotime Distribution per Azimuth l2 Label (Reference)",
    x = "", y = "Monocle3 Pseudotime"
  )

# ── Ridge plot — same data, different view ────────────────────────────────
p_ridge <- ggplot(pt_data, aes(x = pseudotime, y = l2, fill = l2)) +
  geom_density_ridges(alpha = 0.75, scale = 1.2, rel_min_height = 0.01) +
  scale_fill_manual(values = azimuth_l2_colors, na.value = "grey70") +
  theme_classic(base_size = 10) +
  theme(legend.position = "none",
        plot.title = element_text(hjust = 0.5, face = "bold")) +
  labs(
    title = "Pseudotime Density by Azimuth l2 Label",
    x = "Pseudotime", y = ""
  )

p_violin | p_ridge

} # end else (monocle3_pseudotime exists)
```


# Prepare Reference for MapQuery Projection

```{r prepare-reference}
# ── Ensure SCT assay and variable features are set ─────────────────────────
# MapQuery requires:
#   1. DefaultAssay = "SCT" on reference
#   2. VariableFeatures set on reference SCT
#   3. PCA reduction available
#   4. UMAP with return.model = TRUE (validated above)

DefaultAssay(reference_integrated) <- "SCT"

ref_hvgs <- VariableFeatures(reference_integrated, assay = "SCT")

if (length(ref_hvgs) == 0) {
  cat("⚠️  No HVGs stored — extracting from scale.data rownames\n")
  ref_hvgs <- rownames(GetAssayData(reference_integrated, assay = "SCT",
                                     layer = "scale.data"))
}

cat("Reference HVGs available:", length(ref_hvgs), "\n")
cat("PCA dims available:", ncol(Embeddings(reference_integrated, "pca")), "\n")
cat("✅ Reference ready for MapQuery\n")


```


# Map Malignant CD4+ T Cells onto Reference

## Find Transfer Anchors

```{r find-anchors}
# ══════════════════════════════════════════════════════════════════════════
# FindTransferAnchors
# ══════════════════════════════════════════════════════════════════════════
# normalization.method = "SCT": both objects are SCT-normalised
# reference.reduction  = "pca": anchors in the reference PCA space
# dims = 1:20: consistent with reference clustering
#
# KEY FIX: common_features now computed from per-line SCT HVGs (shared_hvgs)
# intersected with reference HVGs. This ensures anchors are found on genes
# that are variable in BOTH the reference biology AND each malignant line.
# Previously (merged SCT), common_features were dominated by batch-driven
# HVGs from one large SCT model → biased anchors → all cells → TCM.
# ══════════════════════════════════════════════════════════════════════════

DefaultAssay(reference_integrated) <- "SCT"
DefaultAssay(MalignantCD4T)        <- "SCT"

ref_hvgs <- VariableFeatures(reference_integrated, assay = "SCT")
if (length(ref_hvgs) == 0) {
  ref_hvgs <- rownames(GetAssayData(reference_integrated, assay = "SCT",
                                     layer = "scale.data"))
}

# VariableFeatures on merged malignant = shared_hvgs set in per-line SCT block
query_hvgs <- VariableFeatures(MalignantCD4T)

common_features <- intersect(ref_hvgs, query_hvgs)
cat("Reference HVGs:", length(ref_hvgs), "\n")
cat("Query HVGs (per-line consensus):", length(query_hvgs), "\n")
cat("Common HVGs for anchor finding:", length(common_features), "\n")

if (length(common_features) < 1500) {
  warning(paste0(
    "⚠️  Only ", length(common_features), " common features found.\n",
    "   This is below the recommended minimum of 1500.\n",
    "   Check that both objects used SCTransform with similar gene sets.\n",
    "   Consider increasing nfeatures in SCTransform to 4000."
  ))
} else {
  cat("✅ Sufficient common features for robust anchor finding\n")
}


# ── Clean and rank common_features ──────────────────────────────────────────

# 1. Remove ribosomal, mitochondrial, heat shock, and other known batch genes
#    These are highly variable due to library quality, not biology.
junk_pattern <- "^RPL|^RPS|^MT-|^HSP|^HSPA|^HSPB|^HSPD|^HSPE|^HSPH|^SNHG|^MALAT1|^NEAT1|^XIST"
n_before <- length(common_features)
common_features_clean <- common_features[!grepl(junk_pattern, common_features)]
cat("Removed junk genes:", n_before - length(common_features_clean), "\n")
cat("Clean common features:", length(common_features_clean), "\n")

# 2. Rank by mean HVG rank across reference and query
#    SCTransform stores residual variance per gene — higher = more variable.
#    Genes ranked highly in BOTH objects are robustly variable (biology-driven).
ref_var_meta  <- reference_integrated[["SCT"]]@meta.features
query_var_meta <- MalignantCD4T[["SCT"]]@meta.features

# residual_variance column is set by SCTransform
if ("residual_variance" %in% colnames(ref_var_meta) &&
    "residual_variance" %in% colnames(query_var_meta)) {

  ref_ranks   <- ref_var_meta[common_features_clean, "residual_variance"]
  query_ranks <- query_var_meta[common_features_clean, "residual_variance"]

  # Replace NA with 0 (gene not in that object's variance table)
  ref_ranks[is.na(ref_ranks)]     <- 0
  query_ranks[is.na(query_ranks)] <- 0

  mean_var <- (ref_ranks + query_ranks) / 2
  common_features <- common_features_clean[order(mean_var, decreasing = TRUE)]

  cat("Top 20 ranked features (should be T cell biology):\n")
  print(head(common_features, 20))

} else {
  # Fallback: just use cleaned list without ranking
  cat("⚠️  residual_variance not found in SCT meta.features, using unranked clean list\n")
  common_features <- common_features_clean
}

# 3. Sanity check: key T cell marker genes present?
markers_to_check <- c("CCR7", "SELL", "TCF7", "IL7R", "GZMB", "PRF1",
                      "FOXP3", "CXCR5", "TOX", "PDCD1", "HAVCR2", "TIGIT",
                      "CD44", "CD69", "IFNG", "TNF", "EOMES", "TBX21")
found_markers <- markers_to_check[markers_to_check %in% common_features]
cat("\nKey T cell markers in feature set:", length(found_markers), "/", length(markers_to_check), "\n")
print(found_markers)

if (length(found_markers) < 5) {
  warning(paste0(
    "⚠️  Fewer than 5 canonical T cell markers in common_features.\n",
    "   The feature set may still be dominated by non-specific genes.\n",
    "   Consider: did SCTransform use sufficient nfeatures (3000-4000)?\n",
    "   Run: length(VariableFeatures(reference_integrated)) to check."
  ))
}


cat("\n=== Finding transfer anchors ===\n")

# Diagnostics
cat("Reference cells:", ncol(reference_integrated), "\n")
cat("Query cells:    ", ncol(MalignantCD4T), "\n")
cat("Common features:", length(common_features), "\n")

# Use as many dims as both objects support (up to 30)
dims_to_use <- min(30,
                   ncol(Embeddings(reference_integrated, "pca")),
                   ncol(Embeddings(MalignantCD4T, "pca")))
cat("Using dims 1:", dims_to_use, "\n")

# KEY CHANGES vs previous run:
#   k.anchor = 10   (was default 5)  → more initial anchor candidates
#   k.filter = 500  (was default 200) → less aggressive pruning on 40k query
#   dims = 1:30     (was 1:20)        → capture more variance in tumour cells
anchors <- FindTransferAnchors(
  reference            = reference_integrated,
  query                = MalignantCD4T,
  features             = common_features,
  normalization.method = "SCT",
  reference.reduction  = "pca",
  dims                 = 1:dims_to_use,
  k.anchor             = 10,
  k.filter             = 500,
  k.score              = 30,
  verbose              = TRUE
)

n_anchors    <- nrow(anchors@anchors)
anchor_ratio <- ncol(MalignantCD4T) / n_anchors
cat("Anchors found:", n_anchors, "\n")
cat("Cells per anchor:", round(anchor_ratio, 1), "\n")

if (anchor_ratio > 8) {
  warning(paste0(
    "⚠️  Low anchor density (1:", round(anchor_ratio, 1), ").\n",
    "   Ideal is 1:5 or better. Inspect head(common_features) for batch genes,\n",
    "   and check query UMAP projection before trusting pseudotime values."
  ))
} else {
  cat("✅ Anchor density acceptable\n")
}
```

## MapQuery — Project onto Reference UMAP

```{r map-query}
# ── MapQuery ───────────────────────────────────────────────────────────────
# This does three things simultaneously:
#   1. TransferData: transfers labels + pseudotime from reference to query
#   2. IntegrateEmbeddings: projects query PCA into reference PCA space
#   3. ProjectUMAP: projects query cells onto the FROZEN reference UMAP
#
# refdata transfers:
#   - monocle3_pseudotime: continuous pseudotime value (via weighted anchors)
#   - seurat_clusters:     reference cluster ID (0-6)
#   - cell_type:           our annotated cell type label
#   - predicted.celltype.l2: Azimuth l2 label (KEY for state interpretation)

cat("=== Projecting malignant cells onto reference (MapQuery) ===\n")

mapped_MalignantCD4T <- MapQuery(
  anchorset         = anchors,
  query             = MalignantCD4T,
  reference         = reference_integrated,
  refdata           = list(
    # PRIMARY — Azimuth l2 label: the main state annotation used throughout
    predicted.celltype.l2 = "predicted.celltype.l2",
    # CONTINUOUS — pseudotime value from Monocle3 trajectory
    pseudotime            = "monocle3_pseudotime",
    # SECONDARY — reference cluster ID (integer, for cross-check only)
    seurat_clusters       = "seurat_clusters"
  ),
  reference.reduction = "pca",
  reduction.model     = "umap"
)

cat("✅ MapQuery complete\n")
cat("Mapped malignant cells:", ncol(mapped_MalignantCD4T), "\n")
cat("\nTransferred metadata columns:\n")
transferred_cols <- grep("^predicted\\.", colnames(mapped_MalignantCD4T@meta.data), value = TRUE)
print(transferred_cols)

cat("\nTransferred pseudotime summary:\n")
print(summary(mapped_MalignantCD4T$predicted.pseudotime))

cat("\nTransferred Azimuth l2 distribution (PRIMARY):\n")
print(table(mapped_MalignantCD4T$predicted.predicted.celltype.l2))

cat("\nTransferred cluster distribution (secondary reference):\n")
print(table(mapped_MalignantCD4T$predicted.seurat_clusters))

# ── Sanity check: distribution should NOT be 99%+ in one state ────────────
l2_dist   <- prop.table(table(mapped_MalignantCD4T$predicted.predicted.celltype.l2))
max_state <- names(which.max(l2_dist))
max_pct   <- round(100 * max(l2_dist), 1)

if (max_pct > 90) {
  warning(paste0(
    "⚠️  ", max_pct, "% of cells mapped to '", max_state, "'.\n",
    "   This suggests a batch/SCT issue. Verify per-cell-line SCT ran correctly\n",
    "   and that common_features contain differentiation genes, not batch genes."
  ))
} else {
  cat("\n✅ State distribution looks diverse — projection appears unbiased\n")
  cat("   Most common state:", max_state, "(", max_pct, "% of cells)\n")
}

```


# Visualise Projection on Reference UMAP

## Reference + Malignant Overlay (Pseudotime)

```{r plot-pseudotime-overlay, fig.width=14, fig.height=6}

# Coerce pseudotime from character to numeric (Seurat transfer quirk)
mapped_MalignantCD4T$predicted.pseudotime <- as.numeric(
  mapped_MalignantCD4T$predicted.pseudotime
)

# Verify
cat("Class:", class(mapped_MalignantCD4T$predicted.pseudotime), "\n")
cat("Range:", range(mapped_MalignantCD4T$predicted.pseudotime, na.rm = TRUE), "\n")




ref_umap <- data.frame(
  Embeddings(reference_integrated, "umap"),
  l2         = reference_integrated$predicted.celltype.l2,
  pseudotime = reference_integrated$monocle3_pseudotime,
  type       = "Reference"
)
colnames(ref_umap)[1:2] <- c("UMAP_1", "UMAP_2")

query_umap <- data.frame(
  Embeddings(mapped_MalignantCD4T, "ref.umap"),
  pseudotime = mapped_MalignantCD4T$predicted.pseudotime,   # numeric after fix
  cell_line  = mapped_MalignantCD4T$cell_line,
  l2_q       = mapped_MalignantCD4T$predicted.predicted.celltype.l2,
  type       = "Malignant"
)
colnames(query_umap)[1:2] <- c("UMAP_1", "UMAP_2")

# ── Cell line palette ─────────────────────────────────────────────────────
cell_line_palette <- colorRampPalette(brewer.pal(8, "Dark2"))(
  length(unique(query_umap$cell_line))
)
names(cell_line_palette) <- sort(unique(query_umap$cell_line))

all_l2_labels  <- unique(c(as.character(ref_umap$l2),
                             as.character(query_umap$l2_q)))
azimuth_l2_colors <- extend_palette(azimuth_l2_colors, all_l2_labels)

# ── Plot 1: Reference grey, malignant coloured by pseudotime ──────────────
p1 <- ggplot() +
  geom_point(data = ref_umap,
             aes(x = UMAP_1, y = UMAP_2),
             color = reference_color, size = 0.4, alpha = 0.6) +
  geom_point(data = query_umap %>% filter(is.finite(pseudotime)),
             aes(x = UMAP_1, y = UMAP_2, color = pseudotime),
             size = 1.2, alpha = 0.9) +
  scale_color_viridis_c(option = "plasma", name = "Pseudotime",
                        na.value = "grey40") +
  theme_classic(base_size = 11) +
  theme(plot.title = element_text(hjust = 0.5, face = "bold")) +
  ggtitle("Malignant CD4T Projected onto Reference\n(Coloured by transferred pseudotime)")

# ── Plot 2: Reference coloured by Azimuth l2, malignant cells in red ──────
p2 <- ggplot() +
  geom_point(data = ref_umap,
             aes(x = UMAP_1, y = UMAP_2, color = l2),
             size = 0.4, alpha = 0.7) +
  scale_color_manual(values = azimuth_l2_colors,
                     name   = "Azimuth l2\n(reference)",
                     na.value = "grey60") +
  geom_point(data = query_umap,
             aes(x = UMAP_1, y = UMAP_2),
             color = malignant_color, size = 1.0, alpha = 0.8, shape = 16) +
  theme_classic(base_size = 11) +
  theme(plot.title = element_text(hjust = 0.5, face = "bold"),
        legend.key.size = unit(0.4, "cm"),
        legend.text = element_text(size = 8)) +
  ggtitle("Reference Azimuth l2 (colour)\n+ Malignant Cells (red overlay)")

p1 | p2
```

## Azimuth l2 Annotation Overlay

```{r plot-azimuth-overlay, fig.width=14, fig.height=6}
p3 <- ggplot() +
  geom_point(data = ref_umap,
             aes(x = UMAP_1, y = UMAP_2),
             color = reference_color, size = 0.4, alpha = 0.6) +
  geom_point(data = query_umap,
             aes(x = UMAP_1, y = UMAP_2, color = l2_q),
             size = 1.2, alpha = 0.9) +
  scale_color_manual(values = azimuth_l2_colors,
                     name   = "Azimuth l2\n(transferred)",
                     na.value = "grey40") +
  theme_classic(base_size = 11) +
  theme(plot.title = element_text(hjust = 0.5, face = "bold")) +
  ggtitle("Malignant CD4T — Transferred Azimuth l2 Labels")

p4 <- ggplot() +
  geom_point(data = ref_umap,
             aes(x = UMAP_1, y = UMAP_2),
             color = reference_color, size = 0.4, alpha = 0.5) +
  geom_point(data = query_umap,
             aes(x = UMAP_1, y = UMAP_2, color = cell_line),
             size = 1.0, alpha = 0.9) +
  scale_color_manual(values = cell_line_palette, name = "Cell line") +
  theme_classic(base_size = 11) +
  theme(plot.title = element_text(hjust = 0.5, face = "bold")) +
  ggtitle("Malignant CD4T per Cell Line")

p3 | p4
```

## Per Cell Line Facet — State Mapping

```{r plot-per-cellline, fig.width=12, fig.height=6}
ggplot() +
  geom_point(data = ref_umap,
             aes(x = UMAP_1, y = UMAP_2),
             color = "grey60", size = 0.3, alpha = 0.5) +
  geom_point(data = query_umap %>% filter(is.finite(pseudotime)),
             aes(x = UMAP_1, y = UMAP_2, color = pseudotime),
             size = 0.9, alpha = 0.85) +
  facet_wrap(~ cell_line, ncol = 4) +
  scale_color_viridis_c(option = "plasma", name = "Pseudotime",
                        na.value = "grey60") +
  theme_classic(base_size = 10) +
  theme(strip.text = element_text(size = 9, face = "bold"),
        strip.background = element_rect(fill = "#E8F4FD"),
        plot.title = element_text(hjust = 0.5, face = "bold")) +
  ggtitle("Per Cell Line: Projection onto Reference UMAP (Pseudotime)")
```


# Quantification — State Assignment

## Assign Each Malignant Cell to a Reference State

```{r state-assignment}
# ── pseudotime bins based on reference quantiles ───────────────────────────
pt_vals   <- reference_integrated$monocle3_pseudotime
pt_vals   <- pt_vals[is.finite(pt_vals)]
pt_breaks <- quantile(pt_vals, probs = c(0, 1/3, 2/3, 1))

mapped_MalignantCD4T$state_azimuth_l2  <- mapped_MalignantCD4T$predicted.predicted.celltype.l2
mapped_MalignantCD4T$state_ref_cluster <- mapped_MalignantCD4T$predicted.seurat_clusters
mapped_MalignantCD4T$pseudotime_value  <- mapped_MalignantCD4T$predicted.pseudotime  # numeric

mapped_MalignantCD4T$pseudotime_bin <- cut(
  mapped_MalignantCD4T$pseudotime_value,
  breaks         = pt_breaks,
  labels         = c("Early (Tnaive-like)", "Mid (TCM/TEM-like)", "Late (Temra-like)"),
  include.lowest = TRUE
)

cat("=== State assignment complete ===\n")
cat("\nAzimuth l2 state (PRIMARY):\n")
print(table(mapped_MalignantCD4T$state_azimuth_l2))
cat("\nReference cluster (cross-check):\n")
print(table(mapped_MalignantCD4T$state_ref_cluster))
cat("\nPseudotime bin:\n")
print(table(mapped_MalignantCD4T$pseudotime_bin))
```

## Quantification Tables

```{r quantify-states, fig.width=14, fig.height=6}
state_summary <- mapped_MalignantCD4T@meta.data %>%
  count(state_azimuth_l2, name = "n_cells") %>%
  mutate(pct = round(100 * n_cells / sum(n_cells), 1)) %>%
  arrange(desc(n_cells))

cat("=== Overall Malignant CD4T State Distribution ===\n")
print(state_summary)

state_per_line <- mapped_MalignantCD4T@meta.data %>%
  count(cell_line, state_azimuth_l2, name = "n_cells") %>%
  group_by(cell_line) %>%
  mutate(pct = round(100 * n_cells / sum(n_cells), 1)) %>%
  ungroup() %>%
  arrange(cell_line, desc(n_cells))

cat("\n=== State Distribution per Cell Line ===\n")
print(state_per_line, n = Inf)

cluster_summary <- mapped_MalignantCD4T@meta.data %>%
  count(state_ref_cluster, name = "n_cells") %>%
  mutate(pct = round(100 * n_cells / sum(n_cells), 1)) %>%
  arrange(as.numeric(as.character(state_ref_cluster)))

cat("\n=== Reference Cluster Distribution (cross-check) ===\n")
print(cluster_summary)

pt_bin_summary <- mapped_MalignantCD4T@meta.data %>%
  count(pseudotime_bin, name = "n_cells") %>%
  mutate(pct = round(100 * n_cells / sum(n_cells), 1))

cat("\n=== Pseudotime Bin Summary ===\n")
print(pt_bin_summary)
```

## Bar Chart — State Proportions

```{r barplot-states, fig.width=14, fig.height=10}
p_bar_overall <- ggplot(state_per_line,
                         aes(x = cell_line, y = pct, fill = state_azimuth_l2)) +
  geom_col(position = "stack", color = "white", linewidth = 0.3) +
  geom_text(aes(label = ifelse(pct >= 5, paste0(pct, "%"), "")),
            position = position_stack(vjust = 0.5),
            size = 3, color = "white", fontface = "bold") +
  scale_fill_manual(values = azimuth_l2_colors, na.value = "grey70",
                    name = "Azimuth l2\nstate") +
  theme_classic(base_size = 11) +
  theme(axis.text.x = element_text(angle = 40, hjust = 1),
        plot.title  = element_text(hjust = 0.5, face = "bold"),
        legend.key.size = unit(0.5, "cm")) +
  labs(title = "Malignant CD4T — State Composition per Cell Line\n(Azimuth l2 transferred labels)",
       x = "Cell Line", y = "% Cells")

pt_bin_line <- mapped_MalignantCD4T@meta.data %>%
  filter(!is.na(pseudotime_bin)) %>%
  count(cell_line, pseudotime_bin) %>%
  group_by(cell_line) %>%
  mutate(pct = round(100 * n / sum(n), 1)) %>%
  ungroup()

bin_colors <- c(
  "Early (Tnaive-like)" = "#4575B4",
  "Mid (TCM/TEM-like)"  = "#FEE090",
  "Late (Temra-like)"   = "#D73027"
)

p_bar_bins <- ggplot(pt_bin_line,
                      aes(x = cell_line, y = pct, fill = pseudotime_bin)) +
  geom_col(position = "stack", color = "white", linewidth = 0.3) +
  geom_text(aes(label = ifelse(pct >= 8, paste0(pct, "%"), "")),
            position = position_stack(vjust = 0.5),
            size = 3, color = "white", fontface = "bold") +
  scale_fill_manual(values = bin_colors, name = "Pseudotime\nBin") +
  theme_classic(base_size = 11) +
  theme(axis.text.x = element_text(angle = 40, hjust = 1),
        plot.title  = element_text(hjust = 0.5, face = "bold")) +
  labs(title = "Malignant CD4T — Pseudotime Bin Distribution per Cell Line",
       x = "Cell Line", y = "% Cells")

p_bar_overall / p_bar_bins
```

## Heatmap — State Proportions Across Cell Lines

```{r heatmap-states, fig.width=10, fig.height=6}
library(pheatmap)

heatmap_df <- state_per_line %>%
  select(cell_line, state_azimuth_l2, pct) %>%
  pivot_wider(names_from  = state_azimuth_l2,
              values_from = pct,
              values_fill = 0)

heatmap_mat             <- as.matrix(heatmap_df[, -1])
rownames(heatmap_mat)   <- heatmap_df$cell_line

pheatmap(
  heatmap_mat,
  color            = colorRampPalette(c("white", "#FEE090", "#D73027"))(50),
  display_numbers  = TRUE,
  number_format    = "%.1f%%",
  fontsize_number  = 8,
  fontsize_row     = 10,
  fontsize_col     = 9,
  angle_col        = 45,
  cluster_rows     = TRUE,
  cluster_cols     = TRUE,
  main             = "% Malignant Cells per State (Azimuth l2)\nClustered by cell line similarity",
  border_color     = "white"
)
```


# Pseudotime Analysis of Malignant Cells

## Pseudotime Distribution

```{r pseudotime-distribution, fig.width=14, fig.height=6}
ref_pt <- data.frame(
  pseudotime = reference_integrated$monocle3_pseudotime[
    is.finite(reference_integrated$monocle3_pseudotime)
  ],
  source = "Healthy reference"
)

mal_pt <- data.frame(
  pseudotime = mapped_MalignantCD4T$pseudotime_value[
    is.finite(mapped_MalignantCD4T$pseudotime_value)
  ],
  source = "Malignant CD4T"
)

pt_combined <- bind_rows(ref_pt, mal_pt)

p_density <- ggplot(pt_combined, aes(x = pseudotime, fill = source)) +
  geom_density(alpha = 0.65, adjust = 1.2) +
  scale_fill_manual(
    values = c("Healthy reference" = "#4575B4", "Malignant CD4T" = malignant_color),
    name = ""
  ) +
  theme_classic(base_size = 12) +
  theme(legend.position = "top",
        plot.title = element_text(hjust = 0.5, face = "bold")) +
  labs(title = "Pseudotime Distribution: Healthy Reference vs Malignant CD4T",
       subtitle = "Peak shift indicates differentiation arrest point",
       x = "Monocle3 Pseudotime", y = "Density")

mal_pt_line <- mapped_MalignantCD4T@meta.data %>%
  filter(is.finite(pseudotime_value)) %>%
  select(cell_line, pseudotime_value)

p_ridge_lines <- ggplot(mal_pt_line,
                         aes(x = pseudotime_value, y = cell_line, fill = cell_line)) +
  geom_density_ridges(alpha = 0.80, scale = 1.3, rel_min_height = 0.01,
                      quantile_lines = TRUE, quantiles = 2) +
  scale_fill_manual(values = cell_line_palette) +
  theme_classic(base_size = 11) +
  theme(legend.position = "none",
        plot.title = element_text(hjust = 0.5, face = "bold")) +
  labs(title = "Pseudotime Distribution per Cell Line",
       subtitle = "Vertical line = median",
       x = "Pseudotime", y = "")

p_density | p_ridge_lines
```

## Pseudotime Summary Statistics Table

```{r pseudotime-stats}
pt_stats <- mapped_MalignantCD4T@meta.data %>%
  filter(is.finite(pseudotime_value)) %>%
  group_by(cell_line) %>%
  summarise(
    n_cells   = n(),
    mean_pt   = round(mean(pseudotime_value), 3),
    median_pt = round(median(pseudotime_value), 3),
    sd_pt     = round(sd(pseudotime_value), 3),
    q25_pt    = round(quantile(pseudotime_value, 0.25), 3),
    q75_pt    = round(quantile(pseudotime_value, 0.75), 3),
    pct_early = round(100 * mean(pseudotime_value < pt_breaks[2], na.rm = TRUE), 1),
    pct_mid   = round(100 * mean(pseudotime_value >= pt_breaks[2] &
                                  pseudotime_value < pt_breaks[3], na.rm = TRUE), 1),
    pct_late  = round(100 * mean(pseudotime_value >= pt_breaks[3], na.rm = TRUE), 1)
  ) %>%
  arrange(median_pt)

cat("=== Pseudotime Statistics per Cell Line ===\n")
print(pt_stats, n = Inf)

ref_stats <- reference_integrated@meta.data %>%
  filter(is.finite(monocle3_pseudotime), !is.na(predicted.celltype.l2)) %>%
  group_by(predicted.celltype.l2) %>%
  summarise(
    n_cells   = n(),
    mean_pt   = round(mean(monocle3_pseudotime), 3),
    median_pt = round(median(monocle3_pseudotime), 3)
  ) %>%
  arrange(median_pt)

cat("\n=== Reference Pseudotime per Azimuth l2 State (for comparison) ===\n")
print(ref_stats)
```


# Integrated Summary Visualisation

## Four-Panel Summary Figure

```{r summary-figure, fig.width=12, fig.height=10}
p_ref <- DimPlot(
  reference_integrated, group.by = "predicted.celltype.l2",
  reduction = "umap", label = TRUE, repel = TRUE,
  label.size = 3, pt.size = 0.4
) +
  scale_color_manual(values = azimuth_l2_colors, na.value = "grey40") +
  ggtitle("A. Reference CD4+ T Cells\n(Azimuth l2 — Proliferating removed)") +
  theme(plot.title = element_text(hjust = 0.5, size = 11, face = "bold")) +
  NoLegend()

p_pt_ref <- FeaturePlot(
  reference_integrated, features = "monocle3_pseudotime",
  reduction = "umap", cols = c("lightblue","yellow","red"), pt.size = 0.4
) +
  ggtitle("B. Reference Pseudotime\n(Naive → Temra axis)") +
  theme(plot.title = element_text(hjust = 0.5, size = 11, face = "bold"))

p_proj <- ggplot() +
  geom_point(data = ref_umap, aes(x = UMAP_1, y = UMAP_2),
             color = "grey88", size = 0.3, alpha = 0.6) +
  geom_point(data = query_umap %>% filter(is.finite(pseudotime)),
             aes(x = UMAP_1, y = UMAP_2, color = pseudotime),
             size = 1.0, alpha = 0.85) +
  scale_color_viridis_c(option = "plasma", name = "Pseudotime") +
  theme_classic(base_size = 10) +
  theme(plot.title = element_text(hjust = 0.5, size = 11, face = "bold")) +
  ggtitle("C. Malignant CD4T Projected\n(pseudotime transferred)")

top_states   <- state_summary %>% head(8) %>% pull(state_azimuth_l2)
state_plot_df <- state_per_line %>% filter(state_azimuth_l2 %in% top_states)

p_quant <- ggplot(state_plot_df,
                   aes(x = reorder(cell_line, -pct), y = pct,
                       fill = state_azimuth_l2)) +
  geom_col(position = "stack", color = "white", linewidth = 0.3) +
  scale_fill_manual(values = azimuth_l2_colors, na.value = "grey40",
                    name = "Azimuth l2") +
  theme_classic(base_size = 10) +
  theme(axis.text.x = element_text(angle = 40, hjust = 1, size = 9),
        plot.title  = element_text(hjust = 0.5, size = 11, face = "bold"),
        legend.key.size = unit(0.4, "cm"),
        legend.text = element_text(size = 8)) +
  labs(title = "D. State Proportions per Cell Line\n(Azimuth l2 transferred)",
       x = "", y = "% Cells")

(p_ref | p_pt_ref) / (p_proj | p_quant) +
  plot_annotation(
    title = "Malignant CD4+ T Cell Differentiation State Mapping\n(Sézary Syndrome — Monocle3 Reference Trajectory)",
    theme = theme(plot.title = element_text(hjust = 0.5, size = 14, face = "bold"))
  )
```

## Pseudotime vs State Violin (publication-ready)

```{r pseudotime-vs-state, fig.width=6, fig.height=5}
mal_state_pt <- mapped_MalignantCD4T@meta.data %>%
  filter(is.finite(pseudotime_value), !is.na(state_azimuth_l2)) %>%
  mutate(state = state_azimuth_l2)

state_order_mal <- mal_state_pt %>%
  group_by(state) %>%
  summarise(med = median(pseudotime_value)) %>%
  arrange(med) %>%
  pull(state)

mal_state_pt$state <- factor(mal_state_pt$state, levels = state_order_mal)

ggplot(mal_state_pt, aes(x = state, y = pseudotime_value, fill = state)) +
  geom_violin(scale = "width", alpha = 0.8, trim = TRUE) +
  geom_boxplot(width = 0.1, fill = "white", outlier.size = 0.3) +
  scale_fill_manual(values = azimuth_l2_colors, na.value = "grey70") +
  theme_classic(base_size = 11) +
  theme(axis.text.x = element_text(angle = 40, hjust = 1, size = 9),
        legend.position = "none",
        plot.title = element_text(hjust = 0.5, face = "bold")) +
  labs(title = "Pseudotime of Malignant CD4T per Transferred State\n(Azimuth l2 labels)",
       x = "Transferred Differentiation State",
       y = "Transferred Pseudotime")
```


# Save Outputs

```{r save-outputs}
out_dir <- "results/MalignantCD4T_Monocle3_projection"
if (!dir.exists(out_dir)) dir.create(out_dir, recursive = TRUE)

saveRDS(mapped_MalignantCD4T,file.path(out_dir, "MalignantCD4T_mapped_monocle3_pseudotime_noProlif_ref.rds"))

write.csv(state_summary,
          file.path(out_dir, "state_distribution_overall.csv"),
          row.names = FALSE)
write.csv(state_per_line,
          file.path(out_dir, "state_distribution_per_cellline.csv"),
          row.names = FALSE)
write.csv(pt_stats,
          file.path(out_dir, "pseudotime_stats_per_cellline.csv"),
          row.names = FALSE)
write.csv(mapped_MalignantCD4T@meta.data,
          file.path(out_dir, "MalignantCD4T_full_metadata_with_projection.csv"),
          row.names = TRUE)

cat("✅ All outputs saved to:", out_dir, "\n")
cat("\nFiles saved:\n")
print(list.files(out_dir))
```


# Session Info

```{r session-info}

sessionInfo()

```

<!-- ═══════════════════════════════════════════════════════════════════════
     INTERPRETATION GUIDE
     ═══════════════════════════════════════════════════════════════════════

  HOW TO INTERPRET YOUR RESULTS:
  ───────────────────────────────

  1. WHERE DO MALIGNANT CELLS PROJECT?
     Look at Panel C of the summary figure. If malignant cells cluster
     predominantly over:
       - Tnaive/TCM territory → early/central memory arrest (common in SS)
       - TEM territory         → effector memory phenotype
       - Temra territory       → late effector/cytotoxic phenotype
       - Mixed/spread          → heterogeneous population

  2. PSEUDOTIME VALUES:
     Compare malignant median pseudotime to reference values from ref_stats.
     If malignant median ≈ reference TCM median → TCM-like arrest.
     A left-shifted distribution (low pseudotime) = more naive-like cells.

  3. CELL LINE DIFFERENCES:
     The per-cell-line heatmap reveals whether different Sézary cell lines
     have distinct differentiation states — biologically important as
     different lines may model different stages of the disease.

  4. SANITY CHECK: If >90% of cells still map to one state after per-line SCT,
     investigate:
       a. Are the common_features (~2000+ genes) diverse and biologically
          meaningful (CCR7, GZMK, FOXP3, TCF7, SELL — not just ribosomal/MT)?
       b. Is the reference UMAP model correctly frozen (return.model = TRUE)?
       c. Are there <500 cells in any one cell line? Small lines can produce
          unstable SCT models. Exclude lines with <200 cells.

  KNOWN BIOLOGY (Sézary Syndrome):
  ─────────────────────────────────
  Published literature (Cerapio et al.) shows Sézary cells predominantly
  resemble central memory (TCM) or skin-homing TEM cells. FOXP3+ Treg-like
  features are also reported in some SS cases. If your malignant cells map
  predominantly to TCM/TEM territory with low-to-medium pseudotime, this
  is consistent with published findings.

══════════════════════════════════════════════════════════════════════════ -->
