1 Load Libraries

library(dplyr)
library(ggplot2)
library(Seurat)
library(patchwork)
library(slingshot)
library(SingleCellExperiment)
library(tidyr)
library(ggrepel)
library(pheatmap)

options(timeout = 300)
set.seed(123)

# Colour palette — consistent throughout
flow_colours <- c(
  "Tnaive" = "#4472C4",
  "Tcm"    = "#ED7D31",
  "Tem"    = "#A9D18E",
  "Temra"  = "#C00000",
  "Treg"   = "#FFD700"
)

2 Load Object

clean_obj <- readRDS("../CD4_reference_clean_Azimuth_ready_for_Slingshot.rds")

cat("Total cells loaded:", ncol(clean_obj), "\n")
Total cells loaded: 11466 
cat("\nAzimuth l2 composition:\n")

Azimuth l2 composition:
print(table(clean_obj$predicted.celltype.l2))

    CD4 Naive       CD4 TCM       CD4 TEM CD4 Temra/CTL          Treg 
         2037          9067           145            10           207 

3 Assign Trajectory States

# Map Azimuth l2 → trajectory state labels
# CD4 CTL merged into Temra:
#   - Only 10 cells — too few for stable Slingshot endpoint
#   - Biologically justified: CD4 CTL is cytotoxic CD4, equivalent to Temra

clean_obj$trajectory_state <- dplyr::recode(
  clean_obj$predicted.celltype.l2,
  "CD4 Naive"     = "Tnaive",
  "CD4 TCM"       = "Tcm",
  "CD4 TEM"       = "Tem",
  "CD4 Temra/CTL"       = "Temra",   # merged into Temra
  "Treg"          = "Treg",
  .default        = NA_character_
)

clean_obj$trajectory_state <- factor(
  clean_obj$trajectory_state,
  levels = c("Tnaive","Tcm","Tem","Temra","Treg")
)

cat("Trajectory state distribution:\n")
Trajectory state distribution:
print(table(clean_obj$trajectory_state, useNA = "ifany"))

Tnaive    Tcm    Tem  Temra   Treg 
  2037   9067    145     10    207 
# Visualise on UMAP
DimPlot(clean_obj,
        group.by = "trajectory_state",
        cols     = flow_colours,
        na.value = "grey90",
        label    = TRUE,
        repel    = TRUE) +
  ggtitle("Trajectory states — Azimuth l2 mapped to flow-equivalent labels")

4 Run Slingshot Trajectory

# ── Prepare input ──
# Use UMAP coordinates directly — no SCE conversion needed
umap_coords <- Embeddings(clean_obj, "umap")

# Only use cells with a confirmed trajectory state
keep_cells  <- !is.na(clean_obj$trajectory_state)
umap_sub    <- umap_coords[keep_cells, ]
cluster_sub <- droplevels(clean_obj$trajectory_state[keep_cells])

cat("Cells entering Slingshot:\n")
Cells entering Slingshot:
print(table(cluster_sub))
cluster_sub
Tnaive    Tcm    Tem  Temra   Treg 
  2037   9067    145     10    207 
# ── Run Slingshot ──
# start.clus = Tnaive (confirmed earliest state)
# end.clus   = Temra (effector/cytotoxic arm) + Treg (regulatory arm)
# Two terminal states = two lineages
sce_sling <- slingshot(
  data          = umap_sub,
  clusterLabels = cluster_sub,
  start.clus    = "Tnaive",
  end.clus      = c("Temra", "Treg"),
  approx_points = 100,      # reduced from 300 — less smoothing
  stretch       = 0         # prevents curves extending beyond terminal clusters
)

cat("\nLineages after fix:\n")

Lineages after fix:
print(slingLineages(sce_sling))
$Lineage1
[1] "Tnaive" "Tcm"    "Tem"    "Temra" 

$Lineage2
[1] "Tnaive" "Tcm"    "Treg"  
cat("\nLineages inferred:\n")

Lineages inferred:
print(slingLineages(sce_sling))
$Lineage1
[1] "Tnaive" "Tcm"    "Tem"    "Temra" 

$Lineage2
[1] "Tnaive" "Tcm"    "Treg"  
# Expected:
# Lineage 1: Tnaive → Tcm → Tem → Temra  (effector arm)
# Lineage 2: Tnaive → Tcm → Treg          (regulatory arm)

5 Extract Pseudotime

# Extract pseudotime matrix (cells × lineages)
pt_matrix           <- slingPseudotime(sce_sling)
colnames(pt_matrix) <- c("PT_effector","PT_regulatory")

cat("Pseudotime summary — effector lineage:\n")
Pseudotime summary — effector lineage:
print(summary(pt_matrix[, "PT_effector"]))
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
  0.000   1.915   4.670   5.409   8.889  12.911    1784 
cat("\nPseudotime summary — regulatory lineage:\n")

Pseudotime summary — regulatory lineage:
print(summary(pt_matrix[, "PT_regulatory"]))
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
  0.000   2.048   6.640   6.974  10.844  17.587    1199 
# Extract pseudotime matrix (cells × lineages)
pt_matrix           <- slingPseudotime(sce_sling)
colnames(pt_matrix) <- c("PT_effector","PT_regulatory")

cat("Pseudotime summary — effector lineage:\n")
Pseudotime summary — effector lineage:
print(summary(pt_matrix[, "PT_effector"]))
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
  0.000   1.915   4.670   5.409   8.889  12.911    1784 
cat("\nPseudotime summary — regulatory lineage:\n")

Pseudotime summary — regulatory lineage:
print(summary(pt_matrix[, "PT_regulatory"]))
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
  0.000   2.048   6.640   6.974  10.844  17.587    1199 
# ── Add pseudotime to metadata via direct dataframe assignment ──
# IMPORTANT: Do NOT use clean_obj$PT_effector <- ... 
# That triggers Seurat's validObject() which fails on large objects
# Instead extract meta.data, modify as plain R dataframe, write back

meta <- clean_obj@meta.data
meta$PT_effector   <- NA_real_
meta$PT_regulatory <- NA_real_

common <- intersect(rownames(pt_matrix), rownames(meta))
meta[common, "PT_effector"]   <- pt_matrix[common, "PT_effector"]
meta[common, "PT_regulatory"] <- pt_matrix[common, "PT_regulatory"]

clean_obj@meta.data <- meta

cat("\nCells with effector pseudotime:  ",
    sum(!is.na(clean_obj@meta.data$PT_effector)), "\n")

Cells with effector pseudotime:   9682 
cat("Cells with regulatory pseudotime:",
    sum(!is.na(clean_obj@meta.data$PT_regulatory)), "\n")
Cells with regulatory pseudotime: 10267 

6 Trajectory Plot — Curves on UMAP

# ── Extract curve coordinates ──
curves <- slingCurves(sce_sling)

# Curve 1 — effector lineage (Tnaive → Tcm → Tem → Temra)
curve1_df           <- as.data.frame(curves[[1]]$s[curves[[1]]$ord, ])
colnames(curve1_df) <- c("UMAP_1","UMAP_2")

# Curve 2 — regulatory lineage (Tnaive → Tcm → Treg)
curve2_df           <- as.data.frame(curves[[2]]$s[curves[[2]]$ord, ])
colnames(curve2_df) <- c("UMAP_1","UMAP_2")

# ── UMAP base dataframe ──
umap_df           <- as.data.frame(umap_coords)
colnames(umap_df) <- c("UMAP_1","UMAP_2")
umap_df$trajectory_state <- clean_obj$trajectory_state
umap_df <- umap_df %>% filter(!is.na(trajectory_state))

# ── Milestone centroids ──
milestones <- umap_df %>%
  group_by(trajectory_state) %>%
  summarise(
    UMAP_1 = median(UMAP_1),
    UMAP_2 = median(UMAP_2),
    .groups = "drop"
  )

# ── Main plot ──
p_traj <- ggplot(umap_df,
                 aes(x = UMAP_1, y = UMAP_2,
                     colour = trajectory_state)) +
  geom_point(size = 0.4, alpha = 0.5) +
  # Effector lineage — solid black line
  geom_path(data        = curve1_df,
            aes(x = UMAP_1, y = UMAP_2),
            colour      = "black",
            linewidth   = 1.5,
            inherit.aes = FALSE) +
  # Regulatory lineage — dashed black line
  geom_path(data        = curve2_df,
            aes(x = UMAP_1, y = UMAP_2),
            colour      = "black",
            linewidth   = 1.5,
            linetype    = "dashed",
            inherit.aes = FALSE) +
  # Milestone labels
  ggrepel::geom_label_repel(
    data        = milestones,
    aes(x     = UMAP_1,
        y     = UMAP_2,
        label = trajectory_state,
        fill  = trajectory_state),
    colour      = "white",
    fontface    = "bold",
    size        = 5,
    box.padding = 0.8,
    inherit.aes = FALSE
  ) +
  scale_colour_manual(values = flow_colours, name = "State") +
  scale_fill_manual(values   = flow_colours, guide = "none") +
  theme_classic() +
  theme(
    plot.title    = element_text(size = 14, face = "bold"),
    plot.subtitle = element_text(size = 11),
    legend.position = "right"
  ) +
  labs(
    title    = "CD4 T-cell Differentiation Trajectory (Slingshot)",
    subtitle = "Solid: Tnaive → Tcm → Tem → Temra | Dashed: Tnaive → Tcm → Treg",
    x        = "UMAP 1",
    y        = "UMAP 2"
  )

print(p_traj)


ggsave("cd4_trajectory_slingshot.png",
       plot   = p_traj,
       width  = 14, height = 8, dpi = 300)


# Get Treg centroid
treg_centroid <- umap_df %>%
  filter(trajectory_state == "Treg") %>%
  summarise(UMAP_1 = median(UMAP_1), UMAP_2 = median(UMAP_2))

# Get last point of dashed curve
curve2_end <- tail(curve2_df, 1)

p_traj +
  # Add arrow from curve endpoint to Treg centroid
  annotate("segment",
           x      = curve2_end$UMAP_1,
           y      = curve2_end$UMAP_2,
           xend   = treg_centroid$UMAP_1,
           yend   = treg_centroid$UMAP_2,
           colour = "black",
           linewidth = 1.2,
           linetype  = "dashed",
           arrow  = arrow(length = unit(0.3,"cm"),
                          type   = "closed")) +
  labs(subtitle = "Solid: Tnaive→Tcm→Tem→Temra | Dashed: Tnaive→Tcm→Treg")

7 Pseudotime Feature Plots

# Fix: add na.rm handling by filtering NAs before quantile calculation
# Use a numeric cutoff instead of "q95"

# Calculate q95 manually excluding NAs
eff_q95 <- quantile(clean_obj@meta.data$PT_effector, 
                     probs = 0.95, na.rm = TRUE)
reg_q95 <- quantile(clean_obj@meta.data$PT_regulatory, 
                     probs = 0.95, na.rm = TRUE)

cat("Effector q95 cutoff:",   round(eff_q95, 2), "\n")
Effector q95 cutoff: 12.29 
cat("Regulatory q95 cutoff:", round(reg_q95, 2), "\n")
Regulatory q95 cutoff: 17.04 
p_eff <- FeaturePlot(clean_obj,
                     features   = "PT_effector",
                     reduction  = "umap",
                     cols       = c("grey90","#C00000"),
                     pt.size    = 0.5,
                     min.cutoff = 0,
                     max.cutoff = eff_q95) +   # numeric value not "q95"
  ggtitle("Effector pseudotime\nTnaive → Tcm → Tem → Temra") +
  theme(plot.title = element_text(size = 11, face = "bold"))

p_reg <- FeaturePlot(clean_obj,
                     features   = "PT_regulatory",
                     reduction  = "umap",
                     cols       = c("grey90","#4472C4"),
                     pt.size    = 0.5,
                     min.cutoff = 0,
                     max.cutoff = reg_q95) +   # numeric value not "q95"
  ggtitle("Regulatory pseudotime\nTnaive → Tcm → Treg") +
  theme(plot.title = element_text(size = 11, face = "bold"))

p_eff | p_reg


ggsave("pseudotime_featureplots.png",
       plot  = p_eff | p_reg,
       width = 14, height = 6, dpi = 300)

8 Pseudotime Violin per State per Lineage

# Build long-format dataframe for plotting
pt_df <- data.frame(
  trajectory_state = clean_obj$trajectory_state[keep_cells],
  PT_effector      = pt_matrix[, "PT_effector"],
  PT_regulatory    = pt_matrix[, "PT_regulatory"]
) %>%
  tidyr::pivot_longer(
    cols      = c(PT_effector, PT_regulatory),
    names_to  = "lineage",
    values_to = "pseudotime"
  ) %>%
  dplyr::filter(
    !is.na(trajectory_state),
    !is.na(pseudotime)
  )

ggplot(pt_df,
       aes(x    = factor(trajectory_state,
                          levels = c("Tnaive","Tcm","Tem","Temra","Treg")),
           y    = pseudotime,
           fill = trajectory_state)) +
  geom_violin(scale = "width", trim = TRUE) +
  geom_boxplot(width = 0.08, fill = "white", outlier.size = 0.3) +
  facet_wrap(~ lineage,
             ncol     = 2,
             labeller = labeller(lineage = c(
               "PT_effector"   = "Effector lineage (→ Temra)",
               "PT_regulatory" = "Regulatory lineage (→ Treg)"
             ))) +
  scale_fill_manual(values = flow_colours) +
  theme_bw() +
  theme(
    legend.position = "none",
    axis.text.x     = element_text(angle = 45, hjust = 1, size = 10),
    strip.text      = element_text(size = 11, face = "bold")
  ) +
  labs(
    title    = "Pseudotime per state per lineage",
    subtitle = "Pseudotime should increase monotonically: Tnaive → Temra / Treg",
    x        = NULL,
    y        = "Slingshot pseudotime"
  )


# Confirm monotonic increase — key validation
cat("Median pseudotime per state — effector lineage:\n")
Median pseudotime per state — effector lineage:
pt_df %>%
  filter(lineage == "PT_effector") %>%
  group_by(trajectory_state) %>%
  summarise(median_pt = round(median(pseudotime, na.rm = TRUE), 2),
            n         = n(),
            .groups   = "drop") %>%
  arrange(median_pt) %>%
  print()

9 Gene Expression Along Pseudotime Heatmap

DefaultAssay(clean_obj) <- "RNA"

key_genes <- c(
  "LEF1","TCF7","KLF2","SELL",        # Tnaive
  "IL7R","S100A4","ITGB1","BCL2",     # Tcm
  "GZMK","CXCR3","CCR5",             # Tem
  "CX3CR1","GNLY","FGFBP2","PRF1",   # Temra
  "FOXP3","IL2RA","CTLA4","IKZF2"    # Treg
)
key_genes <- intersect(key_genes, rownames(clean_obj))
cat("Genes found:", length(key_genes), "\n")
Genes found: 19 
# Order cells by effector pseudotime
eff_pt       <- clean_obj$PT_effector
eff_cells    <- !is.na(eff_pt)
eff_order    <- order(eff_pt[eff_cells])
eff_barcodes <- colnames(clean_obj)[eff_cells][eff_order]

expr_mat <- as.matrix(
  GetAssayData(clean_obj, assay = "RNA", layer = "data")[key_genes, eff_barcodes]
)

# Smooth into 50 pseudotime bins
n_bins   <- 50
bin_size <- floor(ncol(expr_mat) / n_bins)
bin_mat  <- sapply(seq_len(n_bins), function(i) {
  idx <- ((i-1)*bin_size + 1):min(i*bin_size, ncol(expr_mat))
  rowMeans(expr_mat[, idx, drop = FALSE])
})

# Z-score per gene and clip to [-2, 2]
bin_z            <- t(scale(t(bin_mat)))
bin_z[is.nan(bin_z)] <- 0
bin_z            <- pmin(pmax(bin_z, -2), 2)

# Draw in notebook — remove filename parameter
pheatmap::pheatmap(
  bin_z,
  cluster_cols  = FALSE,
  cluster_rows  = FALSE,
  color         = colorRampPalette(c("navy","white","firebrick3"))(100),
  breaks        = seq(-2, 2, length.out = 101),
  main          = "Gene expression along effector pseudotime\nTnaive → Tcm → Tem → Temra",
  fontsize_row  = 9,
  border_color  = NA,
  show_colnames = FALSE,
  gaps_row      = c(4, 8, 11)
)

# Save separately AFTER displaying
dev.copy(png, "pseudotime_heatmap.png", width = 12, height = 7, units = "in", res = 300)
png 
  3 
dev.off()
png 
  2 

10 Gene Expression Along Pseudotime Heatmap

# ── Regulatory lineage heatmap (Tnaive → Tcm → Treg) ──

# Treg lineage specific genes — include Treg markers prominently
reg_genes <- c(
  "LEF1","TCF7","KLF2","SELL",          # Tnaive — should decline
  "IL7R","S100A4","ITGB1","BCL2",       # Tcm — intermediate
  "FOXP3","IL2RA","CTLA4","IKZF2",      # Treg core — should rise
  "TIGIT","TNFRSF18","TNFRSF4","CCR8"   # Treg effector — rise at end
)
reg_genes <- intersect(reg_genes, rownames(clean_obj))
cat("Regulatory lineage genes found:", length(reg_genes), "\n")
Regulatory lineage genes found: 16 
# Order cells by REGULATORY pseudotime (not effector)
reg_pt       <- clean_obj$PT_regulatory
reg_cells    <- !is.na(reg_pt)
reg_order    <- order(reg_pt[reg_cells])
reg_barcodes <- colnames(clean_obj)[reg_cells][reg_order]

cat("Cells on regulatory lineage:", length(reg_barcodes), "\n")
Cells on regulatory lineage: 10267 
# Extract expression matrix ordered by regulatory pseudotime
expr_mat_reg <- as.matrix(
  GetAssayData(clean_obj, assay = "RNA", layer = "data")[reg_genes, reg_barcodes]
)

# Smooth into 50 bins
n_bins        <- 50
bin_size_reg  <- floor(ncol(expr_mat_reg) / n_bins)
bin_mat_reg   <- sapply(seq_len(n_bins), function(i) {
  idx <- ((i-1)*bin_size_reg + 1):min(i*bin_size_reg, ncol(expr_mat_reg))
  rowMeans(expr_mat_reg[, idx, drop = FALSE])
})

# Z-score and clip
bin_z_reg             <- t(scale(t(bin_mat_reg)))
bin_z_reg[is.nan(bin_z_reg)] <- 0
bin_z_reg             <- pmin(pmax(bin_z_reg, -2), 2)

# Add pseudotime axis annotation (Early → Late)
col_annotation <- data.frame(
  Pseudotime = seq(0, 1, length.out = n_bins),
  row.names  = paste0("bin", seq_len(n_bins))
)
colnames(bin_z_reg) <- paste0("bin", seq_len(n_bins))

annotation_colors <- list(
  Pseudotime = colorRampPalette(c("grey90","#FFD700"))(100)
)

# Plot regulatory lineage heatmap
pheatmap::pheatmap(
  bin_z_reg,
  cluster_cols      = FALSE,
  cluster_rows      = FALSE,
  color             = colorRampPalette(c("navy","white","firebrick3"))(100),
  breaks            = seq(-2, 2, length.out = 101),
  main              = "Gene expression along regulatory pseudotime\nTnaive → Tcm → Treg",
  fontsize_row      = 9,
  border_color      = NA,
  show_colnames     = FALSE,
  annotation_col    = col_annotation,
  annotation_colors = annotation_colors,
  annotation_names_col = FALSE,
  gaps_row          = c(4, 8, 12)  # Tnaive | Tcm | Treg core | Treg effector
)

# Save
dev.copy(png, "pseudotime_heatmap_regulatory.png",
         width = 12, height = 7, units = "in", res = 300)
png 
  3 
dev.off()
png 
  2 

11 Save Final Object

DefaultAssay(clean_obj) <- "integrated"

saveRDS(
  clean_obj,
  "CD4_reference_clean_Azimuth_Slingshot.rds",
  compress = FALSE
)

cat("✅ Saved:", ncol(clean_obj), "cells\n")
✅ Saved: 11466 cells
cat("\nFinal trajectory state distribution:\n")

Final trajectory state distribution:
print(table(clean_obj$trajectory_state, useNA = "ifany"))

Tnaive    Tcm    Tem  Temra   Treg 
  2037   9067    145     10    207 
cat("\nPseudotime columns added:\n")

Pseudotime columns added:
cat("  - PT_effector   (Tnaive → Tcm → Tem → Temra)\n")
  - PT_effector   (Tnaive → Tcm → Tem → Temra)
cat("  - PT_regulatory (Tnaive → Tcm → Treg)\n")
  - PT_regulatory (Tnaive → Tcm → Treg)
cat("\nObject ready for malignant cell projection via MapQuery\n")

Object ready for malignant cell projection via MapQuery
cat("return.model = TRUE confirmed — MapQuery will work\n")
return.model = TRUE confirmed — MapQuery will work
---
title: "CD4 T-cell Reference — Slingshot Trajectory"
author: Nasir Mahmood Abbasi
date: "`r Sys.Date()`"
output:
  html_notebook:
    number_sections: true
    toc: true
    toc_float:
      collapsed: true
    theme: journal
---

# Load Libraries
```{r setup, include=TRUE}
library(dplyr)
library(ggplot2)
library(Seurat)
library(patchwork)
library(slingshot)
library(SingleCellExperiment)
library(tidyr)
library(ggrepel)
library(pheatmap)

options(timeout = 300)
set.seed(123)

# Colour palette — consistent throughout
flow_colours <- c(
  "Tnaive" = "#4472C4",
  "Tcm"    = "#ED7D31",
  "Tem"    = "#A9D18E",
  "Temra"  = "#C00000",
  "Treg"   = "#FFD700"
)
```

# Load Object
```{r load-object}
clean_obj <- readRDS("../CD4_reference_clean_Azimuth_ready_for_Slingshot.rds")

cat("Total cells loaded:", ncol(clean_obj), "\n")
cat("\nAzimuth l2 composition:\n")
print(table(clean_obj$predicted.celltype.l2))
```

# Assign Trajectory States
```{r trajectory-states}
# Map Azimuth l2 → trajectory state labels
# CD4 CTL merged into Temra:
#   - Only 10 cells — too few for stable Slingshot endpoint
#   - Biologically justified: CD4 CTL is cytotoxic CD4, equivalent to Temra

clean_obj$trajectory_state <- dplyr::recode(
  clean_obj$predicted.celltype.l2,
  "CD4 Naive"     = "Tnaive",
  "CD4 TCM"       = "Tcm",
  "CD4 TEM"       = "Tem",
  "CD4 Temra/CTL"       = "Temra",   # merged into Temra
  "Treg"          = "Treg",
  .default        = NA_character_
)

clean_obj$trajectory_state <- factor(
  clean_obj$trajectory_state,
  levels = c("Tnaive","Tcm","Tem","Temra","Treg")
)

cat("Trajectory state distribution:\n")
print(table(clean_obj$trajectory_state, useNA = "ifany"))

# Visualise on UMAP
DimPlot(clean_obj,
        group.by = "trajectory_state",
        cols     = flow_colours,
        na.value = "grey90",
        label    = TRUE,
        repel    = TRUE) +
  ggtitle("Trajectory states — Azimuth l2 mapped to flow-equivalent labels")
```

# Run Slingshot Trajectory
```{r slingshot-run}
# ── Prepare input ──
# Use UMAP coordinates directly — no SCE conversion needed
umap_coords <- Embeddings(clean_obj, "umap")

# Only use cells with a confirmed trajectory state
keep_cells  <- !is.na(clean_obj$trajectory_state)
umap_sub    <- umap_coords[keep_cells, ]
cluster_sub <- droplevels(clean_obj$trajectory_state[keep_cells])

cat("Cells entering Slingshot:\n")
print(table(cluster_sub))

# ── Run Slingshot ──
# start.clus = Tnaive (confirmed earliest state)
# end.clus   = Temra (effector/cytotoxic arm) + Treg (regulatory arm)
# Two terminal states = two lineages
sce_sling <- slingshot(
  data          = umap_sub,
  clusterLabels = cluster_sub,
  start.clus    = "Tnaive",
  end.clus      = c("Temra", "Treg"),
  approx_points = 100,      # reduced from 300 — less smoothing
  stretch       = 0         # prevents curves extending beyond terminal clusters
)

cat("\nLineages after fix:\n")
print(slingLineages(sce_sling))
cat("\nLineages inferred:\n")
print(slingLineages(sce_sling))
# Expected:
# Lineage 1: Tnaive → Tcm → Tem → Temra  (effector arm)
# Lineage 2: Tnaive → Tcm → Treg          (regulatory arm)
```

# Extract Pseudotime
```{r pseudotime-extract}
# Extract pseudotime matrix (cells × lineages)
pt_matrix           <- slingPseudotime(sce_sling)
colnames(pt_matrix) <- c("PT_effector","PT_regulatory")

cat("Pseudotime summary — effector lineage:\n")
print(summary(pt_matrix[, "PT_effector"]))

cat("\nPseudotime summary — regulatory lineage:\n")
print(summary(pt_matrix[, "PT_regulatory"]))

# Extract pseudotime matrix (cells × lineages)
pt_matrix           <- slingPseudotime(sce_sling)
colnames(pt_matrix) <- c("PT_effector","PT_regulatory")

cat("Pseudotime summary — effector lineage:\n")
print(summary(pt_matrix[, "PT_effector"]))
cat("\nPseudotime summary — regulatory lineage:\n")
print(summary(pt_matrix[, "PT_regulatory"]))

# ── Add pseudotime to metadata via direct dataframe assignment ──
# IMPORTANT: Do NOT use clean_obj$PT_effector <- ... 
# That triggers Seurat's validObject() which fails on large objects
# Instead extract meta.data, modify as plain R dataframe, write back

meta <- clean_obj@meta.data
meta$PT_effector   <- NA_real_
meta$PT_regulatory <- NA_real_

common <- intersect(rownames(pt_matrix), rownames(meta))
meta[common, "PT_effector"]   <- pt_matrix[common, "PT_effector"]
meta[common, "PT_regulatory"] <- pt_matrix[common, "PT_regulatory"]

clean_obj@meta.data <- meta

cat("\nCells with effector pseudotime:  ",
    sum(!is.na(clean_obj@meta.data$PT_effector)), "\n")
cat("Cells with regulatory pseudotime:",
    sum(!is.na(clean_obj@meta.data$PT_regulatory)), "\n")
```

# Trajectory Plot — Curves on UMAP
```{r trajectory-plot, fig.width=14, fig.height=8}
# ── Extract curve coordinates ──
curves <- slingCurves(sce_sling)

# Curve 1 — effector lineage (Tnaive → Tcm → Tem → Temra)
curve1_df           <- as.data.frame(curves[[1]]$s[curves[[1]]$ord, ])
colnames(curve1_df) <- c("UMAP_1","UMAP_2")

# Curve 2 — regulatory lineage (Tnaive → Tcm → Treg)
curve2_df           <- as.data.frame(curves[[2]]$s[curves[[2]]$ord, ])
colnames(curve2_df) <- c("UMAP_1","UMAP_2")

# ── UMAP base dataframe ──
umap_df           <- as.data.frame(umap_coords)
colnames(umap_df) <- c("UMAP_1","UMAP_2")
umap_df$trajectory_state <- clean_obj$trajectory_state
umap_df <- umap_df %>% filter(!is.na(trajectory_state))

# ── Milestone centroids ──
milestones <- umap_df %>%
  group_by(trajectory_state) %>%
  summarise(
    UMAP_1 = median(UMAP_1),
    UMAP_2 = median(UMAP_2),
    .groups = "drop"
  )

# ── Main plot ──
p_traj <- ggplot(umap_df,
                 aes(x = UMAP_1, y = UMAP_2,
                     colour = trajectory_state)) +
  geom_point(size = 0.4, alpha = 0.5) +
  # Effector lineage — solid black line
  geom_path(data        = curve1_df,
            aes(x = UMAP_1, y = UMAP_2),
            colour      = "black",
            linewidth   = 1.5,
            inherit.aes = FALSE) +
  # Regulatory lineage — dashed black line
  geom_path(data        = curve2_df,
            aes(x = UMAP_1, y = UMAP_2),
            colour      = "black",
            linewidth   = 1.5,
            linetype    = "dashed",
            inherit.aes = FALSE) +
  # Milestone labels
  ggrepel::geom_label_repel(
    data        = milestones,
    aes(x     = UMAP_1,
        y     = UMAP_2,
        label = trajectory_state,
        fill  = trajectory_state),
    colour      = "white",
    fontface    = "bold",
    size        = 5,
    box.padding = 0.8,
    inherit.aes = FALSE
  ) +
  scale_colour_manual(values = flow_colours, name = "State") +
  scale_fill_manual(values   = flow_colours, guide = "none") +
  theme_classic() +
  theme(
    plot.title    = element_text(size = 14, face = "bold"),
    plot.subtitle = element_text(size = 11),
    legend.position = "right"
  ) +
  labs(
    title    = "CD4 T-cell Differentiation Trajectory (Slingshot)",
    subtitle = "Solid: Tnaive → Tcm → Tem → Temra | Dashed: Tnaive → Tcm → Treg",
    x        = "UMAP 1",
    y        = "UMAP 2"
  )

print(p_traj)

ggsave("cd4_trajectory_slingshot.png",
       plot   = p_traj,
       width  = 14, height = 8, dpi = 300)


# Get Treg centroid
treg_centroid <- umap_df %>%
  filter(trajectory_state == "Treg") %>%
  summarise(UMAP_1 = median(UMAP_1), UMAP_2 = median(UMAP_2))

# Get last point of dashed curve
curve2_end <- tail(curve2_df, 1)

p_traj +
  # Add arrow from curve endpoint to Treg centroid
  annotate("segment",
           x      = curve2_end$UMAP_1,
           y      = curve2_end$UMAP_2,
           xend   = treg_centroid$UMAP_1,
           yend   = treg_centroid$UMAP_2,
           colour = "black",
           linewidth = 1.2,
           linetype  = "dashed",
           arrow  = arrow(length = unit(0.3,"cm"),
                          type   = "closed")) +
  labs(subtitle = "Solid: Tnaive→Tcm→Tem→Temra | Dashed: Tnaive→Tcm→Treg")
```

# Pseudotime Feature Plots
```{r pseudotime-featureplots, fig.width=14, fig.height=6}
# Fix: add na.rm handling by filtering NAs before quantile calculation
# Use a numeric cutoff instead of "q95"

# Calculate q95 manually excluding NAs
eff_q95 <- quantile(clean_obj@meta.data$PT_effector, 
                     probs = 0.95, na.rm = TRUE)
reg_q95 <- quantile(clean_obj@meta.data$PT_regulatory, 
                     probs = 0.95, na.rm = TRUE)

cat("Effector q95 cutoff:",   round(eff_q95, 2), "\n")
cat("Regulatory q95 cutoff:", round(reg_q95, 2), "\n")

p_eff <- FeaturePlot(clean_obj,
                     features   = "PT_effector",
                     reduction  = "umap",
                     cols       = c("grey90","#C00000"),
                     pt.size    = 0.5,
                     min.cutoff = 0,
                     max.cutoff = eff_q95) +   # numeric value not "q95"
  ggtitle("Effector pseudotime\nTnaive → Tcm → Tem → Temra") +
  theme(plot.title = element_text(size = 11, face = "bold"))

p_reg <- FeaturePlot(clean_obj,
                     features   = "PT_regulatory",
                     reduction  = "umap",
                     cols       = c("grey90","#4472C4"),
                     pt.size    = 0.5,
                     min.cutoff = 0,
                     max.cutoff = reg_q95) +   # numeric value not "q95"
  ggtitle("Regulatory pseudotime\nTnaive → Tcm → Treg") +
  theme(plot.title = element_text(size = 11, face = "bold"))

p_eff | p_reg

ggsave("pseudotime_featureplots.png",
       plot  = p_eff | p_reg,
       width = 14, height = 6, dpi = 300)
```

# Pseudotime Violin per State per Lineage
```{r pseudotime-violin, fig.width=14, fig.height=6}
# Build long-format dataframe for plotting
pt_df <- data.frame(
  trajectory_state = clean_obj$trajectory_state[keep_cells],
  PT_effector      = pt_matrix[, "PT_effector"],
  PT_regulatory    = pt_matrix[, "PT_regulatory"]
) %>%
  tidyr::pivot_longer(
    cols      = c(PT_effector, PT_regulatory),
    names_to  = "lineage",
    values_to = "pseudotime"
  ) %>%
  dplyr::filter(
    !is.na(trajectory_state),
    !is.na(pseudotime)
  )

ggplot(pt_df,
       aes(x    = factor(trajectory_state,
                          levels = c("Tnaive","Tcm","Tem","Temra","Treg")),
           y    = pseudotime,
           fill = trajectory_state)) +
  geom_violin(scale = "width", trim = TRUE) +
  geom_boxplot(width = 0.08, fill = "white", outlier.size = 0.3) +
  facet_wrap(~ lineage,
             ncol     = 2,
             labeller = labeller(lineage = c(
               "PT_effector"   = "Effector lineage (→ Temra)",
               "PT_regulatory" = "Regulatory lineage (→ Treg)"
             ))) +
  scale_fill_manual(values = flow_colours) +
  theme_bw() +
  theme(
    legend.position = "none",
    axis.text.x     = element_text(angle = 45, hjust = 1, size = 10),
    strip.text      = element_text(size = 11, face = "bold")
  ) +
  labs(
    title    = "Pseudotime per state per lineage",
    subtitle = "Pseudotime should increase monotonically: Tnaive → Temra / Treg",
    x        = NULL,
    y        = "Slingshot pseudotime"
  )

# Confirm monotonic increase — key validation
cat("Median pseudotime per state — effector lineage:\n")
pt_df %>%
  filter(lineage == "PT_effector") %>%
  group_by(trajectory_state) %>%
  summarise(median_pt = round(median(pseudotime, na.rm = TRUE), 2),
            n         = n(),
            .groups   = "drop") %>%
  arrange(median_pt) %>%
  print()
```

# Gene Expression Along Pseudotime Heatmap
```{r gene-heatmap, fig.width=12, fig.height=7}
DefaultAssay(clean_obj) <- "RNA"

key_genes <- c(
  "LEF1","TCF7","KLF2","SELL",        # Tnaive
  "IL7R","S100A4","ITGB1","BCL2",     # Tcm
  "GZMK","CXCR3","CCR5",             # Tem
  "CX3CR1","GNLY","FGFBP2","PRF1",   # Temra
  "FOXP3","IL2RA","CTLA4","IKZF2"    # Treg
)
key_genes <- intersect(key_genes, rownames(clean_obj))
cat("Genes found:", length(key_genes), "\n")

# Order cells by effector pseudotime
eff_pt       <- clean_obj$PT_effector
eff_cells    <- !is.na(eff_pt)
eff_order    <- order(eff_pt[eff_cells])
eff_barcodes <- colnames(clean_obj)[eff_cells][eff_order]

expr_mat <- as.matrix(
  GetAssayData(clean_obj, assay = "RNA", layer = "data")[key_genes, eff_barcodes]
)

# Smooth into 50 pseudotime bins
n_bins   <- 50
bin_size <- floor(ncol(expr_mat) / n_bins)
bin_mat  <- sapply(seq_len(n_bins), function(i) {
  idx <- ((i-1)*bin_size + 1):min(i*bin_size, ncol(expr_mat))
  rowMeans(expr_mat[, idx, drop = FALSE])
})

# Z-score per gene and clip to [-2, 2]
bin_z            <- t(scale(t(bin_mat)))
bin_z[is.nan(bin_z)] <- 0
bin_z            <- pmin(pmax(bin_z, -2), 2)

# Draw in notebook — remove filename parameter
pheatmap::pheatmap(
  bin_z,
  cluster_cols  = FALSE,
  cluster_rows  = FALSE,
  color         = colorRampPalette(c("navy","white","firebrick3"))(100),
  breaks        = seq(-2, 2, length.out = 101),
  main          = "Gene expression along effector pseudotime\nTnaive → Tcm → Tem → Temra",
  fontsize_row  = 9,
  border_color  = NA,
  show_colnames = FALSE,
  gaps_row      = c(4, 8, 11)
)

# Save separately AFTER displaying
dev.copy(png, "pseudotime_heatmap.png", width = 12, height = 7, units = "in", res = 300)
dev.off()

```



# Gene Expression Along Pseudotime Heatmap
```{r gene-heatmap-Treg, fig.width=12, fig.height=7}
# ── Regulatory lineage heatmap (Tnaive → Tcm → Treg) ──

# Treg lineage specific genes — include Treg markers prominently
reg_genes <- c(
  "LEF1","TCF7","KLF2","SELL",          # Tnaive — should decline
  "IL7R","S100A4","ITGB1","BCL2",       # Tcm — intermediate
  "FOXP3","IL2RA","CTLA4","IKZF2",      # Treg core — should rise
  "TIGIT","TNFRSF18","TNFRSF4","CCR8"   # Treg effector — rise at end
)
reg_genes <- intersect(reg_genes, rownames(clean_obj))
cat("Regulatory lineage genes found:", length(reg_genes), "\n")

# Order cells by REGULATORY pseudotime (not effector)
reg_pt       <- clean_obj$PT_regulatory
reg_cells    <- !is.na(reg_pt)
reg_order    <- order(reg_pt[reg_cells])
reg_barcodes <- colnames(clean_obj)[reg_cells][reg_order]

cat("Cells on regulatory lineage:", length(reg_barcodes), "\n")

# Extract expression matrix ordered by regulatory pseudotime
expr_mat_reg <- as.matrix(
  GetAssayData(clean_obj, assay = "RNA", layer = "data")[reg_genes, reg_barcodes]
)

# Smooth into 50 bins
n_bins        <- 50
bin_size_reg  <- floor(ncol(expr_mat_reg) / n_bins)
bin_mat_reg   <- sapply(seq_len(n_bins), function(i) {
  idx <- ((i-1)*bin_size_reg + 1):min(i*bin_size_reg, ncol(expr_mat_reg))
  rowMeans(expr_mat_reg[, idx, drop = FALSE])
})

# Z-score and clip
bin_z_reg             <- t(scale(t(bin_mat_reg)))
bin_z_reg[is.nan(bin_z_reg)] <- 0
bin_z_reg             <- pmin(pmax(bin_z_reg, -2), 2)

# Add pseudotime axis annotation (Early → Late)
col_annotation <- data.frame(
  Pseudotime = seq(0, 1, length.out = n_bins),
  row.names  = paste0("bin", seq_len(n_bins))
)
colnames(bin_z_reg) <- paste0("bin", seq_len(n_bins))

annotation_colors <- list(
  Pseudotime = colorRampPalette(c("grey90","#FFD700"))(100)
)

# Plot regulatory lineage heatmap
pheatmap::pheatmap(
  bin_z_reg,
  cluster_cols      = FALSE,
  cluster_rows      = FALSE,
  color             = colorRampPalette(c("navy","white","firebrick3"))(100),
  breaks            = seq(-2, 2, length.out = 101),
  main              = "Gene expression along regulatory pseudotime\nTnaive → Tcm → Treg",
  fontsize_row      = 9,
  border_color      = NA,
  show_colnames     = FALSE,
  annotation_col    = col_annotation,
  annotation_colors = annotation_colors,
  annotation_names_col = FALSE,
  gaps_row          = c(4, 8, 12)  # Tnaive | Tcm | Treg core | Treg effector
)

# Save
dev.copy(png, "pseudotime_heatmap_regulatory.png",
         width = 12, height = 7, units = "in", res = 300)
dev.off()
```


















# Save Final Object
```{r save}
DefaultAssay(clean_obj) <- "integrated"

saveRDS(
  clean_obj,
  "CD4_reference_clean_Azimuth_Slingshot.rds",
  compress = FALSE
)

cat("✅ Saved:", ncol(clean_obj), "cells\n")
cat("\nFinal trajectory state distribution:\n")
print(table(clean_obj$trajectory_state, useNA = "ifany"))
cat("\nPseudotime columns added:\n")
cat("  - PT_effector   (Tnaive → Tcm → Tem → Temra)\n")
cat("  - PT_regulatory (Tnaive → Tcm → Treg)\n")
cat("\nObject ready for malignant cell projection via MapQuery\n")
cat("return.model = TRUE confirmed — MapQuery will work\n")
```
