Load Libraries
library(dplyr)
library(ggplot2)
library(Seurat)
library(patchwork)
library(slingshot)
library(SingleCellExperiment)
library(tidyr)
library(ggrepel)
library(pheatmap)
options(timeout = 300)
set.seed(123)
# Colour palette — consistent throughout
flow_colours <- c(
"Tnaive" = "#4472C4",
"Tcm" = "#ED7D31",
"Tem" = "#A9D18E",
"Temra" = "#C00000",
"Treg" = "#FFD700"
)
Load Object
clean_obj <- readRDS("../CD4_reference_clean_Azimuth_ready_for_Slingshot.rds")
cat("Total cells loaded:", ncol(clean_obj), "\n")
Total cells loaded: 11466
cat("\nAzimuth l2 composition:\n")
Azimuth l2 composition:
print(table(clean_obj$predicted.celltype.l2))
CD4 Naive CD4 TCM CD4 TEM CD4 Temra/CTL Treg
2037 9067 145 10 207
Assign Trajectory
States
# Map Azimuth l2 → trajectory state labels
# CD4 CTL merged into Temra:
# - Only 10 cells — too few for stable Slingshot endpoint
# - Biologically justified: CD4 CTL is cytotoxic CD4, equivalent to Temra
clean_obj$trajectory_state <- dplyr::recode(
clean_obj$predicted.celltype.l2,
"CD4 Naive" = "Tnaive",
"CD4 TCM" = "Tcm",
"CD4 TEM" = "Tem",
"CD4 Temra/CTL" = "Temra", # merged into Temra
"Treg" = "Treg",
.default = NA_character_
)
clean_obj$trajectory_state <- factor(
clean_obj$trajectory_state,
levels = c("Tnaive","Tcm","Tem","Temra","Treg")
)
cat("Trajectory state distribution:\n")
Trajectory state distribution:
print(table(clean_obj$trajectory_state, useNA = "ifany"))
Tnaive Tcm Tem Temra Treg
2037 9067 145 10 207
# Visualise on UMAP
DimPlot(clean_obj,
group.by = "trajectory_state",
cols = flow_colours,
na.value = "grey90",
label = TRUE,
repel = TRUE) +
ggtitle("Trajectory states — Azimuth l2 mapped to flow-equivalent labels")

Run Slingshot
Trajectory
# ── Prepare input ──
# Use UMAP coordinates directly — no SCE conversion needed
umap_coords <- Embeddings(clean_obj, "umap")
# Only use cells with a confirmed trajectory state
keep_cells <- !is.na(clean_obj$trajectory_state)
umap_sub <- umap_coords[keep_cells, ]
cluster_sub <- droplevels(clean_obj$trajectory_state[keep_cells])
cat("Cells entering Slingshot:\n")
Cells entering Slingshot:
print(table(cluster_sub))
cluster_sub
Tnaive Tcm Tem Temra Treg
2037 9067 145 10 207
# ── Run Slingshot ──
# start.clus = Tnaive (confirmed earliest state)
# end.clus = Temra (effector/cytotoxic arm) + Treg (regulatory arm)
# Two terminal states = two lineages
sce_sling <- slingshot(
data = umap_sub,
clusterLabels = cluster_sub,
start.clus = "Tnaive",
end.clus = c("Temra", "Treg"),
approx_points = 100, # reduced from 300 — less smoothing
stretch = 0 # prevents curves extending beyond terminal clusters
)
cat("\nLineages after fix:\n")
Lineages after fix:
print(slingLineages(sce_sling))
$Lineage1
[1] "Tnaive" "Tcm" "Tem" "Temra"
$Lineage2
[1] "Tnaive" "Tcm" "Treg"
cat("\nLineages inferred:\n")
Lineages inferred:
print(slingLineages(sce_sling))
$Lineage1
[1] "Tnaive" "Tcm" "Tem" "Temra"
$Lineage2
[1] "Tnaive" "Tcm" "Treg"
# Expected:
# Lineage 1: Tnaive → Tcm → Tem → Temra (effector arm)
# Lineage 2: Tnaive → Tcm → Treg (regulatory arm)
Trajectory Plot —
Curves on UMAP
# ── Extract curve coordinates ──
curves <- slingCurves(sce_sling)
# Curve 1 — effector lineage (Tnaive → Tcm → Tem → Temra)
curve1_df <- as.data.frame(curves[[1]]$s[curves[[1]]$ord, ])
colnames(curve1_df) <- c("UMAP_1","UMAP_2")
# Curve 2 — regulatory lineage (Tnaive → Tcm → Treg)
curve2_df <- as.data.frame(curves[[2]]$s[curves[[2]]$ord, ])
colnames(curve2_df) <- c("UMAP_1","UMAP_2")
# ── UMAP base dataframe ──
umap_df <- as.data.frame(umap_coords)
colnames(umap_df) <- c("UMAP_1","UMAP_2")
umap_df$trajectory_state <- clean_obj$trajectory_state
umap_df <- umap_df %>% filter(!is.na(trajectory_state))
# ── Milestone centroids ──
milestones <- umap_df %>%
group_by(trajectory_state) %>%
summarise(
UMAP_1 = median(UMAP_1),
UMAP_2 = median(UMAP_2),
.groups = "drop"
)
# ── Main plot ──
p_traj <- ggplot(umap_df,
aes(x = UMAP_1, y = UMAP_2,
colour = trajectory_state)) +
geom_point(size = 0.4, alpha = 0.5) +
# Effector lineage — solid black line
geom_path(data = curve1_df,
aes(x = UMAP_1, y = UMAP_2),
colour = "black",
linewidth = 1.5,
inherit.aes = FALSE) +
# Regulatory lineage — dashed black line
geom_path(data = curve2_df,
aes(x = UMAP_1, y = UMAP_2),
colour = "black",
linewidth = 1.5,
linetype = "dashed",
inherit.aes = FALSE) +
# Milestone labels
ggrepel::geom_label_repel(
data = milestones,
aes(x = UMAP_1,
y = UMAP_2,
label = trajectory_state,
fill = trajectory_state),
colour = "white",
fontface = "bold",
size = 5,
box.padding = 0.8,
inherit.aes = FALSE
) +
scale_colour_manual(values = flow_colours, name = "State") +
scale_fill_manual(values = flow_colours, guide = "none") +
theme_classic() +
theme(
plot.title = element_text(size = 14, face = "bold"),
plot.subtitle = element_text(size = 11),
legend.position = "right"
) +
labs(
title = "CD4 T-cell Differentiation Trajectory (Slingshot)",
subtitle = "Solid: Tnaive → Tcm → Tem → Temra | Dashed: Tnaive → Tcm → Treg",
x = "UMAP 1",
y = "UMAP 2"
)
print(p_traj)

ggsave("cd4_trajectory_slingshot.png",
plot = p_traj,
width = 14, height = 8, dpi = 300)
# Get Treg centroid
treg_centroid <- umap_df %>%
filter(trajectory_state == "Treg") %>%
summarise(UMAP_1 = median(UMAP_1), UMAP_2 = median(UMAP_2))
# Get last point of dashed curve
curve2_end <- tail(curve2_df, 1)
p_traj +
# Add arrow from curve endpoint to Treg centroid
annotate("segment",
x = curve2_end$UMAP_1,
y = curve2_end$UMAP_2,
xend = treg_centroid$UMAP_1,
yend = treg_centroid$UMAP_2,
colour = "black",
linewidth = 1.2,
linetype = "dashed",
arrow = arrow(length = unit(0.3,"cm"),
type = "closed")) +
labs(subtitle = "Solid: Tnaive→Tcm→Tem→Temra | Dashed: Tnaive→Tcm→Treg")

Pseudotime Feature
Plots
# Fix: add na.rm handling by filtering NAs before quantile calculation
# Use a numeric cutoff instead of "q95"
# Calculate q95 manually excluding NAs
eff_q95 <- quantile(clean_obj@meta.data$PT_effector,
probs = 0.95, na.rm = TRUE)
reg_q95 <- quantile(clean_obj@meta.data$PT_regulatory,
probs = 0.95, na.rm = TRUE)
cat("Effector q95 cutoff:", round(eff_q95, 2), "\n")
Effector q95 cutoff: 12.29
cat("Regulatory q95 cutoff:", round(reg_q95, 2), "\n")
Regulatory q95 cutoff: 17.04
p_eff <- FeaturePlot(clean_obj,
features = "PT_effector",
reduction = "umap",
cols = c("grey90","#C00000"),
pt.size = 0.5,
min.cutoff = 0,
max.cutoff = eff_q95) + # numeric value not "q95"
ggtitle("Effector pseudotime\nTnaive → Tcm → Tem → Temra") +
theme(plot.title = element_text(size = 11, face = "bold"))
p_reg <- FeaturePlot(clean_obj,
features = "PT_regulatory",
reduction = "umap",
cols = c("grey90","#4472C4"),
pt.size = 0.5,
min.cutoff = 0,
max.cutoff = reg_q95) + # numeric value not "q95"
ggtitle("Regulatory pseudotime\nTnaive → Tcm → Treg") +
theme(plot.title = element_text(size = 11, face = "bold"))
p_eff | p_reg

ggsave("pseudotime_featureplots.png",
plot = p_eff | p_reg,
width = 14, height = 6, dpi = 300)
Pseudotime Violin per
State per Lineage
# Build long-format dataframe for plotting
pt_df <- data.frame(
trajectory_state = clean_obj$trajectory_state[keep_cells],
PT_effector = pt_matrix[, "PT_effector"],
PT_regulatory = pt_matrix[, "PT_regulatory"]
) %>%
tidyr::pivot_longer(
cols = c(PT_effector, PT_regulatory),
names_to = "lineage",
values_to = "pseudotime"
) %>%
dplyr::filter(
!is.na(trajectory_state),
!is.na(pseudotime)
)
ggplot(pt_df,
aes(x = factor(trajectory_state,
levels = c("Tnaive","Tcm","Tem","Temra","Treg")),
y = pseudotime,
fill = trajectory_state)) +
geom_violin(scale = "width", trim = TRUE) +
geom_boxplot(width = 0.08, fill = "white", outlier.size = 0.3) +
facet_wrap(~ lineage,
ncol = 2,
labeller = labeller(lineage = c(
"PT_effector" = "Effector lineage (→ Temra)",
"PT_regulatory" = "Regulatory lineage (→ Treg)"
))) +
scale_fill_manual(values = flow_colours) +
theme_bw() +
theme(
legend.position = "none",
axis.text.x = element_text(angle = 45, hjust = 1, size = 10),
strip.text = element_text(size = 11, face = "bold")
) +
labs(
title = "Pseudotime per state per lineage",
subtitle = "Pseudotime should increase monotonically: Tnaive → Temra / Treg",
x = NULL,
y = "Slingshot pseudotime"
)

# Confirm monotonic increase — key validation
cat("Median pseudotime per state — effector lineage:\n")
Median pseudotime per state — effector lineage:
pt_df %>%
filter(lineage == "PT_effector") %>%
group_by(trajectory_state) %>%
summarise(median_pt = round(median(pseudotime, na.rm = TRUE), 2),
n = n(),
.groups = "drop") %>%
arrange(median_pt) %>%
print()
Gene Expression Along
Pseudotime Heatmap
DefaultAssay(clean_obj) <- "RNA"
key_genes <- c(
"LEF1","TCF7","KLF2","SELL", # Tnaive
"IL7R","S100A4","ITGB1","BCL2", # Tcm
"GZMK","CXCR3","CCR5", # Tem
"CX3CR1","GNLY","FGFBP2","PRF1", # Temra
"FOXP3","IL2RA","CTLA4","IKZF2" # Treg
)
key_genes <- intersect(key_genes, rownames(clean_obj))
cat("Genes found:", length(key_genes), "\n")
Genes found: 19
# Order cells by effector pseudotime
eff_pt <- clean_obj$PT_effector
eff_cells <- !is.na(eff_pt)
eff_order <- order(eff_pt[eff_cells])
eff_barcodes <- colnames(clean_obj)[eff_cells][eff_order]
expr_mat <- as.matrix(
GetAssayData(clean_obj, assay = "RNA", layer = "data")[key_genes, eff_barcodes]
)
# Smooth into 50 pseudotime bins
n_bins <- 50
bin_size <- floor(ncol(expr_mat) / n_bins)
bin_mat <- sapply(seq_len(n_bins), function(i) {
idx <- ((i-1)*bin_size + 1):min(i*bin_size, ncol(expr_mat))
rowMeans(expr_mat[, idx, drop = FALSE])
})
# Z-score per gene and clip to [-2, 2]
bin_z <- t(scale(t(bin_mat)))
bin_z[is.nan(bin_z)] <- 0
bin_z <- pmin(pmax(bin_z, -2), 2)
# Draw in notebook — remove filename parameter
pheatmap::pheatmap(
bin_z,
cluster_cols = FALSE,
cluster_rows = FALSE,
color = colorRampPalette(c("navy","white","firebrick3"))(100),
breaks = seq(-2, 2, length.out = 101),
main = "Gene expression along effector pseudotime\nTnaive → Tcm → Tem → Temra",
fontsize_row = 9,
border_color = NA,
show_colnames = FALSE,
gaps_row = c(4, 8, 11)
)
# Save separately AFTER displaying
dev.copy(png, "pseudotime_heatmap.png", width = 12, height = 7, units = "in", res = 300)
png
3
dev.off()
png
2

Gene Expression Along
Pseudotime Heatmap
# ── Regulatory lineage heatmap (Tnaive → Tcm → Treg) ──
# Treg lineage specific genes — include Treg markers prominently
reg_genes <- c(
"LEF1","TCF7","KLF2","SELL", # Tnaive — should decline
"IL7R","S100A4","ITGB1","BCL2", # Tcm — intermediate
"FOXP3","IL2RA","CTLA4","IKZF2", # Treg core — should rise
"TIGIT","TNFRSF18","TNFRSF4","CCR8" # Treg effector — rise at end
)
reg_genes <- intersect(reg_genes, rownames(clean_obj))
cat("Regulatory lineage genes found:", length(reg_genes), "\n")
Regulatory lineage genes found: 16
# Order cells by REGULATORY pseudotime (not effector)
reg_pt <- clean_obj$PT_regulatory
reg_cells <- !is.na(reg_pt)
reg_order <- order(reg_pt[reg_cells])
reg_barcodes <- colnames(clean_obj)[reg_cells][reg_order]
cat("Cells on regulatory lineage:", length(reg_barcodes), "\n")
Cells on regulatory lineage: 10267
# Extract expression matrix ordered by regulatory pseudotime
expr_mat_reg <- as.matrix(
GetAssayData(clean_obj, assay = "RNA", layer = "data")[reg_genes, reg_barcodes]
)
# Smooth into 50 bins
n_bins <- 50
bin_size_reg <- floor(ncol(expr_mat_reg) / n_bins)
bin_mat_reg <- sapply(seq_len(n_bins), function(i) {
idx <- ((i-1)*bin_size_reg + 1):min(i*bin_size_reg, ncol(expr_mat_reg))
rowMeans(expr_mat_reg[, idx, drop = FALSE])
})
# Z-score and clip
bin_z_reg <- t(scale(t(bin_mat_reg)))
bin_z_reg[is.nan(bin_z_reg)] <- 0
bin_z_reg <- pmin(pmax(bin_z_reg, -2), 2)
# Add pseudotime axis annotation (Early → Late)
col_annotation <- data.frame(
Pseudotime = seq(0, 1, length.out = n_bins),
row.names = paste0("bin", seq_len(n_bins))
)
colnames(bin_z_reg) <- paste0("bin", seq_len(n_bins))
annotation_colors <- list(
Pseudotime = colorRampPalette(c("grey90","#FFD700"))(100)
)
# Plot regulatory lineage heatmap
pheatmap::pheatmap(
bin_z_reg,
cluster_cols = FALSE,
cluster_rows = FALSE,
color = colorRampPalette(c("navy","white","firebrick3"))(100),
breaks = seq(-2, 2, length.out = 101),
main = "Gene expression along regulatory pseudotime\nTnaive → Tcm → Treg",
fontsize_row = 9,
border_color = NA,
show_colnames = FALSE,
annotation_col = col_annotation,
annotation_colors = annotation_colors,
annotation_names_col = FALSE,
gaps_row = c(4, 8, 12) # Tnaive | Tcm | Treg core | Treg effector
)
# Save
dev.copy(png, "pseudotime_heatmap_regulatory.png",
width = 12, height = 7, units = "in", res = 300)
png
3
dev.off()
png
2

Save Final Object
DefaultAssay(clean_obj) <- "integrated"
saveRDS(
clean_obj,
"CD4_reference_clean_Azimuth_Slingshot.rds",
compress = FALSE
)
cat("✅ Saved:", ncol(clean_obj), "cells\n")
✅ Saved: 11466 cells
cat("\nFinal trajectory state distribution:\n")
Final trajectory state distribution:
print(table(clean_obj$trajectory_state, useNA = "ifany"))
Tnaive Tcm Tem Temra Treg
2037 9067 145 10 207
cat("\nPseudotime columns added:\n")
Pseudotime columns added:
cat(" - PT_effector (Tnaive → Tcm → Tem → Temra)\n")
- PT_effector (Tnaive → Tcm → Tem → Temra)
cat(" - PT_regulatory (Tnaive → Tcm → Treg)\n")
- PT_regulatory (Tnaive → Tcm → Treg)
cat("\nObject ready for malignant cell projection via MapQuery\n")
Object ready for malignant cell projection via MapQuery
cat("return.model = TRUE confirmed — MapQuery will work\n")
return.model = TRUE confirmed — MapQuery will work
---
title: "CD4 T-cell Reference — Slingshot Trajectory"
author: Nasir Mahmood Abbasi
date: "`r Sys.Date()`"
output:
  html_notebook:
    number_sections: true
    toc: true
    toc_float:
      collapsed: true
    theme: journal
---

# Load Libraries
```{r setup, include=TRUE}
library(dplyr)
library(ggplot2)
library(Seurat)
library(patchwork)
library(slingshot)
library(SingleCellExperiment)
library(tidyr)
library(ggrepel)
library(pheatmap)

options(timeout = 300)
set.seed(123)

# Colour palette — consistent throughout
flow_colours <- c(
  "Tnaive" = "#4472C4",
  "Tcm"    = "#ED7D31",
  "Tem"    = "#A9D18E",
  "Temra"  = "#C00000",
  "Treg"   = "#FFD700"
)
```

# Load Object
```{r load-object}
clean_obj <- readRDS("../CD4_reference_clean_Azimuth_ready_for_Slingshot.rds")

cat("Total cells loaded:", ncol(clean_obj), "\n")
cat("\nAzimuth l2 composition:\n")
print(table(clean_obj$predicted.celltype.l2))
```

# Assign Trajectory States
```{r trajectory-states}
# Map Azimuth l2 → trajectory state labels
# CD4 CTL merged into Temra:
#   - Only 10 cells — too few for stable Slingshot endpoint
#   - Biologically justified: CD4 CTL is cytotoxic CD4, equivalent to Temra

clean_obj$trajectory_state <- dplyr::recode(
  clean_obj$predicted.celltype.l2,
  "CD4 Naive"     = "Tnaive",
  "CD4 TCM"       = "Tcm",
  "CD4 TEM"       = "Tem",
  "CD4 Temra/CTL"       = "Temra",   # merged into Temra
  "Treg"          = "Treg",
  .default        = NA_character_
)

clean_obj$trajectory_state <- factor(
  clean_obj$trajectory_state,
  levels = c("Tnaive","Tcm","Tem","Temra","Treg")
)

cat("Trajectory state distribution:\n")
print(table(clean_obj$trajectory_state, useNA = "ifany"))

# Visualise on UMAP
DimPlot(clean_obj,
        group.by = "trajectory_state",
        cols     = flow_colours,
        na.value = "grey90",
        label    = TRUE,
        repel    = TRUE) +
  ggtitle("Trajectory states — Azimuth l2 mapped to flow-equivalent labels")
```

# Run Slingshot Trajectory
```{r slingshot-run}
# ── Prepare input ──
# Use UMAP coordinates directly — no SCE conversion needed
umap_coords <- Embeddings(clean_obj, "umap")

# Only use cells with a confirmed trajectory state
keep_cells  <- !is.na(clean_obj$trajectory_state)
umap_sub    <- umap_coords[keep_cells, ]
cluster_sub <- droplevels(clean_obj$trajectory_state[keep_cells])

cat("Cells entering Slingshot:\n")
print(table(cluster_sub))

# ── Run Slingshot ──
# start.clus = Tnaive (confirmed earliest state)
# end.clus   = Temra (effector/cytotoxic arm) + Treg (regulatory arm)
# Two terminal states = two lineages
sce_sling <- slingshot(
  data          = umap_sub,
  clusterLabels = cluster_sub,
  start.clus    = "Tnaive",
  end.clus      = c("Temra", "Treg"),
  approx_points = 100,      # reduced from 300 — less smoothing
  stretch       = 0         # prevents curves extending beyond terminal clusters
)

cat("\nLineages after fix:\n")
print(slingLineages(sce_sling))
cat("\nLineages inferred:\n")
print(slingLineages(sce_sling))
# Expected:
# Lineage 1: Tnaive → Tcm → Tem → Temra  (effector arm)
# Lineage 2: Tnaive → Tcm → Treg          (regulatory arm)
```

# Extract Pseudotime
```{r pseudotime-extract}
# Extract pseudotime matrix (cells × lineages)
pt_matrix           <- slingPseudotime(sce_sling)
colnames(pt_matrix) <- c("PT_effector","PT_regulatory")

cat("Pseudotime summary — effector lineage:\n")
print(summary(pt_matrix[, "PT_effector"]))

cat("\nPseudotime summary — regulatory lineage:\n")
print(summary(pt_matrix[, "PT_regulatory"]))

# Extract pseudotime matrix (cells × lineages)
pt_matrix           <- slingPseudotime(sce_sling)
colnames(pt_matrix) <- c("PT_effector","PT_regulatory")

cat("Pseudotime summary — effector lineage:\n")
print(summary(pt_matrix[, "PT_effector"]))
cat("\nPseudotime summary — regulatory lineage:\n")
print(summary(pt_matrix[, "PT_regulatory"]))

# ── Add pseudotime to metadata via direct dataframe assignment ──
# IMPORTANT: Do NOT use clean_obj$PT_effector <- ... 
# That triggers Seurat's validObject() which fails on large objects
# Instead extract meta.data, modify as plain R dataframe, write back

meta <- clean_obj@meta.data
meta$PT_effector   <- NA_real_
meta$PT_regulatory <- NA_real_

common <- intersect(rownames(pt_matrix), rownames(meta))
meta[common, "PT_effector"]   <- pt_matrix[common, "PT_effector"]
meta[common, "PT_regulatory"] <- pt_matrix[common, "PT_regulatory"]

clean_obj@meta.data <- meta

cat("\nCells with effector pseudotime:  ",
    sum(!is.na(clean_obj@meta.data$PT_effector)), "\n")
cat("Cells with regulatory pseudotime:",
    sum(!is.na(clean_obj@meta.data$PT_regulatory)), "\n")
```

# Trajectory Plot — Curves on UMAP
```{r trajectory-plot, fig.width=14, fig.height=8}
# ── Extract curve coordinates ──
curves <- slingCurves(sce_sling)

# Curve 1 — effector lineage (Tnaive → Tcm → Tem → Temra)
curve1_df           <- as.data.frame(curves[[1]]$s[curves[[1]]$ord, ])
colnames(curve1_df) <- c("UMAP_1","UMAP_2")

# Curve 2 — regulatory lineage (Tnaive → Tcm → Treg)
curve2_df           <- as.data.frame(curves[[2]]$s[curves[[2]]$ord, ])
colnames(curve2_df) <- c("UMAP_1","UMAP_2")

# ── UMAP base dataframe ──
umap_df           <- as.data.frame(umap_coords)
colnames(umap_df) <- c("UMAP_1","UMAP_2")
umap_df$trajectory_state <- clean_obj$trajectory_state
umap_df <- umap_df %>% filter(!is.na(trajectory_state))

# ── Milestone centroids ──
milestones <- umap_df %>%
  group_by(trajectory_state) %>%
  summarise(
    UMAP_1 = median(UMAP_1),
    UMAP_2 = median(UMAP_2),
    .groups = "drop"
  )

# ── Main plot ──
p_traj <- ggplot(umap_df,
                 aes(x = UMAP_1, y = UMAP_2,
                     colour = trajectory_state)) +
  geom_point(size = 0.4, alpha = 0.5) +
  # Effector lineage — solid black line
  geom_path(data        = curve1_df,
            aes(x = UMAP_1, y = UMAP_2),
            colour      = "black",
            linewidth   = 1.5,
            inherit.aes = FALSE) +
  # Regulatory lineage — dashed black line
  geom_path(data        = curve2_df,
            aes(x = UMAP_1, y = UMAP_2),
            colour      = "black",
            linewidth   = 1.5,
            linetype    = "dashed",
            inherit.aes = FALSE) +
  # Milestone labels
  ggrepel::geom_label_repel(
    data        = milestones,
    aes(x     = UMAP_1,
        y     = UMAP_2,
        label = trajectory_state,
        fill  = trajectory_state),
    colour      = "white",
    fontface    = "bold",
    size        = 5,
    box.padding = 0.8,
    inherit.aes = FALSE
  ) +
  scale_colour_manual(values = flow_colours, name = "State") +
  scale_fill_manual(values   = flow_colours, guide = "none") +
  theme_classic() +
  theme(
    plot.title    = element_text(size = 14, face = "bold"),
    plot.subtitle = element_text(size = 11),
    legend.position = "right"
  ) +
  labs(
    title    = "CD4 T-cell Differentiation Trajectory (Slingshot)",
    subtitle = "Solid: Tnaive → Tcm → Tem → Temra | Dashed: Tnaive → Tcm → Treg",
    x        = "UMAP 1",
    y        = "UMAP 2"
  )

print(p_traj)

ggsave("cd4_trajectory_slingshot.png",
       plot   = p_traj,
       width  = 14, height = 8, dpi = 300)


# Get Treg centroid
treg_centroid <- umap_df %>%
  filter(trajectory_state == "Treg") %>%
  summarise(UMAP_1 = median(UMAP_1), UMAP_2 = median(UMAP_2))

# Get last point of dashed curve
curve2_end <- tail(curve2_df, 1)

p_traj +
  # Add arrow from curve endpoint to Treg centroid
  annotate("segment",
           x      = curve2_end$UMAP_1,
           y      = curve2_end$UMAP_2,
           xend   = treg_centroid$UMAP_1,
           yend   = treg_centroid$UMAP_2,
           colour = "black",
           linewidth = 1.2,
           linetype  = "dashed",
           arrow  = arrow(length = unit(0.3,"cm"),
                          type   = "closed")) +
  labs(subtitle = "Solid: Tnaive→Tcm→Tem→Temra | Dashed: Tnaive→Tcm→Treg")
```

# Pseudotime Feature Plots
```{r pseudotime-featureplots, fig.width=14, fig.height=6}
# Fix: add na.rm handling by filtering NAs before quantile calculation
# Use a numeric cutoff instead of "q95"

# Calculate q95 manually excluding NAs
eff_q95 <- quantile(clean_obj@meta.data$PT_effector, 
                     probs = 0.95, na.rm = TRUE)
reg_q95 <- quantile(clean_obj@meta.data$PT_regulatory, 
                     probs = 0.95, na.rm = TRUE)

cat("Effector q95 cutoff:",   round(eff_q95, 2), "\n")
cat("Regulatory q95 cutoff:", round(reg_q95, 2), "\n")

p_eff <- FeaturePlot(clean_obj,
                     features   = "PT_effector",
                     reduction  = "umap",
                     cols       = c("grey90","#C00000"),
                     pt.size    = 0.5,
                     min.cutoff = 0,
                     max.cutoff = eff_q95) +   # numeric value not "q95"
  ggtitle("Effector pseudotime\nTnaive → Tcm → Tem → Temra") +
  theme(plot.title = element_text(size = 11, face = "bold"))

p_reg <- FeaturePlot(clean_obj,
                     features   = "PT_regulatory",
                     reduction  = "umap",
                     cols       = c("grey90","#4472C4"),
                     pt.size    = 0.5,
                     min.cutoff = 0,
                     max.cutoff = reg_q95) +   # numeric value not "q95"
  ggtitle("Regulatory pseudotime\nTnaive → Tcm → Treg") +
  theme(plot.title = element_text(size = 11, face = "bold"))

p_eff | p_reg

ggsave("pseudotime_featureplots.png",
       plot  = p_eff | p_reg,
       width = 14, height = 6, dpi = 300)
```

# Pseudotime Violin per State per Lineage
```{r pseudotime-violin, fig.width=14, fig.height=6}
# Build long-format dataframe for plotting
pt_df <- data.frame(
  trajectory_state = clean_obj$trajectory_state[keep_cells],
  PT_effector      = pt_matrix[, "PT_effector"],
  PT_regulatory    = pt_matrix[, "PT_regulatory"]
) %>%
  tidyr::pivot_longer(
    cols      = c(PT_effector, PT_regulatory),
    names_to  = "lineage",
    values_to = "pseudotime"
  ) %>%
  dplyr::filter(
    !is.na(trajectory_state),
    !is.na(pseudotime)
  )

ggplot(pt_df,
       aes(x    = factor(trajectory_state,
                          levels = c("Tnaive","Tcm","Tem","Temra","Treg")),
           y    = pseudotime,
           fill = trajectory_state)) +
  geom_violin(scale = "width", trim = TRUE) +
  geom_boxplot(width = 0.08, fill = "white", outlier.size = 0.3) +
  facet_wrap(~ lineage,
             ncol     = 2,
             labeller = labeller(lineage = c(
               "PT_effector"   = "Effector lineage (→ Temra)",
               "PT_regulatory" = "Regulatory lineage (→ Treg)"
             ))) +
  scale_fill_manual(values = flow_colours) +
  theme_bw() +
  theme(
    legend.position = "none",
    axis.text.x     = element_text(angle = 45, hjust = 1, size = 10),
    strip.text      = element_text(size = 11, face = "bold")
  ) +
  labs(
    title    = "Pseudotime per state per lineage",
    subtitle = "Pseudotime should increase monotonically: Tnaive → Temra / Treg",
    x        = NULL,
    y        = "Slingshot pseudotime"
  )

# Confirm monotonic increase — key validation
cat("Median pseudotime per state — effector lineage:\n")
pt_df %>%
  filter(lineage == "PT_effector") %>%
  group_by(trajectory_state) %>%
  summarise(median_pt = round(median(pseudotime, na.rm = TRUE), 2),
            n         = n(),
            .groups   = "drop") %>%
  arrange(median_pt) %>%
  print()
```

# Gene Expression Along Pseudotime Heatmap
```{r gene-heatmap, fig.width=12, fig.height=7}
DefaultAssay(clean_obj) <- "RNA"

key_genes <- c(
  "LEF1","TCF7","KLF2","SELL",        # Tnaive
  "IL7R","S100A4","ITGB1","BCL2",     # Tcm
  "GZMK","CXCR3","CCR5",             # Tem
  "CX3CR1","GNLY","FGFBP2","PRF1",   # Temra
  "FOXP3","IL2RA","CTLA4","IKZF2"    # Treg
)
key_genes <- intersect(key_genes, rownames(clean_obj))
cat("Genes found:", length(key_genes), "\n")

# Order cells by effector pseudotime
eff_pt       <- clean_obj$PT_effector
eff_cells    <- !is.na(eff_pt)
eff_order    <- order(eff_pt[eff_cells])
eff_barcodes <- colnames(clean_obj)[eff_cells][eff_order]

expr_mat <- as.matrix(
  GetAssayData(clean_obj, assay = "RNA", layer = "data")[key_genes, eff_barcodes]
)

# Smooth into 50 pseudotime bins
n_bins   <- 50
bin_size <- floor(ncol(expr_mat) / n_bins)
bin_mat  <- sapply(seq_len(n_bins), function(i) {
  idx <- ((i-1)*bin_size + 1):min(i*bin_size, ncol(expr_mat))
  rowMeans(expr_mat[, idx, drop = FALSE])
})

# Z-score per gene and clip to [-2, 2]
bin_z            <- t(scale(t(bin_mat)))
bin_z[is.nan(bin_z)] <- 0
bin_z            <- pmin(pmax(bin_z, -2), 2)

# Draw in notebook — remove filename parameter
pheatmap::pheatmap(
  bin_z,
  cluster_cols  = FALSE,
  cluster_rows  = FALSE,
  color         = colorRampPalette(c("navy","white","firebrick3"))(100),
  breaks        = seq(-2, 2, length.out = 101),
  main          = "Gene expression along effector pseudotime\nTnaive → Tcm → Tem → Temra",
  fontsize_row  = 9,
  border_color  = NA,
  show_colnames = FALSE,
  gaps_row      = c(4, 8, 11)
)

# Save separately AFTER displaying
dev.copy(png, "pseudotime_heatmap.png", width = 12, height = 7, units = "in", res = 300)
dev.off()

```



# Gene Expression Along Pseudotime Heatmap
```{r gene-heatmap-Treg, fig.width=12, fig.height=7}
# ── Regulatory lineage heatmap (Tnaive → Tcm → Treg) ──

# Treg lineage specific genes — include Treg markers prominently
reg_genes <- c(
  "LEF1","TCF7","KLF2","SELL",          # Tnaive — should decline
  "IL7R","S100A4","ITGB1","BCL2",       # Tcm — intermediate
  "FOXP3","IL2RA","CTLA4","IKZF2",      # Treg core — should rise
  "TIGIT","TNFRSF18","TNFRSF4","CCR8"   # Treg effector — rise at end
)
reg_genes <- intersect(reg_genes, rownames(clean_obj))
cat("Regulatory lineage genes found:", length(reg_genes), "\n")

# Order cells by REGULATORY pseudotime (not effector)
reg_pt       <- clean_obj$PT_regulatory
reg_cells    <- !is.na(reg_pt)
reg_order    <- order(reg_pt[reg_cells])
reg_barcodes <- colnames(clean_obj)[reg_cells][reg_order]

cat("Cells on regulatory lineage:", length(reg_barcodes), "\n")

# Extract expression matrix ordered by regulatory pseudotime
expr_mat_reg <- as.matrix(
  GetAssayData(clean_obj, assay = "RNA", layer = "data")[reg_genes, reg_barcodes]
)

# Smooth into 50 bins
n_bins        <- 50
bin_size_reg  <- floor(ncol(expr_mat_reg) / n_bins)
bin_mat_reg   <- sapply(seq_len(n_bins), function(i) {
  idx <- ((i-1)*bin_size_reg + 1):min(i*bin_size_reg, ncol(expr_mat_reg))
  rowMeans(expr_mat_reg[, idx, drop = FALSE])
})

# Z-score and clip
bin_z_reg             <- t(scale(t(bin_mat_reg)))
bin_z_reg[is.nan(bin_z_reg)] <- 0
bin_z_reg             <- pmin(pmax(bin_z_reg, -2), 2)

# Add pseudotime axis annotation (Early → Late)
col_annotation <- data.frame(
  Pseudotime = seq(0, 1, length.out = n_bins),
  row.names  = paste0("bin", seq_len(n_bins))
)
colnames(bin_z_reg) <- paste0("bin", seq_len(n_bins))

annotation_colors <- list(
  Pseudotime = colorRampPalette(c("grey90","#FFD700"))(100)
)

# Plot regulatory lineage heatmap
pheatmap::pheatmap(
  bin_z_reg,
  cluster_cols      = FALSE,
  cluster_rows      = FALSE,
  color             = colorRampPalette(c("navy","white","firebrick3"))(100),
  breaks            = seq(-2, 2, length.out = 101),
  main              = "Gene expression along regulatory pseudotime\nTnaive → Tcm → Treg",
  fontsize_row      = 9,
  border_color      = NA,
  show_colnames     = FALSE,
  annotation_col    = col_annotation,
  annotation_colors = annotation_colors,
  annotation_names_col = FALSE,
  gaps_row          = c(4, 8, 12)  # Tnaive | Tcm | Treg core | Treg effector
)

# Save
dev.copy(png, "pseudotime_heatmap_regulatory.png",
         width = 12, height = 7, units = "in", res = 300)
dev.off()
```


















# Save Final Object
```{r save}
DefaultAssay(clean_obj) <- "integrated"

saveRDS(
  clean_obj,
  "CD4_reference_clean_Azimuth_Slingshot.rds",
  compress = FALSE
)

cat("✅ Saved:", ncol(clean_obj), "cells\n")
cat("\nFinal trajectory state distribution:\n")
print(table(clean_obj$trajectory_state, useNA = "ifany"))
cat("\nPseudotime columns added:\n")
cat("  - PT_effector   (Tnaive → Tcm → Tem → Temra)\n")
cat("  - PT_regulatory (Tnaive → Tcm → Treg)\n")
cat("\nObject ready for malignant cell projection via MapQuery\n")
cat("return.model = TRUE confirmed — MapQuery will work\n")
```
