1 Load libraries

2 Load Annotated Healthy Object

healthy_cd4 <- readRDS("../healthy_CD4_Subsetted_full_annotated_harmony.rds")

# Confirm cell counts and labels
cat("Healthy CD4 cells:", ncol(healthy_cd4), "\n")
Healthy CD4 cells: 12034 
print(table(healthy_cd4$predicted.celltype.l2, healthy_cd4$dataset))
                   
                    CD4T_10x_S1 CD4T_10x_S2 CD4T_lab
  CD4 CTL                     1           0       11
  CD4 Naive                1367         381      359
  CD4 Proliferating           9           4        0
  CD4 TCM                  2010        2856     4654
  CD4 TEM                    29          51       73
  Treg                       87         137        5
print(table(healthy_cd4$predicted.celltype.l3, healthy_cd4$dataset))
                   
                    CD4T_10x_S1 CD4T_10x_S2 CD4T_lab
  CD4 CTL                     1           0       11
  CD4 Naive                1370         382      363
  CD4 Proliferating           9           4        0
  CD4 TCM_1                1518        1859     4181
  CD4 TCM_2                 113         148      248
  CD4 TCM_3                 370         849      226
  CD4 TEM_1                  12          26        8
  CD4 TEM_2                  19          11       13
  CD4 TEM_3                   2           7       49
  Treg Memory                58         134        3
  Treg Naive                 31           9        0

3 Cell Cycle Scoring


DefaultAssay(healthy_cd4) <- "RNA"
healthy_cd4 <- NormalizeData(healthy_cd4, verbose = FALSE)

healthy_cd4 <- CellCycleScoring(
  healthy_cd4,
  s.features   = cc.genes.updated.2019$s.genes,
  g2m.features = cc.genes.updated.2019$g2m.genes,
  set.ident    = FALSE
)

print(table(healthy_cd4$Phase, healthy_cd4$dataset))
     
      CD4T_10x_S1 CD4T_10x_S2 CD4T_lab
  G1         1781        2021     1938
  G2M         242         263      491
  S          1480        1145     2673
DimPlot(healthy_cd4, group.by = "Phase") + ggtitle("Cell Cycle Phase")

4 SCTransform per Sample (with Cell Cycle + MT Regression)

5 Validation plots: dataset mixing and initial labels

# Validation plots
p1 <- DimPlot(reference_integrated, group.by = "predicted.celltype.l2",
              label = TRUE, repel = TRUE, label.size = 3) + NoLegend()
p2 <- DimPlot(reference_integrated, group.by = "dataset")
p3 <- DimPlot(reference_integrated, group.by = "seurat_clusters", label = TRUE)
(p1 | p2) / p3

5.1 Validation plots

# Validation plots
DimPlot(reference_integrated, group.by = "predicted.celltype.l1",
              label = TRUE, repel = TRUE, label.size = 3) + NoLegend()

DimPlot(reference_integrated, group.by = "predicted.celltype.l2",
              label = TRUE, repel = TRUE, label.size = 3) + NoLegend()

DimPlot(reference_integrated, group.by = "predicted.celltype.l3",
              label = TRUE, repel = TRUE, label.size = 3) + NoLegend()


print(table(healthy_cd4$predicted.celltype.l1, healthy_cd4$seurat_clusters))
       
           0    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16   17   18   19   20
  CD4 T 5035 2758 2713    0   98   10  185  725   33    4   97  368    0    0    0    0    0    2    0    0    0
  CD8 T    1    0    0    0    0    0    3    1    0    0    0    1    0    0    0    0    0    0    0    0    0
print(table(healthy_cd4$predicted.celltype.l2, healthy_cd4$seurat_clusters))
                   
                       0    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16   17   18   19   20
  CD4 CTL              0    0    0    0    0    0    5    0    1    3    3    0    0    0    0    0    0    0    0    0    0
  CD4 Naive            1 1116  911    0   15    0    0   51    0    0    0   13    0    0    0    0    0    0    0    0    0
  CD4 Proliferating   13    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
  CD4 TCM           4836 1599 1762    0   83   10  130  666   29    1   61  341    0    0    0    0    0    2    0    0    0
  CD4 TEM             63    0    0    0    0    0   53    2    1    0   33    1    0    0    0    0    0    0    0    0    0
  Treg               123   43   40    0    0    0    0    7    2    0    0   14    0    0    0    0    0    0    0    0    0
print(table(healthy_cd4$predicted.celltype.l3, healthy_cd4$seurat_clusters))
                   
                       0    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16   17   18   19   20
  CD4 CTL              0    0    0    0    0    0    5    0    1    3    3    0    0    0    0    0    0    0    0    0    0
  CD4 Naive            4 1116  916    0   15    0    0   51    0    0    0   13    0    0    0    0    0    0    0    0    0
  CD4 Proliferating   13    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
  CD4 TCM_1         3076 1599 1758    0   83    9   80  627   24    1    9  292    0    0    0    0    0    0    0    0    0
  CD4 TCM_2          443    3    1    0    0    1   27   13    4    0    0   17    0    0    0    0    0    0    0    0    0
  CD4 TCM_3         1309    0    0    0    0    0   22   25    1    0   58   28    0    0    0    0    0    2    0    0    0
  CD4 TEM_1           18    0    0    0    0    0   11    0    1    0   16    0    0    0    0    0    0    0    0    0    0
  CD4 TEM_2           13    0    0    0    0    0   26    2    0    0    0    2    0    0    0    0    0    0    0    0    0
  CD4 TEM_3           30    0    0    0    0    0   17    0    0    0   11    0    0    0    0    0    0    0    0    0    0
  Treg Memory        128   20   21    0    0    0    0    7    2    0    0   17    0    0    0    0    0    0    0    0    0
  Treg Naive           2   20   17    0    0    0    0    1    0    0    0    0    0    0    0    0    0    0    0    0    0

6 Save Reference (before MapQuery)

# Save
saveRDS(reference_integrated, "CD4_reference_RPCA_SCT_integrated.rds")
cat("✅ COMPLETE:", ncol(reference_integrated), "cells integrated\n")
✅ COMPLETE: 12034 cells integrated

7 Marker Module Scores (Differentiation States)


DefaultAssay(reference_integrated) <- "RNA"


marker_list <- list(
  Tnaive = c("CCR7","SELL","LEF1","TCF7","IL7R","CD27","PTPRC"),
  Tcm    = c("CCR7","SELL","CD27","IL7R","BCL2","TCF7"),
  Tem    = c("CCR6","CXCR3","GZMK","PRF1","IFNG"),
  Temra  = c("GZMB","PRF1","KLRG1","CX3CR1"),
  Treg   = c("FOXP3","IL2RA","CTLA4","IKZF2"),
  Tex    = c("PDCD1","CTLA4","LAG3","TIGIT","TOX","ENTPD1"),
  CD4CTL = c("GZMB","PRF1","NKG7","KLRG1","CX3CR1")
)

# Keep only genes present
marker_list <- lapply(marker_list, function(x)
  intersect(x, rownames(reference_integrated))
)

# Drop any empty sets to avoid AddModuleScore warnings
marker_list <- marker_list[vapply(marker_list, length, 1L) > 0]

for (state in names(marker_list)) {
  reference_integrated <- AddModuleScore(
    reference_integrated,
    features = list(marker_list[[state]]),
    name     = state
  )
}

score_features <- paste0(names(marker_list), "1")

FeaturePlot(reference_integrated,
            features   = score_features,
            ncol       = 4,
            cols       = c("lightblue","red"),
            max.cutoff = "q95") +
  plot_annotation(title = "Differentiation State Module Scores")

7.1 FeaturePlots Markers based

# ---- Load libraries ----
library(Seurat)
library(ggplot2)
library(patchwork)

# ---- Define markers for differentiation states ----
tnaive_markers <- c("CCR7", "SELL", "LEF1", "TCF7", "IL7R", "CD27", "CD45RA")
tcm_markers    <- c("CCR7", "SELL", "CD45RO", "IL7R", "CD27")
tem_markers    <- c("CCR6", "CXCR3", "GZMK", "PRF1", "IFNG", "CD45RO")
temra_markers  <- c("GZMB", "PRF1", "KLRG1", "CX3CR1", "CD45RA")
tex_markers    <- c("PDCD1", "CTLA4", "LAG3", "TIGIT", "TOX", "ENTPD1")
cd4ctl_markers <- c("GZMB", "PRF1", "NKG7", "KLRG1", "CX3CR1")  # CD4 cytotoxic markers

# ---- Keep only markers present in the dataset ----
tnaive_markers <- tnaive_markers[tnaive_markers %in% rownames(reference_integrated)]
tcm_markers    <- tcm_markers[tcm_markers %in% rownames(reference_integrated)]
tem_markers    <- tem_markers[tem_markers %in% rownames(reference_integrated)]
temra_markers  <- temra_markers[temra_markers %in% rownames(reference_integrated)]
tex_markers    <- tex_markers[tex_markers %in% rownames(reference_integrated)]
cd4ctl_markers <- cd4ctl_markers[cd4ctl_markers %in% rownames(reference_integrated)]

# ---- Function to generate patchwork feature plots ----
plot_markers_grid <- function(marker_vector, state_name) {
  plots <- lapply(marker_vector, function(gene){
    FeaturePlot(reference_integrated, features = gene, reduction = "umap",
                cols = c("lightblue", "red"), label = TRUE) +
      theme(plot.title = element_text(hjust = 0.5))
  })
  wrap_plots(plots) + plot_annotation(title = paste(state_name, "Marker Expression"))
}

# ---- Generate grids for each state ----
tnaive_plot <- plot_markers_grid(tnaive_markers, "Tnaive")
tcm_plot    <- plot_markers_grid(tcm_markers, "Tcm")
tem_plot    <- plot_markers_grid(tem_markers, "Tem")
temra_plot  <- plot_markers_grid(temra_markers, "Temra")
tex_plot    <- plot_markers_grid(tex_markers, "Tex")
cd4ctl_plot <- plot_markers_grid(cd4ctl_markers, "CD4 CTL")

# ---- Display plots ----
tnaive_plot

tcm_plot

tem_plot

temra_plot

tex_plot

cd4ctl_plot

8 ADT FeaturePlots per differentiation state

# Tnaive: CD45RA+ CCR7+ CD127+ (naive/memory core)
tnaive_adt <- c("CD45RA-TotalC", "CD197-TotalC", "CD127-TotalC")

# Tcm: CD45RO+ CCR7+ CD127+ (central memory)
tcm_adt <- c("CD45RO-TotalC", "CD197-TotalC", "CD127-TotalC")

# Tem: CD45RO+ CCR7− (effector memory)
tem_adt <- c("CD45RO-TotalC")  # CCR7 low expected

# Temra/CD4CTL: CD45RA+ + exhaustion/effector (PD-1, TIGIT, HLA-DR)
temra_adt <- c("CD45RA-TotalC", "PD-1-TotalC", "TIGIT-TotalC", "HLA-DR-TotalC")

# Treg: CD25hi CD127low ±CD45RA (naive Treg vs activated)
treg_adt <- c("CD25-TotalC", "CD127-TotalC", "CD45RA-TotalC")

# Tex: exhaustion signature
tex_adt <- c("PD-1-TotalC", "TIGIT-TotalC", "HLA-DR-TotalC")


DefaultAssay(reference_integrated) <- "ADT"

plot_markers_grid_ADT <- function(marker_vector, state_name) {
  plots <- lapply(marker_vector, function(prot){
    FeaturePlot(reference_integrated, features = prot, reduction = "umap",
                cols = c("lightblue","red")) +
      ggtitle(prot)
  })
  wrap_plots(plots) + plot_annotation(title = paste(state_name, "ADT expression"))
}

plot_markers_grid_ADT(tnaive_adt, "Tnaive")

plot_markers_grid_ADT(tcm_adt,    "Tcm")

plot_markers_grid_ADT(tem_adt,    "Tem")

plot_markers_grid_ADT(temra_adt,  "Temra/CD4CTL")

plot_markers_grid_ADT(treg_adt,   "Treg")

plot_markers_grid_ADT(tex_adt,    "Tex")

NA
NA

8.1 Gating in 10x_S1 (uses *-TotalC)

## 10x_S1 ADT gating  (CITE-seq)
DefaultAssay(reference_integrated) <- "ADT"

s1 <- subset(reference_integrated, subset = dataset == "CD4T_10x_S1")
adt_s1 <- LayerData(s1[["ADT"]], layer = "data.CD4T_10x_S1.1")
rownames(adt_s1) <- gsub("-TotalC", "", rownames(adt_s1))  # CD45RA, CD197, ...

adt_z_s1 <- t(scale(t(adt_s1)))

is_hi <- function(g, zmat, z = 0.5) if (g %in% rownames(zmat)) zmat[g, ] > z else rep(FALSE, ncol(zmat))

CD45RA_hi <- is_hi("CD45RA", adt_z_s1)
CD45RO_hi <- is_hi("CD45RO", adt_z_s1)
CCR7_hi   <- is_hi("CD197",  adt_z_s1)
CD127_hi  <- is_hi("CD127",  adt_z_s1)
CD25_hi   <- is_hi("CD25",   adt_z_s1)
PD1_hi    <- is_hi("PD-1",   adt_z_s1)
TIGIT_hi  <- is_hi("TIGIT",  adt_z_s1)
HLADR_hi  <- is_hi("HLA-DR", adt_z_s1)

CCR7_lo  <- !CCR7_hi
CD127_lo <- !CD127_hi

state_s1 <- rep("Other", ncol(adt_s1))

# Treg
treg_idx <- CD25_hi & CD127_lo
state_s1[treg_idx] <- "Treg_ADT"

# Tnaive
tnaive_idx <- CD45RA_hi & CCR7_hi & CD127_hi & !CD45RO_hi & !PD1_hi & !TIGIT_hi
state_s1[tnaive_idx & state_s1 == "Other"] <- "Tnaive_ADT"

# Tcm
tcm_idx <- CD45RO_hi & CCR7_hi & !CD45RA_hi
state_s1[tcm_idx & state_s1 == "Other"] <- "Tcm_ADT"

# Tem
tem_idx <- CD45RO_hi & CCR7_lo & !CD45RA_hi
state_s1[tem_idx & state_s1 == "Other"] <- "Tem_ADT"

# Temra / CD4CTL
temra_idx <- CD45RA_hi & CCR7_lo & (PD1_hi | TIGIT_hi | HLADR_hi)
state_s1[temra_idx & state_s1 == "Other"] <- "Temra_CTL_ADT"

table(state_s1)
state_s1
        Other       Tcm_ADT       Tem_ADT Temra_CTL_ADT    Tnaive_ADT      Treg_ADT 
         1763            99           694           243           100           604 

8.2 Recover ADT for CD4T_lab from original object and align cells


# Match cells
lab_cells <- colnames(subset(reference_integrated, dataset == "CD4T_lab"))
lab_raw   <- sub("^PBMC_", "", lab_cells)

adt_lab_raw <- adt_lab_raw[, lab_raw, drop = FALSE]
Error in adt_lab_raw[, lab_raw, drop = FALSE] : subscript out of bounds

8.3 Extract ONLY overlapping cells


# Create new assay for CD4T_lab ADT data
reference_integrated[["ADT_lab"]] <- CreateAssay5Object(counts = adt_lab_raw)

# Verify it was added
lab <- subset(reference_integrated, subset = dataset == "CD4T_lab")
dim(LayerData(lab[["ADT_lab"]], layer = "counts"))
head(rownames(LayerData(lab[["ADT_lab"]], layer = "counts")))

8.4 Create and add ADT_lab assay


# Create new assay for CD4T_lab ADT data
reference_integrated[["ADT_lab"]] <- CreateAssay5Object(counts = adt_lab_raw)
Error in `fn()`:
! Cannot add new cells with [[<-
Run `]8;;x-r-run:rlang::last_trace()rlang::last_trace()]8;;` to see where the error occurred.

8.5 Now gate on CD4T_lab cells (works!)


# Create new assay for CD4T_lab ADT data
reference_integrated[["ADT_lab"]] <- CreateAssay5Object(counts = adt_lab_raw)

# Verify it was added
lab <- subset(reference_integrated, subset = dataset == "CD4T_lab")
dim(LayerData(lab[["ADT_lab"]], layer = "counts"))
head(rownames(LayerData(lab[["ADT_lab"]], layer = "counts")))

8.6 Gating in CD4T_lab (uses markers without -TotalC)


DefaultAssay(reference_integrated) <- "ADT"

## CD4T_lab ADT gating  (lab panel)

# Subset lab dataset
lab <- subset(reference_integrated, subset = dataset == "CD4T_lab")

# Get ADT expression (RNA-like markers without -TotalC)
adt_lab <- LayerData(lab[["ADT"]], layer = "data")   # not slot=
# Check rownames:
head(rownames(adt_lab))
NULL
# Z-score
adt_z_lab <- t(scale(t(adt_lab)))

is_hi <- function(g, zmat, z = 0.5) {
  if (!g %in% rownames(zmat)) return(rep(FALSE, ncol(zmat)))
  zmat[g, ] > z
}

CD45RA_hi_l <- is_hi("CD45RA", adt_z_lab)
CD45RO_hi_l <- is_hi("CD45RO", adt_z_lab)
CCR7_hi_l   <- is_hi("CCR7",   adt_z_lab)
CD127_hi_l  <- is_hi("CD127",  adt_z_lab)
CD25_hi_l   <- is_hi("CD25",   adt_z_lab)
PD1_hi_l    <- is_hi("PD1",    adt_z_lab)
TIGIT_hi_l  <- is_hi("TIGIT",  adt_z_lab)
HLADR_hi_l  <- is_hi("HLA-DR", adt_z_lab)  # if present

CCR7_lo_l  <- !CCR7_hi_l
CD127_lo_l <- !CD127_hi_l

state_lab <- rep("Other", ncol(adt_lab))

# Treg
treg_idx_l <- CD25_hi_l & CD127_lo_l
state_lab[treg_idx_l] <- "Treg_ADT"

# Tnaive
tnaive_idx_l <- CD45RA_hi_l & CCR7_hi_l & CD127_hi_l & !CD45RO_hi_l & !PD1_hi_l & !TIGIT_hi_l
state_lab[tnaive_idx_l & state_lab == "Other"] <- "Tnaive_ADT"

# Tcm
tcm_idx_l <- CD45RO_hi_l & CCR7_hi_l & !CD45RA_hi_l
state_lab[tcm_idx_l & state_lab == "Other"] <- "Tcm_ADT"

# Tem
tem_idx_l <- CD45RO_hi_l & CCR7_lo_l & !CD45RA_hi_l
state_lab[tem_idx_l & state_lab == "Other"] <- "Tem_ADT"

# Temra/CD4CTL
temra_idx_l <- CD45RA_hi_l & CCR7_lo_l & (PD1_hi_l | TIGIT_hi_l | HLADR_hi_l)
state_lab[temra_idx_l & state_lab == "Other"] <- "Temra_CTL_ADT"

table(state_lab)
< table of extent 0 >

8.7 Assign per‑cell differentiation state

# Collect module score column names (they end with "1")
state_names <- names(marker_list)              # e.g. "Tnaive","Tcm",...
score_cols  <- paste0(state_names, "1")        # "Tnaive1","Tcm1",...

# Build matrix of scores
score_mat <- as.data.frame(reference_integrated@meta.data[, score_cols])

# For each cell, find which state has the highest score
state_max_idx   <- apply(score_mat, 1, which.max)
state_max_label <- state_names[state_max_idx]

# Optionally impose a minimum separation: if top score is not sufficiently
# higher than the second, call the cell "Mixed"
top_vals  <- apply(score_mat, 1, max)
second_vals <- apply(score_mat, 1, function(x) sort(x, decreasing = TRUE)[2])
delta <- top_vals - second_vals
state_max_label[delta < 0.05] <- "Mixed"   # 0.05 is an example threshold

# Store in metadata
reference_integrated$diff_state <- factor(state_max_label,
                                          levels = c("Tnaive","Tcm","Tem","Temra",
                                                     "Treg","Tex","CD4CTL","Mixed"))

# UMAP colored by inferred differentiation state
DimPlot(reference_integrated, group.by = "diff_state", label = TRUE, label.box = T, repel = T) +
  ggtitle("Inferred CD4 T-cell differentiation states")

9 ADT markers and extraction (10x_S1 already run)


# o you can do:
# 
# Tnaive: CD3+ CD4+ CD45RA+ CD45RO− CCR7+ CD127+
# 
# Tcm: CD3+ CD4+ CD45RO+ CCR7+ CD45RA−
# 
# Tem: CD3+ CD4+ CD45RO+ CCR7− (or low) CD45RA−
# 
# Temra: CD3+ CD4+ CD45RA+ CD45RO+/− CCR7−



DefaultAssay(reference_integrated) <- "ADT"

adt_markers_raw <- c("CD3-TotalC","CD4-TotalC","CD45RA-TotalC",
                     "CD45RO-TotalC","CD197-TotalC","CD127-TotalC",
                     "CD25-TotalC","HLA-DR-TotalC","PD1-TotalC","TIGIT-TotalC")

FeaturePlot(reference_integrated,
            features = adt_markers_raw,
            reduction = "umap",
            cols = c("lightblue","red"),
            ncol = 3) +
  plot_annotation(title = "Key ADT markers (10x_S1 cells)")

10 Derive ADT‑based “high/low” categories

# Full UMAP, 10x_S1 colored, others grey
p1 <- DimPlot(reference_integrated,
              group.by = "ADT_state2",
              reduction = "umap",
              pt.size   = 0.5) +
  scale_color_manual(values = c("Tnaive_ADT" = "#FF6B6B", "Tcm_ADT" = "#4ECDC4",
                                "Tem_ADT" = "#45B7D1", "Temra_CTL_ADT" = "#96CEB4",
                                "Treg_ADT" = "#FECA57", "Other" = "grey80")) +
  ggtitle("Full UMAP: 10x_S1 colored by ADT, others grey")

p2 <- DimPlot(subset(reference_integrated, dataset == "CD4T_10x_S1"),
              group.by = "ADT_state2",
              reduction = "umap",
              label = TRUE, label.box = TRUE, repel = TRUE) +
  ggtitle("10x_S1 ONLY") +
  NoLegend()

p1 | p2

10.1 Logical gating for Tnaive/Tcm/Tem/Temra

# Boolean vectors per marker
CD45RA_hi <- is_high("CD45RA", adt_z)
CD45RO_hi <- is_high("CD45RO", adt_z)
CCR7_hi   <- is_high("CD197",  adt_z)   # CCR7
CD127_hi  <- is_high("CD127",  adt_z)

# Gating rules
Tnaive_idx <-  CD45RA_hi &  CCR7_hi & CD127_hi & !CD45RO_hi
Tcm_idx    <- !CD45RA_hi &  CCR7_hi &  CD45RO_hi
Tem_idx    <- !CD45RA_hi & !CCR7_hi &  CD45RO_hi
Temra_idx  <-  CD45RA_hi & !CCR7_hi & (CD45RO_hi | !CD45RO_hi)  # RA+ CCR7−

# Assign state; resolve conflicts by priority
state <- rep("Other", ncol(adt))
state[Tnaive_idx] <- "Tnaive"
state[Tcm_idx]    <- "Tcm"
state[Tem_idx]    <- "Tem"
state[Temra_idx]  <- "Temra"

# Store in metadata (10x_S1 subset)
s1_subset$ADT_state <- state

# Put back into full object
reference_integrated$ADT_state <- NA
reference_integrated$ADT_state[colnames(s1_subset)] <- s1_subset$ADT_state
reference_integrated$ADT_state <- factor(reference_integrated$ADT_state,
                                         levels = c("Tnaive","Tcm","Tem","Temra","Other"))

11 Visualize ADT‑defined states on your integrated UMAP

DefaultAssay(reference_integrated) <- "integrated"

DimPlot(reference_integrated,
        group.by = "ADT_state",
        reduction = "umap",
        label = TRUE, label.box = T, repel = T) +
  ggtitle("CD4T_10x_S1 cells annotated by ADT gating")

12 Monocle3 Trajectory

cds <- as.cell_data_set(reference_integrated)

# Transfer UMAP + partitions from Seurat
reducedDim(cds, "UMAP") <- Embeddings(reference_integrated, "umap")
cds@clusters$UMAP$partitions <- factor(rep(1, ncol(cds)))
cds@clusters$UMAP$clusters   <- reference_integrated$seurat_clusters
colData(cds)$celltype         <- reference_integrated$predicted.celltype.l2
colData(cds)$dataset          <- reference_integrated$dataset
cds <- learn_graph(cds,
                   use_partition = FALSE,
                   close_loop    = FALSE,
                   verbose       = FALSE)

plot_cells(cds,
           color_cells_by          = "celltype",
           label_groups_by_cluster = FALSE,
           label_leaves            = TRUE,
           label_branch_points     = TRUE,
           graph_label_size        = 4) +
  ggtitle("Monocle3 — CD4 T-cell Trajectory Milestones")
# Root = CD4 Naive centroid
naive_ids <- rownames(colData(cds))[
  colData(cds)$celltype %in% c("CD4 Naive")
]

get_root_node <- function(cds, cell_ids) {
  closest <- cds@principal_graph_aux$UMAP$pr_graph_cell_proj_closest_vertex
  closest <- as.matrix(closest[colnames(cds), ])
  igraph::V(principal_graph(cds)$UMAP)$name[
    as.numeric(names(which.max(table(closest[cell_ids, ]))))]
}

cds <- order_cells(cds, root_pr_nodes = get_root_node(cds, naive_ids))
reference_integrated$monocle3_pseudotime <- pseudotime(cds)

plot_cells(cds,
           color_cells_by    = "pseudotime",
           label_cell_groups = FALSE) +
  scale_color_viridis_c(option = "plasma") +
  ggtitle("Pseudotime: Naive → Memory → Effector → Treg")

13 Save Reference (before MapQuery)

saveRDS(reference_integrated,
        "CD4_reference_RPCA_SCT_Trajectory_annotated.rds")

cat("Reference saved:", ncol(reference_integrated), "cells\n")

```

---
title: "CD4 T-cell Reference + Sezary Projection"
subtitle: "RPCA-SCT Integration | Monocle3 | MapQuery"
author: Nasir Mahmood Abbasi
date: "`r Sys.Date()`"
output:
  html_notebook:
    number_sections: true
    toc: true
    toc_float:
      collapsed: true
    theme: journal
---


# Load libraries
```{r, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE,
                      fig.width = 12, fig.height = 8)
library(Seurat)
library(monocle3)
library(SeuratWrappers)
library(patchwork)
library(ggplot2)
library(RColorBrewer)
options(future.globals.maxSize = 8e9)
set.seed(123)
```




# Load Annotated Healthy Object
```{r load object}
healthy_cd4 <- readRDS("../healthy_CD4_Subsetted_full_annotated_harmony.rds")

# Confirm cell counts and labels
cat("Healthy CD4 cells:", ncol(healthy_cd4), "\n")
print(table(healthy_cd4$predicted.celltype.l2, healthy_cd4$dataset))
print(table(healthy_cd4$predicted.celltype.l3, healthy_cd4$dataset))

```




# Cell Cycle Scoring
```{r cellcycle}

DefaultAssay(healthy_cd4) <- "RNA"
healthy_cd4 <- NormalizeData(healthy_cd4, verbose = FALSE)

healthy_cd4 <- CellCycleScoring(
  healthy_cd4,
  s.features   = cc.genes.updated.2019$s.genes,
  g2m.features = cc.genes.updated.2019$g2m.genes,
  set.ident    = FALSE
)

print(table(healthy_cd4$Phase, healthy_cd4$dataset))
DimPlot(healthy_cd4, group.by = "Phase") + ggtitle("Cell Cycle Phase")
```

# SCTransform per Sample (with Cell Cycle + MT Regression)
```{r SCT+RPCA}
# ── 0. Split + SCT ────────────────────────────────────────────────────────
ref_list <- SplitObject(healthy_cd4, split.by = "dataset")

ref_list <- lapply(ref_list, function(x) {
  SCTransform(
    x,
    vars.to.regress = c("S.Score", "G2M.Score", "percent.mt"),
    vst.flavor = "v2",
    verbose = FALSE
  )
})
cat("SCT complete per sample\n")
print(sapply(ref_list, ncol))

# ── Select integration features ────────────────────────────────────────
ref_features <- SelectIntegrationFeatures(object.list = ref_list, nfeatures = 3000)
ref_features <- ref_features[!grepl("^TRBV|^TRAV|^TRGV|^TRDV|^HLA-|^XIST", ref_features)]
cat("Features after TCR/HLA filter:", length(ref_features), "\n")

# ── PrepSCTIntegration ─────────────────────────────────────────────────
ref_list <- PrepSCTIntegration(
  object.list     = ref_list,
  anchor.features = ref_features,
  verbose         = FALSE
)
cat("PrepSCTIntegration done\n")

# ── SKIP SYNCHRONIZATION - DIRECT TO PCA & RPCA ──────────────────────────
cat("Running PCA → RPCA integration (no manual sync needed)...\n")

# PCA per sample (RPCA uses these PCs)
ref_list <- lapply(ref_list, function(x) {
  RunPCA(x, assay = "SCT", features = ref_features, verbose = FALSE)
})
cat("✅ PCA complete\n")

# Find anchors (RPCA handles layer mismatches automatically)
ref_anchors <- FindIntegrationAnchors(
  object.list          = ref_list,
  normalization.method = "SCT",
  anchor.features      = ref_features,
  reduction            = "rpca",
  dims                 = 1:30,
  k.anchor             = 15,
  verbose              = TRUE
)
cat("✅ Anchors found\n")


# ── Integrate ──────────────────────────────────────────────────────────
reference_integrated <- IntegrateData(
  anchorset            = ref_anchors,
  normalization.method = "SCT",
  verbose              = TRUE
)
cat("✅ IntegrateData complete:", ncol(reference_integrated), "cells\n")

# ── Standardize ────────────────────────────────────────────────────────
VariableFeatures(reference_integrated) <- ref_features
DefaultAssay(reference_integrated) <- "integrated"

# ── Scale + PCA + UMAP ────────────────────────────────────────────────
reference_integrated <- ScaleData(reference_integrated, verbose = FALSE)
reference_integrated <- RunPCA(reference_integrated, npcs = 50,
                               return.model = TRUE, verbose = FALSE)
ElbowPlot(reference_integrated, ndims = 50)

dims_use <- 1:20

reference_integrated <- RunUMAP(reference_integrated, dims = dims_use,
                                return.model = TRUE, verbose = FALSE)
reference_integrated <- FindNeighbors(reference_integrated, dims = dims_use,
                                      verbose = FALSE)
reference_integrated <- FindClusters(reference_integrated, resolution = 0.3,
                                     verbose = FALSE)


```


# Validation plots: dataset mixing and initial labels
```{r validate, fig.width=14, fig.height=10}
# Validation plots
p1 <- DimPlot(reference_integrated, group.by = "predicted.celltype.l2",
              label = TRUE, repel = TRUE, label.size = 3) + NoLegend()
p2 <- DimPlot(reference_integrated, group.by = "dataset")
p3 <- DimPlot(reference_integrated, group.by = "seurat_clusters", label = TRUE)
(p1 | p2) / p3
```

## Validation plots
```{r , fig.width=12, fig.height=8}
# Validation plots
DimPlot(reference_integrated, group.by = "predicted.celltype.l1",
              label = TRUE, repel = TRUE, label.size = 3) + NoLegend()
DimPlot(reference_integrated, group.by = "predicted.celltype.l2",
              label = TRUE, repel = TRUE, label.size = 3) + NoLegend()
DimPlot(reference_integrated, group.by = "predicted.celltype.l3",
              label = TRUE, repel = TRUE, label.size = 3) + NoLegend()

print(table(healthy_cd4$predicted.celltype.l1, healthy_cd4$seurat_clusters))
print(table(healthy_cd4$predicted.celltype.l2, healthy_cd4$seurat_clusters))
print(table(healthy_cd4$predicted.celltype.l3, healthy_cd4$seurat_clusters))

```

# Save Reference (before MapQuery)
```{r save-ref}
# Save
saveRDS(reference_integrated, "CD4_reference_RPCA_SCT_integrated.rds")
cat("✅ COMPLETE:", ncol(reference_integrated), "cells integrated\n")


```



# Marker Module Scores (Differentiation States)
```{r markersscore, fig.width=16, fig.height=8}

DefaultAssay(reference_integrated) <- "RNA"


marker_list <- list(
  Tnaive = c("CCR7","SELL","LEF1","TCF7","IL7R","CD27","PTPRC"),
  Tcm    = c("CCR7","SELL","CD27","IL7R","BCL2","TCF7"),
  Tem    = c("CCR6","CXCR3","GZMK","PRF1","IFNG"),
  Temra  = c("GZMB","PRF1","KLRG1","CX3CR1"),
  Treg   = c("FOXP3","IL2RA","CTLA4","IKZF2"),
  Tex    = c("PDCD1","CTLA4","LAG3","TIGIT","TOX","ENTPD1"),
  CD4CTL = c("GZMB","PRF1","NKG7","KLRG1","CX3CR1")
)

# Keep only genes present
marker_list <- lapply(marker_list, function(x)
  intersect(x, rownames(reference_integrated))
)

# Drop any empty sets to avoid AddModuleScore warnings
marker_list <- marker_list[vapply(marker_list, length, 1L) > 0]

for (state in names(marker_list)) {
  reference_integrated <- AddModuleScore(
    reference_integrated,
    features = list(marker_list[[state]]),
    name     = state
  )
}

score_features <- paste0(names(marker_list), "1")

FeaturePlot(reference_integrated,
            features   = score_features,
            ncol       = 4,
            cols       = c("lightblue","red"),
            max.cutoff = "q95") +
  plot_annotation(title = "Differentiation State Module Scores")

```
## FeaturePlots Markers based
```{r,fig.height=6, fig.width=10}
# ---- Load libraries ----
library(Seurat)
library(ggplot2)
library(patchwork)

# ---- Define markers for differentiation states ----
tnaive_markers <- c("CCR7", "SELL", "LEF1", "TCF7", "IL7R", "CD27", "CD45RA")
tcm_markers    <- c("CCR7", "SELL", "CD45RO", "IL7R", "CD27")
tem_markers    <- c("CCR6", "CXCR3", "GZMK", "PRF1", "IFNG", "CD45RO")
temra_markers  <- c("GZMB", "PRF1", "KLRG1", "CX3CR1", "CD45RA")
tex_markers    <- c("PDCD1", "CTLA4", "LAG3", "TIGIT", "TOX", "ENTPD1")
cd4ctl_markers <- c("GZMB", "PRF1", "NKG7", "KLRG1", "CX3CR1")  # CD4 cytotoxic markers

# ---- Keep only markers present in the dataset ----
tnaive_markers <- tnaive_markers[tnaive_markers %in% rownames(reference_integrated)]
tcm_markers    <- tcm_markers[tcm_markers %in% rownames(reference_integrated)]
tem_markers    <- tem_markers[tem_markers %in% rownames(reference_integrated)]
temra_markers  <- temra_markers[temra_markers %in% rownames(reference_integrated)]
tex_markers    <- tex_markers[tex_markers %in% rownames(reference_integrated)]
cd4ctl_markers <- cd4ctl_markers[cd4ctl_markers %in% rownames(reference_integrated)]

# ---- Function to generate patchwork feature plots ----
plot_markers_grid <- function(marker_vector, state_name) {
  plots <- lapply(marker_vector, function(gene){
    FeaturePlot(reference_integrated, features = gene, reduction = "umap",
                cols = c("lightblue", "red"), label = TRUE) +
      theme(plot.title = element_text(hjust = 0.5))
  })
  wrap_plots(plots) + plot_annotation(title = paste(state_name, "Marker Expression"))
}

# ---- Generate grids for each state ----
tnaive_plot <- plot_markers_grid(tnaive_markers, "Tnaive")
tcm_plot    <- plot_markers_grid(tcm_markers, "Tcm")
tem_plot    <- plot_markers_grid(tem_markers, "Tem")
temra_plot  <- plot_markers_grid(temra_markers, "Temra")
tex_plot    <- plot_markers_grid(tex_markers, "Tex")
cd4ctl_plot <- plot_markers_grid(cd4ctl_markers, "CD4 CTL")

# ---- Display plots ----
tnaive_plot
tcm_plot
tem_plot
temra_plot
tex_plot
cd4ctl_plot

```




# ADT FeaturePlots per differentiation state
```{r,fig.height=8, fig.width=14}
# Tnaive: CD45RA+ CCR7+ CD127+ (naive/memory core)
tnaive_adt <- c("CD45RA-TotalC", "CD197-TotalC", "CD127-TotalC")

# Tcm: CD45RO+ CCR7+ CD127+ (central memory)
tcm_adt <- c("CD45RO-TotalC", "CD197-TotalC", "CD127-TotalC")

# Tem: CD45RO+ CCR7− (effector memory)
tem_adt <- c("CD45RO-TotalC")  # CCR7 low expected

# Temra/CD4CTL: CD45RA+ + exhaustion/effector (PD-1, TIGIT, HLA-DR)
temra_adt <- c("CD45RA-TotalC", "PD-1-TotalC", "TIGIT-TotalC", "HLA-DR-TotalC")

# Treg: CD25hi CD127low ±CD45RA (naive Treg vs activated)
treg_adt <- c("CD25-TotalC", "CD127-TotalC", "CD45RA-TotalC")

# Tex: exhaustion signature
tex_adt <- c("PD-1-TotalC", "TIGIT-TotalC", "HLA-DR-TotalC")


DefaultAssay(reference_integrated) <- "ADT"

plot_markers_grid_ADT <- function(marker_vector, state_name) {
  plots <- lapply(marker_vector, function(prot){
    FeaturePlot(reference_integrated, features = prot, reduction = "umap",
                cols = c("lightblue","red")) +
      ggtitle(prot)
  })
  wrap_plots(plots) + plot_annotation(title = paste(state_name, "ADT expression"))
}

plot_markers_grid_ADT(tnaive_adt, "Tnaive")
plot_markers_grid_ADT(tcm_adt,    "Tcm")
plot_markers_grid_ADT(tem_adt,    "Tem")
plot_markers_grid_ADT(temra_adt,  "Temra/CD4CTL")
plot_markers_grid_ADT(treg_adt,   "Treg")
plot_markers_grid_ADT(tex_adt,    "Tex")


```

## Gating in 10x_S1 (uses *-TotalC)
```{r}
## 10x_S1 ADT gating  (CITE-seq)
DefaultAssay(reference_integrated) <- "ADT"

s1 <- subset(reference_integrated, subset = dataset == "CD4T_10x_S1")
adt_s1 <- LayerData(s1[["ADT"]], layer = "data.CD4T_10x_S1.1")
rownames(adt_s1) <- gsub("-TotalC", "", rownames(adt_s1))  # CD45RA, CD197, ...

adt_z_s1 <- t(scale(t(adt_s1)))

is_hi <- function(g, zmat, z = 0.5) if (g %in% rownames(zmat)) zmat[g, ] > z else rep(FALSE, ncol(zmat))

CD45RA_hi <- is_hi("CD45RA", adt_z_s1)
CD45RO_hi <- is_hi("CD45RO", adt_z_s1)
CCR7_hi   <- is_hi("CD197",  adt_z_s1)
CD127_hi  <- is_hi("CD127",  adt_z_s1)
CD25_hi   <- is_hi("CD25",   adt_z_s1)
PD1_hi    <- is_hi("PD-1",   adt_z_s1)
TIGIT_hi  <- is_hi("TIGIT",  adt_z_s1)
HLADR_hi  <- is_hi("HLA-DR", adt_z_s1)

CCR7_lo  <- !CCR7_hi
CD127_lo <- !CD127_hi

state_s1 <- rep("Other", ncol(adt_s1))

# Treg
treg_idx <- CD25_hi & CD127_lo
state_s1[treg_idx] <- "Treg_ADT"

# Tnaive
tnaive_idx <- CD45RA_hi & CCR7_hi & CD127_hi & !CD45RO_hi & !PD1_hi & !TIGIT_hi
state_s1[tnaive_idx & state_s1 == "Other"] <- "Tnaive_ADT"

# Tcm
tcm_idx <- CD45RO_hi & CCR7_hi & !CD45RA_hi
state_s1[tcm_idx & state_s1 == "Other"] <- "Tcm_ADT"

# Tem
tem_idx <- CD45RO_hi & CCR7_lo & !CD45RA_hi
state_s1[tem_idx & state_s1 == "Other"] <- "Tem_ADT"

# Temra / CD4CTL
temra_idx <- CD45RA_hi & CCR7_lo & (PD1_hi | TIGIT_hi | HLADR_hi)
state_s1[temra_idx & state_s1 == "Other"] <- "Temra_CTL_ADT"

table(state_s1)
```

## Recover ADT for CD4T_lab from original object and align cells
```{r}

# Match cells
lab_cells <- colnames(subset(reference_integrated, dataset == "CD4T_lab"))
lab_raw   <- sub("^PBMC_", "", lab_cells)

adt_lab_raw <- adt_lab_raw[, lab_raw, drop = FALSE]
colnames(adt_lab_raw) <- lab_cells

# Add assay ONCE
reference_integrated[["ADT_lab"]] <- CreateAssay5Object(counts = adt_lab_raw)

```

## Extract ONLY overlapping cells
```{r}

# Create new assay for CD4T_lab ADT data
reference_integrated[["ADT_lab"]] <- CreateAssay5Object(counts = adt_lab_raw)

# Verify it was added
lab <- subset(reference_integrated, subset = dataset == "CD4T_lab")
dim(LayerData(lab[["ADT_lab"]], layer = "counts"))
head(rownames(LayerData(lab[["ADT_lab"]], layer = "counts")))

```

## Create and add ADT_lab assay
```{r}

# Create new assay for CD4T_lab ADT data
reference_integrated[["ADT_lab"]] <- CreateAssay5Object(counts = adt_lab_raw)

# Verify it was added
lab <- subset(reference_integrated, subset = dataset == "CD4T_lab")
dim(LayerData(lab[["ADT_lab"]], layer = "counts"))
head(rownames(LayerData(lab[["ADT_lab"]], layer = "counts")))

```


## Now gate on CD4T_lab cells (works!)
```{r}

# Create new assay for CD4T_lab ADT data
reference_integrated[["ADT_lab"]] <- CreateAssay5Object(counts = adt_lab_raw)

# Verify it was added
lab <- subset(reference_integrated, subset = dataset == "CD4T_lab")
dim(LayerData(lab[["ADT_lab"]], layer = "counts"))
head(rownames(LayerData(lab[["ADT_lab"]], layer = "counts")))

```









## Gating in CD4T_lab (uses markers without -TotalC)
```{r}

DefaultAssay(reference_integrated) <- "ADT"

## CD4T_lab ADT gating  (lab panel)

# Subset lab dataset
lab <- subset(reference_integrated, subset = dataset == "CD4T_lab")

# Get ADT expression (RNA-like markers without -TotalC)
adt_lab <- LayerData(lab[["ADT"]], layer = "data")   # not slot=
# Check rownames:
head(rownames(adt_lab))

# Z-score
adt_z_lab <- t(scale(t(adt_lab)))

is_hi <- function(g, zmat, z = 0.5) {
  if (!g %in% rownames(zmat)) return(rep(FALSE, ncol(zmat)))
  zmat[g, ] > z
}

CD45RA_hi_l <- is_hi("CD45RA", adt_z_lab)
CD45RO_hi_l <- is_hi("CD45RO", adt_z_lab)
CCR7_hi_l   <- is_hi("CCR7",   adt_z_lab)
CD127_hi_l  <- is_hi("CD127",  adt_z_lab)
CD25_hi_l   <- is_hi("CD25",   adt_z_lab)
PD1_hi_l    <- is_hi("PD1",    adt_z_lab)
TIGIT_hi_l  <- is_hi("TIGIT",  adt_z_lab)
HLADR_hi_l  <- is_hi("HLA-DR", adt_z_lab)  # if present

CCR7_lo_l  <- !CCR7_hi_l
CD127_lo_l <- !CD127_hi_l

state_lab <- rep("Other", ncol(adt_lab))

# Treg
treg_idx_l <- CD25_hi_l & CD127_lo_l
state_lab[treg_idx_l] <- "Treg_ADT"

# Tnaive
tnaive_idx_l <- CD45RA_hi_l & CCR7_hi_l & CD127_hi_l & !CD45RO_hi_l & !PD1_hi_l & !TIGIT_hi_l
state_lab[tnaive_idx_l & state_lab == "Other"] <- "Tnaive_ADT"

# Tcm
tcm_idx_l <- CD45RO_hi_l & CCR7_hi_l & !CD45RA_hi_l
state_lab[tcm_idx_l & state_lab == "Other"] <- "Tcm_ADT"

# Tem
tem_idx_l <- CD45RO_hi_l & CCR7_lo_l & !CD45RA_hi_l
state_lab[tem_idx_l & state_lab == "Other"] <- "Tem_ADT"

# Temra/CD4CTL
temra_idx_l <- CD45RA_hi_l & CCR7_lo_l & (PD1_hi_l | TIGIT_hi_l | HLADR_hi_l)
state_lab[temra_idx_l & state_lab == "Other"] <- "Temra_CTL_ADT"

table(state_lab)
```
## Assign per‑cell differentiation state
```{r}
# Collect module score column names (they end with "1")
state_names <- names(marker_list)              # e.g. "Tnaive","Tcm",...
score_cols  <- paste0(state_names, "1")        # "Tnaive1","Tcm1",...

# Build matrix of scores
score_mat <- as.data.frame(reference_integrated@meta.data[, score_cols])

# For each cell, find which state has the highest score
state_max_idx   <- apply(score_mat, 1, which.max)
state_max_label <- state_names[state_max_idx]

# Optionally impose a minimum separation: if top score is not sufficiently
# higher than the second, call the cell "Mixed"
top_vals  <- apply(score_mat, 1, max)
second_vals <- apply(score_mat, 1, function(x) sort(x, decreasing = TRUE)[2])
delta <- top_vals - second_vals
state_max_label[delta < 0.05] <- "Mixed"   # 0.05 is an example threshold

# Store in metadata
reference_integrated$diff_state <- factor(state_max_label,
                                          levels = c("Tnaive","Tcm","Tem","Temra",
                                                     "Treg","Tex","CD4CTL","Mixed"))

# UMAP colored by inferred differentiation state
DimPlot(reference_integrated, group.by = "diff_state", label = TRUE, label.box = T, repel = T) +
  ggtitle("Inferred CD4 T-cell differentiation states")
```



# ADT markers and extraction (10x_S1 already run)
```{r, fig.height=6, fig.width=10}

# o you can do:
# 
# Tnaive: CD3+ CD4+ CD45RA+ CD45RO− CCR7+ CD127+
# 
# Tcm: CD3+ CD4+ CD45RO+ CCR7+ CD45RA−
# 
# Tem: CD3+ CD4+ CD45RO+ CCR7− (or low) CD45RA−
# 
# Temra: CD3+ CD4+ CD45RA+ CD45RO+/− CCR7−



DefaultAssay(reference_integrated) <- "ADT"

adt_markers_raw <- c("CD3-TotalC","CD4-TotalC","CD45RA-TotalC",
                     "CD45RO-TotalC","CD197-TotalC","CD127-TotalC",
                     "CD25-TotalC","HLA-DR-TotalC","PD1-TotalC","TIGIT-TotalC")

FeaturePlot(reference_integrated,
            features = adt_markers_raw,
            reduction = "umap",
            cols = c("lightblue","red"),
            ncol = 3) +
  plot_annotation(title = "Key ADT markers (10x_S1 cells)")
```
# Derive ADT‑based “high/low” categories
```{r}
# Full UMAP, 10x_S1 colored, others grey
p1 <- DimPlot(reference_integrated,
              group.by = "ADT_state2",
              reduction = "umap",
              pt.size   = 0.5) +
  scale_color_manual(values = c("Tnaive_ADT" = "#FF6B6B", "Tcm_ADT" = "#4ECDC4",
                                "Tem_ADT" = "#45B7D1", "Temra_CTL_ADT" = "#96CEB4",
                                "Treg_ADT" = "#FECA57", "Other" = "grey80")) +
  ggtitle("Full UMAP: 10x_S1 colored by ADT, others grey")

p2 <- DimPlot(subset(reference_integrated, dataset == "CD4T_10x_S1"),
              group.by = "ADT_state2",
              reduction = "umap",
              label = TRUE, label.box = TRUE, repel = TRUE) +
  ggtitle("10x_S1 ONLY") +
  NoLegend()

p1 | p2
```

## Logical gating for Tnaive/Tcm/Tem/Temra
```{r}
# Boolean vectors per marker
CD45RA_hi <- is_high("CD45RA", adt_z)
CD45RO_hi <- is_high("CD45RO", adt_z)
CCR7_hi   <- is_high("CD197",  adt_z)   # CCR7
CD127_hi  <- is_high("CD127",  adt_z)

# Gating rules
Tnaive_idx <-  CD45RA_hi &  CCR7_hi & CD127_hi & !CD45RO_hi
Tcm_idx    <- !CD45RA_hi &  CCR7_hi &  CD45RO_hi
Tem_idx    <- !CD45RA_hi & !CCR7_hi &  CD45RO_hi
Temra_idx  <-  CD45RA_hi & !CCR7_hi & (CD45RO_hi | !CD45RO_hi)  # RA+ CCR7−

# Assign state; resolve conflicts by priority
state <- rep("Other", ncol(adt))
state[Tnaive_idx] <- "Tnaive"
state[Tcm_idx]    <- "Tcm"
state[Tem_idx]    <- "Tem"
state[Temra_idx]  <- "Temra"

# Store in metadata (10x_S1 subset)
s1_subset$ADT_state <- state

# Put back into full object
reference_integrated$ADT_state <- NA
reference_integrated$ADT_state[colnames(s1_subset)] <- s1_subset$ADT_state
reference_integrated$ADT_state <- factor(reference_integrated$ADT_state,
                                         levels = c("Tnaive","Tcm","Tem","Temra","Other"))
```

# Visualize ADT‑defined states on your integrated UMAP
```{r Monocle3}
DefaultAssay(reference_integrated) <- "integrated"

DimPlot(reference_integrated,
        group.by = "ADT_state",
        reduction = "umap",
        label = TRUE, label.box = T, repel = T) +
  ggtitle("CD4T_10x_S1 cells annotated by ADT gating")
```



# Monocle3 Trajectory
```{r }
cds <- as.cell_data_set(reference_integrated)

# Transfer UMAP + partitions from Seurat
reducedDim(cds, "UMAP") <- Embeddings(reference_integrated, "umap")
cds@clusters$UMAP$partitions <- factor(rep(1, ncol(cds)))
cds@clusters$UMAP$clusters   <- reference_integrated$seurat_clusters
colData(cds)$celltype         <- reference_integrated$predicted.celltype.l2
colData(cds)$dataset          <- reference_integrated$dataset
```

```{r learn-graph}
cds <- learn_graph(cds,
                   use_partition = FALSE,
                   close_loop    = FALSE,
                   verbose       = FALSE)

plot_cells(cds,
           color_cells_by          = "celltype",
           label_groups_by_cluster = FALSE,
           label_leaves            = TRUE,
           label_branch_points     = TRUE,
           graph_label_size        = 4) +
  ggtitle("Monocle3 — CD4 T-cell Trajectory Milestones")


```

```{r pseudotime}
# Root = CD4 Naive centroid
naive_ids <- rownames(colData(cds))[
  colData(cds)$celltype %in% c("CD4 Naive")
]

get_root_node <- function(cds, cell_ids) {
  closest <- cds@principal_graph_aux$UMAP$pr_graph_cell_proj_closest_vertex
  closest <- as.matrix(closest[colnames(cds), ])
  igraph::V(principal_graph(cds)$UMAP)$name[
    as.numeric(names(which.max(table(closest[cell_ids, ]))))]
}

cds <- order_cells(cds, root_pr_nodes = get_root_node(cds, naive_ids))
reference_integrated$monocle3_pseudotime <- pseudotime(cds)

plot_cells(cds,
           color_cells_by    = "pseudotime",
           label_cell_groups = FALSE) +
  scale_color_viridis_c(option = "plasma") +
  ggtitle("Pseudotime: Naive → Memory → Effector → Treg")
```

---

# Save Reference (before MapQuery)

```{r saveref-final}
saveRDS(reference_integrated,
        "CD4_reference_RPCA_SCT_Trajectory_annotated.rds")

cat("Reference saved:", ncol(reference_integrated), "cells\n")

```




























```




