1 Load libraries

2 Load Annotated Healthy Object

reference_integrated <- readRDS("../CD4_reference_RPCA_SCT_integrated.rds")


cat("Healthy Reference Integrated CD4 cells:", ncol(reference_integrated), "\n")
Healthy Reference Integrated CD4 cells: 12034 
print(table(reference_integrated$predicted.celltype.l2, reference_integrated$dataset))
                   
                    CD4T_10x_S1 CD4T_10x_S2 CD4T_lab
  CD4 CTL                     1           0       11
  CD4 Naive                1367         381      359
  CD4 Proliferating           9           4        0
  CD4 TCM                  2010        2856     4654
  CD4 TEM                    29          51       73
  Treg                       87         137        5
print(table(reference_integrated$predicted.celltype.l3, reference_integrated$dataset))
                   
                    CD4T_10x_S1 CD4T_10x_S2 CD4T_lab
  CD4 CTL                     1           0       11
  CD4 Naive                1370         382      363
  CD4 Proliferating           9           4        0
  CD4 TCM_1                1518        1859     4181
  CD4 TCM_2                 113         148      248
  CD4 TCM_3                 370         849      226
  CD4 TEM_1                  12          26        8
  CD4 TEM_2                  19          11       13
  CD4 TEM_3                   2           7       49
  Treg Memory                58         134        3
  Treg Naive                 31           9        0
DefaultAssay(reference_integrated) <- "RNA"
reference_integrated <- JoinLayers(reference_integrated)

reference_integrated
An object of class Seurat 
57742 features across 12034 samples within 7 assays 
Active assay: RNA (36601 features, 2902 variable features)
 3 layers present: scale.data, data, counts
 6 other assays present: ADT, prediction.score.celltype.l1, prediction.score.celltype.l2, prediction.score.celltype.l3, SCT, integrated
 2 dimensional reductions calculated: pca, umap

3 Doublet detection and removal

DefaultAssay(reference_integrated) <- "RNA"

# Convert to SingleCellExperiment
sce <- as.SingleCellExperiment(reference_integrated, assay = "RNA")

# Run scDblFinder with cluster-aware mode (recommended for droplet data)[3][4]
sce <- scDblFinder(sce)

table(sce$scDblFinder.class)

singlet doublet 
  11539     495 

3.1 Doublet detection and removal

library(dplyr)
# Explore results and add to seurat object
meta_scdblfinder <- sce@colData@listData %>% as.data.frame() %>% 
  dplyr::select(starts_with('scDblFinder')) # 'scDblFinder.class')
head(meta_scdblfinder)

3.2 Bring calls back into Seurat and inspect cluster 9

# Bring doublet calls back to Seurat
reference_integrated$scDblFinder.class <- sce$scDblFinder.class[
  match(colnames(reference_integrated), colnames(sce))
]
reference_integrated$scDblFinder.score <- sce$scDblFinder.score[
  match(colnames(reference_integrated), colnames(sce))
]

# Inspect enrichment per cluster
table(reference_integrated$seurat_clusters,
      reference_integrated$scDblFinder.class)
   
    singlet doublet
  0    5325     154
  1    2907     175
  2    1314      37
  3     530      44
  4     419      29
  5     314      17
  6     307       9
  7     275      10
  8      93       2
  9      55      18
DimPlot(reference_integrated, group.by = "scDblFinder.class") +
  ggtitle("scDblFinder doublet calls on integrated UMAP")

# Doublet stats
# Check how doublets singlets differ in QC measures per sample.
VlnPlot(reference_integrated, group.by = 'dataset', split.by = "scDblFinder.class",
        features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), 
        ncol = 4, pt.size = 0) + theme(legend.position = 'right')

doublets_summary <- reference_integrated@meta.data %>% 
  group_by(dataset, scDblFinder.class) %>% 
  summarise(total_count = n(),.groups = 'drop') %>% as.data.frame() %>% ungroup() %>%
  group_by(dataset) %>%
  mutate(countT = sum(total_count)) %>%
  group_by(scDblFinder.class, .add = TRUE) %>%
  mutate(percent = paste0(round(100 * total_count/countT, 2),'%')) %>%
  dplyr::select(-countT)
doublets_summary
write.table(doublets_summary, file = "doublet_folder/scDblFinder_doublets_summary.txt", quote = FALSE, row.names = FALSE, sep = '\t')

4 Keep singlets only and recluster

# Keep singlets only and recluster
reference_singlets <- subset(reference_integrated,
                             subset = scDblFinder.class == "singlet")

reference_singlets <- FindClusters(reference_singlets, 
                                   graph.name = "integrated_snn",
                                   resolution = 0.5)
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck

Number of nodes: 11539
Number of edges: 398563

Running Louvain algorithm...
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.8335
Number of communities: 13
Elapsed time: 1 seconds
reference_singlets <- RunUMAP(reference_singlets, dims = 1:15)
Using method 'umap'
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
DimPlot(reference_singlets, group.by = "seurat_clusters") +
  ggtitle("Reclustered singlets using integrated_snn")

5 Validation plots after doublet removal

DefaultAssay(reference_singlets) <- "RNA"

p1 <- DimPlot(reference_singlets, group.by = "predicted.celltype.l2",
              label = TRUE, repel = TRUE, label.size = 3) + NoLegend()
p2 <- DimPlot(reference_singlets, group.by = "dataset")
p3 <- DimPlot(reference_singlets, group.by = "seurat_clusters", label = TRUE)
(p1 | p2) / p3

5.1 Validation plots

DimPlot(reference_singlets, group.by = "predicted.celltype.l1",
        label = TRUE, repel = TRUE, label.size = 3) + NoLegend()

DimPlot(reference_singlets, group.by = "predicted.celltype.l2",
        label = TRUE, repel = TRUE, label.size = 3) + NoLegend()

DimPlot(reference_singlets, group.by = "predicted.celltype.l3",
        label = TRUE, repel = TRUE, label.size = 3) + NoLegend()


print(table(reference_singlets$predicted.celltype.l1, reference_singlets$seurat_clusters))
       
           0    1    2    3    4    5    6    7    8    9   10   11   12
  CD4 T 2748 2178 1712 1497  860  528  524  426  316  307  278  104   55
  CD8 T    0    0    0    0    0    5    0    0    0    0    1    0    0
print(table(reference_singlets$predicted.celltype.l2, reference_singlets$seurat_clusters))
                   
                       0    1    2    3    4    5    6    7    8    9   10   11   12
  CD4 CTL              0    0    0    0    0   11    0    0    0    0    0    0    1
  CD4 Naive            0 1010  400  599    0    0    0   14    6    1   12   22    0
  CD4 Proliferating    1    0    0    0    0   11    0    0    0    0    0    0    0
  CD4 TCM           2731 1159 1305  881  858  377  523  411  308  128  264   82   52
  CD4 TEM              6    1    0    0    2  134    0    0    1    0    0    0    0
  Treg                10    8    7   17    0    0    1    1    1  178    3    0    2
print(table(reference_singlets$predicted.celltype.l3, reference_singlets$seurat_clusters))
                   
                       0    1    2    3    4    5    6    7    8    9   10   11   12
  CD4 CTL              0    0    0    0    0   11    0    0    0    0    0    0    1
  CD4 Naive            1 1013  402  599    0    0    0   15    6    1   12   23    0
  CD4 Proliferating    1    0    0    0    0   11    0    0    0    0    0    0    0
  CD4 TCM_1         2283 1151 1303  878  200   77  198  392  300   53  245   73   39
  CD4 TCM_2          258    1    1    0   13   50   52   16    0   70   10    5    6
  CD4 TCM_3          186    3    1    4  645  259  270    2    8    3    6    3    7
  CD4 TEM_1            1    0    0    0    0   41    0    0    0    0    0    0    0
  CD4 TEM_2            6    1    0    0    0   32    1    0    0    0    0    0    0
  CD4 TEM_3            0    0    0    0    2   52    1    0    1    0    0    0    0
  Treg Memory          9    5    1    1    0    0    2    1    1  166    6    0    2
  Treg Naive           3    4    4   15    0    0    0    0    0   14    0    0    0

6 Save Reference (before MapQuery)

# Save
saveRDS(reference_singlets, "../CD4_reference_RPCA_SCT_integrated_doublets_removed.rds")
cat("✅ COMPLETE:", ncol(reference_singlets), "cells integrated\n")
✅ COMPLETE: 11539 cells integrated

7 RNA marker module scores (differentiation states)


DefaultAssay(reference_singlets) <- "RNA"

marker_list <- list(
  Tnaive = c("CCR7","SELL","LEF1","TCF7","IL7R","CD27","PTPRC"),
  Tcm    = c("CCR7","SELL","CD27","IL7R","BCL2","TCF7"),
  Tem    = c("CCR6","CXCR3","GZMK","PRF1","IFNG"),
  Temra  = c("GZMB","PRF1","KLRG1","CX3CR1"),
  Treg   = c("FOXP3","IL2RA","CTLA4","IKZF2"),
  Tex    = c("PDCD1","CTLA4","LAG3","TIGIT","TOX","ENTPD1"),
  CD4CTL = c("GZMB","PRF1","NKG7","KLRG1","CX3CR1")
)

marker_list <- lapply(marker_list, function(x)
  intersect(x, rownames(reference_singlets))
)
marker_list <- marker_list[vapply(marker_list, length, 1L) > 0]

for (state in names(marker_list)) {
  reference_singlets <- AddModuleScore(
    reference_singlets,
    features = list(marker_list[[state]]),
    name     = state
  )
}

score_features <- paste0(names(marker_list), "1")

FeaturePlot(reference_singlets,
            features   = score_features,
            ncol       = 4,
            cols       = c("lightblue","red"),
            max.cutoff = "q95") +
  plot_annotation(title = "Differentiation State Module Scores")

7.1 FeaturePlots Markers based

tnaive_plot
tcm_plot

tem_plot

temra_plot

tex_plot

cd4ctl_plot

8 ADT feature plots and gating (CD4T_10x_S1 ONLY)

plot_markers_grid_ADT(tnaive_adt, "Tnaive")
plot_markers_grid_ADT(tcm_adt,    "Tcm")

plot_markers_grid_ADT(tem_adt,    "Tem")

plot_markers_grid_ADT(temra_adt,  "Temra/CD4CTL")

plot_markers_grid_ADT(treg_adt,   "Treg")

plot_markers_grid_ADT(tex_adt,    "Tex")

8.1 Visualize ADT-defined states on UMAP

DefaultAssay(reference_singlets) <- "ADT"

s1 <- subset(reference_singlets, subset = dataset == "CD4T_10x_S1")

adt_s1 <- GetAssayData(s1, assay = "ADT", layer = "data")
rownames(adt_s1) <- gsub("-TotalC","",rownames(adt_s1))

adt_z_s1 <- t(scale(t(adt_s1)))

is_hi <- function(g, zmat, z = 0.5)
  if (g %in% rownames(zmat)) zmat[g, ] > z else rep(FALSE, ncol(zmat))

CD45RA_hi <- is_hi("CD45RA", adt_z_s1)
CD45RO_hi <- is_hi("CD45RO", adt_z_s1)
CCR7_hi   <- is_hi("CD197",  adt_z_s1)
CD127_hi  <- is_hi("CD127",  adt_z_s1)
CD25_hi   <- is_hi("CD25",   adt_z_s1)
PD1_hi    <- is_hi("PD-1",   adt_z_s1)
TIGIT_hi  <- is_hi("TIGIT",  adt_z_s1)
HLADR_hi  <- is_hi("HLA-DR", adt_z_s1)

CCR7_lo  <- !CCR7_hi
CD127_lo <- !CD127_hi

state_s1 <- rep("Other", ncol(adt_s1))

treg_idx   <- CD25_hi & CD127_lo
state_s1[treg_idx] <- "Treg_ADT"

tnaive_idx <- CD45RA_hi & CCR7_hi & CD127_hi & !CD45RO_hi & !PD1_hi & !TIGIT_hi
state_s1[tnaive_idx & state_s1 == "Other"] <- "Tnaive_ADT"

tcm_idx    <- CD45RO_hi & CCR7_hi & !CD45RA_hi
state_s1[tcm_idx & state_s1 == "Other"] <- "Tcm_ADT"

tem_idx    <- CD45RO_hi & CCR7_lo & !CD45RA_hi
state_s1[tem_idx & state_s1 == "Other"] <- "Tem_ADT"

temra_idx  <- CD45RA_hi & CCR7_lo & (PD1_hi | TIGIT_hi | HLADR_hi)
state_s1[temra_idx & state_s1 == "Other"] <- "Temra_CTL_ADT"

table(state_s1)
state_s1
        Other       Tcm_ADT       Tem_ADT Temra_CTL_ADT    Tnaive_ADT      Treg_ADT 
         1722            94           672           233            98           579 
# Make sure ADT_state is character BEFORE filling
reference_singlets$ADT_state <- NA_character_

# Store on Seurat object
s1$ADT_state <- state_s1
reference_singlets$ADT_state[colnames(s1)] <- s1$ADT_state

cat(
  "ADT-based phenotyping was restricted to the CD4T_10x_S1 dataset, ",
  "which was generated with a uniform antibody panel and normalization strategy. ",
  "These ADT-derived states were used to validate and calibrate RNA-based ",
  "differentiation signatures, which were subsequently applied across all samples.\n"
)
ADT-based phenotyping was restricted to the CD4T_10x_S1 dataset,  which was generated with a uniform antibody panel and normalization strategy.  These ADT-derived states were used to validate and calibrate RNA-based  differentiation signatures, which were subsequently applied across all samples.

9 Assign per-cell RNA differentiation state

DefaultAssay(reference_singlets) <- "RNA"

state_names <- names(marker_list)
score_cols  <- paste0(state_names, "1")

score_mat <- as.data.frame(reference_singlets@meta.data[, score_cols])

state_max_idx   <- apply(score_mat, 1, which.max)
state_max_label <- state_names[state_max_idx]

top_vals    <- apply(score_mat, 1, max)
second_vals <- apply(score_mat, 1, function(x) sort(x, decreasing = TRUE))[3]
delta <- top_vals - second_vals
state_max_label[delta < 0.05] <- "Mixed"

reference_singlets$diff_state <- factor(
  state_max_label,
  levels = c("Tnaive","Tcm","Tem","Temra","Treg","Tex","CD4CTL","Mixed")
)

DimPlot(reference_singlets, group.by = "diff_state",
        label = TRUE, label.box = TRUE, repel = TRUE) +
  ggtitle("Inferred CD4 T-cell differentiation states (RNA)")

10 Visualize ADT-defined states on UMAP

DefaultAssay(reference_singlets) <- "integrated"

DimPlot(reference_singlets,
        group.by = "ADT_state",
        reduction = "umap",
        label = TRUE, label.box = TRUE, repel = TRUE) +
  ggtitle("CD4T_10x_S1 cells annotated by ADT gating")

11 Definitive macrophage contamination markers for cluster 12:

FeaturePlot(reference_singlets, 
            features = c("ASGR2", "SIRPB2", "TREM1", "CD3E", "CD4"),
            ncol = 3, split.by = "seurat_clusters")

12 Seurat DoHeatmap: cleaner and shows patterns better

# Compare cluster 12 vs all other clusters
Idents(reference_singlets) <- "seurat_clusters"

cluster12_markers <- FindMarkers(reference_singlets,
                                 ident.1 = "12",
                                 ident.2 = NULL,  # vs all others
                                 logfc.threshold = 0.25,
                                 assay = "RNA")

head(cluster12_markers, n = 20)

# Top positive markers only
cluster12_pos <- cluster12_markers %>% 
  filter(avg_log2FC > 0.5, p_val_adj < 0.05) %>% 
  arrange(desc(avg_log2FC))
head(cluster12_pos, 15)

13 Seurat DoHeatmap: cleaner and shows patterns better



# Publication violin plot
VlnPlot(reference_singlets,
        features = c("ASGR2", "AQP9", "VSIG4"),
        group.by = "seurat_clusters",
        stack = TRUE,flip = T,
        pt.size = 0.1) +
  ggtitle("Cluster 12 = Hepatic Memory CD4 (ASGR2+/AQP9+)")

14 Run this after reclustering and before RNA heatmap:

```

---
title: "CD4 T-cell Reference + Annotation + Doublet"
subtitle: "Doublet | ADT | Annotation"
author: Nasir Mahmood Abbasi
date: "`r Sys.Date()`"
output:
  html_notebook:
    number_sections: true
    toc: true
    toc_float:
      collapsed: true
    theme: journal
---


# Load libraries
```{r, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE,
                      fig.width = 12, fig.height = 8)
library(Seurat)
library(monocle3)
library(SeuratWrappers)
library(patchwork)
library(ggplot2)
library(RColorBrewer)
library(SingleCellExperiment)
library(scDblFinder)
library(dplyr)
options(future.globals.maxSize = 8e9)
set.seed(123)

```


# Load Annotated Healthy Object
```{r }
reference_integrated <- readRDS("../CD4_reference_RPCA_SCT_integrated.rds")


cat("Healthy Reference Integrated CD4 cells:", ncol(reference_integrated), "\n")
print(table(reference_integrated$predicted.celltype.l2, reference_integrated$dataset))
print(table(reference_integrated$predicted.celltype.l3, reference_integrated$dataset))

DefaultAssay(reference_integrated) <- "RNA"
reference_integrated <- JoinLayers(reference_integrated)

reference_integrated

```

# Doublet detection and removal
```{r}
DefaultAssay(reference_integrated) <- "RNA"

# Convert to SingleCellExperiment
sce <- as.SingleCellExperiment(reference_integrated, assay = "RNA")

# Run scDblFinder with cluster-aware mode (recommended for droplet data)[3][4]
sce <- scDblFinder(sce)

table(sce$scDblFinder.class)
```


## Doublet detection and removal
```{r }
library(dplyr)
# Explore results and add to seurat object
meta_scdblfinder <- sce@colData@listData %>% as.data.frame() %>% 
  dplyr::select(starts_with('scDblFinder')) # 'scDblFinder.class')
head(meta_scdblfinder)
```


## Bring calls back into Seurat and inspect cluster 9
```{r }
# Bring doublet calls back to Seurat
reference_integrated$scDblFinder.class <- sce$scDblFinder.class[
  match(colnames(reference_integrated), colnames(sce))
]
reference_integrated$scDblFinder.score <- sce$scDblFinder.score[
  match(colnames(reference_integrated), colnames(sce))
]

# Inspect enrichment per cluster
table(reference_integrated$seurat_clusters,
      reference_integrated$scDblFinder.class)

DimPlot(reference_integrated, group.by = "scDblFinder.class") +
  ggtitle("scDblFinder doublet calls on integrated UMAP")
```


```{r, fig.height=6, fig.width=10}
# Doublet stats
# Check how doublets singlets differ in QC measures per sample.
VlnPlot(reference_integrated, group.by = 'dataset', split.by = "scDblFinder.class",
        features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), 
        ncol = 4, pt.size = 0) + theme(legend.position = 'right')

```

```{r }
doublets_summary <- reference_integrated@meta.data %>% 
  group_by(dataset, scDblFinder.class) %>% 
  summarise(total_count = n(),.groups = 'drop') %>% as.data.frame() %>% ungroup() %>%
  group_by(dataset) %>%
  mutate(countT = sum(total_count)) %>%
  group_by(scDblFinder.class, .add = TRUE) %>%
  mutate(percent = paste0(round(100 * total_count/countT, 2),'%')) %>%
  dplyr::select(-countT)
doublets_summary
write.table(doublets_summary, file = "doublet_folder/scDblFinder_doublets_summary.txt", quote = FALSE, row.names = FALSE, sep = '\t')
```

# Keep singlets only and recluster
```{r }
# Keep singlets only and recluster
reference_singlets <- subset(reference_integrated,
                             subset = scDblFinder.class == "singlet")

reference_singlets <- FindClusters(reference_singlets, 
                                   graph.name = "integrated_snn",
                                   resolution = 0.5)

reference_singlets <- RunUMAP(reference_singlets, dims = 1:15)

DimPlot(reference_singlets, group.by = "seurat_clusters") +
  ggtitle("Reclustered singlets using integrated_snn")
```


# Validation plots after doublet removal
```{r validate, fig.width=14, fig.height=10}
DefaultAssay(reference_singlets) <- "RNA"

p1 <- DimPlot(reference_singlets, group.by = "predicted.celltype.l2",
              label = TRUE, repel = TRUE, label.size = 3) + NoLegend()
p2 <- DimPlot(reference_singlets, group.by = "dataset")
p3 <- DimPlot(reference_singlets, group.by = "seurat_clusters", label = TRUE)
(p1 | p2) / p3
```

## Validation plots
```{r , fig.width=12, fig.height=8}
DimPlot(reference_singlets, group.by = "predicted.celltype.l1",
        label = TRUE, repel = TRUE, label.size = 3) + NoLegend()
DimPlot(reference_singlets, group.by = "predicted.celltype.l2",
        label = TRUE, repel = TRUE, label.size = 3) + NoLegend()
DimPlot(reference_singlets, group.by = "predicted.celltype.l3",
        label = TRUE, repel = TRUE, label.size = 3) + NoLegend()

print(table(reference_singlets$predicted.celltype.l1, reference_singlets$seurat_clusters))
print(table(reference_singlets$predicted.celltype.l2, reference_singlets$seurat_clusters))
print(table(reference_singlets$predicted.celltype.l3, reference_singlets$seurat_clusters))
```










# Save Reference (before MapQuery)
```{r save-ref}
# Save
saveRDS(reference_singlets, "../CD4_reference_RPCA_SCT_integrated_doublets_removed.rds")
cat("✅ COMPLETE:", ncol(reference_singlets), "cells integrated\n")


```



# RNA marker module scores (differentiation states)
```{r markersscore, fig.width=16, fig.height=8}

DefaultAssay(reference_singlets) <- "RNA"

marker_list <- list(
  Tnaive = c("CCR7","SELL","LEF1","TCF7","IL7R","CD27","PTPRC"),
  Tcm    = c("CCR7","SELL","CD27","IL7R","BCL2","TCF7"),
  Tem    = c("CCR6","CXCR3","GZMK","PRF1","IFNG"),
  Temra  = c("GZMB","PRF1","KLRG1","CX3CR1"),
  Treg   = c("FOXP3","IL2RA","CTLA4","IKZF2"),
  Tex    = c("PDCD1","CTLA4","LAG3","TIGIT","TOX","ENTPD1"),
  CD4CTL = c("GZMB","PRF1","NKG7","KLRG1","CX3CR1")
)

marker_list <- lapply(marker_list, function(x)
  intersect(x, rownames(reference_singlets))
)
marker_list <- marker_list[vapply(marker_list, length, 1L) > 0]

for (state in names(marker_list)) {
  reference_singlets <- AddModuleScore(
    reference_singlets,
    features = list(marker_list[[state]]),
    name     = state
  )
}

score_features <- paste0(names(marker_list), "1")

FeaturePlot(reference_singlets,
            features   = score_features,
            ncol       = 4,
            cols       = c("lightblue","red"),
            max.cutoff = "q95") +
  plot_annotation(title = "Differentiation State Module Scores")

```


## FeaturePlots Markers based
```{r,fig.height=8, fig.width=12}
# ---- Load libraries ----
library(Seurat)
library(ggplot2)
library(patchwork)

# ---- Define markers for differentiation states ----
tnaive_markers <- c("CCR7","SELL","LEF1","TCF7","IL7R","CD27","PTPRC")
tcm_markers    <- c("CCR7","SELL","CD45RO","IL7R","CD27")
tem_markers    <- c("CCR6","CXCR3","GZMK","PRF1","IFNG")
temra_markers  <- c("GZMB","PRF1","KLRG1","CX3CR1","CD45RA")
tex_markers    <- c("PDCD1","CTLA4","LAG3","TIGIT","TOX","ENTPD1")
cd4ctl_markers <- c("GZMB","PRF1","NKG7","KLRG1","CX3CR1")

keep <- function(v) v[v %in% rownames(reference_singlets)]
tnaive_markers <- keep(tnaive_markers)
tcm_markers    <- keep(tcm_markers)
tem_markers    <- keep(tem_markers)
temra_markers  <- keep(temra_markers)
tex_markers    <- keep(tex_markers)
cd4ctl_markers <- keep(cd4ctl_markers)

plot_markers_grid <- function(marker_vector, state_name) {
  plots <- lapply(marker_vector, function(gene){
    FeaturePlot(reference_singlets, features = gene, reduction = "umap",
                cols = c("lightblue","red"), label = TRUE) +
      theme(plot.title = element_text(hjust = 0.5))
  })
  wrap_plots(plots) + plot_annotation(title = paste(state_name, "Marker Expression"))
}

tnaive_plot <- plot_markers_grid(tnaive_markers, "Tnaive")
tcm_plot    <- plot_markers_grid(tcm_markers, "Tcm")
tem_plot    <- plot_markers_grid(tem_markers, "Tem")
temra_plot  <- plot_markers_grid(temra_markers, "Temra")
tex_plot    <- plot_markers_grid(tex_markers, "Tex")
cd4ctl_plot <- plot_markers_grid(cd4ctl_markers, "CD4 CTL")

tnaive_plot
tcm_plot
tem_plot
temra_plot
tex_plot
cd4ctl_plot

```




# ADT feature plots and gating (CD4T_10x_S1 ONLY)
```{r,fig.height=8, fig.width=18}
DefaultAssay(reference_singlets) <- "ADT"

tnaive_adt <- c("CD45RA-TotalC","CD197-TotalC","CD127-TotalC")
tcm_adt    <- c("CD45RO-TotalC","CD197-TotalC","CD127-TotalC")
tem_adt    <- c("CD45RO-TotalC")
temra_adt  <- c("CD45RA-TotalC","PD-1-TotalC","TIGIT-TotalC","HLA-DR-TotalC")
treg_adt   <- c("CD25-TotalC","CD127-TotalC","CD45RA-TotalC")
tex_adt    <- c("PD-1-TotalC","TIGIT-TotalC","HLA-DR-TotalC")

plot_markers_grid_ADT <- function(marker_vector, state_name) {
  plots <- lapply(marker_vector, function(prot){
    FeaturePlot(reference_singlets, features = prot, reduction = "umap",
                cols = c("lightblue","red")) +
      ggtitle(prot)
  })
  wrap_plots(plots) + plot_annotation(title = paste(state_name, "ADT expression"))
}

plot_markers_grid_ADT(tnaive_adt, "Tnaive")
plot_markers_grid_ADT(tcm_adt,    "Tcm")
plot_markers_grid_ADT(tem_adt,    "Tem")
plot_markers_grid_ADT(temra_adt,  "Temra/CD4CTL")
plot_markers_grid_ADT(treg_adt,   "Treg")
plot_markers_grid_ADT(tex_adt,    "Tex")

```
## Visualize ADT-defined states on UMAP
```{r,fig.height=8, fig.width=14}
DefaultAssay(reference_singlets) <- "ADT"

s1 <- subset(reference_singlets, subset = dataset == "CD4T_10x_S1")

adt_s1 <- GetAssayData(s1, assay = "ADT", layer = "data")
rownames(adt_s1) <- gsub("-TotalC","",rownames(adt_s1))

adt_z_s1 <- t(scale(t(adt_s1)))

is_hi <- function(g, zmat, z = 0.5)
  if (g %in% rownames(zmat)) zmat[g, ] > z else rep(FALSE, ncol(zmat))

CD45RA_hi <- is_hi("CD45RA", adt_z_s1)
CD45RO_hi <- is_hi("CD45RO", adt_z_s1)
CCR7_hi   <- is_hi("CD197",  adt_z_s1)
CD127_hi  <- is_hi("CD127",  adt_z_s1)
CD25_hi   <- is_hi("CD25",   adt_z_s1)
PD1_hi    <- is_hi("PD-1",   adt_z_s1)
TIGIT_hi  <- is_hi("TIGIT",  adt_z_s1)
HLADR_hi  <- is_hi("HLA-DR", adt_z_s1)

CCR7_lo  <- !CCR7_hi
CD127_lo <- !CD127_hi

state_s1 <- rep("Other", ncol(adt_s1))

treg_idx   <- CD25_hi & CD127_lo
state_s1[treg_idx] <- "Treg_ADT"

tnaive_idx <- CD45RA_hi & CCR7_hi & CD127_hi & !CD45RO_hi & !PD1_hi & !TIGIT_hi
state_s1[tnaive_idx & state_s1 == "Other"] <- "Tnaive_ADT"

tcm_idx    <- CD45RO_hi & CCR7_hi & !CD45RA_hi
state_s1[tcm_idx & state_s1 == "Other"] <- "Tcm_ADT"

tem_idx    <- CD45RO_hi & CCR7_lo & !CD45RA_hi
state_s1[tem_idx & state_s1 == "Other"] <- "Tem_ADT"

temra_idx  <- CD45RA_hi & CCR7_lo & (PD1_hi | TIGIT_hi | HLADR_hi)
state_s1[temra_idx & state_s1 == "Other"] <- "Temra_CTL_ADT"

table(state_s1)

# Make sure ADT_state is character BEFORE filling
reference_singlets$ADT_state <- NA_character_

# Store on Seurat object
s1$ADT_state <- state_s1
reference_singlets$ADT_state[colnames(s1)] <- s1$ADT_state

cat(
  "ADT-based phenotyping was restricted to the CD4T_10x_S1 dataset, ",
  "which was generated with a uniform antibody panel and normalization strategy. ",
  "These ADT-derived states were used to validate and calibrate RNA-based ",
  "differentiation signatures, which were subsequently applied across all samples.\n"
)

```
# Assign per-cell RNA differentiation state
```{r,fig.height=8, fig.width=14}
DefaultAssay(reference_singlets) <- "RNA"

state_names <- names(marker_list)
score_cols  <- paste0(state_names, "1")

score_mat <- as.data.frame(reference_singlets@meta.data[, score_cols])

state_max_idx   <- apply(score_mat, 1, which.max)
state_max_label <- state_names[state_max_idx]

top_vals    <- apply(score_mat, 1, max)
second_vals <- apply(score_mat, 1, function(x) sort(x, decreasing = TRUE))[3]
delta <- top_vals - second_vals
state_max_label[delta < 0.05] <- "Mixed"

reference_singlets$diff_state <- factor(
  state_max_label,
  levels = c("Tnaive","Tcm","Tem","Temra","Treg","Tex","CD4CTL","Mixed")
)

DimPlot(reference_singlets, group.by = "diff_state",
        label = TRUE, label.box = TRUE, repel = TRUE) +
  ggtitle("Inferred CD4 T-cell differentiation states (RNA)")

```

# Visualize ADT-defined states on UMAP
```{r,fig.height=8, fig.width=14}
DefaultAssay(reference_singlets) <- "integrated"

DimPlot(reference_singlets,
        group.by = "ADT_state",
        reduction = "umap",
        label = TRUE, label.box = TRUE, repel = TRUE) +
  ggtitle("CD4T_10x_S1 cells annotated by ADT gating")
```



# Definitive macrophage contamination markers for cluster 12:
```{r, fig.height=20, fig.width=24 }
FeaturePlot(reference_singlets, 
            features = c("ASGR2", "SIRPB2", "TREM1", "CD3E", "CD4"),
            ncol = 3, split.by = "seurat_clusters")

```



# Seurat DoHeatmap: cleaner and shows patterns better
```{r, fig.height=8, fig.width=14  }
# Compare cluster 12 vs all other clusters
Idents(reference_singlets) <- "seurat_clusters"

cluster12_markers <- FindMarkers(reference_singlets,
                                 ident.1 = "12",
                                 ident.2 = NULL,  # vs all others
                                 logfc.threshold = 0.25,
                                 assay = "RNA")

head(cluster12_markers, n = 20)

# Top positive markers only
cluster12_pos <- cluster12_markers %>% 
  filter(avg_log2FC > 0.5, p_val_adj < 0.05) %>% 
  arrange(desc(avg_log2FC))
head(cluster12_pos, 15)

# Publication violin plot
VlnPlot(reference_singlets,
        features = c("ASGR2", "AQP9", "VSIG4"),
        group.by = "seurat_clusters",
        stack = TRUE,
        pt.size = 0) +
  ggtitle("Cluster 12 = Hepatic Memory CD4 (ASGR2+/AQP9+)")

```
# Seurat DoHeatmap: cleaner and shows patterns better
```{r, fig.height=8, fig.width=14  }


# Publication violin plot
VlnPlot(reference_singlets,
        features = c("ASGR2", "AQP9", "VSIG4"),
        group.by = "seurat_clusters",
        stack = TRUE,flip = T,
        pt.size = 0.1) +
  ggtitle("Cluster 12 = Hepatic Memory CD4 (ASGR2+/AQP9+)")

```
# Run this after reclustering and before RNA heatmap:
```{r, fig.height=8, fig.width=14  }
DefaultAssay(reference_singlets) <- "RNA"

Idents(reference_singlets) <- "seurat_clusters"

cluster_markers <- FindAllMarkers(reference_singlets,
                                  assay = "RNA",
                                  only.pos = TRUE,
                                  logfc.threshold = 0.25,
                                  min.pct = 0.1)

# Top 5 markers per cluster
top5_markers <- cluster_markers %>% 
  group_by(cluster) %>% 
  slice_max(avg_log2FC, n = 5) %>% 
  ungroup()

# Save for later
write.table(cluster_markers, 
            "cluster_markers_doublets_removed.txt",
            quote = FALSE, sep = "\t", row.names = FALSE)

top5_markers

# Save for later
write.table(top5_markers, 
            "cluster_top5_markers_doublets_removed.txt",
            quote = FALSE, sep = "\t", row.names = FALSE)

```


























```




