1 Load libraries

2 Load Annotated Healthy Integrated Object

DefaultAssay(reference_integrated) <- "RNA"

reference_integrated
An object of class Seurat 
57379 features across 11482 samples within 7 assays 
Active assay: RNA (36601 features, 2902 variable features)
 3 layers present: scale.data, data, counts
 6 other assays present: ADT, prediction.score.celltype.l1, prediction.score.celltype.l2, prediction.score.celltype.l3, SCT, integrated
 2 dimensional reductions calculated: pca, umap

2.2 Silhouette Score (Quantitative Optimum)

library(cluster)

# Extract PCA embeddings used for clustering
pca_embeddings <- Embeddings(reference_integrated, "pca")[, 1:20]
dist_matrix    <- dist(pca_embeddings)

res_cols <- grep("integrated_snn_res", 
                  colnames(reference_integrated@meta.data), value = TRUE)

sil_scores <- sapply(res_cols, function(res) {
  clusters <- as.integer(reference_integrated@meta.data[[res]])
  # Need at least 2 clusters
  if (length(unique(clusters)) < 2) return(NA)
  sil <- silhouette(clusters, dist_matrix)
  mean(sil[, 3])  # mean silhouette width
})

# Print ranked results
sil_df <- data.frame(
  resolution      = res_cols,
  n_clusters      = sapply(res_cols, function(r) 
                     length(unique(reference_integrated@meta.data[[r]]))),
  mean_silhouette = round(sil_scores, 4)
)
print(sil_df[order(-sil_df$mean_silhouette), ])
                                   resolution n_clusters mean_silhouette
integrated_snn_res.0.2 integrated_snn_res.0.2          5          0.1862
integrated_snn_res.0.3 integrated_snn_res.0.3          7          0.1791
integrated_snn_res.0.1 integrated_snn_res.0.1          2          0.1775
integrated_snn_res.0.5 integrated_snn_res.0.5         10          0.0770
integrated_snn_res.1     integrated_snn_res.1         16          0.0721
integrated_snn_res.0.4 integrated_snn_res.0.4          9          0.0718
integrated_snn_res.0.8 integrated_snn_res.0.8         13          0.0690
# Plot
ggplot(sil_df, aes(x = resolution, y = mean_silhouette)) +
  geom_col(fill = "steelblue") +
  geom_text(aes(label = n_clusters), vjust = -0.5, size = 3) +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(title = "Mean Silhouette Score per Resolution",
       subtitle = "Numbers = cluster count | Higher = better separation",
       x = "Resolution", y = "Mean Silhouette Width")

2.3 Azimuth label concordance

concordance <- sapply(res_cols, function(res) {
  tbl    <- table(reference_integrated@meta.data[[res]],
                   reference_integrated$predicted.celltype.l2)
  purity <- apply(tbl, 1, function(x) max(x) / sum(x))
  mean(purity)
})

conc_df <- data.frame(
  resolution = gsub("integrated_snn_res.", "res.", res_cols),
  n_clusters = sapply(res_cols, function(r)
                length(unique(reference_integrated@meta.data[[r]]))),
  az_purity  = round(concordance, 4)
)
cat("Azimuth concordance ranked:\n")
Azimuth concordance ranked:
print(conc_df[order(-conc_df$az_purity), ])
                       resolution n_clusters az_purity
integrated_snn_res.0.5    res.0.5         10    0.8391
integrated_snn_res.1        res.1         16    0.8329
integrated_snn_res.0.8    res.0.8         13    0.8304
integrated_snn_res.0.4    res.0.4          9    0.8206
integrated_snn_res.0.3    res.0.3          7    0.8200
integrated_snn_res.0.2    res.0.2          5    0.8044
integrated_snn_res.0.1    res.0.1          2    0.7913
ggplot(conc_df, aes(x = resolution, y = az_purity)) +
  geom_col(fill = "#2196F3") +
  geom_text(aes(label = n_clusters), vjust = -0.5, size = 3.5) +
  ylim(0, 1) +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(title = "Azimuth Label Purity per Resolution",
       subtitle = "Numbers = cluster count | Higher = cleaner biological mapping",
       x = "Resolution", y = "Mean Cluster Purity")

2.4 DimPlot grid — visual comparison

res_cols <- res_cols[as.numeric(gsub("integrated_snn_res\\.", "", res_cols)) >= 0.1 & 
                     as.numeric(gsub("integrated_snn_res\\.", "", res_cols)) <= 1]

plots <- lapply(res_cols, function(res) {
  n <- length(unique(reference_integrated@meta.data[[res]]))
  DimPlot(reference_integrated, group.by = res,
          label = TRUE, label.size = 2.5, pt.size = 0.1) +
  ggtitle(paste0("res=", gsub("integrated_snn_res.", "", res),
                 " (", n, " clusters)")) +
  NoLegend() +
  theme(plot.title = element_text(size = 9))
})
wrap_plots(plots, ncol = 3) +
plot_annotation(title = "UMAP across all tested resolutions")

2.5 SET FINAL RESOLUTION

best_res <- "integrated_snn_res.0.3"

Idents(reference_integrated)         <- best_res
reference_integrated$seurat_clusters <- reference_integrated@meta.data[[best_res]]

cat("✅ Final resolution: res.0.3\n")
✅ Final resolution: res.0.3
cat("Clusters:", length(unique(reference_integrated$seurat_clusters)), "\n")
Clusters: 7 
print(table(reference_integrated$seurat_clusters))

   0    1    2    3    4    5    6 
5485 3998  522  491  412  341  233 

2.6 Validation plots: dataset mixing and initial labels

# Validation plots
p1 <- DimPlot(reference_integrated, group.by = "predicted.celltype.l2",
              label = TRUE, repel = TRUE, label.size = 3) + NoLegend()
p2 <- DimPlot(reference_integrated, group.by = "dataset")
p3 <- DimPlot(reference_integrated, group.by = "seurat_clusters", label = TRUE,label.box = T, repel = TRUE) 
(p1 | p2) / p3


DimPlot(reference_integrated, group.by = "seurat_clusters", label = TRUE,label.box = T, repel = TRUE) 

2.7 Check FOXP3 Check FOXP3 in Cluster 5

# Check FOXP3 in both locations of cluster 6
FeaturePlot(reference_integrated, 
            features = c("FOXP3", "IL2RA", "CTLA4", "TIGIT"),
            split.by = "seurat_clusters", ncol = 4) +
  plot_annotation(title = "Treg markers — both locations")


treg_cells <- subset(reference_integrated, subset = seurat_clusters == "5")

VlnPlot(treg_cells,
features = c("FOXP3", "IL2RA", "CTLA4", "HLA-DRB1"),
group.by = "seurat_clusters", pt.size = 0)

2.8 Validation — Confirm Both Are Treg

# ── Fix: use RNA assay normalized expression ────────────────────────────
DefaultAssay(reference_integrated) <- "RNA"

# Get CTLA4 normalized expression (data layer)
ctla4_expr <- GetAssayData(reference_integrated, 
                           assay = "RNA", 
                           layer = "data")["CTLA4", ]

# Split cluster 6 by CTLA4 high/low
reference_integrated$treg_subset <- ifelse(
  reference_integrated$seurat_clusters == "6",
  ifelse(ctla4_expr > 0, "Treg_CTLA4+", "Treg_CTLA4-"),
  "non-Treg"
)

table(reference_integrated$treg_subset)

   non-Treg Treg_CTLA4- Treg_CTLA4+ 
      11249         222          11 
reference_integrated$treg_subset <- ifelse(
  reference_integrated$seurat_clusters == "6",
  ifelse(reference_integrated$Treg1 > 0.2, "Treg activated", "Treg naive"),
  "non-Treg"
)

# Or check if TIGIT exists too
if ("TIGIT" %in% rownames(reference_integrated)) {
  tigit_expr <- GetAssayData(reference_integrated, assay = "RNA", layer = "data")["TIGIT", ]
  reference_integrated$treg_subset <- ifelse(
    reference_integrated$seurat_clusters == "6",
    ifelse(ctla4_expr > 0 & tigit_expr > 0, "Treg effector", "Treg resting"),
    "non-Treg"
  )
}

DimPlot(reference_integrated, group.by = "treg_subset", label = TRUE) +
  ggtitle("Cluster 6 Treg heterogeneity by CTLA4/TIGIT")

NA
NA

2.9 Azimuth Label Validation

DimPlot(reference_integrated, group.by = "predicted.celltype.l1",
        label = TRUE, repel = TRUE, label.size = 3) + NoLegend()

DimPlot(reference_integrated, group.by = "predicted.celltype.l2",
        label = TRUE, repel = TRUE, label.size = 3) + NoLegend()

DimPlot(reference_integrated, group.by = "predicted.celltype.l3",
        label = TRUE, repel = TRUE, label.size = 3) + NoLegend()


DimPlot(reference_integrated, group.by = "singler.hpca",
        label = TRUE, repel = TRUE, label.size = 3) + NoLegend()


DimPlot(reference_integrated, group.by = "singler.immune",
        label = TRUE, repel = TRUE, label.size = 3) + NoLegend()


print(table(reference_integrated$predicted.celltype.l1, reference_integrated$seurat_clusters))
       
           0    1    2    3    4    5    6
  CD4 T 5485 3998  522  487  412  340  232
  CD8 T    0    0    0    4    0    1    1
print(table(reference_integrated$predicted.celltype.l2, reference_integrated$seurat_clusters))
                   
                       0    1    2    3    4    5    6
  CD4 CTL              0    0    0   11    0    0    0
  CD4 Naive         2030    7    1    0   20    1    5
  CD4 Proliferating    5    0    0    0    0    7    0
  CD4 TCM           3421 3972  519  346  391  154  222
  CD4 TEM              0    8    1  134    0    1    0
  Treg                29   11    1    0    1  178    6
print(table(reference_integrated$predicted.celltype.l3, reference_integrated$seurat_clusters))
                   
                       0    1    2    3    4    5    6
  CD4 CTL              0    0    0   11    0    0    0
  CD4 Naive         2036    9    1    0   20    1    5
  CD4 Proliferating    5    0    0    0    0    7    0
  CD4 TCM_1         3408 2545  506   74  371   48  201
  CD4 TCM_2            1  314    3   26   17  102   13
  CD4 TCM_3            8 1104   10  256    3    2    5
  CD4 TEM_1            0    1    0   41    0    0    0
  CD4 TEM_2            0    9    0   30    0    1    0
  CD4 TEM_3            0    2    1   53    0    0    0
  Treg Memory          9   14    1    0    1  158    9
  Treg Naive          18    0    0    0    0   22    0
print(table(reference_integrated$singler.hpca, reference_integrated$seurat_clusters))
                                        
                                            0    1    2    3    4    5    6
  B_cell:Memory                             1    0    0    0    0    0    0
  Neutrophil:uropathogenic_E._coli_UTI89    0    0    0    0    1    0    0
  NK_cell:CD56hiCD62L+                      0    0    0    1    0    0    0
  T_cell:CD4+                               0    0    1    1    0    0    0
  T_cell:CD4+_central_memory              460 2911  316  175   58  238   79
  T_cell:CD4+_effector_memory               5  478   11  278    2   23    5
  T_cell:CD4+_Naive                      4996  578  186    5  315   75  134
  T_cell:CD8+                               0    2    0   12    0    1    0
  T_cell:CD8+_Central_memory                0    0    0    2    0    0    0
  T_cell:CD8+_effector_memory               3   12    7    4   22    0   12
  T_cell:CD8+_effector_memory_RA            0    0    0    1    0    0    1
  T_cell:CD8+_naive                         0    1    0    5    7    0    0
  T_cell:gamma-delta                        2    0    0    0    0    1    0
  T_cell:Treg:Naive                         2    1    0    0    0    2    1
print(table(reference_integrated$singler.immune, reference_integrated$seurat_clusters))
                                  
                                      0    1    2    3    4    5    6
  NK cells                            0    1    0   10    1    0    1
  T cells, CD4+, memory TREG          6  152   18    3    3  186    5
  T cells, CD4+, naive             3827   66   48    0  121    2  100
  T cells, CD4+, naive TREG         247   29    1    0    3  108    6
  T cells, CD4+, naive, stimulated   22   16    5    0   85    0    7
  T cells, CD4+, TFH               1181 1642  165   46  113   23   47
  T cells, CD4+, Th1                 18  174   18  284    5   11    5
  T cells, CD4+, Th1_17               3  240    2  141    0    0    8
  T cells, CD4+, Th17                11  992   82    3   10    7   16
  T cells, CD4+, Th2                 51  675  173    1   23    3   34
  T cells, CD8+, naive              107    5    5    2    7    1    4
  T cells, CD8+, naive, stimulated    5    3    5    1   39    0    0

3 FindAllMarkers + Filter + Save All Files

# ── Required libraries for FindAllMarkers + filtering + saving ──────────
library(Seurat)       # FindAllMarkers, DoHeatmap, DimPlot
library(dplyr)        # filter, group_by, arrange, slice_head, pull
library(ggplot2)      # plotting
library(RColorBrewer) # heatmap color palettes
library(openxlsx)     # write Excel workbook (multi-sheet .xlsx)

DefaultAssay(reference_integrated) <- "RNA"
reference_integrated <- JoinLayers(reference_integrated)
cat("✅ RNA layers joined\n")
✅ RNA layers joined
# Verify layers are now accessible
cat("RNA assay layers:\n")
RNA assay layers:
print(names(Layers(reference_integrated, assay = "RNA")))
NULL
# ── Now re-run FindAllMarkers ──────────────────────────────────────────
DefaultAssay(reference_integrated) <- "RNA"
Idents(reference_integrated) <- "seurat_clusters"

# ── FindAllMarkers on the CORRECT object ───────────────────────────────
ref_markers <- FindAllMarkers(
  reference_integrated,          # ← was All_samples_Merged (wrong object)
  only.pos        = TRUE,
  min.pct         = 0.25,
  logfc.threshold = 0.25,
  min.pct.diff    = 0.20,
  verbose         = FALSE
)

cat("Total markers before filtering:", nrow(ref_markers), "\n")
Total markers before filtering: 5277 
print(head(colnames(ref_markers)))  # confirm 'gene' column exists
[1] "p_val"      "avg_log2FC" "pct.1"      "pct.2"      "p_val_adj"  "cluster"   
# ── Fix: ensure gene is a proper column (Seurat sometimes stores as rownames)
if (!"gene" %in% colnames(ref_markers)) {
  ref_markers$gene <- rownames(ref_markers)
  cat("✅ gene column added from rownames\n")
}

# ── Blacklist ───────────────────────────────────────────────────────────
blacklist_patterns <- c(
  "^TRAV", "^TRBV", "^TRGV", "^TRDV", "^TRBC", "^TRAC", "^TRDC", "^TRGC",
  "^IGH",  "^IGK",  "^IGL",  "^IGJ",
  "^RPL",  "^RPS",
  "^MT-",
  "^HBA",  "^HBB",  "^HB[ABZ]",
  "^NEAT1$", "^MALAT1$",
  "^XIST$"
)
blacklist_regex <- paste(blacklist_patterns, collapse = "|")

# Preview removed genes
to_remove <- ref_markers %>%
  dplyr::filter(grepl(blacklist_regex, gene, ignore.case = TRUE))
message("Rows to remove: ", nrow(to_remove))
print(head(to_remove$gene))
[1] "RPS10"   "NEAT1"   "RPS6KA3" "RPS6KA5" "NEAT1"   "RPS6KA3"
# Apply filter
ref_markers_filtered <- ref_markers %>%
  dplyr::filter(!grepl(blacklist_regex, gene, ignore.case = TRUE))
cat("Markers after blacklist filter:", nrow(ref_markers_filtered), "\n")
Markers after blacklist filter: 5249 
# ── Significance filter ─────────────────────────────────────────────────
ref_markers_sig <- ref_markers_filtered %>%
  dplyr::filter(p_val_adj < 0.05)
cat("Significant markers (p_val_adj < 0.05):", nrow(ref_markers_sig), "\n")
Significant markers (p_val_adj < 0.05): 3643 
# ── Top 25 per cluster sorted by avg_log2FC ─────────────────────────────
top25_markers <- ref_markers_sig %>%
  dplyr::group_by(cluster) %>%
  dplyr::arrange(dplyr::desc(avg_log2FC)) %>%
  dplyr::slice_head(n = 25) %>%
  dplyr::ungroup()

# ── Top 5 per cluster sorted by avg_log2FC ──────────────────────────────
top5_markers <- ref_markers_sig %>%
  dplyr::group_by(cluster) %>%
  dplyr::arrange(dplyr::desc(avg_log2FC)) %>%
  dplyr::slice_head(n = 5) %>%
  dplyr::ungroup()

cat("Top25 rows:", nrow(top25_markers), "\n")
Top25 rows: 175 
cat("Top5 rows:",  nrow(top5_markers),  "\n")
Top5 rows: 35 
print(top5_markers[, c("cluster","gene","avg_log2FC","pct.1","pct.2","p_val_adj")])
# A tibble: 35 × 6
   cluster gene       avg_log2FC pct.1 pct.2 p_val_adj
   <fct>   <chr>           <dbl> <dbl> <dbl>     <dbl>
 1 0       AIF1             2.35 0.261 0.056 3.01e-202
 2 0       TMIGD2           2.21 0.316 0.074 2.39e-239
 3 0       ACTN1            1.54 0.441 0.169 4.15e-238
 4 0       ADTRP            1.47 0.359 0.138 4.24e-168
 5 0       LRRN3            1.37 0.262 0.097 1.05e-110
 6 1       KLRB1            2.30 0.323 0.062 3.45e-288
 7 1       AC006369.1       1.96 0.251 0.062 3.13e-175
 8 1       MYO1F            1.94 0.296 0.072 6.41e-212
 9 1       ANXA2            1.89 0.436 0.112 0        
10 1       LGALS1           1.74 0.322 0.106 7.67e-181
# ℹ 25 more rows
# ℹ Use `print(n = ...)` to see more rows
# ── Save files ──────────────────────────────────────────────────────────
write.csv(ref_markers_filtered,
          "CD4_reference_markers_all_filtered.csv",     row.names = FALSE)
write.csv(top25_markers,
          "CD4_reference_markers_top25_per_cluster.csv", row.names = FALSE)
write.csv(top5_markers,
          "CD4_reference_markers_top5_per_cluster.csv",  row.names = FALSE)

# Optional Excel workbook
library(openxlsx)
wb <- createWorkbook()
addWorksheet(wb, "All_Filtered")
addWorksheet(wb, "Top25_per_cluster")
addWorksheet(wb, "Top5_per_cluster")
writeData(wb, "All_Filtered",      ref_markers_filtered)
writeData(wb, "Top25_per_cluster", top25_markers)
writeData(wb, "Top5_per_cluster",  top5_markers)
saveWorkbook(wb, "CD4_reference_markers_summary.xlsx", overwrite = TRUE)

cat("✅ All marker files saved\n")
✅ All marker files saved

4 Add the filtered marker results directly to your Seurat object

# ── After your existing FindAllMarkers + filtering code ─────────────────

# Add ALL filtered significant markers as metadata (1 row per cell)
# Create a cluster-to-markers mapping
marker_list_all <- ref_markers_sig %>%
  group_by(cluster) %>%
  arrange(desc(avg_log2FC)) %>%
  slice_head(n = Inf) %>%  # all sig markers
  summarise(markers_all = paste(gene, collapse = "; "), .groups = "drop")

# Map to all cells
reference_integrated$markers_sig_all <- marker_list_all$markers_all[
  match(reference_integrated$seurat_clusters, marker_list_all$cluster)
]

# ── TOP 25 per cluster ──────────────────────────────────────────────────
marker_list_top25 <- top25_markers %>%
  group_by(cluster) %>%
  arrange(desc(avg_log2FC)) %>%
  slice_head(n = 25) %>%
  summarise(markers_top25 = paste(gene, collapse = "; "), .groups = "drop")

reference_integrated$markers_top25 <- marker_list_top25$markers_top25[
  match(reference_integrated$seurat_clusters, marker_list_top25$cluster)
]

# ── TOP 5 per cluster ───────────────────────────────────────────────────
marker_list_top5 <- top5_markers %>%
  group_by(cluster) %>%
  arrange(desc(avg_log2FC)) %>%
  slice_head(n = 5) %>%
  summarise(markers_top5 = paste(gene, collapse = "; "), .groups = "drop")

reference_integrated$markers_top5 <- marker_list_top5$markers_top5[
  match(reference_integrated$seurat_clusters, marker_list_top5$cluster)
]

# ── Store full data.frames as object lists (for easy retrieval) ──────────
reference_integrated@misc$markers_all_filtered <- ref_markers_filtered
reference_integrated@misc$markers_top25 <- top25_markers
reference_integrated@misc$markers_top5 <- top5_markers

cat("✅ Marker lists added to object metadata:\n")
✅ Marker lists added to object metadata:
cat("- markers_sig_all (all sig markers per cluster)\n")
- markers_sig_all (all sig markers per cluster)
cat("- markers_top25 (top 25 per cluster)\n")
- markers_top25 (top 25 per cluster)
cat("- markers_top5 (top 5 per cluster)\n")
- markers_top5 (top 5 per cluster)
cat("- @misc$markers_* data.frames\n")
- @misc$markers_* data.frames
# ── Quick verification ──────────────────────────────────────────────────
head(reference_integrated@meta.data[, grepl("markers", colnames(reference_integrated@meta.data))])
table(nchar(reference_integrated$markers_top5) > 0)  # should all be TRUE

 TRUE 
11482 

5 Quick Visual Summary of Top Markers(Heatmap)

# DoHeatmap of top 5 per cluster for quick biological validation
library(RColorBrewer)

top5_genes <- top5_markers %>%
  group_by(cluster) %>%
  slice_head(n = 5) %>%
  pull(gene) %>%
  unique()

DefaultAssay(reference_integrated) <- "RNA"
DoHeatmap(
  reference_integrated,
  features = top5_genes,
  group.by = "seurat_clusters",
  label    = TRUE,
  raster   = FALSE
) +
  scale_fill_gradientn(colors = rev(brewer.pal(11, "RdBu"))) +
  ggtitle("Top 5 markers per cluster (res.0.3) — pre-annotation validation")

6 Quick Visual Summary of Top Markers(Dotplot)

# ── SCpubr DotPlot for top 5 markers ───────────────────────────────────
DefaultAssay(reference_integrated) <- "RNA"

# Get top 5 genes per cluster (already computed)
top5_genes <- top5_markers %>%
  dplyr::group_by(cluster) %>%
  dplyr::slice_head(n = 5) %>%
  dplyr::pull(gene) %>%
  unique()

# ── SCpubr v1.x — minimal working syntax ───────────────────────────────

SCpubr::do_DotPlot(sample = reference_integrated, 
                         features = top5_genes, 
                         flip = TRUE,
                         )

7 Quick Visual Summary of Top Markers(Dotplot)

# Check cluster 0 immediately
DefaultAssay(reference_integrated) <- "RNA"

# Myeloid vs T cell markers
VlnPlot(reference_integrated, 
        features = c("AIF1", "CD3D", "CD4", "CD14", "CD68", "LYZ"),
        group.by = "seurat_clusters", pt.size = 0)


# Azimuth predicted labels in cluster 0
table(reference_integrated$predicted.celltype.l2[
  reference_integrated$seurat_clusters == "0"])

        CD4 Naive CD4 Proliferating           CD4 TCM              Treg 
             2030                 5              3421                29 

8 FeaturePlot to check TEMRA

FeaturePlot(
  object = reference_integrated,
  features = c("PRF1", "NKG7", "FGFBP2", "GNLY"),
  cols = c("lightgrey", "red"),label = T,
  ncol = 2
)

9 R Annotation Code

# # COMPLETE CELL TYPE ANNOTATION CODE
# 
# # Step 1: Ensure clusters are characters
# reference_integrated$seurat_clusters <- as.character(reference_integrated$seurat_clusters)
# 
# # Step 2: Define named labels (exactly matches your 0-6 clusters)
# cluster_labels <- c(
#   "0" = "CD4 Tnaive (CCR7+SELL+TCF7+)",         # Canonical naive
#   "1" = "CD4 TCM (CD161+/IL7R+)",               # TCM, KLRB1/CD161+ Th17-memory
#   "2" = "CD4 TCM (CCR4+/Th2-like)",             # Activated TCM, CCR4+BATF+
#   "3" = "CD4 CTL/Temra (GZMK+GZMA+CCL5+)",     # Cytotoxic effector, Temra
#   "4" = "CD4 TEM (NF-kB activated)",            # Effector memory, stress-activated
#   "5" = "CD4 Treg (FOXP3+Helios+CD25+)",        # Natural tTreg, highest confidence
#   "6" = "CD4 Tnaive-RTE (IGF1R+)"              # Quiescent/RTE naive subset (Quiescent/RTE naive subset)
# )
# 
# # Step 3: Create vector (you already verified this works)
# clusters_chr <- as.character(reference_integrated$seurat_clusters)
# cell_type_vec <- cluster_labels[clusters_chr]
# 
# # Step 4: SAFE ASSIGNMENT (direct metadata write)
# reference_integrated@meta.data$cell_type <- cell_type_vec
# 
# # Step 5: Force factor ordering 0→6 for plots
# reference_integrated@meta.data$cell_type <- factor(
#   reference_integrated@meta.data$cell_type,
#   levels = c("CD4 Tnaive (CCR7+SELL+TCF7+)",
#              "CD4 TCM (CD161+/IL7R+)",
#              "CD4 TCM (CCR4+/Th2-like)",
#              "CD4 CTL/Temra (GZMK+GZMA+CCL5+)",
#              "CD4 TEM (NF-kB activated)",
#              "CD4 Treg (FOXP3+Helios+CD25+)",
#              "CD4 Tnaive-RTE (IGF1R+)")
# )

# Step 6: VALIDATE
cat("✅ Assignment complete!\n")
✅ Assignment complete!
print(table(reference_integrated$seurat_clusters, reference_integrated@meta.data$cell_type))
   
    CD4 Tnaive (CCR7+SELL+TCF7+) CD4 TCM (CD161+/IL7R+) CD4 TCM (CCR4+/Th2-like) CD4 CTL/Temra (GZMK+GZMA+CCL5+)
  0                         5485                      0                        0                               0
  1                            0                   3998                        0                               0
  2                            0                      0                      522                               0
  3                            0                      0                        0                             491
  4                            0                      0                        0                               0
  5                            0                      0                        0                               0
  6                            0                      0                        0                               0
   
    CD4 TEM (NF-kB activated) CD4 Treg (FOXP3+Helios+CD25+) CD4 Tnaive-RTE (IGF1R+)
  0                         0                             0                       0
  1                         0                             0                       0
  2                         0                             0                       0
  3                         0                             0                       0
  4                       412                             0                       0
  5                         0                           341                       0
  6                         0                             0                     233
print(table(reference_integrated@meta.data$cell_type))

   CD4 Tnaive (CCR7+SELL+TCF7+)          CD4 TCM (CD161+/IL7R+)        CD4 TCM (CCR4+/Th2-like) 
                           5485                            3998                             522 
CD4 CTL/Temra (GZMK+GZMA+CCL5+)       CD4 TEM (NF-kB activated)   CD4 Treg (FOXP3+Helios+CD25+) 
                            491                             412                             341 
        CD4 Tnaive-RTE (IGF1R+) 
                            233 
# Step 7: VISUALIZE
p1 <- DimPlot(reference_integrated, group.by = "seurat_clusters", 
              label = TRUE, label.size = 3) + ggtitle("Clusters 0-6")
p2 <- DimPlot(reference_integrated, group.by = "cell_type", 
              label = TRUE, repel = TRUE, label.size = 3) + ggtitle("Cell Types")

p1 | p2


# Step 8: Cross-check with Azimuth
table(reference_integrated$predicted.celltype.l2, 
      reference_integrated@meta.data$cell_type)
                   
                    CD4 Tnaive (CCR7+SELL+TCF7+) CD4 TCM (CD161+/IL7R+) CD4 TCM (CCR4+/Th2-like)
  CD4 CTL                                      0                      0                        0
  CD4 Naive                                 2030                      7                        1
  CD4 Proliferating                            5                      0                        0
  CD4 TCM                                   3421                   3972                      519
  CD4 TEM                                      0                      8                        1
  Treg                                        29                     11                        1
                   
                    CD4 CTL/Temra (GZMK+GZMA+CCL5+) CD4 TEM (NF-kB activated) CD4 Treg (FOXP3+Helios+CD25+)
  CD4 CTL                                        11                         0                             0
  CD4 Naive                                       0                        20                             1
  CD4 Proliferating                               0                         0                             7
  CD4 TCM                                       346                       391                           154
  CD4 TEM                                       134                         0                             1
  Treg                                            0                         1                           178
                   
                    CD4 Tnaive-RTE (IGF1R+)
  CD4 CTL                                 0
  CD4 Naive                               5
  CD4 Proliferating                       0
  CD4 TCM                               222
  CD4 TEM                                 0
  Treg                                    6

10 Save annotated reference

# Save annotated reference
saveRDS(reference_integrated, "CD4_reference_annotated_with_markers.rds")
cat("✅ Annotated reference  with markers saved!\n")
✅ Annotated reference  with markers saved!
---
title: "Remove cluster 12 cells — Clean Re-Integration"
author: Nasir Mahmood Abbasi
date: "`r Sys.Date()`"
output:
  html_notebook:
    number_sections: true
    toc: true
    toc_float:
      collapsed: true
    theme: journal
---


# Load libraries
```{r, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE,
                      fig.width = 12, fig.height = 8)
library(Seurat)
library(monocle3)
library(SeuratWrappers)
library(patchwork)
library(ggplot2)
library(RColorBrewer)
options(future.globals.maxSize = 8e9)
set.seed(123)

# ── Define is_hi() ONCE here so it is available in ALL downstream chunks ──
is_hi <- function(g, zmat, z = 0.5) {
  if (!g %in% rownames(zmat)) return(rep(FALSE, ncol(zmat)))
  zmat[g, ] > z
}
```

# Load Annotated Healthy Integrated Object
```{r load object}
reference_integrated<- readRDS("CD4_reference_annotated.rds")

DefaultAssay(reference_integrated) <- "RNA"

reference_integrated
```


## Clustree for Resolution Selection (Recommended)
```{r , fig.width=8, fig.height=6}
ElbowPlot(reference_integrated, ndims = 50)

# ✅ Clustree — visualize cluster stability across resolutions
library(clustree)

clustree(reference_integrated, prefix = "integrated_snn_res.") +
  ggtitle("Cluster stability across resolutions") +
  theme(legend.position = "right")

```

## Silhouette Score (Quantitative Optimum)
```{r , fig.width=8, fig.height=6}
library(cluster)

# Extract PCA embeddings used for clustering
pca_embeddings <- Embeddings(reference_integrated, "pca")[, 1:20]
dist_matrix    <- dist(pca_embeddings)

res_cols <- grep("integrated_snn_res", 
                  colnames(reference_integrated@meta.data), value = TRUE)

sil_scores <- sapply(res_cols, function(res) {
  clusters <- as.integer(reference_integrated@meta.data[[res]])
  # Need at least 2 clusters
  if (length(unique(clusters)) < 2) return(NA)
  sil <- silhouette(clusters, dist_matrix)
  mean(sil[, 3])  # mean silhouette width
})

# Print ranked results
sil_df <- data.frame(
  resolution      = res_cols,
  n_clusters      = sapply(res_cols, function(r) 
                     length(unique(reference_integrated@meta.data[[r]]))),
  mean_silhouette = round(sil_scores, 4)
)
print(sil_df[order(-sil_df$mean_silhouette), ])

# Plot
ggplot(sil_df, aes(x = resolution, y = mean_silhouette)) +
  geom_col(fill = "steelblue") +
  geom_text(aes(label = n_clusters), vjust = -0.5, size = 3) +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(title = "Mean Silhouette Score per Resolution",
       subtitle = "Numbers = cluster count | Higher = better separation",
       x = "Resolution", y = "Mean Silhouette Width")

```


## Azimuth label concordance
```{r, fig.width=8, fig.height=6}
concordance <- sapply(res_cols, function(res) {
  tbl    <- table(reference_integrated@meta.data[[res]],
                   reference_integrated$predicted.celltype.l2)
  purity <- apply(tbl, 1, function(x) max(x) / sum(x))
  mean(purity)
})

conc_df <- data.frame(
  resolution = gsub("integrated_snn_res.", "res.", res_cols),
  n_clusters = sapply(res_cols, function(r)
                length(unique(reference_integrated@meta.data[[r]]))),
  az_purity  = round(concordance, 4)
)
cat("Azimuth concordance ranked:\n")
print(conc_df[order(-conc_df$az_purity), ])

ggplot(conc_df, aes(x = resolution, y = az_purity)) +
  geom_col(fill = "#2196F3") +
  geom_text(aes(label = n_clusters), vjust = -0.5, size = 3.5) +
  ylim(0, 1) +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(title = "Azimuth Label Purity per Resolution",
       subtitle = "Numbers = cluster count | Higher = cleaner biological mapping",
       x = "Resolution", y = "Mean Cluster Purity")
```

## DimPlot grid — visual comparison
```{r , fig.width=14, fig.height=10}
res_cols <- res_cols[as.numeric(gsub("integrated_snn_res\\.", "", res_cols)) >= 0.1 & 
                     as.numeric(gsub("integrated_snn_res\\.", "", res_cols)) <= 1]

plots <- lapply(res_cols, function(res) {
  n <- length(unique(reference_integrated@meta.data[[res]]))
  DimPlot(reference_integrated, group.by = res,
          label = TRUE, label.size = 2.5, pt.size = 0.1) +
  ggtitle(paste0("res=", gsub("integrated_snn_res.", "", res),
                 " (", n, " clusters)")) +
  NoLegend() +
  theme(plot.title = element_text(size = 9))
})
wrap_plots(plots, ncol = 3) +
plot_annotation(title = "UMAP across all tested resolutions")
```


## SET FINAL RESOLUTION
```{r , fig.width=14, fig.height=10}
best_res <- "integrated_snn_res.0.3"

Idents(reference_integrated)         <- best_res
reference_integrated$seurat_clusters <- reference_integrated@meta.data[[best_res]]

cat("✅ Final resolution: res.0.3\n")
cat("Clusters:", length(unique(reference_integrated$seurat_clusters)), "\n")
print(table(reference_integrated$seurat_clusters))
```

##  Validation plots: dataset mixing and initial labels
```{r validate, fig.width=14, fig.height=10}
# Validation plots
p1 <- DimPlot(reference_integrated, group.by = "predicted.celltype.l2",
              label = TRUE, repel = TRUE, label.size = 3) + NoLegend()
p2 <- DimPlot(reference_integrated, group.by = "dataset")
p3 <- DimPlot(reference_integrated, group.by = "seurat_clusters", label = TRUE,label.box = T, repel = TRUE) 
(p1 | p2) / p3

DimPlot(reference_integrated, group.by = "seurat_clusters", label = TRUE,label.box = T, repel = TRUE) 

```

## Check FOXP3 Check FOXP3 in Cluster 5
```{r , fig.width=28, fig.height=16}
# Check FOXP3 in both locations of cluster 6
FeaturePlot(reference_integrated, 
            features = c("FOXP3", "IL2RA", "CTLA4", "TIGIT"),
            split.by = "seurat_clusters", ncol = 4) +
  plot_annotation(title = "Treg markers — both locations")

treg_cells <- subset(reference_integrated, subset = seurat_clusters == "5")

VlnPlot(treg_cells,
features = c("FOXP3", "IL2RA", "CTLA4", "HLA-DRB1"),
group.by = "seurat_clusters", pt.size = 0)

```

## Validation — Confirm Both Are Treg
```{r treg-validation, fig.width=8, fig.height=6}
# ── Fix: use RNA assay normalized expression ────────────────────────────
DefaultAssay(reference_integrated) <- "RNA"

# Get CTLA4 normalized expression (data layer)
ctla4_expr <- GetAssayData(reference_integrated, 
                           assay = "RNA", 
                           layer = "data")["CTLA4", ]

# Split cluster 6 by CTLA4 high/low
reference_integrated$treg_subset <- ifelse(
  reference_integrated$seurat_clusters == "6",
  ifelse(ctla4_expr > 0, "Treg_CTLA4+", "Treg_CTLA4-"),
  "non-Treg"
)

table(reference_integrated$treg_subset)


reference_integrated$treg_subset <- ifelse(
  reference_integrated$seurat_clusters == "6",
  ifelse(reference_integrated$Treg1 > 0.2, "Treg activated", "Treg naive"),
  "non-Treg"
)

# Or check if TIGIT exists too
if ("TIGIT" %in% rownames(reference_integrated)) {
  tigit_expr <- GetAssayData(reference_integrated, assay = "RNA", layer = "data")["TIGIT", ]
  reference_integrated$treg_subset <- ifelse(
    reference_integrated$seurat_clusters == "6",
    ifelse(ctla4_expr > 0 & tigit_expr > 0, "Treg effector", "Treg resting"),
    "non-Treg"
  )
}

DimPlot(reference_integrated, group.by = "treg_subset", label = TRUE) +
  ggtitle("Cluster 6 Treg heterogeneity by CTLA4/TIGIT")


```
## Azimuth Label Validation
```{r , fig.width=8, fig.height=6}
DimPlot(reference_integrated, group.by = "predicted.celltype.l1",
        label = TRUE, repel = TRUE, label.size = 3) + NoLegend()
DimPlot(reference_integrated, group.by = "predicted.celltype.l2",
        label = TRUE, repel = TRUE, label.size = 3) + NoLegend()
DimPlot(reference_integrated, group.by = "predicted.celltype.l3",
        label = TRUE, repel = TRUE, label.size = 3) + NoLegend()

DimPlot(reference_integrated, group.by = "singler.hpca",
        label = TRUE, repel = TRUE, label.size = 3) + NoLegend()

DimPlot(reference_integrated, group.by = "singler.immune",
        label = TRUE, repel = TRUE, label.size = 3) + NoLegend()

print(table(reference_integrated$predicted.celltype.l1, reference_integrated$seurat_clusters))
print(table(reference_integrated$predicted.celltype.l2, reference_integrated$seurat_clusters))
print(table(reference_integrated$predicted.celltype.l3, reference_integrated$seurat_clusters))

print(table(reference_integrated$singler.hpca, reference_integrated$seurat_clusters))
print(table(reference_integrated$singler.immune, reference_integrated$seurat_clusters))
```


# FindAllMarkers + Filter + Save All Files
```{r }
# ── Required libraries for FindAllMarkers + filtering + saving ──────────
library(Seurat)       # FindAllMarkers, DoHeatmap, DimPlot
library(dplyr)        # filter, group_by, arrange, slice_head, pull
library(ggplot2)      # plotting
library(RColorBrewer) # heatmap color palettes
library(openxlsx)     # write Excel workbook (multi-sheet .xlsx)

DefaultAssay(reference_integrated) <- "RNA"
reference_integrated <- JoinLayers(reference_integrated)
cat("✅ RNA layers joined\n")

# Verify layers are now accessible
cat("RNA assay layers:\n")
print(names(Layers(reference_integrated, assay = "RNA")))

# ── Now re-run FindAllMarkers ──────────────────────────────────────────
DefaultAssay(reference_integrated) <- "RNA"
Idents(reference_integrated) <- "seurat_clusters"

# ── FindAllMarkers on the CORRECT object ───────────────────────────────
ref_markers <- FindAllMarkers(
  reference_integrated,          # ← was All_samples_Merged (wrong object)
  only.pos        = TRUE,
  min.pct         = 0.25,
  logfc.threshold = 0.25,
  min.pct.diff    = 0.20,
  verbose         = FALSE
)

cat("Total markers before filtering:", nrow(ref_markers), "\n")
print(head(colnames(ref_markers)))  # confirm 'gene' column exists

# ── Fix: ensure gene is a proper column (Seurat sometimes stores as rownames)
if (!"gene" %in% colnames(ref_markers)) {
  ref_markers$gene <- rownames(ref_markers)
  cat("✅ gene column added from rownames\n")
}

# ── Blacklist ───────────────────────────────────────────────────────────
blacklist_patterns <- c(
  "^TRAV", "^TRBV", "^TRGV", "^TRDV", "^TRBC", "^TRAC", "^TRDC", "^TRGC",
  "^IGH",  "^IGK",  "^IGL",  "^IGJ",
  "^RPL",  "^RPS",
  "^MT-",
  "^HBA",  "^HBB",  "^HB[ABZ]",
  "^NEAT1$", "^MALAT1$",
  "^XIST$"
)
blacklist_regex <- paste(blacklist_patterns, collapse = "|")

# Preview removed genes
to_remove <- ref_markers %>%
  dplyr::filter(grepl(blacklist_regex, gene, ignore.case = TRUE))
message("Rows to remove: ", nrow(to_remove))
print(head(to_remove$gene))

# Apply filter
ref_markers_filtered <- ref_markers %>%
  dplyr::filter(!grepl(blacklist_regex, gene, ignore.case = TRUE))
cat("Markers after blacklist filter:", nrow(ref_markers_filtered), "\n")

# ── Significance filter ─────────────────────────────────────────────────
ref_markers_sig <- ref_markers_filtered %>%
  dplyr::filter(p_val_adj < 0.05)
cat("Significant markers (p_val_adj < 0.05):", nrow(ref_markers_sig), "\n")

# ── Top 25 per cluster sorted by avg_log2FC ─────────────────────────────
top25_markers <- ref_markers_sig %>%
  dplyr::group_by(cluster) %>%
  dplyr::arrange(dplyr::desc(avg_log2FC)) %>%
  dplyr::slice_head(n = 25) %>%
  dplyr::ungroup()

# ── Top 5 per cluster sorted by avg_log2FC ──────────────────────────────
top5_markers <- ref_markers_sig %>%
  dplyr::group_by(cluster) %>%
  dplyr::arrange(dplyr::desc(avg_log2FC)) %>%
  dplyr::slice_head(n = 5) %>%
  dplyr::ungroup()

cat("Top25 rows:", nrow(top25_markers), "\n")
cat("Top5 rows:",  nrow(top5_markers),  "\n")
print(top5_markers[, c("cluster","gene","avg_log2FC","pct.1","pct.2","p_val_adj")])

# ── Save files ──────────────────────────────────────────────────────────
write.csv(ref_markers_filtered,
          "CD4_reference_markers_all_filtered.csv",     row.names = FALSE)
write.csv(top25_markers,
          "CD4_reference_markers_top25_per_cluster.csv", row.names = FALSE)
write.csv(top5_markers,
          "CD4_reference_markers_top5_per_cluster.csv",  row.names = FALSE)

# Optional Excel workbook
library(openxlsx)
wb <- createWorkbook()
addWorksheet(wb, "All_Filtered")
addWorksheet(wb, "Top25_per_cluster")
addWorksheet(wb, "Top5_per_cluster")
writeData(wb, "All_Filtered",      ref_markers_filtered)
writeData(wb, "Top25_per_cluster", top25_markers)
writeData(wb, "Top5_per_cluster",  top5_markers)
saveWorkbook(wb, "CD4_reference_markers_summary.xlsx", overwrite = TRUE)

cat("✅ All marker files saved\n")

```
# Add the filtered marker results directly to your Seurat object 
```{r }
# ── After your existing FindAllMarkers + filtering code ─────────────────

# Add ALL filtered significant markers as metadata (1 row per cell)
# Create a cluster-to-markers mapping
marker_list_all <- ref_markers_sig %>%
  group_by(cluster) %>%
  arrange(desc(avg_log2FC)) %>%
  slice_head(n = Inf) %>%  # all sig markers
  summarise(markers_all = paste(gene, collapse = "; "), .groups = "drop")

# Map to all cells
reference_integrated$markers_sig_all <- marker_list_all$markers_all[
  match(reference_integrated$seurat_clusters, marker_list_all$cluster)
]

# ── TOP 25 per cluster ──────────────────────────────────────────────────
marker_list_top25 <- top25_markers %>%
  group_by(cluster) %>%
  arrange(desc(avg_log2FC)) %>%
  slice_head(n = 25) %>%
  summarise(markers_top25 = paste(gene, collapse = "; "), .groups = "drop")

reference_integrated$markers_top25 <- marker_list_top25$markers_top25[
  match(reference_integrated$seurat_clusters, marker_list_top25$cluster)
]

# ── TOP 5 per cluster ───────────────────────────────────────────────────
marker_list_top5 <- top5_markers %>%
  group_by(cluster) %>%
  arrange(desc(avg_log2FC)) %>%
  slice_head(n = 5) %>%
  summarise(markers_top5 = paste(gene, collapse = "; "), .groups = "drop")

reference_integrated$markers_top5 <- marker_list_top5$markers_top5[
  match(reference_integrated$seurat_clusters, marker_list_top5$cluster)
]

# ── Store full data.frames as object lists (for easy retrieval) ──────────
reference_integrated@misc$markers_all_filtered <- ref_markers_filtered
reference_integrated@misc$markers_top25 <- top25_markers
reference_integrated@misc$markers_top5 <- top5_markers

cat("✅ Marker lists added to object metadata:\n")
cat("- markers_sig_all (all sig markers per cluster)\n")
cat("- markers_top25 (top 25 per cluster)\n")
cat("- markers_top5 (top 5 per cluster)\n")
cat("- @misc$markers_* data.frames\n")

# ── Quick verification ──────────────────────────────────────────────────
head(reference_integrated@meta.data[, grepl("markers", colnames(reference_integrated@meta.data))])
table(nchar(reference_integrated$markers_top5) > 0)  # should all be TRUE

```

# Quick Visual Summary of Top Markers(Heatmap)
```{r, fig.width=14, fig.height=8}
# DoHeatmap of top 5 per cluster for quick biological validation
library(RColorBrewer)

top5_genes <- top5_markers %>%
  group_by(cluster) %>%
  slice_head(n = 5) %>%
  pull(gene) %>%
  unique()

DefaultAssay(reference_integrated) <- "RNA"
DoHeatmap(
  reference_integrated,
  features = top5_genes,
  group.by = "seurat_clusters",
  label    = TRUE,
  raster   = FALSE
) +
  scale_fill_gradientn(colors = rev(brewer.pal(11, "RdBu"))) +
  ggtitle("Top 5 markers per cluster (res.0.3) — pre-annotation validation")

```



# Quick Visual Summary of Top Markers(Dotplot)
```{r, fig.width=10, fig.height=8}
# ── SCpubr DotPlot for top 5 markers ───────────────────────────────────
DefaultAssay(reference_integrated) <- "RNA"

# Get top 5 genes per cluster (already computed)
top5_genes <- top5_markers %>%
  dplyr::group_by(cluster) %>%
  dplyr::slice_head(n = 5) %>%
  dplyr::pull(gene) %>%
  unique()

# ── SCpubr v1.x — minimal working syntax ───────────────────────────────

SCpubr::do_DotPlot(sample = reference_integrated, 
                         features = top5_genes, 
                         flip = TRUE,
                         )

```

# Quick Visual Summary of Top Markers(Dotplot)
```{r, fig.width=14, fig.height=12}
# Check cluster 0 immediately
DefaultAssay(reference_integrated) <- "RNA"

# Myeloid vs T cell markers
VlnPlot(reference_integrated, 
        features = c("AIF1", "CD3D", "CD4", "CD14", "CD68", "LYZ"),
        group.by = "seurat_clusters", pt.size = 0)

# Azimuth predicted labels in cluster 0
table(reference_integrated$predicted.celltype.l2[
  reference_integrated$seurat_clusters == "0"])

```

# FeaturePlot to check TEMRA
```{r, fig.width=14, fig.height=12}
FeaturePlot(
  object = reference_integrated,
  features = c("PRF1", "NKG7", "FGFBP2", "GNLY"),
  cols = c("lightgrey", "red"),label = T,
  ncol = 2
)

``` 

# R Annotation Code
```{r, fig.width=14, fig.height=5}
# # COMPLETE CELL TYPE ANNOTATION CODE
# 
# # Step 1: Ensure clusters are characters
# reference_integrated$seurat_clusters <- as.character(reference_integrated$seurat_clusters)
# 
# # Step 2: Define named labels (exactly matches your 0-6 clusters)
# cluster_labels <- c(
#   "0" = "CD4 Tnaive (CCR7+SELL+TCF7+)",         # Canonical naive
#   "1" = "CD4 TCM (CD161+/IL7R+)",               # TCM, KLRB1/CD161+ Th17-memory
#   "2" = "CD4 TCM (CCR4+/Th2-like)",             # Activated TCM, CCR4+BATF+
#   "3" = "CD4 CTL/Temra (GZMK+GZMA+CCL5+)",     # Cytotoxic effector, Temra
#   "4" = "CD4 TEM (NF-kB activated)",            # Effector memory, stress-activated
#   "5" = "CD4 Treg (FOXP3+Helios+CD25+)",        # Natural tTreg, highest confidence
#   "6" = "CD4 Tnaive-RTE (IGF1R+)"              # Quiescent/RTE naive subset (Quiescent/RTE naive subset)
# )
# 
# # Step 3: Create vector (you already verified this works)
# clusters_chr <- as.character(reference_integrated$seurat_clusters)
# cell_type_vec <- cluster_labels[clusters_chr]
# 
# # Step 4: SAFE ASSIGNMENT (direct metadata write)
# reference_integrated@meta.data$cell_type <- cell_type_vec
# 
# # Step 5: Force factor ordering 0→6 for plots
# reference_integrated@meta.data$cell_type <- factor(
#   reference_integrated@meta.data$cell_type,
#   levels = c("CD4 Tnaive (CCR7+SELL+TCF7+)",
#              "CD4 TCM (CD161+/IL7R+)",
#              "CD4 TCM (CCR4+/Th2-like)",
#              "CD4 CTL/Temra (GZMK+GZMA+CCL5+)",
#              "CD4 TEM (NF-kB activated)",
#              "CD4 Treg (FOXP3+Helios+CD25+)",
#              "CD4 Tnaive-RTE (IGF1R+)")
# )

# Step 6: VALIDATE
cat("✅ Assignment complete!\n")
print(table(reference_integrated$seurat_clusters, reference_integrated@meta.data$cell_type))
print(table(reference_integrated@meta.data$cell_type))

# Step 7: VISUALIZE
p1 <- DimPlot(reference_integrated, group.by = "seurat_clusters", 
              label = TRUE, label.size = 3) + ggtitle("Clusters 0-6")
p2 <- DimPlot(reference_integrated, group.by = "cell_type", 
              label = TRUE, repel = TRUE, label.size = 3) + ggtitle("Cell Types")

p1 | p2

# Step 8: Cross-check with Azimuth
table(reference_integrated$predicted.celltype.l2, 
      reference_integrated@meta.data$cell_type)
```

# Save annotated reference
```{r}
# Save annotated reference
saveRDS(reference_integrated, "CD4_reference_annotated_with_markers.rds")
cat("✅ Annotated reference  with markers saved!\n")

``` 
