Load libraries
Load Annotated Healthy
Integrated Object
DefaultAssay(reference_integrated) <- "RNA"
reference_integrated
An object of class Seurat
57379 features across 11482 samples within 7 assays
Active assay: RNA (36601 features, 2902 variable features)
3 layers present: scale.data, data, counts
6 other assays present: ADT, prediction.score.celltype.l1, prediction.score.celltype.l2, prediction.score.celltype.l3, SCT, integrated
2 dimensional reductions calculated: pca, umap
Clustree for
Resolution Selection (Recommended)
ElbowPlot(reference_integrated, ndims = 50)

# ✅ Clustree — visualize cluster stability across resolutions
library(clustree)
clustree(reference_integrated, prefix = "integrated_snn_res.") +
ggtitle("Cluster stability across resolutions") +
theme(legend.position = "right")

Silhouette Score
(Quantitative Optimum)
library(cluster)
# Extract PCA embeddings used for clustering
pca_embeddings <- Embeddings(reference_integrated, "pca")[, 1:20]
dist_matrix <- dist(pca_embeddings)
res_cols <- grep("integrated_snn_res",
colnames(reference_integrated@meta.data), value = TRUE)
sil_scores <- sapply(res_cols, function(res) {
clusters <- as.integer(reference_integrated@meta.data[[res]])
# Need at least 2 clusters
if (length(unique(clusters)) < 2) return(NA)
sil <- silhouette(clusters, dist_matrix)
mean(sil[, 3]) # mean silhouette width
})
# Print ranked results
sil_df <- data.frame(
resolution = res_cols,
n_clusters = sapply(res_cols, function(r)
length(unique(reference_integrated@meta.data[[r]]))),
mean_silhouette = round(sil_scores, 4)
)
print(sil_df[order(-sil_df$mean_silhouette), ])
resolution n_clusters mean_silhouette
integrated_snn_res.0.2 integrated_snn_res.0.2 5 0.1862
integrated_snn_res.0.3 integrated_snn_res.0.3 7 0.1791
integrated_snn_res.0.1 integrated_snn_res.0.1 2 0.1775
integrated_snn_res.0.5 integrated_snn_res.0.5 10 0.0770
integrated_snn_res.1 integrated_snn_res.1 16 0.0721
integrated_snn_res.0.4 integrated_snn_res.0.4 9 0.0718
integrated_snn_res.0.8 integrated_snn_res.0.8 13 0.0690
# Plot
ggplot(sil_df, aes(x = resolution, y = mean_silhouette)) +
geom_col(fill = "steelblue") +
geom_text(aes(label = n_clusters), vjust = -0.5, size = 3) +
theme_classic() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = "Mean Silhouette Score per Resolution",
subtitle = "Numbers = cluster count | Higher = better separation",
x = "Resolution", y = "Mean Silhouette Width")

Azimuth label
concordance
concordance <- sapply(res_cols, function(res) {
tbl <- table(reference_integrated@meta.data[[res]],
reference_integrated$predicted.celltype.l2)
purity <- apply(tbl, 1, function(x) max(x) / sum(x))
mean(purity)
})
conc_df <- data.frame(
resolution = gsub("integrated_snn_res.", "res.", res_cols),
n_clusters = sapply(res_cols, function(r)
length(unique(reference_integrated@meta.data[[r]]))),
az_purity = round(concordance, 4)
)
cat("Azimuth concordance ranked:\n")
Azimuth concordance ranked:
print(conc_df[order(-conc_df$az_purity), ])
resolution n_clusters az_purity
integrated_snn_res.0.5 res.0.5 10 0.8391
integrated_snn_res.1 res.1 16 0.8329
integrated_snn_res.0.8 res.0.8 13 0.8304
integrated_snn_res.0.4 res.0.4 9 0.8206
integrated_snn_res.0.3 res.0.3 7 0.8200
integrated_snn_res.0.2 res.0.2 5 0.8044
integrated_snn_res.0.1 res.0.1 2 0.7913
ggplot(conc_df, aes(x = resolution, y = az_purity)) +
geom_col(fill = "#2196F3") +
geom_text(aes(label = n_clusters), vjust = -0.5, size = 3.5) +
ylim(0, 1) +
theme_classic() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = "Azimuth Label Purity per Resolution",
subtitle = "Numbers = cluster count | Higher = cleaner biological mapping",
x = "Resolution", y = "Mean Cluster Purity")

DimPlot grid — visual
comparison
res_cols <- res_cols[as.numeric(gsub("integrated_snn_res\\.", "", res_cols)) >= 0.1 &
as.numeric(gsub("integrated_snn_res\\.", "", res_cols)) <= 1]
plots <- lapply(res_cols, function(res) {
n <- length(unique(reference_integrated@meta.data[[res]]))
DimPlot(reference_integrated, group.by = res,
label = TRUE, label.size = 2.5, pt.size = 0.1) +
ggtitle(paste0("res=", gsub("integrated_snn_res.", "", res),
" (", n, " clusters)")) +
NoLegend() +
theme(plot.title = element_text(size = 9))
})
wrap_plots(plots, ncol = 3) +
plot_annotation(title = "UMAP across all tested resolutions")

SET FINAL
RESOLUTION
best_res <- "integrated_snn_res.0.3"
Idents(reference_integrated) <- best_res
reference_integrated$seurat_clusters <- reference_integrated@meta.data[[best_res]]
cat("✅ Final resolution: res.0.3\n")
✅ Final resolution: res.0.3
cat("Clusters:", length(unique(reference_integrated$seurat_clusters)), "\n")
Clusters: 7
print(table(reference_integrated$seurat_clusters))
0 1 2 3 4 5 6
5485 3998 522 491 412 341 233
Validation plots:
dataset mixing and initial labels
# Validation plots
p1 <- DimPlot(reference_integrated, group.by = "predicted.celltype.l2",
label = TRUE, repel = TRUE, label.size = 3) + NoLegend()
p2 <- DimPlot(reference_integrated, group.by = "dataset")
p3 <- DimPlot(reference_integrated, group.by = "seurat_clusters", label = TRUE,label.box = T, repel = TRUE)
(p1 | p2) / p3

DimPlot(reference_integrated, group.by = "seurat_clusters", label = TRUE,label.box = T, repel = TRUE)

Check FOXP3 Check
FOXP3 in Cluster 5
# Check FOXP3 in both locations of cluster 6
FeaturePlot(reference_integrated,
features = c("FOXP3", "IL2RA", "CTLA4", "TIGIT"),
split.by = "seurat_clusters", ncol = 4) +
plot_annotation(title = "Treg markers — both locations")

treg_cells <- subset(reference_integrated, subset = seurat_clusters == "5")
VlnPlot(treg_cells,
features = c("FOXP3", "IL2RA", "CTLA4", "HLA-DRB1"),
group.by = "seurat_clusters", pt.size = 0)

Validation — Confirm
Both Are Treg
# ── Fix: use RNA assay normalized expression ────────────────────────────
DefaultAssay(reference_integrated) <- "RNA"
# Get CTLA4 normalized expression (data layer)
ctla4_expr <- GetAssayData(reference_integrated,
assay = "RNA",
layer = "data")["CTLA4", ]
# Split cluster 6 by CTLA4 high/low
reference_integrated$treg_subset <- ifelse(
reference_integrated$seurat_clusters == "6",
ifelse(ctla4_expr > 0, "Treg_CTLA4+", "Treg_CTLA4-"),
"non-Treg"
)
table(reference_integrated$treg_subset)
non-Treg Treg_CTLA4- Treg_CTLA4+
11249 222 11
reference_integrated$treg_subset <- ifelse(
reference_integrated$seurat_clusters == "6",
ifelse(reference_integrated$Treg1 > 0.2, "Treg activated", "Treg naive"),
"non-Treg"
)
# Or check if TIGIT exists too
if ("TIGIT" %in% rownames(reference_integrated)) {
tigit_expr <- GetAssayData(reference_integrated, assay = "RNA", layer = "data")["TIGIT", ]
reference_integrated$treg_subset <- ifelse(
reference_integrated$seurat_clusters == "6",
ifelse(ctla4_expr > 0 & tigit_expr > 0, "Treg effector", "Treg resting"),
"non-Treg"
)
}
DimPlot(reference_integrated, group.by = "treg_subset", label = TRUE) +
ggtitle("Cluster 6 Treg heterogeneity by CTLA4/TIGIT")

NA
NA
Azimuth Label
Validation
DimPlot(reference_integrated, group.by = "predicted.celltype.l1",
label = TRUE, repel = TRUE, label.size = 3) + NoLegend()

DimPlot(reference_integrated, group.by = "predicted.celltype.l2",
label = TRUE, repel = TRUE, label.size = 3) + NoLegend()

DimPlot(reference_integrated, group.by = "predicted.celltype.l3",
label = TRUE, repel = TRUE, label.size = 3) + NoLegend()

DimPlot(reference_integrated, group.by = "singler.hpca",
label = TRUE, repel = TRUE, label.size = 3) + NoLegend()

DimPlot(reference_integrated, group.by = "singler.immune",
label = TRUE, repel = TRUE, label.size = 3) + NoLegend()

print(table(reference_integrated$predicted.celltype.l1, reference_integrated$seurat_clusters))
0 1 2 3 4 5 6
CD4 T 5485 3998 522 487 412 340 232
CD8 T 0 0 0 4 0 1 1
print(table(reference_integrated$predicted.celltype.l2, reference_integrated$seurat_clusters))
0 1 2 3 4 5 6
CD4 CTL 0 0 0 11 0 0 0
CD4 Naive 2030 7 1 0 20 1 5
CD4 Proliferating 5 0 0 0 0 7 0
CD4 TCM 3421 3972 519 346 391 154 222
CD4 TEM 0 8 1 134 0 1 0
Treg 29 11 1 0 1 178 6
print(table(reference_integrated$predicted.celltype.l3, reference_integrated$seurat_clusters))
0 1 2 3 4 5 6
CD4 CTL 0 0 0 11 0 0 0
CD4 Naive 2036 9 1 0 20 1 5
CD4 Proliferating 5 0 0 0 0 7 0
CD4 TCM_1 3408 2545 506 74 371 48 201
CD4 TCM_2 1 314 3 26 17 102 13
CD4 TCM_3 8 1104 10 256 3 2 5
CD4 TEM_1 0 1 0 41 0 0 0
CD4 TEM_2 0 9 0 30 0 1 0
CD4 TEM_3 0 2 1 53 0 0 0
Treg Memory 9 14 1 0 1 158 9
Treg Naive 18 0 0 0 0 22 0
print(table(reference_integrated$singler.hpca, reference_integrated$seurat_clusters))
0 1 2 3 4 5 6
B_cell:Memory 1 0 0 0 0 0 0
Neutrophil:uropathogenic_E._coli_UTI89 0 0 0 0 1 0 0
NK_cell:CD56hiCD62L+ 0 0 0 1 0 0 0
T_cell:CD4+ 0 0 1 1 0 0 0
T_cell:CD4+_central_memory 460 2911 316 175 58 238 79
T_cell:CD4+_effector_memory 5 478 11 278 2 23 5
T_cell:CD4+_Naive 4996 578 186 5 315 75 134
T_cell:CD8+ 0 2 0 12 0 1 0
T_cell:CD8+_Central_memory 0 0 0 2 0 0 0
T_cell:CD8+_effector_memory 3 12 7 4 22 0 12
T_cell:CD8+_effector_memory_RA 0 0 0 1 0 0 1
T_cell:CD8+_naive 0 1 0 5 7 0 0
T_cell:gamma-delta 2 0 0 0 0 1 0
T_cell:Treg:Naive 2 1 0 0 0 2 1
print(table(reference_integrated$singler.immune, reference_integrated$seurat_clusters))
0 1 2 3 4 5 6
NK cells 0 1 0 10 1 0 1
T cells, CD4+, memory TREG 6 152 18 3 3 186 5
T cells, CD4+, naive 3827 66 48 0 121 2 100
T cells, CD4+, naive TREG 247 29 1 0 3 108 6
T cells, CD4+, naive, stimulated 22 16 5 0 85 0 7
T cells, CD4+, TFH 1181 1642 165 46 113 23 47
T cells, CD4+, Th1 18 174 18 284 5 11 5
T cells, CD4+, Th1_17 3 240 2 141 0 0 8
T cells, CD4+, Th17 11 992 82 3 10 7 16
T cells, CD4+, Th2 51 675 173 1 23 3 34
T cells, CD8+, naive 107 5 5 2 7 1 4
T cells, CD8+, naive, stimulated 5 3 5 1 39 0 0
FindAllMarkers + Filter
+ Save All Files
# ── Required libraries for FindAllMarkers + filtering + saving ──────────
library(Seurat) # FindAllMarkers, DoHeatmap, DimPlot
library(dplyr) # filter, group_by, arrange, slice_head, pull
library(ggplot2) # plotting
library(RColorBrewer) # heatmap color palettes
library(openxlsx) # write Excel workbook (multi-sheet .xlsx)
DefaultAssay(reference_integrated) <- "RNA"
reference_integrated <- JoinLayers(reference_integrated)
cat("✅ RNA layers joined\n")
✅ RNA layers joined
# Verify layers are now accessible
cat("RNA assay layers:\n")
RNA assay layers:
print(names(Layers(reference_integrated, assay = "RNA")))
NULL
# ── Now re-run FindAllMarkers ──────────────────────────────────────────
DefaultAssay(reference_integrated) <- "RNA"
Idents(reference_integrated) <- "seurat_clusters"
# ── FindAllMarkers on the CORRECT object ───────────────────────────────
ref_markers <- FindAllMarkers(
reference_integrated, # ← was All_samples_Merged (wrong object)
only.pos = TRUE,
min.pct = 0.25,
logfc.threshold = 0.25,
min.pct.diff = 0.20,
verbose = FALSE
)
cat("Total markers before filtering:", nrow(ref_markers), "\n")
Total markers before filtering: 5277
print(head(colnames(ref_markers))) # confirm 'gene' column exists
[1] "p_val" "avg_log2FC" "pct.1" "pct.2" "p_val_adj" "cluster"
# ── Fix: ensure gene is a proper column (Seurat sometimes stores as rownames)
if (!"gene" %in% colnames(ref_markers)) {
ref_markers$gene <- rownames(ref_markers)
cat("✅ gene column added from rownames\n")
}
# ── Blacklist ───────────────────────────────────────────────────────────
blacklist_patterns <- c(
"^TRAV", "^TRBV", "^TRGV", "^TRDV", "^TRBC", "^TRAC", "^TRDC", "^TRGC",
"^IGH", "^IGK", "^IGL", "^IGJ",
"^RPL", "^RPS",
"^MT-",
"^HBA", "^HBB", "^HB[ABZ]",
"^NEAT1$", "^MALAT1$",
"^XIST$"
)
blacklist_regex <- paste(blacklist_patterns, collapse = "|")
# Preview removed genes
to_remove <- ref_markers %>%
dplyr::filter(grepl(blacklist_regex, gene, ignore.case = TRUE))
message("Rows to remove: ", nrow(to_remove))
print(head(to_remove$gene))
[1] "RPS10" "NEAT1" "RPS6KA3" "RPS6KA5" "NEAT1" "RPS6KA3"
# Apply filter
ref_markers_filtered <- ref_markers %>%
dplyr::filter(!grepl(blacklist_regex, gene, ignore.case = TRUE))
cat("Markers after blacklist filter:", nrow(ref_markers_filtered), "\n")
Markers after blacklist filter: 5249
# ── Significance filter ─────────────────────────────────────────────────
ref_markers_sig <- ref_markers_filtered %>%
dplyr::filter(p_val_adj < 0.05)
cat("Significant markers (p_val_adj < 0.05):", nrow(ref_markers_sig), "\n")
Significant markers (p_val_adj < 0.05): 3643
# ── Top 25 per cluster sorted by avg_log2FC ─────────────────────────────
top25_markers <- ref_markers_sig %>%
dplyr::group_by(cluster) %>%
dplyr::arrange(dplyr::desc(avg_log2FC)) %>%
dplyr::slice_head(n = 25) %>%
dplyr::ungroup()
# ── Top 5 per cluster sorted by avg_log2FC ──────────────────────────────
top5_markers <- ref_markers_sig %>%
dplyr::group_by(cluster) %>%
dplyr::arrange(dplyr::desc(avg_log2FC)) %>%
dplyr::slice_head(n = 5) %>%
dplyr::ungroup()
cat("Top25 rows:", nrow(top25_markers), "\n")
Top25 rows: 175
cat("Top5 rows:", nrow(top5_markers), "\n")
Top5 rows: 35
print(top5_markers[, c("cluster","gene","avg_log2FC","pct.1","pct.2","p_val_adj")])
# A tibble: 35 × 6
cluster gene avg_log2FC pct.1 pct.2 p_val_adj
<fct> <chr> <dbl> <dbl> <dbl> <dbl>
1 0 AIF1 2.35 0.261 0.056 3.01e-202
2 0 TMIGD2 2.21 0.316 0.074 2.39e-239
3 0 ACTN1 1.54 0.441 0.169 4.15e-238
4 0 ADTRP 1.47 0.359 0.138 4.24e-168
5 0 LRRN3 1.37 0.262 0.097 1.05e-110
6 1 KLRB1 2.30 0.323 0.062 3.45e-288
7 1 AC006369.1 1.96 0.251 0.062 3.13e-175
8 1 MYO1F 1.94 0.296 0.072 6.41e-212
9 1 ANXA2 1.89 0.436 0.112 0
10 1 LGALS1 1.74 0.322 0.106 7.67e-181
# ℹ 25 more rows
# ℹ Use `print(n = ...)` to see more rows
# ── Save files ──────────────────────────────────────────────────────────
write.csv(ref_markers_filtered,
"CD4_reference_markers_all_filtered.csv", row.names = FALSE)
write.csv(top25_markers,
"CD4_reference_markers_top25_per_cluster.csv", row.names = FALSE)
write.csv(top5_markers,
"CD4_reference_markers_top5_per_cluster.csv", row.names = FALSE)
# Optional Excel workbook
library(openxlsx)
wb <- createWorkbook()
addWorksheet(wb, "All_Filtered")
addWorksheet(wb, "Top25_per_cluster")
addWorksheet(wb, "Top5_per_cluster")
writeData(wb, "All_Filtered", ref_markers_filtered)
writeData(wb, "Top25_per_cluster", top25_markers)
writeData(wb, "Top5_per_cluster", top5_markers)
saveWorkbook(wb, "CD4_reference_markers_summary.xlsx", overwrite = TRUE)
cat("✅ All marker files saved\n")
✅ All marker files saved
Add the filtered marker
results directly to your Seurat object
# ── After your existing FindAllMarkers + filtering code ─────────────────
# Add ALL filtered significant markers as metadata (1 row per cell)
# Create a cluster-to-markers mapping
marker_list_all <- ref_markers_sig %>%
group_by(cluster) %>%
arrange(desc(avg_log2FC)) %>%
slice_head(n = Inf) %>% # all sig markers
summarise(markers_all = paste(gene, collapse = "; "), .groups = "drop")
# Map to all cells
reference_integrated$markers_sig_all <- marker_list_all$markers_all[
match(reference_integrated$seurat_clusters, marker_list_all$cluster)
]
# ── TOP 25 per cluster ──────────────────────────────────────────────────
marker_list_top25 <- top25_markers %>%
group_by(cluster) %>%
arrange(desc(avg_log2FC)) %>%
slice_head(n = 25) %>%
summarise(markers_top25 = paste(gene, collapse = "; "), .groups = "drop")
reference_integrated$markers_top25 <- marker_list_top25$markers_top25[
match(reference_integrated$seurat_clusters, marker_list_top25$cluster)
]
# ── TOP 5 per cluster ───────────────────────────────────────────────────
marker_list_top5 <- top5_markers %>%
group_by(cluster) %>%
arrange(desc(avg_log2FC)) %>%
slice_head(n = 5) %>%
summarise(markers_top5 = paste(gene, collapse = "; "), .groups = "drop")
reference_integrated$markers_top5 <- marker_list_top5$markers_top5[
match(reference_integrated$seurat_clusters, marker_list_top5$cluster)
]
# ── Store full data.frames as object lists (for easy retrieval) ──────────
reference_integrated@misc$markers_all_filtered <- ref_markers_filtered
reference_integrated@misc$markers_top25 <- top25_markers
reference_integrated@misc$markers_top5 <- top5_markers
cat("✅ Marker lists added to object metadata:\n")
✅ Marker lists added to object metadata:
cat("- markers_sig_all (all sig markers per cluster)\n")
- markers_sig_all (all sig markers per cluster)
cat("- markers_top25 (top 25 per cluster)\n")
- markers_top25 (top 25 per cluster)
cat("- markers_top5 (top 5 per cluster)\n")
- markers_top5 (top 5 per cluster)
cat("- @misc$markers_* data.frames\n")
- @misc$markers_* data.frames
# ── Quick verification ──────────────────────────────────────────────────
head(reference_integrated@meta.data[, grepl("markers", colnames(reference_integrated@meta.data))])
table(nchar(reference_integrated$markers_top5) > 0) # should all be TRUE
TRUE
11482
Quick Visual Summary of
Top Markers(Heatmap)
# DoHeatmap of top 5 per cluster for quick biological validation
library(RColorBrewer)
top5_genes <- top5_markers %>%
group_by(cluster) %>%
slice_head(n = 5) %>%
pull(gene) %>%
unique()
DefaultAssay(reference_integrated) <- "RNA"
DoHeatmap(
reference_integrated,
features = top5_genes,
group.by = "seurat_clusters",
label = TRUE,
raster = FALSE
) +
scale_fill_gradientn(colors = rev(brewer.pal(11, "RdBu"))) +
ggtitle("Top 5 markers per cluster (res.0.3) — pre-annotation validation")

Quick Visual Summary of
Top Markers(Dotplot)
# ── SCpubr DotPlot for top 5 markers ───────────────────────────────────
DefaultAssay(reference_integrated) <- "RNA"
# Get top 5 genes per cluster (already computed)
top5_genes <- top5_markers %>%
dplyr::group_by(cluster) %>%
dplyr::slice_head(n = 5) %>%
dplyr::pull(gene) %>%
unique()
# ── SCpubr v1.x — minimal working syntax ───────────────────────────────
SCpubr::do_DotPlot(sample = reference_integrated,
features = top5_genes,
flip = TRUE,
)

Quick Visual Summary of
Top Markers(Dotplot)
# Check cluster 0 immediately
DefaultAssay(reference_integrated) <- "RNA"
# Myeloid vs T cell markers
VlnPlot(reference_integrated,
features = c("AIF1", "CD3D", "CD4", "CD14", "CD68", "LYZ"),
group.by = "seurat_clusters", pt.size = 0)

# Azimuth predicted labels in cluster 0
table(reference_integrated$predicted.celltype.l2[
reference_integrated$seurat_clusters == "0"])
CD4 Naive CD4 Proliferating CD4 TCM Treg
2030 5 3421 29
FeaturePlot to check
TEMRA
FeaturePlot(
object = reference_integrated,
features = c("PRF1", "NKG7", "FGFBP2", "GNLY"),
cols = c("lightgrey", "red"),label = T,
ncol = 2
)

R Annotation Code
# # COMPLETE CELL TYPE ANNOTATION CODE
#
# # Step 1: Ensure clusters are characters
# reference_integrated$seurat_clusters <- as.character(reference_integrated$seurat_clusters)
#
# # Step 2: Define named labels (exactly matches your 0-6 clusters)
# cluster_labels <- c(
# "0" = "CD4 Tnaive (CCR7+SELL+TCF7+)", # Canonical naive
# "1" = "CD4 TCM (CD161+/IL7R+)", # TCM, KLRB1/CD161+ Th17-memory
# "2" = "CD4 TCM (CCR4+/Th2-like)", # Activated TCM, CCR4+BATF+
# "3" = "CD4 CTL/Temra (GZMK+GZMA+CCL5+)", # Cytotoxic effector, Temra
# "4" = "CD4 TEM (NF-kB activated)", # Effector memory, stress-activated
# "5" = "CD4 Treg (FOXP3+Helios+CD25+)", # Natural tTreg, highest confidence
# "6" = "CD4 Tnaive-RTE (IGF1R+)" # Quiescent/RTE naive subset (Quiescent/RTE naive subset)
# )
#
# # Step 3: Create vector (you already verified this works)
# clusters_chr <- as.character(reference_integrated$seurat_clusters)
# cell_type_vec <- cluster_labels[clusters_chr]
#
# # Step 4: SAFE ASSIGNMENT (direct metadata write)
# reference_integrated@meta.data$cell_type <- cell_type_vec
#
# # Step 5: Force factor ordering 0→6 for plots
# reference_integrated@meta.data$cell_type <- factor(
# reference_integrated@meta.data$cell_type,
# levels = c("CD4 Tnaive (CCR7+SELL+TCF7+)",
# "CD4 TCM (CD161+/IL7R+)",
# "CD4 TCM (CCR4+/Th2-like)",
# "CD4 CTL/Temra (GZMK+GZMA+CCL5+)",
# "CD4 TEM (NF-kB activated)",
# "CD4 Treg (FOXP3+Helios+CD25+)",
# "CD4 Tnaive-RTE (IGF1R+)")
# )
# Step 6: VALIDATE
cat("✅ Assignment complete!\n")
✅ Assignment complete!
print(table(reference_integrated$seurat_clusters, reference_integrated@meta.data$cell_type))
CD4 Tnaive (CCR7+SELL+TCF7+) CD4 TCM (CD161+/IL7R+) CD4 TCM (CCR4+/Th2-like) CD4 CTL/Temra (GZMK+GZMA+CCL5+)
0 5485 0 0 0
1 0 3998 0 0
2 0 0 522 0
3 0 0 0 491
4 0 0 0 0
5 0 0 0 0
6 0 0 0 0
CD4 TEM (NF-kB activated) CD4 Treg (FOXP3+Helios+CD25+) CD4 Tnaive-RTE (IGF1R+)
0 0 0 0
1 0 0 0
2 0 0 0
3 0 0 0
4 412 0 0
5 0 341 0
6 0 0 233
print(table(reference_integrated@meta.data$cell_type))
CD4 Tnaive (CCR7+SELL+TCF7+) CD4 TCM (CD161+/IL7R+) CD4 TCM (CCR4+/Th2-like)
5485 3998 522
CD4 CTL/Temra (GZMK+GZMA+CCL5+) CD4 TEM (NF-kB activated) CD4 Treg (FOXP3+Helios+CD25+)
491 412 341
CD4 Tnaive-RTE (IGF1R+)
233
# Step 7: VISUALIZE
p1 <- DimPlot(reference_integrated, group.by = "seurat_clusters",
label = TRUE, label.size = 3) + ggtitle("Clusters 0-6")
p2 <- DimPlot(reference_integrated, group.by = "cell_type",
label = TRUE, repel = TRUE, label.size = 3) + ggtitle("Cell Types")
p1 | p2

# Step 8: Cross-check with Azimuth
table(reference_integrated$predicted.celltype.l2,
reference_integrated@meta.data$cell_type)
CD4 Tnaive (CCR7+SELL+TCF7+) CD4 TCM (CD161+/IL7R+) CD4 TCM (CCR4+/Th2-like)
CD4 CTL 0 0 0
CD4 Naive 2030 7 1
CD4 Proliferating 5 0 0
CD4 TCM 3421 3972 519
CD4 TEM 0 8 1
Treg 29 11 1
CD4 CTL/Temra (GZMK+GZMA+CCL5+) CD4 TEM (NF-kB activated) CD4 Treg (FOXP3+Helios+CD25+)
CD4 CTL 11 0 0
CD4 Naive 0 20 1
CD4 Proliferating 0 0 7
CD4 TCM 346 391 154
CD4 TEM 134 0 1
Treg 0 1 178
CD4 Tnaive-RTE (IGF1R+)
CD4 CTL 0
CD4 Naive 5
CD4 Proliferating 0
CD4 TCM 222
CD4 TEM 0
Treg 6
Save annotated
reference
# Save annotated reference
saveRDS(reference_integrated, "CD4_reference_annotated_with_markers.rds")
cat("✅ Annotated reference with markers saved!\n")
✅ Annotated reference with markers saved!
---
title: "Remove cluster 12 cells — Clean Re-Integration"
author: Nasir Mahmood Abbasi
date: "`r Sys.Date()`"
output:
  html_notebook:
    number_sections: true
    toc: true
    toc_float:
      collapsed: true
    theme: journal
---


# Load libraries
```{r, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE,
                      fig.width = 12, fig.height = 8)
library(Seurat)
library(monocle3)
library(SeuratWrappers)
library(patchwork)
library(ggplot2)
library(RColorBrewer)
options(future.globals.maxSize = 8e9)
set.seed(123)

# ── Define is_hi() ONCE here so it is available in ALL downstream chunks ──
is_hi <- function(g, zmat, z = 0.5) {
  if (!g %in% rownames(zmat)) return(rep(FALSE, ncol(zmat)))
  zmat[g, ] > z
}
```

# Load Annotated Healthy Integrated Object
```{r load object}
reference_integrated<- readRDS("CD4_reference_annotated.rds")

DefaultAssay(reference_integrated) <- "RNA"

reference_integrated
```


## Clustree for Resolution Selection (Recommended)
```{r , fig.width=8, fig.height=6}
ElbowPlot(reference_integrated, ndims = 50)

# ✅ Clustree — visualize cluster stability across resolutions
library(clustree)

clustree(reference_integrated, prefix = "integrated_snn_res.") +
  ggtitle("Cluster stability across resolutions") +
  theme(legend.position = "right")

```

## Silhouette Score (Quantitative Optimum)
```{r , fig.width=8, fig.height=6}
library(cluster)

# Extract PCA embeddings used for clustering
pca_embeddings <- Embeddings(reference_integrated, "pca")[, 1:20]
dist_matrix    <- dist(pca_embeddings)

res_cols <- grep("integrated_snn_res", 
                  colnames(reference_integrated@meta.data), value = TRUE)

sil_scores <- sapply(res_cols, function(res) {
  clusters <- as.integer(reference_integrated@meta.data[[res]])
  # Need at least 2 clusters
  if (length(unique(clusters)) < 2) return(NA)
  sil <- silhouette(clusters, dist_matrix)
  mean(sil[, 3])  # mean silhouette width
})

# Print ranked results
sil_df <- data.frame(
  resolution      = res_cols,
  n_clusters      = sapply(res_cols, function(r) 
                     length(unique(reference_integrated@meta.data[[r]]))),
  mean_silhouette = round(sil_scores, 4)
)
print(sil_df[order(-sil_df$mean_silhouette), ])

# Plot
ggplot(sil_df, aes(x = resolution, y = mean_silhouette)) +
  geom_col(fill = "steelblue") +
  geom_text(aes(label = n_clusters), vjust = -0.5, size = 3) +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(title = "Mean Silhouette Score per Resolution",
       subtitle = "Numbers = cluster count | Higher = better separation",
       x = "Resolution", y = "Mean Silhouette Width")

```


## Azimuth label concordance
```{r, fig.width=8, fig.height=6}
concordance <- sapply(res_cols, function(res) {
  tbl    <- table(reference_integrated@meta.data[[res]],
                   reference_integrated$predicted.celltype.l2)
  purity <- apply(tbl, 1, function(x) max(x) / sum(x))
  mean(purity)
})

conc_df <- data.frame(
  resolution = gsub("integrated_snn_res.", "res.", res_cols),
  n_clusters = sapply(res_cols, function(r)
                length(unique(reference_integrated@meta.data[[r]]))),
  az_purity  = round(concordance, 4)
)
cat("Azimuth concordance ranked:\n")
print(conc_df[order(-conc_df$az_purity), ])

ggplot(conc_df, aes(x = resolution, y = az_purity)) +
  geom_col(fill = "#2196F3") +
  geom_text(aes(label = n_clusters), vjust = -0.5, size = 3.5) +
  ylim(0, 1) +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(title = "Azimuth Label Purity per Resolution",
       subtitle = "Numbers = cluster count | Higher = cleaner biological mapping",
       x = "Resolution", y = "Mean Cluster Purity")
```

## DimPlot grid — visual comparison
```{r , fig.width=14, fig.height=10}
res_cols <- res_cols[as.numeric(gsub("integrated_snn_res\\.", "", res_cols)) >= 0.1 & 
                     as.numeric(gsub("integrated_snn_res\\.", "", res_cols)) <= 1]

plots <- lapply(res_cols, function(res) {
  n <- length(unique(reference_integrated@meta.data[[res]]))
  DimPlot(reference_integrated, group.by = res,
          label = TRUE, label.size = 2.5, pt.size = 0.1) +
  ggtitle(paste0("res=", gsub("integrated_snn_res.", "", res),
                 " (", n, " clusters)")) +
  NoLegend() +
  theme(plot.title = element_text(size = 9))
})
wrap_plots(plots, ncol = 3) +
plot_annotation(title = "UMAP across all tested resolutions")
```


## SET FINAL RESOLUTION
```{r , fig.width=14, fig.height=10}
best_res <- "integrated_snn_res.0.3"

Idents(reference_integrated)         <- best_res
reference_integrated$seurat_clusters <- reference_integrated@meta.data[[best_res]]

cat("✅ Final resolution: res.0.3\n")
cat("Clusters:", length(unique(reference_integrated$seurat_clusters)), "\n")
print(table(reference_integrated$seurat_clusters))
```

##  Validation plots: dataset mixing and initial labels
```{r validate, fig.width=14, fig.height=10}
# Validation plots
p1 <- DimPlot(reference_integrated, group.by = "predicted.celltype.l2",
              label = TRUE, repel = TRUE, label.size = 3) + NoLegend()
p2 <- DimPlot(reference_integrated, group.by = "dataset")
p3 <- DimPlot(reference_integrated, group.by = "seurat_clusters", label = TRUE,label.box = T, repel = TRUE) 
(p1 | p2) / p3

DimPlot(reference_integrated, group.by = "seurat_clusters", label = TRUE,label.box = T, repel = TRUE) 

```

## Check FOXP3 Check FOXP3 in Cluster 5
```{r , fig.width=28, fig.height=16}
# Check FOXP3 in both locations of cluster 6
FeaturePlot(reference_integrated, 
            features = c("FOXP3", "IL2RA", "CTLA4", "TIGIT"),
            split.by = "seurat_clusters", ncol = 4) +
  plot_annotation(title = "Treg markers — both locations")

treg_cells <- subset(reference_integrated, subset = seurat_clusters == "5")

VlnPlot(treg_cells,
features = c("FOXP3", "IL2RA", "CTLA4", "HLA-DRB1"),
group.by = "seurat_clusters", pt.size = 0)

```

## Validation — Confirm Both Are Treg
```{r treg-validation, fig.width=8, fig.height=6}
# ── Fix: use RNA assay normalized expression ────────────────────────────
DefaultAssay(reference_integrated) <- "RNA"

# Get CTLA4 normalized expression (data layer)
ctla4_expr <- GetAssayData(reference_integrated, 
                           assay = "RNA", 
                           layer = "data")["CTLA4", ]

# Split cluster 6 by CTLA4 high/low
reference_integrated$treg_subset <- ifelse(
  reference_integrated$seurat_clusters == "6",
  ifelse(ctla4_expr > 0, "Treg_CTLA4+", "Treg_CTLA4-"),
  "non-Treg"
)

table(reference_integrated$treg_subset)


reference_integrated$treg_subset <- ifelse(
  reference_integrated$seurat_clusters == "6",
  ifelse(reference_integrated$Treg1 > 0.2, "Treg activated", "Treg naive"),
  "non-Treg"
)

# Or check if TIGIT exists too
if ("TIGIT" %in% rownames(reference_integrated)) {
  tigit_expr <- GetAssayData(reference_integrated, assay = "RNA", layer = "data")["TIGIT", ]
  reference_integrated$treg_subset <- ifelse(
    reference_integrated$seurat_clusters == "6",
    ifelse(ctla4_expr > 0 & tigit_expr > 0, "Treg effector", "Treg resting"),
    "non-Treg"
  )
}

DimPlot(reference_integrated, group.by = "treg_subset", label = TRUE) +
  ggtitle("Cluster 6 Treg heterogeneity by CTLA4/TIGIT")


```
## Azimuth Label Validation
```{r , fig.width=8, fig.height=6}
DimPlot(reference_integrated, group.by = "predicted.celltype.l1",
        label = TRUE, repel = TRUE, label.size = 3) + NoLegend()
DimPlot(reference_integrated, group.by = "predicted.celltype.l2",
        label = TRUE, repel = TRUE, label.size = 3) + NoLegend()
DimPlot(reference_integrated, group.by = "predicted.celltype.l3",
        label = TRUE, repel = TRUE, label.size = 3) + NoLegend()

DimPlot(reference_integrated, group.by = "singler.hpca",
        label = TRUE, repel = TRUE, label.size = 3) + NoLegend()

DimPlot(reference_integrated, group.by = "singler.immune",
        label = TRUE, repel = TRUE, label.size = 3) + NoLegend()

print(table(reference_integrated$predicted.celltype.l1, reference_integrated$seurat_clusters))
print(table(reference_integrated$predicted.celltype.l2, reference_integrated$seurat_clusters))
print(table(reference_integrated$predicted.celltype.l3, reference_integrated$seurat_clusters))

print(table(reference_integrated$singler.hpca, reference_integrated$seurat_clusters))
print(table(reference_integrated$singler.immune, reference_integrated$seurat_clusters))
```


# FindAllMarkers + Filter + Save All Files
```{r }
# ── Required libraries for FindAllMarkers + filtering + saving ──────────
library(Seurat)       # FindAllMarkers, DoHeatmap, DimPlot
library(dplyr)        # filter, group_by, arrange, slice_head, pull
library(ggplot2)      # plotting
library(RColorBrewer) # heatmap color palettes
library(openxlsx)     # write Excel workbook (multi-sheet .xlsx)

DefaultAssay(reference_integrated) <- "RNA"
reference_integrated <- JoinLayers(reference_integrated)
cat("✅ RNA layers joined\n")

# Verify layers are now accessible
cat("RNA assay layers:\n")
print(names(Layers(reference_integrated, assay = "RNA")))

# ── Now re-run FindAllMarkers ──────────────────────────────────────────
DefaultAssay(reference_integrated) <- "RNA"
Idents(reference_integrated) <- "seurat_clusters"

# ── FindAllMarkers on the CORRECT object ───────────────────────────────
ref_markers <- FindAllMarkers(
  reference_integrated,          # ← was All_samples_Merged (wrong object)
  only.pos        = TRUE,
  min.pct         = 0.25,
  logfc.threshold = 0.25,
  min.pct.diff    = 0.20,
  verbose         = FALSE
)

cat("Total markers before filtering:", nrow(ref_markers), "\n")
print(head(colnames(ref_markers)))  # confirm 'gene' column exists

# ── Fix: ensure gene is a proper column (Seurat sometimes stores as rownames)
if (!"gene" %in% colnames(ref_markers)) {
  ref_markers$gene <- rownames(ref_markers)
  cat("✅ gene column added from rownames\n")
}

# ── Blacklist ───────────────────────────────────────────────────────────
blacklist_patterns <- c(
  "^TRAV", "^TRBV", "^TRGV", "^TRDV", "^TRBC", "^TRAC", "^TRDC", "^TRGC",
  "^IGH",  "^IGK",  "^IGL",  "^IGJ",
  "^RPL",  "^RPS",
  "^MT-",
  "^HBA",  "^HBB",  "^HB[ABZ]",
  "^NEAT1$", "^MALAT1$",
  "^XIST$"
)
blacklist_regex <- paste(blacklist_patterns, collapse = "|")

# Preview removed genes
to_remove <- ref_markers %>%
  dplyr::filter(grepl(blacklist_regex, gene, ignore.case = TRUE))
message("Rows to remove: ", nrow(to_remove))
print(head(to_remove$gene))

# Apply filter
ref_markers_filtered <- ref_markers %>%
  dplyr::filter(!grepl(blacklist_regex, gene, ignore.case = TRUE))
cat("Markers after blacklist filter:", nrow(ref_markers_filtered), "\n")

# ── Significance filter ─────────────────────────────────────────────────
ref_markers_sig <- ref_markers_filtered %>%
  dplyr::filter(p_val_adj < 0.05)
cat("Significant markers (p_val_adj < 0.05):", nrow(ref_markers_sig), "\n")

# ── Top 25 per cluster sorted by avg_log2FC ─────────────────────────────
top25_markers <- ref_markers_sig %>%
  dplyr::group_by(cluster) %>%
  dplyr::arrange(dplyr::desc(avg_log2FC)) %>%
  dplyr::slice_head(n = 25) %>%
  dplyr::ungroup()

# ── Top 5 per cluster sorted by avg_log2FC ──────────────────────────────
top5_markers <- ref_markers_sig %>%
  dplyr::group_by(cluster) %>%
  dplyr::arrange(dplyr::desc(avg_log2FC)) %>%
  dplyr::slice_head(n = 5) %>%
  dplyr::ungroup()

cat("Top25 rows:", nrow(top25_markers), "\n")
cat("Top5 rows:",  nrow(top5_markers),  "\n")
print(top5_markers[, c("cluster","gene","avg_log2FC","pct.1","pct.2","p_val_adj")])

# ── Save files ──────────────────────────────────────────────────────────
write.csv(ref_markers_filtered,
          "CD4_reference_markers_all_filtered.csv",     row.names = FALSE)
write.csv(top25_markers,
          "CD4_reference_markers_top25_per_cluster.csv", row.names = FALSE)
write.csv(top5_markers,
          "CD4_reference_markers_top5_per_cluster.csv",  row.names = FALSE)

# Optional Excel workbook
library(openxlsx)
wb <- createWorkbook()
addWorksheet(wb, "All_Filtered")
addWorksheet(wb, "Top25_per_cluster")
addWorksheet(wb, "Top5_per_cluster")
writeData(wb, "All_Filtered",      ref_markers_filtered)
writeData(wb, "Top25_per_cluster", top25_markers)
writeData(wb, "Top5_per_cluster",  top5_markers)
saveWorkbook(wb, "CD4_reference_markers_summary.xlsx", overwrite = TRUE)

cat("✅ All marker files saved\n")

```
# Add the filtered marker results directly to your Seurat object 
```{r }
# ── After your existing FindAllMarkers + filtering code ─────────────────

# Add ALL filtered significant markers as metadata (1 row per cell)
# Create a cluster-to-markers mapping
marker_list_all <- ref_markers_sig %>%
  group_by(cluster) %>%
  arrange(desc(avg_log2FC)) %>%
  slice_head(n = Inf) %>%  # all sig markers
  summarise(markers_all = paste(gene, collapse = "; "), .groups = "drop")

# Map to all cells
reference_integrated$markers_sig_all <- marker_list_all$markers_all[
  match(reference_integrated$seurat_clusters, marker_list_all$cluster)
]

# ── TOP 25 per cluster ──────────────────────────────────────────────────
marker_list_top25 <- top25_markers %>%
  group_by(cluster) %>%
  arrange(desc(avg_log2FC)) %>%
  slice_head(n = 25) %>%
  summarise(markers_top25 = paste(gene, collapse = "; "), .groups = "drop")

reference_integrated$markers_top25 <- marker_list_top25$markers_top25[
  match(reference_integrated$seurat_clusters, marker_list_top25$cluster)
]

# ── TOP 5 per cluster ───────────────────────────────────────────────────
marker_list_top5 <- top5_markers %>%
  group_by(cluster) %>%
  arrange(desc(avg_log2FC)) %>%
  slice_head(n = 5) %>%
  summarise(markers_top5 = paste(gene, collapse = "; "), .groups = "drop")

reference_integrated$markers_top5 <- marker_list_top5$markers_top5[
  match(reference_integrated$seurat_clusters, marker_list_top5$cluster)
]

# ── Store full data.frames as object lists (for easy retrieval) ──────────
reference_integrated@misc$markers_all_filtered <- ref_markers_filtered
reference_integrated@misc$markers_top25 <- top25_markers
reference_integrated@misc$markers_top5 <- top5_markers

cat("✅ Marker lists added to object metadata:\n")
cat("- markers_sig_all (all sig markers per cluster)\n")
cat("- markers_top25 (top 25 per cluster)\n")
cat("- markers_top5 (top 5 per cluster)\n")
cat("- @misc$markers_* data.frames\n")

# ── Quick verification ──────────────────────────────────────────────────
head(reference_integrated@meta.data[, grepl("markers", colnames(reference_integrated@meta.data))])
table(nchar(reference_integrated$markers_top5) > 0)  # should all be TRUE

```

# Quick Visual Summary of Top Markers(Heatmap)
```{r, fig.width=14, fig.height=8}
# DoHeatmap of top 5 per cluster for quick biological validation
library(RColorBrewer)

top5_genes <- top5_markers %>%
  group_by(cluster) %>%
  slice_head(n = 5) %>%
  pull(gene) %>%
  unique()

DefaultAssay(reference_integrated) <- "RNA"
DoHeatmap(
  reference_integrated,
  features = top5_genes,
  group.by = "seurat_clusters",
  label    = TRUE,
  raster   = FALSE
) +
  scale_fill_gradientn(colors = rev(brewer.pal(11, "RdBu"))) +
  ggtitle("Top 5 markers per cluster (res.0.3) — pre-annotation validation")

```



# Quick Visual Summary of Top Markers(Dotplot)
```{r, fig.width=10, fig.height=8}
# ── SCpubr DotPlot for top 5 markers ───────────────────────────────────
DefaultAssay(reference_integrated) <- "RNA"

# Get top 5 genes per cluster (already computed)
top5_genes <- top5_markers %>%
  dplyr::group_by(cluster) %>%
  dplyr::slice_head(n = 5) %>%
  dplyr::pull(gene) %>%
  unique()

# ── SCpubr v1.x — minimal working syntax ───────────────────────────────

SCpubr::do_DotPlot(sample = reference_integrated, 
                         features = top5_genes, 
                         flip = TRUE,
                         )

```

# Quick Visual Summary of Top Markers(Dotplot)
```{r, fig.width=14, fig.height=12}
# Check cluster 0 immediately
DefaultAssay(reference_integrated) <- "RNA"

# Myeloid vs T cell markers
VlnPlot(reference_integrated, 
        features = c("AIF1", "CD3D", "CD4", "CD14", "CD68", "LYZ"),
        group.by = "seurat_clusters", pt.size = 0)

# Azimuth predicted labels in cluster 0
table(reference_integrated$predicted.celltype.l2[
  reference_integrated$seurat_clusters == "0"])

```

# FeaturePlot to check TEMRA
```{r, fig.width=14, fig.height=12}
FeaturePlot(
  object = reference_integrated,
  features = c("PRF1", "NKG7", "FGFBP2", "GNLY"),
  cols = c("lightgrey", "red"),label = T,
  ncol = 2
)

``` 

# R Annotation Code
```{r, fig.width=14, fig.height=5}
# # COMPLETE CELL TYPE ANNOTATION CODE
# 
# # Step 1: Ensure clusters are characters
# reference_integrated$seurat_clusters <- as.character(reference_integrated$seurat_clusters)
# 
# # Step 2: Define named labels (exactly matches your 0-6 clusters)
# cluster_labels <- c(
#   "0" = "CD4 Tnaive (CCR7+SELL+TCF7+)",         # Canonical naive
#   "1" = "CD4 TCM (CD161+/IL7R+)",               # TCM, KLRB1/CD161+ Th17-memory
#   "2" = "CD4 TCM (CCR4+/Th2-like)",             # Activated TCM, CCR4+BATF+
#   "3" = "CD4 CTL/Temra (GZMK+GZMA+CCL5+)",     # Cytotoxic effector, Temra
#   "4" = "CD4 TEM (NF-kB activated)",            # Effector memory, stress-activated
#   "5" = "CD4 Treg (FOXP3+Helios+CD25+)",        # Natural tTreg, highest confidence
#   "6" = "CD4 Tnaive-RTE (IGF1R+)"              # Quiescent/RTE naive subset (Quiescent/RTE naive subset)
# )
# 
# # Step 3: Create vector (you already verified this works)
# clusters_chr <- as.character(reference_integrated$seurat_clusters)
# cell_type_vec <- cluster_labels[clusters_chr]
# 
# # Step 4: SAFE ASSIGNMENT (direct metadata write)
# reference_integrated@meta.data$cell_type <- cell_type_vec
# 
# # Step 5: Force factor ordering 0→6 for plots
# reference_integrated@meta.data$cell_type <- factor(
#   reference_integrated@meta.data$cell_type,
#   levels = c("CD4 Tnaive (CCR7+SELL+TCF7+)",
#              "CD4 TCM (CD161+/IL7R+)",
#              "CD4 TCM (CCR4+/Th2-like)",
#              "CD4 CTL/Temra (GZMK+GZMA+CCL5+)",
#              "CD4 TEM (NF-kB activated)",
#              "CD4 Treg (FOXP3+Helios+CD25+)",
#              "CD4 Tnaive-RTE (IGF1R+)")
# )

# Step 6: VALIDATE
cat("✅ Assignment complete!\n")
print(table(reference_integrated$seurat_clusters, reference_integrated@meta.data$cell_type))
print(table(reference_integrated@meta.data$cell_type))

# Step 7: VISUALIZE
p1 <- DimPlot(reference_integrated, group.by = "seurat_clusters", 
              label = TRUE, label.size = 3) + ggtitle("Clusters 0-6")
p2 <- DimPlot(reference_integrated, group.by = "cell_type", 
              label = TRUE, repel = TRUE, label.size = 3) + ggtitle("Cell Types")

p1 | p2

# Step 8: Cross-check with Azimuth
table(reference_integrated$predicted.celltype.l2, 
      reference_integrated@meta.data$cell_type)
```

# Save annotated reference
```{r}
# Save annotated reference
saveRDS(reference_integrated, "CD4_reference_annotated_with_markers.rds")
cat("✅ Annotated reference  with markers saved!\n")

``` 
