1. load libraries
2. Read obj
# Load your object
All_sample_Merged <- readRDS("../All_samples_Merged_with_STCAT_and_renamed_FINAL.rds")
# Check unique cell_line values
unique(All_sample_Merged$cell_line)
[1] L1 L2 L3 L4 L5 L6 L7 CD4Tcells_lab
[9] CD4Tcells_10x
Levels: L1 L2 L3 L4 L5 L6 L7 CD4Tcells_lab CD4Tcells_10x
# [Expect something like: "L1","L2","L3","L4","L5","L6","L7","CD4Tcells_lab","CD4Tcells_10x"]
# Subset for malignant cell lines only
malignant_lines <- c("L1","L2","L3","L4","L5","L6","L7")
Malignant_cells <- subset(All_sample_Merged, subset = cell_line %in% malignant_lines)
# Confirm
table(Malignant_cells$cell_line)
L1 L2 L3 L4 L5 L6 L7
5825 5935 6428 6006 6022 5148 5331
3. Check all coulumns
# Adjust these if your object or column names differ
seurat_obj <- Malignant_cells
cluster_col <- "seurat_clusters"
azimuth_col <- "predicted.celltype.l2" # as you specified
stcat_col <- "Prediction" # STCAT automatic annotation column
# ADT assay common names we'll try to detect
adt_assay_candidates <- c("ADT", "AntibodyCapture", "Antibody", "CITE", "CITEseq")
cat("Seurat object present in workspace? ", exists("All_sample_Merged"), "\n")
Seurat object present in workspace? TRUE
4. Detect ADT assay & list ADT features
# detect ADT assay
assays_present <- Assays(seurat_obj)
adt_assay <- intersect(adt_assay_candidates, assays_present)
if (length(adt_assay) == 0) {
adt_assay <- NA
warning("No common ADT assay name found. Available assays: ", paste(assays_present, collapse = ", "))
} else {
adt_assay <- adt_assay[1]
}
cat("ADT assay detected: ", adt_assay, "\n")
ADT assay detected: ADT
if (!is.na(adt_assay)) {
# list features
adt_features <- rownames(GetAssayData(seurat_obj, assay = adt_assay, slot = "counts"))
cat("Number of ADT features: ", length(adt_features), "\n")
print(adt_features)
} else {
adt_features <- character(0)
}
Number of ADT features: 28
[1] "CD274" "CD30" "CD40" "CD3" "CD45RA" "CD7" "CCR4" "CD4" "CD25" "CD45RO" "PD1" "CD44"
[13] "CD5" "CXCR3" "CCR6" "CD62L" "CCR7" "CD95" "TCRab" "CXCR4" "CD2" "CD28" "CD127" "CD45"
[25] "CD26" "CCR10" "CCR8" "CD19"
5. Compare your ADT features against a canonical T-cell ADT
list
canonical_adt <- c("CD3", "CD4", "CD8", "CD45RA", "CD45RO", "CD62L", "CCR7",
"CD27", "CD28", "CD25", "CD127", "PD1", "PDCD1", "CTLA4", "CD69",
"HLA-DR", "CD38", "ICOS", "CXCR5", "CCR4", "CCR6", "CD45",
"CD7", "CD2", "CD26", "CD5", "TIM3", "LAG3", "CD16", "CD19")
present_in_panel <- intersect(canonical_adt, adt_features)
missing_from_panel <- setdiff(canonical_adt, adt_features)
cat("Canonical ADT markers present in your panel (intersection):\n")
Canonical ADT markers present in your panel (intersection):
print(present_in_panel)
[1] "CD3" "CD4" "CD45RA" "CD45RO" "CD62L" "CCR7" "CD28" "CD25" "CD127" "PD1" "CCR4" "CCR6"
[13] "CD45" "CD7" "CD2" "CD26" "CD5" "CD19"
cat("\nCanonical ADT markers NOT present / differently named: (inspect adt features manually)\n")
Canonical ADT markers NOT present / differently named: (inspect adt features manually)
print(missing_from_panel)
[1] "CD8" "CD27" "PDCD1" "CTLA4" "CD69" "HLA-DR" "CD38" "ICOS" "CXCR5" "TIM3" "LAG3" "CD16"
Interpretation notes
- If
CD45RA
/ CD45RO
are present you can
separate naïve vs memory by ADT. CD25
and
CD127
help to define Tregs.
- Panel naming can vary (e.g.
PDCD1
vs PD-1
,
TIM3
vs Tim-3
) — if names differ, map them
manually.
6. Validate automatic labels (Azimuth & STCAT) using RNA marker
genes (cluster-level)
# Canonical RNA signatures for CD4+ T subsets (expand as needed)
signatures <- list(
Tn = c("CCR7","SELL","LEF1","TCF7"),
Tcm = c("CCR7","SELL","IL7R","CD27"),
Tem = c("GZMK","PRF1","IFNG","KLRB1"),
Treg = c("FOXP3","IL2RA","CTLA4","IKZF2"),
Tex = c("PDCD1","CTLA4","LAG3","TOX","HAVCR2"),
Temra = c("GZMB","PRF1","KLRG1","CX3CR1","IFNG"),
Th1 = c("TBX21","IFNG","CXCR3"),
Th2 = c("GATA3","IL4","IL5","IL13"),
Th17 = c("RORC","IL17A","IL17F","CCR6"),
Tfh = c("CXCR5","BCL6","PDCD1")
)
# flatten and check presence
all_sig_genes <- unique(unlist(signatures))
genes_present <- intersect(all_sig_genes, rownames(seurat_obj))
cat("Signature genes present in your RNA data:", length(genes_present), "of", length(all_sig_genes), "\n")
Signature genes present in your RNA data: 33 of 33
# Detect Azimuth/STCAT columns presence
cat('Azimuth column present?:', azimuth_col %in% colnames(seurat_obj@meta.data), '\n')
Azimuth column present?: TRUE
cat('STCAT Prediction column present?:', stcat_col %in% colnames(seurat_obj@meta.data), '\n')
STCAT Prediction column present?: TRUE
# compute average expression per cluster for signature genes
features_to_use <- intersect(all_sig_genes, rownames(seurat_obj))
if (length(features_to_use) == 0) stop("No canonical signature genes found in the RNA assay. Check gene names / expression matrix.")
avg_rna_by_cluster <- AverageExpression(seurat_obj, assays = "RNA", features = features_to_use,
slot = "data", group.by = cluster_col)$RNA
# z-score rows for heatmap visualization
zmat <- t(scale(t(as.matrix(avg_rna_by_cluster) + 1e-6)))
pheatmap(zmat, cluster_rows = TRUE, cluster_cols = TRUE, main = "Cluster x RNA marker (z-scaled average)")

# DotPlot of canonical RNA markers by cluster
DotPlot(seurat_obj, features = features_to_use, group.by = cluster_col) + RotatedAxis()+ scale_color_gradient2(low = "lightyellow", mid = "red", high = "firebrick", midpoint = 1) + ggtitle("RNA canonical markers by cluster")

Interpretation notes
- Clusters high for
FOXP3
+ IL2RA
likely
represent Tregs.
CCR7
+ SELL
mark Naïve/Tcm; verify with
other genes (TCF7/LEF1).
GZMK
/PRF1
/IFNG
suggest
effector/Tem-like cells.
- Use DotPlots and the heatmap to map clusters to candidate
labels.
7. ADT validation: ensure normalization and compute cluster-level
ADT averages
# Check if ADT assay exists
adt_assay <- intersect(adt_assay_candidates, Assays(seurat_obj))
if (length(adt_assay) == 0) {
message("No ADT assay detected; skipping ADT validation.")
} else {
adt_assay <- adt_assay[1] # use the first match
DefaultAssay(seurat_obj) <- adt_assay
# Check slots
adt_slots <- slotNames(seurat_obj[[adt_assay]])
cat("ADT assay slots:", paste(adt_slots, collapse = ","), "\n")
# Extract ADT data
adt_data <- GetAssayData(seurat_obj, assay = adt_assay, slot = "data")
# Replace NA/Inf with 0
adt_data[is.na(adt_data)] <- 0
adt_data[is.infinite(adt_data)] <- 0
seurat_obj[[adt_assay]]@data <- adt_data
# CLR normalization if data looks unnormalized (optional check)
if (max(adt_data, na.rm = TRUE) > 50 || min(adt_data, na.rm = TRUE) < -10) {
cat("Running CLR normalization and scaling ADT.\n")
seurat_obj <- NormalizeData(seurat_obj, assay = adt_assay, normalization.method = "CLR")
seurat_obj <- ScaleData(seurat_obj, assay = adt_assay)
} else {
cat("ADT assay assumed normalized (CLR/modal).\n")
}
# Compute cluster-level average ADT values
avg_adt_by_cluster <- AverageExpression(
seurat_obj,
assays = adt_assay,
features = intersect(canonical_adt, adt_features),
slot = "data",
group.by = cluster_col
)[[adt_assay]]
# z-score across clusters and plot heatmap
z_adt <- t(scale(t(as.matrix(avg_adt_by_cluster) + 1e-6)))
pheatmap(z_adt, main = "ADT markers (z-scored avg per cluster)")
# Violin plots for key ADTs (subset to avoid overcrowding)
adt_to_plot <- intersect(c("CD4","CD8","CD45RA","CD45RO","CD25","CD127","PD1","CXCR5"), adt_features)
if (length(adt_to_plot) > 0) {
print(
VlnPlot(
seurat_obj,
features = adt_to_plot,
assay = adt_assay,
slot = "data",
group.by = cluster_col,
pt.size = 0.05,
ncol = 3
)
)
}
# Reset default assay
DefaultAssay(seurat_obj) <- "RNA"
}
ADT assay slots: counts,data,scale.data,assay.orig,var.features,meta.features,misc,key
ADT assay assumed normalized (CLR/modal).


Interpretation notes
- ADT
CD4
high + CD8
low confirms CD4 T
identity per cluster (useful QC).
CD45RA
vs CD45RO
separates Naïve vs
Memory.
CD25
high and CD127
low (by ADT) supports
Treg calls.
- PD-1/ICOS/LAG3 ADT elevation suggests activation/exhaustion
phenotypes.
8. Module scoring (per-cell) for signatures and cluster-level
summarization
# Set SCT assay
DefaultAssay(seurat_obj) <- "SCT"
# Ensure signature genes exist in the dataset
sigs_present <- lapply(signatures, function(g) intersect(g, rownames(seurat_obj)))
sigs_present <- sigs_present[sapply(sigs_present, length) > 0]
if (length(sigs_present) == 0) {
stop("No signature genes present in SCT assay for module scoring. Adjust gene names.")
}
# Keep signatures with >=3 genes and non-zero variance
sigs_present <- lapply(sigs_present, function(g) {
if(length(g) < 3) return(NULL)
expr_mat <- GetAssayData(seurat_obj, assay = "SCT", slot = "data")[g, , drop = FALSE]
if(all(rowSums(expr_mat) == 0)) return(NULL)
return(g)
})
sigs_present <- sigs_present[sapply(sigs_present, length) > 0]
if(length(sigs_present) == 0) stop("No valid signatures with enough variance for module scoring.")
# Add module scores safely
seurat_obj <- AddModuleScore(seurat_obj, features = sigs_present, name = "mod", ctrl = 5)
# Rename module columns to readable names
added_cols <- paste0("mod", seq_along(sigs_present))
new_colnames <- paste0("score_", names(sigs_present))
for (i in seq_along(added_cols)) {
old <- added_cols[i]
new <- new_colnames[i]
if(old %in% colnames(seurat_obj@meta.data)) {
colnames(seurat_obj@meta.data)[which(colnames(seurat_obj@meta.data) == old)] <- new
}
}
# Prepare module scores for UMAP plotting (subset first 6)
plot_scores <- new_colnames[1:min(6, length(new_colnames))]
# Filter out constant module scores to avoid FeaturePlot errors
plot_scores <- plot_scores[sapply(plot_scores, function(f) length(unique(seurat_obj[[f]])) > 1)]
if(length(plot_scores) > 0) {
print(
FeaturePlot(
seurat_obj,
features = plot_scores,
reduction = "umap",
ncol = 3,
pt.size = 0.05,
cols = c("lightgrey", "blue")
)
)
} else {
message("No variable module scores to plot with FeaturePlot().")
}
# Violin plots per cluster for all module scores
vln_scores <- new_colnames[sapply(new_colnames, function(f) length(unique(seurat_obj[[f]])) > 1)]
if(length(vln_scores) > 0) {
print(
VlnPlot(
seurat_obj,
features = vln_scores,
group.by = cluster_col,
pt.size = 0.05,
ncol = 3
)
)
} else {
message("No variable module scores to plot with VlnPlot().")
}
# Cluster-level mean module scores
cluster_scores_df <- seurat_obj@meta.data %>%
as.data.frame() %>%
group_by(!!sym(cluster_col)) %>%
summarise(across(all_of(new_colnames), mean, .names = "{.col}"))
print(cluster_scores_df)
# Convert to matrix for heatmap
mat <- as.data.frame(cluster_scores_df)
rownames(mat) <- mat[[cluster_col]]
mat[[cluster_col]] <- NULL
# Optional: z-score across clusters for readability
mat_z <- t(scale(t(as.matrix(mat) + 1e-6)))
pheatmap(mat_z, main = "Mean module scores per cluster", cluster_rows = TRUE, cluster_cols = TRUE)

Interpretation notes
- The highest module score per cluster provides a
gene-expression-driven candidate label (e.g., cluster with highest
score_Treg
suggests a regulatory phenotype).
- Module scores are robust when signatures contain several genes;
expand gene lists if a signature is weak.
9. Consensus labelling (weighted voting across Azimuth, STCAT
Prediction, module scores, and ADT rules)
library(tibble)
# Helper: modal label in a cluster
get_modal <- function(vec) {
if (all(is.na(vec))) return(NA)
ux <- sort(table(vec), decreasing = TRUE)
names(ux)[1]
}
meta <- seurat_obj@meta.data
clusters <- sort(unique(meta[[cluster_col]]))
# modal Azimuth and STCAT per cluster
modal_azimuth <- if (azimuth_col %in% colnames(meta)) {
meta %>% group_by(!!sym(cluster_col)) %>% summarise(mod = get_modal(!!sym(azimuth_col))) %>% deframe()
} else {
NULL
}
modal_stcat <- if (stcat_col %in% colnames(meta)) {
meta %>% group_by(!!sym(cluster_col)) %>% summarise(mod = get_modal(!!sym(stcat_col))) %>% deframe()
} else {
NULL
}
# top module signature per cluster
mean_sig_by_cluster <- seurat_obj@meta.data %>% as.data.frame() %>%
group_by(!!sym(cluster_col)) %>%
summarise(across(starts_with('score_'), mean))
# tidy and get top signature
mean_long <- mean_sig_by_cluster %>% pivot_longer(-all_of(cluster_col), names_to = 'sig', values_to = 'val')
mean_long$sig <- sub('^score_', '', mean_long$sig)
top_sig <- mean_long %>% group_by(!!sym(cluster_col)) %>% slice_max(order_by = val, n = 1) %>%
select(all_of(cluster_col), sig) %>% deframe()
# ADT votes per cluster (simple heuristic on z-scored cluster means)
adt_votes <- NULL
if (exists('avg_adt_by_cluster')) {
adt_z <- t(scale(t(as.matrix(avg_adt_by_cluster) + 1e-6)))
adt_votes <- apply(adt_z, 1, function(r) {
# Treg: CD25 high & CD127 low
if (!is.na(r['CD25']) && !is.na(r['CD127']) && (r['CD25'] > 0.5) && (r['CD127'] < 0)) return('Treg')
# Naive: CD45RA high
if (!is.na(r['CD45RA']) && (r['CD45RA'] > 0.5)) return('Naive')
# Memory: CD45RO high
if (!is.na(r['CD45RO']) && (r['CD45RO'] > 0.5)) return('Memory')
# CD4 vs CD8 confirmation
if (!is.na(r['CD4']) && !is.na(r['CD8']) && (r['CD4'] - r['CD8'] > 0.3)) return('CD4')
return(NA)
})
}
# weights (tunable) --- you can adjust these; here we give equal weight
weights <- list(azimuth = 0.23, stcat = 0.15, module = 0.25, adt = 0.25)
# Voting per cluster
consensus <- setNames(rep(NA, length(clusters)), clusters)
for (cl in clusters) {
votes <- list()
if (!is.null(modal_azimuth)) {
lbl <- modal_azimuth[as.character(cl)]
if (!is.na(lbl)) votes[[lbl]] <- (votes[[lbl]] %||% 0) + weights$azimuth
}
if (!is.null(modal_stcat)) {
lbl <- modal_stcat[as.character(cl)]
if (!is.na(lbl)) votes[[lbl]] <- (votes[[lbl]] %||% 0) + weights$stcat
}
# module
msig <- top_sig[as.character(cl)]
if (!is.na(msig)) votes[[msig]] <- (votes[[msig]] %||% 0) + weights$module
# adt
if (!is.null(adt_votes)) {
lbl <- adt_votes[as.character(cl)]
if (!is.na(lbl)) votes[[lbl]] <- (votes[[lbl]] %||% 0) + weights$adt
}
if (length(votes) == 0) {
consensus[as.character(cl)] <- NA
} else {
top_score <- max(unlist(votes))
top_labels <- names(votes)[unlist(votes) == top_score]
if (length(top_labels) == 1) {
consensus[as.character(cl)] <- top_labels
} else {
# tie-break: prefer Azimuth -> STCAT -> module -> ADT
chosen <- NA
if (!is.null(modal_azimuth) && modal_azimuth[as.character(cl)] %in% top_labels) chosen <- modal_azimuth[as.character(cl)]
else if (!is.null(modal_stcat) && modal_stcat[as.character(cl)] %in% top_labels) chosen <- modal_stcat[as.character(cl)]
else if (!is.na(msig) && msig %in% top_labels) chosen <- msig
else chosen <- top_labels[1]
consensus[as.character(cl)] <- chosen
}
}
}
# compile summary table
consensus_df <- data.frame(
cluster = names(consensus),
consensus_label = unname(consensus),
azimuth = if (!is.null(modal_azimuth)) modal_azimuth[names(consensus)] else NA,
stcat = if (!is.null(modal_stcat)) modal_stcat[names(consensus)] else NA,
top_module = top_sig[names(consensus)],
adt_vote = if (!is.null(adt_votes)) adt_votes[names(consensus)] else NA,
stringsAsFactors = FALSE
)
print(consensus_df)
# Extract per-cell cluster IDs
clusters <- as.character(seurat_obj[[cluster_col]][,1])
# Map cluster IDs to consensus labels
consensus_per_cell <- consensus[match(clusters, names(consensus))]
# Make sure the vector has the same names as the cells in the Seurat object
names(consensus_per_cell) <- colnames(seurat_obj)
# Add to metadata
seurat_obj$consensus_label <- consensus_per_cell
# Check
table(seurat_obj$consensus_label, clusters, useNA = "ifany")
clusters
0 1 10 11 12 13 2 3 4 5 6 7 8 9
Tcm 0 0 0 0 0 686 0 0 0 0 0 3408 0 0
Th1 0 5256 18 0 0 0 0 0 0 0 0 0 0 0
Th2 0 0 0 1658 0 0 0 0 0 0 3536 0 0 0
Tn 0 0 0 0 0 0 0 0 0 2986 0 0 0 3238
Treg 6788 0 0 0 1035 0 4663 1 4086 0 0 0 3336 0
# Plot on UMAP
DimPlot(seurat_obj, group.by = "consensus_label", label = TRUE, repel = TRUE) + NoLegend()

DimPlot(seurat_obj, group.by = "seurat_clusters", label = TRUE, repel = TRUE) + NoLegend()

# UMAP colored by consensus
if ("umap" %in% names(seurat_obj@reductions)) {
print(DimPlot(seurat_obj, group.by = 'consensus_label', label = TRUE, repel = TRUE) + ggtitle('Consensus labels (cluster-level)'))
} else {
message('No UMAP reduction found — skip UMAP plotting')
}

Interpretation notes
consensus_df
summarizes the weighted votes for each
cluster. Clusters with NA
consensus are ambiguous — inspect
them manually.
- You may tune
weights
depending on which annotation you
trust more. For example, if Azimuth mapping used your exact reference
and looked high-confidence, increase weights$azimuth
to
0.35 and proportionally reduce others.
10. Inspect ambiguous clusters (manual rescue)
ambiguous <- consensus_df %>% filter(is.na(consensus_label) | (azimuth != consensus_label & stcat != consensus_label))
print(ambiguous)
if (nrow(ambiguous) > 0) {
for (cl in ambiguous$cluster) {
cat("\n---\nInspecting cluster", cl, "\n")
# Extract cells by seurat_clusters
cells <- WhichCells(seurat_obj, expression = seurat_obj[[cluster_col]] == cl)
cat("Number of cells in cluster:", length(cells), "\n")
if (length(cells) == 0) {
cat("⚠️ No cells found for cluster", cl, "\n")
next
}
# Temporarily switch Idents to seurat_clusters
Idents(seurat_obj) <- seurat_obj[[cluster_col]][,1]
# Top RNA markers
markers <- FindMarkers(
seurat_obj,
ident.1 = cl,
logfc.threshold = 0.25
)
cat("Top RNA markers (head):\n")
print(head(markers, 20))
# ADT summary if available
if (exists("avg_adt_by_cluster")) {
if (cl %in% rownames(avg_adt_by_cluster)) {
cat("ADT averages for this cluster:\n")
print(avg_adt_by_cluster[as.character(cl), , drop = FALSE])
} else {
cat("No ADT summary available for cluster", cl, "\n")
}
}
}
} else {
cat("No ambiguous clusters found by the chosen rules.\n")
}
---
Inspecting cluster 0
Error in `FetchData()`:
! None of the requested variables were found:
Run `]8;;x-r-run:rlang::last_trace()rlang::last_trace()]8;;` to see where the error occurred.
Interpretation & manual curation tips
- For ambiguous clusters, inspect the top RNA markers (from
FindMarkers
) and ADT averages side-by-side.
- Consider re-clustering the ambiguous cluster cells at higher
resolution or performing sub-clustering, because clusters sometimes mix
states (e.g. activated Tcm + early effector).
- You can also assign per-cell consensus (using the same voting rules)
to capture mixed clusters — this workflow uses cluster-level consensus
to be conservative.
11. Reporting recommendations (Methods text snippet)
---
title: "Manual Cluster Annotation of Malignant Cell Lines"
author: Nasir Mahmood Abbasi
date: "`r Sys.Date()`"
output:
  #rmdformats::readthedown
  html_notebook:
    toc: true
    toc_float: true
    toc_collapsed: true
---

# 1. load libraries
```{r setup, include=FALSE}
library(Seurat)
library(dplyr)
library(ggplot2)
library(pheatmap)
library(tidyr)

```


# 2. Read obj
```{r obj}

# Load your object
All_sample_Merged <- readRDS("../All_samples_Merged_with_STCAT_and_renamed_FINAL.rds")


# Check unique cell_line values
unique(All_sample_Merged$cell_line)
# [Expect something like: "L1","L2","L3","L4","L5","L6","L7","CD4Tcells_lab","CD4Tcells_10x"]

# Subset for malignant cell lines only
malignant_lines <- c("L1","L2","L3","L4","L5","L6","L7")
Malignant_cells <- subset(All_sample_Merged, subset = cell_line %in% malignant_lines)

# Confirm
table(Malignant_cells$cell_line)
```

# 3. Check all coulumns
```{r check, error=TRUE}

# Adjust these if your object or column names differ
seurat_obj <- Malignant_cells
cluster_col <- "seurat_clusters"
azimuth_col <- "predicted.celltype.l2"  # as you specified
stcat_col <- "Prediction"                 # STCAT automatic annotation column

# ADT assay common names we'll try to detect
adt_assay_candidates <- c("ADT", "AntibodyCapture", "Antibody", "CITE", "CITEseq")

cat("Seurat object present in workspace? ", exists("All_sample_Merged"), "\n")

```

# 4. Detect ADT assay & list ADT features
```{r detect-adt}
# detect ADT assay
assays_present <- Assays(seurat_obj)
adt_assay <- intersect(adt_assay_candidates, assays_present)
if (length(adt_assay) == 0) {
  adt_assay <- NA
  warning("No common ADT assay name found. Available assays: ", paste(assays_present, collapse = ", "))
} else {
  adt_assay <- adt_assay[1]
}
cat("ADT assay detected: ", adt_assay, "\n")

if (!is.na(adt_assay)) {
  # list features
  adt_features <- rownames(GetAssayData(seurat_obj, assay = adt_assay, slot = "counts"))
  cat("Number of ADT features: ", length(adt_features), "\n")
  print(adt_features)
} else {
  adt_features <- character(0)
}
```



# 5. Compare your ADT features against a canonical T-cell ADT list
```{r canonical-adt}
canonical_adt <- c("CD3", "CD4", "CD8", "CD45RA", "CD45RO", "CD62L", "CCR7",
                   "CD27", "CD28", "CD25", "CD127", "PD1", "PDCD1", "CTLA4", "CD69",
                   "HLA-DR", "CD38", "ICOS", "CXCR5", "CCR4", "CCR6", "CD45",
                   "CD7", "CD2", "CD26", "CD5", "TIM3", "LAG3", "CD16", "CD19")

present_in_panel <- intersect(canonical_adt, adt_features)
missing_from_panel <- setdiff(canonical_adt, adt_features)

cat("Canonical ADT markers present in your panel (intersection):\n")
print(present_in_panel)

cat("\nCanonical ADT markers NOT present / differently named: (inspect adt features manually)\n")
print(missing_from_panel)
```

### Interpretation notes

- If `CD45RA` / `CD45RO` are present you can separate naïve vs memory by ADT. `CD25` and `CD127` help to define Tregs.
- Panel naming can vary (e.g. `PDCD1` vs `PD-1`, `TIM3` vs `Tim-3`) — if names differ, map them manually.


# 6. Validate automatic labels (Azimuth & STCAT) using RNA marker genes (cluster-level)
```{r define-signatures, fig.height=8, fig.width=12}
# Canonical RNA signatures for CD4+ T subsets (expand as needed)
signatures <- list(
  Tn = c("CCR7","SELL","LEF1","TCF7"),
  Tcm = c("CCR7","SELL","IL7R","CD27"),
  Tem = c("GZMK","PRF1","IFNG","KLRB1"),
  Treg = c("FOXP3","IL2RA","CTLA4","IKZF2"),
  Tex = c("PDCD1","CTLA4","LAG3","TOX","HAVCR2"),     
  Temra = c("GZMB","PRF1","KLRG1","CX3CR1","IFNG"),    
  Th1 = c("TBX21","IFNG","CXCR3"),
  Th2 = c("GATA3","IL4","IL5","IL13"),
  Th17 = c("RORC","IL17A","IL17F","CCR6"),
  Tfh = c("CXCR5","BCL6","PDCD1")
  
)

# flatten and check presence
all_sig_genes <- unique(unlist(signatures))
genes_present <- intersect(all_sig_genes, rownames(seurat_obj))
cat("Signature genes present in your RNA data:", length(genes_present), "of", length(all_sig_genes), "\n")

# Detect Azimuth/STCAT columns presence
cat('Azimuth column present?:', azimuth_col %in% colnames(seurat_obj@meta.data), '\n')
cat('STCAT Prediction column present?:', stcat_col %in% colnames(seurat_obj@meta.data), '\n')
```

```{r avg-rna-heatmap, fig.height=7, fig.width=8}
# compute average expression per cluster for signature genes
features_to_use <- intersect(all_sig_genes, rownames(seurat_obj))
if (length(features_to_use) == 0) stop("No canonical signature genes found in the RNA assay. Check gene names / expression matrix.")

avg_rna_by_cluster <- AverageExpression(seurat_obj, assays = "RNA", features = features_to_use,
                                        slot = "data", group.by = cluster_col)$RNA

# z-score rows for heatmap visualization
zmat <- t(scale(t(as.matrix(avg_rna_by_cluster) + 1e-6)))

pheatmap(zmat, cluster_rows = TRUE, cluster_cols = TRUE, main = "Cluster x RNA marker (z-scaled average)")
```

```{r dotplot-rna, fig.height=5, fig.width=12}
# DotPlot of canonical RNA markers by cluster
DotPlot(seurat_obj, features = features_to_use, group.by = cluster_col) + RotatedAxis()+ scale_color_gradient2(low = "lightyellow", mid = "red", high = "firebrick", midpoint = 1) + ggtitle("RNA canonical markers by cluster")
```

### Interpretation notes

- Clusters high for `FOXP3` + `IL2RA` likely represent Tregs.
- `CCR7` + `SELL` mark Naïve/Tcm; verify with other genes (TCF7/LEF1).
- `GZMK`/`PRF1`/`IFNG` suggest effector/Tem-like cells.
- Use DotPlots and the heatmap to map clusters to candidate labels.


# 7. ADT validation: ensure normalization and compute cluster-level ADT averages
```{r adt-normalize-and-avg, fig.height=5, fig.width=10}

# Check if ADT assay exists
adt_assay <- intersect(adt_assay_candidates, Assays(seurat_obj))
if (length(adt_assay) == 0) {
  message("No ADT assay detected; skipping ADT validation.")
} else {
  adt_assay <- adt_assay[1]  # use the first match
  DefaultAssay(seurat_obj) <- adt_assay
  
  # Check slots
  adt_slots <- slotNames(seurat_obj[[adt_assay]])
  cat("ADT assay slots:", paste(adt_slots, collapse = ","), "\n")
  
  # Extract ADT data
  adt_data <- GetAssayData(seurat_obj, assay = adt_assay, slot = "data")
  
  # Replace NA/Inf with 0
  adt_data[is.na(adt_data)] <- 0
  adt_data[is.infinite(adt_data)] <- 0
  seurat_obj[[adt_assay]]@data <- adt_data
  
  # CLR normalization if data looks unnormalized (optional check)
  if (max(adt_data, na.rm = TRUE) > 50 || min(adt_data, na.rm = TRUE) < -10) {
    cat("Running CLR normalization and scaling ADT.\n")
    seurat_obj <- NormalizeData(seurat_obj, assay = adt_assay, normalization.method = "CLR")
    seurat_obj <- ScaleData(seurat_obj, assay = adt_assay)
  } else {
    cat("ADT assay assumed normalized (CLR/modal).\n")
  }
  
  # Compute cluster-level average ADT values
  avg_adt_by_cluster <- AverageExpression(
    seurat_obj,
    assays = adt_assay,
    features = intersect(canonical_adt, adt_features),
    slot = "data",
    group.by = cluster_col
  )[[adt_assay]]
  
  # z-score across clusters and plot heatmap
  z_adt <- t(scale(t(as.matrix(avg_adt_by_cluster) + 1e-6)))
  pheatmap(z_adt, main = "ADT markers (z-scored avg per cluster)")
  
  # Violin plots for key ADTs (subset to avoid overcrowding)
  adt_to_plot <- intersect(c("CD4","CD8","CD45RA","CD45RO","CD25","CD127","PD1","CXCR5"), adt_features)
  if (length(adt_to_plot) > 0) {
    print(
      VlnPlot(
              seurat_obj,
              features = adt_to_plot,
              assay = adt_assay,
              slot = "data",
              group.by = cluster_col,
              pt.size = 0.05,
              ncol = 3
)

      )
    
  }
  
  # Reset default assay
  DefaultAssay(seurat_obj) <- "RNA"
}
```


### Interpretation notes

- ADT `CD4` high + `CD8` low confirms CD4 T identity per cluster (useful QC).
- `CD45RA` vs `CD45RO` separates Naïve vs Memory.
- `CD25` high and `CD127` low (by ADT) supports Treg calls.
- PD-1/ICOS/LAG3 ADT elevation suggests activation/exhaustion phenotypes.


# 8. Module scoring (per-cell) for signatures and cluster-level summarization
```{r module-scores}

# Set SCT assay
DefaultAssay(seurat_obj) <- "SCT"

# Ensure signature genes exist in the dataset
sigs_present <- lapply(signatures, function(g) intersect(g, rownames(seurat_obj)))
sigs_present <- sigs_present[sapply(sigs_present, length) > 0]

if (length(sigs_present) == 0) {
  stop("No signature genes present in SCT assay for module scoring. Adjust gene names.")
}

# Keep signatures with >=3 genes and non-zero variance
sigs_present <- lapply(sigs_present, function(g) {
  if(length(g) < 3) return(NULL)
  expr_mat <- GetAssayData(seurat_obj, assay = "SCT", slot = "data")[g, , drop = FALSE]
  if(all(rowSums(expr_mat) == 0)) return(NULL)
  return(g)
})
sigs_present <- sigs_present[sapply(sigs_present, length) > 0]

if(length(sigs_present) == 0) stop("No valid signatures with enough variance for module scoring.")

# Add module scores safely
seurat_obj <- AddModuleScore(seurat_obj, features = sigs_present, name = "mod", ctrl = 5)

# Rename module columns to readable names
added_cols <- paste0("mod", seq_along(sigs_present))
new_colnames <- paste0("score_", names(sigs_present))
for (i in seq_along(added_cols)) {
  old <- added_cols[i]
  new <- new_colnames[i]
  if(old %in% colnames(seurat_obj@meta.data)) {
    colnames(seurat_obj@meta.data)[which(colnames(seurat_obj@meta.data) == old)] <- new
  }
}

# Prepare module scores for UMAP plotting (subset first 6)
plot_scores <- new_colnames[1:min(6, length(new_colnames))]

# Filter out constant module scores to avoid FeaturePlot errors
plot_scores <- plot_scores[sapply(plot_scores, function(f) length(unique(seurat_obj[[f]])) > 1)]

if(length(plot_scores) > 0) {
  print(
    FeaturePlot(
      seurat_obj,
      features = plot_scores,
      reduction = "umap",
      ncol = 3,
      pt.size = 0.05,
      cols = c("lightgrey", "blue")
    )
  )
} else {
  message("No variable module scores to plot with FeaturePlot().")
}

# Violin plots per cluster for all module scores
vln_scores <- new_colnames[sapply(new_colnames, function(f) length(unique(seurat_obj[[f]])) > 1)]
if(length(vln_scores) > 0) {
  print(
    VlnPlot(
      seurat_obj,
      features = vln_scores,
      group.by = cluster_col,
      pt.size = 0.05,
      ncol = 3
    )
  )
} else {
  message("No variable module scores to plot with VlnPlot().")
}

# Cluster-level mean module scores
cluster_scores_df <- seurat_obj@meta.data %>%
  as.data.frame() %>%
  group_by(!!sym(cluster_col)) %>%
  summarise(across(all_of(new_colnames), mean, .names = "{.col}"))

print(cluster_scores_df)

# Convert to matrix for heatmap
mat <- as.data.frame(cluster_scores_df)
rownames(mat) <- mat[[cluster_col]]
mat[[cluster_col]] <- NULL

# Optional: z-score across clusters for readability
mat_z <- t(scale(t(as.matrix(mat) + 1e-6)))

pheatmap(mat_z, main = "Mean module scores per cluster", cluster_rows = TRUE, cluster_cols = TRUE)

```


### Interpretation notes

- The highest module score per cluster provides a gene-expression-driven candidate label (e.g., cluster with highest `score_Treg` suggests a regulatory phenotype).
- Module scores are robust when signatures contain several genes; expand gene lists if a signature is weak.


# 9. Consensus labelling (weighted voting across Azimuth, STCAT Prediction, module scores, and ADT rules)
```{r consensus-voting}

library(tibble)

# Helper: modal label in a cluster
get_modal <- function(vec) {
  if (all(is.na(vec))) return(NA)
  ux <- sort(table(vec), decreasing = TRUE)
  names(ux)[1]
}

meta <- seurat_obj@meta.data
clusters <- sort(unique(meta[[cluster_col]]))

# modal Azimuth and STCAT per cluster
modal_azimuth <- if (azimuth_col %in% colnames(meta)) {
  meta %>% group_by(!!sym(cluster_col)) %>% summarise(mod = get_modal(!!sym(azimuth_col))) %>% deframe()
} else {
  NULL
}
modal_stcat <- if (stcat_col %in% colnames(meta)) {
  meta %>% group_by(!!sym(cluster_col)) %>% summarise(mod = get_modal(!!sym(stcat_col))) %>% deframe()
} else {
  NULL
}

# top module signature per cluster
mean_sig_by_cluster <- seurat_obj@meta.data %>% as.data.frame() %>%
  group_by(!!sym(cluster_col)) %>%
  summarise(across(starts_with('score_'), mean))

# tidy and get top signature
mean_long <- mean_sig_by_cluster %>% pivot_longer(-all_of(cluster_col), names_to = 'sig', values_to = 'val')
mean_long$sig <- sub('^score_', '', mean_long$sig)
top_sig <- mean_long %>% group_by(!!sym(cluster_col)) %>% slice_max(order_by = val, n = 1) %>%
  select(all_of(cluster_col), sig) %>% deframe()

# ADT votes per cluster (simple heuristic on z-scored cluster means)
adt_votes <- NULL
if (exists('avg_adt_by_cluster')) {
  adt_z <- t(scale(t(as.matrix(avg_adt_by_cluster) + 1e-6)))
  adt_votes <- apply(adt_z, 1, function(r) {
    # Treg: CD25 high & CD127 low
    if (!is.na(r['CD25']) && !is.na(r['CD127']) && (r['CD25'] > 0.5) && (r['CD127'] < 0)) return('Treg')
    # Naive: CD45RA high
    if (!is.na(r['CD45RA']) && (r['CD45RA'] > 0.5)) return('Naive')
    # Memory: CD45RO high
    if (!is.na(r['CD45RO']) && (r['CD45RO'] > 0.5)) return('Memory')
    # CD4 vs CD8 confirmation
    if (!is.na(r['CD4']) && !is.na(r['CD8']) && (r['CD4'] - r['CD8'] > 0.3)) return('CD4')
    return(NA)
  })
}

# weights (tunable) --- you can adjust these; here we give equal weight
weights <- list(azimuth = 0.23, stcat = 0.15, module = 0.25, adt = 0.25)

# Voting per cluster
consensus <- setNames(rep(NA, length(clusters)), clusters)
for (cl in clusters) {
  votes <- list()
  if (!is.null(modal_azimuth)) {
    lbl <- modal_azimuth[as.character(cl)]
    if (!is.na(lbl)) votes[[lbl]] <- (votes[[lbl]] %||% 0) + weights$azimuth
  }
  if (!is.null(modal_stcat)) {
    lbl <- modal_stcat[as.character(cl)]
    if (!is.na(lbl)) votes[[lbl]] <- (votes[[lbl]] %||% 0) + weights$stcat
  }
  # module
  msig <- top_sig[as.character(cl)]
  if (!is.na(msig)) votes[[msig]] <- (votes[[msig]] %||% 0) + weights$module
  # adt
  if (!is.null(adt_votes)) {
    lbl <- adt_votes[as.character(cl)]
    if (!is.na(lbl)) votes[[lbl]] <- (votes[[lbl]] %||% 0) + weights$adt
  }

  if (length(votes) == 0) {
    consensus[as.character(cl)] <- NA
  } else {
    top_score <- max(unlist(votes))
    top_labels <- names(votes)[unlist(votes) == top_score]
    if (length(top_labels) == 1) {
      consensus[as.character(cl)] <- top_labels
    } else {
      # tie-break: prefer Azimuth -> STCAT -> module -> ADT
      chosen <- NA
      if (!is.null(modal_azimuth) && modal_azimuth[as.character(cl)] %in% top_labels) chosen <- modal_azimuth[as.character(cl)]
      else if (!is.null(modal_stcat) && modal_stcat[as.character(cl)] %in% top_labels) chosen <- modal_stcat[as.character(cl)]
      else if (!is.na(msig) && msig %in% top_labels) chosen <- msig
      else chosen <- top_labels[1]
      consensus[as.character(cl)] <- chosen
    }
  }
}

# compile summary table
consensus_df <- data.frame(
  cluster = names(consensus),
  consensus_label = unname(consensus),
  azimuth = if (!is.null(modal_azimuth)) modal_azimuth[names(consensus)] else NA,
  stcat = if (!is.null(modal_stcat)) modal_stcat[names(consensus)] else NA,
  top_module = top_sig[names(consensus)],
  adt_vote = if (!is.null(adt_votes)) adt_votes[names(consensus)] else NA,
  stringsAsFactors = FALSE
)

print(consensus_df)

# Extract per-cell cluster IDs
clusters <- as.character(seurat_obj[[cluster_col]][,1])

# Map cluster IDs to consensus labels
consensus_per_cell <- consensus[match(clusters, names(consensus))]

# Make sure the vector has the same names as the cells in the Seurat object
names(consensus_per_cell) <- colnames(seurat_obj)

# Add to metadata
seurat_obj$consensus_label <- consensus_per_cell

# Check
table(seurat_obj$consensus_label, clusters, useNA = "ifany")


# Plot on UMAP
DimPlot(seurat_obj, group.by = "consensus_label", label = TRUE, repel = TRUE) + NoLegend()

DimPlot(seurat_obj, group.by = "seurat_clusters", label = TRUE, repel = TRUE) + NoLegend()

# UMAP colored by consensus
if ("umap" %in% names(seurat_obj@reductions)) {
  print(DimPlot(seurat_obj, group.by = 'consensus_label', label = TRUE, repel = TRUE) + ggtitle('Consensus labels (cluster-level)'))
} else {
  message('No UMAP reduction found — skip UMAP plotting')
}
```

### Interpretation notes

- `consensus_df` summarizes the weighted votes for each cluster. Clusters with `NA` consensus are ambiguous — inspect them manually.
- You may tune `weights` depending on which annotation you trust more. For example, if Azimuth mapping used your exact reference and looked high-confidence, increase `weights$azimuth` to 0.35 and proportionally reduce others.

# 10. Inspect ambiguous clusters (manual rescue)
```{r inspect-ambiguous}

ambiguous <- consensus_df %>% filter(is.na(consensus_label) | (azimuth != consensus_label & stcat != consensus_label))
print(ambiguous)

if (nrow(ambiguous) > 0) {
  for (cl in ambiguous$cluster) {
    cat("\n---\nInspecting cluster", cl, "\n")

    # Extract cells by seurat_clusters
    cells <- WhichCells(seurat_obj, expression = seurat_obj[[cluster_col]] == cl)
    cat("Number of cells in cluster:", length(cells), "\n")

    if (length(cells) == 0) {
      cat("⚠️ No cells found for cluster", cl, "\n")
      next
    }

    # Temporarily switch Idents to seurat_clusters
    Idents(seurat_obj) <- seurat_obj[[cluster_col]][,1]

    # Top RNA markers
    markers <- FindMarkers(
      seurat_obj,
      ident.1 = cl,
      logfc.threshold = 0.25
    )
    cat("Top RNA markers (head):\n")
    print(head(markers, 20))

    # ADT summary if available
    if (exists("avg_adt_by_cluster")) {
      if (cl %in% rownames(avg_adt_by_cluster)) {
        cat("ADT averages for this cluster:\n")
        print(avg_adt_by_cluster[as.character(cl), , drop = FALSE])
      } else {
        cat("No ADT summary available for cluster", cl, "\n")
      }
    }
  }
} else {
  cat("No ambiguous clusters found by the chosen rules.\n")
}


```

### Interpretation & manual curation tips

- For ambiguous clusters, inspect the top RNA markers (from `FindMarkers`) and ADT averages side-by-side.
- Consider re-clustering the ambiguous cluster cells at higher resolution or performing sub-clustering, because clusters sometimes mix states (e.g. activated Tcm + early effector).
- You can also assign per-cell consensus (using the same voting rules) to capture mixed clusters — this workflow uses cluster-level consensus to be conservative.

# 11. Reporting recommendations (Methods text snippet)
```{r methods-snippet, echo=FALSE}
cat("Suggested Methods text (edit before using):\n\n")
cat("Cell type annotation was performed by combining automated reference mapping and marker-based validation. We mapped cells using Azimuth (predicted.celltype.l2) and an independent classifier (STCAT, column 'Prediction'). We validated automated labels using (1) canonical RNA marker expression (CCR7, SELL, FOXP3, TBX21, etc.), (2) ADT (CITE-seq) surface protein expression (CLR-normalized), and (3) gene-signature module scores. For each cluster we computed the modal Azimuth and STCAT labels, the top-scoring gene signature and ADT-based heuristics (CD45RA/CD45RO for naïve vs memory; CD25 high / CD127 low for Treg). We combined these four evidence sources in a weighted voting scheme (equal weights by default) to produce a consensus cluster label. Ambiguous clusters were inspected manually using cluster-level markers and ADT patterns.\n")
```

