1. load libraries
library(dplyr)
library(ggplot2)
library(Seurat)
library(ComplexHeatmap)
library(presto)
library(tictoc)
library(SCpubr)
library(SeuratData)
library(Azimuth)
library(patchwork)
library(slingshot)
library(SingleCellExperiment)
options(timeout = 300)
set.seed(123)
# Colour palette used throughout — do not change
flow_colours <- c(
"Tnaive" = "#4472C4",
"Tcm" = "#ED7D31",
"Tem" = "#A9D18E",
"Temra" = "#C00000",
"Treg" = "#FFD700"
)
2. Load Seurat
Object
#Load Seurat Object merged from cell lines and a control(PBMC) after filtration
All_samples_Merged <- readRDS("../../CD4_reference_annotated_with_markers.rds")
Run Azmimuth and check
annotation again
library(SeuratData)
library(Azimuth)
options(timeout = 300) # Increase download timeout if needed [web:63]
sample <- All_samples_Merged
rm(All_samples_Merged)
gc()
used (Mb) gc trigger (Mb) max used (Mb)
Ncells 14034042 749.5 25295498 1351.0 21650150 1156.3
Vcells 313545618 2392.2 514272694 3923.6 334874035 2554.9
DefaultAssay(sample) <- "RNA"
sample <- RunAzimuth(sample, reference = "pbmcref")
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
| | 0 % ~calculating
|++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed=05s
Using method 'umap'
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
DefaultAssay(sample) <- "SCT"
SCpubr::do_DimPlot(
sample = sample,
reduction = "umap",
group.by = "predicted.celltype.l2",
font.size = 18, legend.position = "right",
legend.title = "Cell type (Healthy CD4 T cells)"
)

NA
NA
Clean Object

Rename CD4 CTL → CD4
Temra/CTL
# Rename in metadata
clean_obj$predicted.celltype.l2[clean_obj$predicted.celltype.l2 == "CD4 CTL"] <- "CD4 Temra/CTL"
# Verify
table(clean_obj$predicted.celltype.l2)
CD4 Naive CD4 TCM CD4 TEM CD4 Temra/CTL Treg
2037 9067 145 10 207
Define Logical Order
for Legend
# Define desired order (Naive → TCM → TEM → Temra → Treg)
cell_order <- c("CD4 Naive", "CD4 TCM", "CD4 TEM", "CD4 Temra/CTL", "Treg")
# Set factor levels
clean_obj$predicted.celltype.l2 <- factor(clean_obj$predicted.celltype.l2,
levels = cell_order)
# Verify order
levels(clean_obj$predicted.celltype.l2)
[1] "CD4 Naive" "CD4 TCM" "CD4 TEM" "CD4 Temra/CTL" "Treg"
Publication-Quality
UMAP Plot
library(ggplot2)
p1 <- DimPlot(clean_obj,
group.by = "predicted.celltype.l2",
label = TRUE,
repel = TRUE,
cols = c("CD4 Naive" = "#1f77b4", # Blue
"CD4 TCM" = "#ff7f0e", # Orange
"CD4 TEM" = "#2ca02c", # Green
"CD4 Temra/CTL" = "#d62728", # Red
"Treg" = "#9467bd"), # Purple
pt.size = 0.5) +
guides(color = guide_legend(override.aes = list(size = 4))) +
labs(title = "Healthy CD4 T Cell Reference Atlas",
subtitle = "Quiescent states only | Azimuth L2 annotation") +
theme_void() +
theme(legend.position = "right",
plot.title = element_text(hjust = 0.5, face = "bold", size = 16),
plot.subtitle = element_text(hjust = 0.5, size = 12))
print(p1)

ggsave("healthy_cd4_reference_umap.pdf", p1, width = 10, height = 8, dpi = 300)
# PNG version (for presentations)
ggsave("healthy_cd4_reference_umap.png", p1,
width = 10, height = 8, dpi = 300, bg = "white")
library(ggplot2)
library(Azimuth)
# Run this diagnostic to understand what is happening
DefaultAssay(clean_obj) <- "RNA"
# Check S and G2M scores distribution
p1 <- ggplot(clean_obj@meta.data,
aes(x = S.Score, fill = dataset)) +
geom_histogram(bins = 50, alpha = 0.7, position = "identity") +
geom_vline(xintercept = 0, colour = "red", linetype = "dashed") +
facet_wrap(~dataset, ncol = 1) +
theme_bw() +
labs(title = "S score distribution per dataset",
subtitle = "Healthy resting T-cells should peak below 0")
p2 <- ggplot(clean_obj@meta.data,
aes(x = G2M.Score, fill = dataset)) +
geom_histogram(bins = 50, alpha = 0.7, position = "identity") +
geom_vline(xintercept = 0, colour = "red", linetype = "dashed") +
facet_wrap(~dataset, ncol = 1) +
theme_bw() +
labs(title = "G2M score distribution per dataset")
p1 | p2

# Check which cell types are driving S phase
cat("Phase per cell type:\n")
Phase per cell type:
print(table(clean_obj$Phase, clean_obj$predicted.celltype.l2))
CD4 Naive CD4 TCM CD4 TEM CD4 Temra/CTL Treg
G1 1151 4165 69 6 86
G2M 107 816 15 1 33
S 779 4086 61 3 88
# Check S phase scores per cell type
ggplot(clean_obj@meta.data,
aes(x = predicted.celltype.l2,
y = S.Score,
fill = predicted.celltype.l2)) +
geom_violin(scale = "width", trim = TRUE) +
geom_boxplot(width = 0.08, fill = "white", outlier.size = 0.2) +
geom_hline(yintercept = 0, colour = "red", linetype = "dashed") +
theme_bw() +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "none") +
labs(title = "S.Score per cell type — values above 0 = cycling",
x = NULL, y = "S.Score")

Re-embed — UMAP Only
(Preserve Existing RPCA Integration)
# CRITICAL:
# This object is already SCT normalised and RPCA integrated.
# The integrated assay PCA embedding IS the batch-corrected space.
#
# DO NOT run: ScaleData (would re-scale and destroy CC regression from SCT)
# DO NOT run: RunPCA (would recompute PCA, destroying RPCA integration
# and causing datasets to separate again)
# DO NOT run: SCTransform or IntegrateData
#
# ONLY run: FindNeighbors and RunUMAP on the EXISTING pca reduction
# This preserves the RPCA-corrected embedding and batch correction
cat("Available reductions:\n")
Available reductions:
print(names(clean_obj@reductions))
[1] "pca" "umap" "integrated_dr" "ref.umap"
DefaultAssay(clean_obj) <- "integrated"
# Use same dims as original integration
dims_use <- 1:20
# Rebuild neighbor graph on EXISTING pca embedding
clean_obj <- FindNeighbors(
clean_obj,
reduction = "pca",
dims = dims_use,
verbose = FALSE
)
# Rerun UMAP on EXISTING pca embedding
# return.model = TRUE is CRITICAL for MapQuery (malignant cell projection)
clean_obj <- RunUMAP(
clean_obj,
reduction = "pca",
dims = dims_use,
return.model = TRUE,
verbose = FALSE
)
# Recluster at multiple resolutions for flexibility
clean_obj <- FindClusters(
clean_obj,
resolution = c(0.1, 0.2, 0.3, 0.4, 0.5),
verbose = FALSE
)
cat("Clusters at resolution 0.3:",
length(unique(clean_obj$integrated_snn_res.0.3)), "\n")
Clusters at resolution 0.3: 10
Verify Integration
Preserved
# CRITICAL CHECK: datasets should be well mixed on UMAP
# If they are separated — integration was destroyed and needs to be re-done
p1 <- DimPlot(clean_obj,
group.by = "dataset",
shuffle = TRUE) +
ggtitle("Dataset mixing — should be well mixed\n(if separated: re-integration needed)")
p2 <- DimPlot(clean_obj,
group.by = "predicted.celltype.l2",
label = TRUE, repel = TRUE, label.box = TRUE) +
ggtitle("Azimuth l2 labels")
p3 <- DimPlot(clean_obj,
group.by = "Phase") +
ggtitle("Cell cycle phase (should be mixed, not separated)")
p1 | p2 | p3

Verify Integration
Preserved
p1

p2

# No CD8 cells remain
table(clean_obj$predicted.celltype.l2)
CD4 Naive CD4 TCM CD4 TEM CD4 Temra/CTL Treg
2037 9067 145 10 207
# High confidence scores
summary(clean_obj$mapping.score) # >0.65 ✓
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.1581 0.6618 0.7762 0.7455 0.8731 1.0000
summary(clean_obj$predicted.celltype.l2.score) # >0.75 ✓
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.2726 0.6740 0.8621 0.8126 0.9653 1.0000
Azimuth Score
Assessment
# Assess mapping and prediction scores to decide if l2 labels
# are reliable enough for trajectory analysis
p1 <- ggplot(clean_obj@meta.data,
aes(x = mapping.score)) +
geom_histogram(bins = 50, fill = "#4472C4", colour = "white") +
geom_vline(xintercept = 0.5, colour = "red",
linetype = "dashed", linewidth = 1) +
theme_bw() +
labs(title = "Mapping score distribution",
subtitle = "Cells below 0.5 are poorly mapped to PBMC reference",
x = "Mapping score", y = "Count")
p2 <- ggplot(clean_obj@meta.data,
aes(x = predicted.celltype.l2.score)) +
geom_histogram(bins = 50, fill = "#ED7D31", colour = "white") +
geom_vline(xintercept = 0.5, colour = "red",
linetype = "dashed", linewidth = 1) +
theme_bw() +
labs(title = "Prediction score l2",
x = "Prediction score", y = "Count")
p3 <- ggplot(clean_obj@meta.data,
aes(x = reorder(predicted.celltype.l2,
mapping.score, median),
y = mapping.score,
fill = predicted.celltype.l2)) +
geom_violin(scale = "width", trim = TRUE) +
geom_boxplot(width = 0.08, fill = "white", outlier.size = 0.2) +
geom_hline(yintercept = 0.5, colour = "red", linetype = "dashed") +
theme_bw() +
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 9),
legend.position = "none") +
labs(title = "Mapping score per cell type", x = NULL, y = "Mapping score")
p4 <- ggplot(clean_obj@meta.data,
aes(x = reorder(predicted.celltype.l2,
predicted.celltype.l2.score, median),
y = predicted.celltype.l2.score,
fill = predicted.celltype.l2)) +
geom_violin(scale = "width", trim = TRUE) +
geom_boxplot(width = 0.08, fill = "white", outlier.size = 0.2) +
geom_hline(yintercept = 0.5, colour = "red", linetype = "dashed") +
theme_bw() +
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 9),
legend.position = "none") +
labs(title = "Prediction score per cell type (l2)",
x = NULL, y = "Prediction score")
(p1 | p2) / p3 / p4

# Scores on UMAP
p5 <- FeaturePlot(clean_obj,
features = "mapping.score",
cols = c("grey90","#C00000"),
min.cutoff = 0, max.cutoff = 1) +
ggtitle("Mapping score on UMAP\n(red = high confidence)")
p6 <- FeaturePlot(clean_obj,
features = "predicted.celltype.l2.score",
cols = c("grey90","#4472C4"),
min.cutoff = 0, max.cutoff = 1) +
ggtitle("Prediction score l2 on UMAP\n(blue = high confidence)")
p5 | p6


library(patchwork)
# 4-panel report (exact SCpubr replica)
p1 <- FeaturePlot(clean_obj, features = "mapping.score") +
ggtitle("Azimuth Mapping Score")
p2 <- FeaturePlot(clean_obj, features = "predicted.celltype.l2.score") +
ggtitle("Azimuth L2 Score")
p3 <- DimPlot(clean_obj, group.by = "predicted.celltype.l2", label = TRUE) +
ggtitle("Local Clustering vs Azimuth")
p4 <- DimPlot(clean_obj, reduction = "ref.umap",
group.by = "predicted.celltype.l2", label = TRUE) +
ggtitle("PBMC Reference Projection")
# Final report
(p1 | p2) / (p3 | p4)

ggsave("azimuth_manual_report.png", width = 16, height = 10, dpi = 300)
library(dplyr)
prop_table <- clean_obj@meta.data %>%
dplyr::count(predicted.celltype.l2) %>% # ← Explicit dplyr::
mutate(Percent = round(n / sum(n) * 100, 1)) %>%
arrange(desc(n))
print(prop_table)
write.csv(prop_table, "celltype_proportions.csv", row.names = FALSE)
Decision Table — Should
We Use l2 Labels for Trajectory?
decision_df <- clean_obj@meta.data %>%
group_by(predicted.celltype.l2) %>%
summarise(
n_cells = n(),
median_mapping = round(median(mapping.score, na.rm = TRUE), 3),
median_pred = round(median(predicted.celltype.l2.score,
na.rm = TRUE), 3),
pct_above_0.5 = round(mean(mapping.score >= 0.5,
na.rm = TRUE) * 100, 1),
.groups = "drop"
) %>%
arrange(desc(n_cells))
cat("=== Full decision table ===\n")
=== Full decision table ===
print(decision_df)
key_states <- c("CD4 Naive","CD4 TCM","CD4 TEM","CD4 TEMRA/CTL","Treg")
cat("\n=== Key trajectory states ===\n")
=== Key trajectory states ===
print(decision_df %>% filter(predicted.celltype.l2 %in% key_states))
cat("\n=== Decision rule ===\n")
=== Decision rule ===
cat("If median mapping score > 0.5 for CD4 Naive + CD4 TCM + Treg:\n")
If median mapping score > 0.5 for CD4 Naive + CD4 TCM + Treg:
cat("→ Proceed with Azimuth l2 labels for trajectory\n")
→ Proceed with Azimuth l2 labels for trajectory
cat("If median mapping score < 0.5 for key states:\n")
If median mapping score < 0.5 for key states:
cat("→ Fall back to seurat_clusters with manual marker validation\n")
→ Fall back to seurat_clusters with manual marker validation
Save Final Reference
Object
cat("✅ Saved:", ncol(clean_obj), "cells\n")
✅ Saved: 11466 cells
```
---
title: "CD4 T-cell Reference + Azimuth Annotation Report"
author: Nasir Mahmood Abbasi
date: "`r Sys.Date()`"
output:
  html_notebook:
    number_sections: true
    toc: true
    toc_float:
      collapsed: true
    theme: journal
---


# 1. load libraries
```{r setup, include=TRUE}
library(dplyr)
library(ggplot2)
library(Seurat)
library(ComplexHeatmap)
library(presto)
library(tictoc)
library(SCpubr)
library(SeuratData)
library(Azimuth)
library(patchwork)
library(slingshot)
library(SingleCellExperiment)

options(timeout = 300)
set.seed(123)

# Colour palette used throughout — do not change
flow_colours <- c(
  "Tnaive" = "#4472C4",
  "Tcm"    = "#ED7D31",
  "Tem"    = "#A9D18E",
  "Temra"  = "#C00000",
  "Treg"   = "#FFD700"
)
```


# 2. Load Seurat Object 
```{r}

#Load Seurat Object merged from cell lines and a control(PBMC) after filtration
All_samples_Merged <- readRDS("../../CD4_reference_annotated_with_markers.rds")

```


# Run Azmimuth and check annotation again
```{r, fig.height=6, fig.width=12}

library(SeuratData)
library(Azimuth)
options(timeout = 300)  # Increase download timeout if needed [web:63] 

sample <- All_samples_Merged

rm(All_samples_Merged)

gc()

DefaultAssay(sample) <- "RNA"

sample <- RunAzimuth(sample, reference = "pbmcref")

DefaultAssay(sample) <- "SCT"

SCpubr::do_DimPlot(
  sample = sample,
  reduction = "umap",
  group.by = "predicted.celltype.l2",
  font.size = 18, legend.position = "right",
  legend.title = "Cell type (Healthy CD4 T cells)"
)


```


## Clean Object
```{r, fig.height=6, fig.width=10}

clean_obj <- subset(
     sample,
     subset = predicted.celltype.l2 %in% c(
         "CD8 Naive", "CD8 TCM", "CD8 TEM", "Platelet", "CD4 Proliferating"
     ),
     invert = TRUE
 )

cat("\nAfter cleaning:", ncol(clean_obj), "cells\n")
cat("\nRemoved cell types:\n")

cat("\nRemaining:\n")
print(table(clean_obj$predicted.celltype.l2))

rm(sample); gc()

SCpubr::do_DimPlot(
  sample = clean_obj,
  reduction = "umap",
  group.by = "predicted.celltype.l2",
  font.size = 18, legend.position = "right",
  legend.title = "Cell type (Healthy CD4 T cells)"
)

```

## Rename CD4 CTL → CD4 Temra/CTL
```{r, fig.height=6, fig.width=10}
# Rename in metadata
clean_obj$predicted.celltype.l2[clean_obj$predicted.celltype.l2 == "CD4 CTL"] <- "CD4 Temra/CTL"

# Verify
table(clean_obj$predicted.celltype.l2)

```


## Define Logical Order for Legend
```{r, fig.height=6, fig.width=10}
# Define desired order (Naive → TCM → TEM → Temra → Treg)
cell_order <- c("CD4 Naive", "CD4 TCM", "CD4 TEM", "CD4 Temra/CTL", "Treg")

# Set factor levels
clean_obj$predicted.celltype.l2 <- factor(clean_obj$predicted.celltype.l2,
                                          levels = cell_order)

# Verify order
levels(clean_obj$predicted.celltype.l2)

```



## Publication-Quality UMAP Plot
```{r, fig.height=6, fig.width=10}
library(ggplot2)
p1 <- DimPlot(clean_obj,
              group.by = "predicted.celltype.l2",
              label = TRUE,
              repel = TRUE,
              cols = c("CD4 Naive" = "#1f77b4",      # Blue
                       "CD4 TCM"   = "#ff7f0e",      # Orange  
                       "CD4 TEM"   = "#2ca02c",      # Green
                       "CD4 Temra/CTL" = "#d62728",      # Red
                       "Treg"      = "#9467bd"),     # Purple
              pt.size = 0.5) +
  guides(color = guide_legend(override.aes = list(size = 4))) +
  labs(title = "Healthy CD4 T Cell Reference Atlas",
       subtitle = "Quiescent states only | Azimuth L2 annotation") +
  theme_void() +
  theme(legend.position = "right",
        plot.title = element_text(hjust = 0.5, face = "bold", size = 16),
        plot.subtitle = element_text(hjust = 0.5, size = 12))

print(p1)
ggsave("healthy_cd4_reference_umap.pdf", p1, width = 10, height = 8, dpi = 300)

# PNG version (for presentations)
ggsave("healthy_cd4_reference_umap.png", p1, 
       width = 10, height = 8, dpi = 300, bg = "white")

```

```{r}
library(ggplot2)
library(Azimuth)

# Run this diagnostic to understand what is happening
DefaultAssay(clean_obj) <- "RNA"

# Check S and G2M scores distribution
p1 <- ggplot(clean_obj@meta.data,
             aes(x = S.Score, fill = dataset)) +
  geom_histogram(bins = 50, alpha = 0.7, position = "identity") +
  geom_vline(xintercept = 0, colour = "red", linetype = "dashed") +
  facet_wrap(~dataset, ncol = 1) +
  theme_bw() +
  labs(title = "S score distribution per dataset",
       subtitle = "Healthy resting T-cells should peak below 0")

p2 <- ggplot(clean_obj@meta.data,
             aes(x = G2M.Score, fill = dataset)) +
  geom_histogram(bins = 50, alpha = 0.7, position = "identity") +
  geom_vline(xintercept = 0, colour = "red", linetype = "dashed") +
  facet_wrap(~dataset, ncol = 1) +
  theme_bw() +
  labs(title = "G2M score distribution per dataset")

p1 | p2

# Check which cell types are driving S phase
cat("Phase per cell type:\n")
print(table(clean_obj$Phase, clean_obj$predicted.celltype.l2))

# Check S phase scores per cell type
ggplot(clean_obj@meta.data,
       aes(x    = predicted.celltype.l2,
           y    = S.Score,
           fill = predicted.celltype.l2)) +
  geom_violin(scale = "width", trim = TRUE) +
  geom_boxplot(width = 0.08, fill = "white", outlier.size = 0.2) +
  geom_hline(yintercept = 0, colour = "red", linetype = "dashed") +
  theme_bw() +
  theme(axis.text.x    = element_text(angle = 45, hjust = 1),
        legend.position = "none") +
  labs(title = "S.Score per cell type — values above 0 = cycling",
       x = NULL, y = "S.Score")
```
# Re-embed — UMAP Only (Preserve Existing RPCA Integration)
```{r reembed}
# CRITICAL:
# This object is already SCT normalised and RPCA integrated.
# The integrated assay PCA embedding IS the batch-corrected space.
#
# DO NOT run:  ScaleData (would re-scale and destroy CC regression from SCT)
# DO NOT run:  RunPCA    (would recompute PCA, destroying RPCA integration
#                         and causing datasets to separate again)
# DO NOT run:  SCTransform or IntegrateData
#
# ONLY run:   FindNeighbors and RunUMAP on the EXISTING pca reduction
#             This preserves the RPCA-corrected embedding and batch correction

cat("Available reductions:\n")
print(names(clean_obj@reductions))

DefaultAssay(clean_obj) <- "integrated"

# Use same dims as original integration
dims_use <- 1:20

# Rebuild neighbor graph on EXISTING pca embedding
clean_obj <- FindNeighbors(
  clean_obj,
  reduction = "pca",
  dims      = dims_use,
  verbose   = FALSE
)

# Rerun UMAP on EXISTING pca embedding
# return.model = TRUE is CRITICAL for MapQuery (malignant cell projection)
clean_obj <- RunUMAP(
  clean_obj,
  reduction    = "pca",
  dims         = dims_use,
  return.model = TRUE,
  verbose      = FALSE
)

# Recluster at multiple resolutions for flexibility
clean_obj <- FindClusters(
  clean_obj,
  resolution = c(0.1, 0.2, 0.3, 0.4, 0.5),
  verbose    = FALSE
)

cat("Clusters at resolution 0.3:",
    length(unique(clean_obj$integrated_snn_res.0.3)), "\n")
```

# Verify Integration Preserved
```{r verify-integration, fig.width=20, fig.height=7}
# CRITICAL CHECK: datasets should be well mixed on UMAP
# If they are separated — integration was destroyed and needs to be re-done

p1 <- DimPlot(clean_obj,
              group.by = "dataset",
              shuffle  = TRUE) +
  ggtitle("Dataset mixing — should be well mixed\n(if separated: re-integration needed)")

p2 <- DimPlot(clean_obj,
              group.by = "predicted.celltype.l2",
              label    = TRUE, repel = TRUE, label.box = TRUE) +
  ggtitle("Azimuth l2 labels")

p3 <- DimPlot(clean_obj,
              group.by = "Phase") +
  ggtitle("Cell cycle phase (should be mixed, not separated)")

p1 | p2 | p3

```

# Verify Integration Preserved
```{r , fig.width=9, fig.height=6}

p1 
p2 

# No CD8 cells remain
table(clean_obj$predicted.celltype.l2)

# High confidence scores
summary(clean_obj$mapping.score)      # >0.65 ✓
summary(clean_obj$predicted.celltype.l2.score)  # >0.75 ✓

```






# Azimuth Score Assessment
```{r azimuth-scores, fig.width=14, fig.height=16}
# Assess mapping and prediction scores to decide if l2 labels
# are reliable enough for trajectory analysis

p1 <- ggplot(clean_obj@meta.data,
             aes(x = mapping.score)) +
  geom_histogram(bins = 50, fill = "#4472C4", colour = "white") +
  geom_vline(xintercept = 0.5, colour = "red",
             linetype = "dashed", linewidth = 1) +
  theme_bw() +
  labs(title    = "Mapping score distribution",
       subtitle = "Cells below 0.5 are poorly mapped to PBMC reference",
       x = "Mapping score", y = "Count")

p2 <- ggplot(clean_obj@meta.data,
             aes(x = predicted.celltype.l2.score)) +
  geom_histogram(bins = 50, fill = "#ED7D31", colour = "white") +
  geom_vline(xintercept = 0.5, colour = "red",
             linetype = "dashed", linewidth = 1) +
  theme_bw() +
  labs(title = "Prediction score l2",
       x = "Prediction score", y = "Count")

p3 <- ggplot(clean_obj@meta.data,
             aes(x    = reorder(predicted.celltype.l2,
                                mapping.score, median),
                 y    = mapping.score,
                 fill = predicted.celltype.l2)) +
  geom_violin(scale = "width", trim = TRUE) +
  geom_boxplot(width = 0.08, fill = "white", outlier.size = 0.2) +
  geom_hline(yintercept = 0.5, colour = "red", linetype = "dashed") +
  theme_bw() +
  theme(axis.text.x     = element_text(angle = 45, hjust = 1, size = 9),
        legend.position = "none") +
  labs(title = "Mapping score per cell type", x = NULL, y = "Mapping score")

p4 <- ggplot(clean_obj@meta.data,
             aes(x    = reorder(predicted.celltype.l2,
                                predicted.celltype.l2.score, median),
                 y    = predicted.celltype.l2.score,
                 fill = predicted.celltype.l2)) +
  geom_violin(scale = "width", trim = TRUE) +
  geom_boxplot(width = 0.08, fill = "white", outlier.size = 0.2) +
  geom_hline(yintercept = 0.5, colour = "red", linetype = "dashed") +
  theme_bw() +
  theme(axis.text.x     = element_text(angle = 45, hjust = 1, size = 9),
        legend.position = "none") +
  labs(title = "Prediction score per cell type (l2)",
       x = NULL, y = "Prediction score")

(p1 | p2) / p3 / p4

# Scores on UMAP
p5 <- FeaturePlot(clean_obj,
                  features   = "mapping.score",
                  cols       = c("grey90","#C00000"),
                  min.cutoff = 0, max.cutoff = 1) +
  ggtitle("Mapping score on UMAP\n(red = high confidence)")

p6 <- FeaturePlot(clean_obj,
                  features   = "predicted.celltype.l2.score",
                  cols       = c("grey90","#4472C4"),
                  min.cutoff = 0, max.cutoff = 1) +
  ggtitle("Prediction score l2 on UMAP\n(blue = high confidence)")

p5 | p6
```
```{r}

# Works immediately
p_annot <- do_DimPlot(
  sample = clean_obj,
  group.by = "predicted.celltype.l2",
  label = FALSE,  # ← Fix: disable labels
  plot.title = "Azimuth L2 Annotation"
)
p_annot

```




```{r, fig.width=14, fig.height=16}
library(patchwork)

# 4-panel report (exact SCpubr replica)
p1 <- FeaturePlot(clean_obj, features = "mapping.score") + 
      ggtitle("Azimuth Mapping Score")

p2 <- FeaturePlot(clean_obj, features = "predicted.celltype.l2.score") + 
      ggtitle("Azimuth L2 Score")

p3 <- DimPlot(clean_obj, group.by = "predicted.celltype.l2", label = TRUE) +
      ggtitle("Local Clustering vs Azimuth")

p4 <- DimPlot(clean_obj, reduction = "ref.umap", 
              group.by = "predicted.celltype.l2", label = TRUE) +
      ggtitle("PBMC Reference Projection")

# Final report
(p1 | p2) / (p3 | p4)
ggsave("azimuth_manual_report.png", width = 16, height = 10, dpi = 300)



```




```{r}
library(dplyr)

prop_table <- clean_obj@meta.data %>%
  dplyr::count(predicted.celltype.l2) %>%  # ← Explicit dplyr::
  mutate(Percent = round(n / sum(n) * 100, 1)) %>%
  arrange(desc(n))

print(prop_table)
write.csv(prop_table, "celltype_proportions.csv", row.names = FALSE)

```



# Decision Table — Should We Use l2 Labels for Trajectory?
```{r decision-table}
decision_df <- clean_obj@meta.data %>%
  group_by(predicted.celltype.l2) %>%
  summarise(
    n_cells        = n(),
    median_mapping = round(median(mapping.score, na.rm = TRUE), 3),
    median_pred    = round(median(predicted.celltype.l2.score,
                                  na.rm = TRUE), 3),
    pct_above_0.5  = round(mean(mapping.score >= 0.5,
                                 na.rm = TRUE) * 100, 1),
    .groups = "drop"
  ) %>%
  arrange(desc(n_cells))

cat("=== Full decision table ===\n")
print(decision_df)

key_states <- c("CD4 Naive","CD4 TCM","CD4 TEM","CD4 TEMRA/CTL","Treg")
cat("\n=== Key trajectory states ===\n")
print(decision_df %>% filter(predicted.celltype.l2 %in% key_states))

cat("\n=== Decision rule ===\n")
cat("If median mapping score > 0.5 for CD4 Naive + CD4 TCM + Treg:\n")
cat("→ Proceed with Azimuth l2 labels for trajectory\n")
cat("If median mapping score < 0.5 for key states:\n")
cat("→ Fall back to seurat_clusters with manual marker validation\n")
```



# Save Final Reference Object
```{r save}
DefaultAssay(clean_obj) <- "integrated"

saveRDS(
  clean_obj,
  "CD4_reference_clean_Azimuth_ready_for_Slingshot.rds",
  compress = FALSE   # faster for large objects
)

cat("✅ Saved:", ncol(clean_obj), "cells\n")

```




```
