1 1. load libraries

library(dplyr)
library(ggplot2)
library(Seurat)
library(ComplexHeatmap)
library(presto)
library(tictoc)
library(SCpubr)
library(SeuratData)
library(Azimuth)
library(patchwork)
library(slingshot)
library(SingleCellExperiment)

options(timeout = 300)
set.seed(123)

# Colour palette used throughout — do not change
flow_colours <- c(
  "Tnaive" = "#4472C4",
  "Tcm"    = "#ED7D31",
  "Tem"    = "#A9D18E",
  "Temra"  = "#C00000",
  "Treg"   = "#FFD700"
)

2 2. Load Seurat Object


#Load Seurat Object merged from cell lines and a control(PBMC) after filtration
All_samples_Merged <- readRDS("../../CD4_reference_annotated_with_markers.rds")

3 Run Azmimuth and check annotation again


library(SeuratData)
library(Azimuth)
options(timeout = 300)  # Increase download timeout if needed [web:63] 

sample <- All_samples_Merged

rm(All_samples_Merged)

gc()
            used   (Mb) gc trigger   (Mb)  max used   (Mb)
Ncells  14034042  749.5   25295498 1351.0  21650150 1156.3
Vcells 313545618 2392.2  514272694 3923.6 334874035 2554.9
DefaultAssay(sample) <- "RNA"

sample <- RunAzimuth(sample, reference = "pbmcref")
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|

  |                                                  | 0 % ~calculating  
  |++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed=05s  
Using method 'umap'
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
DefaultAssay(sample) <- "SCT"

SCpubr::do_DimPlot(
  sample = sample,
  reduction = "umap",
  group.by = "predicted.celltype.l2",
  font.size = 18, legend.position = "right",
  legend.title = "Cell type (Healthy CD4 T cells)"
)

NA
NA

3.1 Clean Object

3.2 Rename CD4 CTL → CD4 Temra/CTL

# Rename in metadata
clean_obj$predicted.celltype.l2[clean_obj$predicted.celltype.l2 == "CD4 CTL"] <- "CD4 Temra/CTL"

# Verify
table(clean_obj$predicted.celltype.l2)

    CD4 Naive       CD4 TCM       CD4 TEM CD4 Temra/CTL          Treg 
         2037          9067           145            10           207 

3.3 Define Logical Order for Legend

# Define desired order (Naive → TCM → TEM → Temra → Treg)
cell_order <- c("CD4 Naive", "CD4 TCM", "CD4 TEM", "CD4 Temra/CTL", "Treg")

# Set factor levels
clean_obj$predicted.celltype.l2 <- factor(clean_obj$predicted.celltype.l2,
                                          levels = cell_order)

# Verify order
levels(clean_obj$predicted.celltype.l2)
[1] "CD4 Naive"     "CD4 TCM"       "CD4 TEM"       "CD4 Temra/CTL" "Treg"         

3.4 Publication-Quality UMAP Plot

library(ggplot2)
p1 <- DimPlot(clean_obj,
              group.by = "predicted.celltype.l2",
              label = TRUE,
              repel = TRUE,
              cols = c("CD4 Naive" = "#1f77b4",      # Blue
                       "CD4 TCM"   = "#ff7f0e",      # Orange  
                       "CD4 TEM"   = "#2ca02c",      # Green
                       "CD4 Temra/CTL" = "#d62728",      # Red
                       "Treg"      = "#9467bd"),     # Purple
              pt.size = 0.5) +
  guides(color = guide_legend(override.aes = list(size = 4))) +
  labs(title = "Healthy CD4 T Cell Reference Atlas",
       subtitle = "Quiescent states only | Azimuth L2 annotation") +
  theme_void() +
  theme(legend.position = "right",
        plot.title = element_text(hjust = 0.5, face = "bold", size = 16),
        plot.subtitle = element_text(hjust = 0.5, size = 12))

print(p1)

ggsave("healthy_cd4_reference_umap.pdf", p1, width = 10, height = 8, dpi = 300)

# PNG version (for presentations)
ggsave("healthy_cd4_reference_umap.png", p1, 
       width = 10, height = 8, dpi = 300, bg = "white")
library(ggplot2)
library(Azimuth)

# Run this diagnostic to understand what is happening
DefaultAssay(clean_obj) <- "RNA"

# Check S and G2M scores distribution
p1 <- ggplot(clean_obj@meta.data,
             aes(x = S.Score, fill = dataset)) +
  geom_histogram(bins = 50, alpha = 0.7, position = "identity") +
  geom_vline(xintercept = 0, colour = "red", linetype = "dashed") +
  facet_wrap(~dataset, ncol = 1) +
  theme_bw() +
  labs(title = "S score distribution per dataset",
       subtitle = "Healthy resting T-cells should peak below 0")

p2 <- ggplot(clean_obj@meta.data,
             aes(x = G2M.Score, fill = dataset)) +
  geom_histogram(bins = 50, alpha = 0.7, position = "identity") +
  geom_vline(xintercept = 0, colour = "red", linetype = "dashed") +
  facet_wrap(~dataset, ncol = 1) +
  theme_bw() +
  labs(title = "G2M score distribution per dataset")

p1 | p2


# Check which cell types are driving S phase
cat("Phase per cell type:\n")
Phase per cell type:
print(table(clean_obj$Phase, clean_obj$predicted.celltype.l2))
     
      CD4 Naive CD4 TCM CD4 TEM CD4 Temra/CTL Treg
  G1       1151    4165      69             6   86
  G2M       107     816      15             1   33
  S         779    4086      61             3   88
# Check S phase scores per cell type
ggplot(clean_obj@meta.data,
       aes(x    = predicted.celltype.l2,
           y    = S.Score,
           fill = predicted.celltype.l2)) +
  geom_violin(scale = "width", trim = TRUE) +
  geom_boxplot(width = 0.08, fill = "white", outlier.size = 0.2) +
  geom_hline(yintercept = 0, colour = "red", linetype = "dashed") +
  theme_bw() +
  theme(axis.text.x    = element_text(angle = 45, hjust = 1),
        legend.position = "none") +
  labs(title = "S.Score per cell type — values above 0 = cycling",
       x = NULL, y = "S.Score")

4 Re-embed — UMAP Only (Preserve Existing RPCA Integration)

# CRITICAL:
# This object is already SCT normalised and RPCA integrated.
# The integrated assay PCA embedding IS the batch-corrected space.
#
# DO NOT run:  ScaleData (would re-scale and destroy CC regression from SCT)
# DO NOT run:  RunPCA    (would recompute PCA, destroying RPCA integration
#                         and causing datasets to separate again)
# DO NOT run:  SCTransform or IntegrateData
#
# ONLY run:   FindNeighbors and RunUMAP on the EXISTING pca reduction
#             This preserves the RPCA-corrected embedding and batch correction

cat("Available reductions:\n")
Available reductions:
print(names(clean_obj@reductions))
[1] "pca"           "umap"          "integrated_dr" "ref.umap"     
DefaultAssay(clean_obj) <- "integrated"

# Use same dims as original integration
dims_use <- 1:20

# Rebuild neighbor graph on EXISTING pca embedding
clean_obj <- FindNeighbors(
  clean_obj,
  reduction = "pca",
  dims      = dims_use,
  verbose   = FALSE
)

# Rerun UMAP on EXISTING pca embedding
# return.model = TRUE is CRITICAL for MapQuery (malignant cell projection)
clean_obj <- RunUMAP(
  clean_obj,
  reduction    = "pca",
  dims         = dims_use,
  return.model = TRUE,
  verbose      = FALSE
)

# Recluster at multiple resolutions for flexibility
clean_obj <- FindClusters(
  clean_obj,
  resolution = c(0.1, 0.2, 0.3, 0.4, 0.5),
  verbose    = FALSE
)

cat("Clusters at resolution 0.3:",
    length(unique(clean_obj$integrated_snn_res.0.3)), "\n")
Clusters at resolution 0.3: 10 

5 Verify Integration Preserved

# CRITICAL CHECK: datasets should be well mixed on UMAP
# If they are separated — integration was destroyed and needs to be re-done

p1 <- DimPlot(clean_obj,
              group.by = "dataset",
              shuffle  = TRUE) +
  ggtitle("Dataset mixing — should be well mixed\n(if separated: re-integration needed)")

p2 <- DimPlot(clean_obj,
              group.by = "predicted.celltype.l2",
              label    = TRUE, repel = TRUE, label.box = TRUE) +
  ggtitle("Azimuth l2 labels")

p3 <- DimPlot(clean_obj,
              group.by = "Phase") +
  ggtitle("Cell cycle phase (should be mixed, not separated)")

p1 | p2 | p3

6 Verify Integration Preserved


p1 

p2 


# No CD8 cells remain
table(clean_obj$predicted.celltype.l2)

    CD4 Naive       CD4 TCM       CD4 TEM CD4 Temra/CTL          Treg 
         2037          9067           145            10           207 
# High confidence scores
summary(clean_obj$mapping.score)      # >0.65 ✓
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 0.1581  0.6618  0.7762  0.7455  0.8731  1.0000 
summary(clean_obj$predicted.celltype.l2.score)  # >0.75 ✓
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 0.2726  0.6740  0.8621  0.8126  0.9653  1.0000 

7 Azimuth Score Assessment

# Assess mapping and prediction scores to decide if l2 labels
# are reliable enough for trajectory analysis

p1 <- ggplot(clean_obj@meta.data,
             aes(x = mapping.score)) +
  geom_histogram(bins = 50, fill = "#4472C4", colour = "white") +
  geom_vline(xintercept = 0.5, colour = "red",
             linetype = "dashed", linewidth = 1) +
  theme_bw() +
  labs(title    = "Mapping score distribution",
       subtitle = "Cells below 0.5 are poorly mapped to PBMC reference",
       x = "Mapping score", y = "Count")

p2 <- ggplot(clean_obj@meta.data,
             aes(x = predicted.celltype.l2.score)) +
  geom_histogram(bins = 50, fill = "#ED7D31", colour = "white") +
  geom_vline(xintercept = 0.5, colour = "red",
             linetype = "dashed", linewidth = 1) +
  theme_bw() +
  labs(title = "Prediction score l2",
       x = "Prediction score", y = "Count")

p3 <- ggplot(clean_obj@meta.data,
             aes(x    = reorder(predicted.celltype.l2,
                                mapping.score, median),
                 y    = mapping.score,
                 fill = predicted.celltype.l2)) +
  geom_violin(scale = "width", trim = TRUE) +
  geom_boxplot(width = 0.08, fill = "white", outlier.size = 0.2) +
  geom_hline(yintercept = 0.5, colour = "red", linetype = "dashed") +
  theme_bw() +
  theme(axis.text.x     = element_text(angle = 45, hjust = 1, size = 9),
        legend.position = "none") +
  labs(title = "Mapping score per cell type", x = NULL, y = "Mapping score")

p4 <- ggplot(clean_obj@meta.data,
             aes(x    = reorder(predicted.celltype.l2,
                                predicted.celltype.l2.score, median),
                 y    = predicted.celltype.l2.score,
                 fill = predicted.celltype.l2)) +
  geom_violin(scale = "width", trim = TRUE) +
  geom_boxplot(width = 0.08, fill = "white", outlier.size = 0.2) +
  geom_hline(yintercept = 0.5, colour = "red", linetype = "dashed") +
  theme_bw() +
  theme(axis.text.x     = element_text(angle = 45, hjust = 1, size = 9),
        legend.position = "none") +
  labs(title = "Prediction score per cell type (l2)",
       x = NULL, y = "Prediction score")

(p1 | p2) / p3 / p4


# Scores on UMAP
p5 <- FeaturePlot(clean_obj,
                  features   = "mapping.score",
                  cols       = c("grey90","#C00000"),
                  min.cutoff = 0, max.cutoff = 1) +
  ggtitle("Mapping score on UMAP\n(red = high confidence)")

p6 <- FeaturePlot(clean_obj,
                  features   = "predicted.celltype.l2.score",
                  cols       = c("grey90","#4472C4"),
                  min.cutoff = 0, max.cutoff = 1) +
  ggtitle("Prediction score l2 on UMAP\n(blue = high confidence)")

p5 | p6

library(patchwork)

# 4-panel report (exact SCpubr replica)
p1 <- FeaturePlot(clean_obj, features = "mapping.score") + 
      ggtitle("Azimuth Mapping Score")

p2 <- FeaturePlot(clean_obj, features = "predicted.celltype.l2.score") + 
      ggtitle("Azimuth L2 Score")

p3 <- DimPlot(clean_obj, group.by = "predicted.celltype.l2", label = TRUE) +
      ggtitle("Local Clustering vs Azimuth")

p4 <- DimPlot(clean_obj, reduction = "ref.umap", 
              group.by = "predicted.celltype.l2", label = TRUE) +
      ggtitle("PBMC Reference Projection")

# Final report
(p1 | p2) / (p3 | p4)

ggsave("azimuth_manual_report.png", width = 16, height = 10, dpi = 300)
library(dplyr)

prop_table <- clean_obj@meta.data %>%
  dplyr::count(predicted.celltype.l2) %>%  # ← Explicit dplyr::
  mutate(Percent = round(n / sum(n) * 100, 1)) %>%
  arrange(desc(n))

print(prop_table)
write.csv(prop_table, "celltype_proportions.csv", row.names = FALSE)

8 Decision Table — Should We Use l2 Labels for Trajectory?

decision_df <- clean_obj@meta.data %>%
  group_by(predicted.celltype.l2) %>%
  summarise(
    n_cells        = n(),
    median_mapping = round(median(mapping.score, na.rm = TRUE), 3),
    median_pred    = round(median(predicted.celltype.l2.score,
                                  na.rm = TRUE), 3),
    pct_above_0.5  = round(mean(mapping.score >= 0.5,
                                 na.rm = TRUE) * 100, 1),
    .groups = "drop"
  ) %>%
  arrange(desc(n_cells))

cat("=== Full decision table ===\n")
=== Full decision table ===
print(decision_df)

key_states <- c("CD4 Naive","CD4 TCM","CD4 TEM","CD4 TEMRA/CTL","Treg")
cat("\n=== Key trajectory states ===\n")

=== Key trajectory states ===
print(decision_df %>% filter(predicted.celltype.l2 %in% key_states))

cat("\n=== Decision rule ===\n")

=== Decision rule ===
cat("If median mapping score > 0.5 for CD4 Naive + CD4 TCM + Treg:\n")
If median mapping score > 0.5 for CD4 Naive + CD4 TCM + Treg:
cat("→ Proceed with Azimuth l2 labels for trajectory\n")
→ Proceed with Azimuth l2 labels for trajectory
cat("If median mapping score < 0.5 for key states:\n")
If median mapping score < 0.5 for key states:
cat("→ Fall back to seurat_clusters with manual marker validation\n")
→ Fall back to seurat_clusters with manual marker validation

9 Save Final Reference Object

cat("✅ Saved:", ncol(clean_obj), "cells\n")
✅ Saved: 11466 cells

```

---
title: "CD4 T-cell Reference + Azimuth Annotation Report"
author: Nasir Mahmood Abbasi
date: "`r Sys.Date()`"
output:
  html_notebook:
    number_sections: true
    toc: true
    toc_float:
      collapsed: true
    theme: journal
---


# 1. load libraries
```{r setup, include=TRUE}
library(dplyr)
library(ggplot2)
library(Seurat)
library(ComplexHeatmap)
library(presto)
library(tictoc)
library(SCpubr)
library(SeuratData)
library(Azimuth)
library(patchwork)
library(slingshot)
library(SingleCellExperiment)

options(timeout = 300)
set.seed(123)

# Colour palette used throughout — do not change
flow_colours <- c(
  "Tnaive" = "#4472C4",
  "Tcm"    = "#ED7D31",
  "Tem"    = "#A9D18E",
  "Temra"  = "#C00000",
  "Treg"   = "#FFD700"
)
```


# 2. Load Seurat Object 
```{r}

#Load Seurat Object merged from cell lines and a control(PBMC) after filtration
All_samples_Merged <- readRDS("../../CD4_reference_annotated_with_markers.rds")

```


# Run Azmimuth and check annotation again
```{r, fig.height=6, fig.width=12}

library(SeuratData)
library(Azimuth)
options(timeout = 300)  # Increase download timeout if needed [web:63] 

sample <- All_samples_Merged

rm(All_samples_Merged)

gc()

DefaultAssay(sample) <- "RNA"

sample <- RunAzimuth(sample, reference = "pbmcref")

DefaultAssay(sample) <- "SCT"

SCpubr::do_DimPlot(
  sample = sample,
  reduction = "umap",
  group.by = "predicted.celltype.l2",
  font.size = 18, legend.position = "right",
  legend.title = "Cell type (Healthy CD4 T cells)"
)


```


## Clean Object
```{r, fig.height=6, fig.width=10}

clean_obj <- subset(
     sample,
     subset = predicted.celltype.l2 %in% c(
         "CD8 Naive", "CD8 TCM", "CD8 TEM", "Platelet", "CD4 Proliferating"
     ),
     invert = TRUE
 )

cat("\nAfter cleaning:", ncol(clean_obj), "cells\n")
cat("\nRemoved cell types:\n")

cat("\nRemaining:\n")
print(table(clean_obj$predicted.celltype.l2))

rm(sample); gc()

SCpubr::do_DimPlot(
  sample = clean_obj,
  reduction = "umap",
  group.by = "predicted.celltype.l2",
  font.size = 18, legend.position = "right",
  legend.title = "Cell type (Healthy CD4 T cells)"
)

```

## Rename CD4 CTL → CD4 Temra/CTL
```{r, fig.height=6, fig.width=10}
# Rename in metadata
clean_obj$predicted.celltype.l2[clean_obj$predicted.celltype.l2 == "CD4 CTL"] <- "CD4 Temra/CTL"

# Verify
table(clean_obj$predicted.celltype.l2)

```


## Define Logical Order for Legend
```{r, fig.height=6, fig.width=10}
# Define desired order (Naive → TCM → TEM → Temra → Treg)
cell_order <- c("CD4 Naive", "CD4 TCM", "CD4 TEM", "CD4 Temra/CTL", "Treg")

# Set factor levels
clean_obj$predicted.celltype.l2 <- factor(clean_obj$predicted.celltype.l2,
                                          levels = cell_order)

# Verify order
levels(clean_obj$predicted.celltype.l2)

```



## Publication-Quality UMAP Plot
```{r, fig.height=6, fig.width=10}
library(ggplot2)
p1 <- DimPlot(clean_obj,
              group.by = "predicted.celltype.l2",
              label = TRUE,
              repel = TRUE,
              cols = c("CD4 Naive" = "#1f77b4",      # Blue
                       "CD4 TCM"   = "#ff7f0e",      # Orange  
                       "CD4 TEM"   = "#2ca02c",      # Green
                       "CD4 Temra/CTL" = "#d62728",      # Red
                       "Treg"      = "#9467bd"),     # Purple
              pt.size = 0.5) +
  guides(color = guide_legend(override.aes = list(size = 4))) +
  labs(title = "Healthy CD4 T Cell Reference Atlas",
       subtitle = "Quiescent states only | Azimuth L2 annotation") +
  theme_void() +
  theme(legend.position = "right",
        plot.title = element_text(hjust = 0.5, face = "bold", size = 16),
        plot.subtitle = element_text(hjust = 0.5, size = 12))

print(p1)
ggsave("healthy_cd4_reference_umap.pdf", p1, width = 10, height = 8, dpi = 300)

# PNG version (for presentations)
ggsave("healthy_cd4_reference_umap.png", p1, 
       width = 10, height = 8, dpi = 300, bg = "white")

```

```{r}
library(ggplot2)
library(Azimuth)

# Run this diagnostic to understand what is happening
DefaultAssay(clean_obj) <- "RNA"

# Check S and G2M scores distribution
p1 <- ggplot(clean_obj@meta.data,
             aes(x = S.Score, fill = dataset)) +
  geom_histogram(bins = 50, alpha = 0.7, position = "identity") +
  geom_vline(xintercept = 0, colour = "red", linetype = "dashed") +
  facet_wrap(~dataset, ncol = 1) +
  theme_bw() +
  labs(title = "S score distribution per dataset",
       subtitle = "Healthy resting T-cells should peak below 0")

p2 <- ggplot(clean_obj@meta.data,
             aes(x = G2M.Score, fill = dataset)) +
  geom_histogram(bins = 50, alpha = 0.7, position = "identity") +
  geom_vline(xintercept = 0, colour = "red", linetype = "dashed") +
  facet_wrap(~dataset, ncol = 1) +
  theme_bw() +
  labs(title = "G2M score distribution per dataset")

p1 | p2

# Check which cell types are driving S phase
cat("Phase per cell type:\n")
print(table(clean_obj$Phase, clean_obj$predicted.celltype.l2))

# Check S phase scores per cell type
ggplot(clean_obj@meta.data,
       aes(x    = predicted.celltype.l2,
           y    = S.Score,
           fill = predicted.celltype.l2)) +
  geom_violin(scale = "width", trim = TRUE) +
  geom_boxplot(width = 0.08, fill = "white", outlier.size = 0.2) +
  geom_hline(yintercept = 0, colour = "red", linetype = "dashed") +
  theme_bw() +
  theme(axis.text.x    = element_text(angle = 45, hjust = 1),
        legend.position = "none") +
  labs(title = "S.Score per cell type — values above 0 = cycling",
       x = NULL, y = "S.Score")
```
# Re-embed — UMAP Only (Preserve Existing RPCA Integration)
```{r reembed}
# CRITICAL:
# This object is already SCT normalised and RPCA integrated.
# The integrated assay PCA embedding IS the batch-corrected space.
#
# DO NOT run:  ScaleData (would re-scale and destroy CC regression from SCT)
# DO NOT run:  RunPCA    (would recompute PCA, destroying RPCA integration
#                         and causing datasets to separate again)
# DO NOT run:  SCTransform or IntegrateData
#
# ONLY run:   FindNeighbors and RunUMAP on the EXISTING pca reduction
#             This preserves the RPCA-corrected embedding and batch correction

cat("Available reductions:\n")
print(names(clean_obj@reductions))

DefaultAssay(clean_obj) <- "integrated"

# Use same dims as original integration
dims_use <- 1:20

# Rebuild neighbor graph on EXISTING pca embedding
clean_obj <- FindNeighbors(
  clean_obj,
  reduction = "pca",
  dims      = dims_use,
  verbose   = FALSE
)

# Rerun UMAP on EXISTING pca embedding
# return.model = TRUE is CRITICAL for MapQuery (malignant cell projection)
clean_obj <- RunUMAP(
  clean_obj,
  reduction    = "pca",
  dims         = dims_use,
  return.model = TRUE,
  verbose      = FALSE
)

# Recluster at multiple resolutions for flexibility
clean_obj <- FindClusters(
  clean_obj,
  resolution = c(0.1, 0.2, 0.3, 0.4, 0.5),
  verbose    = FALSE
)

cat("Clusters at resolution 0.3:",
    length(unique(clean_obj$integrated_snn_res.0.3)), "\n")
```

# Verify Integration Preserved
```{r verify-integration, fig.width=20, fig.height=7}
# CRITICAL CHECK: datasets should be well mixed on UMAP
# If they are separated — integration was destroyed and needs to be re-done

p1 <- DimPlot(clean_obj,
              group.by = "dataset",
              shuffle  = TRUE) +
  ggtitle("Dataset mixing — should be well mixed\n(if separated: re-integration needed)")

p2 <- DimPlot(clean_obj,
              group.by = "predicted.celltype.l2",
              label    = TRUE, repel = TRUE, label.box = TRUE) +
  ggtitle("Azimuth l2 labels")

p3 <- DimPlot(clean_obj,
              group.by = "Phase") +
  ggtitle("Cell cycle phase (should be mixed, not separated)")

p1 | p2 | p3

```

# Verify Integration Preserved
```{r , fig.width=9, fig.height=6}

p1 
p2 

# No CD8 cells remain
table(clean_obj$predicted.celltype.l2)

# High confidence scores
summary(clean_obj$mapping.score)      # >0.65 ✓
summary(clean_obj$predicted.celltype.l2.score)  # >0.75 ✓

```






# Azimuth Score Assessment
```{r azimuth-scores, fig.width=14, fig.height=16}
# Assess mapping and prediction scores to decide if l2 labels
# are reliable enough for trajectory analysis

p1 <- ggplot(clean_obj@meta.data,
             aes(x = mapping.score)) +
  geom_histogram(bins = 50, fill = "#4472C4", colour = "white") +
  geom_vline(xintercept = 0.5, colour = "red",
             linetype = "dashed", linewidth = 1) +
  theme_bw() +
  labs(title    = "Mapping score distribution",
       subtitle = "Cells below 0.5 are poorly mapped to PBMC reference",
       x = "Mapping score", y = "Count")

p2 <- ggplot(clean_obj@meta.data,
             aes(x = predicted.celltype.l2.score)) +
  geom_histogram(bins = 50, fill = "#ED7D31", colour = "white") +
  geom_vline(xintercept = 0.5, colour = "red",
             linetype = "dashed", linewidth = 1) +
  theme_bw() +
  labs(title = "Prediction score l2",
       x = "Prediction score", y = "Count")

p3 <- ggplot(clean_obj@meta.data,
             aes(x    = reorder(predicted.celltype.l2,
                                mapping.score, median),
                 y    = mapping.score,
                 fill = predicted.celltype.l2)) +
  geom_violin(scale = "width", trim = TRUE) +
  geom_boxplot(width = 0.08, fill = "white", outlier.size = 0.2) +
  geom_hline(yintercept = 0.5, colour = "red", linetype = "dashed") +
  theme_bw() +
  theme(axis.text.x     = element_text(angle = 45, hjust = 1, size = 9),
        legend.position = "none") +
  labs(title = "Mapping score per cell type", x = NULL, y = "Mapping score")

p4 <- ggplot(clean_obj@meta.data,
             aes(x    = reorder(predicted.celltype.l2,
                                predicted.celltype.l2.score, median),
                 y    = predicted.celltype.l2.score,
                 fill = predicted.celltype.l2)) +
  geom_violin(scale = "width", trim = TRUE) +
  geom_boxplot(width = 0.08, fill = "white", outlier.size = 0.2) +
  geom_hline(yintercept = 0.5, colour = "red", linetype = "dashed") +
  theme_bw() +
  theme(axis.text.x     = element_text(angle = 45, hjust = 1, size = 9),
        legend.position = "none") +
  labs(title = "Prediction score per cell type (l2)",
       x = NULL, y = "Prediction score")

(p1 | p2) / p3 / p4

# Scores on UMAP
p5 <- FeaturePlot(clean_obj,
                  features   = "mapping.score",
                  cols       = c("grey90","#C00000"),
                  min.cutoff = 0, max.cutoff = 1) +
  ggtitle("Mapping score on UMAP\n(red = high confidence)")

p6 <- FeaturePlot(clean_obj,
                  features   = "predicted.celltype.l2.score",
                  cols       = c("grey90","#4472C4"),
                  min.cutoff = 0, max.cutoff = 1) +
  ggtitle("Prediction score l2 on UMAP\n(blue = high confidence)")

p5 | p6
```
```{r}

# Works immediately
p_annot <- do_DimPlot(
  sample = clean_obj,
  group.by = "predicted.celltype.l2",
  label = FALSE,  # ← Fix: disable labels
  plot.title = "Azimuth L2 Annotation"
)
p_annot

```




```{r, fig.width=14, fig.height=16}
library(patchwork)

# 4-panel report (exact SCpubr replica)
p1 <- FeaturePlot(clean_obj, features = "mapping.score") + 
      ggtitle("Azimuth Mapping Score")

p2 <- FeaturePlot(clean_obj, features = "predicted.celltype.l2.score") + 
      ggtitle("Azimuth L2 Score")

p3 <- DimPlot(clean_obj, group.by = "predicted.celltype.l2", label = TRUE) +
      ggtitle("Local Clustering vs Azimuth")

p4 <- DimPlot(clean_obj, reduction = "ref.umap", 
              group.by = "predicted.celltype.l2", label = TRUE) +
      ggtitle("PBMC Reference Projection")

# Final report
(p1 | p2) / (p3 | p4)
ggsave("azimuth_manual_report.png", width = 16, height = 10, dpi = 300)



```




```{r}
library(dplyr)

prop_table <- clean_obj@meta.data %>%
  dplyr::count(predicted.celltype.l2) %>%  # ← Explicit dplyr::
  mutate(Percent = round(n / sum(n) * 100, 1)) %>%
  arrange(desc(n))

print(prop_table)
write.csv(prop_table, "celltype_proportions.csv", row.names = FALSE)

```



# Decision Table — Should We Use l2 Labels for Trajectory?
```{r decision-table}
decision_df <- clean_obj@meta.data %>%
  group_by(predicted.celltype.l2) %>%
  summarise(
    n_cells        = n(),
    median_mapping = round(median(mapping.score, na.rm = TRUE), 3),
    median_pred    = round(median(predicted.celltype.l2.score,
                                  na.rm = TRUE), 3),
    pct_above_0.5  = round(mean(mapping.score >= 0.5,
                                 na.rm = TRUE) * 100, 1),
    .groups = "drop"
  ) %>%
  arrange(desc(n_cells))

cat("=== Full decision table ===\n")
print(decision_df)

key_states <- c("CD4 Naive","CD4 TCM","CD4 TEM","CD4 TEMRA/CTL","Treg")
cat("\n=== Key trajectory states ===\n")
print(decision_df %>% filter(predicted.celltype.l2 %in% key_states))

cat("\n=== Decision rule ===\n")
cat("If median mapping score > 0.5 for CD4 Naive + CD4 TCM + Treg:\n")
cat("→ Proceed with Azimuth l2 labels for trajectory\n")
cat("If median mapping score < 0.5 for key states:\n")
cat("→ Fall back to seurat_clusters with manual marker validation\n")
```



# Save Final Reference Object
```{r save}
DefaultAssay(clean_obj) <- "integrated"

saveRDS(
  clean_obj,
  "CD4_reference_clean_Azimuth_ready_for_Slingshot.rds",
  compress = FALSE   # faster for large objects
)

cat("✅ Saved:", ncol(clean_obj), "cells\n")

```




```
