load libraries
Load RDS with all
annotations
# Ensure clusters are ordered 0-18
All_samples_Merged$seurat_clusters <- factor(All_samples_Merged$seurat_clusters,
levels = as.character(0:18))
# Order samples logically
sample_order <- c("Healthy_Blood", "Healthy_Skin",
"SS1_Blood", "SS1_Skin",
"SS2_Blood", "SS2_Skin",
"SS3_Blood", "SS3_Skin",
"SS4_Blood", "SS4_Skin",
"SS5_Blood", "SS6_Blood")
All_samples_Merged$sample_id <- factor(All_samples_Merged$sample_id,
levels = sample_order)
cat("Total cells:", ncol(All_samples_Merged), "\n")
Total cells: 10785
cat("Total clusters:", length(unique(All_samples_Merged$seurat_clusters)), "\n\n")
Total clusters: 19
cat("Clusters (ordered 0-18):\n")
Clusters (ordered 0-18):
print(table(All_samples_Merged$seurat_clusters))
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
185 92 3241 1006 2278 223 404 230 923 84 829 727 124 150 183 48 4 44 10
cat("\n\nSamples (ordered):\n")
Samples (ordered):
print(table(All_samples_Merged$sample_id))
Healthy_Blood Healthy_Skin SS1_Blood SS1_Skin SS2_Blood SS2_Skin SS3_Blood
4386 33 1635 58 935 38 997
SS3_Skin SS4_Blood SS4_Skin SS5_Blood SS6_Blood
23 373 184 959 1164
Create Annotation
Summary Table by Clusters
library(dplyr)
library(tidyr)
library(purrr)
# Define annotation methods
methods <- c(
"predicted.id", # scPred
"predicted.celltype.l2", # Azimuth (l2 prediction)
"singler.immune", # SingleR
"scATOMIC_annotation" # scATOMIC
)
# Create summary - most common label per cluster for each method
annotation_summary <- map_dfr(methods, function(m) {
df <- All_samples_Merged@meta.data
df %>%
filter(!is.na(.data[[m]])) %>%
group_by(seurat_clusters, label = .data[[m]]) %>%
summarise(n = n(), .groups = "drop") %>%
group_by(seurat_clusters) %>%
slice_max(n, n = 1, with_ties = FALSE) %>%
mutate(method = m)
})
# Rename methods for display
annotation_summary <- annotation_summary %>%
mutate(method = recode(method,
"predicted.id" = "scPred",
"predicted.celltype.l2" = "Azimuth.l2",
"singler.immune" = "SingleR(Immune)",
"scATOMIC_annotation" = "scATOMIC"))
# Set method order
annotation_summary$method <- factor(annotation_summary$method,
levels = c("scPred", "Azimuth.l2",
"SingleR(Immune)", "scATOMIC"))
# Ensure cluster order 0-18
annotation_summary$seurat_clusters <- factor(annotation_summary$seurat_clusters,
levels = as.character(0:18))
# Save annotation summary by clusters (long format)
write.csv(annotation_summary,
"annotation_summary_by_clusters_04-02-2026.csv",
row.names = FALSE)
# Create and save wide format
annotation_table_clusters_wide <- annotation_summary %>%
select(seurat_clusters, method, label) %>%
pivot_wider(names_from = method, values_from = label)
write.csv(annotation_table_clusters_wide,
"annotation_table_clusters_wide_04-02-2026.csv",
row.names = FALSE)
# Display
cat("\n=== Annotation Summary by Clusters ===\n\n")
=== Annotation Summary by Clusters ===
print(annotation_summary)
# A tibble: 76 × 4
# Groups: seurat_clusters [19]
seurat_clusters label n method
<fct> <chr> <int> <fct>
1 0 CD4 T cell 123 scPred
2 1 cMono 85 scPred
3 2 CD4 T cell 3137 scPred
4 3 CD8 T cell 981 scPred
5 4 CD8 T cell 1393 scPred
6 5 NK cell 210 scPred
7 6 B cell 374 scPred
8 7 CD4 T cell 230 scPred
9 8 CD4 T cell 922 scPred
10 9 ncMono 84 scPred
# ℹ 66 more rows
# ℹ Use `print(n = ...)` to see more rows
cat("\n✓ Saved: annotation_summary_by_clusters_04-02-2026.csv (long format)\n")
✓ Saved: annotation_summary_by_clusters_04-02-2026.csv (long format)
cat("✓ Saved: annotation_table_clusters_wide_04-02-2026.csv (wide format)\n")
✓ Saved: annotation_table_clusters_wide_04-02-2026.csv (wide format)
Summary
Statistics
# Cell counts per method
cat("\n=== Cells Annotated Per Method ===\n\n")
=== Cells Annotated Per Method ===
method_cols <- c("predicted.id", "predicted.celltype.l2", "singler.immune", "scATOMIC_annotation")
for(m in method_cols){
n_annotated <- sum(!is.na(All_samples_Merged@meta.data[[m]]))
cat(sprintf("%-25s: %5d cells (%.1f%%)\n",
m, n_annotated, n_annotated/ncol(All_samples_Merged)*100))
}
predicted.id : 10785 cells (100.0%)
predicted.celltype.l2 : 10785 cells (100.0%)
singler.immune : 10710 cells (99.3%)
scATOMIC_annotation : 10752 cells (99.7%)
# Number of unique labels per method
cat("\n=== Unique Labels Per Method ===\n\n")
=== Unique Labels Per Method ===
annotation_summary %>%
group_by(method) %>%
summarise(n_unique_labels = n_distinct(label)) %>%
print()
# A tibble: 4 × 2
method n_unique_labels
<fct> <int>
1 scPred 8
2 Azimuth.l2 10
3 SingleR(Immune) 9
4 scATOMIC 10
Basic Heatmap
Visualization
library(ggplot2)
ggplot(annotation_summary, aes(x = seurat_clusters, y = method, fill = label)) +
geom_tile(color = "white", linewidth = 0.5) +
scale_fill_discrete() +
labs(
x = "Seurat Clusters",
y = "Annotation Method",
fill = "Assigned Cell Type",
title = "Cross-Method Comparison of Cell Type Annotations"
) +
theme_minimal(base_size = 14) +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
axis.title = element_text(face = "bold"),
plot.title = element_text(hjust = 0.5, face = "bold", size = 16)
)

NA
NA
Enhanced Heatmap with
Custom Colors
library(RColorBrewer)
# Generate color palette
num_colors <- length(unique(annotation_summary$label))
my_colors <- colorRampPalette(brewer.pal(8, "Set2"))(num_colors)
p_heatmap <- ggplot(annotation_summary, aes(x = seurat_clusters, y = method, fill = label)) +
geom_tile(color = "white", linewidth = 0.5) +
scale_fill_manual(values = my_colors) +
labs(
x = "Seurat Clusters",
y = "Annotation Method",
fill = "Assigned Cell Type",
title = "Cross-Method Comparison of Cell Type Annotations"
) +
theme_minimal(base_size = 14) +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
axis.title = element_text(face = "bold"),
plot.title = element_text(hjust = 0.5, face = "bold", size = 16),
legend.position = "right"
)
print(p_heatmap)

# Save high-quality versions
ggsave("annotation_heatmap_04-02-2026.png", plot = p_heatmap,
width = 14, height = 5, dpi = 300, bg = "white")
ggsave("annotation_heatmap_04-02-2026.pdf", plot = p_heatmap,
width = 14, height = 5)
cat("\n✓ Heatmap saved as PNG and PDF\n")
✓ Heatmap saved as PNG and PDF
Heatmap: Methods ×
Clusters
library(RColorBrewer)
# Generate color palette
num_colors <- length(unique(annotation_summary$label))
my_colors <- colorRampPalette(brewer.pal(8, "Set2"))(num_colors)
p_heatmap_clusters <- ggplot(annotation_summary,
aes(x = seurat_clusters, y = method, fill = label)) +
geom_tile(color = "white", linewidth = 0.5) +
scale_fill_manual(values = my_colors) +
scale_x_discrete(drop = FALSE) + # Keep all cluster levels even if missing
labs(
x = "Seurat Clusters (0-18)",
y = "Annotation Method",
fill = "Assigned Cell Type",
title = "Cross-Method Comparison of Cell Type Annotations"
) +
theme_minimal(base_size = 14) +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
axis.title = element_text(face = "bold"),
plot.title = element_text(hjust = 0.5, face = "bold", size = 16),
legend.position = "right"
)
print(p_heatmap_clusters)

# Save
ggsave("annotation_heatmap_methods_clusters_04-02-2026.png",
plot = p_heatmap_clusters, width = 14, height = 5, dpi = 300, bg = "white")
ggsave("annotation_heatmap_methods_clusters_04-02-2026.pdf",
plot = p_heatmap_clusters, width = 14, height = 5)
cat("\n✓ Methods × Clusters heatmap saved\n")
✓ Methods × Clusters heatmap saved
Heatmap: Clusters ×
Samples (scATOMIC annotations)
# Create summary by cluster and sample for scATOMIC
cluster_sample_summary <- All_samples_Merged@meta.data %>%
filter(!is.na(scATOMIC_annotation)) %>%
group_by(seurat_clusters, sample_id, scATOMIC_annotation) %>%
summarise(n = n(), .groups = "drop") %>%
group_by(seurat_clusters, sample_id) %>%
slice_max(n, n = 1, with_ties = FALSE) %>%
ungroup()
# Ensure cluster and sample order
cluster_sample_summary$seurat_clusters <- factor(cluster_sample_summary$seurat_clusters,
levels = as.character(0:18))
cluster_sample_summary$sample_id <- factor(cluster_sample_summary$sample_id,
levels = sample_order)
# Plot
p_cluster_sample <- ggplot(cluster_sample_summary,
aes(x = sample_id, y = seurat_clusters, fill = scATOMIC_annotation)) +
geom_tile(color = "white", linewidth = 0.5) +
scale_fill_manual(values = my_colors) +
scale_y_discrete(drop = FALSE) + # Keep all cluster levels
scale_x_discrete(drop = FALSE) + # Keep all sample levels
labs(
x = "Sample ID",
y = "Seurat Clusters (0-18)",
fill = "scATOMIC Annotation",
title = "Cluster Distribution Across Samples (scATOMIC Annotation)"
) +
theme_minimal(base_size = 14) +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
axis.title = element_text(face = "bold"),
plot.title = element_text(hjust = 0.5, face = "bold", size = 16),
legend.position = "right"
)
print(p_cluster_sample)

# Save
ggsave("annotation_heatmap_clusters_samples_04-02-2026.png",
plot = p_cluster_sample, width = 16, height = 10, dpi = 300, bg = "white")
ggsave("annotation_heatmap_clusters_samples_04-02-2026.pdf",
plot = p_cluster_sample, width = 16, height = 10)
cat("\n✓ Clusters × Samples heatmap saved\n")
✓ Clusters × Samples heatmap saved
Alluvial Diagram -
Annotation Flow Across Methods
library(dplyr)
library(tidyr)
library(ggplot2)
library(ggalluvial)
# Extract columns safely using base R (avoids select() conflicts)
cols_needed <- c("seurat_clusters", "predicted.id", "predicted.celltype.l2",
"singler.immune", "scATOMIC_annotation")
alluvial_df_raw <- All_samples_Merged@meta.data[, cols_needed]
colnames(alluvial_df_raw) <- c("Cluster", "scPred", "Azimuth", "SingleR", "scATOMIC")
# Summarize and filter
alluvial_data <- alluvial_df_raw %>%
mutate(Cluster = factor(as.character(Cluster), levels = as.character(0:18))) %>%
filter(!is.na(scPred) & !is.na(Azimuth) & !is.na(SingleR) & !is.na(scATOMIC)) %>%
group_by(Cluster, scPred, Azimuth, SingleR, scATOMIC) %>%
summarise(Freq = n(), .groups = "drop") %>%
filter(Freq > 5)
# Check
print(paste("Rows in plot data:", nrow(alluvial_data)))
[1] "Rows in plot data: 151"
# Plot
p_alluvial <- ggplot(
alluvial_data,
aes(axis1 = scPred, axis2 = Azimuth, axis3 = SingleR, axis4 = scATOMIC, y = Freq)
) +
geom_alluvium(aes(fill = scATOMIC), alpha = 0.7, curve_type = "sigmoid") +
geom_stratum(width = 1/5, fill = "white", color = "grey50") +
geom_text(stat = "stratum", aes(label = after_stat(stratum)),
size = 3, fontface = "bold") +
scale_x_discrete(
limits = c("scPred", "Azimuth", "SingleR", "scATOMIC"),
expand = c(0.15, 0.05)
) +
labs(
title = "Annotation Flow Across Methods",
subtitle = "Flow shows how cell annotations change across different methods",
y = "Number of Cells",
fill = "scATOMIC Label"
) +
theme_minimal(base_size = 14) +
theme(
legend.position = "bottom",
axis.text.x = element_text(face = "bold", size = 12),
plot.title = element_text(hjust = 0.5, face = "bold", size = 16),
plot.subtitle = element_text(hjust = 0.5, size = 12),
legend.text = element_text(size = 8)
) +
guides(fill = guide_legend(nrow = 3))
print(p_alluvial)

# Save
ggsave("annotation_alluvial_flow_04-02-2026.png", plot = p_alluvial,
width = 12, height = 8, dpi = 300, bg = "white")
ggsave("annotation_alluvial_flow_04-02-2026.pdf", plot = p_alluvial,
width = 12, height = 8)
cat("\n✓ Alluvial diagram saved\n")
✓ Alluvial diagram saved
Method Agreement
Analysis
Calculate Pairwise
Agreement
library(dplyr)
library(tidyr)
# Create matrix: clusters × methods
# Step 1: Extract columns safely
cols_needed <- c("seurat_clusters", "predicted.id", "predicted.celltype.l2",
"singler.immune", "scATOMIC_annotation")
cluster_annotation_df <- All_samples_Merged@meta.data[, cols_needed]
# Step 2: Pivot and find dominant label per cluster per method
cluster_annotation_matrix <- cluster_annotation_df %>%
pivot_longer(cols = -seurat_clusters, names_to = "method", values_to = "label") %>%
filter(!is.na(label)) %>%
group_by(seurat_clusters, method, label) %>%
summarise(n = n(), .groups = "drop") %>%
group_by(seurat_clusters, method) %>%
slice_max(n, n = 1, with_ties = FALSE) %>%
ungroup() %>%
dplyr::select(seurat_clusters, method, label) %>%
pivot_wider(names_from = method, values_from = label)
# Convert to matrix
mat <- as.matrix(cluster_annotation_matrix[, -1])
rownames(mat) <- cluster_annotation_matrix$seurat_clusters
# Calculate method similarity (Agreement proportion)
method_similarity <- function(m1, m2) {
sum(m1 == m2, na.rm = TRUE) / sum(!is.na(m1) & !is.na(m2))
}
# Create similarity matrix
n_methods <- ncol(mat)
sim_matrix <- matrix(0, n_methods, n_methods)
colnames(sim_matrix) <- c("scPred", "Azimuth.l2", "SingleR", "scATOMIC")
rownames(sim_matrix) <- c("scPred", "Azimuth.l2", "SingleR", "scATOMIC")
for(i in 1:n_methods) {
for(j in 1:n_methods) {
sim_matrix[i, j] <- method_similarity(mat[, i], mat[, j])
}
}
# Display agreement matrix
cat("\n=== Pairwise Agreement Between Methods ===\n\n")
=== Pairwise Agreement Between Methods ===
print(round(sim_matrix, 3))
scPred Azimuth.l2 SingleR scATOMIC
scPred 1.000 0 0.105 0
Azimuth.l2 0.000 1 0.000 0
SingleR 0.105 0 1.000 0
scATOMIC 0.000 0 0.000 1
# Save as CSV
write.csv(sim_matrix, "method_agreement_matrix_04-02-2026.csv", row.names = TRUE)
cat("\n✓ Saved: method_agreement_matrix_04-02-2026.csv\n")
✓ Saved: method_agreement_matrix_04-02-2026.csv
Agreement
Heatmap
library(pheatmap)
# Plot agreement heatmap
pheatmap(
sim_matrix,
display_numbers = TRUE,
number_format = "%.2f",
cluster_rows = TRUE,
cluster_cols = TRUE,
color = colorRampPalette(c("white", "orange", "red"))(100),
main = "Pairwise Agreement Between Annotation Methods",
fontsize = 14,
fontsize_number = 12,
angle_col = 45,
border_color = "grey60"
)
# Save PNG
png("annotation_method_agreement_heatmap_04-02-2026.png",
width = 10, height = 10, units = "in", res = 300)

pheatmap(
sim_matrix,
display_numbers = TRUE,
number_format = "%.2f",
cluster_rows = TRUE,
cluster_cols = TRUE,
color = colorRampPalette(c("white", "orange", "red"))(100),
main = "Pairwise Agreement Between Annotation Methods",
fontsize = 14,
fontsize_number = 12,
angle_col = 45,
border_color = "grey60"
)
dev.off()
png
3
# Save PDF
pdf("annotation_method_agreement_heatmap_04-02-2026.pdf",
width = 10, height = 10)

pheatmap(
sim_matrix,
display_numbers = TRUE,
number_format = "%.2f",
cluster_rows = TRUE,
cluster_cols = TRUE,
color = colorRampPalette(c("white", "orange", "red"))(100),
main = "Pairwise Agreement Between Annotation Methods",
fontsize = 14,
fontsize_number = 12,
angle_col = 45,
border_color = "grey60"
)
dev.off()
png
3
cat("\n✓ Agreement heatmap saved\n")
✓ Agreement heatmap saved
Hierarchical
Clustering Dendrogram
# Convert similarity to distance
dist_matrix <- as.dist(1 - sim_matrix)
Found more than one class "dist" in cache; using the first, from namespace 'spam'
Also defined by 'BiocGenerics'
# Hierarchical clustering
hc <- hclust(dist_matrix, method = "ward.D2")
# Plot dendrogram
par(cex = 1.2)
plot(hc,
main = "Hierarchical Clustering of Annotation Methods",
xlab = "Annotation Method",
ylab = "Distance (1 - Agreement)",
hang = -1,
cex.main = 1.5,
cex.lab = 1.3,
cex.axis = 1.2)
# Save PNG
png("annotation_method_dendrogram_04-02-2026.png",
width = 12, height = 8, units = "in", res = 300)
par(cex = 1.2)
plot(hc,
main = "Hierarchical Clustering of Annotation Methods",
xlab = "Annotation Method",
ylab = "Distance (1 - Agreement)",
hang = -1,
cex.main = 1.5,
cex.lab = 1.3,
cex.axis = 1.2)
dev.off()
png
2
# Save PDF
pdf("annotation_method_dendrogram_04-02-2026.pdf",
width = 12, height = 8)
par(cex = 1.2)
plot(hc,
main = "Hierarchical Clustering of Annotation Methods",
xlab = "Annotation Method",
ylab = "Distance (1 - Agreement)",
hang = -1,
cex.main = 1.5,
cex.lab = 1.3,
cex.axis = 1.2)
dev.off()
png
2

cat("\n✓ Dendrogram saved\n")
✓ Dendrogram saved
Dominant Label Table
(Simplest)
# A clean table showing the dominant label from each method per cluster:
library(dplyr)
library(knitr)
library(kableExtra)
# Extract dominant label per cluster per method
cols_needed <- c("seurat_clusters", "predicted.id", "predicted.celltype.l2",
"singler.immune", "scATOMIC_annotation")
annotation_table <- All_samples_Merged@meta.data[, cols_needed] %>%
group_by(seurat_clusters) %>%
summarise(
scPred = names(sort(table(predicted.id), decreasing = TRUE))[1],
Azimuth = names(sort(table(predicted.celltype.l2), decreasing = TRUE))[1],
SingleR = names(sort(table(singler.immune), decreasing = TRUE))[1],
scATOMIC = names(sort(table(scATOMIC_annotation), decreasing = TRUE))[1],
n_cells = n(),
.groups = "drop"
) %>%
arrange(seurat_clusters)
# Add consensus column (TRUE if all 4 agree)
annotation_table <- annotation_table %>%
mutate(
Consensus = (scPred == Azimuth & Azimuth == SingleR & SingleR == scATOMIC)
)
# Display
kable(annotation_table, caption = "Dominant Cell Type Annotation per Cluster") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE, font_size = 10) %>%
column_spec(7, background = ifelse(annotation_table$Consensus, "#d4edda", "#f8d7da"))
Dominant Cell Type Annotation per Cluster
| seurat_clusters |
scPred |
Azimuth |
SingleR |
scATOMIC |
n_cells |
Consensus |
| 0 |
CD4 T cell |
CD4 TCM |
T cells, CD4+, Th1 |
Effector/Memory CD4+ T cells |
185 |
FALSE |
| 1 |
cMono |
CD14 Mono |
Monocytes, CD14+ |
CD14 Monocyte |
92 |
FALSE |
| 2 |
CD4 T cell |
CD4 TCM |
T cells, CD4+, memory TREG |
Naive CD4+ T cells |
3241 |
FALSE |
| 3 |
CD8 T cell |
CD8 TEM |
NK cells |
Effector/Memory CD8+ T cells |
1006 |
FALSE |
| 4 |
CD8 T cell |
CD4 TCM |
T cells, CD8+, naive |
Naive CD8+ T cells |
2278 |
FALSE |
| 5 |
NK cell |
NK |
NK cells |
Natural killer cell |
223 |
FALSE |
| 6 |
B cell |
B naive |
B cells, naive |
B Cell |
404 |
FALSE |
| 7 |
CD4 T cell |
CD4 TCM |
T cells, CD4+, Th2 |
Naive CD4+ T cells |
230 |
FALSE |
| 8 |
CD4 T cell |
CD4 TCM |
T cells, CD4+, Th17 |
Naive CD4+ T cells |
923 |
FALSE |
| 9 |
ncMono |
CD16 Mono |
Monocytes, CD16+ |
CD16 Monocyte |
84 |
FALSE |
| 10 |
cMono |
CD14 Mono |
Monocytes, CD14+ |
CD14 Monocyte |
829 |
FALSE |
| 11 |
CD4 T cell |
CD4 TCM |
T cells, CD4+, memory TREG |
Naive CD4+ T cells |
727 |
FALSE |
| 12 |
CD4 T cell |
CD4 TCM |
T cells, CD4+, memory TREG |
Effector/Memory CD4+ T cells |
124 |
FALSE |
| 13 |
cDC |
cDC2 |
Monocytes, CD14+ |
cDC2 |
150 |
FALSE |
| 14 |
CD4 T cell |
CD4 Proliferating |
T cells, CD4+, memory TREG |
Effector/Memory CD8+ T cells |
183 |
FALSE |
| 15 |
CD4 T cell |
HSPC |
T cells, CD4+, Th2 |
HSPC |
48 |
FALSE |
| 16 |
CD4 T cell |
CD4 TCM |
T cells, CD4+, memory TREG |
Effector/Memory CD4+ T cells |
4 |
FALSE |
| 17 |
Plasma cell |
Plasmablast |
Monocytes, CD14+ |
CD14 Monocyte |
44 |
FALSE |
| 18 |
CD8 T cell |
CD8 TEM |
NK cells |
Effector/Memory CD8+ T cells |
10 |
FALSE |
# Save as CSV
write.csv(annotation_table, "annotation_comparison_clean_table_04-02-2026.csv",
row.names = FALSE)
cat("\n✓ Clean annotation table saved\n")
✓ Clean annotation table saved
Simplified Text-Based
Heatmap
library(ggplot2)
library(dplyr)
library(tidyr)
# Prepare data
cols_needed <- c("seurat_clusters", "predicted.id", "predicted.celltype.l2",
"singler.immune", "scATOMIC_annotation")
heatmap_data <- All_samples_Merged@meta.data[, cols_needed] %>%
group_by(seurat_clusters) %>%
summarise(
scPred = names(sort(table(predicted.id), decreasing = TRUE))[1],
Azimuth = names(sort(table(predicted.celltype.l2), decreasing = TRUE))[1],
SingleR = names(sort(table(singler.immune), decreasing = TRUE))[1],
scATOMIC = names(sort(table(scATOMIC_annotation), decreasing = TRUE))[1],
.groups = "drop"
) %>%
pivot_longer(cols = c(scPred, Azimuth, SingleR, scATOMIC),
names_to = "Method", values_to = "CellType") %>%
mutate(
Method = factor(Method, levels = c("scPred", "Azimuth", "SingleR", "scATOMIC")),
seurat_clusters = factor(seurat_clusters, levels = as.character(0:18))
)
# Create text-based heatmap
p_text_heatmap <- ggplot(heatmap_data, aes(x = seurat_clusters, y = Method, label = CellType)) +
geom_tile(color = "grey30", fill = "white", linewidth = 0.5) +
geom_text(size = 2.5, hjust = 0.5, vjust = 0.5) +
labs(
title = "Cell Type Annotation Comparison Across Methods",
x = "Seurat Cluster",
y = "Annotation Method"
) +
theme_minimal(base_size = 14) +
theme(
axis.text.x = element_text(face = "bold", size = 12),
axis.text.y = element_text(face = "bold", size = 12),
plot.title = element_text(hjust = 0.5, face = "bold", size = 16),
panel.grid = element_blank()
)
print(p_text_heatmap)

# Save
ggsave("annotation_text_heatmap_04-02-2026.png",
plot = p_text_heatmap, width = 28, height = 4, dpi = 300, bg = "white")
ggsave("annotation_text_heatmap_04-02-2026.pdf",
plot = p_text_heatmap, width = 28, height = 4)
cat("\n✓ Text-based heatmap saved\n")
✓ Text-based heatmap saved
Dot Plot with
Consensus Highlighting
library(ggplot2)
# Add consensus column
heatmap_data_consensus <- heatmap_data %>%
group_by(seurat_clusters) %>%
mutate(
all_same = n_distinct(CellType) == 1,
CellType_short = ifelse(nchar(CellType) > 15,
paste0(substr(CellType, 1, 12), "..."),
CellType)
) %>%
ungroup()
# Create dot plot
p_dotplot <- ggplot(heatmap_data_consensus,
aes(x = seurat_clusters, y = Method, fill = all_same)) +
geom_tile(color = "grey40", linewidth = 0.3) +
geom_text(aes(label = CellType_short), size = 2.2, fontface = "bold") +
scale_fill_manual(
values = c("TRUE" = "#d4edda", "FALSE" = "white"),
labels = c("TRUE" = "Consensus", "FALSE" = "No consensus"),
name = ""
) +
labs(
title = "Cell Type Annotation Comparison with Consensus Highlighting",
x = "Seurat Cluster (0-18)",
y = "Annotation Method"
) +
theme_minimal(base_size = 13) +
theme(
axis.text.x = element_text(face = "bold", size = 11),
axis.text.y = element_text(face = "bold", size = 12),
plot.title = element_text(hjust = 0.5, face = "bold", size = 15),
panel.grid = element_blank(),
legend.position = "top"
)
print(p_dotplot)

# Save
ggsave("annotation_dotplot_consensus_04-02-2026.png",
plot = p_dotplot, width = 16, height = 6, dpi = 300, bg = "white")
ggsave("annotation_dotplot_consensus_04-02-2026.pdf",
plot = p_dotplot, width = 16, height = 6)
cat("\n✓ Dot plot with consensus highlighting saved\n")
✓ Dot plot with consensus highlighting saved
Per-Cluster
Annotation Distribution
library(ggplot2)
library(dplyr)
library(tidyr)
library(patchwork)
# Function to plot annotation distribution for one cluster
plot_cluster_agreement <- function(cluster_id) {
cols_needed <- c("seurat_clusters", "predicted.id", "predicted.celltype.l2",
"singler.immune", "scATOMIC_annotation")
# Extract and filter using base R
temp_df <- All_samples_Merged@meta.data[, cols_needed]
temp_df <- temp_df[temp_df$seurat_clusters == cluster_id, ]
# Rename columns
colnames(temp_df) <- c("Cluster", "scPred", "Azimuth", "SingleR", "scATOMIC")
# Now use dplyr safely with explicit namespace
df <- temp_df %>%
dplyr::select(-Cluster) %>%
pivot_longer(everything(), names_to = "method", values_to = "label") %>%
filter(!is.na(label)) %>%
dplyr::count(method, label) %>%
group_by(method) %>%
mutate(prop = n / sum(n)) %>%
ungroup()
ggplot(df, aes(x = method, y = prop, fill = label)) +
geom_bar(stat = "identity") +
labs(title = paste("Cluster", cluster_id),
y = "Proportion", x = "") +
theme_minimal(base_size = 10) +
theme(
axis.text.x = element_text(angle = 45, hjust = 1, size = 8),
legend.position = "none",
plot.title = element_text(hjust = 0.5, face = "bold")
)
}
# Create plots for clusters 0-18
clusters <- as.character(0:18)
cluster_plots <- lapply(clusters, plot_cluster_agreement)
# Combine plots
p_combined <- wrap_plots(cluster_plots, ncol = 5) +
plot_annotation(
title = "Annotation Distribution Across Methods per Cluster (0-18)",
theme = theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 16))
)
print(p_combined)

# Save
ggsave("annotation_per_cluster_comparison_04-02-2026.png",
plot = p_combined, width = 16, height = 12, dpi = 300, bg = "white")
ggsave("annotation_per_cluster_comparison_04-02-2026.pdf",
plot = p_combined, width = 16, height = 12)
cat("\n✓ Per-cluster comparison saved\n")
✓ Per-cluster comparison saved
Improved Version for
Manuscript Comparison
library(ggplot2)
library(dplyr)
library(tidyr)
# Prepare data
cols_needed <- c("seurat_clusters", "predicted.id", "predicted.celltype.l2",
"singler.immune", "scATOMIC_annotation")
heatmap_data <- All_samples_Merged@meta.data[, cols_needed] %>%
group_by(seurat_clusters) %>%
summarise(
scPred = names(sort(table(predicted.id), decreasing = TRUE))[1],
Azimuth = names(sort(table(predicted.celltype.l2), decreasing = TRUE))[1],
SingleR = names(sort(table(singler.immune), decreasing = TRUE))[1],
scATOMIC = names(sort(table(scATOMIC_annotation), decreasing = TRUE))[1],
.groups = "drop"
) %>%
pivot_longer(cols = c(scPred, Azimuth, SingleR, scATOMIC),
names_to = "Method", values_to = "CellType") %>%
mutate(
Method = factor(Method, levels = c("scATOMIC", "SingleR", "Azimuth", "scPred")),
seurat_clusters = factor(seurat_clusters, levels = as.character(0:18)),
# Shorten long labels for readability
CellType_display = ifelse(nchar(CellType) > 20,
paste0(substr(CellType, 1, 17), "..."),
CellType)
)
# Create publication-quality heatmap
p_manuscript <- ggplot(heatmap_data,
aes(x = seurat_clusters, y = Method, label = CellType_display)) +
geom_tile(color = "grey20", fill = "white", linewidth = 0.8) +
geom_text(size = 3, hjust = 0.5, vjust = 0.5, fontface = "bold") +
labs(
title = "Discordance in Automated Cell Type Annotation - Herrera Dataset",
x = "Seurat Cluster",
y = "Annotation Method"
) +
theme_minimal(base_size = 16) +
theme(
axis.text.x = element_text(face = "bold", size = 14),
axis.text.y = element_text(face = "bold", size = 14),
axis.title = element_text(size = 16, face = "bold"),
plot.title = element_text(hjust = 0.5, face = "bold", size = 18),
panel.grid = element_blank(),
plot.margin = margin(10, 10, 10, 10)
)
print(p_manuscript)

# Save high-quality versions for manuscript
ggsave("FigureX_Herrera_Annotation_Comparison.png",
plot = p_manuscript, width = 28, height = 5, dpi = 600, bg = "white")
ggsave("FigureX_Herrera_Annotation_Comparison.pdf",
plot = p_manuscript, width = 28, height = 5)
ggsave("FigureX_Herrera_Annotation_Comparison.tiff",
plot = p_manuscript, width = 28, height = 5, dpi = 600, bg = "white")
cat("\n✓ Herrera manuscript figure saved (PNG, PDF, TIFF at 600 DPI)\n")
✓ Herrera manuscript figure saved (PNG, PDF, TIFF at 600 DPI)
#
# **Figure X. Automated annotation methods show poor concordance across two independent Sézary syndrome datasets.**
# **(A)** Cell line-derived Sézary samples. **(B)** Patient-derived Sézary samples (Herrera dataset). Each cell displays the dominant annotation assigned by four reference-based methods (scATOMIC, SingleR, Azimuth, and scPred) to cells within each cluster. Despite using identical methodology, methods assign divergent labels in both datasets, demonstrating that malignant T-cell populations from Sézary syndrome consistently fail to match healthy reference atlases. This cross-dataset validation highlights the necessity for manual curation or disease-specific annotation strategies.
sessionInfo()
sessionInfo()
R version 4.5.2 (2025-10-31)
Platform: x86_64-pc-linux-gnu
Running under: Ubuntu 24.04.3 LTS
Matrix products: default
BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.12.0
LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.12.0 LAPACK version 3.12.0
locale:
[1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C LC_TIME=fr_FR.UTF-8
[4] LC_COLLATE=en_GB.UTF-8 LC_MONETARY=fr_FR.UTF-8 LC_MESSAGES=en_GB.UTF-8
[7] LC_PAPER=fr_FR.UTF-8 LC_NAME=C LC_ADDRESS=C
[10] LC_TELEPHONE=C LC_MEASUREMENT=fr_FR.UTF-8 LC_IDENTIFICATION=C
time zone: Europe/Paris
tzcode source: system (glibc)
attached base packages:
[1] stats4 stats graphics grDevices utils datasets methods base
other attached packages:
[1] kableExtra_1.4.0 knitr_1.51 ggalluvial_0.12.5
[4] RColorBrewer_1.1-3 purrr_1.2.1 tidyr_1.3.2
[7] Azimuth_0.5.0 shinyBS_0.63.0 pbmcsca.SeuratData_3.0.0
[10] pbmcref.SeuratData_1.0.0 SeuratData_0.2.2.9002 SeuratDisk_0.0.0.9021
[13] presto_1.0.0 data.table_1.18.2.1 Rcpp_1.1.1
[16] remotes_2.5.0 SingleR_2.12.0 celldex_1.20.0
[19] SummarizedExperiment_1.40.0 Biobase_2.70.0 GenomicRanges_1.62.1
[22] Seqinfo_1.0.0 IRanges_2.44.0 S4Vectors_0.48.0
[25] BiocGenerics_0.56.0 generics_0.1.4 MatrixGenerics_1.22.0
[28] matrixStats_1.5.0 scPred_1.9.2 pheatmap_1.0.13
[31] ggplot2_4.0.1 patchwork_1.3.2 dplyr_1.1.4
[34] Seurat_5.4.0 SeuratObject_5.3.0 sp_2.2-0
loaded via a namespace (and not attached):
[1] dichromat_2.0-0.1 nnet_7.3-20
[3] goftest_1.2-3 DT_0.34.0
[5] Biostrings_2.78.0 HDF5Array_1.38.0
[7] vctrs_0.7.1 spatstat.random_3.4-4
[9] digest_0.6.39 png_0.1-8
[11] gypsum_1.6.0 ggrepel_0.9.6
[13] deldir_2.0-4 parallelly_1.46.1
[15] MASS_7.3-65 Signac_1.16.0
[17] reshape2_1.4.5 httpuv_1.6.16
[19] foreach_1.5.2 withr_3.0.2
[21] xfun_0.56 survival_3.8-3
[23] EnsDb.Hsapiens.v86_2.99.0 memoise_2.0.1
[25] ggbeeswarm_0.7.3 systemfonts_1.3.1
[27] ragg_1.5.0 zoo_1.8-15
[29] gtools_3.9.5 pbapply_1.7-4
[31] KEGGREST_1.50.0 promises_1.5.0
[33] otel_0.2.0 httr_1.4.7
[35] restfulr_0.0.16 globals_0.18.0
[37] fitdistrplus_1.2-6 rhdf5filters_1.22.0
[39] rhdf5_2.54.1 rstudioapi_0.18.0
[41] UCSC.utils_1.6.1 miniUI_0.1.2
[43] curl_7.0.0 h5mread_1.2.1
[45] polyclip_1.10-7 ExperimentHub_3.0.0
[47] SparseArray_1.10.8 xtable_1.8-4
[49] stringr_1.6.0 evaluate_1.0.5
[51] S4Arrays_1.10.1 BiocFileCache_3.0.0
[53] irlba_2.3.5.1 filelock_1.0.3
[55] hdf5r_1.3.12 ROCR_1.0-12
[57] harmony_1.2.4 reticulate_1.44.1
[59] spatstat.data_3.1-9 magrittr_2.0.4
[61] lmtest_0.9-40 later_1.4.5
[63] lattice_0.22-7 spatstat.geom_3.7-0
[65] future.apply_1.20.1 scattermore_1.2
[67] XML_3.99-0.20 cowplot_1.2.0
[69] RcppAnnoy_0.0.23 class_7.3-23
[71] pillar_1.11.1 nlme_3.1-168
[73] iterators_1.0.14 pwalign_1.6.0
[75] caTools_1.18.3 compiler_4.5.2
[77] beachmat_2.26.0 RSpectra_0.16-2
[79] stringi_1.8.7 gower_1.0.2
[81] tensor_1.5.1 lubridate_1.9.4
[83] GenomicAlignments_1.46.0 plyr_1.8.9
[85] crayon_1.5.3 abind_1.4-8
[87] BiocIO_1.20.0 googledrive_2.1.2
[89] bit_4.6.0 fastmatch_1.1-8
[91] textshaping_1.0.4 codetools_0.2-20
[93] recipes_1.3.1 bslib_0.10.0
[95] alabaster.ranges_1.10.0 plotly_4.12.0
[97] mime_0.13 splines_4.5.2
[99] fastDummies_1.7.5 dbplyr_2.5.1
[101] sparseMatrixStats_1.22.0 cellranger_1.1.0
[103] utf8_1.2.6 blob_1.3.0
[105] BiocVersion_3.22.0 seqLogo_1.76.0
[107] AnnotationFilter_1.34.0 fs_1.6.6
[109] listenv_0.10.0 DelayedMatrixStats_1.32.0
[111] tibble_3.3.1 Matrix_1.7-4
[113] svglite_2.2.2 pkgconfig_2.0.3
[115] tools_4.5.2 cachem_1.1.0
[117] cigarillo_1.0.0 RSQLite_2.4.5
[119] viridisLite_0.4.2 DBI_1.2.3
[121] rmarkdown_2.30 fastmap_1.2.0
[123] scales_1.4.0 grid_4.5.2
[125] ica_1.0-3 shinydashboard_0.7.3
[127] Rsamtools_2.26.0 sass_0.4.10
[129] AnnotationHub_4.0.0 BiocManager_1.30.27
[131] dotCall64_1.2 RANN_2.6.2
[133] alabaster.schemas_1.10.0 rpart_4.1.24
[135] farver_2.1.2 yaml_2.3.12
[137] rtracklayer_1.70.1 cli_3.6.5
[139] lifecycle_1.0.5 caret_7.0-1
[141] uwot_0.2.4 lava_1.8.2
[143] BSgenome.Hsapiens.UCSC.hg38_1.4.5 BiocParallel_1.44.0
[145] timechange_0.3.0 gtable_0.3.6
[147] rjson_0.2.23 ggridges_0.5.7
[149] progressr_0.18.0 parallel_4.5.2
[151] pROC_1.19.0.1 jsonlite_2.0.0
[153] RcppHNSW_0.6.0 TFBSTools_1.48.0
[155] bitops_1.0-9 bit64_4.6.0-1
[157] Rtsne_0.17 alabaster.matrix_1.10.0
[159] spatstat.utils_3.2-1 BiocNeighbors_2.4.0
[161] jquerylib_0.1.4 alabaster.se_1.10.0
[163] shinyjs_2.1.1 spatstat.univar_3.1-6
[165] timeDate_4052.112 lazyeval_0.2.2
[167] alabaster.base_1.10.0 shiny_1.12.1
[169] htmltools_0.5.9 sctransform_0.4.3
[171] rappdirs_0.3.4 ensembldb_2.34.0
[173] glue_1.8.0 TFMPvalue_1.0.0
[175] spam_2.11-3 googlesheets4_1.1.2
[177] httr2_1.2.2 XVector_0.50.0
[179] RCurl_1.98-1.17 BSgenome_1.78.0
[181] gridExtra_2.3 JASPAR2020_0.99.10
[183] igraph_2.2.1 R6_2.6.1
[185] labeling_0.4.3 RcppRoll_0.3.1
[187] GenomicFeatures_1.62.0 cluster_2.1.8.1
[189] gargle_1.6.0 Rhdf5lib_1.32.0
[191] GenomeInfoDb_1.46.2 ipred_0.9-15
[193] DirichletMultinomial_1.52.0 DelayedArray_0.36.0
[195] tidyselect_1.2.1 vipor_0.4.7
[197] ProtGenerics_1.42.0 xml2_1.5.2
[199] AnnotationDbi_1.72.0 future_1.69.0
[201] ModelMetrics_1.2.2.2 KernSmooth_2.23-26
[203] S7_0.2.1 htmlwidgets_1.6.4
[205] rlang_1.1.7 spatstat.sparse_3.1-0
[207] spatstat.explore_3.7-0 hardhat_1.4.2
[209] beeswarm_0.4.0 prodlim_2025.04.28
---
title: "Cross-Method Annotation Comparison - Herrera Sézary Syndrome Dataset"
author: Nasir Mahmood Abbasi
date: "`r Sys.Date()`"
output:
  html_notebook:
    number_sections: true
    toc: true
    toc_float:
      collapsed: true
    theme: journal
---



# load libraries
```{r setup, include=FALSE}
knitr::opts_chunk$set(
  echo = TRUE,
  warning = FALSE,
  message = FALSE,
  fig.width = 14,
  fig.height = 8,
  dpi = 300
)
# load libraries
    library(Seurat)
    library(dplyr)
    library(patchwork)
    library(ggplot2)
    library(pheatmap)
    library(scPred)
    library(celldex)
    library(SingleR)
    library(remotes)
    library(presto)
    library(SeuratDisk)
    library(SeuratData)
    library(Azimuth)

```




# Load RDS with all annotations
```{r}
# Load main object with all annotations
All_samples_Merged <- readRDS("/home/bioinfo/1-Thesis_Final_Year_2025/2025-Year3_Analysis/1-scRNA_RESULTS-19-11-2025/17-Herrera_DATA_CELL_Annotation/All_samples_Herrera_with_scATOMIC_04-02-2026.rds")


Idents(All_samples_Merged) <- "seurat_clusters"

# Ensure clusters are ordered 0-18
All_samples_Merged$seurat_clusters <- factor(All_samples_Merged$seurat_clusters, 
                                              levels = as.character(0:18))

# Order samples logically
sample_order <- c("Healthy_Blood", "Healthy_Skin", 
                  "SS1_Blood", "SS1_Skin", 
                  "SS2_Blood", "SS2_Skin", 
                  "SS3_Blood", "SS3_Skin", 
                  "SS4_Blood", "SS4_Skin", 
                  "SS5_Blood", "SS6_Blood")

All_samples_Merged$sample_id <- factor(All_samples_Merged$sample_id, 
                                       levels = sample_order)

cat("Total cells:", ncol(All_samples_Merged), "\n")
cat("Total clusters:", length(unique(All_samples_Merged$seurat_clusters)), "\n\n")
cat("Clusters (ordered 0-18):\n")
print(table(All_samples_Merged$seurat_clusters))

cat("\n\nSamples (ordered):\n")
print(table(All_samples_Merged$sample_id))

```

# Create Annotation Summary Table by Clusters
```{r }
library(dplyr)
library(tidyr)
library(purrr)

# Define annotation methods
methods <- c(
  "predicted.id",             # scPred
  "predicted.celltype.l2",    # Azimuth (l2 prediction)
  "singler.immune",           # SingleR
  "scATOMIC_annotation"       # scATOMIC
)

# Create summary - most common label per cluster for each method
annotation_summary <- map_dfr(methods, function(m) {
  df <- All_samples_Merged@meta.data
  df %>%
    filter(!is.na(.data[[m]])) %>%
    group_by(seurat_clusters, label = .data[[m]]) %>%
    summarise(n = n(), .groups = "drop") %>%
    group_by(seurat_clusters) %>%
    slice_max(n, n = 1, with_ties = FALSE) %>%
    mutate(method = m)
})

# Rename methods for display
annotation_summary <- annotation_summary %>%
  mutate(method = recode(method,
                         "predicted.id" = "scPred",
                         "predicted.celltype.l2" = "Azimuth.l2",
                         "singler.immune" = "SingleR(Immune)",
                         "scATOMIC_annotation" = "scATOMIC"))

# Set method order
annotation_summary$method <- factor(annotation_summary$method, 
                                    levels = c("scPred", "Azimuth.l2", 
                                              "SingleR(Immune)", "scATOMIC"))

# Ensure cluster order 0-18
annotation_summary$seurat_clusters <- factor(annotation_summary$seurat_clusters, 
                                             levels = as.character(0:18))

# Save annotation summary by clusters (long format)
write.csv(annotation_summary, 
          "annotation_summary_by_clusters_04-02-2026.csv", 
          row.names = FALSE)

# Create and save wide format
annotation_table_clusters_wide <- annotation_summary %>%
  select(seurat_clusters, method, label) %>%
  pivot_wider(names_from = method, values_from = label)

write.csv(annotation_table_clusters_wide, 
          "annotation_table_clusters_wide_04-02-2026.csv", 
          row.names = FALSE)

# Display
cat("\n=== Annotation Summary by Clusters ===\n\n")
print(annotation_summary)
cat("\n✓ Saved: annotation_summary_by_clusters_04-02-2026.csv (long format)\n")
cat("✓ Saved: annotation_table_clusters_wide_04-02-2026.csv (wide format)\n")

```

## Summary Statistics
```{r }
# Cell counts per method
cat("\n=== Cells Annotated Per Method ===\n\n")
method_cols <- c("predicted.id", "predicted.celltype.l2", "singler.immune", "scATOMIC_annotation")
for(m in method_cols){
  n_annotated <- sum(!is.na(All_samples_Merged@meta.data[[m]]))
  cat(sprintf("%-25s: %5d cells (%.1f%%)\n", 
              m, n_annotated, n_annotated/ncol(All_samples_Merged)*100))
}

# Number of unique labels per method
cat("\n=== Unique Labels Per Method ===\n\n")
annotation_summary %>%
  group_by(method) %>%
  summarise(n_unique_labels = n_distinct(label)) %>%
  print()

```
# Basic Heatmap Visualization
```{r, fig.height= 5, fig.width= 18}
library(ggplot2)

ggplot(annotation_summary, aes(x = seurat_clusters, y = method, fill = label)) +
  geom_tile(color = "white", linewidth = 0.5) +
  scale_fill_discrete() +
  labs(
    x = "Seurat Clusters",
    y = "Annotation Method",
    fill = "Assigned Cell Type",
    title = "Cross-Method Comparison of Cell Type Annotations"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title = element_text(face = "bold"),
    plot.title = element_text(hjust = 0.5, face = "bold", size = 16)
  )


```

## Enhanced Heatmap with Custom Colors
```{r, fig.height= 5, fig.width= 18 }
library(RColorBrewer)

# Generate color palette
num_colors <- length(unique(annotation_summary$label))
my_colors <- colorRampPalette(brewer.pal(8, "Set2"))(num_colors)

p_heatmap <- ggplot(annotation_summary, aes(x = seurat_clusters, y = method, fill = label)) +
  geom_tile(color = "white", linewidth = 0.5) +
  scale_fill_manual(values = my_colors) +
  labs(
    x = "Seurat Clusters",
    y = "Annotation Method",
    fill = "Assigned Cell Type",
    title = "Cross-Method Comparison of Cell Type Annotations"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title = element_text(face = "bold"),
    plot.title = element_text(hjust = 0.5, face = "bold", size = 16),
    legend.position = "right"
  )

print(p_heatmap)

# Save high-quality versions
ggsave("annotation_heatmap_04-02-2026.png", plot = p_heatmap, 
       width = 14, height = 5, dpi = 300, bg = "white")
ggsave("annotation_heatmap_04-02-2026.pdf", plot = p_heatmap, 
       width = 14, height = 5)

cat("\n✓ Heatmap saved as PNG and PDF\n")

```

#  Heatmap: Methods × Clusters
```{r, fig.height= 5, fig.width= 18 }
library(RColorBrewer)

# Generate color palette
num_colors <- length(unique(annotation_summary$label))
my_colors <- colorRampPalette(brewer.pal(8, "Set2"))(num_colors)

p_heatmap_clusters <- ggplot(annotation_summary, 
                              aes(x = seurat_clusters, y = method, fill = label)) +
  geom_tile(color = "white", linewidth = 0.5) +
  scale_fill_manual(values = my_colors) +
  scale_x_discrete(drop = FALSE) +  # Keep all cluster levels even if missing
  labs(
    x = "Seurat Clusters (0-18)",
    y = "Annotation Method",
    fill = "Assigned Cell Type",
    title = "Cross-Method Comparison of Cell Type Annotations"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title = element_text(face = "bold"),
    plot.title = element_text(hjust = 0.5, face = "bold", size = 16),
    legend.position = "right"
  )

print(p_heatmap_clusters)

# Save
ggsave("annotation_heatmap_methods_clusters_04-02-2026.png", 
       plot = p_heatmap_clusters, width = 14, height = 5, dpi = 300, bg = "white")
ggsave("annotation_heatmap_methods_clusters_04-02-2026.pdf", 
       plot = p_heatmap_clusters, width = 14, height = 5)

cat("\n✓ Methods × Clusters heatmap saved\n")


```



#  Heatmap: Clusters × Samples (scATOMIC annotations)
```{r, fig.height= 6, fig.width= 12 }
# Create summary by cluster and sample for scATOMIC
cluster_sample_summary <- All_samples_Merged@meta.data %>%
  filter(!is.na(scATOMIC_annotation)) %>%
  group_by(seurat_clusters, sample_id, scATOMIC_annotation) %>%
  summarise(n = n(), .groups = "drop") %>%
  group_by(seurat_clusters, sample_id) %>%
  slice_max(n, n = 1, with_ties = FALSE) %>%
  ungroup()

# Ensure cluster and sample order
cluster_sample_summary$seurat_clusters <- factor(cluster_sample_summary$seurat_clusters, 
                                                 levels = as.character(0:18))
cluster_sample_summary$sample_id <- factor(cluster_sample_summary$sample_id, 
                                           levels = sample_order)

# Plot
p_cluster_sample <- ggplot(cluster_sample_summary, 
                           aes(x = sample_id, y = seurat_clusters, fill = scATOMIC_annotation)) +
  geom_tile(color = "white", linewidth = 0.5) +
  scale_fill_manual(values = my_colors) +
  scale_y_discrete(drop = FALSE) +  # Keep all cluster levels
  scale_x_discrete(drop = FALSE) +  # Keep all sample levels
  labs(
    x = "Sample ID",
    y = "Seurat Clusters (0-18)",
    fill = "scATOMIC Annotation",
    title = "Cluster Distribution Across Samples (scATOMIC Annotation)"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title = element_text(face = "bold"),
    plot.title = element_text(hjust = 0.5, face = "bold", size = 16),
    legend.position = "right"
  )

print(p_cluster_sample)

# Save
ggsave("annotation_heatmap_clusters_samples_04-02-2026.png", 
       plot = p_cluster_sample, width = 16, height = 10, dpi = 300, bg = "white")
ggsave("annotation_heatmap_clusters_samples_04-02-2026.pdf", 
       plot = p_cluster_sample, width = 16, height = 10)

cat("\n✓ Clusters × Samples heatmap saved\n")

```

#  Alluvial Diagram - Annotation Flow Across Methods
```{r, fig.height= 14, fig.width= 26 }

library(dplyr)
library(tidyr)
library(ggplot2)
library(ggalluvial)

# Extract columns safely using base R (avoids select() conflicts)
cols_needed <- c("seurat_clusters", "predicted.id", "predicted.celltype.l2", 
                 "singler.immune", "scATOMIC_annotation")

alluvial_df_raw <- All_samples_Merged@meta.data[, cols_needed]
colnames(alluvial_df_raw) <- c("Cluster", "scPred", "Azimuth", "SingleR", "scATOMIC")

# Summarize and filter
alluvial_data <- alluvial_df_raw %>%
  mutate(Cluster = factor(as.character(Cluster), levels = as.character(0:18))) %>%
  filter(!is.na(scPred) & !is.na(Azimuth) & !is.na(SingleR) & !is.na(scATOMIC)) %>%
  group_by(Cluster, scPred, Azimuth, SingleR, scATOMIC) %>%
  summarise(Freq = n(), .groups = "drop") %>%
  filter(Freq > 5)

# Check
print(paste("Rows in plot data:", nrow(alluvial_data)))

# Plot
p_alluvial <- ggplot(
  alluvial_data,
  aes(axis1 = scPred, axis2 = Azimuth, axis3 = SingleR, axis4 = scATOMIC, y = Freq)
) +
  geom_alluvium(aes(fill = scATOMIC), alpha = 0.7, curve_type = "sigmoid") +
  geom_stratum(width = 1/5, fill = "white", color = "grey50") +
  geom_text(stat = "stratum", aes(label = after_stat(stratum)), 
            size = 3, fontface = "bold") +
  scale_x_discrete(
    limits = c("scPred", "Azimuth", "SingleR", "scATOMIC"),
    expand = c(0.15, 0.05)
  ) +
  labs(
    title = "Annotation Flow Across Methods",
    subtitle = "Flow shows how cell annotations change across different methods",
    y = "Number of Cells",
    fill = "scATOMIC Label"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    legend.position = "bottom",
    axis.text.x = element_text(face = "bold", size = 12),
    plot.title = element_text(hjust = 0.5, face = "bold", size = 16),
    plot.subtitle = element_text(hjust = 0.5, size = 12),
    legend.text = element_text(size = 8)
  ) +
  guides(fill = guide_legend(nrow = 3))

print(p_alluvial)

# Save
ggsave("annotation_alluvial_flow_04-02-2026.png", plot = p_alluvial,
       width = 12, height = 8, dpi = 300, bg = "white")
ggsave("annotation_alluvial_flow_04-02-2026.pdf", plot = p_alluvial,
       width = 12, height = 8)

cat("\n✓ Alluvial diagram saved\n")


```




# Method Agreement Analysis

##  Calculate Pairwise Agreement
```{r, fig.height= 5, fig.width= 18 }
library(dplyr)
library(tidyr)

# Create matrix: clusters × methods
# Step 1: Extract columns safely
cols_needed <- c("seurat_clusters", "predicted.id", "predicted.celltype.l2", 
                 "singler.immune", "scATOMIC_annotation")

cluster_annotation_df <- All_samples_Merged@meta.data[, cols_needed]

# Step 2: Pivot and find dominant label per cluster per method
cluster_annotation_matrix <- cluster_annotation_df %>%
  pivot_longer(cols = -seurat_clusters, names_to = "method", values_to = "label") %>%
  filter(!is.na(label)) %>%
  group_by(seurat_clusters, method, label) %>%
  summarise(n = n(), .groups = "drop") %>%
  group_by(seurat_clusters, method) %>%
  slice_max(n, n = 1, with_ties = FALSE) %>%
  ungroup() %>%
  dplyr::select(seurat_clusters, method, label) %>%
  pivot_wider(names_from = method, values_from = label)

# Convert to matrix
mat <- as.matrix(cluster_annotation_matrix[, -1])
rownames(mat) <- cluster_annotation_matrix$seurat_clusters

# Calculate method similarity (Agreement proportion)
method_similarity <- function(m1, m2) {
  sum(m1 == m2, na.rm = TRUE) / sum(!is.na(m1) & !is.na(m2))
}

# Create similarity matrix
n_methods <- ncol(mat)
sim_matrix <- matrix(0, n_methods, n_methods)
colnames(sim_matrix) <- c("scPred", "Azimuth.l2", "SingleR", "scATOMIC")
rownames(sim_matrix) <- c("scPred", "Azimuth.l2", "SingleR", "scATOMIC")

for(i in 1:n_methods) {
  for(j in 1:n_methods) {
    sim_matrix[i, j] <- method_similarity(mat[, i], mat[, j])
  }
}

# Display agreement matrix
cat("\n=== Pairwise Agreement Between Methods ===\n\n")
print(round(sim_matrix, 3))

# Save as CSV
write.csv(sim_matrix, "method_agreement_matrix_04-02-2026.csv", row.names = TRUE)

cat("\n✓ Saved: method_agreement_matrix_04-02-2026.csv\n")
```

##  Agreement Heatmap
```{r, fig.height=6, fig.width=8}
library(pheatmap)

# Plot agreement heatmap
pheatmap(
  sim_matrix,
  display_numbers = TRUE,
  number_format = "%.2f",
  cluster_rows = TRUE,
  cluster_cols = TRUE,
  color = colorRampPalette(c("white", "orange", "red"))(100),
  main = "Pairwise Agreement Between Annotation Methods",
  fontsize = 14,
  fontsize_number = 12,
  angle_col = 45,
  border_color = "grey60"
)

# Save PNG
png("annotation_method_agreement_heatmap_04-02-2026.png", 
    width = 10, height = 10, units = "in", res = 300)
pheatmap(
  sim_matrix,
  display_numbers = TRUE,
  number_format = "%.2f",
  cluster_rows = TRUE,
  cluster_cols = TRUE,
  color = colorRampPalette(c("white", "orange", "red"))(100),
  main = "Pairwise Agreement Between Annotation Methods",
  fontsize = 14,
  fontsize_number = 12,
  angle_col = 45,
  border_color = "grey60"
)
dev.off()

# Save PDF
pdf("annotation_method_agreement_heatmap_04-02-2026.pdf", 
    width = 10, height = 10)
pheatmap(
  sim_matrix,
  display_numbers = TRUE,
  number_format = "%.2f",
  cluster_rows = TRUE,
  cluster_cols = TRUE,
  color = colorRampPalette(c("white", "orange", "red"))(100),
  main = "Pairwise Agreement Between Annotation Methods",
  fontsize = 14,
  fontsize_number = 12,
  angle_col = 45,
  border_color = "grey60"
)
dev.off()

cat("\n✓ Agreement heatmap saved\n")
```


## Hierarchical Clustering Dendrogram
```{r, fig.height=6, fig.width=8}
# Convert similarity to distance
dist_matrix <- as.dist(1 - sim_matrix)

# Hierarchical clustering
hc <- hclust(dist_matrix, method = "ward.D2")

# Plot dendrogram
par(cex = 1.2)
plot(hc, 
     main = "Hierarchical Clustering of Annotation Methods",
     xlab = "Annotation Method", 
     ylab = "Distance (1 - Agreement)",
     hang = -1,
     cex.main = 1.5,
     cex.lab = 1.3,
     cex.axis = 1.2)

# Save PNG
png("annotation_method_dendrogram_04-02-2026.png", 
    width = 12, height = 8, units = "in", res = 300)
par(cex = 1.2)
plot(hc, 
     main = "Hierarchical Clustering of Annotation Methods",
     xlab = "Annotation Method", 
     ylab = "Distance (1 - Agreement)",
     hang = -1,
     cex.main = 1.5,
     cex.lab = 1.3,
     cex.axis = 1.2)
dev.off()

# Save PDF
pdf("annotation_method_dendrogram_04-02-2026.pdf", 
    width = 12, height = 8)
par(cex = 1.2)
plot(hc, 
     main = "Hierarchical Clustering of Annotation Methods",
     xlab = "Annotation Method", 
     ylab = "Distance (1 - Agreement)",
     hang = -1,
     cex.main = 1.5,
     cex.lab = 1.3,
     cex.axis = 1.2)
dev.off()

cat("\n✓ Dendrogram saved\n")
```


#  Dominant Label Table (Simplest)
```{r, fig.height=8, fig.width=10}
# A clean table showing the dominant label from each method per cluster:

library(dplyr)
library(knitr)
library(kableExtra)

# Extract dominant label per cluster per method
cols_needed <- c("seurat_clusters", "predicted.id", "predicted.celltype.l2", 
                 "singler.immune", "scATOMIC_annotation")

annotation_table <- All_samples_Merged@meta.data[, cols_needed] %>%
  group_by(seurat_clusters) %>%
  summarise(
    scPred = names(sort(table(predicted.id), decreasing = TRUE))[1],
    Azimuth = names(sort(table(predicted.celltype.l2), decreasing = TRUE))[1],
    SingleR = names(sort(table(singler.immune), decreasing = TRUE))[1],
    scATOMIC = names(sort(table(scATOMIC_annotation), decreasing = TRUE))[1],
    n_cells = n(),
    .groups = "drop"
  ) %>%
  arrange(seurat_clusters)

# Add consensus column (TRUE if all 4 agree)
annotation_table <- annotation_table %>%
  mutate(
    Consensus = (scPred == Azimuth & Azimuth == SingleR & SingleR == scATOMIC)
  )

# Display
kable(annotation_table, caption = "Dominant Cell Type Annotation per Cluster") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), 
                full_width = FALSE, font_size = 10) %>%
  column_spec(7, background = ifelse(annotation_table$Consensus, "#d4edda", "#f8d7da"))

# Save as CSV
write.csv(annotation_table, "annotation_comparison_clean_table_04-02-2026.csv", 
          row.names = FALSE)

cat("\n✓ Clean annotation table saved\n")


```


##  Simplified Text-Based Heatmap
```{r, fig.height=4, fig.width=28}
library(ggplot2)
library(dplyr)
library(tidyr)

# Prepare data
cols_needed <- c("seurat_clusters", "predicted.id", "predicted.celltype.l2", 
                 "singler.immune", "scATOMIC_annotation")

heatmap_data <- All_samples_Merged@meta.data[, cols_needed] %>%
  group_by(seurat_clusters) %>%
  summarise(
    scPred = names(sort(table(predicted.id), decreasing = TRUE))[1],
    Azimuth = names(sort(table(predicted.celltype.l2), decreasing = TRUE))[1],
    SingleR = names(sort(table(singler.immune), decreasing = TRUE))[1],
    scATOMIC = names(sort(table(scATOMIC_annotation), decreasing = TRUE))[1],
    .groups = "drop"
  ) %>%
  pivot_longer(cols = c(scPred, Azimuth, SingleR, scATOMIC),
               names_to = "Method", values_to = "CellType") %>%
  mutate(
    Method = factor(Method, levels = c("scPred", "Azimuth", "SingleR", "scATOMIC")),
    seurat_clusters = factor(seurat_clusters, levels = as.character(0:18))
  )

# Create text-based heatmap
p_text_heatmap <- ggplot(heatmap_data, aes(x = seurat_clusters, y = Method, label = CellType)) +
  geom_tile(color = "grey30", fill = "white", linewidth = 0.5) +
  geom_text(size = 2.5, hjust = 0.5, vjust = 0.5) +
  labs(
    title = "Cell Type Annotation Comparison Across Methods",
    x = "Seurat Cluster",
    y = "Annotation Method"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    axis.text.x = element_text(face = "bold", size = 12),
    axis.text.y = element_text(face = "bold", size = 12),
    plot.title = element_text(hjust = 0.5, face = "bold", size = 16),
    panel.grid = element_blank()
  )

print(p_text_heatmap)

# Save
ggsave("annotation_text_heatmap_04-02-2026.png", 
       plot = p_text_heatmap, width = 28, height = 4, dpi = 300, bg = "white")
ggsave("annotation_text_heatmap_04-02-2026.pdf", 
       plot = p_text_heatmap, width = 28, height = 4)

cat("\n✓ Text-based heatmap saved\n")
```

##  Dot Plot with Consensus Highlighting
```{r, fig.height=4, fig.width=28}
library(ggplot2)

# Add consensus column
heatmap_data_consensus <- heatmap_data %>%
  group_by(seurat_clusters) %>%
  mutate(
    all_same = n_distinct(CellType) == 1,
    CellType_short = ifelse(nchar(CellType) > 15, 
                            paste0(substr(CellType, 1, 12), "..."), 
                            CellType)
  ) %>%
  ungroup()

# Create dot plot
p_dotplot <- ggplot(heatmap_data_consensus, 
                    aes(x = seurat_clusters, y = Method, fill = all_same)) +
  geom_tile(color = "grey40", linewidth = 0.3) +
  geom_text(aes(label = CellType_short), size = 2.2, fontface = "bold") +
  scale_fill_manual(
    values = c("TRUE" = "#d4edda", "FALSE" = "white"),
    labels = c("TRUE" = "Consensus", "FALSE" = "No consensus"),
    name = ""
  ) +
  labs(
    title = "Cell Type Annotation Comparison with Consensus Highlighting",
    x = "Seurat Cluster (0-18)",
    y = "Annotation Method"
  ) +
  theme_minimal(base_size = 13) +
  theme(
    axis.text.x = element_text(face = "bold", size = 11),
    axis.text.y = element_text(face = "bold", size = 12),
    plot.title = element_text(hjust = 0.5, face = "bold", size = 15),
    panel.grid = element_blank(),
    legend.position = "top"
  )

print(p_dotplot)

# Save
ggsave("annotation_dotplot_consensus_04-02-2026.png", 
       plot = p_dotplot, width = 16, height = 6, dpi = 300, bg = "white")
ggsave("annotation_dotplot_consensus_04-02-2026.pdf", 
       plot = p_dotplot, width = 16, height = 6)

cat("\n✓ Dot plot with consensus highlighting saved\n")
```


##  Per-Cluster Annotation Distribution
```{r, fig.height=8, fig.width=10}
library(ggplot2)
library(dplyr)
library(tidyr)
library(patchwork)

# Function to plot annotation distribution for one cluster
plot_cluster_agreement <- function(cluster_id) {
  cols_needed <- c("seurat_clusters", "predicted.id", "predicted.celltype.l2", 
                   "singler.immune", "scATOMIC_annotation")
  
  # Extract and filter using base R
  temp_df <- All_samples_Merged@meta.data[, cols_needed]
  temp_df <- temp_df[temp_df$seurat_clusters == cluster_id, ]
  
  # Rename columns
  colnames(temp_df) <- c("Cluster", "scPred", "Azimuth", "SingleR", "scATOMIC")
  
  # Now use dplyr safely with explicit namespace
  df <- temp_df %>%
    dplyr::select(-Cluster) %>%
    pivot_longer(everything(), names_to = "method", values_to = "label") %>%
    filter(!is.na(label)) %>%
    dplyr::count(method, label) %>%
    group_by(method) %>%
    mutate(prop = n / sum(n)) %>%
    ungroup()
  
  ggplot(df, aes(x = method, y = prop, fill = label)) +
    geom_bar(stat = "identity") +
    labs(title = paste("Cluster", cluster_id),
         y = "Proportion", x = "") +
    theme_minimal(base_size = 10) +
    theme(
      axis.text.x = element_text(angle = 45, hjust = 1, size = 8),
      legend.position = "none",
      plot.title = element_text(hjust = 0.5, face = "bold")
    )
}

# Create plots for clusters 0-18
clusters <- as.character(0:18)
cluster_plots <- lapply(clusters, plot_cluster_agreement)

# Combine plots
p_combined <- wrap_plots(cluster_plots, ncol = 5) +
  plot_annotation(
    title = "Annotation Distribution Across Methods per Cluster (0-18)",
    theme = theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 16))
  )

print(p_combined)

# Save
ggsave("annotation_per_cluster_comparison_04-02-2026.png", 
       plot = p_combined, width = 16, height = 12, dpi = 300, bg = "white")
ggsave("annotation_per_cluster_comparison_04-02-2026.pdf", 
       plot = p_combined, width = 16, height = 12)

cat("\n✓ Per-cluster comparison saved\n")
```



# Improved Version for Manuscript Comparison
```{r, fig.height=4, fig.width=28}

library(ggplot2)
library(dplyr)
library(tidyr)

# Prepare data
cols_needed <- c("seurat_clusters", "predicted.id", "predicted.celltype.l2", 
                 "singler.immune", "scATOMIC_annotation")

heatmap_data <- All_samples_Merged@meta.data[, cols_needed] %>%
  group_by(seurat_clusters) %>%
  summarise(
    scPred = names(sort(table(predicted.id), decreasing = TRUE))[1],
    Azimuth = names(sort(table(predicted.celltype.l2), decreasing = TRUE))[1],
    SingleR = names(sort(table(singler.immune), decreasing = TRUE))[1],
    scATOMIC = names(sort(table(scATOMIC_annotation), decreasing = TRUE))[1],
    .groups = "drop"
  ) %>%
  pivot_longer(cols = c(scPred, Azimuth, SingleR, scATOMIC),
               names_to = "Method", values_to = "CellType") %>%
  mutate(
    Method = factor(Method, levels = c("scATOMIC", "SingleR", "Azimuth", "scPred")),
    seurat_clusters = factor(seurat_clusters, levels = as.character(0:18)),
    # Shorten long labels for readability
    CellType_display = ifelse(nchar(CellType) > 20, 
                              paste0(substr(CellType, 1, 17), "..."), 
                              CellType)
  )

# Create publication-quality heatmap
p_manuscript <- ggplot(heatmap_data, 
                       aes(x = seurat_clusters, y = Method, label = CellType_display)) +
  geom_tile(color = "grey20", fill = "white", linewidth = 0.8) +
  geom_text(size = 3, hjust = 0.5, vjust = 0.5, fontface = "bold") +
  labs(
    title = "Discordance in Automated Cell Type Annotation - Herrera Dataset",
    x = "Seurat Cluster",
    y = "Annotation Method"
  ) +
  theme_minimal(base_size = 16) +
  theme(
    axis.text.x = element_text(face = "bold", size = 14),
    axis.text.y = element_text(face = "bold", size = 14),
    axis.title = element_text(size = 16, face = "bold"),
    plot.title = element_text(hjust = 0.5, face = "bold", size = 18),
    panel.grid = element_blank(),
    plot.margin = margin(10, 10, 10, 10)
  )

print(p_manuscript)

# Save high-quality versions for manuscript
ggsave("FigureX_Herrera_Annotation_Comparison.png", 
       plot = p_manuscript, width = 28, height = 5, dpi = 600, bg = "white")
ggsave("FigureX_Herrera_Annotation_Comparison.pdf", 
       plot = p_manuscript, width = 28, height = 5)
ggsave("FigureX_Herrera_Annotation_Comparison.tiff", 
       plot = p_manuscript, width = 28, height = 5, dpi = 600, bg = "white")

cat("\n✓ Herrera manuscript figure saved (PNG, PDF, TIFF at 600 DPI)\n")

# 
# **Figure X. Automated annotation methods show poor concordance across two independent Sézary syndrome datasets.**  
# **(A)** Cell line-derived Sézary samples. **(B)** Patient-derived Sézary samples (Herrera dataset). Each cell displays the dominant annotation assigned by four reference-based methods (scATOMIC, SingleR, Azimuth, and scPred) to cells within each cluster. Despite using identical methodology, methods assign divergent labels in both datasets, demonstrating that malignant T-cell populations from Sézary syndrome consistently fail to match healthy reference atlases. This cross-dataset validation highlights the necessity for manual curation or disease-specific annotation strategies.


```




# sessionInfo()
```{r}

sessionInfo()


```


