Load libraries
Load Annotated Healthy
Object
reference_integrated <- readRDS("../CD4_reference_RPCA_SCT_integrated.rds")
cat("Healthy Reference Integrated CD4 cells:", ncol(reference_integrated), "\n")
Healthy Reference Integrated CD4 cells: 12034
print(table(reference_integrated$predicted.celltype.l2, reference_integrated$dataset))
CD4T_10x_S1 CD4T_10x_S2 CD4T_lab
CD4 CTL 1 0 11
CD4 Naive 1367 381 359
CD4 Proliferating 9 4 0
CD4 TCM 2010 2856 4654
CD4 TEM 29 51 73
Treg 87 137 5
print(table(reference_integrated$predicted.celltype.l3, reference_integrated$dataset))
CD4T_10x_S1 CD4T_10x_S2 CD4T_lab
CD4 CTL 1 0 11
CD4 Naive 1370 382 363
CD4 Proliferating 9 4 0
CD4 TCM_1 1518 1859 4181
CD4 TCM_2 113 148 248
CD4 TCM_3 370 849 226
CD4 TEM_1 12 26 8
CD4 TEM_2 19 11 13
CD4 TEM_3 2 7 49
Treg Memory 58 134 3
Treg Naive 31 9 0
DefaultAssay(reference_integrated) <- "RNA"
reference_integrated <- JoinLayers(reference_integrated)
reference_integrated
An object of class Seurat
57742 features across 12034 samples within 7 assays
Active assay: RNA (36601 features, 2902 variable features)
3 layers present: scale.data, data, counts
6 other assays present: ADT, prediction.score.celltype.l1, prediction.score.celltype.l2, prediction.score.celltype.l3, SCT, integrated
2 dimensional reductions calculated: pca, umap
Doublet detection and
removal
DefaultAssay(reference_integrated) <- "RNA"
# Convert to SingleCellExperiment
sce <- as.SingleCellExperiment(reference_integrated, assay = "RNA")
# Run scDblFinder with cluster-aware mode (recommended for droplet data)[3][4]
sce <- scDblFinder(sce)
table(sce$scDblFinder.class)
singlet doublet
11539 495
Doublet detection and
removal
library(dplyr)
# Explore results and add to seurat object
meta_scdblfinder <- sce@colData@listData %>% as.data.frame() %>%
dplyr::select(starts_with('scDblFinder')) # 'scDblFinder.class')
head(meta_scdblfinder)
Bring calls back into
Seurat and inspect cluster 9
# Bring doublet calls back to Seurat
reference_integrated$scDblFinder.class <- sce$scDblFinder.class[
match(colnames(reference_integrated), colnames(sce))
]
reference_integrated$scDblFinder.score <- sce$scDblFinder.score[
match(colnames(reference_integrated), colnames(sce))
]
# Inspect enrichment per cluster
table(reference_integrated$seurat_clusters,
reference_integrated$scDblFinder.class)
singlet doublet
0 5325 154
1 2907 175
2 1314 37
3 530 44
4 419 29
5 314 17
6 307 9
7 275 10
8 93 2
9 55 18
DimPlot(reference_integrated, group.by = "scDblFinder.class") +
ggtitle("scDblFinder doublet calls on integrated UMAP")

# Doublet stats
# Check how doublets singlets differ in QC measures per sample.
VlnPlot(reference_integrated, group.by = 'dataset', split.by = "scDblFinder.class",
features = c("nFeature_RNA", "nCount_RNA", "percent.mt"),
ncol = 4, pt.size = 0) + theme(legend.position = 'right')

doublets_summary <- reference_integrated@meta.data %>%
group_by(dataset, scDblFinder.class) %>%
summarise(total_count = n(),.groups = 'drop') %>% as.data.frame() %>% ungroup() %>%
group_by(dataset) %>%
mutate(countT = sum(total_count)) %>%
group_by(scDblFinder.class, .add = TRUE) %>%
mutate(percent = paste0(round(100 * total_count/countT, 2),'%')) %>%
dplyr::select(-countT)
doublets_summary
write.table(doublets_summary, file = "doublet_folder/scDblFinder_doublets_summary.txt", quote = FALSE, row.names = FALSE, sep = '\t')
Keep singlets only and
recluster
# Keep singlets only and recluster
reference_singlets <- subset(reference_integrated,
subset = scDblFinder.class == "singlet")
reference_singlets <- FindClusters(reference_singlets,
graph.name = "integrated_snn",
resolution = 0.5)
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 11539
Number of edges: 398563
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.8335
Number of communities: 13
Elapsed time: 1 seconds
reference_singlets <- RunUMAP(reference_singlets, dims = 1:15)
Using method 'umap'
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
DimPlot(reference_singlets, group.by = "seurat_clusters") +
ggtitle("Reclustered singlets using integrated_snn")

Validation plots after
doublet removal
DefaultAssay(reference_singlets) <- "RNA"
p1 <- DimPlot(reference_singlets, group.by = "predicted.celltype.l2",
label = TRUE, repel = TRUE, label.size = 3) + NoLegend()
p2 <- DimPlot(reference_singlets, group.by = "dataset")
p3 <- DimPlot(reference_singlets, group.by = "seurat_clusters", label = TRUE)
(p1 | p2) / p3

Validation plots
DimPlot(reference_singlets, group.by = "predicted.celltype.l1",
label = TRUE, repel = TRUE, label.size = 3) + NoLegend()

DimPlot(reference_singlets, group.by = "predicted.celltype.l2",
label = TRUE, repel = TRUE, label.size = 3) + NoLegend()

DimPlot(reference_singlets, group.by = "predicted.celltype.l3",
label = TRUE, repel = TRUE, label.size = 3) + NoLegend()

print(table(reference_singlets$predicted.celltype.l1, reference_singlets$seurat_clusters))
0 1 2 3 4 5 6 7 8 9 10 11 12
CD4 T 2748 2178 1712 1497 860 528 524 426 316 307 278 104 55
CD8 T 0 0 0 0 0 5 0 0 0 0 1 0 0
print(table(reference_singlets$predicted.celltype.l2, reference_singlets$seurat_clusters))
0 1 2 3 4 5 6 7 8 9 10 11 12
CD4 CTL 0 0 0 0 0 11 0 0 0 0 0 0 1
CD4 Naive 0 1010 400 599 0 0 0 14 6 1 12 22 0
CD4 Proliferating 1 0 0 0 0 11 0 0 0 0 0 0 0
CD4 TCM 2731 1159 1305 881 858 377 523 411 308 128 264 82 52
CD4 TEM 6 1 0 0 2 134 0 0 1 0 0 0 0
Treg 10 8 7 17 0 0 1 1 1 178 3 0 2
print(table(reference_singlets$predicted.celltype.l3, reference_singlets$seurat_clusters))
0 1 2 3 4 5 6 7 8 9 10 11 12
CD4 CTL 0 0 0 0 0 11 0 0 0 0 0 0 1
CD4 Naive 1 1013 402 599 0 0 0 15 6 1 12 23 0
CD4 Proliferating 1 0 0 0 0 11 0 0 0 0 0 0 0
CD4 TCM_1 2283 1151 1303 878 200 77 198 392 300 53 245 73 39
CD4 TCM_2 258 1 1 0 13 50 52 16 0 70 10 5 6
CD4 TCM_3 186 3 1 4 645 259 270 2 8 3 6 3 7
CD4 TEM_1 1 0 0 0 0 41 0 0 0 0 0 0 0
CD4 TEM_2 6 1 0 0 0 32 1 0 0 0 0 0 0
CD4 TEM_3 0 0 0 0 2 52 1 0 1 0 0 0 0
Treg Memory 9 5 1 1 0 0 2 1 1 166 6 0 2
Treg Naive 3 4 4 15 0 0 0 0 0 14 0 0 0
Save Reference (before
MapQuery)
# Save
saveRDS(reference_singlets, "../CD4_reference_RPCA_SCT_integrated_doublets_removed.rds")
cat("✅ COMPLETE:", ncol(reference_singlets), "cells integrated\n")
✅ COMPLETE: 11539 cells integrated
RNA marker module
scores (differentiation states)
DefaultAssay(reference_singlets) <- "RNA"
marker_list <- list(
Tnaive = c("CCR7","SELL","LEF1","TCF7","IL7R","CD27","PTPRC"),
Tcm = c("CCR7","SELL","CD27","IL7R","BCL2","TCF7"),
Tem = c("CCR6","CXCR3","GZMK","PRF1","IFNG"),
Temra = c("GZMB","PRF1","KLRG1","CX3CR1"),
Treg = c("FOXP3","IL2RA","CTLA4","IKZF2"),
Tex = c("PDCD1","CTLA4","LAG3","TIGIT","TOX","ENTPD1"),
CD4CTL = c("GZMB","PRF1","NKG7","KLRG1","CX3CR1")
)
marker_list <- lapply(marker_list, function(x)
intersect(x, rownames(reference_singlets))
)
marker_list <- marker_list[vapply(marker_list, length, 1L) > 0]
for (state in names(marker_list)) {
reference_singlets <- AddModuleScore(
reference_singlets,
features = list(marker_list[[state]]),
name = state
)
}
score_features <- paste0(names(marker_list), "1")
FeaturePlot(reference_singlets,
features = score_features,
ncol = 4,
cols = c("lightblue","red"),
max.cutoff = "q95") +
plot_annotation(title = "Differentiation State Module Scores")

FeaturePlots Markers
based


tnaive_plot
tcm_plot
tem_plot

temra_plot

tex_plot

cd4ctl_plot

ADT feature plots and
gating (CD4T_10x_S1 ONLY)


plot_markers_grid_ADT(tnaive_adt, "Tnaive")
plot_markers_grid_ADT(tcm_adt, "Tcm")
plot_markers_grid_ADT(tem_adt, "Tem")

plot_markers_grid_ADT(temra_adt, "Temra/CD4CTL")

plot_markers_grid_ADT(treg_adt, "Treg")