print(table(healthy$predicted.celltype.l2))
ASDC B intermediate B memory B naive CD14 Mono CD16 Mono CD4 CTL CD4 Naive
9 441 169 764 4117 290 12 2107
CD4 Proliferating CD4 TCM CD4 TEM CD8 Naive CD8 Proliferating CD8 TCM CD8 TEM cDC1
13 9527 159 1488 1 370 706 18
cDC2 dnT gdT HSPC ILC MAIT NK NK Proliferating
181 13 119 27 3 325 540 3
NK_CD56bright pDC Plasmablast Platelet Treg
25 92 28 57 230
# Final validation tables
cat("=== ANNOTATION SUMMARY ===\n")
=== ANNOTATION SUMMARY ===
print(table(healthy$predicted.celltype.l1, healthy$dataset))
CD4T_10x_S1 CD4T_10x_S2 CD4T_lab
B 924 479 0
CD4 T 3510 3445 5105
CD8 T 1370 1200 1
DC 204 96 0
Mono 3073 1332 0
NK 450 118 0
other 46 41 0
other T 290 150 0
print(table(healthy$predicted.celltype.l2, healthy$dataset))
CD4T_10x_S1 CD4T_10x_S2 CD4T_lab
ASDC 5 4 0
B intermediate 141 300 0
B memory 88 81 0
B naive 682 82 0
CD14 Mono 2956 1161 0
CD16 Mono 119 171 0
CD4 CTL 1 0 11
CD4 Naive 1367 381 359
CD4 Proliferating 9 4 0
CD4 TCM 2011 2861 4655
CD4 TEM 29 55 75
CD8 Naive 987 501 0
CD8 Proliferating 0 1 0
CD8 TCM 184 185 1
CD8 TEM 203 503 0
cDC1 13 5 0
cDC2 129 52 0
dnT 7 6 0
gdT 74 45 0
HSPC 12 15 0
ILC 3 0 0
MAIT 210 115 0
NK 435 105 0
NK Proliferating 2 1 0
NK_CD56bright 13 12 0
pDC 57 35 0
Plasmablast 12 16 0
Platelet 31 26 0
Treg 87 138 5
print(table(healthy$predicted.celltype.l3, healthy$dataset))
CD4T_10x_S1 CD4T_10x_S2 CD4T_lab
ASDC_mDC 2 3 0
ASDC_pDC 2 1 0
B intermediate kappa 68 46 0
B intermediate lambda 77 254 0
B memory kappa 44 44 0
B memory lambda 42 32 0
B naive kappa 628 84 0
B naive lambda 52 2 0
CD14 Mono 2956 1162 0
CD16 Mono 119 172 0
CD4 CTL 3 0 11
CD4 Naive 1371 382 363
CD4 Proliferating 9 4 0
CD4 TCM_1 1519 1863 4181
CD4 TCM_2 113 148 248
CD4 TCM_3 371 856 226
CD4 TEM_1 12 26 8
CD4 TEM_2 19 11 13
CD4 TEM_3 2 7 49
CD8 Naive 974 376 0
CD8 Naive_2 16 121 0
CD8 Proliferating 0 1 0
CD8 TCM_1 116 118 1
CD8 TCM_2 45 39 0
CD8 TCM_3 16 12 0
CD8 TEM_1 69 273 2
CD8 TEM_2 64 179 0
CD8 TEM_3 19 12 0
CD8 TEM_4 15 0 0
CD8 TEM_5 1 5 0
CD8 TEM_6 32 38 0
cDC1 13 5 0
cDC2_1 43 25 0
cDC2_2 86 26 0
dnT_1 0 1 0
dnT_2 7 7 0
gdT_1 59 16 0
gdT_3 17 41 1
HSPC 12 15 0
ILC 3 0 0
MAIT 211 115 0
NK Proliferating 2 1 0
NK_1 5 1 0
NK_2 375 102 0
NK_3 23 1 0
NK_4 32 2 0
NK_CD56bright 13 12 0
pDC 58 35 0
Plasma 12 16 0
Platelet 31 26 0
Treg Memory 58 134 3
Treg Naive 31 9 0
# Multi-annotation consensus plots: one DimPlot of `healthy` per annotation
# source, each titled after the metadata column it is grouped by.
DimPlot(healthy, group.by = "singler.hpca", label = TRUE, repel = TRUE) +
  ggtitle("HPCA")
DimPlot(healthy, group.by = "singler.immune", label = TRUE, repel = TRUE) +
  ggtitle("Immune")
# Each Azimuth plot carries the level it actually shows, so the three figures
# can be told apart once they are exported or placed side by side.
DimPlot(healthy, group.by = "predicted.celltype.l1", label = TRUE) +
  ggtitle("Azimuth level1")
DimPlot(healthy, group.by = "predicted.celltype.l2", label = TRUE) +
  ggtitle("Azimuth level2")
DimPlot(healthy, group.by = "predicted.celltype.l3", label = TRUE) +
  ggtitle("Azimuth level3")
# ============================================================
# Verify: no contamination remains
# ============================================================
cat("=== AFTER STRICT L2 + L3 CONSENSUS FILTER ===\n")
=== AFTER STRICT L2 + L3 CONSENSUS FILTER ===
cat("Total CD4 cells retained:", ncol(healthy_cd4), "\n\n")
Total CD4 cells retained: 12034
cat("--- Azimuth L2 distribution ---\n")
--- Azimuth L2 distribution ---
print(table(healthy_cd4$predicted.celltype.l2, healthy_cd4$dataset))
CD4T_10x_S1 CD4T_10x_S2 CD4T_lab
CD4 CTL 1 0 11
CD4 Naive 1367 381 359
CD4 Proliferating 9 4 0
CD4 TCM 2010 2856 4654
CD4 TEM 29 51 73
Treg 87 137 5
cat("\n--- Azimuth L3 distribution ---\n")
--- Azimuth L3 distribution ---
print(table(healthy_cd4$predicted.celltype.l3, healthy_cd4$dataset))
CD4T_10x_S1 CD4T_10x_S2 CD4T_lab
CD4 CTL 1 0 11
CD4 Naive 1370 382 363
CD4 Proliferating 9 4 0
CD4 TCM_1 1518 1859 4181
CD4 TCM_2 113 148 248
CD4 TCM_3 370 849 226
CD4 TEM_1 12 26 8
CD4 TEM_2 19 11 13
CD4 TEM_3 2 7 49
Treg Memory 58 134 3
Treg Naive 31 9 0
# Visual check: the filtered object grouped by Azimuth L2 label, Azimuth L3
# label, and dataset of origin, composed into a single figure.

# Panel 1: labelled by Azimuth L2, legend dropped because labels are drawn on
# the plot itself.
p1 <- DimPlot(
  healthy_cd4,
  group.by = "predicted.celltype.l2",
  label = TRUE,
  repel = TRUE,
  label.size = 3
)
p1 <- p1 + NoLegend() + ggtitle("CD4 T-cells — Azimuth L2")

# Panel 2: same settings, labelled by Azimuth L3.
p2 <- DimPlot(
  healthy_cd4,
  group.by = "predicted.celltype.l3",
  label = TRUE,
  repel = TRUE,
  label.size = 3
)
p2 <- p2 + NoLegend() + ggtitle("CD4 T-cells — Azimuth L3")

# Panel 3: grouped by dataset; legend kept.
p3 <- DimPlot(healthy_cd4, group.by = "dataset")
p3 <- p3 + ggtitle("By Dataset")

# Layout: L2 and L3 side by side on top, dataset panel underneath.
(p1 | p2) / p3
# Final cell count per dataset
cat("\n--- Final cell counts ---\n")
--- Final cell counts ---
cat("CD4T_10x_S1 :", sum(healthy_cd4$dataset == "CD4T_10x_S1"), "\n")
CD4T_10x_S1 : 3503
cat("CD4T_10x_S2 :", sum(healthy_cd4$dataset == "CD4T_10x_S2"), "\n")
CD4T_10x_S2 : 3429
cat("CD4T_lab :", sum(healthy_cd4$dataset == "CD4T_lab"), "\n")
CD4T_lab : 5102
cat("=== ANNOTATION SUMMARY ===\n")
=== ANNOTATION SUMMARY ===
# Report how many cells remain in the filtered object.
# NOTE(review): the label says "L1+L2+L3", but the filter banner printed earlier
# reads "STRICT L2 + L3" and the L1 table printed right after this line still
# lists 6 "CD8 T" cells in CD4T_10x_S2. Confirm whether L1 was meant to be part
# of the consensus filter, or whether this label should read "L2+L3".
cat("CD4 after L1+L2+L3 consensus:", ncol(healthy_cd4), "\n")
CD4 after L1+L2+L3 consensus: 12034
print(table(healthy_cd4$predicted.celltype.l1, healthy_cd4$dataset))
CD4T_10x_S1 CD4T_10x_S2 CD4T_lab
CD4 T 3503 3423 5102
CD8 T 0 6 0
print(table(healthy_cd4$predicted.celltype.l2, healthy_cd4$dataset))
CD4T_10x_S1 CD4T_10x_S2 CD4T_lab
CD4 CTL 1 0 11
CD4 Naive 1367 381 359
CD4 Proliferating 9 4 0
CD4 TCM 2010 2856 4654
CD4 TEM 29 51 73
Treg 87 137 5
print(table(healthy_cd4$predicted.celltype.l3, healthy_cd4$dataset))
CD4T_10x_S1 CD4T_10x_S2 CD4T_lab
CD4 CTL 1 0 11
CD4 Naive 1370 382 363
CD4 Proliferating 9 4 0
CD4 TCM_1 1518 1859 4181
CD4 TCM_2 113 148 248
CD4 TCM_3 370 849 226
CD4 TEM_1 12 26 8
CD4 TEM_2 19 11 13
CD4 TEM_3 2 7 49
Treg Memory 58 134 3
Treg Naive 31 9 0
# ============================================================
# MARKER VALIDATION: Check NK / Mono / CD8 contamination
# ============================================================
# Make RNA the default assay for the expression plots below.
DefaultAssay(healthy_cd4) <- "RNA"

# Key marker panel, grouped by the lineage each gene set is meant to flag.
contamination_markers <- list(
  CD4_markers  = c("CD4", "CD3D", "CD3E"),
  CD8_markers  = c("CD8A", "CD8B"),
  NK_markers   = c("NKG7", "GNLY", "NCAM1", "KLRB1"),
  Mono_markers = c("CD14", "LYZ", "S100A8", "CST3")
)

# 1. VlnPlot: expression distribution per Azimuth L2 cluster.
# Each gene is listed exactly once ("CD14" had been repeated), giving six
# features for the ncol = 3 layout.
VlnPlot(
  healthy_cd4,
  features = c("CD4", "CD8A", "NKG7", "CD14", "GNLY", "LYZ"),
  group.by = "predicted.celltype.l2",
  pt.size = 0,
  ncol = 3
) +
  plot_annotation(title = "Contamination Check — Key Markers per CD4 Subtype")

# 2. FeaturePlot: spatial distribution on UMAP.
# max.cutoff = "q95" caps the colour scale at each feature's 95th percentile.
FeaturePlot(
  healthy_cd4,
  features = c("CD4", "CD8A", "NKG7", "CD14", "GNLY", "NCAM1"),
  reduction = "umap",
  ncol = 3,
  max.cutoff = "q95"
) +
  plot_annotation(title = "Marker UMAP — Contamination Check")

# 3. Quantitative: mean expression per L2 cluster.
# AverageExpression() returns a list keyed by assay; keep the RNA matrix
# (genes in rows, L2 labels in columns) and print it rounded to 3 decimals.
marker_means <- AverageExpression(
  healthy_cd4,
  features = c(
    "CD4", "CD8A", "CD8B", "NKG7", "GNLY", "NCAM1", "CD14", "LYZ", "S100A8"
  ),
  group.by = "predicted.celltype.l2",
  assays = "RNA"
)$RNA
print(round(marker_means, 3))
9 x 6 sparse Matrix of class "dgCMatrix"
CD4 CTL CD4 Naive CD4 Proliferating CD4 TCM CD4 TEM Treg
S100A8 19.708 0.142 0.102 0.232 0.051 0.323
GNLY 63.721 0.011 0.026 0.098 0.519 0.021
CD8A . 0.006 2.169 0.071 0.553 0.038
CD8B . 0.092 1.088 0.091 0.430 0.050
CD14 0.147 0.026 . 0.033 . 0.016
NCAM1 . 0.002 . 0.002 . 0.006
CD4 0.147 1.194 0.869 1.093 1.147 1.099
LYZ 1.471 0.071 0.024 0.120 0.038 0.214
NKG7 11.440 0.013 3.125 0.049 1.111 .
# SAVE for trajectory RMD
# Persist the consensus-filtered object (`healthy_cd4`) that the checks above
# validated, which is what the "CD4_Subsetted" file name describes. `healthy`
# still contains the B / Mono / NK / CD8 populations tabulated earlier in this
# file, so saving it under this name would hand the unfiltered data downstream.
saveRDS(healthy_cd4, file = "healthy_CD4_Subsetted_full_annotated_harmony.rds")
```