# ── CRITICAL requirements for this object ─────────────────────────────────
# ✅ CD4 Proliferating cluster removed (MKI67+/TOP2A+/CDK1+)
# ✅ UMAP built with return.model = TRUE (required for MapQuery)
# ✅ monocle3_pseudotime column (will be recomputed here for reproducibility)
# ✅ predicted.celltype.l2 (Azimuth l2 labels)
# ✅ cell_type annotation (clusters 0-6)
reference_integrated <- readRDS(
"../1-Final_Custom_MST_Monocle3_Trajectory_and_mapping/1-Custom_MST/Objects/cd4_ref_dual_trajectory.rds"
)
cat("=== Reference object summary ===\n")=== Reference object summary ===
Cells : 11466
Assays : RNA, ADT, prediction.score.celltype.l1, prediction.score.celltype.l2, prediction.score.celltype.l3, SCT, integrated
Reductions: pca, umap, integrated_dr, ref.umap
UMAP model: TRUE
Clusters : 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
# ── Confirm CD4 Proliferating cells absent ────────────────────────────────
if (any(grepl("Prolif|prolif|cycling|Cycling",
reference_integrated$cell_type, ignore.case = TRUE))) {
warning("⚠️ Proliferating cell_type labels detected — remove before proceeding.")
} else {
cat("✅ No proliferating cell_type labels\n")
}✅ No proliferating cell_type labels
prolif_markers <- c("MKI67", "TOP2A", "PCNA", "CDK1", "STMN1")
prolif_present <- intersect(prolif_markers, rownames(reference_integrated))
if (length(prolif_present) > 0) {
DefaultAssay(reference_integrated) <- "SCT"
prolif_expr <- Matrix::colMeans(
GetAssayData(reference_integrated, layer = "data")[prolif_present, , drop = FALSE]
)
cat("Prolif marker score — max:", round(max(prolif_expr), 3),
"| cells > 0.5:", sum(prolif_expr > 0.5), "\n")
if (sum(prolif_expr > 0.5) > 50) {
warning("⚠️ Substantial proliferation signal — verify removal was complete.")
} else {
cat("✅ Proliferating cells confirmed absent\n")
}
}Prolif marker score — max: 1.187 | cells > 0.5: 6
✅ Proliferating cells confirmed absent
Cell type distribution (reference):
CD4 Tnaive (CCR7+SELL+TCF7+) CD4 TCM (CD161+/IL7R+) CD4 TCM (CCR4+/Th2-like) CD4 CTL/Temra (GZMK+GZMA+CCL5+) CD4 TEM (NF-kB activated)
5479 3994 522 490 412
CD4 Treg (FOXP3+Helios+CD25+) CD4 Tnaive-RTE (IGF1R+)
336 233
Azimuth l2 distribution (reference):
CD4 Naive CD4 TCM CD4 TEM CD4 Temra/CTL Treg
2037 9067 145 10 207
# Extend palette to cover any labels not pre-seeded above
ref_l2_labels <- unique(as.character(reference_integrated$predicted.celltype.l2))
ref_l2_labels <- ref_l2_labels[!is.na(ref_l2_labels)]
azimuth_l2_colors <- extend_palette(azimuth_l2_colors, ref_l2_labels)
cat("\nColour palette covers", length(ref_l2_labels), "reference labels ✅\n")
Colour palette covers 5 reference labels ✅
# ── Hard stop: UMAP model must exist for MapQuery ─────────────────────────
if (is.null(reference_integrated@reductions$umap@misc$model)) {
stop(
"❌ UMAP model missing.\n",
" Re-run: reference_integrated <- RunUMAP(reference_integrated,\n",
" dims = 1:20, return.model = TRUE)\n",
" Then re-save and reload."
)
} else {
cat("✅ UMAP model present — MapQuery projection ready\n")
}✅ UMAP model present — MapQuery projection ready
All_samples_Merged <- readRDS(
"/home/nabbasi/apollo_home/1-Seurat_RDS_OBJECT_FINAL/All_samples_Merged_with_Renamed_Clusters_Cell_state-03-12-2025.rds.rds"
)
cat("All cell lines:\n")All cell lines:
L1 L2 L3 L4 L5 L6 L7 CD4Tcells_lab CD4Tcells_10x
5825 5935 6428 6006 6022 5148 5331 5106 3504
All_samples_Merged$Group <- ifelse(
All_samples_Merged$cell_line %in% paste0("L", 1:7), "MalignantCD4T",
ifelse(
All_samples_Merged$cell_line %in%
c("CD4Tcells_lab", "CD4Tcells_10x"),
"NormalCD4T", "Other"
)
)
cat("\nGroup distribution:\n")
Group distribution:
MalignantCD4T NormalCD4T
40695 8610
MalignantCD4T_raw <- subset(All_samples_Merged, subset = Group == "MalignantCD4T")
cat("\nMalignant CD4T cells:", ncol(MalignantCD4T_raw), "\n")
Malignant CD4T cells: 40695
L1 L2 L3 L4 L5 L6 L7 CD4Tcells_lab CD4Tcells_10x
5825 5935 6428 6006 6022 5148 5331 0 0
used (Mb) gc trigger (Mb) max used (Mb)
Ncells 9068604 484.4 15929212 850.8 15929212 850.8
Vcells 1394672850 10640.6 4458683922 34017.1 3722027787 28396.9
# ══════════════════════════════════════════════════════════════════════════
# PER-CELL-LINE SCTransform
# ══════════════════════════════════════════════════════════════════════════
# WHY: Each cell line is a separate library with different sequencing depth
# and ambient RNA profile. A single merged SCT model conflates batch
# variation with biology. Per-line SCT corrects for this before projection.
#
# APPROACH:
# 1. Split by cell_line
# 2. SCTransform each line (regress percent.mt + cell cycle scores)
# 3. Select HVGs by cross-line frequency — most robust to batch effects
# 4. Merge, scale, PCA — ready for FindTransferAnchors
# ══════════════════════════════════════════════════════════════════════════
cat("=== Per-cell-line SCTransform ===\n")=== Per-cell-line SCTransform ===
# Save names BEFORE lapply to avoid double SplitObject call
# FIX: original script called SplitObject twice — wasteful and risky
cell_line_names <- names(SplitObject(MalignantCD4T_raw, split.by = "cell_line"))
cell_line_list <- SplitObject(MalignantCD4T_raw, split.by = "cell_line")
cat("Lines to process:", paste(cell_line_names, collapse = ", "), "\n\n")Lines to process: L1, L2, L3, L4, L5, L6, L7
cell_line_list <- lapply(cell_line_names, function(line_name) {
obj <- cell_line_list[[line_name]]
cat("Processing:", line_name, "| Cells:", ncol(obj), "\n")
if (!"percent.mt" %in% colnames(obj@meta.data))
obj[["percent.mt"]] <- PercentageFeatureSet(obj, pattern = "^MT-")
# Cell cycle regression removes cycling artefacts without removing cells
vars_regress <- tryCatch({
obj <- CellCycleScoring(obj,
s.features = cc.genes$s.genes,
g2m.features = cc.genes$g2m.genes,
set.ident = FALSE)
cat(" → Cell cycle scores computed\n")
c("percent.mt", "S.Score", "G2M.Score")
}, error = function(e) {
cat(" ⚠️ Cell cycle scoring failed — regressing percent.mt only\n")
"percent.mt"
})
obj <- SCTransform(obj,
vars.to.regress = vars_regress,
variable.features.n = 3000,
vst.flavor = "v2",
verbose = FALSE)
cat(" ✅", line_name, "complete |", length(VariableFeatures(obj)), "HVGs\n")
return(obj)
})Processing: L1 | Cells: 5825
→ Cell cycle scores computed
✅ L1 complete | 3000 HVGs
Processing: L2 | Cells: 5935
→ Cell cycle scores computed
✅ L2 complete | 3000 HVGs
Processing: L3 | Cells: 6428
→ Cell cycle scores computed
✅ L3 complete | 3000 HVGs
Processing: L4 | Cells: 6006
→ Cell cycle scores computed
✅ L4 complete | 3000 HVGs
Processing: L5 | Cells: 6022
→ Cell cycle scores computed
✅ L5 complete | 3000 HVGs
Processing: L6 | Cells: 5148
→ Cell cycle scores computed
✅ L6 complete | 3000 HVGs
Processing: L7 | Cells: 5331
→ Cell cycle scores computed
✅ L7 complete | 3000 HVGs
names(cell_line_list) <- cell_line_names # reuse saved names — no second SplitObject
# ── Select HVGs by cross-line frequency ───────────────────────────────────
all_hvg_lists <- lapply(cell_line_list, VariableFeatures)
hvg_freq <- sort(table(unlist(all_hvg_lists)), decreasing = TRUE)
n_hvgs <- min(3000, length(hvg_freq))
shared_hvgs <- names(hvg_freq)[1:n_hvgs]
cat("\nHVG selection:\n")
HVG selection:
cat(" Variable in all", length(cell_line_list), "lines:",
sum(hvg_freq == length(cell_line_list)), "\n") Variable in all 7 lines: 689
Variable in ≥3 lines: 3271
Final HVG set: 3000
# ── Merge, scale, PCA ─────────────────────────────────────────────────────
MalignantCD4T <- merge(cell_line_list[[1]], y = cell_line_list[-1],
merge.data = TRUE)
VariableFeatures(MalignantCD4T) <- shared_hvgs
cat("\nRunning PCA on merged object...\n")
Running PCA on merged object...
MalignantCD4T <- ScaleData(MalignantCD4T, features = shared_hvgs,
assay = "SCT", verbose = FALSE)
MalignantCD4T <- RunPCA(MalignantCD4T, features = shared_hvgs,
assay = "SCT", npcs = 30, verbose = FALSE)
cat("✅ Merged object ready\n")✅ Merged object ready
Cells: 40695
HVGs: 3000
PCA dims: 30
L1 L2 L3 L4 L5 L6 L7
5825 5935 6428 6006 6022 5148 5331
used (Mb) gc trigger (Mb) max used (Mb)
Ncells 9453241 504.9 15929212 850.8 15929212 850.8
Vcells 1387730132 10587.6 4280400565 32656.9 5350467741 40820.9
# Genes variable in ALL 7 lines
hvg_all7 <- names(hvg_freq[hvg_freq == 7])
cat("Genes variable in all 7 lines:", length(hvg_all7), "\n")Genes variable in all 7 lines: 689
[1] "AARS" "ABCC1" "ABHD3" "AC004687.1" "AC006064.4" "AC008105.3" "AC011603.2" "AC020916.1" "ACAT2" "ACLY" "ACTB"
[12] "ACTG1" "ACTN4" "ADA" "ADAM19" "ADGRE5" "AHNAK" "AHR" "AKAP13" "AL133415.1" "AL138963.4" "AL662797.1"
[23] "ALOX5AP" "ANKRD17" "ANKRD44" "ANLN" "ANXA1" "ANXA2" "ANXA6" "APP" "ARGLU1" "ARHGAP11A" "ARHGAP15"
[34] "ARHGDIB" "ARHGEF6" "ARL4C" "ARL6IP1" "ASF1B" "ASPM" "ATAD2" "ATP1A1" "ATP2A2" "ATP2B1" "ATP5MC1"
[45] "ATXN1" "AURKA" "AURKB" "B2M" "BACH2" "BCAT1" "BCL2" "BHLHE40" "BIRC3" "BIRC5" "BIRC6"
[56] "BLVRA" "BRCA1" "BRIP1" "BTG1" "BTG2" "BUB1" "BUB1B" "C12orf75" "C21orf58" "CALM1" "CALM2"
[67] "CALR" "CAMK4" "CANX" "CAPN2" "CARHSP1" "CASK" "CASP8" "CAST" "CBR3" "CCDC28B" "CCDC86"
[78] "CCL5" "CCNA2" "CCNB1" "CCNB2" "CCND2" "CCND3" "CCNE1" "CCNE2" "CCNF" "CCR7" "CCT5"
[89] "CD151" "CD3D" "CD48" "CD52" "CD55" "CD59" "CD69" "CD70" "CD74" "CD96" "CDC20"
[100] "CDC6" "CDCA2" "CDCA3" "CDCA5" "CDCA7" "CDCA8" "CDK1" "CDK2AP2" "CDK6" "CDKAL1" "CDKN1A"
[111] "CDKN2D" "CDKN3" "CDT1" "CEBPB" "CELF2" "CENPA" "CENPE" "CENPF" "CENPU" "CEP128" "CEP55"
[122] "CHAC1" "CHAF1A" "CHCHD10" "CHST11" "CISH" "CKAP2" "CKAP2L" "CKAP5" "CKS1B" "CKS2" "CLDND1"
[133] "CLSPN" "CLTC" "CNN2" "CORO1A" "CORO1B" "COTL1" "CRIP1" "CTSC" "CTSD" "CXCR4" "CYBA"
[144] "CYLD" "CYP51A1" "CYTH1" "CYTIP" "CYTOR" "DANCR" "DCTN1" "DDIT3" "DDIT4" "DDX21" "DDX3X"
[155] "DDX6" "DENND1B" "DENND4C" "DEPDC1" "DEPDC1B" "DIAPH3" "DLG1" "DLGAP5" "DNAJA1" "DNAJB1" "DOCK10"
[166] "DOCK2" "DOCK8" "DSCC1" "DTL" "DUSP2" "DYNLL1" "E2F1" "ECT2" "EEF1A1" "EEF2" "EFHD2"
[177] "EIF1" "EIF4G1" "ELMO1" "EMP3" "ENO1" "ERC1" "ERN1" "ESCO2" "ESYT1" "ESYT2" "ETS1"
[188] "EVL" "EXOC4" "EZH2" "FABP5" "FAM107B" "FAM111A" "FAM111B" "FAM83D" "FASN" "FBXL17" "FBXL20"
[199] "FBXO5" "FDFT1" "FEN1" "FKBP11" "FKBP4" "FKBP5" "FLI1" "FLNA" "FLOT1" "FNDC3A" "FOS"
[210] "FOXN3" "FOXP2" "FTL" "FTX" "FUS" "FXYD5" "FYN" "GABPB1-AS1" "GAPDH" "GARS" "GAS5"
[221] "GATA3" "GINS2" "GNG2" "GOLGB1" "GPHN" "GPR15" "GPR171" "GPRIN3" "GRN" "GSTP1" "GTSE1"
[232] "H1FX" "H2AFX" "H2AFZ" "H3F3B" "HCST" "HDAC9" "HELLS" "HERPUD1" "HIPK2" "HIST1H1A" "HIST1H1B"
[243] "HIST1H1C" "HIST1H1D" "HIST1H1E" "HIST1H2AC" "HIST1H2AE" "HIST1H2AG" "HIST1H2AL" "HIST1H2BC" "HIST1H2BJ" "HIST1H3B" "HIST1H3D"
[254] "HIST1H3H" "HIST1H3I" "HIST1H4C" "HIST2H2AB" "HIST2H2AC" "HIST2H2BF" "HJURP" "HLA-A" "HLA-B" "HLA-C" "HLA-E"
[265] "HMGB2" "HMGCR" "HMGCS1" "HMGN2" "HMMR" "HNRNPA3" "HNRNPAB" "HNRNPH1" "HNRNPU" "HNRNPUL2" "HP1BP3"
[276] "HPGD" "HSP90AA1" "HSP90AB1" "HSP90B1" "HSPA1A" "HSPA1B" "HSPA5" "HSPA8" "HSPA9" "HSPB1" "HSPD1"
[287] "HSPE1" "HSPH1" "HUWE1" "IARS" "ID2" "IDH2" "IDI1" "IER2" "IER3" "IFITM1" "IGF2R"
[298] "IKZF1" "IKZF2" "IL10RA" "IL2RB" "IL32" "IL4R" "IL9R" "ILF3-DT" "IMMP2L" "INCENP" "INPP4B"
[309] "INSIG1" "IPO5" "IQGAP1" "IRF1" "ISG20" "ITGA4" "ITGAL" "ITGB2" "ITGB7" "ITK" "ITM2B"
[320] "ITM2C" "JPT1" "JUN" "JUNB" "JUND" "KCNQ5" "KIF11" "KIF14" "KIF15" "KIF18B" "KIF20A"
[331] "KIF20B" "KIF21B" "KIF23" "KIF2C" "KIF4A" "KIFC1" "KLF6" "KMT2C" "KNL1" "KNSTRN" "KPNA2"
[342] "LARP1" "LASP1" "LAT" "LBR" "LCP1" "LCP2" "LDHA" "LDLR" "LDLRAD4" "LGALS1" "LIME1"
[353] "LINC00892" "LINC01572" "LMNA" "LMNB1" "LMO4" "LRBA" "LRPPRC" "LSP1" "LTA" "LTB" "LY6E"
[364] "LYST" "MACF1" "MALAT1" "MANF" "MAP3K8" "MARCKSL1" "MAT2A" "MBD5" "MBNL1" "MBP" "MCL1"
[375] "MCM10" "MCM2" "MCM3" "MCM4" "MCM5" "MCM6" "MCM7" "MDFIC" "MDN1" "MGST3" "MIB1"
[386] "MIF" "MIR4435-2HG" "MIS18BP1" "MKI67" "MKNK2" "MMP25" "MSH6" "MSI2" "MSMO1" "MSN" "MT-ATP6"
[397] "MT-ATP8" "MT-CO1" "MT-CO2" "MT-CO3" "MT-CYB" "MT-ND1" "MT-ND2" "MT-ND3" "MT-ND4" "MT-ND4L" "MT-ND5"
[408] "MT-ND6" "MT1X" "MT2A" "MTHFD2" "MTRNR2L12" "MXD3" "MYB" "MYC" "MYH9" "MYL6" "MYO1F"
[419] "MYO1G" "MZB1" "NAMPT" "NCAPD2" "NCAPG" "NCAPG2" "NCL" "NCOA3" "NDC80" "NEAT1" "NEIL3"
[430] "NEK2" "NEK7" "NFAT5" "NFATC2" "NFE2L3" "NFKB1" "NFKBIA" "NIBAN1" "NINJ1" "NKTR" "NME1"
[441] "NOLC1" "NOP16" "NORAD" "NPM1" "NQO1" "NR3C1" "NSD2" "NSMCE2" "NUCB2" "NUDT8" "NUF2"
[452] "NUFIP2" "NUMA1" "NUSAP1" "ODC1" "OPTN" "OSBPL3" "OSTF1" "P2RY8" "PALM2-AKAP2" "PARP14" "PCLAF"
[463] "PCNA" "PDE3B" "PDE4D" "PDE7A" "PGAM1" "PGK1" "PHACTR2" "PHF19" "PHLDA1" "PIK3CD" "PIK3R1"
[474] "PIM1" "PIM2" "PIM3" "PKM" "PKMYT1" "PLAAT4" "PLEC" "PLK1" "PLP2" "PLPP1" "PMAIP1"
[485] "PNN" "PNRC1" "POLR2A" "PPDPF" "PPP1R15A" "PPP3CA" "PRC1" "PRDX1" "PREX1" "PRKCA" "PRKDC"
[496] "PRNP" "PRR11" "PSAT1" "PTMA" "PTMS" "PTPN6" "PTPN7" "PTPRC" "PTTG1" "PUM3" "PUS7"
[507] "PVT1" "PYCARD" "RAB11FIP1" "RAB37" "RABGAP1L" "RACGAP1" "RAD21" "RAD51B" "RASGRP1" "RBL1" "RBM38"
[518] "RBPJ" "RCC2" "RCSD1" "REEP5" "RELB" "RERE" "RHBDD2" "RHOC" "RNF213" "RORA" "RPL10"
[529] "RPL11" "RPL12" "RPL13" "RPL19" "RPL22L1" "RPL32" "RPL41" "RPLP0" "RPLP1" "RPS12" "RPS14"
[540] "RPS18" "RPS2" "RPS23" "RPS3" "RPS3A" "RPS6KA5" "RPS8" "RRM2" "RSRP1" "RUNX3" "S100A10"
[551] "S100A11" "S100A4" "S100A6" "S100P" "SAC3D1" "SAMD9" "SASH3" "SCLT1" "SCPEP1" "SDCBP" "SEC14L1"
[562] "SELENOW" "SELPLG" "SEMA4D" "SEPTIN9" "SERPINB1" "SETX" "SFPQ" "SGO2" "SH2D2A" "SH3BGRL3" "SH3BP1"
[573] "SIK3" "SIT1" "SKAP1" "SLBP" "SLC16A1-AS1" "SLC1A5" "SLC20A1" "SLC25A32" "SLC2A3" "SLC38A2" "SLC3A2"
[584] "SLC43A3" "SLC4A7" "SLC7A5" "SLC9A3R1" "SLFN12L" "SMARCA2" "SMC1A" "SMC4" "SMG1" "SMYD3" "SNHG12"
[595] "SNHG15" "SNHG3" "SNHG7" "SNHG8" "SNRNP200" "SOCS1" "SORL1" "SOS1" "SP140" "SPAG5" "SPIDR"
[606] "SPOCK2" "SPTAN1" "SPTBN1" "SQLE" "SQSTM1" "SREBF2" "SRGN" "SRM" "SRRT" "SRSF7" "SSBP2"
[617] "ST8SIA4" "STAT1" "STAT3" "STAT4" "STK10" "STK17B" "STMN1" "SUN2" "SYNE2" "SYTL3" "TACC3"
[628] "TAF15" "TAGLN2" "TBC1D5" "TBL1X" "TCF12" "TCP1" "TFRC" "TIMP1" "TK1" "TMBIM1" "TMEM173"
[639] "TMPO" "TMSB10" "TMSB4X" "TMTC2" "TMX4" "TNFRSF1B" "TNFSF10" "TNIK" "TOP2A" "TPM4" "TPX2"
[650] "TRAF1" "TRAF3IP3" "TRBV20-1" "TRG-AS1" "TRIM44" "TRIM56" "TRIM59" "TRIO" "TROAP" "TSC22D3" "TTK"
[661] "TUBA1A" "TUBA1B" "TUBA1C" "TUBA4A" "TUBB" "TUBB4B" "TYMS" "UBALD2" "UBC" "UBE2C" "UBE2S"
[672] "UBR4" "UCP2" "UHRF1" "UNG" "VIM" "WARS" "WDR76" "WWOX" "XBP1" "YWHAG" "ZC3HAV1"
[683] "ZEB1" "ZFAND3" "ZFAS1" "ZFP36" "ZFP36L1" "ZFP36L2" "ZYX"
# Check which canonical T cell markers are in the all-7 set
t_cell_markers <- c(
# Naive/memory
"CCR7", "SELL", "TCF7", "IL7R", "LEF1", "KLF2",
# Activation/exhaustion
"TOX", "PDCD1", "LAG3", "TIGIT", "CTLA4", "HAVCR2",
# Effector
"GZMB", "GZMK", "GZMA", "PRF1", "IFNG", "TNF",
# Treg
"FOXP3", "IL2RA", "IKZF2", "CTLA4",
# Sézary specific
"KIR3DL2", "PLS3", "TWIST1", "EPHA4", "CD164",
# Proliferation
"MKI67", "TOP2A", "CDK1"
)
found_in_all7 <- intersect(t_cell_markers, hvg_all7)
cat("\nCanonical markers in all-7 HVG set:\n")
Canonical markers in all-7 HVG set:
[1] "CCR7" "IKZF2" "MKI67" "TOP2A" "CDK1"
Markers NOT in all-7 set:
[1] "SELL" "TCF7" "IL7R" "LEF1" "KLF2" "TOX" "PDCD1" "LAG3" "TIGIT" "CTLA4" "HAVCR2" "GZMB" "GZMK" "GZMA" "PRF1" "IFNG"
[17] "TNF" "FOXP3" "IL2RA" "KIR3DL2" "PLS3" "TWIST1" "EPHA4" "CD164"
# Check what junk genes are in your current HVG set
# before find-anchors runs
cat("=== Junk gene check on shared_hvgs ===\n")=== Junk gene check on shared_hvgs ===
mt_in_hvgs <- shared_hvgs[grepl("^MT-", shared_hvgs)]
cat("MT genes in shared_hvgs:", length(mt_in_hvgs), "\n")MT genes in shared_hvgs: 13
[1] "MT-ATP6" "MT-ATP8" "MT-CO1" "MT-CO2" "MT-CO3" "MT-CYB" "MT-ND1" "MT-ND2" "MT-ND3" "MT-ND4" "MT-ND4L" "MT-ND5" "MT-ND6"
ribo_in_hvgs <- shared_hvgs[grepl("^RPL|^RPS", shared_hvgs)]
cat("\nRibosomal genes in shared_hvgs:", length(ribo_in_hvgs), "\n")
Ribosomal genes in shared_hvgs: 53
[1] "RPL10" "RPL11" "RPL12" "RPL13" "RPL19" "RPL22L1" "RPL32" "RPL41" "RPLP0" "RPLP1" "RPS12" "RPS14" "RPS18" "RPS2" "RPS23" "RPS3"
[17] "RPS3A" "RPS6KA5" "RPS8" "RPL13A" "RPL18A" "RPL28" "RPL29" "RPL35A" "RPL5" "RPL7A" "RPL8" "RPS13" "RPS6" "RPS6KA3" "RPS7" "RPL17"
[33] "RPL23" "RPL30" "RPL35" "RPL6" "RPL7" "RPS15A" "RPS27L" "RPS4X" "RPL14" "RPL23A" "RPL26" "RPL27A" "RPL3" "RPL37" "RPL4" "RPS17"
[49] "RPS20" "RPS24" "RPS27A" "RPS6KA1" "RPS6KC1"
hsp_in_hvgs <- shared_hvgs[grepl("^HSP|^HSPA|^HSPB", shared_hvgs)]
cat("\nHeat shock genes in shared_hvgs:", length(hsp_in_hvgs), "\n")
Heat shock genes in shared_hvgs: 13
[1] "HSP90AA1" "HSP90AB1" "HSP90B1" "HSPA1A" "HSPA1B" "HSPA5" "HSPA8" "HSPA9" "HSPB1" "HSPD1" "HSPE1" "HSPH1" "HSPA4"
snhg_in_hvgs <- shared_hvgs[grepl("^SNHG|MALAT1|NEAT1", shared_hvgs)]
cat("\nlncRNA genes in shared_hvgs:", length(snhg_in_hvgs), "\n")
lncRNA genes in shared_hvgs: 15
[1] "MALAT1" "NEAT1" "SNHG12" "SNHG15" "SNHG3" "SNHG7" "SNHG8" "SNHG1" "SNHG29" "SNHG16" "SNHG25" "SNHG17" "SNHG30" "SNHG32" "SNHG5"
cat("\nTotal junk genes to be filtered:",
length(mt_in_hvgs) + length(ribo_in_hvgs) +
length(hsp_in_hvgs) + length(snhg_in_hvgs), "\n")
Total junk genes to be filtered: 94
=== Building Monocle3 CDS ===
cds <- as.cell_data_set(reference_integrated)
# Transfer frozen UMAP coordinates — these define the trajectory space
reducedDim(cds, "UMAP") <- Embeddings(reference_integrated, "umap")
# Single partition: one connected graph across all healthy cells.
# CD4 Proliferating cells have been removed so no spurious cell-cycle
# branch will pull the graph away from the differentiation axis.
partition_vec <- setNames(factor(rep(1L, ncol(cds))), colnames(cds))
cds@clusters$UMAP$partitions <- partition_vec
cluster_vec <- setNames(
factor(reference_integrated$seurat_clusters[colnames(cds)]),
colnames(cds)
)
cds@clusters$UMAP$clusters <- cluster_vec
# Transfer metadata
colData(cds)$cell_line <- reference_integrated$orig.ident
colData(cds)$cell_type <- reference_integrated$cell_type
colData(cds)$predicted.celltype.l2 <- reference_integrated$predicted.celltype.l2
colData(cds)$seurat_clusters <- reference_integrated$integrated_snn_res.0.2
if ("orig.ident" %in% colnames(reference_integrated@meta.data))
colData(cds)$sample <- reference_integrated$orig.ident
# Validate all required slots are named correctly
stopifnot(
"clusters must be named" = !is.null(names(cds@clusters$UMAP$clusters)),
"partitions must be named" = !is.null(names(cds@clusters$UMAP$partitions)),
"cluster length matches" = length(cds@clusters$UMAP$clusters) == ncol(cds),
"partition length matches" = length(cds@clusters$UMAP$partitions) == ncol(cds)
)
cat("CDS built:", ncol(cds), "cells\n")CDS built: 11466 cells
Clusters: 8
Partitions: 1
Azimuth l2 breakdown:
CD4 Naive CD4 TCM CD4 TEM CD4 Temra/CTL Treg
2037 9067 145 10 207
# ── Learn principal graph ──────────────────────────────────────────────────
# Goal: recover two biological lineages as a single connected graph:
# Lineage 1: Naive → TCM → TEM → Temra/CTL (effector axis)
# Lineage 2: Naive → TCM → Treg (regulatory branch)
#
# use_partition = FALSE: single connected graph (no partition forcing)
# close_loop = FALSE: open trajectory (no loop back)
# minimal_branch_len = 5: short enough to detect the Treg branch
# — if Treg branch not detected, reduce to 3
# — if too many spurious branches, increase to 8
#
# NOTE: nn.k is NOT a valid learn_graph_control parameter in monocle3.
# FIX: removed nn.k — monocle3 handles neighbourhood structure internally.
# The key sensitivity parameter is minimal_branch_len.
set.seed(42)
cds <- learn_graph(
cds,
use_partition = FALSE,
close_loop = FALSE,
learn_graph_control = list(
minimal_branch_len = 10,
# ncenter = 100, # default is ~250 — reduce to simplify graph
orthogonal_proj_tip = FALSE
),
verbose = TRUE
)
cat("\n✅ Principal graph learned\n")
✅ Principal graph learned
Nodes: 349
n_branch_points <- sum(igraph::degree(principal_graph(cds)$UMAP) > 2)
cat("Branch points:", n_branch_points, "\n")Branch points: 18
if (n_branch_points == 0) {
warning(
"❌ No branch point — Treg lineage not separated.\n",
" Re-run learn_graph with minimal_branch_len = 3"
)
} else if (n_branch_points > 3) {
warning(
"⚠️ Too many branch points (", n_branch_points, ").\n",
" Re-run learn_graph with minimal_branch_len = 8"
)
} else {
cat("✅ Branch structure looks correct (1-3 branch points)\n")
}
# ── Trajectory visualisation ──────────────────────────────────────────────
umap_coords <- as.data.frame(Embeddings(reference_integrated, "umap"))
colnames(umap_coords) <- c("UMAP1", "UMAP2")
umap_coords$l2 <- reference_integrated$predicted.celltype.l2
umap_coords$cell_type <- reference_integrated$cell_type
label_l2 <- umap_coords %>%
group_by(l2) %>%
summarise(UMAP1 = median(UMAP1), UMAP2 = median(UMAP2), .groups = "drop")
label_ct <- umap_coords %>%
group_by(cell_type) %>%
summarise(UMAP1 = median(UMAP1), UMAP2 = median(UMAP2), .groups = "drop")
p_graph_l2 <- plot_cells(
cds,
color_cells_by = "predicted.celltype.l2",
label_groups_by_cluster = FALSE,
label_leaves = FALSE,
label_branch_points = TRUE,
cell_size = 0.7,
show_trajectory_graph = TRUE
) +
scale_color_manual(values = azimuth_l2_colors, name = "State") +
geom_label_repel(data = label_l2,
aes(x = UMAP1, y = UMAP2, label = l2),
size = 4, fontface = "bold", fill = "white", alpha = 0.85,
box.padding = 0.5, segment.color = "grey40",
inherit.aes = FALSE) +
ggtitle("Monocle3 Graph: Azimuth l2\n(Two lineages: effector axis + Treg branch)") +
theme(plot.title = element_text(hjust = 0.5, size = 13, face = "bold"),
legend.position = "none")
p_graph_ct <- plot_cells(
cds,
color_cells_by = "cell_type",
label_groups_by_cluster = FALSE,
label_leaves = FALSE,
label_branch_points = TRUE,
cell_size = 0.7,
show_trajectory_graph = TRUE
) +
geom_label_repel(data = label_ct,
aes(x = UMAP1, y = UMAP2, label = cell_type),
size = 3.5, fontface = "bold", fill = "white", alpha = 0.85,
box.padding = 0.5, segment.color = "grey40",
inherit.aes = FALSE) +
ggtitle("Monocle3 Graph: Cell Types") +
theme(plot.title = element_text(hjust = 0.5, size = 13, face = "bold"),
legend.position = "none")
p_graph_l2 | p_graph_ctAzimuth l2 labels in CDS:
[1] CD4 Naive CD4 TCM CD4 TEM CD4 Temra/CTL Treg
Levels: CD4 Naive CD4 TCM CD4 TEM CD4 Temra/CTL Treg
# ── Identify naive cells ───────────────────────────────────────────────────
naive_ids <- rownames(colData(cds))[
grepl("naive|Naive|TN$", colData(cds)$predicted.celltype.l2,
ignore.case = TRUE) |
grepl("Tnaive", as.character(colData(cds)$cell_type),
ignore.case = FALSE)
]
cat("\nNaive root cells:", length(naive_ids), "\n")
Naive root cells: 5739
if (length(naive_ids) == 0)
stop("❌ No naive cells found. Check label patterns above.")
# ── Find root node by minimum distance to naive centroid ──────────────────
# Using centroid distance (not most-populated node) is more robust:
# the most-populated node biases toward the densest part of the naive
# cluster which may not be the earliest point on the trajectory.
naive_umap <- Embeddings(reference_integrated, "umap")[naive_ids, ]
naive_centroid <- colMeans(naive_umap)
pr_nodes <- t(cds@principal_graph_aux[["UMAP"]]$dp_mst)
node_dists <- rowSums(
(pr_nodes - matrix(naive_centroid,
nrow = nrow(pr_nodes), ncol = 2,
byrow = TRUE))^2
)
root_node <- names(which.min(node_dists))
cat("Root node selected:", root_node, "\n")Root node selected: Y_256
Root coords: (-3.424, -0.886)
Naive centroid: (-2.865, -0.651)
# ── Order cells ────────────────────────────────────────────────────────────
cds <- order_cells(cds, root_pr_nodes = root_node)
# Transfer pseudotime back to Seurat object
reference_integrated$monocle3_pseudotime <- pseudotime(cds)
reference_integrated$monocle3_pseudotime[
!is.finite(reference_integrated$monocle3_pseudotime)] <- NA
cat("\nPseudotime summary (reference):\n")
Pseudotime summary (reference):
print(summary(reference_integrated$monocle3_pseudotime[
is.finite(reference_integrated$monocle3_pseudotime)])) Min. 1st Qu. Median Mean 3rd Qu. Max.
0.000 4.226 9.459 11.396 18.602 25.553
# ══════════════════════════════════════════════════════════════════════════
# TOPOLOGY VALIDATION
# ══════════════════════════════════════════════════════════════════════════
# Correct order for each lineage:
# Effector: Naive < TCM < TEM < Temra
# Treg: Naive < TCM — Treg (Treg branches from TCM, NOT after TEM)
#
# This is a hard biological requirement. If Treg > TEM in pseudotime,
# the graph has placed Treg downstream of TEM which is anatomically wrong.
# Treg differentiates from TCM-stage precursors via FOXP3 induction,
# not from terminally differentiated TEM/Temra cells.
pt_check <- reference_integrated@meta.data %>%
filter(is.finite(monocle3_pseudotime)) %>%
group_by(predicted.celltype.l2) %>%
summarise(
n = n(),
med_pt = round(median(monocle3_pseudotime, na.rm = TRUE), 3),
mean_pt = round(mean(monocle3_pseudotime, na.rm = TRUE), 3),
.groups = "drop"
) %>%
arrange(med_pt)
cat("\n=== Pseudotime order per state (must be Naive < TCM < Treg < TEM < Temra) ===\n")
=== Pseudotime order per state (must be Naive < TCM < Treg < TEM < Temra) ===
# Extract state medians — used for bin boundaries downstream
naive_med <- pt_check$med_pt[pt_check$predicted.celltype.l2 == "CD4 Naive"]
tcm_med <- pt_check$med_pt[pt_check$predicted.celltype.l2 == "CD4 TCM"]
treg_med <- pt_check$med_pt[pt_check$predicted.celltype.l2 == "Treg"]
tem_med <- pt_check$med_pt[pt_check$predicted.celltype.l2 == "CD4 TEM"]
temra_med <- pt_check$med_pt[pt_check$predicted.celltype.l2 == "CD4 Temra/CTL"]
# FIX: single-string sprintf — original had two format strings which
# caused the second string to be silently ignored
cat(sprintf(
"\nState medians: Naive=%.2f | TCM=%.2f | Treg=%.2f | TEM=%.2f | Temra=%.2f\n",
naive_med, tcm_med, treg_med, tem_med, temra_med
))
State medians: Naive=3.99 | TCM=13.36 | Treg=18.75 | TEM=24.78 | Temra=25.25
# Hard stops — script will not continue if topology is biologically wrong
if (naive_med > tcm_med)
stop(sprintf(
"❌ TOPOLOGY ERROR: Naive (%.2f) > TCM (%.2f).\n Root node may be misplaced — check naive_centroid coordinates.",
naive_med, tcm_med))
if (tcm_med > tem_med)
stop(sprintf(
"❌ TOPOLOGY ERROR: TCM (%.2f) > TEM (%.2f).\n Re-check root node selection.",
tcm_med, tem_med))
if (treg_med > tem_med)
stop(sprintf(
"❌ TOPOLOGY ERROR: Treg (%.2f) > TEM (%.2f).\n Treg must branch from TCM, not be downstream of TEM.\n Fix: re-run learn_graph with minimal_branch_len = 3",
treg_med, tem_med))
# FIX: single sprintf format string (was split across two strings before)
cat(sprintf(
"✅ Topology confirmed: Naive(%.2f) < TCM(%.2f) < Treg(%.2f) < TEM(%.2f) < Temra(%.2f)\n",
naive_med, tcm_med, treg_med, tem_med, temra_med
))✅ Topology confirmed: Naive(3.99) < TCM(13.36) < Treg(18.75) < TEM(24.78) < Temra(25.25)
# ── Pseudotime UMAP plots ─────────────────────────────────────────────────
plot_cells(
cds,
color_cells_by = "pseudotime",
label_cell_groups = FALSE,
cell_size = 0.8,
show_trajectory_graph = TRUE
) +
scale_color_viridis_c(option = "plasma", name = "Pseudotime") +
ggtitle("Reference Pseudotime\n(Root = CD4 Naive | Two lineages: TEM/Temra + Treg)") +
theme(plot.title = element_text(hjust = 0.5, size = 13, face = "bold"))# Visual validation that pseudotime order matches expected biology.
# Both violin and ridge plots should show:
# Naive (lowest) → TCM → Treg (branch, intermediate) → TEM → Temra (highest)
pt_data <- data.frame(
pseudotime = reference_integrated$monocle3_pseudotime,
l2 = reference_integrated$predicted.celltype.l2
) %>% filter(is.finite(pseudotime), !is.na(l2))
# Order x-axis by median pseudotime — should give biological order
l2_order <- pt_data %>%
group_by(l2) %>%
summarise(med_pt = median(pseudotime, na.rm = TRUE), .groups = "drop") %>%
arrange(med_pt) %>%
pull(l2)
pt_data$l2 <- factor(pt_data$l2, levels = l2_order)
p_violin <- ggplot(pt_data, aes(x = l2, y = pseudotime, fill = l2)) +
geom_violin(scale = "width", alpha = 0.85, trim = TRUE) +
geom_boxplot(width = 0.12, fill = "white", outlier.size = 0.3) +
scale_fill_manual(values = azimuth_l2_colors, na.value = "grey70") +
theme_classic(base_size = 11) +
theme(axis.text.x = element_text(angle = 40, hjust = 1, size = 9),
legend.position = "none",
plot.title = element_text(hjust = 0.5, face = "bold")) +
labs(title = "Reference Pseudotime per Azimuth l2 Label",
subtitle = "Expected order: Naive < TCM < Treg < TEM < Temra",
x = "", y = "Monocle3 Pseudotime")
p_ridge <- ggplot(pt_data, aes(x = pseudotime, y = l2, fill = l2)) +
geom_density_ridges(alpha = 0.75, scale = 1.2, rel_min_height = 0.01) +
scale_fill_manual(values = azimuth_l2_colors, na.value = "grey70") +
theme_classic(base_size = 10) +
theme(legend.position = "none",
plot.title = element_text(hjust = 0.5, face = "bold")) +
labs(title = "Reference Pseudotime Density by Azimuth l2 Label",
x = "Pseudotime", y = "")
p_violin | p_ridge
# ════════════════════════════════════════════════════════════════════════════════
# CRITICAL FIX: Rebuild reference PCA in SCT space (preserves UMAP trajectory)
# PROBLEM: reference_integrated PCA = "integrated" assay (50 dims)
# MalignantCD4T PCA = "SCT" assay (30 dims)
# SOLUTION: SCT + PCA on reference → unified SCT space for FindTransferAnchors
# UMAP MODEL 100% SAFE — lives in reductions$umap@misc$model
# ════════════════════════════════════════════════════════════════════════════════
cat("=== DIAGNOSTIC: Current reference state ===\n")=== DIAGNOSTIC: Current reference state ===
DefaultAssay: SCT
SCT HVGs: 0
PCA dims: 50
UMAP model present: TRUE
# 1. Sanity check UMAP before
old_umap_coords <- Embeddings(reference_integrated, "umap")[1:3, ]
old_umap_model <- !is.null(reference_integrated@reductions$umap@misc$model)
cat("\nUMAP first 3 cells (backup):", paste(round(old_umap_coords[1,], 3), collapse=" "), "\n")
UMAP first 3 cells (backup): -2.072 -1.749
# 2. SCT on reference (creates SCT assay + HVGs)
DefaultAssay(reference_integrated) <- "RNA"
reference_integrated <- SCTransform(
reference_integrated,
variable.features.n = 3000,
vst.flavor = "v2",
vars.to.regress = c("percent.mt", "S.Score", "G2M.Score"),
verbose = FALSE
)
# 3. PCA in SCT space (matches MalignantCD4T exactly)
reference_integrated <- RunPCA(
reference_integrated,
assay = "SCT",
npcs = 30, # ← Match query dims exactly
verbose = FALSE
)
# 4. Verify fix
cat("\n=== FIXED: New reference state ===\n")
=== FIXED: New reference state ===
DefaultAssay: SCT
SCT HVGs: 3000
PCA dims: 30
UMAP model preserved: TRUE
# 5. UMAP integrity check
new_umap_coords <- Embeddings(reference_integrated, "umap")[1:3, ]
cat("UMAP unchanged:", all(old_umap_coords == new_umap_coords), "\n")UMAP unchanged: TRUE
✅ Reference now in SCT space — FindTransferAnchors will work for ALL 7 lines
# QUICK PRE-FLIGHT CHECKS (run these 2 lines now)
cat("SCT HVGs:", length(VariableFeatures(reference_integrated, assay = "SCT")), "\n")SCT HVGs: 3000
PCA dims: 30
# ══════════════════════════════════════════════════════════════════════════
# FIND TRANSFER ANCHORS
# ══════════════════════════════════════════════════════════════════════════
# Anchors are mutual nearest neighbours between reference and query in
# shared PCA space. Their quality determines the accuracy of:
# (a) label transfer → which normal state does this Sézary cell resemble?
# (b) pseudotime transfer → where along the differentiation axis is it?
#
# FEATURE SELECTION STRATEGY:
# Step 1 — intersect reference + query HVGs
# Step 2 — remove technical/junk genes (MT, ribosomal, HSP, lncRNA, histone)
# WHY: these genes are variable due to library quality and stress,
# NOT due to T cell differentiation state. Keeping them biases
# anchors toward batch effects rather than biology.
# Step 3 — rank remaining genes by joint residual variance (ref + query)
# WHY: genes highly variable in BOTH datasets are the most
# reliable for finding true biological nearest neighbours
# ══════════════════════════════════════════════════════════════════════════
DefaultAssay(reference_integrated) <- "SCT"
DefaultAssay(MalignantCD4T) <- "SCT"
# ── Step 1: Intersect reference and query HVGs ────────────────────────────
ref_hvgs <- VariableFeatures(reference_integrated, assay = "SCT")
if (length(ref_hvgs) == 0) {
cat("⚠️ No HVGs stored in reference — extracting from scale.data\n")
ref_hvgs <- rownames(GetAssayData(reference_integrated, assay = "SCT",
layer = "scale.data"))
}
query_hvgs <- VariableFeatures(MalignantCD4T)
common_features <- intersect(ref_hvgs, query_hvgs)
cat("=== Feature set construction ===\n")=== Feature set construction ===
Reference HVGs : 3000
Query HVGs : 3000
Common HVGs : 1443
if (length(common_features) < 1500)
warning(paste0("⚠️ Only ", length(common_features),
" common features — below recommended 1500.\n",
" Increase nfeatures in SCTransform to 4000."))
# ── Step 2: Remove technical/junk genes ───────────────────────────────────
# Categories removed and WHY:
# ^MT- : mitochondrial OXPHOS genes — variable due to cell quality/stress
# ^RPL/S : ribosomal proteins — variable due to translation rate/library depth
# ^HSP* : heat shock proteins — variable due to stress, not differentiation
# ^SNHG/MALAT1/NEAT1 : lncRNAs — nuclear architecture, not T cell state
# ^HIST : replication-dependent histones — S-phase markers (cell cycle)
# NOTE: ^HIST added based on pre-run QC showing 20+ histone genes
# in shared HVG set, all driven by cycling subpopulation
junk_pattern <- paste0(
"^RPL|^RPS|", # ribosomal large + small subunits
"^MT-|", # mitochondrial genome (OXPHOS complexes I,III,IV,V)
"^HSP|^HSPA|^HSPB|", # heat shock proteins
"^HSPD|^HSPE|^HSPH|", # heat shock proteins (continued)
"^SNHG|", # small nucleolar RNA host genes
"^MALAT1|^NEAT1|", # abundant nuclear lncRNAs
"^XIST|", # X-inactivation (sex-specific batch)
"^HIST" # replication-dependent histones (S-phase signal)
)
# Report breakdown by category before removing
mt_removed <- sum(grepl("^MT-", common_features))
ribo_removed <- sum(grepl("^RPL|^RPS", common_features))
hsp_removed <- sum(grepl("^HSP", common_features))
lnc_removed <- sum(grepl("^SNHG|^MALAT1|^NEAT1|^XIST", common_features))
hist_removed <- sum(grepl("^HIST", common_features))
cat("\n=== Junk gene removal breakdown ===\n")
=== Junk gene removal breakdown ===
Mitochondrial (OXPHOS) : 13
Ribosomal (RPL/RPS) : 32
Heat shock (HSP*) : 12
lncRNA (SNHG/MALAT1/NEAT1): 12
Histones (HIST*) : 10
n_before <- length(common_features)
common_features_clean <- common_features[!grepl(junk_pattern, common_features)]
n_removed <- n_before - length(common_features_clean)
cat(sprintf(" ─────────────────────────────\n")) ─────────────────────────────
Total removed : 79 (5.5%)
Clean features remaining : 1364
if (length(common_features_clean) < 1200)
warning("⚠️ Fewer than 1200 clean features — anchor quality may be reduced.")
# ── Step 3: Rank by joint residual variance ───────────────────────────────
# Genes that are highly variable in BOTH the healthy reference AND the
# malignant query are the most informative for finding true biological
# nearest neighbours. Ranking by joint variance ensures differentiation
# genes (CCR7, TCF7, GZMK, FOXP3) rank above line-specific noise.
ref_var_meta <- reference_integrated[["SCT"]]@meta.features
query_var_meta <- MalignantCD4T[["SCT"]]@meta.features
if ("residual_variance" %in% colnames(ref_var_meta) &&
"residual_variance" %in% colnames(query_var_meta)) {
ref_rv <- ref_var_meta[common_features_clean, "residual_variance"]
query_rv <- query_var_meta[common_features_clean, "residual_variance"]
ref_rv[is.na(ref_rv)] <- 0
query_rv[is.na(query_rv)] <- 0
mean_rv <- (ref_rv + query_rv) / 2
common_features <- common_features_clean[order(mean_rv, decreasing = TRUE)]
cat("Top 20 features by joint residual variance:\n")
print(head(common_features, 20))
cat("\n")
} else {
cat("⚠️ residual_variance not in SCT meta.features — using unranked clean list\n")
cat(" This is OK but ranking would improve anchor quality.\n")
common_features <- common_features_clean
}⚠️ residual_variance not in SCT meta.features — using unranked clean list
This is OK but ranking would improve anchor quality.
# ── Step 4: Canonical T cell marker check ─────────────────────────────────
# These markers span the full differentiation axis and Treg branch.
# If fewer than 5 are present, the feature set cannot distinguish
# differentiation states and anchor quality will be poor.
markers_check <- c(
"CCR7", "SELL", "TCF7", "IL7R", # Naive/TCM
"GZMB", "GZMK", "PRF1", "EOMES", # Effector/TEM
"FOXP3", "IKZF2", # Treg
"TOX", "PDCD1", "LAG3", # Exhaustion
"CD44", "TBX21" # Activation
)
found <- markers_check[markers_check %in% common_features]
not_found <- markers_check[!markers_check %in% common_features]
cat("=== Canonical T cell marker check ===\n")=== Canonical T cell marker check ===
Present ( 8 ): CCR7, SELL, TCF7, IL7R, PRF1, IKZF2, TOX, CD44
Absent ( 7 ): GZMB, GZMK, EOMES, FOXP3, PDCD1, LAG3, TBX21
if (length(found) < 5) {
warning("⚠️ Fewer than 5 canonical markers present — anchor quality at risk.")
} else if (length(found) < 8) {
cat("⚠️ Some markers absent — likely heterogeneous across lines (expected)\n")
cat(" Projection will still work but interpret per-line results carefully\n")
} else {
cat("✅ Good marker coverage — feature set captures differentiation axis\n")
}✅ Good marker coverage — feature set captures differentiation axis
# ── Step 5: FindTransferAnchors ───────────────────────────────────────────
# k.anchor = 10 : neighbours used to find anchors — balanced for ~11k ref
# k.filter = 500 : neighbours for anchor filtering — removes weak anchors
# k.score = 30 : neighbours for anchor scoring — weights anchor quality
# dims = 1:30: use all 30 PCA dims — captures full differentiation space
dims_to_use <- min(30,
ncol(Embeddings(reference_integrated, "pca")),
ncol(Embeddings(MalignantCD4T, "pca")))
cat("\nFinding anchors with dims 1:", dims_to_use, "\n")
Finding anchors with dims 1: 30
Features used: 1364
anchors <- FindTransferAnchors(
reference = reference_integrated,
query = MalignantCD4T,
features = common_features,
normalization.method = "SCT",
reference.reduction = "pca",
dims = 1:dims_to_use,
k.anchor = 10,
k.filter = 500,
k.score = 30,
verbose = TRUE
)
# ── Step 6: Anchor quality check ──────────────────────────────────────────
# NOTE: In Seurat FindTransferAnchors internal convention:
# cell1 = reference indices
# cell2 = query indices (OPPOSITE to what variable names suggest)
# Always use cell1→reference and cell2→query for barcode mapping.
anchor_df <- as.data.frame(slot(anchors, "anchors"))
# CORRECT barcode mapping — cell1=reference, cell2=query
ref_barcodes <- colnames(reference_integrated)
query_barcodes <- colnames(MalignantCD4T)
anchor_df$ref_barcode <- ref_barcodes[anchor_df$cell1]
anchor_df$query_barcode <- query_barcodes[anchor_df$cell2]
anchor_df$query_cellline <- MalignantCD4T$cell_line[anchor_df$query_barcode]
anchor_df$ref_label <- reference_integrated$predicted.celltype.l2[anchor_df$ref_barcode]
# Verify no NAs
cat("\n=== Barcode mapping validation ===\n")
=== Barcode mapping validation ===
NA ref barcodes : 0
NA query barcodes: 0
NA cell lines : 0
n_anchors <- nrow(anchor_df)
anchor_ratio <- ncol(MalignantCD4T) / n_anchors
anchor_scores <- anchor_df$score
cat("\n=== Anchor quality summary ===\n")
=== Anchor quality summary ===
Anchors found : 8021
Malignant cells : 40695
Reference cells : 11466
Cells per anchor : 5.1 : 1 (ideal ≤ 8:1)
cat(sprintf("Anchor scores : mean=%.3f | median=%.3f | min=%.3f | max=%.3f\n",
mean(anchor_scores), median(anchor_scores),
min(anchor_scores), max(anchor_scores)))Anchor scores : mean=0.535 | median=0.500 | min=0.000 | max=1.000
=== Anchors per cell line ===
L1 L2 L3 L4 L5 L6 L7
2062 1699 869 1025 706 926 734
=== Anchors by reference label ===
anchor_df %>%
group_by(ref_label) %>%
summarise(n_anchors = n(),
mean_score = round(mean(score), 3),
.groups = "drop") %>%
arrange(desc(n_anchors)) %>%
print()
if (anchor_ratio > 8) {
warning(paste0(
"⚠️ Low anchor density (", round(anchor_ratio, 1), ":1).\n",
" 1. Junk genes still dominating PCA — check top 20 features above\n",
" 2. Sézary cells too transcriptionally distant from reference\n",
" 3. Increase k.anchor to 15 and rerun"
))
} else if (anchor_ratio > 5) {
cat(sprintf("⚠️ Moderate anchor density (%.1f:1) — acceptable\n", anchor_ratio))
} else {
cat(sprintf("✅ Good anchor density (%.1f:1) — projection quality should be high\n",
anchor_ratio))
}⚠️ Moderate anchor density (5.1:1) — acceptable
# ── ANCHOR DIAGNOSTIC: Biology vs Artefact ────────────────────────────────
# If L3-L7 are truly dedifferentiated, their PCA space should be
# DISTANT from the reference. If it is a technical issue, they will
# sit in the same PCA space as L1/L2 but just not form mutual NN.
cat("=== PCA space comparison: L1/L2 vs L3-L7 ===\n")=== PCA space comparison: L1/L2 vs L3-L7 ===
# Get reference PCA centroid (TCM region)
ref_pca <- Embeddings(reference_integrated, "pca")[, 1:10]
ref_centroid <- colMeans(ref_pca)
# Get query PCA embeddings per line
query_pca <- Embeddings(MalignantCD4T, "pca")[, 1:10]
# Distance from each query cell to reference centroid
dist_to_ref <- rowSums(
sweep(query_pca, 2, ref_centroid, "-")^2
)
# Compare distances by cell line
dist_df <- data.frame(
cell = colnames(MalignantCD4T),
cell_line = MalignantCD4T$cell_line,
dist_to_ref = dist_to_ref
)
dist_summary <- dist_df %>%
group_by(cell_line) %>%
summarise(
mean_dist = round(mean(dist_to_ref), 2),
median_dist = round(median(dist_to_ref), 2),
.groups = "drop"
) %>%
arrange(mean_dist)
cat("\nDistance from reference PCA centroid per cell line:\n")
Distance from reference PCA centroid per cell line:
(Lower = more similar to healthy reference)
=== HVG overlap with reference per line ===
ref_hvgs_check <- VariableFeatures(reference_integrated, assay = "SCT")
for (ln in paste0("L", 1:7)) {
line_cells <- colnames(MalignantCD4T)[MalignantCD4T$cell_line == ln]
# Check expression of key T cell markers in each line
markers <- c("CCR7", "TCF7", "SELL", "IL7R", "GZMB", "TOX", "KIR3DL2")
markers_present <- intersect(markers, rownames(MalignantCD4T))
expr <- rowMeans(
GetAssayData(MalignantCD4T, assay = "SCT", layer = "data")[
markers_present, line_cells, drop = FALSE]
)
cat(sprintf("\n%s — key marker expression:\n", ln))
print(round(expr, 3))
}
L1 — key marker expression:
CCR7 TCF7 SELL IL7R GZMB TOX KIR3DL2
0.627 0.465 0.634 0.217 0.026 0.359 1.068
L2 — key marker expression:
CCR7 TCF7 SELL IL7R GZMB TOX KIR3DL2
0.148 0.437 0.000 0.062 0.088 1.222 1.871
L3 — key marker expression:
CCR7 TCF7 SELL IL7R GZMB TOX KIR3DL2
1.015 0.060 0.018 0.042 0.002 0.020 0.000
L4 — key marker expression:
CCR7 TCF7 SELL IL7R GZMB TOX KIR3DL2
1.090 0.001 0.221 0.034 0.007 0.422 0.000
L5 — key marker expression:
CCR7 TCF7 SELL IL7R GZMB TOX KIR3DL2
1.218 0.011 0.135 0.311 0.000 0.147 0.624
L6 — key marker expression:
CCR7 TCF7 SELL IL7R GZMB TOX KIR3DL2
0.713 0.020 0.092 0.401 0.425 0.171 0.880
L7 — key marker expression:
CCR7 TCF7 SELL IL7R GZMB TOX KIR3DL2
0.849 0.000 0.247 0.145 0.028 0.191 0.855
# Show TOP reference states each cell line connects to
cat("\n=== WHAT EACH CELL LINE ANCHORS TO ===\n")
=== WHAT EACH CELL LINE ANCHORS TO ===
anchor_df %>%
group_by(query_cellline, ref_label) %>%
summarise(
n_anchors = n(),
mean_score = round(mean(score), 3),
.groups = "drop"
) %>%
arrange(query_cellline, desc(n_anchors)) %>%
print(n = 30) # Show all combinations=== MapQuery projection ===
# MapQuery simultaneously:
# 1. Transfers labels + pseudotime from reference to query (TransferData)
# 2. Projects query PCA into reference PCA space (IntegrateEmbeddings)
# 3. Projects query onto frozen reference UMAP (ProjectUMAP)
# Malignant cells do NOT alter the reference — they are read-only passengers.
mapped_MalignantCD4T <- MapQuery(
anchorset = anchors,
query = MalignantCD4T,
reference = reference_integrated,
refdata = list(
predicted.celltype.l2 = "predicted.celltype.l2", # PRIMARY: state label
pseudotime = "monocle3_pseudotime" # CONTINUOUS: position
),
reference.reduction = "pca",
reduction.model = "umap"
)0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
| | 0 % ~calculating
|++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed=07s
Using method 'umap'
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
✅ MapQuery complete
Mapped cells: 40695
# Coerce pseudotime to numeric — Seurat TransferData returns character
mapped_MalignantCD4T$predicted.pseudotime <- as.numeric(
mapped_MalignantCD4T$predicted.pseudotime
)
cat("\nTransferred pseudotime summary:\n")
Transferred pseudotime summary:
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.156 16.431 17.703 17.655 19.487 25.553
Transferred Azimuth l2 distribution:
CD4 Naive CD4 TCM CD4 TEM Treg
21 40291 304 79
=== Diversity check ===
l2_table <- table(mapped_MalignantCD4T$predicted.predicted.celltype.l2, useNA = "always")
l2_dist <- prop.table(l2_table)
cat("Label distribution:\n")Label distribution:
CD4 Naive CD4 TCM CD4 TEM Treg <NA>
21 40291 304 79 0
na_count <- l2_table["<NA>"]
na_pct <- ifelse(is.na(na_count), 0, round(100 * na_pct, 1))
# Non-NA predictions only
non_na_table <- l2_table[!names(l2_table) %in% "<NA>"]
if (length(non_na_table) > 0) {
non_na_dist <- prop.table(non_na_table)
max_state <- names(which.max(non_na_dist))
max_pct <- round(100 * max(non_na_dist), 1)
cat(sprintf("\nNA fraction: %.1f%%\n", na_pct))
cat(sprintf("Dominant state: %s (%.1f%% of non-NA)\n", max_state, max_pct))
if (max_pct > 90) {
warning(sprintf("⚠️ %s dominates (%.1f%%) — check SCT/feature filtering", max_state, max_pct))
} else {
cat("✅ Diverse mapping across states\n")
}
} else {
cat("⚠️ No confident predictions made\n")
}
NA fraction: 0.0%
Dominant state: CD4 TCM (99.0% of non-NA)
# Sézary-specific diversity check
dominant_pct <- round(100 * max(prop.table(non_na_table)), 1)
if (dominant_pct > 95) {
cat(sprintf("✅ TCM-dominant (%.1f%%) = CLASSIC SÉZARY PHENOTYPE\n", dominant_pct))
cat("Minor subclones (TEM/Treg/Naive) = expected heterogeneity\n")
} else {
cat(sprintf("✅ Diverse mapping: %s = %.1f%%\n", max_state, dominant_pct))
}✅ TCM-dominant (99.0%) = CLASSIC SÉZARY PHENOTYPE
Minor subclones (TEM/Treg/Naive) = expected heterogeneity
# 1. Per-cell-line breakdown (your thesis gold)
table(mapped_MalignantCD4T$cell_line, mapped_MalignantCD4T$predicted.predicted.celltype.l2)
CD4 Naive CD4 TCM CD4 TEM Treg
L1 21 5800 2 2
L2 0 5893 42 0
L3 0 6168 253 7
L4 0 6000 1 5
L5 0 6008 6 8
L6 0 5102 0 46
L7 0 5320 0 11
# 2. Plot on trajectory UMAP
DimPlot(mapped_MalignantCD4T, group.by = "predicted.predicted.celltype.l2", reduction = "ref.umap")
# 3. Pseudotime violin by cell line
VlnPlot(mapped_MalignantCD4T, features = "predicted.pseudotime", group.by = "cell_line", pt.size = 0)p_l2 <- ggplot() +
geom_point(data = ref_umap,
aes(x = UMAP_1, y = UMAP_2),
color = reference_color, size = 0.4, alpha = 0.6) +
geom_point(data = query_umap,
aes(x = UMAP_1, y = UMAP_2, color = l2_q),
size = 1.2, alpha = 0.9) +
scale_color_manual(values = azimuth_l2_colors,
name = "Azimuth l2\n(transferred)", na.value = "grey40") +
theme_classic(base_size = 11) +
theme(plot.title = element_text(hjust = 0.5, face = "bold")) +
ggtitle("Sézary Cells — Transferred Azimuth l2 Labels")
p_lines <- ggplot() +
geom_point(data = ref_umap,
aes(x = UMAP_1, y = UMAP_2),
color = reference_color, size = 0.4, alpha = 0.5) +
geom_point(data = query_umap,
aes(x = UMAP_1, y = UMAP_2, color = cell_line),
size = 1.0, alpha = 0.9) +
scale_color_manual(values = cell_line_palette, name = "Cell line") +
theme_classic(base_size = 11) +
theme(plot.title = element_text(hjust = 0.5, face = "bold")) +
ggtitle("Sézary Cells per Cell Line")
p_l2 | p_linesggplot() +
geom_point(data = ref_umap,
aes(x = UMAP_1, y = UMAP_2),
color = "grey60", size = 0.3, alpha = 0.5) +
geom_point(data = query_umap %>% filter(is.finite(pseudotime)),
aes(x = UMAP_1, y = UMAP_2, color = pseudotime),
size = 0.9, alpha = 0.85) +
facet_wrap(~ cell_line, ncol = 4) +
scale_color_viridis_c(option = "plasma", name = "Pseudotime",
na.value = "grey60") +
theme_classic(base_size = 10) +
theme(strip.text = element_text(size = 9, face = "bold"),
strip.background = element_rect(fill = "#E8F4FD"),
plot.title = element_text(hjust = 0.5, face = "bold")) +
ggtitle("Per Cell Line — Projection onto Reference UMAP (Pseudotime)")# ══════════════════════════════════════════════════════════════════════════
# PSEUDOTIME BIN ASSIGNMENT — STATE-ANCHORED BOUNDARIES
# ══════════════════════════════════════════════════════════════════════════
#
# Bin boundaries = midpoint between adjacent reference state medians.
# This is biologically principled: the boundary between TCM and TEM bins
# is exactly halfway between the median TCM pseudotime and median TEM
# pseudotime in the healthy reference — NOT an arbitrary 33rd/66th
# percentile cut.
#
# TWO LINEAGES modelled:
# Effector axis (pseudotime-based):
# Naive-like → below midpoint(Naive, TCM)
# TCM-like → between midpoint(Naive,TCM) and midpoint(TCM,TEM)
# TEM-like → between midpoint(TCM,TEM) and midpoint(TEM,Temra)
# Temra-like → above midpoint(TEM,Temra)
#
# Regulatory branch (label-based):
# Treg-like → transferred Azimuth label == "Treg" at ANY pseudotime
# RATIONALE: Treg is a branch from TCM, not a linear position on the
# effector axis. Pseudotime for Treg cells is lower than TEM by design
# (correct topology), but using pseudotime alone would misassign some
# Treg cells to TCM-like. Label + branch logic is more accurate.
# IMPORTANT: This means Treg-like bin % will equal Treg label % exactly.
# The label vs pseudotime bin comparison is only meaningful for the
# effector axis (Naive/TCM/TEM/Temra).
# ── Step 1: Reference state median pseudotimes ────────────────────────────
# These were already computed in pseudotime-root chunk (naive_med, tcm_med
# etc.) but we recompute here for clarity and chunk independence.
pt_state_medians <- reference_integrated@meta.data %>%
filter(is.finite(monocle3_pseudotime)) %>%
group_by(predicted.celltype.l2) %>%
summarise(
n = n(),
med_pt = round(median(monocle3_pseudotime, na.rm = TRUE), 3),
.groups = "drop"
) %>%
arrange(med_pt)
cat("=== Reference state median pseudotimes ===\n")=== Reference state median pseudotimes ===
# Extract medians (overwrite with fresh values from this chunk)
naive_med <- pt_state_medians$med_pt[
pt_state_medians$predicted.celltype.l2 == "CD4 Naive"]
tcm_med <- pt_state_medians$med_pt[
pt_state_medians$predicted.celltype.l2 == "CD4 TCM"]
treg_med <- pt_state_medians$med_pt[
pt_state_medians$predicted.celltype.l2 == "Treg"]
tem_med <- pt_state_medians$med_pt[
pt_state_medians$predicted.celltype.l2 == "CD4 TEM"]
temra_med <- pt_state_medians$med_pt[
pt_state_medians$predicted.celltype.l2 == "CD4 Temra/CTL"]
# ── Step 2: Bin boundaries as midpoints between state medians ─────────────
naive_tcm_cut <- (naive_med + tcm_med) / 2
tcm_tem_cut <- (tcm_med + tem_med) / 2
tem_temra_cut <- (tem_med + temra_med) / 2
tcm_treg_cut <- (tcm_med + treg_med) / 2 # stored for reference only
cat("\n=== Biology-anchored bin boundaries ===\n")
=== Biology-anchored bin boundaries ===
Naive | TCM : 8.677
TCM | TEM : 19.070 (main effector axis)
TEM | Temra: 25.017
cat(sprintf(" TCM | Treg : %.3f (regulatory branch — label-based, not used as cut)\n",
tcm_treg_cut)) TCM | Treg : 16.057 (regulatory branch — label-based, not used as cut)
# ── Step 3: Assign working columns ────────────────────────────────────────
mapped_MalignantCD4T$state_azimuth_l2 <-
mapped_MalignantCD4T$predicted.predicted.celltype.l2
mapped_MalignantCD4T$pseudotime_value <-
as.numeric(mapped_MalignantCD4T$predicted.pseudotime)
# ── Step 4: Assign pseudotime bins ────────────────────────────────────────
mapped_MalignantCD4T$pseudotime_bin <- case_when(
# Treg lineage — label takes priority over pseudotime position
# because Treg is a branch, not a point on the linear effector axis
mapped_MalignantCD4T$state_azimuth_l2 == "Treg" ~ "Treg-like",
# Effector axis bins — pseudotime-based
mapped_MalignantCD4T$pseudotime_value < naive_tcm_cut ~ "Naive-like",
mapped_MalignantCD4T$pseudotime_value >= naive_tcm_cut &
mapped_MalignantCD4T$pseudotime_value < tcm_tem_cut ~ "TCM-like",
mapped_MalignantCD4T$pseudotime_value >= tcm_tem_cut &
mapped_MalignantCD4T$pseudotime_value < tem_temra_cut ~ "TEM-like",
mapped_MalignantCD4T$pseudotime_value >= tem_temra_cut ~ "Temra-like",
TRUE ~ NA_character_
)
# Set factor levels in biological trajectory order
mapped_MalignantCD4T$pseudotime_bin <- factor(
mapped_MalignantCD4T$pseudotime_bin,
levels = c("Naive-like", "TCM-like", "Treg-like", "TEM-like", "Temra-like")
)
# ── Step 5: Treg count check ──────────────────────────────────────────────
# FIX: added as identified in expert review. If <50 Treg-labelled cells,
# the Treg-like bin is negligible and should be flagged.
n_treg <- sum(mapped_MalignantCD4T$state_azimuth_l2 == "Treg", na.rm = TRUE)
cat(sprintf("\nTreg-labelled malignant cells: %d (%.2f%% of total)\n",
n_treg,
100 * n_treg / ncol(mapped_MalignantCD4T)))
Treg-labelled malignant cells: 79 (0.19% of total)
if (n_treg == 0) {
message("ℹ️ No Treg-labelled cells — Treg-like bin will be empty.\n",
" This is biologically plausible: Sézary cells rarely map to Treg.\n",
" The Treg bin column is retained for completeness.")
} else if (n_treg < 50) {
message(sprintf(
"⚠️ Only %d Treg-labelled cells (%.2f%%).\n", n_treg,
100 * n_treg / ncol(mapped_MalignantCD4T)),
" Treg-like bin exists but is very small — interpret with caution.\n",
" Consider noting this as 'rare/absent Treg identity' in results.")
} else {
cat(sprintf("✅ Treg-like bin has %d cells — sufficient for reporting.\n", n_treg))
}✅ Treg-like bin has 79 cells — sufficient for reporting.
Note: Treg-like bin % equals Treg label % by design (label-based assignment).
Label vs pseudotime bin discordance is only interpretable for the
effector axis: Naive-like / TCM-like / TEM-like / Temra-like.
# ── Step 6: Results ───────────────────────────────────────────────────────
cat("\n=== State assignment complete ===\n")
=== State assignment complete ===
Azimuth l2 label (cell of origin — WHAT the cell resembles):
CD4 Naive CD4 TCM CD4 TEM Treg
21 40291 304 79
Pseudotime bin (differentiation position — WHERE the cell sits):
Naive-like TCM-like Treg-like TEM-like Temra-like
2734 26389 79 10415 1078
# ── KEY RESULT: cross-table ───────────────────────────────────────────────
cat("\n═══════════════════════════════════════════════════════════\n")
═══════════════════════════════════════════════════════════
KEY RESULT: Label (cell of origin) vs Pseudotime Bin
═══════════════════════════════════════════════════════════
TCM-labelled cells in TEM/Temra bins = effector skewing
beyond the TCM arrest point
# FIX: explicit column naming from table() output — avoids rename() failure
# when Var1/Var2 column names differ from expected Label/PT_Bin
cross_tab <- table(
Label = mapped_MalignantCD4T$state_azimuth_l2,
PT_Bin = mapped_MalignantCD4T$pseudotime_bin
)
print(cross_tab) PT_Bin
Label Naive-like TCM-like Treg-like TEM-like Temra-like
CD4 Naive 21 0 0 0 0
CD4 TCM 2713 26357 0 10365 856
CD4 TEM 0 32 0 50 222
Treg 0 0 79 0 0
Note: Treg row will show 100% Treg-like — this is by design.
# ── Overall state distribution ─────────────────────────────────────────────
state_summary <- mapped_MalignantCD4T@meta.data %>%
count(state_azimuth_l2, name = "n_cells") %>%
mutate(pct = round(100 * n_cells / sum(n_cells), 1)) %>%
arrange(desc(n_cells))
cat("=== Overall Azimuth l2 State Distribution ===\n")=== Overall Azimuth l2 State Distribution ===
# ── Per cell line ─────────────────────────────────────────────────────────
state_per_line <- mapped_MalignantCD4T@meta.data %>%
count(cell_line, state_azimuth_l2, name = "n_cells") %>%
group_by(cell_line) %>%
mutate(pct = round(100 * n_cells / sum(n_cells), 1)) %>%
ungroup() %>%
arrange(cell_line, desc(n_cells))
cat("\n=== State Distribution per Cell Line ===\n")
=== State Distribution per Cell Line ===
# ── Pseudotime bin distribution ───────────────────────────────────────────
pt_bin_summary <- mapped_MalignantCD4T@meta.data %>%
filter(!is.na(pseudotime_bin)) %>%
count(pseudotime_bin, name = "n_cells") %>%
mutate(pct = round(100 * n_cells / sum(n_cells), 1))
cat("\n=== Pseudotime Bin Summary ===\n")
=== Pseudotime Bin Summary ===
pt_bin_per_line <- mapped_MalignantCD4T@meta.data %>%
filter(!is.na(pseudotime_bin)) %>%
count(cell_line, pseudotime_bin, name = "n_cells") %>%
group_by(cell_line) %>%
mutate(pct = round(100 * n_cells / sum(n_cells), 1)) %>%
ungroup()
cat("\n=== Pseudotime Bin per Cell Line ===\n")
=== Pseudotime Bin per Cell Line ===
# Top panel: label transfer (cell of origin)
p_bar_labels <- ggplot(state_per_line,
aes(x = cell_line, y = pct,
fill = state_azimuth_l2)) +
geom_col(position = "stack", color = "white", linewidth = 0.3) +
geom_text(aes(label = ifelse(pct >= 5, paste0(pct, "%"), "")),
position = position_stack(vjust = 0.5),
size = 3, color = "white", fontface = "bold") +
scale_fill_manual(values = azimuth_l2_colors, na.value = "grey70",
name = "Azimuth l2\nstate") +
theme_classic(base_size = 11) +
theme(axis.text.x = element_text(angle = 40, hjust = 1),
plot.title = element_text(hjust = 0.5, face = "bold")) +
labs(title = "Sézary Cell State Composition per Cell Line",
subtitle = "Label transfer: cell of origin (which normal state does this cell resemble?)",
x = "Cell Line", y = "% Cells")
# Bottom panel: pseudotime bins (differentiation position)
p_bar_bins <- ggplot(pt_bin_per_line,
aes(x = cell_line, y = pct,
fill = pseudotime_bin)) +
geom_col(position = "stack", color = "white", linewidth = 0.3) +
geom_text(aes(label = ifelse(pct >= 5, paste0(pct, "%"), "")),
position = position_stack(vjust = 0.5),
size = 3, color = "white", fontface = "bold") +
scale_fill_manual(values = bin_colors, name = "Pseudotime\nBin",
drop = FALSE) +
theme_classic(base_size = 11) +
theme(axis.text.x = element_text(angle = 40, hjust = 1),
plot.title = element_text(hjust = 0.5, face = "bold")) +
labs(title = "Sézary Cell Pseudotime Bin Distribution per Cell Line",
subtitle = paste0(
"Differentiation position — state-anchored boundaries: ",
"Naive|TCM=", round(naive_tcm_cut, 2),
" | TCM|TEM=", round(tcm_tem_cut, 2),
" | TEM|Temra=", round(tem_temra_cut, 2),
" | Treg=label-based"),
x = "Cell Line", y = "% Cells")
p_bar_labels / p_bar_bins +
plot_annotation(
title = "Cell of Origin (Label Transfer) vs Differentiation Position (Pseudotime Bins)\nSézary Syndrome — CD4+ T Cell Trajectory",
theme = theme(plot.title = element_text(hjust = 0.5, size = 13,
face = "bold"))
)# Heatmap 1: Azimuth l2 label proportions per cell line
heatmap_df <- state_per_line %>%
select(cell_line, state_azimuth_l2, pct) %>%
pivot_wider(names_from = state_azimuth_l2,
values_from = pct, values_fill = 0)
heatmap_mat <- as.matrix(heatmap_df[, -1])
rownames(heatmap_mat) <- heatmap_df$cell_line
pheatmap(
heatmap_mat,
color = colorRampPalette(c("white", "#FEE090", "#D73027"))(50),
display_numbers = TRUE, number_format = "%.1f%%",
fontsize_number = 8, fontsize_row = 10, fontsize_col = 9,
angle_col = 45, cluster_rows = TRUE, cluster_cols = TRUE,
main = "% Sézary Cells per Azimuth l2 State\n(Label transfer — cell of origin)",
border_color = "white"
)
# Heatmap 2: pseudotime bin proportions — biological order preserved
bin_heatmap_df <- pt_bin_per_line %>%
select(cell_line, pseudotime_bin, pct) %>%
pivot_wider(names_from = pseudotime_bin,
values_from = pct, values_fill = 0)
# FIX: use any_of() to safely select only columns that exist
# Original used bare select() which errors if a bin column is missing
all_bin_levels <- c("Naive-like", "TCM-like", "Treg-like",
"TEM-like", "Temra-like")
for (b in all_bin_levels) {
if (!b %in% colnames(bin_heatmap_df)) bin_heatmap_df[[b]] <- 0
}
bin_heatmap_df <- bin_heatmap_df %>%
select(cell_line, any_of(all_bin_levels))
bin_mat <- as.matrix(bin_heatmap_df[, -1])
rownames(bin_mat) <- bin_heatmap_df$cell_line
pheatmap(
bin_mat,
color = colorRampPalette(c("white", "#74ADD1", "#D73027"))(50),
display_numbers = TRUE, number_format = "%.1f%%",
fontsize_number = 8, fontsize_row = 10, fontsize_col = 9,
angle_col = 45,
cluster_rows = TRUE,
cluster_cols = FALSE, # preserve biological order of bins
main = "% Sézary Cells per Pseudotime Bin\n(State-anchored boundaries — differentiation position)",
border_color = "white"
)ref_pt <- data.frame(
pseudotime = reference_integrated$monocle3_pseudotime[
is.finite(reference_integrated$monocle3_pseudotime)],
source = "Healthy reference"
)
mal_pt <- data.frame(
pseudotime = mapped_MalignantCD4T$pseudotime_value[
is.finite(mapped_MalignantCD4T$pseudotime_value)],
source = "Sézary cells"
)
pt_combined <- bind_rows(ref_pt, mal_pt)
# Reference state median lines — visual anchors showing where each
# healthy state sits on the pseudotime axis
state_vlines <- data.frame(
state = c("Naive", "TCM", "Treg", "TEM", "Temra"),
med = c(naive_med, tcm_med, treg_med, tem_med, temra_med),
col = c("#2166AC", "#74ADD1","#762A83", "#FEE090","#D73027")
)
p_density <- ggplot(pt_combined, aes(x = pseudotime, fill = source)) +
geom_density(alpha = 0.65, adjust = 1.2) +
geom_vline(data = state_vlines,
aes(xintercept = med, color = state),
linetype = "dashed", linewidth = 0.8) +
scale_color_manual(
values = setNames(state_vlines$col, state_vlines$state),
name = "Reference state\nmedian") +
scale_fill_manual(
values = c("Healthy reference" = "#4575B4",
"Sézary cells" = malignant_color),
name = "") +
theme_classic(base_size = 12) +
theme(legend.position = "right",
plot.title = element_text(hjust = 0.5, face = "bold")) +
labs(title = "Pseudotime Distribution: Healthy Reference vs Sézary Cells",
subtitle = "Peak shift indicates differentiation arrest point | dashed = reference state medians",
x = "Monocle3 Pseudotime", y = "Density")
p_ridge <- mapped_MalignantCD4T@meta.data %>%
filter(is.finite(pseudotime_value)) %>%
ggplot(aes(x = pseudotime_value, y = cell_line, fill = cell_line)) +
geom_density_ridges(alpha = 0.80, scale = 1.3, rel_min_height = 0.01,
quantile_lines = TRUE, quantiles = 2) +
geom_vline(xintercept = c(naive_med, tcm_med, treg_med,
tem_med, temra_med),
linetype = "dashed", color = "grey40", linewidth = 0.5) +
scale_fill_manual(values = cell_line_palette) +
theme_classic(base_size = 11) +
theme(legend.position = "none",
plot.title = element_text(hjust = 0.5, face = "bold")) +
labs(title = "Pseudotime per Cell Line",
subtitle = "Vertical line = median | Dashed = reference state medians",
x = "Pseudotime", y = "")
p_density | p_ridge# FIX: pct columns computed only for non-Treg cells on the effector axis.
# Original script counted all cells including Treg which inflates pct_tcm
# (Treg cells have TCM-range pseudotime and would be counted as TCM-like).
# Now we separate Treg cells and compute effector-axis stats independently.
pt_stats <- mapped_MalignantCD4T@meta.data %>%
filter(is.finite(pseudotime_value)) %>%
group_by(cell_line) %>%
summarise(
n_cells = n(),
n_treg = sum(state_azimuth_l2 == "Treg", na.rm = TRUE),
n_effector = sum(state_azimuth_l2 != "Treg", na.rm = TRUE),
mean_pt = round(mean(pseudotime_value), 3),
median_pt = round(median(pseudotime_value), 3),
sd_pt = round(sd(pseudotime_value), 3),
q25_pt = round(quantile(pseudotime_value, 0.25), 3),
q75_pt = round(quantile(pseudotime_value, 0.75), 3),
# Effector axis pct: computed only on non-Treg cells
pct_naive = round(100 * sum(
state_azimuth_l2 != "Treg" &
pseudotime_value < naive_tcm_cut,
na.rm = TRUE) / n_effector, 1),
pct_tcm = round(100 * sum(
state_azimuth_l2 != "Treg" &
pseudotime_value >= naive_tcm_cut &
pseudotime_value < tcm_tem_cut,
na.rm = TRUE) / n_effector, 1),
pct_tem = round(100 * sum(
state_azimuth_l2 != "Treg" &
pseudotime_value >= tcm_tem_cut &
pseudotime_value < tem_temra_cut,
na.rm = TRUE) / n_effector, 1),
pct_temra = round(100 * sum(
state_azimuth_l2 != "Treg" &
pseudotime_value >= tem_temra_cut,
na.rm = TRUE) / n_effector, 1),
pct_treg = round(100 * n_treg / n_cells, 1),
.groups = "drop"
) %>%
arrange(median_pt)
cat("=== Pseudotime Statistics per Cell Line ===\n")=== Pseudotime Statistics per Cell Line ===
(pct_naive/tcm/tem/temra computed on effector-axis cells only,
pct_treg = Treg-labelled cells as % of all cells)
=== Reference Pseudotime per Azimuth l2 State ===
ref_stats <- reference_integrated@meta.data %>%
filter(is.finite(monocle3_pseudotime), !is.na(predicted.celltype.l2)) %>%
group_by(predicted.celltype.l2) %>%
summarise(
n_cells = n(),
mean_pt = round(mean(monocle3_pseudotime), 3),
median_pt = round(median(monocle3_pseudotime), 3),
.groups = "drop"
) %>%
arrange(median_pt)
print(ref_stats)p_ref <- DimPlot(
reference_integrated, group.by = "predicted.celltype.l2",
reduction = "umap", label = TRUE, repel = TRUE,
label.size = 3, pt.size = 0.4
) +
scale_color_manual(values = azimuth_l2_colors, na.value = "grey40") +
ggtitle("A. Healthy Reference CD4+ T Cells\n(Azimuth l2)") +
theme(plot.title = element_text(hjust = 0.5, size = 11, face = "bold")) +
NoLegend()
p_pt_ref <- FeaturePlot(
reference_integrated, features = "monocle3_pseudotime",
reduction = "umap", cols = c("lightblue", "yellow", "red"),
pt.size = 0.4
) +
ggtitle("B. Reference Pseudotime\n(Naive → Temra + Treg branch)") +
theme(plot.title = element_text(hjust = 0.5, size = 11, face = "bold"))
p_proj <- ggplot() +
geom_point(data = ref_umap,
aes(x = UMAP_1, y = UMAP_2),
color = "grey88", size = 0.3, alpha = 0.6) +
geom_point(data = query_umap %>% filter(is.finite(pseudotime)),
aes(x = UMAP_1, y = UMAP_2, color = pseudotime),
size = 1.0, alpha = 0.85) +
scale_color_viridis_c(option = "plasma", name = "Pseudotime") +
theme_classic(base_size = 10) +
theme(plot.title = element_text(hjust = 0.5, size = 11, face = "bold")) +
ggtitle("C. Sézary Cells Projected\n(Transferred pseudotime)")
top_states <- state_summary %>% head(8) %>% pull(state_azimuth_l2)
state_plot_df <- state_per_line %>% filter(state_azimuth_l2 %in% top_states)
p_quant <- ggplot(state_plot_df,
aes(x = reorder(cell_line, -pct), y = pct,
fill = state_azimuth_l2)) +
geom_col(position = "stack", color = "white", linewidth = 0.3) +
scale_fill_manual(values = azimuth_l2_colors, na.value = "grey40",
name = "Azimuth l2") +
theme_classic(base_size = 10) +
theme(axis.text.x = element_text(angle = 40, hjust = 1, size = 9),
plot.title = element_text(hjust = 0.5, size = 11, face = "bold"),
legend.key.size = unit(0.4, "cm"),
legend.text = element_text(size = 8)) +
labs(title = "D. State Proportions per Cell Line\n(Azimuth l2 label transfer)",
x = "", y = "% Cells")
(p_ref | p_pt_ref) / (p_proj | p_quant) +
plot_annotation(
title = "Sézary Cell Differentiation State Mapping onto Healthy CD4+ T Cell Trajectory",
theme = theme(plot.title = element_text(hjust = 0.5, size = 13,
face = "bold"))
)mal_state_pt <- mapped_MalignantCD4T@meta.data %>%
filter(is.finite(pseudotime_value), !is.na(state_azimuth_l2)) %>%
mutate(state = state_azimuth_l2)
state_order_mal <- mal_state_pt %>%
group_by(state) %>%
summarise(med = median(pseudotime_value), .groups = "drop") %>%
arrange(med) %>%
pull(state)
mal_state_pt$state <- factor(mal_state_pt$state, levels = state_order_mal)
ggplot(mal_state_pt, aes(x = state, y = pseudotime_value, fill = state)) +
geom_violin(scale = "width", alpha = 0.8, trim = TRUE) +
geom_boxplot(width = 0.1, fill = "white", outlier.size = 0.3) +
geom_hline(yintercept = c(naive_med, tcm_med, treg_med,
tem_med, temra_med),
linetype = "dashed", color = "grey50", linewidth = 0.5) +
scale_fill_manual(values = azimuth_l2_colors, na.value = "grey70") +
theme_classic(base_size = 11) +
theme(axis.text.x = element_text(angle = 40, hjust = 1, size = 9),
legend.position = "none",
plot.title = element_text(hjust = 0.5, face = "bold")) +
labs(title = "Transferred Pseudotime per Azimuth l2 State (Sézary Cells)",
subtitle = "Dashed lines = healthy reference state medians",
x = "Transferred State (cell of origin)",
y = "Transferred Pseudotime (differentiation position)")# This is the core result plot.
# For each transferred Azimuth l2 label, shows what fraction of cells
# fall into each pseudotime bin.
# Key message: TCM-labelled cells span TCM, TEM, and Temra bins
# → identity = TCM (cell of origin)
# → position = further along effector axis (partial skewing)
# FIX: build cross_df with explicit column selection from table output
# rather than rename() which can fail if column names differ
cross_df <- as.data.frame(cross_tab)
# table() always returns: Var1 → Label, Var2 → PT_Bin (from our named dims)
# but if table was constructed with named dims "Label" and "PT_Bin"
# then colnames are already Label, PT_Bin, Freq
colnames(cross_df) <- c("Label", "PT_Bin", "n")
cross_df <- cross_df %>%
group_by(Label) %>%
mutate(pct = round(100 * n / sum(n), 1)) %>%
ungroup() %>%
filter(n > 0) %>%
mutate(PT_Bin = factor(PT_Bin,
levels = c("Naive-like", "TCM-like", "Treg-like",
"TEM-like", "Temra-like")))
ggplot(cross_df, aes(x = PT_Bin, y = pct, fill = PT_Bin)) +
geom_col(color = "white", linewidth = 0.3) +
geom_text(aes(label = ifelse(pct >= 3, paste0(pct, "%"), "")),
vjust = -0.3, size = 3, fontface = "bold") +
facet_wrap(~ Label, scales = "free_y", ncol = 3) +
scale_fill_manual(values = bin_colors, drop = FALSE,
name = "Pseudotime Bin") +
theme_classic(base_size = 10) +
theme(axis.text.x = element_text(angle = 40, hjust = 1, size = 8),
strip.text = element_text(face = "bold", size = 9),
strip.background = element_rect(fill = "#E8F4FD"),
legend.position = "none",
plot.title = element_text(hjust = 0.5, face = "bold")) +
labs(
title = "Pseudotime Bin Distribution within Each Transferred Azimuth l2 Label",
subtitle = paste0(
"Core finding: TCM-labelled Sézary cells span multiple pseudotime bins\n",
"→ TCM identity (label) but partial effector skewing (pseudotime position)\n",
"Note: Treg panel shows 100% Treg-like by design (label-based bin assignment)"),
x = "Pseudotime Bin", y = "% Cells within label"
)[1] "C"
options(expressions = 10000)
out_dir <- "results/MalignantCD4T_Monocle3_projection"
if (!dir.exists(out_dir)) dir.create(out_dir, recursive = TRUE)
library(SeuratObject)
SaveSeuratRds(mapped_MalignantCD4T,
file = file.path(out_dir,
"MalignantCD4T_mapped_monocle3_pseudotime_noProlif_ref.Rds"))
# Summary tables
write.csv(state_summary,
file.path(out_dir, "state_distribution_overall.csv"),
row.names = FALSE)
write.csv(state_per_line,
file.path(out_dir, "state_distribution_per_cellline.csv"),
row.names = FALSE)
write.csv(pt_bin_per_line,
file.path(out_dir, "pseudotime_bins_per_cellline.csv"),
row.names = FALSE)
write.csv(pt_stats,
file.path(out_dir, "pseudotime_stats_per_cellline.csv"),
row.names = FALSE)
write.csv(as.data.frame(cross_tab),
file.path(out_dir, "label_vs_pseudotime_bin_crosstable.csv"),
row.names = FALSE)
write.csv(mapped_MalignantCD4T@meta.data,
file.path(out_dir, "MalignantCD4T_full_metadata_with_projection.csv"),
row.names = TRUE)
# Bin boundaries — saved for methods section and reproducibility
bin_boundaries <- data.frame(
boundary = c("Naive_TCM", "TCM_TEM", "TEM_Temra", "TCM_Treg_branch"),
pseudotime_cut = c(naive_tcm_cut, tcm_tem_cut, tem_temra_cut, tcm_treg_cut),
naive_med = naive_med,
tcm_med = tcm_med,
treg_med = treg_med,
tem_med = tem_med,
temra_med = temra_med
)
write.csv(bin_boundaries,
file.path(out_dir, "pseudotime_bin_boundaries.csv"),
row.names = FALSE)
cat("✅ All outputs saved to:", out_dir, "\n\n")✅ All outputs saved to: results/MalignantCD4T_Monocle3_projection
Files:
[1] "MalignantCD4T_full_metadata_with_projection.csv" "MalignantCD4T_mapped_monocle3_pseudotime_noProlif_ref.Rds"
[3] "label_vs_pseudotime_bin_crosstable.csv" "pseudotime_bin_boundaries.csv"
[5] "pseudotime_bins_per_cellline.csv" "pseudotime_stats_per_cellline.csv"
[7] "state_distribution_overall.csv" "state_distribution_per_cellline.csv"