load libraries————————————

Read the Seurat object with readRDS(), since it is stored as an .rds file (written with saveRDS(), not save())


# Read the merged Seurat object from its RDS file on disk
All_samples_Merged <- readRDS(
  file = "../STCAT_Annotation/All_samples_Merged_with_STCAT.rds"
)

1. Subset Normal CD4⁺ T Cells for Reference

# Load required library
library(Seurat)

# Keep only the cells whose `cell_line` is one of the two normal CD4+ T cell
# samples; these form the reference
reference_cd4 <- subset(
  x = All_samples_Merged,
  subset = cell_line %in% c("CD4Tcells_lab", "CD4Tcells_10x")
)

# Abort early when the SCT assay is missing; otherwise make it the default
if (!("SCT" %in% names(reference_cd4@assays))) {
  stop("SCT assay not found in the object. Please run SCTransform first.")
}
DefaultAssay(reference_cd4) <- "SCT"

# Sanity check: report the class of the subset (expected: "Seurat")
print(x = class(reference_cd4))
[1] "Seurat"
attr(,"package")
[1] "SeuratObject"
# Sanity check: list the assays stored in the subset
print(x = names(x = reference_cd4@assays))
[1] "RNA"                          "ADT"                          "prediction.score.celltype.l1"
[4] "prediction.score.celltype.l2" "prediction.score.celltype.l3" "SCT"                         

2. Normalize & Integrate Reference (Accommodate Multiple Donors)


# Split the reference into one Seurat object per `cell_line` value
ref_list <- SplitObject(object = reference_cd4, split.by = "cell_line")

# SCT-normalize every per-sample object independently
ref_list <- lapply(X = ref_list, FUN = function(sample_obj) {
  SCTransform(sample_obj, verbose = FALSE)
})
Warning: The `slot` argument of `SetAssayData()` is deprecated as of SeuratObject 5.0.0.
Please use the `layer` argument instead.Warning: The `slot` argument of `GetAssayData()` is deprecated as of SeuratObject 5.0.0.
Please use the `layer` argument instead.Warning: Different cells and/or features from existing assay SCTWarning: Different cells and/or features from existing assay SCT
# Select the features that will drive the integration across datasets
ref_features <- SelectIntegrationFeatures(object.list = ref_list)

# Prepare SCT integration for the selected anchor features
ref_list <- PrepSCTIntegration(
  object.list = ref_list,
  anchor.features = ref_features
)

# Run PCA on each SCT-normalized subset (required for RPCA).
# PCA is computed AFTER PrepSCTIntegration and restricted to the shared
# integration features, so every dataset's PCA is built on the same feature
# set that FindIntegrationAnchors() uses for reciprocal projection.
ref_list <- lapply(ref_list, function(x) {
  RunPCA(x, assay = "SCT", features = ref_features, verbose = FALSE)
})

  |                                                  | 0 % ~calculating  
  |+++++++++++++++++++++++++                         | 50% ~01s          
  |++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed=01s  
# Identify integration anchors between the per-sample objects using
# reciprocal PCA on the SCT-normalized data
ref_anchors <- FindIntegrationAnchors(
  object.list = ref_list,
  normalization.method = "SCT",
  reduction = "rpca",
  anchor.features = ref_features
)
Computing within dataset neighborhoods

  |                                                  | 0 % ~calculating  
  |+++++++++++++++++++++++++                         | 50% ~01s          
  |++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed=02s  
Finding all pairwise anchors

  |                                                  | 0 % ~calculating  
Projecting new data onto SVD
Projecting new data onto SVD
Finding neighborhoods
Finding anchors
    Found 630 anchors

  |++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed=04s  
# Build the integrated object from the anchor set (SCT workflow)
reference_integrated <- IntegrateData(
  normalization.method = "SCT",
  anchorset = ref_anchors
)
[1] 1
Warning: Different cells and/or features from existing assay SCTWarning: Layer counts isn't present in the assay object; returning NULL
[1] 2
Warning: Different cells and/or features from existing assay SCTWarning: Layer counts isn't present in the assay object; returning NULLMerging dataset 2 into 1
Extracting anchors for merged samples
Finding integration vectors
Finding integration vector weights
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Integrating data
Warning: Layer counts isn't present in the assay object; returning NULLWarning: Assay integrated changing from Assay to SCTAssayWarning: Layer counts isn't present in the assay object; returning NULLWarning: Different cells and/or features from existing assay SCT
# Make the "integrated" assay the default for the downstream steps
DefaultAssay(object = reference_integrated) <- "integrated"

3. Clustering & Dimensionality Reduction


# PCA on the integrated assay, then UMAP on the first 18 PCs.
# return.model = TRUE stores the fitted UMAP model in the reduction; this is
# required later by MapQuery(reduction.model = "umap") to project query cells
# into the reference UMAP.
reference_integrated <- RunPCA(reference_integrated, verbose = FALSE)
reference_integrated <- RunUMAP(
  reference_integrated,
  dims = 1:18,
  return.model = TRUE
)
Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
This message will be shown once per session12:29:11 UMAP embedding parameters a = 0.9922 b = 1.112
12:29:11 Read 8610 rows and found 18 numeric columns
12:29:11 Using Annoy for neighbor search, n_neighbors = 30
12:29:11 Building Annoy index with metric = cosine, n_trees = 50
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
12:29:12 Writing NN index file to temp file /tmp/RtmpgsQ1lC/file1a753457555a35
12:29:12 Searching Annoy index using 1 thread, search_k = 3000
12:29:14 Annoy recall = 100%
12:29:14 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
12:29:15 Initializing from normalized Laplacian + noise (using RSpectra)
12:29:15 Commencing optimization for 500 epochs, with 361792 positive edges
12:29:15 Using rng type: pcg
Using method 'umap'
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
12:29:22 Optimization finished
# Build the nearest-neighbor / SNN graph on the first 18 PCs
reference_integrated <- FindNeighbors(
  object = reference_integrated,
  dims = seq_len(18)
)
Computing nearest neighbor graph
Computing SNN
# Cluster the cells on the SNN graph at resolution 0.3
reference_integrated <- FindClusters(
  object = reference_integrated,
  resolution = 0.3
)
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck

Number of nodes: 8610
Number of edges: 308059

Running Louvain algorithm...
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.9201
Number of communities: 11
Elapsed time: 0 seconds
# Elbow plot of the first 50 principal components
ElbowPlot(object = reference_integrated, ndims = 50)


# Visualize UMAP colored by original donor (cell_line)
DimPlot(reference_integrated, group.by = "cell_line", reduction = "umap") +
  ggtitle("UMAP of Integrated CD4⁺ T Cells by cell_line")


# Visualize UMAP colored by cluster (integrated SNN graph, resolution 0.3)
DimPlot(reference_integrated, group.by = "integrated_snn_res.0.3", reduction = "umap") +
  ggtitle("UMAP of Integrated CD4⁺ T Cells by Cluster (res 0.3)")


# Visualize UMAP colored by the `Prediction` metadata column
DimPlot(reference_integrated, group.by = "Prediction", reduction = "umap") +
  ggtitle("UMAP of Integrated CD4⁺ T Cells by Prediction")


# Visualize UMAP colored by the `predicted.celltype.l2` metadata column
DimPlot(reference_integrated, group.by = "predicted.celltype.l2", reduction = "umap") +
  ggtitle("UMAP of Integrated CD4⁺ T Cells by predicted.celltype.l2")

Save the integrated CD4⁺ T cell reference (before the Monocle3 trajectory step):



# Write the integrated reference object to disk as an RDS file
saveRDS(
  object = reference_integrated,
  file = "sezary_cell_lines_mapped_to_cd4_reference_integrated_before_Monocle3.rds"
)

4. Trajectory and Pseudotime with Monocle3

library(monocle3)
library(SeuratWrappers)
library(Matrix)

Attaching package: ‘Matrix’

The following object is masked from ‘package:S4Vectors’:

    expand
# Convert the integrated Seurat object into a monocle3 cell_data_set
cds <- as.cell_data_set(x = reference_integrated)
Warning: `PackageCheck()` was deprecated in SeuratObject 5.0.0.
Please use `rlang::check_installed()` instead.Warning: Monocle 3 trajectories require cluster partitions, which Seurat does not calculate. Please run 'cluster_cells' on your cell_data_set object
# Cluster the cells on the UMAP embedding, then learn the principal graph
# using the resulting partitions
cds <- cluster_cells(cds = cds, reduction_method = "UMAP")
cds <- learn_graph(cds = cds, use_partition = TRUE)

  |                                                                                                                     
  |                                                                                                               |   0%
  |                                                                                                                     
  |===============================================================================================================| 100%

  |                                                                                                                     
  |                                                                                                               |   0%
  |                                                                                                                     
  |===============================================================================================================| 100%

  |                                                                                                                     
  |                                                                                                               |   0%
  |                                                                                                                     
  |===============================================================================================================| 100%
# Choose trajectory root cells as the cells with the highest mean expression
# of naive T cell markers, then order cells and store pseudotime.
naive_markers <- c("CCR7", "SELL", "LEF1")
naive_markers <- naive_markers[naive_markers %in% rownames(cds)]
if (length(naive_markers) == 0) {
  # Without this guard the score below is NaN for every cell and quantile()
  # fails with an unrelated-looking error.
  stop(
    "None of the naive markers (CCR7, SELL, LEF1) are present in `cds`; ",
    "cannot select root cells.",
    call. = FALSE
  )
}

# Extract log-normalized expression or fallback to counts log-transformed
if ("logcounts" %in% assayNames(cds)) {
  expr_mat <- assay(cds, "logcounts")
} else {
  expr_mat <- log1p(assay(cds, "counts"))
}

# Per-cell naive score = mean expression over the available markers;
# root cells are those strictly above the 95th percentile of that score.
naive_score <- Matrix::colMeans(expr_mat[naive_markers, , drop = FALSE])
threshold <- quantile(naive_score, 0.95)
root_cells <- names(naive_score[naive_score > threshold])
if (length(root_cells) == 0) {
  # Happens when the top of the score distribution is tied (e.g. all zeros)
  stop(
    "No cell has a naive score above the 95th percentile; ",
    "cannot select root cells.",
    call. = FALSE
  )
}

cds <- order_cells(cds, root_cells = root_cells)

# Copy pseudotime back to the Seurat object, aligned explicitly by cell name
cell_pseudotime <- pseudotime(cds)
stopifnot(all(colnames(reference_integrated) %in% names(cell_pseudotime)))
reference_integrated$pseudotime <- cell_pseudotime[colnames(reference_integrated)]

n_non_finite <- sum(!is.finite(reference_integrated$pseudotime))
if (n_non_finite > 0) {
  warning(
    n_non_finite, " reference cells have non-finite pseudotime ",
    "(not reachable from the chosen root cells).",
    call. = FALSE
  )
}

plot_cells(cds, color_cells_by = "pseudotime", show_trajectory_graph = TRUE)
Cells aren't colored in a way that allows them to be grouped.

# Visualize UMAP colored by the `Prediction` metadata column
DimPlot(reference_integrated, group.by = "Prediction", reduction = "umap") +
  ggtitle("UMAP of Integrated CD4⁺ T Cells by Prediction")


# Visualize UMAP colored by the `predicted.celltype.l2` metadata column
DimPlot(reference_integrated, group.by = "predicted.celltype.l2", reduction = "umap") +
  ggtitle("UMAP of Integrated CD4⁺ T Cells by predicted.celltype.l2")

Save the integrated reference with pseudotime added (before query projection):



# Write the integrated reference (now carrying pseudotime) to disk
saveRDS(
  object = reference_integrated,
  file = "sezary_cell_lines_mapped_to_cd4_reference_integrated_before_Query_Projection.rds"
)

5. Prepare Sézary Syndrome Cell Lines as Query

# Load required packages
library(Seurat)
library(glmGamPoi)   # Recommended for memory-efficient SCTransform

Attaching package: ‘glmGamPoi’

The following object is masked from ‘package:ggplot2’:

    vars

The following object is masked from ‘package:dplyr’:

    vars
library(future)

# Optional: run future-enabled steps on 4 background R sessions and raise the
# limit on exported globals to 50 GB
plan(strategy = "multisession", workers = 4)
options(future.globals.maxSize = 50 * 1024^3)

# Step 1: keep only the cells whose `cell_line` is L1 ... L7
query_subset <- subset(
  x = All_samples_Merged,
  subset = cell_line %in% paste0("L", seq_len(7))
)
gc()
             used    (Mb) gc trigger    (Mb)   max used    (Mb)
Ncells    9283855   495.9   14812904   791.1   14812904   791.1
Vcells 2971711686 22672.4 4735597000 36129.8 4085964052 31173.5
# Step 2: Get raw counts matrix from RNA assay
# (`layer` replaces the `slot` argument deprecated in SeuratObject 5.0.0)
query_raw <- GetAssayData(query_subset, assay = "RNA", layer = "counts")

# Step 3: Filter out genes expressed in <3 cells (saves memory)
# Matrix::rowSums keeps the computation sparse; drop = FALSE keeps a matrix
# even if only one gene passes the filter.
keep_genes <- Matrix::rowSums(query_raw > 0) >= 3
query_raw_filtered <- query_raw[keep_genes, , drop = FALSE]

# Step 4: Create a new Seurat object with metadata preserved
query_cells <- CreateSeuratObject(
  counts = query_raw_filtered,
  meta.data = query_subset[[]]
)

# Step 5: Run SCTransform with glmGamPoi backend for better performance
query_cells <- SCTransform(
  query_cells,
  method = "glmGamPoi",               # Faster and uses less RAM
  variable.features.n = 3000,         # Optional: focus on top 3000 variable genes
  verbose = FALSE
)

6. Inject Cell Lines into Reference with MapQuery


# Find anchors between reference and query
anchors_query <- FindTransferAnchors(
  reference = reference_integrated,
  query = query_cells,
  reference.reduction = "pca",
  normalization.method = "SCT",
  dims = 1:18
)

# Pseudotime is continuous, so it must not be transferred as a label (a
# metadata column name is treated as categorical). Pass it as a
# 1 x n_cells feature matrix instead, so each query cell receives an
# anchor-weighted average of reference pseudotime.
# NOTE(review): non-finite reference pseudotime (cells unreachable from the
# root) will propagate into the transferred values — confirm how these
# should be handled.
ref_pseudotime <- matrix(
  data = reference_integrated$pseudotime,
  nrow = 1,
  dimnames = list("refpseudotime", colnames(reference_integrated))
)

# Map query cells to reference.
# The former `trajectory_position = "monocle3_embedding"` entry was removed:
# no such metadata column or assay is ever created on the reference.
# NOTE: reduction.model = "umap" requires the reference UMAP to have been
# computed with RunUMAP(..., return.model = TRUE).
query_mapped <- MapQuery(
  anchorset = anchors_query,
  query = query_cells,
  reference = reference_integrated,
  refdata = list(
    seurat_clusters = "seurat_clusters",   # Transfer clusters (labels)
    imputedPseudotime = ref_pseudotime     # Transfer pseudotime (continuous)
  ),
  reference.reduction = "pca",
  reduction.model = "umap"
)

# Expose the transferred pseudotime as a numeric metadata column named
# `pseudotime`, which the downstream plots read.
imputed_pseudotime <- GetAssayData(
  query_mapped,
  assay = "imputedPseudotime",
  layer = "data"
)
query_mapped$pseudotime <- as.numeric(
  imputed_pseudotime["refpseudotime", colnames(query_mapped)]
)

7. Visualization (Plot Reference and Injected Cells by Pseudotime and Cell Line)


# Plot the reference trajectory UMAP, colored by pseudotime.
# FeaturePlot is used because pseudotime is continuous; DimPlot's group.by
# treats every distinct value as its own discrete category.
FeaturePlot(reference_integrated, features = "pseudotime", reduction = "umap") +
  ggtitle("Reference CD4⁺ T Cell Trajectory (Pseudotime)")

# Overlay injected cell lines
DimPlot(query_mapped, reduction = "ref.umap", group.by = "cell_line", label = TRUE) +
  ggtitle("Injected Sézary Cell Lines on Reference Trajectory")

Visualization & Analysis


# Combined UMAP with pseudotime
# p1: continuous pseudotime needs FeaturePlot; a continuous viridis scale
# cannot be applied to DimPlot's discrete grouping.
p1 <- FeaturePlot(reference_integrated,
                  features = "pseudotime",
                  reduction = "umap") +
  scale_color_viridis_c(option = "magma")

# p2: all query cells drawn in dark red. `cols` must supply one color per
# group, so the single color is repeated for every cell line present.
n_cell_lines <- length(unique(query_mapped$cell_line))
p2 <- DimPlot(query_mapped,
              reduction = "ref.umap",
              group.by = "cell_line",
              cols = rep("darkred", n_cell_lines)) +
  ggtitle("Sézary Cell Lines")

p1 + p2

# Violin plot of pseudotime per cell line, with a narrow boxplot overlaid
VlnPlot(
  object = query_mapped,
  group.by = "cell_line",
  features = "pseudotime",
  pt.size = 0.1
) + geom_boxplot(width = 0.2)

# Project cell lines on reference trajectory, colored by pseudotime.
# The former extra geom_point() layer read refUMAP_1 / refUMAP_2 from
# query_mapped[[]], but those are reduction embeddings, not metadata
# columns, so the layer failed. It only re-drew the same cells at size 1.5,
# which pt.size does directly.
FeaturePlot(query_mapped,
            features = "pseudotime",
            reduction = "ref.umap",
            pt.size = 1.5)

Analyze Pseudotime Distributions Across Cell Lines



# Query cells on the projected reference UMAP, colored by pseudotime
FeaturePlot(
  object = query_mapped,
  reduction = "ref.umap",
  features = "pseudotime"
) + ggtitle("Pseudotime Values of Injected Cell Lines")

# Per-cell-line violin plot of pseudotime
VlnPlot(
  object = query_mapped,
  group.by = "cell_line",
  features = "pseudotime"
) + ggtitle("Pseudotime Distribution by Cell Line")

Save the mapped query object (Sézary cell lines projected onto reference trajectory):



# Write the mapped query object to disk as an RDS file
saveRDS(
  object = query_mapped,
  file = "sezary_cell_lines_mapped_to_cd4_reference.rds"
)
