library(scran)
source("/exports/eddie/scratch/aduguid3/colours.R")
library(SingleCellExperiment)
library(scater) # Plot dimred
library(clustree) # show relationship clustering
library(Seurat) # clustering 
library(qs2) # readRDS faster
library(dplyr)
library(readr)
library(ggrepel)

Load the data

srt <- qs_read("/exports/eddie/scratch/aduguid3/initial_clustering/03_seurat_obj_harmony_clustered.qs")

# I am assuming all doublet have already been removed

Clustering the entire dataset, start with Nadine’s harmony Seurat object

# Check for variable features and scale data
srt <- FindVariableFeatures(srt, selection.method = "vst", nfeatures = 2000)
srt <- ScaleData(srt, verbose = FALSE)

# Run PCA and UMAP (skip if already done)
srt <- RunPCA(srt, verbose = FALSE)

## Warning: Key 'PC_' taken, using 'pca_' instead

srt<- RunUMAP(srt, dims = 1:25)

## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session

## 20:37:53 UMAP embedding parameters a = 0.9922 b = 1.112

## 20:37:53 Read 112213 rows and found 25 numeric columns

## 20:37:53 Using Annoy for neighbor search, n_neighbors = 30

## 20:37:53 Building Annoy index with metric = cosine, n_trees = 50

## 0%   10   20   30   40   50   60   70   80   90   100%

## [----|----|----|----|----|----|----|----|----|----|

## **************************************************|
## 20:38:07 Writing NN index file to temp file /tmp/Rtmpz3F4mk/filee2f087c1a6f37
## 20:38:08 Searching Annoy index using 1 thread, search_k = 3000
## 20:38:47 Annoy recall = 100%
## 20:38:49 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 20:38:55 Initializing from normalized Laplacian + noise (using RSpectra)
## 20:39:30 Commencing optimization for 200 epochs, with 5039480 positive edges
## 20:39:30 Using rng type: pcg
## 20:40:25 Optimization finished

# Find neighbors and cluster at multiple resolutions
srt <- FindNeighbors(srt, reduction = "pca", dims = 1:25)

## Computing nearest neighbor graph
## Computing SNN

srt <- FindClusters(srt, resolution = c(0.01, 0.04, 0.05, seq(0.1, 1, by = 0.1)))

## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 112213
## Number of edges: 3679570
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9936
## Number of communities: 7
## Elapsed time: 60 seconds

## 1 singletons identified. 6 final clusters.

## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 112213
## Number of edges: 3679570
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9834
## Number of communities: 11
## Elapsed time: 55 seconds

## 1 singletons identified. 10 final clusters.

## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 112213
## Number of edges: 3679570
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9809
## Number of communities: 12
## Elapsed time: 50 seconds

## 1 singletons identified. 11 final clusters.

## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 112213
## Number of edges: 3679570
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9683
## Number of communities: 14
## Elapsed time: 47 seconds

## 1 singletons identified. 13 final clusters.

## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 112213
## Number of edges: 3679570
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9499
## Number of communities: 18
## Elapsed time: 54 seconds

## 1 singletons identified. 17 final clusters.

## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 112213
## Number of edges: 3679570
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9387
## Number of communities: 20
## Elapsed time: 58 seconds

## 1 singletons identified. 19 final clusters.

## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 112213
## Number of edges: 3679570
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9299
## Number of communities: 22
## Elapsed time: 48 seconds

## 1 singletons identified. 21 final clusters.

## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 112213
## Number of edges: 3679570
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9213
## Number of communities: 25
## Elapsed time: 53 seconds

## 1 singletons identified. 24 final clusters.

## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 112213
## Number of edges: 3679570
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9139
## Number of communities: 26
## Elapsed time: 57 seconds

## 1 singletons identified. 25 final clusters.

## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 112213
## Number of edges: 3679570
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9064
## Number of communities: 26
## Elapsed time: 53 seconds

## 1 singletons identified. 25 final clusters.

## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 112213
## Number of edges: 3679570
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9001
## Number of communities: 27
## Elapsed time: 49 seconds

## 1 singletons identified. 26 final clusters.

## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 112213
## Number of edges: 3679570
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8932
## Number of communities: 31
## Elapsed time: 44 seconds

## 1 singletons identified. 30 final clusters.

## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 112213
## Number of edges: 3679570
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8887
## Number of communities: 33
## Elapsed time: 43 seconds

## 1 singletons identified. 32 final clusters.

# Extract clustering metadata
cluster_columns <- grep("snn_res\\.", names(srt@meta.data), value = TRUE)
srt_clusters_metadata <- srt[[cluster_columns]]

# Save clustering metadata
qs_save(
  srt_clusters_metadata,
  file = "/exports/eddie/scratch/aduguid3/harmony_clustering/01_harmony_recluster_march26_metadata.qs")

# Plot UMAP with a selected resolution (e.g., 1.0) --> reduction for this is lower case umap
DimPlot(srt, reduction = "umap", group.by = "originalexp_snn_res.0.1", label = TRUE, label.size = 2) +
  ggtitle("umap by res.0.1")

## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`

qs_save(srt, "/exports/eddie/scratch/aduguid3/harmony_clustering/01_harmony_recluster_march26.qs")

Cluster tree check

clustree(srt_clusters_metadata, prefix = "originalexp_snn_res.", edge_arrow = FALSE)

Identifying leukaemia clusters by eGFP expression

Set the identity to res 0.1 first, clustering the whole dataset (no subsetting) 0.1 resolution looks about right and not overclustered (checked with clustree). Leukaemia clusters are likely 0, 1, 2.

# Set the identity to res 0.1 
Idents(srt) <- "originalexp_snn_res.0.1"

# VlnPlot - group by the res 0.1 column
VlnPlot(srt, features = "GFP", 
        group.by = "originalexp_snn_res.0.1", 
        pt.size = 0) +
  ggtitle("GFP expression by cluster (res 0.1)")

## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`

## GFP is relatively spracly expressed across clusters
# How many cells are actually GFP+?
sum(GetAssayData(srt, layer = "counts")["GFP",] > 0)

## [1] 7928

# Percentage of total cells
sum(GetAssayData(srt, layer = "counts")["GFP",] > 0) / ncol(srt) * 100

## [1] 7.065135

# GFP+ cells per cluster
table(srt$originalexp_snn_res.0.1[GetAssayData(srt, layer = "counts")["GFP",] > 0])

## 
##    0    1   10   11   12    2    3    4    5    6    7    8    9 
## 4320 2156    3    0    8 1318   28   31   25   21    3   13    2

# This shows that 98% of GFP+ cell fall in to cluster 0, 1, 2 there some in clusters 3-6 that need investigation...

# Investigate cluster 3
c3_cells <- WhichCells(srt, idents = "3")
cat("Total cells in cluster 3:", length(c3_cells), "\n")

## Total cells in cluster 3: 5119

cat("GFP+ cells in cluster 3:", 28, "\n")

## GFP+ cells in cluster 3: 28

cat("GFP+ proportion:", round(28/length(c3_cells)*100, 2), "%\n")

## GFP+ proportion: 0.55 %

# Check B-ALL markers in cluster 3-6 vs confirmed leukaemia, genes confrimed to be upregualted in miR128+ blasts in Camille's paper - Ccnt1, Cdk9, Dot1l, Brd4, Meis1, Runx1, Cdk6, Bcl2
DotPlot(srt, 
        features = c("GFP", "Ccnt1", "Cdk9", "Dot1l","Brd4", "Meis1", "Runx1", "Cdk6", "Bcl2", "Pax5", "Ikzf1", "Il7r", "Ptprc", "Cd19"),
        idents = c("0", "1", "2", "3", "4", "5", "6"),
        group.by = "originalexp_snn_res.0.1") +
  RotatedAxis() +
  ggtitle("Leukaemia markers - main mass clusters")

# Confident to proceed with just 0, 1, 2 for the leukaemia object
leuk_obj <- subset(srt, idents = c("0", "1", "2"))

ncol(leuk_obj) # check cell numbers

## [1] 91826

table(leuk_obj$Tissue) # check BM vs CNS distribution

## 
##    BM   CNS 
## 44606 47220

DimPlot(leuk_obj, reduction = "umap", 
        group.by = "Tissue",
        cols = c("BM" = "steelblue", "CNS" = "firebrick")) +
  ggtitle("Leukaemia cells by tissue")

Cell Population Identification from Marker Gene Expression

Clusters 0, 1, 2 — Confirmed Leukaemia

Expression of the leukaemia-associated markers Meis1, Runx1, Cdk6, and Bcl2 was consistent across clusters 0, 1, and 2, in line with the signatures reported by Dang et al. (2021). B cell lineage was confirmed by Pax5 and Ikzf1 expression. GFP expression was sparse but present, consistent with the expected transcript capture rate in scRNAseq. These three clusters were designated as the leukaemia population with confidence.

Cluster 3 — Normal B Cells

Cluster 3 showed strong expression of Pax5, Ikzf1, Cd19, and Ptprc, consistent with a mature B cell profile. Leukaemia-associated markers (Meis1, Runx1, Cdk6, Bcl2) were notably low or absent. Only 28 GFP+ cells were identified in this cluster, most likely representing doublets. Cluster 3 was excluded from the leukaemia object.

Cluster 5 — T Cells / ILCs

Cluster 5 was characterised by dominant Il7r expression, a hallmark marker of T cells and innate lymphoid cells (ILCs), with minimal B cell or leukaemia-ass

Reclustering the leukaemia subset

# Check for variable features and scale data
leuk_obj <- FindVariableFeatures(leuk_obj, selection.method = "vst", nfeatures = 2000)
leuk_obj <- ScaleData(leuk_obj, verbose = FALSE)

# Run PCA and UMAP (skip if already done)
leuk_obj <- RunPCA(leuk_obj, verbose = FALSE)

## Warning: Key 'PC_' taken, using 'pca_' instead

leuk_obj <- RunUMAP(leuk_obj, dims = 1:25)

## 21:00:23 UMAP embedding parameters a = 0.9922 b = 1.112

## 21:00:23 Read 91826 rows and found 25 numeric columns

## 21:00:23 Using Annoy for neighbor search, n_neighbors = 30

## 21:00:23 Building Annoy index with metric = cosine, n_trees = 50

## 0%   10   20   30   40   50   60   70   80   90   100%

## [----|----|----|----|----|----|----|----|----|----|

## **************************************************|
## 21:00:36 Writing NN index file to temp file /tmp/Rtmpz3F4mk/filee2f08fe5e91a
## 21:00:36 Searching Annoy index using 1 thread, search_k = 3000
## 21:01:10 Annoy recall = 100%
## 21:01:11 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 21:01:17 Initializing from normalized Laplacian + noise (using RSpectra)
## 21:01:22 Commencing optimization for 200 epochs, with 4034758 positive edges
## 21:01:22 Using rng type: pcg
## 21:02:05 Optimization finished

# Find neighbors and cluster at multiple resolutions
leuk_obj <- FindNeighbors(leuk_obj, reduction = "pca", dims = 1:25)

## Computing nearest neighbor graph
## Computing SNN

leuk_obj <- FindClusters(leuk_obj, resolution = c(0.01, 0.04, 0.05, seq(0.1, 1, by = 0.1)))

## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 91826
## Number of edges: 2651402
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9900
## Number of communities: 2
## Elapsed time: 33 seconds

## 1 singletons identified. 1 final clusters.

## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 91826
## Number of edges: 2651402
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9755
## Number of communities: 6
## Elapsed time: 28 seconds

## 1 singletons identified. 5 final clusters.

## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 91826
## Number of edges: 2651402
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9725
## Number of communities: 6
## Elapsed time: 31 seconds

## 1 singletons identified. 5 final clusters.

## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 91826
## Number of edges: 2651402
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9584
## Number of communities: 7
## Elapsed time: 26 seconds

## 1 singletons identified. 6 final clusters.

## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 91826
## Number of edges: 2651402
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9392
## Number of communities: 12
## Elapsed time: 31 seconds

## 1 singletons identified. 11 final clusters.

## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 91826
## Number of edges: 2651402
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9257
## Number of communities: 13
## Elapsed time: 34 seconds

## 1 singletons identified. 12 final clusters.

## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 91826
## Number of edges: 2651402
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9149
## Number of communities: 16
## Elapsed time: 31 seconds

## 1 singletons identified. 15 final clusters.

## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 91826
## Number of edges: 2651402
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9044
## Number of communities: 16
## Elapsed time: 30 seconds

## 1 singletons identified. 15 final clusters.

## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 91826
## Number of edges: 2651402
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8949
## Number of communities: 17
## Elapsed time: 27 seconds

## 1 singletons identified. 16 final clusters.

## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 91826
## Number of edges: 2651402
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8873
## Number of communities: 20
## Elapsed time: 30 seconds

## 1 singletons identified. 19 final clusters.

## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 91826
## Number of edges: 2651402
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8789
## Number of communities: 22
## Elapsed time: 28 seconds

## 1 singletons identified. 21 final clusters.

## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 91826
## Number of edges: 2651402
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8721
## Number of communities: 23
## Elapsed time: 28 seconds

## 1 singletons identified. 22 final clusters.

## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 91826
## Number of edges: 2651402
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8657
## Number of communities: 25
## Elapsed time: 27 seconds

## 1 singletons identified. 24 final clusters.

# Extract clustering metadata
cluster_columns <- grep("snn_res\\.", names(leuk_obj@meta.data), value = TRUE)
leuk_obj_clusters_metadata <- leuk_obj[[cluster_columns]]

# Save clustering metadata
qs_save(
  srt_clusters_metadata,
  file = "/exports/eddie/scratch/aduguid3/harmony_clustering/02_harmony_recluster_leuk_only_march26_metadata.qs")

# Plot UMAP with a selected resolution (e.g., 1.0) --> reduction for this is lower case umap
DimPlot(leuk_obj, reduction = "umap", group.by = "originalexp_snn_res.1", label = TRUE, label.size = 2) +
  ggtitle("leuk only - umap by res.0.3")

# Plot UMAP and check how cells are distributed by tissue
DimPlot(leuk_obj, reduction = "umap", 
        group.by = "Tissue",
        cols = c("BM" = "steelblue", "CNS" = "firebrick")) +
  ggtitle("Leukaemia cells by tissue")

## Calculate proportion of BM vs CNS per cluster at each resolution
# At res 0.5 for example
leuk_obj$cluster_res05 <- leuk_obj$originalexp_snn_res.0.5

tissue_props <- leuk_obj@meta.data %>%
  group_by(cluster_res05, Tissue) %>%
  summarise(n = n()) %>%
  group_by(cluster_res05) %>%
  mutate(prop = n / sum(n))

## `summarise()` has grouped output by 'cluster_res05'. You can override using the
## `.groups` argument.

ggplot(tissue_props, aes(x = cluster_res05, y = prop, fill = Tissue)) +
  geom_bar(stat = "identity") +
  scale_fill_manual(values = c("BM" = "steelblue", "CNS" = "firebrick")) +
  theme_classic() +
  labs(x = "Cluster", y = "Proportion", 
       title = "Tissue composition per cluster (res 0.5)") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

qs_save(leuk_obj, "/exports/eddie/scratch/aduguid3/harmony_clustering/02_harmony_recluster_leuk_only_march26.qs")

Cluster tree check

clustree(leuk_obj_clusters_metadata, prefix = "originalexp_snn_res.", edge_arrow = FALSE)

Resolution Selection for Leukaemia Subset Clustering, based on downstream tissue dependent analysis

# Tissue composition at res 0.5 and res 1.0

# Plot UMAP and check how cells are distributed by tissue
DimPlot(leuk_obj, reduction = "umap", 
        group.by = "Tissue",
        cols = c("BM" = "steelblue", "CNS" = "firebrick")) +
  ggtitle("Leukaemia cells by tissue")

## Calculate proportion of BM vs CNS per cluster at each resolution
# At res 0.5 
leuk_obj$cluster_res05 <- leuk_obj$originalexp_snn_res.0.5

tissue_props <- leuk_obj@meta.data %>%
  group_by(cluster_res05, Tissue) %>%
  summarise(n = n()) %>%
  group_by(cluster_res05) %>%
  mutate(prop = n / sum(n))

## `summarise()` has grouped output by 'cluster_res05'. You can override using the
## `.groups` argument.

ggplot(tissue_props, aes(x = cluster_res05, y = prop, fill = Tissue)) +
  geom_bar(stat = "identity") +
  scale_fill_manual(values = c("BM" = "steelblue", "CNS" = "firebrick")) +
  theme_classic() +
  labs(x = "Cluster", y = "Proportion", 
       title = "Tissue composition per cluster (res 0.5)") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# At res 1.0 
leuk_obj$cluster_res1 <- leuk_obj$originalexp_snn_res.1

tissue_props <- leuk_obj@meta.data %>%
  group_by(cluster_res1, Tissue) %>%
  summarise(n = n()) %>%
  group_by(cluster_res1) %>%
  mutate(prop = n / sum(n))

## `summarise()` has grouped output by 'cluster_res1'. You can override using the
## `.groups` argument.

ggplot(tissue_props, aes(x = cluster_res1, y = prop, fill = Tissue)) +
  geom_bar(stat = "identity") +
  scale_fill_manual(values = c("BM" = "steelblue", "CNS" = "firebrick")) +
  theme_classic() +
  labs(x = "Cluster", y = "Proportion", 
       title = "Tissue composition per cluster (res 0.5)") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Clustering resolution was selected by examining tissue composition per cluster at multiple resolutions alongside clustree stability analysis.

At res 0.5 (15 clusters), two clusters showed near-exclusive CNS enrichment (clusters 8 and 9, >95% CNS) and two showed near-exclusive BM enrichment (clusters 11 and 12, >98% BM), with the remaining clusters representing transcriptional states shared across both niches.

At res 1.0 (24 clusters), an additional CNS-enriched cluster emerged (cluster 19, ~90% CNS), however this resolution also fragmented biologically similar mixed-tissue states into numerous small clusters. This fragmentation reduces cell numbers per cluster per sample, which would underpower the pseudobulk differential expression analysis required for robust CNS vs BM comparisons downstream.

Cross-tabulation of res 0.5 and res 1.0 clusterings confirmed that cluster 19 (res 1.0) represents a sub-fragment of cluster 8 (res 0.5), indicating that the CNS-specific biology is already captured at the lower resolution.

Resolution 0.5 was therefore selected as the optimal balance between biological resolution and statistical power for downstream analysis. Tissue-specific clusters of interest are:

CNS-enriched: clusters 8 and 9
BM-enriched: clusters 11 and 12
Mixed/shared states: clusters 0–7, 10, 13, 14

Annotate the cells with CellCycle and ribosomal genes

  # A list of cell cycle markers, from Tirosh et al, 2015, is loaded with Seurat.  We can
  # segregate this list into markers of G2/M phase and markers of S phase
  s.genes <- stringr::str_to_title(cc.genes$s.genes)
  g2m.genes <- stringr::str_to_title(cc.genes$g2m.genes)
  
  leuk_obj <- CellCycleScoring(leuk_obj, s.features = s.genes, g2m.features = g2m.genes, set.ident = TRUE)

## Warning: The following features are not present in the object: Mlf1ip, not
## searching for symbol synonyms

## Warning: The following features are not present in the object: Fam64a, Hn1, not
## searching for symbol synonyms

  # view cell cycle scores and phase assignments
  head(leuk_obj[[]])

  srt$Phase <- leuk_obj$Phase
  
  # RIBOSOMAL
  leuk_obj[["subsets_ribo_pct"]] <- PercentageFeatureSet(leuk_obj, pattern = "^Rp[sl]")

qs_save(leuk_obj, "/exports/eddie/scratch/aduguid3/harmony_clustering/03_harmony_recluster_leuk_only_cellcycleanno_march26.qs")

Clustering, leukaemia identification

Alasdair Duguid

2026-02-18

Load the data

Clustering the entire dataset, start with Nadine’s harmony Seurat object

Cluster tree check

Identifying leukaemia clusters by eGFP expression

Cell Population Identification from Marker Gene Expression

Clusters 0, 1, 2 — Confirmed Leukaemia

Cluster 3 — Normal B Cells

Cluster 5 — T Cells / ILCs

Reclustering the leukaemia subset

Cluster tree check

Resolution Selection for Leukaemia Subset Clustering, based on downstream tissue dependent analysis

Annotate the cells with CellCycle and ribosomal genes