1. Setup & Data Loading

library(tidyverse)
library(patchwork)
library(viridis)
library(Seurat)
library(qs2)
library(ggrepel)
library(pheatmap)
library(RColorBrewer)
library(clusterProfiler)
library(org.Mm.eg.db)
library(fgsea)
library(msigdbr)

# Resolve namespace conflicts
conflicted::conflict_prefer("intersect", "base")
conflicted::conflict_prefer("setdiff", "base")
conflicted::conflict_prefer("union", "base")
conflicted::conflict_prefer("select", "dplyr")
conflicted::conflict_prefer("filter", "dplyr")

1.1 Load & Subset — Atlas (Dural Meninges Only)

gdt_atlas_full <- readRDS("~/Documents/Projects/CNS_border_atlas/results/gdT_analysis/gdT_subset_clean.rds")

cat("Full atlas γδ T:", ncol(gdt_atlas_full), "cells\n")

## Full atlas γδ T: 992 cells

print(table(gdt_atlas_full$site))

## 
##        bone marrow     brain meninges     coronal suture         dura mater 
##                  9                 24                  1                504 
##     dural meninges        whole brain whole brain + dura 
##                338                 37                 79

gdt_atlas <- subset(gdt_atlas_full, site %in% c("dura mater", "dural meninges"))
rm(gdt_atlas_full); invisible(gc())

cat("\nDural meningeal γδ T:", ncol(gdt_atlas), "cells\n")

## 
## Dural meningeal γδ T: 842 cells

print(table(gdt_atlas$gd_subtype))

## 
##       gd17      gdIFN      Mixed Unassigned 
##        291        124         63        364

1.2 Load & Subset — miR-128a (CNS Only)

gdt_mir128 <- qs_read("/Users/alasdairduguid/Documents/Projects/CNS_border_atlas/data/11_gdt_mir128a_export.qs")

cat("Full miR-128a γδ T:", ncol(gdt_mir128), "cells\n")

## Full miR-128a γδ T: 700 cells

gdt_mir128 <- subset(gdt_mir128, Tissue == "CNS")

cat("CNS γδ T:", ncol(gdt_mir128), "cells\n")

## CNS γδ T: 405 cells

print(table(gdt_mir128$cell_annotation))

## 
## Other γδ      Vγ4   Vγ6Vδ4 
##      252       32      121

print(sort(table(gdt_mir128$vgene_label), decreasing = TRUE))

## 
## Vγ6Vδ4 Vγ?Vδ4 Vγ2Vδ? Vγ1Vδ? Vγ4Vδ? Vγ2Vδ4 Vγ5Vδ? Vγ5Vδ4 Vγ1Vδ4 Vγ4Vδ4 Vγ3Vδ? 
##    119     92     52     37     27     21     19     15     11      4      2 
## Vγ6Vδ? Vγ1Vδ1 Vγ4Vδ5 Vγ7Vδ? Vγ7Vδ4 
##      2      1      1      1      1

print(table(gdt_mir128$gdt_functional_class))

## 
## Anti-tumour biased       Intermediate  Pro-tumour biased 
##                 44                 27                334

1.3 Tag Datasets

gdt_atlas$dataset  <- "Dural_Atlas"
gdt_atlas$model    <- "Healthy"
gdt_mir128$dataset <- "miR128a_CNS"
gdt_mir128$model   <- "B-ALL"

1.4 Common Gene Space

common_genes <- intersect(rownames(gdt_atlas), rownames(gdt_mir128))
cat("Common genes:", length(common_genes), "\n")

## Common genes: 19988

key_markers <- c("Cd3e", "Cd3d", "Trdc",
                 "Rorc", "Sox13", "Maf", "Blk", "Rora", "Zbtb16",
                 "Il17a", "Il17f", "Il23r", "Il1r1", "Ccr6",
                 "Tbx21", "Ifng", "Gzmb", "Gzma", "Nkg7", "Cd27",
                 "Cd44", "Cd69", "Cxcr6", "Id2", "Ncr1",
                 "Prf1", "Ccl5", "Eomes")
cat("Key markers in both:", sum(key_markers %in% common_genes), "/",
    length(key_markers), "\n")

## Key markers in both: 27 / 28

2. Integration — RPCA

DefaultAssay(gdt_atlas)  <- "originalexp"
DefaultAssay(gdt_mir128) <- "originalexp"

obj_list <- list(
  Atlas   = subset(gdt_atlas,  features = common_genes),
  miR128a = subset(gdt_mir128, features = common_genes)
)

obj_list <- lapply(obj_list, function(obj) {
  obj <- NormalizeData(obj, verbose = FALSE)
  obj <- FindVariableFeatures(obj, nfeatures = 3000, verbose = FALSE)
  return(obj)
})

features <- SelectIntegrationFeatures(object.list = obj_list, nfeatures = 3000)

obj_list <- lapply(obj_list, function(obj) {
  obj <- ScaleData(obj, features = features, verbose = FALSE)
  obj <- RunPCA(obj, features = features, npcs = 30, verbose = FALSE)
  return(obj)
})

anchors <- FindIntegrationAnchors(
  object.list     = obj_list,
  anchor.features = features,
  reduction       = "rpca",
  dims            = 1:30,
  verbose         = TRUE
)
cat("Anchors:", nrow(anchors@anchors), "\n")

## Anchors: 546

integrated <- IntegrateData(anchorset = anchors, dims = 1:30, verbose = TRUE)
cat("Integrated:", ncol(integrated), "cells\n")

## Integrated: 1247 cells

rm(obj_list, anchors); invisible(gc())

3. Joint Clustering

DefaultAssay(integrated) <- "integrated"
integrated <- ScaleData(integrated, verbose = FALSE)
integrated <- RunPCA(integrated, npcs = 30, verbose = FALSE)

ElbowPlot(integrated, ndims = 30) + ggtitle("Elbow Plot — Integrated")

n_pcs <- 20
integrated <- FindNeighbors(integrated, dims = 1:n_pcs)

for (res in c(0.3, 0.5, 0.8)) {
  integrated <- FindClusters(integrated, resolution = res, verbose = FALSE)
}
integrated <- RunUMAP(integrated, dims = 1:n_pcs)

# Use resolution 0.5 for focused analysis
integrated$res05 <- integrated$integrated_snn_res.0.5
cat("Clusters at res 0.5:", length(unique(integrated$res05)), "\n")

## Clusters at res 0.5: 9

res_cols <- grep("integrated_snn", colnames(integrated@meta.data), value = TRUE)
plots <- lapply(res_cols, function(rc) {
  DimPlot(integrated, group.by = rc, label = TRUE, label.size = 3) + ggtitle(rc)
})
wrap_plots(plots, ncol = 3)

4. Integration Quality

4.1 Overview

ds_cols <- c("Dural_Atlas" = "#2166AC", "miR128a_CNS" = "#B2182B")

p1 <- DimPlot(integrated, group.by = "dataset", cols = ds_cols,
              shuffle = TRUE, raster = TRUE) +
  ggtitle("By Dataset")
p2 <- DimPlot(integrated, group.by = "res05",
              label = TRUE, label.size = 5) +
  ggtitle("Joint Clusters (res 0.5)")
p3 <- DimPlot(integrated, group.by = "model",
              cols = c("Healthy" = "#2166AC", "B-ALL" = "#B2182B")) +
  ggtitle("Healthy vs B-ALL")
p4 <- DimPlot(integrated, group.by = "dataset", cols = ds_cols,
              split.by = "dataset", raster = TRUE) +
  ggtitle("Split by Dataset") + NoLegend()

(p1 + p2) / (p3 + p4)

4.2 Dataset-Specific Annotations

p1 <- DimPlot(integrated, group.by = "cell_annotation",
              cells = WhichCells(integrated, expression = dataset == "miR128a_CNS"),
              na.value = "grey90") +
  ggtitle("miR-128a: Vγ Subset")

p2 <- DimPlot(integrated, group.by = "gdt_functional_class",
              cells = WhichCells(integrated, expression = dataset == "miR128a_CNS"),
              cols = c("Pro-tumour biased"  = "#B2182B",
                       "Anti-tumour biased" = "#2166AC",
                       "Intermediate"       = "#FDB863"),
              na.value = "grey90") +
  ggtitle("miR-128a: Functional Class")

p3 <- DimPlot(integrated, group.by = "gd_subtype",
              cells = WhichCells(integrated, expression = dataset == "Dural_Atlas"),
              cols = c("gd17" = "#CC0000", "gdIFN" = "#0066CC",
                       "Mixed" = "#9933CC", "Unassigned" = "grey80"),
              na.value = "grey90") +
  ggtitle("Atlas: γδ17 / γδIFN")

p4 <- DimPlot(integrated, group.by = "vgene_label",
              cells = WhichCells(integrated, expression = dataset == "miR128a_CNS"),
              na.value = "grey90") +
  ggtitle("miR-128a: V Gene Usage")

(p1 + p2) / (p3 + p4)

4.3 Cluster Composition

comp <- integrated@meta.data %>%
  count(res05, dataset) %>%
  group_by(res05) %>%
  mutate(total = sum(n), pct = round(n / total * 100, 1)) %>%
  ungroup()

print(comp %>% pivot_wider(id_cols = c(res05, total),
                            names_from = dataset,
                            values_from = c(n, pct),
                            values_fill = 0))

## # A tibble: 9 × 6
##   res05 total n_Dural_Atlas n_miR128a_CNS pct_Dural_Atlas pct_miR128a_CNS
##   <fct> <int>         <int>         <int>           <dbl>           <dbl>
## 1 0       319           314             5            98.4             1.6
## 2 1       249           239            10            96               4  
## 3 2       165             8           157             4.8            95.2
## 4 3       128             3           125             2.3            97.7
## 5 4       106           105             1            99.1             0.9
## 6 5        96            31            65            32.3            67.7
## 7 6        87            61            26            70.1            29.9
## 8 7        67            64             3            95.5             4.5
## 9 8        30            17            13            56.7            43.3

p1 <- integrated@meta.data %>%
  ggplot(aes(x = res05, fill = dataset)) +
  geom_bar(position = "fill") +
  scale_fill_manual(values = ds_cols) +
  theme_minimal(base_size = 13) +
  labs(title = "Dataset Mixing by Cluster", x = "Cluster", y = "Proportion")

p2 <- integrated@meta.data %>%
  filter(dataset == "miR128a_CNS") %>%
  ggplot(aes(x = res05, fill = cell_annotation)) +
  geom_bar(position = "fill") +
  theme_minimal(base_size = 13) +
  labs(title = "miR-128a Vγ Subset by Cluster", x = "Cluster", y = "Proportion")

p1 + p2

cat("=== All clusters: Dataset × Cluster ===\n")

## === All clusters: Dataset × Cluster ===

print(table(integrated$res05, integrated$dataset))

##    
##     Dural_Atlas miR128a_CNS
##   0         314           5
##   1         239          10
##   2           8         157
##   3           3         125
##   4         105           1
##   5          31          65
##   6          61          26
##   7          64           3
##   8          17          13

cat("\n=== miR-128a cell annotation × Cluster ===\n")

## 
## === miR-128a cell annotation × Cluster ===

print(table(integrated$res05[integrated$dataset == "miR128a_CNS"],
            integrated$cell_annotation[integrated$dataset == "miR128a_CNS"]))

##    
##     Other γδ Vγ4 Vγ6Vδ4
##   0        5   0      0
##   1        2   0      8
##   2       39  14    104
##   3      121   3      1
##   4        1   0      0
##   5       62   2      1
##   6       16  10      0
##   7        2   0      1
##   8        4   3      6

cat("\n=== Atlas subtype × Cluster ===\n")

## 
## === Atlas subtype × Cluster ===

print(table(integrated$res05[integrated$dataset == "Dural_Atlas"],
            integrated$gd_subtype[integrated$dataset == "Dural_Atlas"]))

##    
##     gd17 gdIFN Mixed Unassigned
##   0   94    39    30        151
##   1  102    25    13         99
##   2    6     0     1          1
##   3    1     1     0          1
##   4   58     5     8         34
##   5    6     7     2         16
##   6    3    33     3         22
##   7   14    12     5         33
##   8    7     2     1          7

4.4 Il17a Expression by Cluster

DefaultAssay(integrated) <- "originalexp"

# Quantify
expr_il17a <- GetAssayData(integrated, layer = "data")["Il17a", ]
il17a_summary <- integrated@meta.data %>%
  mutate(Il17a_pos = expr_il17a[rownames(.)] > 0) %>%
  group_by(res05) %>%
  summarise(
    n_total  = n(),
    n_pos    = sum(Il17a_pos),
    pct_pos  = round(n_pos / n_total * 100, 1),
    .groups  = "drop"
  )

cat("Il17a expression by cluster:\n")

## Il17a expression by cluster:

print(as.data.frame(il17a_summary))

##   res05 n_total n_pos pct_pos
## 1     0     319   176    55.2
## 2     1     249     9     3.6
## 3     2     165     5     3.0
## 4     3     128     2     1.6
## 5     4     106     7     6.6
## 6     5      96     3     3.1
## 7     6      87    13    14.9
## 8     7      67    17    25.4
## 9     8      30     2     6.7

# FeaturePlot
p1 <- FeaturePlot(integrated, features = "Il17a", order = TRUE,
                  cols = c("lightgrey", "#FF6600")) +
  ggtitle("Il17a Expression")
p2 <- DimPlot(integrated, group.by = "res05", label = TRUE, label.size = 4) +
  ggtitle("Clusters (res 0.5)")

p1 + p2

5. Focal Clusters: C1, C2, and C6

Three clusters tell the γδ17 story:

C0 — The main healthy Il17a factory (55% positive), almost entirely atlas
C1 — Healthy dural γδ17, poised but low Il17a (3.6%)
C2 — B-ALL Vγ6Vδ4-enriched, Il17a silenced (3%)
C6 — Mixed cluster with Il17a activity (14.9%), atlas γδIFN-enriched + miR-128a Vγ4/Other γδ, zero Vγ6Vδ4

5.1 Cluster Identity

for (cl in c("0", "1", "2", "6")) {
  meta_cl <- integrated@meta.data %>% filter(res05 == cl)
  
  cat("\n=== Cluster", cl, "(n =", nrow(meta_cl), ") ===\n\n")
  
  cat("Dataset breakdown:\n")
  ds <- table(meta_cl$dataset)
  for (d in names(ds)) cat("  ", d, ":", ds[d], "\n")
  
  mir <- meta_cl %>% filter(dataset == "miR128a_CNS")
  if (nrow(mir) > 0) {
    cat("\nmiR-128a cell annotations:\n")
    ann <- table(mir$cell_annotation)
    for (a in names(ann)) cat("  ", a, ":", ann[a], "\n")
    cat("  ── Vγ6Vδ4:", sum(mir$cell_annotation == "Vγ6Vδ4"), "\n")
  }
  
  atl <- meta_cl %>% filter(dataset == "Dural_Atlas")
  if (nrow(atl) > 0) {
    cat("\nAtlas subtypes:\n")
    sub <- table(atl$gd_subtype)
    for (s in names(sub)) cat("  ", s, ":", sub[s], "\n")
  }
  
  # Il17a
  cells <- rownames(meta_cl)
  il17a <- expr_il17a[cells]
  cat("\nIl17a: ", sum(il17a > 0), "/", length(il17a), " positive (",
      round(sum(il17a > 0) / length(il17a) * 100, 1), "%)\n", sep = "")
}

## 
## === Cluster 0 (n = 319 ) ===
## 
## Dataset breakdown:
##    Dural_Atlas : 314 
##    miR128a_CNS : 5 
## 
## miR-128a cell annotations:
##    Other γδ : 5 
##   ── Vγ6Vδ4: 0 
## 
## Atlas subtypes:
##    gd17 : 94 
##    gdIFN : 39 
##    Mixed : 30 
##    Unassigned : 151 
## 
## Il17a: 176/319 positive (55.2%)
## 
## === Cluster 1 (n = 249 ) ===
## 
## Dataset breakdown:
##    Dural_Atlas : 239 
##    miR128a_CNS : 10 
## 
## miR-128a cell annotations:
##    Other γδ : 2 
##    Vγ6Vδ4 : 8 
##   ── Vγ6Vδ4: 8 
## 
## Atlas subtypes:
##    gd17 : 102 
##    gdIFN : 25 
##    Mixed : 13 
##    Unassigned : 99 
## 
## Il17a: 9/249 positive (3.6%)
## 
## === Cluster 2 (n = 165 ) ===
## 
## Dataset breakdown:
##    Dural_Atlas : 8 
##    miR128a_CNS : 157 
## 
## miR-128a cell annotations:
##    Other γδ : 39 
##    Vγ4 : 14 
##    Vγ6Vδ4 : 104 
##   ── Vγ6Vδ4: 104 
## 
## Atlas subtypes:
##    gd17 : 6 
##    Mixed : 1 
##    Unassigned : 1 
## 
## Il17a: 5/165 positive (3%)
## 
## === Cluster 6 (n = 87 ) ===
## 
## Dataset breakdown:
##    Dural_Atlas : 61 
##    miR128a_CNS : 26 
## 
## miR-128a cell annotations:
##    Other γδ : 16 
##    Vγ4 : 10 
##   ── Vγ6Vδ4: 0 
## 
## Atlas subtypes:
##    gd17 : 3 
##    gdIFN : 33 
##    Mixed : 3 
##    Unassigned : 22 
## 
## Il17a: 13/87 positive (14.9%)

5.2 Highlight UMAP

integrated$focal_highlight <- case_when(
  integrated$res05 == "0" ~ "C0: Atlas Il17a+",
  integrated$res05 == "1" & integrated$dataset == "Dural_Atlas"  ~ "C1: Healthy γδ17",
  integrated$res05 == "1" & integrated$dataset == "miR128a_CNS"  ~ "C1: B-ALL (mixed in)",
  integrated$res05 == "2" & integrated$dataset == "miR128a_CNS"  ~ "C2: B-ALL Vγ6Vδ4",
  integrated$res05 == "2" & integrated$dataset == "Dural_Atlas"  ~ "C2: Atlas (mixed in)",
  integrated$res05 == "6" ~ "C6: Il17a-active mixed",
  TRUE ~ "Other"
)

focal_cols <- c("C0: Atlas Il17a+"       = "#FF6600",
                "C1: Healthy γδ17"       = "#2166AC",
                "C1: B-ALL (mixed in)"   = "#9ECAE1",
                "C2: B-ALL Vγ6Vδ4"      = "#B2182B",
                "C2: Atlas (mixed in)"   = "#FCAE91",
                "C6: Il17a-active mixed" = "#228B22",
                "Other"                  = "grey85")

p1 <- DimPlot(integrated, group.by = "focal_highlight", cols = focal_cols,
              order = c("C0: Atlas Il17a+", "C6: Il17a-active mixed",
                        "C2: B-ALL Vγ6Vδ4", "C1: Healthy γδ17",
                        "C1: B-ALL (mixed in)", "C2: Atlas (mixed in)", "Other")) +
  ggtitle("Focal Clusters — C0, C1, C2, C6")

p2 <- FeaturePlot(integrated, features = "Il17a", order = TRUE,
                  cols = c("lightgrey", "#FF6600")) +
  ggtitle("Il17a")

p1 + p2

5.3 Canonical Markers — Focal Clusters

DefaultAssay(integrated) <- "originalexp"

focal_obj <- subset(integrated, res05 %in% c("0", "1", "2", "6"))
focal_obj$focal_label <- paste0("C", focal_obj$res05)

canonical <- c("Cd3e", "Cd3d", "Trdc",
               "Rorc", "Sox13", "Maf", "Blk", "Rora", "Zbtb16",
               "Il17a", "Il17f", "Il23r", "Il1r1", "Ccr6",
               "Tbx21", "Ifng", "Gzmb", "Nkg7", "Cd27",
               "Ctla4", "Tgfbr2", "Lck",
               "S100a4", "Tmem176a", "Itgb7",
               "H2-Eb1", "Cd74",
               "Cd44", "Cd69", "Cxcr6")
canonical_present <- canonical[canonical %in% rownames(focal_obj)]

DotPlot(focal_obj, features = canonical_present,
        group.by = "focal_label",
        cols = c("lightgrey", "darkred"), dot.scale = 6) +
  RotatedAxis() +
  ggtitle("Canonical γδ Markers — C0, C1, C2, C6") +
  theme(axis.text.x = element_text(face = "italic", size = 9))

5.4 Split by Dataset Within Focal Clusters

focal_obj$focal_ds <- paste0("C", focal_obj$res05, "_",
                              ifelse(focal_obj$dataset == "Dural_Atlas",
                                     "Atlas", "miR128"))

DotPlot(focal_obj, features = canonical_present,
        group.by = "focal_ds",
        cols = c("lightgrey", "darkred"), dot.scale = 5) +
  RotatedAxis() +
  ggtitle("Markers — Focal Clusters × Dataset") +
  theme(axis.text.x = element_text(face = "italic", size = 8),
        axis.text.y = element_text(size = 8))

5.5 Key FeaturePlots

focal_cells <- WhichCells(integrated, expression = res05 %in% c("0", "1", "2", "6"))

key <- c("Maf", "Rorc", "Sox13", "Blk",
         "Il17a", "Il23r", "Ccr6", "Il1r1",
         "Ctla4", "Tgfbr2", "Lck", "Nkg7",
         "S100a4", "Tmem176a", "H2-Eb1", "Dock2")
key_present <- key[key %in% rownames(integrated)]

FeaturePlot(integrated, features = key_present,
            cells = focal_cells,
            order = TRUE, ncol = 4,
            cols = c("lightgrey", "darkred")) &
  theme(plot.title = element_text(face = "bold.italic", size = 10))

6. DE: Cluster 2 (B-ALL Vγ6Vδ4) vs Cluster 1 (Healthy γδ17)

6.1 Full DE

c1c2 <- subset(integrated, res05 %in% c("1", "2"))
c1c2$cluster_id <- paste0("C", c1c2$res05)
Idents(c1c2) <- "cluster_id"
DefaultAssay(c1c2) <- "originalexp"

de_c2c1 <- FindMarkers(c1c2,
                        ident.1 = "C2",
                        ident.2 = "C1",
                        test.use = "wilcox",
                        min.pct = 0.1,
                        logfc.threshold = 0)
de_c2c1$gene <- rownames(de_c2c1)

cat("Total DE genes (p_adj < 0.05):",
    sum(de_c2c1$p_val_adj < 0.05), "\n")

## Total DE genes (p_adj < 0.05): 473

cat("  C2 (B-ALL) enriched:",
    sum(de_c2c1$p_val_adj < 0.05 & de_c2c1$avg_log2FC > 0.25), "\n")

##   C2 (B-ALL) enriched: 256

cat("  C1 (Healthy) enriched:",
    sum(de_c2c1$p_val_adj < 0.05 & de_c2c1$avg_log2FC < -0.25), "\n")

##   C1 (Healthy) enriched: 215

6.2 Volcano Plot

hl_genes <- c(
  "Rorc", "Sox13", "Maf", "Blk", "Rora", "Il17a", "Il23r", "Ccr6", "Il1r1",
  "Ctla4", "Tgfbr2", "Cd28",
  "Lck", "Itk", "Prkcq", "Skap1", "Fyb",
  "S100a4", "S100a6", "Tmem176a", "Lgals1", "Itgb7", "Fgl2",
  "Cd4", "Cd5", "Cd6", "Cd2", "Nkg7",
  "H2-Eb1", "H2-Ab1", "Cd74",
  "mt-Nd4l", "mt-Co3", "mt-Atp6",
  "Dock2", "Myh9",
  "Il31ra", "Lef1", "Tcf7", "Socs1", "Apoe", "Ccr2",
  "Themis", "Ubash3a"
)

# Flag Il17a separately
de_c2c1 <- de_c2c1 %>%
  mutate(
    sig = case_when(
      gene == "Il17a"                          ~ "Il17a",
      p_val_adj < 0.05 & avg_log2FC > 0.5     ~ "C2: B-ALL ↑",
      p_val_adj < 0.05 & avg_log2FC < -0.5    ~ "C1: Healthy ↑",
      p_val_adj < 0.05                         ~ "Sig (small FC)",
      TRUE ~ "NS"),
    label = ifelse(gene %in% hl_genes |
                     (p_val_adj < 1e-15 & abs(avg_log2FC) > 1.5),
                   gene, NA))

p_volcano <- ggplot(de_c2c1, aes(x = avg_log2FC, y = -log10(p_val_adj), colour = sig)) +
  geom_point(alpha = 0.5, size = 1.5) +
  geom_point(data = de_c2c1 %>% filter(gene == "Il17a"),
             colour = "#FF8C00", size = 5, shape = 18) +
  geom_text_repel(aes(label = label), size = 3, max.overlaps = 40,
                  fontface = "italic", segment.alpha = 0.4) +
  geom_label_repel(data = de_c2c1 %>% filter(gene == "Il17a"),
                   aes(label = gene), size = 4.5,
                   fontface = "bold.italic", colour = "#FF8C00",
                   fill = "white", box.padding = 1,
                   segment.colour = "#FF8C00") +
  scale_colour_manual(values = c("Il17a"          = "#FF8C00",
                                 "C2: B-ALL ↑"    = "#B2182B",
                                 "C1: Healthy ↑"  = "#2166AC",
                                 "Sig (small FC)"  = "grey50",
                                 "NS"              = "grey85")) +
  theme_minimal(base_size = 13) +
  labs(title = "C2 (B-ALL Vγ6Vδ4) vs C1 (Healthy Dural γδ17)",
       x = "log2 FC (+ = B-ALL, − = Healthy)", y = "-log10(adj p)") +
  geom_vline(xintercept = c(-0.5, 0.5), linetype = "dashed", colour = "grey60") +
  geom_hline(yintercept = -log10(0.05), linetype = "dashed", colour = "grey60")

# If Il17a was excluded by FindMarkers, add annotation
if (!"Il17a" %in% de_c2c1$gene) {
  p_volcano <- p_volcano +
    annotate("label", x = 0, y = max(-log10(de_c2c1$p_val_adj), na.rm = TRUE) * 0.95,
             label = "Il17a: 0% in both clusters\n(excluded from DE)",
             fontface = "bold.italic", colour = "#FF8C00",
             fill = "white", size = 4, label.padding = unit(0.4, "lines"))
}
p_volcano

6.3 Top DE Tables

cat("=== C2 (B-ALL) enriched — Top 30 ===\n")

## === C2 (B-ALL) enriched — Top 30 ===

print(de_c2c1 %>%
        filter(avg_log2FC > 0, p_val_adj < 0.05) %>%
        arrange(p_val_adj) %>%
        select(gene, avg_log2FC, pct.1, pct.2, p_val_adj) %>%
        head(30))

##                        gene avg_log2FC pct.1 pct.2    p_val_adj
## Il31ra               Il31ra  4.2252694 0.830 0.040 9.967330e-53
## Cmss1                 Cmss1  3.7900377 0.885 0.149 1.682781e-50
## mt-Nd4l             mt-Nd4l  3.1555432 0.939 0.434 2.200476e-48
## Dock2                 Dock2  2.6407197 0.982 0.739 6.294392e-47
## Xist                   Xist  3.9815863 0.770 0.040 3.003433e-46
## Cdk8                   Cdk8  3.3027898 0.818 0.189 1.973006e-39
## Gphn                   Gphn  3.5192714 0.727 0.076 1.590281e-38
## 5730419F03Rik 5730419F03Rik  4.2238862 0.667 0.032 3.763966e-38
## Il23r                 Il23r  2.6103342 0.867 0.406 1.540150e-34
## Mir142hg           Mir142hg  2.9395175 0.788 0.257 9.138299e-33
## Lars2                 Lars2  2.9794757 0.848 0.446 5.562631e-30
## Arhgap15           Arhgap15  2.1825427 0.909 0.562 7.994757e-30
## Hexb                   Hexb  2.6728994 0.745 0.213 1.006640e-29
## Rpl23a               Rpl23a  0.9943612 0.994 0.988 1.048114e-28
## Camk1d               Camk1d  2.6166027 0.758 0.277 1.666237e-27
## Myh9                   Myh9  1.7966743 0.909 0.775 1.608692e-25
## Il7r                   Il7r  1.5113395 0.976 0.976 5.662621e-24
## St6galnac3       St6galnac3  1.8332846 0.855 0.574 1.065464e-22
## Gclm                   Gclm  2.8428683 0.618 0.145 2.007625e-22
## Hbb-bt               Hbb-bt  1.0763190 0.024 0.538 9.707948e-22
## Hba-a1               Hba-a1  0.4697820 0.036 0.562 2.354802e-21
## mt-Atp8             mt-Atp8  2.2781585 0.715 0.273 2.202245e-20
## Coro1b               Coro1b  3.7199107 0.479 0.064 4.690754e-20
## Evi2b                 Evi2b  4.3399479 0.418 0.028 1.479383e-19
## Airn                   Airn  3.1260996 0.533 0.108 2.696380e-19
## Elmo1                 Elmo1  2.0199716 0.739 0.365 4.390692e-19
## mt-Nd5               mt-Nd5  1.3497075 0.927 0.783 8.379613e-19
## Camk4                 Camk4  2.3543322 0.618 0.205 1.852452e-18
## Ctse                   Ctse  3.1215910 0.448 0.052 4.880938e-18
## Sqstm1               Sqstm1  2.5884546 0.576 0.189 5.254024e-17

cat("\n=== C1 (Healthy) enriched — Top 30 ===\n")

## 
## === C1 (Healthy) enriched — Top 30 ===

print(de_c2c1 %>%
        filter(avg_log2FC < 0, p_val_adj < 0.05) %>%
        arrange(p_val_adj) %>%
        select(gene, avg_log2FC, pct.1, pct.2, p_val_adj) %>%
        head(30))

##              gene   avg_log2FC pct.1 pct.2    p_val_adj
## Hbb-bs     Hbb-bs -0.133013951 0.042 0.936 2.666058e-54
## mt-Atp6   mt-Atp6 -2.006577835 1.000 1.000 1.635392e-53
## mt-Nd3     mt-Nd3 -4.009062761 0.200 0.952 4.511251e-53
## Uba52       Uba52 -2.987354121 0.461 0.976 2.081473e-52
## mt-Co3     mt-Co3 -1.917386816 1.000 1.000 4.611624e-52
## Rps29       Rps29 -2.747396803 0.909 0.996 6.602969e-52
## Rps27       Rps27 -1.608228392 0.994 1.000 1.217417e-51
## Rpl38       Rpl38 -1.936719489 0.958 1.000 1.389371e-51
## Rpl37a     Rpl37a -1.493071620 0.988 1.000 4.338952e-51
## Rps28       Rps28 -1.912679482 0.897 0.996 6.509011e-51
## Rpl39       Rpl39 -1.755326339 0.976 1.000 6.705536e-51
## Malat1     Malat1 -1.816026580 1.000 1.000 2.884447e-50
## mt-Co2     mt-Co2 -1.250937420 0.994 1.000 1.359475e-46
## Rps21       Rps21 -1.344765970 1.000 1.000 1.475885e-46
## Chchd2     Chchd2 -2.495601183 0.455 0.960 1.594588e-46
## Tmsb4x     Tmsb4x -1.159844298 1.000 1.000 2.444912e-45
## mt-Nd4     mt-Nd4 -1.374092339 0.927 1.000 8.098724e-41
## mt-Nd2     mt-Nd2 -1.337974302 0.970 1.000 4.280863e-40
## Rnaset2a Rnaset2a -3.778155326 0.073 0.819 7.772854e-40
## Rpl37       Rpl37 -1.139489136 0.964 1.000 6.814850e-39
## Rpl41       Rpl41 -1.012661754 0.994 1.000 7.182826e-39
## S100a9     S100a9 -4.508994899 0.042 0.747 5.167022e-36
## Rpl36       Rpl36 -0.994472334 0.970 1.000 3.280749e-35
## S100a4     S100a4 -1.176823236 0.927 0.996 2.872959e-33
## Hba-a2     Hba-a2 -0.001965198 0.018 0.675 1.473771e-32
## mt-Cytb   mt-Cytb -0.978964725 0.988 1.000 8.065926e-30
## S100a6     S100a6 -1.142857210 0.945 1.000 2.156443e-29
## Apoe         Apoe -4.207494818 0.042 0.667 1.649745e-28
## Cox7c       Cox7c -1.561414994 0.624 0.956 1.709743e-28
## Rnaset2b Rnaset2b -2.698838246 0.133 0.775 4.794400e-28

7. Understanding C6: The Il17a-Active Cluster

Cluster 6 has the highest Il17a positivity among mixed clusters (14.9%). It contains 61 atlas cells (33 classified as γδIFN) and 26 miR-128a cells (10 Vγ4, 16 Other γδ, zero Vγ6Vδ4). What defines this cluster?

7.1 C6 vs C1 (Healthy γδ17)

c1c6 <- subset(integrated, res05 %in% c("1", "6"))
c1c6$cluster_id <- paste0("C", c1c6$res05)
Idents(c1c6) <- "cluster_id"
DefaultAssay(c1c6) <- "originalexp"

de_c6c1 <- FindMarkers(c1c6,
                        ident.1 = "C6",
                        ident.2 = "C1",
                        test.use = "wilcox",
                        min.pct = 0.1,
                        logfc.threshold = 0.25)
de_c6c1$gene <- rownames(de_c6c1)

cat("=== C6 enriched vs C1 (top 25) ===\n")

## === C6 enriched vs C1 (top 25) ===

print(de_c6c1 %>%
        filter(avg_log2FC > 0) %>% arrange(p_val_adj) %>%
        select(gene, avg_log2FC, pct.1, pct.2, p_val_adj) %>%
        head(25))

##              gene avg_log2FC pct.1 pct.2    p_val_adj
## Lck           Lck   3.805627 0.839 0.108 7.397144e-37
## Fyb           Fyb   3.499028 0.828 0.129 8.687786e-33
## Ms4a4b     Ms4a4b   5.325674 0.667 0.060 2.938818e-30
## Cd28         Cd28   4.926970 0.575 0.028 1.414241e-27
## Cd6           Cd6   5.412829 0.563 0.032 1.380709e-26
## Cd2           Cd2   3.164314 0.736 0.157 4.137607e-23
## Ctla2a     Ctla2a   3.669892 0.644 0.092 1.553687e-22
## Slfn1       Slfn1   4.513577 0.437 0.020 3.823129e-19
## Ms4a6b     Ms4a6b   2.328953 0.839 0.317 3.828266e-19
## Cd4           Cd4   6.848959 0.356 0.004 1.118201e-17
## Nkg7         Nkg7   6.186589 0.402 0.020 2.050248e-17
## Prkch       Prkch   4.315814 0.437 0.032 8.286403e-17
## Cd5           Cd5   3.345765 0.609 0.141 9.278173e-17
## Ctla4       Ctla4   7.946893 0.276 0.000 1.917888e-13
## Gimap4     Gimap4   3.158851 0.621 0.189 6.383826e-13
## Themis     Themis   4.012360 0.368 0.032 8.816432e-13
## Ubash3a   Ubash3a   5.619560 0.287 0.008 2.662243e-12
## Gm8369     Gm8369   7.531245 0.241 0.000 2.743308e-11
## Sh2d1a     Sh2d1a   4.248805 0.310 0.020 3.166790e-11
## Sdcbp2     Sdcbp2   3.844844 0.333 0.028 4.607026e-11
## Tnfsf8     Tnfsf8   3.614154 0.379 0.052 8.264985e-11
## Itga4       Itga4   3.005339 0.506 0.124 3.192302e-10
## S1pr1       S1pr1   4.708223 0.276 0.016 6.676409e-10
## Gimap7     Gimap7   4.066003 0.322 0.036 2.953436e-09
## Arhgef18 Arhgef18   2.922724 0.425 0.084 4.022006e-09

cat("\n=== C1 enriched vs C6 (top 25) ===\n")

## 
## === C1 enriched vs C6 (top 25) ===

print(de_c6c1 %>%
        filter(avg_log2FC < 0) %>% arrange(p_val_adj) %>%
        select(gene, avg_log2FC, pct.1, pct.2, p_val_adj) %>%
        head(25))

##                gene avg_log2FC pct.1 pct.2    p_val_adj
## Blk             Blk -2.2901770 0.287 0.871 1.142737e-17
## S100a6       S100a6 -1.1908354 0.931 1.000 2.585677e-17
## Ckb             Ckb -1.5888129 0.621 0.960 4.409747e-17
## Kcnk1         Kcnk1 -2.9133463 0.161 0.743 5.541100e-16
## S100a4       S100a4 -0.8671809 0.920 0.996 5.917436e-11
## Cd3g           Cd3g -0.8495019 0.977 1.000 4.750882e-10
## Ccr2           Ccr2 -1.0426928 0.793 0.960 2.069459e-09
## Rps29         Rps29 -0.6845094 0.966 0.996 3.788894e-09
## Serpinb1a Serpinb1a -1.1642978 0.621 0.920 4.897184e-09
## Actn2         Actn2 -1.2176766 0.529 0.900 6.558825e-09
## Ndufa4       Ndufa4 -0.8007199 0.862 0.964 5.105627e-07
## Igf1r         Igf1r -1.2416855 0.437 0.803 1.467305e-06
## Ly6g5b       Ly6g5b -1.3708531 0.356 0.735 1.878953e-06
## Chchd2       Chchd2 -0.7297560 0.759 0.960 3.731960e-06
## Tmem176b   Tmem176b -0.7578820 0.816 1.000 4.398266e-06
## Ifngr1       Ifngr1 -0.8123158 0.862 0.976 4.524057e-06
## Auts2         Auts2 -4.8445351 0.023 0.373 9.037673e-06
## S100a11     S100a11 -0.5396069 0.966 1.000 1.095739e-05
## Rpl35         Rpl35 -0.5992748 0.954 0.988 3.156211e-05
## Tmem176a   Tmem176a -0.6893334 0.851 1.000 3.242190e-05
## Rnaset2a   Rnaset2a -1.0822997 0.529 0.819 3.744436e-05
## Krt83         Krt83 -1.7174245 0.276 0.635 5.787020e-05
## Eif1           Eif1 -0.4619675 0.989 1.000 5.835986e-05
## Sox13         Sox13 -2.8087894 0.092 0.426 9.781737e-05
## Maf             Maf -0.7245098 0.897 0.980 1.220542e-04

7.2 C6 vs C2 (B-ALL Vγ6Vδ4)

c2c6 <- subset(integrated, res05 %in% c("2", "6"))
c2c6$cluster_id <- paste0("C", c2c6$res05)
Idents(c2c6) <- "cluster_id"
DefaultAssay(c2c6) <- "originalexp"

de_c6c2 <- FindMarkers(c2c6,
                        ident.1 = "C6",
                        ident.2 = "C2",
                        test.use = "wilcox",
                        min.pct = 0.1,
                        logfc.threshold = 0.25)
de_c6c2$gene <- rownames(de_c6c2)

cat("=== C6 enriched vs C2 (top 25) ===\n")

## === C6 enriched vs C2 (top 25) ===

print(de_c6c2 %>%
        filter(avg_log2FC > 0) %>% arrange(p_val_adj) %>%
        select(gene, avg_log2FC, pct.1, pct.2, p_val_adj) %>%
        head(25))

##              gene avg_log2FC pct.1 pct.2    p_val_adj
## Ms4a4b     Ms4a4b  5.2527614 0.667 0.036 1.560852e-23
## Cd28         Cd28  5.6151524 0.575 0.012 4.897312e-21
## S100a9     S100a9  6.6901555 0.632 0.042 7.774314e-21
## Ctla2a     Ctla2a  4.2792281 0.644 0.048 1.926765e-20
## Fyb           Fyb  2.8147537 0.828 0.164 1.951775e-20
## S100a8     S100a8  6.8388068 0.621 0.036 1.966693e-20
## Lck           Lck  2.4891098 0.839 0.194 7.677000e-20
## Cd6           Cd6  5.1529552 0.563 0.024 5.410181e-19
## mt-Nd3     mt-Nd3  3.6873599 0.736 0.200 5.164725e-18
## Malat1     Malat1  1.9826024 1.000 1.000 1.174518e-16
## Cd2           Cd2  2.7093485 0.736 0.176 6.583270e-16
## Ms4a6b     Ms4a6b  2.1978591 0.839 0.242 1.202890e-15
## AW112010 AW112010  2.0314376 0.943 0.527 6.476605e-15
## Btg1         Btg1  1.9257432 0.908 0.606 1.260936e-14
## mt-Co3     mt-Co3  1.4748593 1.000 1.000 2.449640e-14
## Hba-a2     Hba-a2  0.4681838 0.460 0.018 2.658153e-14
## Rps27rt   Rps27rt  3.0405749 0.598 0.109 6.388324e-14
## Rpl38       Rpl38  1.4590558 0.989 0.958 5.107186e-13
## Rps29       Rps29  2.0628874 0.966 0.909 8.494497e-13
## mt-Co2     mt-Co2  0.9596887 1.000 0.994 1.682837e-12
## mt-Atp6   mt-Atp6  1.5232509 1.000 1.000 2.183368e-12
## Cox7c       Cox7c  1.4332139 0.920 0.624 3.515396e-12
## Rpl37a     Rpl37a  1.1776059 0.989 0.988 4.813872e-12
## Uba52       Uba52  2.3833103 0.816 0.461 5.313406e-12
## Lyz2         Lyz2  3.1809431 0.483 0.048 1.745905e-11

cat("\n=== C2 enriched vs C6 (top 25) ===\n")

## 
## === C2 enriched vs C6 (top 25) ===

print(de_c6c2 %>%
        filter(avg_log2FC < 0) %>% arrange(p_val_adj) %>%
        select(gene, avg_log2FC, pct.1, pct.2, p_val_adj) %>%
        head(25))

##                        gene avg_log2FC pct.1 pct.2    p_val_adj
## Il23r                 Il23r -2.7983167 0.299 0.867 2.022089e-18
## Hba-a1               Hba-a1 -0.7448318 0.483 0.036 1.870307e-12
## Il7r                   Il7r -1.5311672 0.862 0.976 2.371896e-12
## Hbb-bt               Hbb-bt -0.6770794 0.425 0.024 1.475018e-11
## mt-Nd4l             mt-Nd4l -1.4474599 0.609 0.939 1.738369e-11
## Tmem176b           Tmem176b -1.2636258 0.816 0.994 3.121738e-11
## Ckb                     Ckb -1.5417968 0.621 0.897 4.317987e-11
## Cmss1                 Cmss1 -0.2903581 0.345 0.885 5.378058e-11
## Blk                     Blk -2.4534944 0.287 0.758 8.691235e-11
## Igf1r                 Igf1r -1.8275537 0.437 0.830 9.997474e-11
## Kcnk1                 Kcnk1 -2.8399486 0.161 0.648 3.795247e-10
## Gstp1                 Gstp1 -2.0191990 0.322 0.782 1.154236e-09
## Cdk8                   Cdk8 -2.9663189 0.333 0.818 1.582328e-09
## Il31ra               Il31ra -1.7419012 0.287 0.830 2.475198e-09
## Cd247                 Cd247 -2.0980682 0.448 0.752 1.249868e-08
## Rpl23a               Rpl23a -0.6863286 0.977 0.994 1.768788e-08
## St6galnac3       St6galnac3 -1.4652802 0.563 0.855 4.440787e-08
## Xist                   Xist -1.3146175 0.253 0.770 1.235669e-07
## Ramp1                 Ramp1 -0.9918688 0.736 0.952 1.999508e-07
## 5730419F03Rik 5730419F03Rik -1.9310500 0.195 0.667 2.126169e-07
## Gpx1                   Gpx1 -1.3889846 0.736 0.897 3.068429e-07
## Hexb                   Hexb -1.4893979 0.299 0.745 4.105283e-07
## Cd82                   Cd82 -1.0914290 0.782 0.939 5.347737e-07
## Clnk                   Clnk -1.8558711 0.207 0.630 9.130585e-07
## Cnbp                   Cnbp -0.8602864 0.897 0.964 6.162425e-06

7.3 Three-Way Comparison: C1 vs C2 vs C6

programme_genes <- list(
  "γδ17 TFs"           = c("Rorc", "Sox13", "Maf", "Blk", "Rora", "Zbtb16"),
  "IL-17 Effectors"    = c("Il17a", "Il17f", "Il22", "Il23r", "Il1r1", "Ccr6"),
  "γδIFN"              = c("Tbx21", "Eomes", "Ifng", "Gzmb", "Nkg7", "Cd27"),
  "Checkpoints"        = c("Ctla4", "Tgfbr2", "Cd28", "Cd5", "Cd6"),
  "TCR Signalling"     = c("Lck", "Itk", "Prkcq", "Skap1", "Fyb", "Cd2"),
  "Tissue Residency"   = c("S100a4", "S100a6", "S100a10", "S100a11",
                            "Tmem176a", "Lgals1", "Crip1", "Itgb7", "Fgl2"),
  "MHC II"             = c("H2-Eb1", "H2-Ab1", "Cd74"),
  "Migration"          = c("Dock2", "Myh9", "Coro1b", "Myo1f", "Ccr2"),
  "Survival"           = c("Socs1", "Mcl1", "Bcl2a1b", "Bcl2a1d"),
  "Wnt / Stemness"     = c("Lef1", "Tcf7"),
  "Notable"            = c("Il31ra", "Apoe", "Cd4", "Themis", "Ubash3a")
)

all_prog <- unlist(programme_genes) %>% unique()
all_prog_present <- all_prog[all_prog %in% rownames(integrated)]

gene_to_cat <- data.frame(gene = character(), Category = character(),
                           stringsAsFactors = FALSE)
for (cat_name in names(programme_genes)) {
  genes <- programme_genes[[cat_name]]
  genes <- genes[genes %in% all_prog_present]
  if (length(genes) > 0) {
    gene_to_cat <- rbind(gene_to_cat,
                         data.frame(gene = genes, Category = cat_name,
                                    stringsAsFactors = FALSE))
  }
}
gene_to_cat <- gene_to_cat[!duplicated(gene_to_cat$gene), ]

# Average expression: cluster × dataset for C0, C1, C2, C6
focal_sub <- subset(integrated, res05 %in% c("0", "1", "2", "6"))
focal_sub$hm_group <- paste0("C", focal_sub$res05, "_",
                              ifelse(focal_sub$dataset == "Dural_Atlas",
                                     "Atlas", "miR128"))
DefaultAssay(focal_sub) <- "originalexp"

# Filter groups with very few cells
grp_counts <- table(focal_sub$hm_group)
valid_groups <- names(grp_counts[grp_counts >= 5])
focal_filt <- subset(focal_sub, hm_group %in% valid_groups)

avg <- AverageExpression(focal_filt, features = gene_to_cat$gene,
                         group.by = "hm_group", return.seurat = FALSE)
avg_mat <- as.matrix(avg[[1]])
avg_sc  <- t(scale(t(avg_mat)))

row_annot <- data.frame(Category = gene_to_cat$Category,
                        row.names = gene_to_cat$gene)

col_annot <- data.frame(
  Dataset = ifelse(grepl("Atlas", colnames(avg_sc)), "Atlas", "miR-128a"),
  Cluster = gsub("-.*", "", colnames(avg_sc)),
  row.names = colnames(avg_sc)
)

cluster_levels <- unique(col_annot$Cluster)
all_cl_cols <- c("C0" = "#FF6600", "C1" = "#2166AC", "C2" = "#B2182B",
                 "C6" = "#228B22")
cl_cols_use <- all_cl_cols[names(all_cl_cols) %in% cluster_levels]

cat_cols <- c("γδ17 TFs"         = "#CC0000",
              "IL-17 Effectors"  = "#FF6600",
              "γδIFN"            = "#0066CC",
              "Checkpoints"      = "#800080",
              "TCR Signalling"   = "#FF69B4",
              "Tissue Residency" = "#228B22",
              "MHC II"           = "#DAA520",
              "Migration"        = "#00CED1",
              "Survival"         = "#8B4513",
              "Wnt / Stemness"   = "#FF4500",
              "Notable"          = "grey50")

annot_colors <- list(
  Dataset  = c("Atlas" = "#2166AC", "miR-128a" = "#B2182B"),
  Cluster  = cl_cols_use,
  Category = cat_cols[names(cat_cols) %in% unique(gene_to_cat$Category)]
)

pheatmap(avg_sc,
         annotation_col = col_annot,
         annotation_row = row_annot,
         annotation_colors = annot_colors,
         cluster_cols = TRUE, cluster_rows = FALSE,
         color = colorRampPalette(c("#2166AC", "white", "#B2182B"))(100),
         main = "Biological Programmes — C0, C1, C2, C6\n(split by dataset)",
         fontsize_row = 8, fontsize_col = 9)

7.4 Per-Gene Violin Plots — Three-Way

focal_sub$group <- paste0("C", focal_sub$res05)

highlight_genes <- c("Maf", "Rorc", "Il17a", "Il23r", "Ccr6",
                      "Ctla4", "Tgfbr2", "Lck",
                      "S100a4", "Tmem176a", "H2-Eb1",
                      "Tbx21", "Nkg7", "Gzmb",
                      "Socs1", "Dock2")
highlight_present <- highlight_genes[highlight_genes %in% rownames(focal_sub)]

cl_cols <- c("C0" = "#FF6600", "C1" = "#2166AC",
             "C2" = "#B2182B", "C6" = "#228B22")

VlnPlot(focal_sub, features = highlight_present,
        group.by = "group", pt.size = 0, ncol = 4,
        cols = cl_cols) &
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 9),
        plot.title = element_text(face = "bold.italic"))

7.5 C6 Vγ4 vs C2 Vγ6Vδ4 — Within miR-128a

Why can Vγ4 in C6 produce Il17a but Vγ6Vδ4 in C2 cannot?

vg4_c6 <- WhichCells(integrated,
                      expression = dataset == "miR128a_CNS" &
                        cell_annotation == "Vγ4" &
                        res05 == "6")
vg6_c2 <- WhichCells(integrated,
                      expression = dataset == "miR128a_CNS" &
                        cell_annotation == "Vγ6Vδ4" &
                        res05 == "2")

cat("Vγ4 in C6:", length(vg4_c6), "\n")

## Vγ4 in C6: 10

cat("Vγ6Vδ4 in C2:", length(vg6_c2), "\n")

## Vγ6Vδ4 in C2: 104

if (length(vg4_c6) >= 5 & length(vg6_c2) >= 10) {
  comp_vg <- subset(integrated, cells = c(vg4_c6, vg6_c2))
  comp_vg$comparison <- ifelse(colnames(comp_vg) %in% vg4_c6,
                                "Vg4_C6", "Vg6_C2")
  Idents(comp_vg) <- "comparison"
  DefaultAssay(comp_vg) <- "originalexp"
  
  de_vg4c6_vg6c2 <- FindMarkers(comp_vg,
                                  ident.1 = "Vg4_C6",
                                  ident.2 = "Vg6_C2",
                                  test.use = "wilcox",
                                  min.pct = 0.05,
                                  logfc.threshold = 0)
  de_vg4c6_vg6c2$gene <- rownames(de_vg4c6_vg6c2)
  
  cat("\n--- Vγ4 C6 enriched (top 20) ---\n")
  print(de_vg4c6_vg6c2 %>%
          filter(avg_log2FC > 0) %>% arrange(p_val_adj) %>%
          select(gene, avg_log2FC, pct.1, pct.2, p_val_adj) %>%
          head(20))
  
  cat("\n--- Vγ6Vδ4 C2 enriched (top 20) ---\n")
  print(de_vg4c6_vg6c2 %>%
          filter(avg_log2FC < 0) %>% arrange(p_val_adj) %>%
          select(gene, avg_log2FC, pct.1, pct.2, p_val_adj) %>%
          head(20))
} else {
  cat("Not enough cells for formal DE (Vg4_C6 n=", length(vg4_c6), ")\n")
}

## 
## --- Vγ4 C6 enriched (top 20) ---
##                        gene avg_log2FC pct.1 pct.2    p_val_adj
## Crtam                 Crtam   7.609520   0.6 0.000 1.483309e-11
## Padi2                 Padi2   3.920143   0.8 0.038 2.726767e-09
## Lck                     Lck   4.811873   0.8 0.058 3.060547e-08
## Nkg7                   Nkg7   8.347140   0.5 0.010 5.755093e-07
## Themis               Themis   6.021974   0.5 0.010 6.859926e-07
## Cd28                   Cd28   7.348822   0.4 0.000 1.403877e-06
## Cd40lg               Cd40lg   7.158713   0.4 0.000 1.403877e-06
## S1pr1                 S1pr1   7.654367   0.4 0.000 1.403877e-06
## Klf2                   Klf2   6.583317   0.4 0.010 1.555741e-04
## Lgals3               Lgals3   7.086169   0.4 0.010 1.555741e-04
## Ms4a6b               Ms4a6b   2.612150   1.0 0.192 1.669897e-04
## Ctla2a               Ctla2a   4.550009   0.4 0.010 2.563270e-04
## Prkch                 Prkch   4.843005   0.4 0.010 2.563270e-04
## Plac8                 Plac8   6.852715   0.3 0.000 3.880605e-04
## Il12rb2             Il12rb2   7.826460   0.3 0.000 3.880605e-04
## Klrb1c               Klrb1c   7.230640   0.3 0.000 3.880605e-04
## Mmaa                   Mmaa   6.231869   0.3 0.000 3.880605e-04
## 9330111N05Rik 9330111N05Rik   6.151617   0.3 0.000 3.880605e-04
## Galnt10             Galnt10   4.510244   0.5 0.029 4.619156e-04
## Atp8b4               Atp8b4   2.901499   0.9 0.183 8.692216e-04
## 
## --- Vγ6Vδ4 C2 enriched (top 20) ---
##              gene avg_log2FC pct.1 pct.2 p_val_adj
## Ckb           Ckb -2.1473781   0.7 0.962 0.1541904
## Cd247       Cd247 -2.9528728   0.3 0.875 0.9488665
## Kcnk1       Kcnk1 -5.0768837   0.0 0.702 1.0000000
## Il23r       Il23r -1.6414271   0.8 0.942 1.0000000
## Gstp1       Gstp1 -2.2955890   0.3 0.865 1.0000000
## Ptprcap   Ptprcap -0.9771990   1.0 0.971 1.0000000
## Ptma         Ptma -0.9740465   1.0 1.000 1.0000000
## Tmem176b Tmem176b -1.3109573   0.7 1.000 1.0000000
## Ppia         Ppia -0.8445177   0.9 0.990 1.0000000
## Rabac1     Rabac1 -1.1801628   0.8 0.952 1.0000000
## Cd3g         Cd3g -0.6511260   1.0 1.000 1.0000000
## Ifngr1     Ifngr1 -1.3457286   0.9 0.990 1.0000000
## Usp3         Usp3 -2.4020745   0.2 0.702 1.0000000
## Rpl36a     Rpl36a -1.2928735   0.9 0.923 1.0000000
## Eif1         Eif1 -0.6869980   1.0 1.000 1.0000000
## Hsp90ab1 Hsp90ab1 -0.9273860   0.9 0.990 1.0000000
## Dnaja1     Dnaja1 -1.7360232   0.3 0.769 1.0000000
## Sox13       Sox13 -4.0684603   0.0 0.471 1.0000000
## Dek           Dek -4.3009624   0.0 0.471 1.0000000
## Blk           Blk -1.6618592   0.5 0.808 1.0000000

8. DE: Tumour Bias in Focal Clusters

8.1 Functional Class Distribution

fc_cols <- c("Pro-tumour biased"  = "#B2182B",
             "Anti-tumour biased" = "#2166AC",
             "Intermediate"       = "#FDB863")

mir128_focal <- integrated@meta.data %>%
  filter(dataset == "miR128a_CNS", res05 %in% c("1", "2", "6"))

cat("Functional class by cluster (miR-128a only):\n")

## Functional class by cluster (miR-128a only):

print(table(mir128_focal$res05, mir128_focal$gdt_functional_class))

##    
##     Anti-tumour biased Intermediate Pro-tumour biased
##   0                  0            0                 0
##   1                  0            0                10
##   2                  1            0               156
##   3                  0            0                 0
##   4                  0            0                 0
##   5                  0            0                 0
##   6                  6            1                19
##   7                  0            0                 0
##   8                  0            0                 0

p1 <- mir128_focal %>%
  ggplot(aes(x = paste0("C", res05), fill = gdt_functional_class)) +
  geom_bar(position = "fill") +
  scale_fill_manual(values = fc_cols) +
  theme_minimal(base_size = 13) +
  labs(title = "Tumour Bias: C1 vs C2 vs C6",
       x = "Cluster", y = "Proportion")

p2 <- mir128_focal %>%
  ggplot(aes(x = paste0("C", res05), fill = cell_annotation)) +
  geom_bar(position = "fill") +
  theme_minimal(base_size = 13) +
  labs(title = "Vγ Annotation: C1 vs C2 vs C6",
       x = "Cluster", y = "Proportion")

p1 + p2

9. Pathway Analysis — C2 (B-ALL) vs C1 (Healthy)

9.1 GO Enrichment — C2 Up

sig_up <- de_c2c1 %>%
  filter(p_val_adj < 0.05, avg_log2FC > 0.25) %>%
  pull(gene)
sig_down <- de_c2c1 %>%
  filter(p_val_adj < 0.05, avg_log2FC < -0.25) %>%
  pull(gene)
bg_genes <- de_c2c1$gene

cat("C2 (B-ALL) up:", length(sig_up), "| C1 (Healthy) up:", length(sig_down), "\n")

## C2 (B-ALL) up: 256 | C1 (Healthy) up: 215

up_entrez <- bitr(sig_up, fromType = "SYMBOL", toType = "ENTREZID",
                  OrgDb = org.Mm.eg.db)
bg_entrez <- bitr(bg_genes, fromType = "SYMBOL", toType = "ENTREZID",
                  OrgDb = org.Mm.eg.db)

if (nrow(up_entrez) >= 5) {
  go_up <- enrichGO(gene          = up_entrez$ENTREZID,
                    universe      = bg_entrez$ENTREZID,
                    OrgDb         = org.Mm.eg.db,
                    ont           = "BP",
                    pAdjustMethod = "BH",
                    pvalueCutoff  = 0.05,
                    qvalueCutoff  = 0.1,
                    readable      = TRUE)
  
  if (nrow(as.data.frame(go_up)) > 0) {
    cat("\nTop GO BP — C2 up:\n")
    print(head(as.data.frame(go_up) %>%
                 select(Description, GeneRatio, p.adjust, Count), 20))
  }
}

## 
## Top GO BP — C2 up:
##                                                 Description GeneRatio
## GO:0046629                    gamma-delta T cell activation     6/241
## GO:0042492               gamma-delta T cell differentiation     5/241
## GO:0045586 regulation of gamma-delta T cell differentiation     5/241
## GO:0046643      regulation of gamma-delta T cell activation     5/241
## GO:0030097                                      hemopoiesis    39/241
## GO:0045321                             leukocyte activation    34/241
##              p.adjust Count
## GO:0046629 0.00708147     6
## GO:0042492 0.01078133     5
## GO:0045586 0.01078133     5
## GO:0046643 0.01078133     5
## GO:0030097 0.01078133    39
## GO:0045321 0.04743301    34

if (exists("go_up") && nrow(as.data.frame(go_up)) > 0) {
  dotplot(go_up, showCategory = 20) +
    ggtitle("GO BP — C2 (B-ALL) Upregulated")
}

9.2 GO Enrichment — C1 Up

down_entrez <- bitr(sig_down, fromType = "SYMBOL", toType = "ENTREZID",
                    OrgDb = org.Mm.eg.db)

if (nrow(down_entrez) >= 5) {
  go_down <- enrichGO(gene          = down_entrez$ENTREZID,
                      universe      = bg_entrez$ENTREZID,
                      OrgDb         = org.Mm.eg.db,
                      ont           = "BP",
                      pAdjustMethod = "BH",
                      pvalueCutoff  = 0.05,
                      qvalueCutoff  = 0.1,
                      readable      = TRUE)
  
  if (nrow(as.data.frame(go_down)) > 0) {
    cat("Top GO BP — C1 up:\n")
    print(head(as.data.frame(go_down) %>%
                 select(Description, GeneRatio, p.adjust, Count), 20))
  }
}

## Top GO BP — C1 up:
##                                                                                  Description
## GO:0002181                                                           cytoplasmic translation
## GO:0140236                                                         translation at presynapse
## GO:0140241                                                            translation at synapse
## GO:0140242                                                        translation at postsynapse
## GO:0006412                                                                       translation
## GO:0006119                                                         oxidative phosphorylation
## GO:0061844           antimicrobial humoral immune response mediated by antimicrobial peptide
## GO:0009060                                                               aerobic respiration
## GO:0019730                                                    antimicrobial humoral response
## GO:0048002                            antigen processing and presentation of peptide antigen
## GO:0097250                                                mitochondrial respirasome assembly
## GO:0045333                                                              cellular respiration
## GO:0006091                                    generation of precursor metabolites and energy
## GO:0009145                               purine nucleoside triphosphate biosynthetic process
## GO:0009206                           purine ribonucleoside triphosphate biosynthetic process
## GO:0006959                                                           humoral immune response
## GO:0042274                                                ribosomal small subunit biogenesis
## GO:0033108                                  mitochondrial respiratory chain complex assembly
## GO:0019886 antigen processing and presentation of exogenous peptide antigen via MHC class II
## GO:0031640                                              killing of cells of another organism
##            GeneRatio     p.adjust Count
## GO:0002181    38/198 7.157658e-22    38
## GO:0140236    22/198 3.756762e-18    22
## GO:0140241    22/198 3.756762e-18    22
## GO:0140242    22/198 3.756762e-18    22
## GO:0006412    49/198 3.387262e-12    49
## GO:0006119    14/198 2.879681e-04    14
## GO:0061844     6/198 2.879681e-04     6
## GO:0009060    15/198 1.633736e-03    15
## GO:0019730     6/198 1.742048e-03     6
## GO:0048002     7/198 4.109406e-03     7
## GO:0097250    10/198 1.075678e-02    10
## GO:0045333    15/198 1.640694e-02    15
## GO:0006091    20/198 2.259944e-02    20
## GO:0009145     9/198 2.265209e-02     9
## GO:0009206     9/198 2.265209e-02     9
## GO:0006959     7/198 2.265209e-02     7
## GO:0042274    11/198 2.333490e-02    11
## GO:0033108     9/198 2.341364e-02     9
## GO:0019886     4/198 2.341364e-02     4
## GO:0031640     5/198 2.341364e-02     5

if (exists("go_down") && nrow(as.data.frame(go_down)) > 0) {
  dotplot(go_down, showCategory = 20) +
    ggtitle("GO BP — C1 (Healthy) Upregulated")
}

9.3 GSEA Hallmark

ranked <- de_c2c1 %>%
  filter(!is.na(p_val_adj), p_val_adj > 0) %>%
  mutate(rank_score = sign(avg_log2FC) * -log10(p_val_adj)) %>%
  arrange(desc(rank_score))

ranked_entrez <- bitr(ranked$gene, fromType = "SYMBOL", toType = "ENTREZID",
                      OrgDb = org.Mm.eg.db)
ranked <- ranked %>% inner_join(ranked_entrez, by = c("gene" = "SYMBOL"))
gene_ranks <- setNames(ranked$rank_score, ranked$ENTREZID)
gene_ranks <- sort(gene_ranks, decreasing = TRUE)

h_sets <- msigdbr(species = "Mus musculus", category = "H") %>%
  select(gs_name, entrez_gene) %>%
  group_by(gs_name) %>%
  summarise(entrez_gene = list(as.character(entrez_gene)), .groups = "drop") %>%
  deframe()

gsea_h <- fgsea(pathways = h_sets,
                stats    = gene_ranks,
                minSize  = 10,
                maxSize  = 500)

gsea_h_sig <- gsea_h %>%
  dplyr::filter(padj < 0.1) %>%
  arrange(NES) %>%
  mutate(pathway = str_replace_all(pathway, "HALLMARK_", ""))

cat("Significant Hallmark pathways (padj < 0.1):\n")

## Significant Hallmark pathways (padj < 0.1):

print(gsea_h_sig %>% select(pathway, NES, padj, size))

## Empty data.table (0 rows and 4 cols): pathway,NES,padj,size

if (nrow(gsea_h_sig) > 0) {
  gsea_h_sig %>%
    mutate(direction = ifelse(NES > 0, "C2: B-ALL ↑", "C1: Healthy ↑")) %>%
    ggplot(aes(x = NES, y = reorder(pathway, NES), fill = direction)) +
    geom_col() +
    scale_fill_manual(values = c("C2: B-ALL ↑" = "#B2182B",
                                 "C1: Healthy ↑" = "#2166AC")) +
    theme_minimal(base_size = 12) +
    labs(title = "GSEA Hallmark — C2 (B-ALL) vs C1 (Healthy)",
         x = "Normalised Enrichment Score", y = NULL, fill = NULL)
}

9.4 GSEA KEGG

kegg_sets <- msigdbr(species = "Mus musculus", category = "C2",
                     subcategory = "CP:KEGG_LEGACY") %>%
  select(gs_name, entrez_gene) %>%
  group_by(gs_name) %>%
  summarise(entrez_gene = list(as.character(entrez_gene)), .groups = "drop") %>%
  deframe()

gsea_kegg <- fgsea(pathways = kegg_sets,
                   stats    = gene_ranks,
                   minSize  = 10,
                   maxSize  = 500)

gsea_kegg_sig <- gsea_kegg %>%
  dplyr::filter(padj < 0.1) %>%
  arrange(NES) %>%
  mutate(pathway = str_replace_all(pathway, "KEGG_", ""))

cat("Significant KEGG pathways (padj < 0.1):\n")

## Significant KEGG pathways (padj < 0.1):

print(gsea_kegg_sig %>% select(pathway, NES, padj, size))

## Empty data.table (0 rows and 4 cols): pathway,NES,padj,size

if (nrow(gsea_kegg_sig) > 0) {
  gsea_kegg_sig %>%
    mutate(direction = ifelse(NES > 0, "C2: B-ALL ↑", "C1: Healthy ↑")) %>%
    ggplot(aes(x = NES, y = reorder(pathway, NES), fill = direction)) +
    geom_col() +
    scale_fill_manual(values = c("C2: B-ALL ↑" = "#B2182B",
                                 "C1: Healthy ↑" = "#2166AC")) +
    theme_minimal(base_size = 12) +
    labs(title = "GSEA KEGG — C2 (B-ALL) vs C1 (Healthy)",
         x = "Normalised Enrichment Score", y = NULL, fill = NULL)
}

9.5 GSEA Immunological Signatures

c7_sets <- msigdbr(species = "Mus musculus", category = "C7",
                   subcategory = "IMMUNESIGDB") %>%
  select(gs_name, entrez_gene) %>%
  group_by(gs_name) %>%
  summarise(entrez_gene = list(as.character(entrez_gene)), .groups = "drop") %>%
  deframe()

if (length(c7_sets) > 0) {
  gsea_c7 <- fgsea(pathways = c7_sets,
                   stats    = gene_ranks,
                   minSize  = 10,
                   maxSize  = 500)
  
  gsea_c7_top <- gsea_c7 %>%
    dplyr::filter(padj < 0.05) %>%
    arrange(NES)
  
  cat("Significant immunological signatures (padj < 0.05):",
      nrow(gsea_c7_top), "\n\n")
  
  if (nrow(gsea_c7_top) > 0) {
    cat("Top 10 enriched in C2 (B-ALL):\n")
    print(gsea_c7_top %>% dplyr::filter(NES > 0) %>% tail(10) %>%
            select(pathway, NES, padj, size))
    cat("\nTop 10 enriched in C1 (Healthy):\n")
    print(gsea_c7_top %>% dplyr::filter(NES < 0) %>% head(10) %>%
            select(pathway, NES, padj, size))
  }
}

## Significant immunological signatures (padj < 0.05): 0

10. Other Populations

10.1 Within miR-128a — Vγ6Vδ4 vs Vγ4 vs Other γδ

mir128_sub <- subset(integrated, dataset == "miR128a_CNS")
Idents(mir128_sub) <- "cell_annotation"
DefaultAssay(mir128_sub) <- "originalexp"

mir128_markers <- FindAllMarkers(mir128_sub, only.pos = TRUE,
                                 min.pct = 0.2, logfc.threshold = 0.25,
                                 test.use = "wilcox")

cat("Top markers per miR-128a population:\n")

## Top markers per miR-128a population:

print(mir128_markers %>%
        group_by(cluster) %>%
        slice_max(avg_log2FC, n = 15) %>%
        select(cluster, gene, avg_log2FC, pct.1, pct.2, p_val_adj),
      n = 45)

## # A tibble: 45 × 6
## # Groups:   cluster [3]
##    cluster  gene     avg_log2FC pct.1 pct.2 p_val_adj
##    <fct>    <chr>         <dbl> <dbl> <dbl>     <dbl>
##  1 Vγ4      Crtam          6.42 0.281 0.003  4.39e-18
##  2 Vγ4      Plekhg1        3.25 0.406 0.07   4.08e- 6
##  3 Vγ4      Cd40lg         3.24 0.219 0.035  5.86e- 2
##  4 Vγ4      S1pr1          2.58 0.25  0.04   1.80e- 2
##  5 Vγ4      Slfn1          2.55 0.344 0.056  1.29e- 4
##  6 Vγ4      Sco1           2.29 0.219 0.064  1   e+ 0
##  7 Vγ4      Syt11          2.25 0.25  0.086  1   e+ 0
##  8 Vγ4      Csnk1e         2.22 0.25  0.078  1   e+ 0
##  9 Vγ4      She            2.16 0.281 0.051  2.33e- 2
## 10 Vγ4      P2rx7          2.13 0.281 0.083  1   e+ 0
## 11 Vγ4      Qpct           2.11 0.406 0.086  1.02e- 3
## 12 Vγ4      Itgae          1.97 0.219 0.056  1   e+ 0
## 13 Vγ4      Plagl2         1.97 0.281 0.118  1   e+ 0
## 14 Vγ4      Zfp410         1.94 0.344 0.139  1   e+ 0
## 15 Vγ4      Ifi27l2a       1.92 0.281 0.08   1   e+ 0
## 16 Other γδ Ptgir          6.65 0.214 0      1.85e- 5
## 17 Other γδ Kit            6.36 0.361 0.007  5.83e-12
## 18 Other γδ Plcl1          5.80 0.369 0.013  6.10e-12
## 19 Other γδ Dach2          5.31 0.286 0.013  1.24e- 7
## 20 Other γδ Trf            4.90 0.23  0.013  4.31e- 5
## 21 Other γδ Prnp           4.89 0.282 0.013  2.42e- 7
## 22 Other γδ Gzma           4.69 0.214 0.013  2.68e- 4
## 23 Other γδ Ctla2a         4.20 0.456 0.072  3.43e-12
## 24 Other γδ Dapl1          4.19 0.27  0.072  4.11e- 3
## 25 Other γδ Fcer1g         4.15 0.246 0.02   3.58e- 5
## 26 Other γδ Ccr8           4.15 0.242 0.02   6.14e- 5
## 27 Other γδ Il1rl1         4.07 0.46  0.033  6.94e-15
## 28 Other γδ Litaf          4.01 0.202 0.013  9.73e- 4
## 29 Other γδ Cd81           3.98 0.552 0.033  2.95e-20
## 30 Other γδ Nrgn           3.89 0.417 0.026  7.28e-13
## 31 Vγ6Vδ4   Lrrc17         3.61 0.289 0.021  9.25e-12
## 32 Vγ6Vδ4   Tdrd9          3.21 0.273 0.042  2.29e- 7
## 33 Vγ6Vδ4   Sox13          3.12 0.479 0.06   9.45e-19
## 34 Vγ6Vδ4   Fstl4          3.07 0.264 0.049  3.18e- 6
## 35 Vγ6Vδ4   Ablim3         2.85 0.405 0.074  4.57e-12
## 36 Vγ6Vδ4   Blk            2.80 0.81  0.19   5.97e-31
## 37 Vγ6Vδ4   Prss12         2.72 0.207 0.049  1.03e- 2
## 38 Vγ6Vδ4   Zfp462         2.67 0.446 0.085  2.87e-13
## 39 Vγ6Vδ4   Kcnk1          2.66 0.702 0.127  5.68e-27
## 40 Vγ6Vδ4   Tns4           2.59 0.455 0.085  9.65e-14
## 41 Vγ6Vδ4   Rapgef4        2.57 0.207 0.081  1   e+ 0
## 42 Vγ6Vδ4   Il1r1          2.53 0.752 0.183  1.40e-25
## 43 Vγ6Vδ4   Sdc1           2.53 0.339 0.074  7.09e- 8
## 44 Vγ6Vδ4   Irak3          2.43 0.364 0.092  8.29e- 8
## 45 Vγ6Vδ4   Rhobtb1        2.36 0.231 0.063  8.94e- 3

10.2 Vγ6Vδ4 in C1 vs C2 — Healthy-Like vs TME-Shifted

mir128_vg6 <- integrated@meta.data %>%
  filter(dataset == "miR128a_CNS", cell_annotation == "Vγ6Vδ4")

cat("Vγ6Vδ4 across clusters:\n")

## Vγ6Vδ4 across clusters:

print(table(mir128_vg6$res05))

## 
##   0   1   2   3   4   5   6   7   8 
##   0   8 104   1   0   1   0   1   6

n_c1 <- sum(mir128_vg6$res05 == "1")
n_c2 <- sum(mir128_vg6$res05 == "2")
cat("\nVγ6Vδ4 in C1:", n_c1, "| C2:", n_c2, "\n")

## 
## Vγ6Vδ4 in C1: 8 | C2: 104

if (n_c1 >= 5 & n_c2 >= 10) {
  vg6_cells <- rownames(mir128_vg6)
  vg6_c1c2  <- subset(integrated, cells = vg6_cells[mir128_vg6$res05 %in% c("1", "2")])
  vg6_c1c2$vg6_cluster <- paste0("Vg6_C", vg6_c1c2$res05)
  Idents(vg6_c1c2) <- "vg6_cluster"
  DefaultAssay(vg6_c1c2) <- "originalexp"
  
  de_vg6_split <- FindMarkers(vg6_c1c2,
                               ident.1 = "Vg6_C1",
                               ident.2 = "Vg6_C2",
                               test.use = "wilcox",
                               min.pct = 0.05,
                               logfc.threshold = 0)
  de_vg6_split$gene <- rownames(de_vg6_split)
  
  cat("\n--- Vγ6 in C1 (healthy-like) enriched ---\n")
  print(de_vg6_split %>%
          filter(avg_log2FC > 0) %>% arrange(p_val_adj) %>%
          select(gene, avg_log2FC, pct.1, pct.2, p_val_adj) %>%
          head(20))
  
  cat("\n--- Vγ6 in C2 (TME-shifted) enriched ---\n")
  print(de_vg6_split %>%
          filter(avg_log2FC < 0) %>% arrange(p_val_adj) %>%
          select(gene, avg_log2FC, pct.1, pct.2, p_val_adj) %>%
          head(20))
} else {
  cat("Too few Vγ6Vδ4 in C1 (n =", n_c1, ") for formal DE.\n")
}

## 
## --- Vγ6 in C1 (healthy-like) enriched ---
##                        gene avg_log2FC pct.1 pct.2    p_val_adj
## Nefh                   Nefh   6.111371 0.375 0.000 6.717679e-06
## Gm15208             Gm15208   3.223486 0.625 0.048 2.053573e-03
## Tsacc                 Tsacc   4.427477 0.375 0.010 2.712658e-03
## Cyp2s1               Cyp2s1   5.906438 0.250 0.000 6.886076e-03
## Ybey                   Ybey   5.658010 0.250 0.000 6.886076e-03
## 2010320M18Rik 2010320M18Rik   5.858292 0.250 0.000 6.886076e-03
## Mmaa                   Mmaa   6.274597 0.250 0.000 6.886076e-03
## Rps27rt             Rps27rt   2.968251 0.750 0.115 3.511559e-02
## Cic                     Cic   3.394744 0.500 0.058 3.331503e-01
## Apbb2                 Apbb2   4.304799 0.375 0.029 4.731954e-01
## Dhx33                 Dhx33   3.534999 0.375 0.029 6.935122e-01
## Ddx56                 Ddx56   2.581080 0.625 0.096 7.185535e-01
## Tfdp1                 Tfdp1   3.643213 0.375 0.029 7.864949e-01
## Pea15a               Pea15a   2.170751 0.875 0.212 9.353099e-01
## Carf                   Carf   5.325514 0.250 0.010 9.828880e-01
## Zfat                   Zfat   4.746747 0.250 0.010 9.828880e-01
## Ears2                 Ears2   4.729501 0.250 0.010 1.000000e+00
## Armc5                 Armc5   4.126680 0.250 0.010 1.000000e+00
## Mef2b                 Mef2b   4.488093 0.250 0.010 1.000000e+00
## Galns                 Galns   4.557947 0.250 0.010 1.000000e+00
## 
## --- Vγ6 in C2 (TME-shifted) enriched ---
##          gene avg_log2FC pct.1 pct.2 p_val_adj
## Ywhaz   Ywhaz -1.0824174 0.875 0.923         1
## Rpl9     Rpl9 -0.6210202 1.000 1.000         1
## Rer1     Rer1 -3.6527053 0.000 0.462         1
## Mdh1     Mdh1 -2.6016478 0.125 0.548         1
## Rps11   Rps11 -0.6068689 1.000 1.000         1
## Pik3r1 Pik3r1 -2.1721436 0.375 0.625         1
## Ndufa6 Ndufa6 -1.6753736 0.250 0.683         1
## Sbds     Sbds -3.7261028 0.000 0.433         1
## Zranb2 Zranb2 -2.8936144 0.125 0.500         1
## Rps24   Rps24 -0.4561261 1.000 1.000         1
## Nap1l4 Nap1l4 -1.4993845 0.250 0.673         1
## Txn1     Txn1 -2.2143618 0.250 0.558         1
## Cd47     Cd47 -1.5555260 0.375 0.673         1
## Gimap1 Gimap1 -1.4437220 0.500 0.721         1
## Eif1     Eif1 -0.5506942 1.000 1.000         1
## Exoc3   Exoc3 -2.2798379 0.125 0.490         1
## Etfa     Etfa -3.2474239 0.000 0.375         1
## Otud5   Otud5 -3.1656822 0.000 0.365         1
## Crebrf Crebrf -3.4146531 0.000 0.365         1
## Sqstm1 Sqstm1 -1.4838772 0.125 0.577         1

11. Summary

11.1 Emerging Model

cat("=============================================================\n")

## =============================================================

cat("  INTEGRATION SUMMARY\n")

##   INTEGRATION SUMMARY

cat("=============================================================\n\n")

## =============================================================

cat("Datasets:\n")

## Datasets:

cat("  Dural Atlas (healthy):", sum(integrated$dataset == "Dural_Atlas"), "cells\n")

##   Dural Atlas (healthy): 842 cells

cat("  miR-128a CNS (B-ALL):", sum(integrated$dataset == "miR128a_CNS"), "cells\n\n")

##   miR-128a CNS (B-ALL): 405 cells

cat("Key clusters (res 0.5):\n\n")

## Key clusters (res 0.5):

cat("  C0: Healthy Il17a factory (55% Il17a+), almost entirely atlas\n")

##   C0: Healthy Il17a factory (55% Il17a+), almost entirely atlas

cat("  C1: Healthy dural γδ17, poised (3.6% Il17a), 8 Vγ6Vδ4 mix in\n")

##   C1: Healthy dural γδ17, poised (3.6% Il17a), 8 Vγ6Vδ4 mix in

cat("  C2: B-ALL Vγ6Vδ4-enriched, Il17a SILENCED (3%), 104 Vγ6Vδ4\n")

##   C2: B-ALL Vγ6Vδ4-enriched, Il17a SILENCED (3%), 104 Vγ6Vδ4

cat("  C6: Il17a-active mixed (14.9%), 10 Vγ4 + atlas gdIFN, 0 Vγ6Vδ4\n\n")

##   C6: Il17a-active mixed (14.9%), 10 Vγ4 + atlas gdIFN, 0 Vγ6Vδ4

cat("B-ALL Vγ6Vδ4 (C2) vs Healthy dural γδ17 (C1):\n")

## B-ALL Vγ6Vδ4 (C2) vs Healthy dural γδ17 (C1):

cat("  RETAINED:  Il23r (enhanced), Rorc, Sox13, Blk\n")

##   RETAINED:  Il23r (enhanced), Rorc, Sox13, Blk

cat("  LOST:      Maf, tissue residency (S100a4/6, Tmem176a, Itgb7)\n")

##   LOST:      Maf, tissue residency (S100a4/6, Tmem176a, Itgb7)

cat("             MHC II (H2-Eb1, Cd74), Socs1, Bcl2a1b/d\n")

##              MHC II (H2-Eb1, Cd74), Socs1, Bcl2a1b/d

cat("             Il17a/f — ZERO transcription\n")

##              Il17a/f — ZERO transcription

cat("  GAINED:    Ctla4, Tgfbr2, Cd4, Cd28, Themis\n")

##   GAINED:    Ctla4, Tgfbr2, Cd4, Cd28, Themis

cat("             TCR hyperactivation (Lck, Itk, Prkcq)\n")

##              TCR hyperactivation (Lck, Itk, Prkcq)

cat("             Migration (Dock2, Myh9, Coro1b)\n\n")

##              Migration (Dock2, Myh9, Coro1b)

cat("Critical finding: Vγ6Vδ4 cells are EXCLUDED from the\n")

## Critical finding: Vγ6Vδ4 cells are EXCLUDED from the

cat("Il17a-active cluster (C6). The TME appears to specifically\n")

## Il17a-active cluster (C6). The TME appears to specifically

cat("silence IL-17 effector output in this population while\n")

## silence IL-17 effector output in this population while

cat("Vγ4 cells in the same tumour retain some Il17a capacity.\n")

## Vγ4 cells in the same tumour retain some Il17a capacity.

11.2 Summary Heatmap — All Populations

integrated$population <- with(integrated@meta.data, case_when(
  dataset == "Dural_Atlas" & gd_subtype == "gd17"            ~ "Atlas_gd17",
  dataset == "Dural_Atlas" & gd_subtype == "gdIFN"           ~ "Atlas_gdIFN",
  dataset == "miR128a_CNS" & cell_annotation == "Vγ6Vδ4"    ~ "miR128_Vg6Vd4",
  dataset == "miR128a_CNS" & cell_annotation == "Vγ4"       ~ "miR128_Vg4",
  dataset == "miR128a_CNS" & cell_annotation == "Other γδ"  ~ "miR128_Other_gd",
  TRUE ~ "Other"
))

pop_counts <- table(integrated$population)
valid_pops <- names(pop_counts[pop_counts >= 5 & names(pop_counts) != "Other"])
int_valid  <- subset(integrated, population %in% valid_pops)
DefaultAssay(int_valid) <- "originalexp"

summary_genes <- c("Cd3e", "Cd3d",
                    "Rorc", "Sox13", "Maf", "Blk", "Rora",
                    "Il17a", "Il17f", "Il23r", "Il1r1", "Ccr6",
                    "Tbx21", "Ifng", "Gzmb", "Nkg7", "Cd27",
                    "Ctla4", "Tgfbr2",
                    "Lck", "Itk", "Prkcq",
                    "S100a4", "S100a6", "Tmem176a", "Lgals1", "Itgb7",
                    "H2-Eb1", "Cd74",
                    "Il31ra", "Socs1", "Apoe", "Lef1")
summary_present <- summary_genes[summary_genes %in% rownames(int_valid)]

avg_pop     <- AverageExpression(int_valid, features = summary_present,
                                 group.by = "population", return.seurat = FALSE)
avg_pop_mat <- as.matrix(avg_pop[[1]])
avg_pop_sc  <- t(scale(t(avg_pop_mat)))

col_annot_pop <- data.frame(
  Dataset = ifelse(grepl("Atlas", colnames(avg_pop_sc)), "Atlas", "miR-128a"),
  row.names = colnames(avg_pop_sc)
)

gene_cat_pop <- data.frame(
  Category = case_when(
    summary_present %in% c("Cd3e", "Cd3d")                                    ~ "Identity",
    summary_present %in% c("Rorc", "Sox13", "Maf", "Blk", "Rora")            ~ "γδ17 TFs",
    summary_present %in% c("Il17a", "Il17f", "Il23r", "Il1r1", "Ccr6")       ~ "IL-17",
    summary_present %in% c("Tbx21", "Ifng", "Gzmb", "Nkg7", "Cd27")         ~ "γδIFN",
    summary_present %in% c("Ctla4", "Tgfbr2")                                 ~ "Checkpoints",
    summary_present %in% c("Lck", "Itk", "Prkcq")                             ~ "TCR Signal",
    summary_present %in% c("S100a4", "S100a6", "Tmem176a", "Lgals1", "Itgb7") ~ "Residency",
    summary_present %in% c("H2-Eb1", "Cd74")                                  ~ "MHC II",
    TRUE                                                                        ~ "Other"),
  row.names = summary_present)

annot_colors_pop <- list(
  Dataset  = c("Atlas" = "#2166AC", "miR-128a" = "#B2182B"),
  Category = c("Identity" = "grey50", "γδ17 TFs" = "#CC0000",
               "IL-17" = "#FF6600", "γδIFN" = "#0066CC",
               "Checkpoints" = "#800080", "TCR Signal" = "#FF69B4",
               "Residency" = "#228B22", "MHC II" = "#DAA520",
               "Other" = "grey70")
)

pheatmap(avg_pop_sc,
         annotation_col = col_annot_pop,
         annotation_row = gene_cat_pop,
         annotation_colors = annot_colors_pop,
         cluster_cols = TRUE, cluster_rows = FALSE,
         color = colorRampPalette(c("#2166AC", "white", "#B2182B"))(100),
         main = "γδ T Programme — Atlas vs B-ALL Populations",
         fontsize_row = 9, fontsize_col = 10)

12. Save

out_dir <- "~/Documents/Projects/CNS_border_atlas/results/integration"
dir.create(out_dir, showWarnings = FALSE, recursive = TRUE)

saveRDS(integrated, file.path(out_dir, "integrated_dural_mir128a_CNS.rds"))

if (exists("de_c2c1"))
  write_csv(de_c2c1, file.path(out_dir, "DE_C2_BALL_vs_C1_healthy.csv"))
if (exists("de_c6c1"))
  write_csv(de_c6c1, file.path(out_dir, "DE_C6_vs_C1_healthy.csv"))
if (exists("de_c6c2"))
  write_csv(de_c6c2, file.path(out_dir, "DE_C6_vs_C2_BALL.csv"))
if (exists("de_vg4c6_vg6c2"))
  write_csv(de_vg4c6_vg6c2, file.path(out_dir, "DE_Vg4_C6_vs_Vg6_C2.csv"))
if (exists("de_vg6_split"))
  write_csv(de_vg6_split, file.path(out_dir, "DE_Vg6_C1_vs_C2_within.csv"))
if (exists("mir128_markers"))
  write_csv(mir128_markers, file.path(out_dir, "markers_within_mir128a_CNS.csv"))

if (exists("gsea_h"))
  write_csv(as.data.frame(gsea_h), file.path(out_dir, "GSEA_Hallmark_C2_vs_C1.csv"))
if (exists("gsea_kegg"))
  write_csv(as.data.frame(gsea_kegg), file.path(out_dir, "GSEA_KEGG_C2_vs_C1.csv"))
if (exists("go_up"))
  write_csv(as.data.frame(go_up), file.path(out_dir, "GO_BP_C2_BALL_up.csv"))
if (exists("go_down"))
  write_csv(as.data.frame(go_down), file.path(out_dir, "GO_BP_C1_healthy_up.csv"))

write_csv(integrated@meta.data %>% rownames_to_column("cell_barcode"),
          file.path(out_dir, "integrated_metadata.csv"))

cat("\nSaved to:", out_dir, "\n")

## 
## Saved to: ~/Documents/Projects/CNS_border_atlas/results/integration

sessionInfo()

## R version 4.4.2 (2024-10-31)
## Platform: aarch64-apple-darwin20
## Running under: macOS 26.2
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: Europe/London
## tzcode source: internal
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] future_1.69.0          msigdbr_25.1.1         fgsea_1.32.4          
##  [4] org.Mm.eg.db_3.20.0    AnnotationDbi_1.68.0   IRanges_2.40.1        
##  [7] S4Vectors_0.44.0       Biobase_2.66.0         BiocGenerics_0.52.0   
## [10] clusterProfiler_4.14.6 RColorBrewer_1.1-3     pheatmap_1.0.13       
## [13] ggrepel_0.9.6          qs2_0.1.7              Seurat_5.4.0          
## [16] SeuratObject_5.3.0     sp_2.2-1               viridis_0.6.5         
## [19] viridisLite_0.4.3      patchwork_1.3.2        lubridate_1.9.5       
## [22] forcats_1.0.1          stringr_1.6.0          dplyr_1.2.0           
## [25] purrr_1.2.1            readr_2.1.6            tidyr_1.3.2           
## [28] tibble_3.3.1           ggplot2_4.0.2          tidyverse_2.0.0       
## 
## loaded via a namespace (and not attached):
##   [1] RcppAnnoy_0.0.23        splines_4.4.2           later_1.4.6            
##   [4] ggplotify_0.1.3         R.oo_1.27.1             polyclip_1.10-7        
##   [7] fastDummies_1.7.5       lifecycle_1.0.5         vroom_1.7.0            
##  [10] globals_0.19.0          lattice_0.22-9          MASS_7.3-65            
##  [13] magrittr_2.0.4          limma_3.62.2            plotly_4.12.0          
##  [16] sass_0.4.10             rmarkdown_2.30          jquerylib_0.1.4        
##  [19] yaml_2.3.12             ggtangle_0.1.1          httpuv_1.6.16          
##  [22] otel_0.2.0              sctransform_0.4.3       spam_2.11-3            
##  [25] spatstat.sparse_3.1-0   reticulate_1.45.0       cowplot_1.2.0          
##  [28] pbapply_1.7-4           DBI_1.2.3               abind_1.4-8            
##  [31] zlibbioc_1.52.0         Rtsne_0.17              presto_1.0.0           
##  [34] R.utils_2.13.0          yulab.utils_0.2.4       rappdirs_0.3.4         
##  [37] GenomeInfoDbData_1.2.13 enrichplot_1.26.6       irlba_2.3.7            
##  [40] listenv_0.10.0          spatstat.utils_3.2-1    tidytree_0.4.7         
##  [43] goftest_1.2-3           RSpectra_0.16-2         spatstat.random_3.4-4  
##  [46] fitdistrplus_1.2-6      parallelly_1.46.1       codetools_0.2-20       
##  [49] DOSE_4.0.1              tidyselect_1.2.1        aplot_0.2.9            
##  [52] UCSC.utils_1.2.0        farver_2.1.2            matrixStats_1.5.0      
##  [55] spatstat.explore_3.7-0  jsonlite_2.0.0          progressr_0.18.0       
##  [58] ggridges_0.5.7          survival_3.8-6          tools_4.4.2            
##  [61] treeio_1.30.0           ica_1.0-3               Rcpp_1.1.1             
##  [64] glue_1.8.0              gridExtra_2.3           xfun_0.56              
##  [67] qvalue_2.38.0           GenomeInfoDb_1.42.3     withr_3.0.2            
##  [70] fastmap_1.2.0           digest_0.6.39           gridGraphics_0.5-1     
##  [73] timechange_0.4.0        R6_2.6.1                mime_0.13              
##  [76] colorspace_2.1-2        scattermore_1.2         GO.db_3.20.0           
##  [79] tensor_1.5.1            dichromat_2.0-0.1       spatstat.data_3.1-9    
##  [82] RSQLite_2.4.6           R.methodsS3_1.8.2       utf8_1.2.6             
##  [85] generics_0.1.4          data.table_1.18.2.1     httr_1.4.8             
##  [88] htmlwidgets_1.6.4       uwot_0.2.4              pkgconfig_2.0.3        
##  [91] gtable_0.3.6            blob_1.3.0              lmtest_0.9-40          
##  [94] S7_0.2.1                XVector_0.46.0          htmltools_0.5.9        
##  [97] dotCall64_1.2           scales_1.4.0            png_0.1-8              
## [100] spatstat.univar_3.1-6   ggfun_0.2.0             knitr_1.51             
## [103] rstudioapi_0.18.0       tzdb_0.5.0              reshape2_1.4.5         
## [106] curl_7.0.0              nlme_3.1-168            cachem_1.1.0           
## [109] zoo_1.8-15              KernSmooth_2.23-26      vipor_0.4.7            
## [112] parallel_4.4.2          miniUI_0.1.2            ggrastr_1.0.2          
## [115] pillar_1.11.1           grid_4.4.2              vctrs_0.7.1            
## [118] RANN_2.6.2              promises_1.5.0          stringfish_0.18.0      
## [121] xtable_1.8-4            cluster_2.1.8.2         beeswarm_0.4.0         
## [124] evaluate_1.0.5          cli_3.6.5               compiler_4.4.2         
## [127] rlang_1.1.7             crayon_1.5.3            future.apply_1.20.1    
## [130] labeling_0.4.3          ggbeeswarm_0.7.3        fs_1.6.6               
## [133] plyr_1.8.9              stringi_1.8.7           deldir_2.0-4           
## [136] BiocParallel_1.40.2     babelgene_22.9          assertthat_0.2.1       
## [139] Biostrings_2.74.1       lazyeval_0.2.2          spatstat.geom_3.7-0    
## [142] GOSemSim_2.32.0         Matrix_1.7-4            RcppHNSW_0.6.0         
## [145] hms_1.1.4               bit64_4.6.0-1           conflicted_1.2.0       
## [148] statmod_1.5.1           KEGGREST_1.46.0         shiny_1.12.1           
## [151] ROCR_1.0-12             igraph_2.2.2            memoise_2.0.1          
## [154] RcppParallel_5.1.11-1   bslib_0.10.0            ggtree_3.14.0          
## [157] fastmatch_1.1-8         bit_4.6.0               gson_0.1.0             
## [160] ape_5.8-1

Integration — miR-128a CNS γδ T Cells × Dural Meningeal Atlas

Cross-dataset validation: healthy γδ17, silenced Vγ6Vδ4, and the Il17a-active cluster

2026-02-24