1. Setup & Data Loading

library(tidyverse)
library(patchwork)
library(viridis)
library(scran)
library(SingleCellExperiment)
library(zellkonverter)
library(scater)
library(Seurat)
library(qs2)
library(dplyr)
library(readr)
library(ggrepel)
library(pheatmap)
library(RColorBrewer)
h5ad_path <- "~/Documents/Projects/CNS_border_atlas/data/CNSbordercellatlas_Dec22_RAWCOUNTS.h5ad"
sce <- readH5AD(h5ad_path)

seurat_obj <- as.Seurat(sce, counts = "X", data = NULL)
seurat_obj[["percent.mt"]]   <- PercentageFeatureSet(seurat_obj, pattern = "^mt-|^MT-")
seurat_obj[["percent.ribo"]] <- PercentageFeatureSet(seurat_obj, pattern = "^Rps|^Rpl|^RPS|^RPL")

cat("Full atlas:", format(ncol(seurat_obj), big.mark = ","), "cells,",
    format(nrow(seurat_obj), big.mark = ","), "genes\n")
## Full atlas: 138,211 cells, 25,836 genes

1.1 Fix Metadata — Handle NAs and Factors

# Convert factors → character
meta <- seurat_obj@meta.data
for (col in colnames(meta)) {
  if (is.factor(meta[[col]])) {
    seurat_obj@meta.data[[col]] <- as.character(seurat_obj@meta.data[[col]])
  }
}

# Diagnose original_annotation
n_na        <- sum(is.na(seurat_obj$original_annotation))
n_annotated <- sum(!is.na(seurat_obj$original_annotation))
cat("original_annotation:", n_annotated, "annotated,", n_na, "NA\n")
## original_annotation: 5397 annotated, 132814 NA
# Replace NAs
seurat_obj$original_annotation[is.na(seurat_obj$original_annotation)] <- "unannotated"

cat("\nAnnotation distribution after fix:\n")
## 
## Annotation distribution after fix:
print(sort(table(seurat_obj$original_annotation), decreasing = TRUE))
## 
##  unannotated    Microglia      unknown          ILC         gdTc        CD4Tc 
##       132814         1108          602          493          376          346 
##      T cells      B cells        CD8Tc         mast     myeloid1         mDC2 
##          346          253          237          222          217          176 
##  Neutrophils          BAM     NK cells           NK       BcProg         mDC1 
##          145          125          115          105           85           80 
##           Bc     granulo2     granulo1          pDC     myeloid2         Treg 
##           71           58           56           55           48           29 
##     myeloid3          cDC          DAM         cDC1 granuloProli  T/NKT cells 
##           19           16            6            4            2            2
# Fill other NA columns
for (col in c("site", "study", "age_group", "age_info", "library", "gender")) {
  n_na_col <- sum(is.na(seurat_obj@meta.data[[col]]))
  if (n_na_col > 0) {
    cat(col, ":", n_na_col, "NAs — filling with 'unknown'\n")
    seurat_obj@meta.data[[col]][is.na(seurat_obj@meta.data[[col]])] <- "unknown"
  }
}

n_gdtc <- sum(seurat_obj$original_annotation == "gdTc", na.rm = TRUE)
cat("\ngdTc cells:", n_gdtc, "\n")
## 
## gdTc cells: 376
stopifnot(n_gdtc > 100 & n_gdtc < 1000)

1.2 Check Available Genes

key_markers <- c("Trdc", "Cd3e", "Cd3d", "Cd3g", "Rorc", "Sox13",
                 "Maf", "Blk", "Il17a", "Il17f", "Tbx21", "Ifng",
                 "Eomes", "Gzmb", "Ccr6", "Il23r", "Il1r1",
                 "Cd27", "Cd44", "Id2", "Ncr1", "Scart2",
                 "Rora", "Zbtb16", "Gata3", "Nkg7", "Prf1")

found   <- key_markers[key_markers %in% rownames(seurat_obj)]
missing <- setdiff(key_markers, found)
cat("Found:", length(found), "—", paste(found, collapse = ", "), "\n")
## Found: 25 — Cd3e, Cd3d, Cd3g, Rorc, Sox13, Maf, Blk, Il17a, Il17f, Tbx21, Ifng, Eomes, Gzmb, Ccr6, Il23r, Il1r1, Cd27, Cd44, Id2, Ncr1, Rora, Zbtb16, Gata3, Nkg7, Prf1
cat("Missing:", length(missing), "—", paste(missing, collapse = ", "), "\n")
## Missing: 2 — Trdc, Scart2
tcr_hits <- grep("Tr[abgd]|Tcr|TCR", rownames(seurat_obj), value = TRUE)
cat("\nTCR-related genes:", length(tcr_hits), "\n")
## 
## TCR-related genes: 43
if (length(tcr_hits) > 0) print(tcr_hits)
##  [1] "Tram1"    "Tram2"    "Trak2"    "Traf3ip1" "Traf5"    "Traf3ip3"
##  [7] "Trdmt1"   "Traf2"    "Traf1"    "Traf6"    "Trap1a"   "Trappc2" 
## [13] "Tram1l1"  "Trabd2b"  "Trappc3"  "Trafd1"   "Tra2a"    "Trappc6a"
## [19] "Trdn"     "Trappc3l" "Traf3ip2" "Trappc10" "Trappc5"  "Trappc11"
## [25] "Tradd"    "Trappc2l" "Trappc4"  "Traip"    "Trank1"   "Trak1"   
## [31] "Trappc1"  "Traf4"    "Trappc13" "Trappc12" "Trappc6b" "Traf3"   
## [37] "Trappc9"  "Trabd"    "Trap1"    "Tra2b"    "Trat1"    "Traf7"   
## [43] "Trappc8"

1.3 Preprocessing

seurat_obj <- NormalizeData(seurat_obj)
seurat_obj <- FindVariableFeatures(seurat_obj, nfeatures = 3000)
seurat_obj <- ScaleData(seurat_obj)
seurat_obj <- RunPCA(seurat_obj, npcs = 50, verbose = FALSE)
seurat_obj <- FindNeighbors(seurat_obj, dims = 1:30)
seurat_obj <- FindClusters(seurat_obj, resolution = 0.5)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 138211
## Number of edges: 5214586
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9642
## Number of communities: 41
## Elapsed time: 26 seconds
seurat_obj <- RunUMAP(seurat_obj, dims = 1:30)

2. Survey γδ Markers Across the Atlas

survey_genes <- c("Trdc", "Cd3e", "Cd3d", "Rorc", "Sox13", "Maf", "Blk",
                  "Il17a", "Il17f", "Il23r", "Ccr6",
                  "Tbx21", "Ifng", "Eomes", "Cd27",
                  "Id2", "Gata3", "Rora", "Ncr1")
survey_present <- survey_genes[survey_genes %in% rownames(seurat_obj)]

DotPlot(seurat_obj, features = survey_present,
        group.by = "seurat_clusters",
        cols = c("lightgrey", "darkred"), dot.scale = 5) +
  RotatedAxis() +
  ggtitle("γδ T Markers Across All Clusters (full atlas)") +
  theme(axis.text.x = element_text(face = "italic", size = 9))

seurat_annotated <- subset(seurat_obj, original_annotation != "unannotated")
cat("Annotated cells for DotPlot:", ncol(seurat_annotated), "\n")
## Annotated cells for DotPlot: 5397
DotPlot(seurat_annotated, features = survey_present,
        group.by = "original_annotation",
        cols = c("lightgrey", "darkred"), dot.scale = 5) +
  RotatedAxis() +
  ggtitle("γδ T Markers Across Annotated Cell Types") +
  theme(axis.text.x = element_text(face = "italic", size = 9),
        axis.text.y = element_text(size = 8))

has_trdc <- "Trdc" %in% rownames(seurat_obj)
if (has_trdc) {
  p1 <- FeaturePlot(seurat_obj, features = "Trdc",
                    order = TRUE, raster = TRUE,
                    cols = c("lightgrey", "darkred")) +
    ggtitle("Trdc — Full Atlas")
} else {
  p1 <- ggplot() + annotate("text", x = 0.5, y = 0.5, label = "Trdc not in dataset") +
    theme_void()
}

seurat_obj$is_gdTc <- seurat_obj$original_annotation == "gdTc"
p2 <- DimPlot(seurat_obj, group.by = "is_gdTc",
              cols = c("TRUE" = "red", "FALSE" = "grey90"),
              order = TRUE, raster = TRUE) +
  ggtitle(paste0("Annotated gdTc (n=", sum(seurat_obj$is_gdTc), ")")) +
  NoLegend()

p1 + p2

gd17_tf <- c("Rorc", "Sox13")
gd17_tf_present <- gd17_tf[gd17_tf %in% rownames(seurat_obj)]
if (length(gd17_tf_present) == 2) {
  p1 <- FeaturePlot(seurat_obj, features = "Rorc",
                    order = TRUE, raster = TRUE,
                    cols = c("lightgrey", "#CC0000")) +
    ggtitle("Rorc (RORγt)")
  p2 <- FeaturePlot(seurat_obj, features = "Sox13",
                    order = TRUE, raster = TRUE,
                    cols = c("lightgrey", "#CC0000")) +
    ggtitle("Sox13")
  p1 + p2
}

3. Isolating γδ T Cells

No TCR V genes in this dataset (filtered during atlas construction). We rely on Trdc, CD3, and γδ17 TF expression for isolation. ~132k cells are unannotated — our expression-based strategies will search these too.

# --- Strategy 1: Annotation ---
cells_annot <- colnames(seurat_obj)[seurat_obj$original_annotation == "gdTc"]
cat("Strategy 1 — Annotated gdTc:", length(cells_annot), "\n")
## Strategy 1 — Annotated gdTc: 376
# --- Strategy 2: Trdc+ / CD3+ (stringent) ---
if ("Trdc" %in% rownames(seurat_obj)) {
  trdc <- GetAssayData(seurat_obj, layer = "data")["Trdc", ]
} else {
  trdc <- setNames(rep(0, ncol(seurat_obj)), colnames(seurat_obj))
}
cd3_vars <- intersect(c("Cd3d", "Cd3e", "Cd3g"), rownames(seurat_obj))
cd3_mat  <- GetAssayData(seurat_obj, layer = "data")[cd3_vars, , drop = FALSE]
cd3_pos  <- colSums(cd3_mat > 0) > 0
cells_stringent <- names(trdc)[trdc > 0 & cd3_pos]
cat("Strategy 2 — Trdc+/CD3+:", length(cells_stringent), "\n")
## Strategy 2 — Trdc+/CD3+: 0
# --- Strategy 3: γδ17 programme (sensitive) ---
prog_genes   <- c("Rorc", "Sox13", "Maf", "Blk")
prog_present <- prog_genes[prog_genes %in% rownames(seurat_obj)]

if (length(prog_present) >= 2 & "Cd3e" %in% rownames(seurat_obj)) {
  prog_mat <- GetAssayData(seurat_obj, layer = "data")[prog_present, , drop = FALSE]
  cd3e     <- GetAssayData(seurat_obj, layer = "data")["Cd3e", ]
  
  prog_count <- colSums(prog_mat > 0)
  gd17_like  <- prog_count >= 2 & cd3e > 0
  
  # Exclude ILC-like (high Id2, no CD3d)
  if ("Id2" %in% rownames(seurat_obj)) {
    id2  <- GetAssayData(seurat_obj, layer = "data")["Id2", ]
    cd3d <- if ("Cd3d" %in% rownames(seurat_obj)) {
      GetAssayData(seurat_obj, layer = "data")["Cd3d", ]
    } else { rep(0, ncol(seurat_obj)) }
    ilc_like  <- id2 > quantile(id2[id2 > 0], 0.75, na.rm = TRUE) & cd3d == 0
    gd17_like <- gd17_like & !ilc_like
  }
  
  cells_sensitive <- colnames(seurat_obj)[gd17_like]
  cells_sensitive <- unique(c(cells_sensitive, names(trdc)[trdc > 0]))
} else {
  cells_sensitive <- names(trdc)[trdc > 0]
}
cat("Strategy 3 — Sensitive:", length(cells_sensitive), "\n")
## Strategy 3 — Sensitive: 852
cells_union <- unique(c(cells_annot, cells_stringent, cells_sensitive))
cat("\n=== UNION:", length(cells_union), "cells ===\n")
## 
## === UNION: 1019 cells ===
cat("\nBreakdown:\n")
## 
## Breakdown:
cat("  Annotated gdTc:", length(cells_annot), "\n")
##   Annotated gdTc: 376
cat("  Newly recovered (not in annotation):", sum(!(cells_union %in% cells_annot)), "\n")
##   Newly recovered (not in annotation): 643
cat("  From Trdc+/CD3+:", length(cells_stringent), "\n")
##   From Trdc+/CD3+: 0
cat("  From γδ17 programme:", sum(cells_sensitive %in% cells_union), "\n")
##   From γδ17 programme: 852
recovered_annots <- seurat_obj$original_annotation[cells_union]
cat("\nOriginal labels of recovered γδ T cells:\n")
## 
## Original labels of recovered γδ T cells:
print(sort(table(recovered_annots), decreasing = TRUE))
## recovered_annots
## unannotated        gdTc     T cells     unknown       CD4Tc         BAM 
##         593         376          35           6           3           1 
##      BcProg       CD8Tc    myeloid1    myeloid3    NK cells 
##           1           1           1           1           1
seurat_obj$gdt_source <- "Other"
seurat_obj$gdt_source[colnames(seurat_obj) %in% cells_annot] <- "Annotated gdTc"
seurat_obj$gdt_source[colnames(seurat_obj) %in%
  setdiff(cells_stringent, cells_annot)] <- "Trdc+CD3+ (new)"
seurat_obj$gdt_source[colnames(seurat_obj) %in%
  setdiff(cells_sensitive, c(cells_annot, cells_stringent))] <- "gd17 prog (new)"

p1 <- DimPlot(seurat_obj,
              cells.highlight = cells_union,
              cols.highlight = "red", cols = "grey90",
              order = TRUE, raster = TRUE, sizes.highlight = 0.5) +
  ggtitle(paste0("Recovered γδ T Cells (n=", length(cells_union), ")")) +
  NoLegend()

source_cols <- c("Annotated gdTc" = "#E41A1C",
                 "Trdc+CD3+ (new)" = "#377EB8",
                 "gd17 prog (new)" = "#FF7F00",
                 "Other"           = "grey90")
p2 <- DimPlot(seurat_obj, group.by = "gdt_source",
              cols = source_cols,
              order = TRUE, raster = TRUE) +
  ggtitle("By Recovery Strategy")

p1 + p2

4. Create γδ T Subset

gdt <- subset(seurat_obj, cells = cells_union)
gdt$recovery_source <- "Sensitive"
gdt$recovery_source[colnames(gdt) %in% cells_annot] <- "Annotated"
gdt$recovery_source[colnames(gdt) %in% cells_stringent &
                      !(colnames(gdt) %in% cells_annot)] <- "Stringent"

cat("γδ T cell subset:", ncol(gdt), "cells\n")
## γδ T cell subset: 1019 cells
cat("Recovery source:\n")
## Recovery source:
print(table(gdt$recovery_source))
## 
## Annotated Sensitive 
##       376       643
p1 <- VlnPlot(gdt, features = "nFeature_originalexp",
              group.by = "recovery_source", pt.size = 0) +
  ggtitle("Genes Detected") + NoLegend()
p2 <- VlnPlot(gdt, features = "nCount_originalexp",
              group.by = "recovery_source", pt.size = 0) +
  ggtitle("UMI Counts") + NoLegend()
p3 <- VlnPlot(gdt, features = "percent.mt",
              group.by = "recovery_source", pt.size = 0) +
  ggtitle("% Mitochondrial") + NoLegend()
p1 + p2 + p3

p1 <- ggplot(gdt@meta.data, aes(x = reorder(site, site, length), fill = site)) +
  geom_bar() + coord_flip() + theme_minimal() +
  labs(title = "By Site", x = NULL, y = "Cells") + theme(legend.position = "none")
p2 <- ggplot(gdt@meta.data, aes(x = study, fill = study)) +
  geom_bar() + coord_flip() + theme_minimal() +
  labs(title = "By Study", x = NULL, y = "Cells") + theme(legend.position = "none")
p3 <- ggplot(gdt@meta.data, aes(x = age_group, fill = age_group)) +
  geom_bar() + theme_minimal() +
  labs(title = "By Age", x = NULL, y = "Cells") + theme(legend.position = "none")
p1 + p2 + p3

5. Re-cluster γδ T Cells

gdt <- NormalizeData(gdt)
gdt <- FindVariableFeatures(gdt, nfeatures = 2000)
gdt <- ScaleData(gdt, features = rownames(gdt))
gdt <- RunPCA(gdt, npcs = 30, verbose = FALSE)
ElbowPlot(gdt, ndims = 30) + ggtitle("Elbow Plot — γδ T Cells")

n_pcs <- 15
gdt <- FindNeighbors(gdt, dims = 1:n_pcs)

for (res in c(0.3, 0.5, 0.8, 1.0)) {
  gdt <- FindClusters(gdt, resolution = res, verbose = FALSE)
}
gdt <- RunUMAP(gdt, dims = 1:n_pcs)
res_cols <- grep("_snn_res\\.", colnames(gdt@meta.data), value = TRUE)
res05    <- grep("0\\.5$", res_cols, value = TRUE)[1]
if (!is.na(res05)) {
  Idents(gdt) <- res05
  gdt$seurat_clusters <- gdt@meta.data[[res05]]
} else {
  Idents(gdt) <- res_cols[1]
  gdt$seurat_clusters <- gdt@meta.data[[res_cols[1]]]
}
cat("Clusters at chosen resolution:", length(unique(Idents(gdt))), "\n")
## Clusters at chosen resolution: 8
plots <- lapply(res_cols, function(rc) {
  DimPlot(gdt, group.by = rc, label = TRUE, label.size = 4) + ggtitle(rc)
})
if (length(plots) >= 4) {
  (plots[[1]] + plots[[2]]) / (plots[[3]] + plots[[4]])
} else {
  wrap_plots(plots, ncol = 2)
}

p1 <- DimPlot(gdt, group.by = "seurat_clusters",
              label = TRUE, label.size = 5) + ggtitle("Clusters")
p2 <- DimPlot(gdt, group.by = "site") + ggtitle("Site")
p3 <- DimPlot(gdt, group.by = "age_group") + ggtitle("Age Group")
p4 <- DimPlot(gdt, group.by = "original_annotation") + ggtitle("Original Annotation")
(p1 + p2) / (p3 + p4)

p1 <- DimPlot(gdt, group.by = "recovery_source") + ggtitle("Recovery Source")
p2 <- DimPlot(gdt, group.by = "library") + ggtitle("Library Type")
p1 + p2

6. Cluster Identity

6.1 Find Markers

Idents(gdt) <- "seurat_clusters"
gdt_markers <- FindAllMarkers(gdt, only.pos = TRUE,
                              min.pct = 0.2, logfc.threshold = 0.25,
                              test.use = "wilcox")

top10 <- gdt_markers %>% group_by(cluster) %>%
  slice_max(avg_log2FC, n = 10) %>% ungroup()
print(top10 %>% select(cluster, gene, avg_log2FC, pct.1, pct.2, p_val_adj), n = 80)
## # A tibble: 80 × 6
##    cluster gene      avg_log2FC pct.1 pct.2 p_val_adj
##    <fct>   <chr>          <dbl> <dbl> <dbl>     <dbl>
##  1 0       Muc1            7.81 0.411 0.003 2.19e- 67
##  2 0       Calca           7.61 0.328 0.011 5.46e- 47
##  3 0       Maff            5.75 0.65  0.033 3.06e-104
##  4 0       Cystm1          5.70 0.22  0.01  5.95e- 28
##  5 0       Hilpda          5.55 0.825 0.166 9.99e-113
##  6 0       Ifrd1           5.41 0.994 0.224 2.11e-157
##  7 0       Ell2            4.98 0.567 0.041 5.49e- 82
##  8 0       Mvk             4.89 0.462 0.038 6.01e- 61
##  9 0       Ptgs2           4.87 0.318 0.016 1.94e- 42
## 10 0       Gem             4.78 0.338 0.05  7.12e- 33
## 11 1       Hba-a1          3.94 0.572 0.098 5.44e- 55
## 12 1       Hbb-bs          3.71 0.959 0.178 4.59e-117
## 13 1       Hbb-bt          3.46 0.557 0.095 1.81e- 55
## 14 1       Hba-a2          3.10 0.683 0.094 2.68e- 78
## 15 1       Mrpl53          2.86 0.321 0.052 3.11e- 27
## 16 1       Cpne1           2.60 0.207 0.035 1.09e- 14
## 17 1       Rnasek          2.56 0.812 0.15  8.94e- 82
## 18 1       Fam89b          2.45 0.358 0.079 7.69e- 25
## 19 1       Eif2s3y         2.17 0.351 0.104 3.39e- 17
## 20 1       Zdhhc8          2.10 0.207 0.045 3.41e- 11
## 21 2       mt-Atp8         5.18 0.991 0.227 8.94e- 92
## 22 2       Rpl9-ps6        5.16 1     0.208 2.95e- 97
## 23 2       Lars2           4.72 0.914 0.305 8.03e- 65
## 24 2       Gm10076         4.38 1     0.489 2.34e- 69
## 25 2       Rpl10-ps3       4.06 0.948 0.383 2.22e- 64
## 26 2       AY036118        4.05 0.922 0.499 4.27e- 48
## 27 2       Rln3            4.01 0.25  0.082 1.18e-  5
## 28 2       mt-Nd4l         3.91 0.966 0.372 7.76e- 68
## 29 2       Gm42418         3.42 1     0.952 2.68e- 54
## 30 2       mt-Nd6          3.09 0.388 0.112 1.01e- 14
## 31 3       Srp54b          3.99 0.266 0.019 2.16e- 25
## 32 3       Tsix            3.44 0.245 0.022 3.69e- 20
## 33 3       Tonsl           3.06 0.383 0.048 2.65e- 26
## 34 3       Sept2           2.95 0.34  0.046 4.30e- 21
## 35 3       Xist            2.22 0.755 0.378 1.19e- 19
## 36 3       Sfxn1           1.58 0.426 0.173 1.42e-  5
## 37 3       Mtif2           1.51 0.277 0.122 5.78e-  1
## 38 3       Clec2i          1.48 0.309 0.138 8.36e-  2
## 39 3       Pdk1            1.43 0.223 0.068 9.11e-  3
## 40 3       Usp50           1.42 0.319 0.139 7.19e-  2
## 41 4       Hspa1b          2.98 0.53  0.11  2.02e- 19
## 42 4       Ppp1r15a        2.54 0.758 0.315 1.03e- 14
## 43 4       Tnf             2.50 0.667 0.234 2.22e- 13
## 44 4       Hspa1a          2.39 0.379 0.056 9.51e- 18
## 45 4       Hbegf           2.01 0.212 0.097 1   e+  0
## 46 4       Zfp36           1.92 0.621 0.284 2.33e-  6
## 47 4       Fos             1.87 0.833 0.4   2.06e- 12
## 48 4       Dusp1           1.82 0.682 0.441 9.94e-  4
## 49 4       Nfkbid          1.82 0.621 0.269 3.13e-  8
## 50 4       Cd69            1.81 0.697 0.325 3.79e-  9
## 51 5       Cd8b1           5.59 0.273 0.012 5.95e- 31
## 52 5       Ubash3a         4.78 0.409 0.019 2.38e- 46
## 53 5       Cd6             4.67 0.682 0.049 2.14e- 65
## 54 5       Cd4             4.57 0.561 0.029 6.47e- 62
## 55 5       Cd8a            4.57 0.288 0.015 1.62e- 29
## 56 5       Trat1           4.28 0.288 0.016 4.13e- 28
## 57 5       Cd28            4.15 0.758 0.058 3.79e- 71
## 58 5       Gpr174          4.11 0.258 0.019 2.60e- 20
## 59 5       Ccl5            3.90 0.439 0.097 7.24e- 14
## 60 5       Themis          3.76 0.439 0.035 2.36e- 36
## 61 6       Tmem119         6.95 0.231 0.003 4.53e- 37
## 62 6       Clec4a3         6.86 0.231 0.003 4.08e- 37
## 63 6       Hpgd            6.85 0.323 0.004 7.06e- 54
## 64 6       F13a1           6.52 0.277 0.005 7.01e- 42
## 65 6       C1qc            6.20 0.569 0.047 4.62e- 50
## 66 6       Fcgr1           6.20 0.292 0.004 2.99e- 47
## 67 6       Igsf6           6.05 0.231 0.003 6.02e- 37
## 68 6       C1qa            6.01 0.723 0.07  5.94e- 60
## 69 6       P2ry6           5.97 0.215 0.002 2.56e- 36
## 70 6       Ccl9            5.94 0.231 0.007 7.35e- 29
## 71 7       Tmem108         9.53 0.296 0     6.72e- 62
## 72 7       Klhl14          9.45 0.222 0     1.12e- 45
## 73 7       Cecr2           9.12 0.444 0.002 2.64e- 80
## 74 7       Gm37065         8.59 0.222 0     1.12e- 45
## 75 7       Spib            8.55 0.852 0.006 1.65e-146
## 76 7       Fcrla           8.55 0.926 0.007 1.45e-157
## 77 7       Vpreb3          8.44 0.741 0.013 2.40e- 96
## 78 7       Pax5            8.42 0.667 0.003 1.09e-122
## 79 7       Chst3           8.36 0.481 0.002 2.63e- 88
## 80 7       Ms4a1           8.25 0.704 0.009 4.88e-102
top5_genes <- gdt_markers %>% group_by(cluster) %>%
  slice_max(avg_log2FC, n = 5) %>% pull(gene) %>% unique()
DoHeatmap(gdt, features = top5_genes, group.by = "seurat_clusters",
          size = 3, angle = 90) +
  scale_fill_viridis(option = "inferno") +
  ggtitle("Top DE Genes per Cluster")

6.2 Canonical γδ Marker Panel

canonical <- c(
  "Cd3e", "Cd3d", "Trdc",
  "Rorc", "Sox13", "Maf", "Blk", "Scart2", "Rora", "Zbtb16",
  "Il17a", "Il17f", "Il22", "Il23r", "Il1r1", "Ccr6",
  "Tbx21", "Eomes", "Ifng", "Gzmb", "Gzma", "Prf1", "Nkg7", "Cd27",
  "Cd44", "Cd69", "Itgae", "Cxcr6",
  "Mki67", "Top2a",
  "Ncr1", "Id2", "Gata3", "Klrb1c"
)
canonical_present <- canonical[canonical %in% rownames(gdt)]

DotPlot(gdt, features = canonical_present, group.by = "seurat_clusters",
        cols = c("lightgrey", "darkred"), dot.scale = 6) +
  RotatedAxis() +
  ggtitle("Canonical Markers — γδ T Clusters") +
  theme(axis.text.x = element_text(face = "italic", size = 9))

key <- c("Trdc", "Cd3e", "Rorc", "Sox13", "Maf", "Blk",
         "Il17a", "Il23r", "Ccr6", "Tbx21", "Ifng", "Gzmb",
         "Cd27", "Cd44", "Id2", "Ncr1")
key_present <- key[key %in% rownames(gdt)]

FeaturePlot(gdt, features = key_present, order = TRUE, ncol = 4,
            cols = c("lightgrey", "darkred")) &
  theme(plot.title = element_text(face = "bold.italic", size = 10))

6.3 Contamination Check & Removal

ilc_g <- c("Id2", "Gata3", "Rora", "Il7r", "Kit", "Ncr1")
nk_g  <- c("Ncr1", "Klrb1c", "Klrk1", "Klrd1", "Nkg7", "Prf1")

if (length(ilc_g[ilc_g %in% rownames(gdt)]) >= 3)
  gdt <- AddModuleScore(gdt, features = list(ilc_g[ilc_g %in% rownames(gdt)]),
                        name = "ILC_score")
if (length(nk_g[nk_g %in% rownames(gdt)]) >= 3)
  gdt <- AddModuleScore(gdt, features = list(nk_g[nk_g %in% rownames(gdt)]),
                        name = "NK_score")

sc <- intersect(c("ILC_score1", "NK_score1"), colnames(gdt@meta.data))
if (length(sc) > 0)
  VlnPlot(gdt, features = sc, group.by = "seurat_clusters",
          pt.size = 0, ncol = 2)

αβ T cell contamination check

Cluster 5 from v3 showed Cd4, Cd8a, Cd8b1, Cd28, Ubash3a, Themis — classic αβ T markers. We now systematically score every cluster for αβ contamination and remove offending clusters.

# αβ T cell markers (should be absent/low in real γδ T cells)
ab_markers <- c("Cd4", "Cd8a", "Cd8b1", "Cd28", "Themis", "Ubash3a",
                "Lef1", "Tcf7", "Rag1", "Rag2")
ab_present <- ab_markers[ab_markers %in% rownames(gdt)]

cat("αβ T markers available:", paste(ab_present, collapse = ", "), "\n\n")
## αβ T markers available: Cd4, Cd8a, Cd8b1, Cd28, Themis, Ubash3a, Lef1, Tcf7, Rag1, Rag2
# Score each cluster
if (length(ab_present) >= 3) {
  gdt <- AddModuleScore(gdt, features = list(ab_present), name = "abT_score")
  
  # Per-cluster summary
  ab_summary <- FetchData(gdt, vars = c("abT_score1", "seurat_clusters")) %>%
    group_by(seurat_clusters) %>%
    summarise(
      mean_abT_score = mean(abT_score1),
      median_abT_score = median(abT_score1),
      n = n(),
      .groups = "drop"
    ) %>%
    arrange(desc(mean_abT_score))
  
  cat("αβ T cell score by cluster:\n")
  print(as.data.frame(ab_summary))
}
## αβ T cell score by cluster:
##   seurat_clusters mean_abT_score median_abT_score   n
## 1               5    0.320899130      0.270580194  66
## 2               7    0.113679055      0.095031826  27
## 3               2    0.004799292     -0.067972847 116
## 4               3   -0.001336393      0.007285311  94
## 5               4   -0.018776445     -0.059036756  66
## 6               0   -0.023801753     -0.062634805 314
## 7               6   -0.043555381     -0.076530676  65
## 8               1   -0.052100862     -0.100008235 271
# Also check key αβ markers individually per cluster
if (length(ab_present) >= 2) {
  DotPlot(gdt, features = ab_present, group.by = "seurat_clusters",
          cols = c("lightgrey", "navy"), dot.scale = 6) +
    RotatedAxis() +
    ggtitle("αβ T Cell Markers by Cluster — Contamination Check") +
    theme(axis.text.x = element_text(face = "italic", size = 10))
}

if ("abT_score1" %in% colnames(gdt@meta.data)) {
  VlnPlot(gdt, features = "abT_score1", group.by = "seurat_clusters",
          pt.size = 0.3) +
    geom_hline(yintercept = 0, linetype = "dashed", colour = "red") +
    ggtitle("αβ T Cell Score by Cluster") +
    NoLegend()
}

Identify & remove contaminant clusters

# Identify clusters with high αβ T score AND low γδ identity markers
# γδ identity: Cd3e + at least one of Rorc, Sox13, Maf, Blk
gd_id_genes <- c("Rorc", "Sox13", "Maf", "Blk")
gd_id_present <- gd_id_genes[gd_id_genes %in% rownames(gdt)]

cluster_identity <- FetchData(gdt, vars = c("seurat_clusters",
                                             ab_present[ab_present %in% colnames(FetchData(gdt, vars = ab_present))],
                                             gd_id_present)) %>%
  group_by(seurat_clusters) %>%
  summarise(
    across(everything(), ~ mean(. > 0) * 100),
    n = n(),
    .groups = "drop"
  )

cat("Per-cluster marker detection (% cells expressing):\n")
## Per-cluster marker detection (% cells expressing):
print(as.data.frame(cluster_identity))
##   seurat_clusters        Cd4       Cd8a      Cd8b1      Cd28     Themis
## 1               0  0.6369427  0.6369427  0.3184713  2.229299  0.3184713
## 2               1  0.3690037  1.8450185  0.7380074  4.797048  4.0590406
## 3               2  5.1724138  0.8620690  0.0000000  6.896552  5.1724138
## 4               3  1.0638298  1.0638298  4.2553191  6.382979  5.3191489
## 5               4  0.0000000  1.5151515  0.0000000  0.000000  0.0000000
## 6               5 56.0606061 28.7878788 27.2727273 75.757576 43.9393939
## 7               6 10.7692308  1.5384615  3.0769231 15.384615  7.6923077
## 8               7 40.7407407 11.1111111  7.4074074 40.740741 18.5185185
##      Ubash3a      Lef1     Tcf7      Rag1      Rag2     Rorc     Sox13
## 1  0.3184713 10.191083 18.47134  0.000000 0.3184713 18.47134 10.828025
## 2  0.7380074  4.428044 33.21033  0.000000 0.0000000 73.43173 43.173432
## 3  1.7241379  6.034483 31.89655  0.000000 0.0000000 71.55172 34.482759
## 4  3.1914894 17.021277 41.48936  0.000000 0.0000000 69.14894 37.234043
## 5  1.5151515 10.606061 37.87879  0.000000 0.0000000 53.03030 31.818182
## 6 40.9090909 18.181818 40.90909  0.000000 0.0000000 81.81818  9.090909
## 7  6.1538462 15.384615 21.53846  1.538462 0.0000000 63.07692 32.307692
## 8 18.5185185 37.037037 29.62963 18.518519 3.7037037 14.81481 11.111111
##         Maf       Blk   n
## 1  69.10828  57.96178 314
## 2  98.89299  85.23985 271
## 3  96.55172  59.48276 116
## 4  97.87234  63.82979  94
## 5  87.87879  65.15152  66
## 6 100.00000  19.69697  66
## 7  92.30769  80.00000  65
## 8 100.00000 100.00000  27
# Flag clusters where ≥2 αβ markers each expressed in >20% of cells
# AND γδ17 TFs each expressed in <15% of cells
contaminant_clusters <- c()

for (cl in unique(gdt$seurat_clusters)) {
  cl_cells <- WhichCells(gdt, idents = cl)
  cl_data  <- GetAssayData(gdt, layer = "data")[, cl_cells, drop = FALSE]
  
  # αβ marker positivity
  ab_pct <- sapply(ab_present, function(g) mean(cl_data[g, ] > 0) * 100)
  ab_high <- sum(ab_pct > 20)
  
  # γδ TF positivity
  gd_pct <- sapply(gd_id_present, function(g) mean(cl_data[g, ] > 0) * 100)
  gd_any_high <- any(gd_pct > 15)
  
  # Cd3e positivity
  cd3e_pct <- mean(cl_data["Cd3e", ] > 0) * 100
  
  if (ab_high >= 2 & !gd_any_high) {
    contaminant_clusters <- c(contaminant_clusters, cl)
    cat("  FLAGGED cluster", cl, "— αβ markers high in", ab_high,
        "genes, no γδ TFs >15%\n")
  }
}

if (length(contaminant_clusters) == 0) {
  cat("\nNo clusters flagged by automated criteria.\n")
  cat("Checking manually for cluster 5 (αβ-like in v3)...\n")
  
  # Manual check for the cluster enriched in Cd4/Cd8/Themis from v3
  for (cl in unique(gdt$seurat_clusters)) {
    cl_cells <- WhichCells(gdt, idents = cl)
    cl_data  <- GetAssayData(gdt, layer = "data")[, cl_cells, drop = FALSE]
    
    cd4_pct  <- if ("Cd4" %in% rownames(cl_data)) mean(cl_data["Cd4", ] > 0) * 100 else 0
    cd8a_pct <- if ("Cd8a" %in% rownames(cl_data)) mean(cl_data["Cd8a", ] > 0) * 100 else 0
    rorc_pct <- if ("Rorc" %in% rownames(cl_data)) mean(cl_data["Rorc", ] > 0) * 100 else 0
    
    if ((cd4_pct > 30 | cd8a_pct > 20) & rorc_pct < 15) {
      contaminant_clusters <- c(contaminant_clusters, cl)
      cat("  Manual flag: cluster", cl, "— Cd4:", round(cd4_pct, 1),
          "%, Cd8a:", round(cd8a_pct, 1), "%, Rorc:", round(rorc_pct, 1), "%\n")
    }
  }
}
## 
## No clusters flagged by automated criteria.
## Checking manually for cluster 5 (αβ-like in v3)...
##   Manual flag: cluster 7 — Cd4: 40.7 %, Cd8a: 11.1 %, Rorc: 14.8 %
cat("\n=== Contaminant clusters to remove:", 
    ifelse(length(contaminant_clusters) > 0,
           paste(contaminant_clusters, collapse = ", "),
           "None"), "===\n")
## 
## === Contaminant clusters to remove: 7 ===
n_before <- ncol(gdt)

if (length(contaminant_clusters) > 0) {
  # Mark before removal
  gdt$is_contaminant <- gdt$seurat_clusters %in% contaminant_clusters
  
  p1 <- DimPlot(gdt, group.by = "is_contaminant",
                cols = c("TRUE" = "red", "FALSE" = "grey80")) +
    ggtitle(paste0("Contaminant Cells (clusters ",
                   paste(contaminant_clusters, collapse = ", "), ")"))
  print(p1)
  
  cat("\nCells before filtering:", n_before, "\n")
  cat("Cells in contaminant clusters:", sum(gdt$is_contaminant), "\n")
  
  # Remove
  gdt <- subset(gdt, seurat_clusters %in% contaminant_clusters, invert = TRUE)
  
  cat("Cells after filtering:", ncol(gdt), "\n")
  cat("Removed:", n_before - ncol(gdt), "likely αβ T cells\n")
} else {
  cat("No contaminant clusters identified — retaining all", n_before, "cells\n")
}

## 
## Cells before filtering: 1019 
## Cells in contaminant clusters: 27 
## Cells after filtering: 992 
## Removed: 27 likely αβ T cells

Re-cluster after contaminant removal

if (length(contaminant_clusters) > 0) {
  cat("Re-clustering after contaminant removal...\n")
  gdt <- NormalizeData(gdt)
  gdt <- FindVariableFeatures(gdt, nfeatures = 2000)
  gdt <- ScaleData(gdt, features = rownames(gdt))
  gdt <- RunPCA(gdt, npcs = 30, verbose = FALSE)
  gdt <- FindNeighbors(gdt, dims = 1:n_pcs)
  gdt <- FindClusters(gdt, resolution = 0.5, verbose = FALSE)
  gdt <- RunUMAP(gdt, dims = 1:n_pcs)
  
  Idents(gdt) <- "seurat_clusters"
  
  p1 <- DimPlot(gdt, label = TRUE, label.size = 5) +
    ggtitle(paste0("Clean γδ T Clusters (n=", ncol(gdt), ")"))
  p2 <- DimPlot(gdt, group.by = "site") + ggtitle("Site")
  p1 + p2
} else {
  cat("Skipping re-cluster — no cells removed.\n")
}
## Re-clustering after contaminant removal...

7. γδ17 vs γδIFN Classification

gd17_sig  <- c("Rorc", "Sox13", "Maf", "Blk", "Rora", "Scart2",
               "Bhlhe40", "Zbtb16")
gdifn_sig <- c("Tbx21", "Eomes", "Ifng", "Gzmb", "Gzma",
               "Prf1", "Nkg7", "Ccl5", "Cd27")
il17_sig  <- c("Il17a", "Il17f", "Il22", "Il23r", "Il1r1",
               "Ccr6", "Cd44", "Rorc", "Rora", "Sox13", "Maf")

gdt <- AddModuleScore(gdt, features = list(gd17_sig[gd17_sig %in% rownames(gdt)]),
                      name = "gd17_score")
gdt <- AddModuleScore(gdt, features = list(gdifn_sig[gdifn_sig %in% rownames(gdt)]),
                      name = "gdIFN_score")
gdt <- AddModuleScore(gdt, features = list(il17_sig[il17_sig %in% rownames(gdt)]),
                      name = "IL17_score")
p1 <- FeaturePlot(gdt, features = "gd17_score1", order = TRUE,
                  cols = c("lightgrey", "#CC0000")) + ggtitle("γδ17 Score")
p2 <- FeaturePlot(gdt, features = "gdIFN_score1", order = TRUE,
                  cols = c("lightgrey", "#0066CC")) + ggtitle("γδIFN Score")
p3 <- FeaturePlot(gdt, features = "IL17_score1", order = TRUE,
                  cols = c("lightgrey", "#FF6600")) + ggtitle("IL-17 Score")
p1 + p2 + p3

gdt$gd_subtype <- case_when(
  gdt$gd17_score1 > 0 & gdt$gdIFN_score1 <= 0 ~ "gd17",
  gdt$gdIFN_score1 > 0 & gdt$gd17_score1 <= 0 ~ "gdIFN",
  gdt$gd17_score1 > 0 & gdt$gdIFN_score1 > 0  ~ "Mixed",
  TRUE ~ "Unassigned"
)
print(table(gdt$gd_subtype))
## 
##       gd17      gdIFN      Mixed Unassigned 
##        333        172         80        407
sc <- c("gd17" = "#CC0000", "gdIFN" = "#0066CC",
        "Mixed" = "#9933CC", "Unassigned" = "grey80")

p1 <- DimPlot(gdt, group.by = "gd_subtype", cols = sc) + ggtitle("Subtypes")
p2 <- gdt@meta.data %>%
  ggplot(aes(x = seurat_clusters, fill = gd_subtype)) +
  geom_bar(position = "fill") + scale_fill_manual(values = sc) +
  theme_minimal() + labs(title = "By Cluster", x = "Cluster", y = "Prop.")
p3 <- gdt@meta.data %>%
  ggplot(aes(x = reorder(site, site, length), fill = gd_subtype)) +
  geom_bar(position = "fill") + coord_flip() +
  scale_fill_manual(values = sc) + theme_minimal() +
  labs(title = "By Site", x = NULL, y = "Prop.")
p4 <- gdt@meta.data %>%
  ggplot(aes(x = age_group, fill = gd_subtype)) +
  geom_bar(position = "fill") + scale_fill_manual(values = sc) +
  theme_minimal() + labs(title = "By Age", x = NULL, y = "Prop.")
(p1 + p2) / (p3 + p4)

FetchData(gdt, vars = c("gd17_score1", "gdIFN_score1", "gd_subtype")) %>%
  ggplot(aes(x = gd17_score1, y = gdIFN_score1, colour = gd_subtype)) +
  geom_point(alpha = 0.5, size = 1.2) +
  scale_colour_manual(values = sc) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  geom_vline(xintercept = 0, linetype = "dashed") +
  theme_minimal() +
  labs(title = "γδ17 vs γδIFN Score", x = "γδ17 Score", y = "γδIFN Score")

8. IL-17 Programme

il17_panel <- c("Il17a", "Il17f", "Il22", "Il21",
                "Rorc", "Rora", "Sox13", "Maf", "Blk",
                "Il23r", "Il1r1", "Il6ra", "Il6st",
                "Ccr6", "Cd44", "Cd69", "Cd27",
                "Cebpb", "Nfkb1", "Ccl20", "Cxcl2", "Csf2")
il17_present <- il17_panel[il17_panel %in% rownames(gdt)]

DotPlot(gdt, features = il17_present, group.by = "seurat_clusters",
        cols = c("lightgrey", "#CC0000"), dot.scale = 6) +
  RotatedAxis() +
  ggtitle("IL-17 Programme by Cluster") +
  theme(axis.text.x = element_text(face = "italic"))

meningeal_sites <- c("dura mater", "dural meninges", "brain meninges",
                     "enriched subdural meninges")
gdt$tissue_compartment <- case_when(
  gdt$site %in% meningeal_sites ~ "Meningeal",
  gdt$site == "bone marrow"     ~ "Bone Marrow",
  gdt$site == "coronal suture"  ~ "Coronal Suture",
  gdt$site == "whole brain"     ~ "Brain",
  gdt$site == "choroid plexus"  ~ "Choroid Plexus",
  TRUE                          ~ "Other"
)
print(table(gdt$tissue_compartment))
## 
##    Bone Marrow          Brain Coronal Suture      Meningeal          Other 
##              9             37              1            866             79
comp_counts <- table(gdt$tissue_compartment)
valid_comps <- names(comp_counts[comp_counts >= 5])
gdt_valid   <- subset(gdt, tissue_compartment %in% valid_comps)

p1 <- DotPlot(gdt_valid, features = il17_present,
              group.by = "tissue_compartment",
              cols = c("lightgrey", "#CC0000"), dot.scale = 7) +
  RotatedAxis() +
  ggtitle("IL-17 Programme by Compartment") +
  theme(axis.text.x = element_text(face = "italic"))

p2 <- gdt_valid@meta.data %>%
  ggplot(aes(x = reorder(tissue_compartment, IL17_score1, median),
             y = IL17_score1, fill = tissue_compartment)) +
  geom_boxplot(outlier.size = 0.5) + coord_flip() +
  theme_minimal() +
  labs(title = "IL-17 Score by Compartment", x = NULL, y = "Score") +
  theme(legend.position = "none")

p1 / p2

8.1 Meningeal vs Bone Marrow DE

n_men <- sum(gdt$tissue_compartment == "Meningeal")
n_bm  <- sum(gdt$tissue_compartment == "Bone Marrow")
cat("Meningeal:", n_men, "| BM:", n_bm, "\n")
## Meningeal: 866 | BM: 9
if (n_men >= 10 & n_bm >= 10) {
  gdt_mb <- subset(gdt, tissue_compartment %in% c("Meningeal", "Bone Marrow"))
  Idents(gdt_mb) <- "tissue_compartment"
  de_mb <- FindMarkers(gdt_mb, ident.1 = "Meningeal", ident.2 = "Bone Marrow",
                       test.use = "wilcox", min.pct = 0.1)
  de_mb$gene <- rownames(de_mb)
  
  cat("\nMeningeal-enriched:\n")
  print(head(de_mb %>% filter(avg_log2FC > 0) %>% arrange(p_val_adj), 20))
  cat("\nBM-enriched:\n")
  print(head(de_mb %>% filter(avg_log2FC < 0) %>% arrange(p_val_adj), 20))
} else {
  cat("Not enough cells for comparison.\n")
}
## Not enough cells for comparison.
if (exists("de_mb") && nrow(de_mb) > 0) {
  hl <- c("Rorc", "Sox13", "Maf", "Blk", "Il17a", "Il17f", "Ifng",
          "Tbx21", "Ccr6", "Cd27", "Cxcr6", "Ccl5", "S1pr1", "Il23r", "Gzmb")
  de_mb <- de_mb %>%
    mutate(sig = case_when(p_val_adj < 0.05 & avg_log2FC > 0.5  ~ "Meningeal",
                           p_val_adj < 0.05 & avg_log2FC < -0.5 ~ "BM",
                           TRUE ~ "NS"),
           label = ifelse(gene %in% hl | (p_val_adj < 1e-10 & abs(avg_log2FC) > 1),
                          gene, NA))
  
  ggplot(de_mb, aes(x = avg_log2FC, y = -log10(p_val_adj), colour = sig)) +
    geom_point(alpha = 0.6, size = 1.5) +
    geom_text_repel(aes(label = label), size = 3, max.overlaps = 25,
                    fontface = "italic") +
    scale_colour_manual(values = c("Meningeal" = "#E41A1C",
                                   "BM" = "#377EB8", "NS" = "grey70")) +
    theme_minimal() +
    labs(title = "Meningeal vs BM γδ T Cells",
         x = "log2 FC (+ = meningeal)", y = "-log10(adj p)")
}

8.2 γδ17 vs γδIFN DE

n17  <- sum(gdt$gd_subtype == "gd17")
nifn <- sum(gdt$gd_subtype == "gdIFN")
cat("gd17:", n17, "| gdIFN:", nifn, "\n")
## gd17: 333 | gdIFN: 172
if (n17 >= 10 & nifn >= 10) {
  gdt_c <- subset(gdt, gd_subtype %in% c("gd17", "gdIFN"))
  Idents(gdt_c) <- "gd_subtype"
  de_sub <- FindMarkers(gdt_c, ident.1 = "gd17", ident.2 = "gdIFN",
                        test.use = "wilcox", min.pct = 0.1)
  de_sub$gene <- rownames(de_sub)
  
  cat("\nγδ17-enriched:\n")
  print(head(de_sub %>% filter(avg_log2FC > 0) %>% arrange(p_val_adj), 15))
  cat("\nγδIFN-enriched:\n")
  print(head(de_sub %>% filter(avg_log2FC < 0) %>% arrange(p_val_adj), 15))
}
## 
## γδ17-enriched:
##                p_val avg_log2FC pct.1 pct.2    p_val_adj    gene
## Blk     9.494150e-29  1.9441285 0.844 0.459 2.452908e-24     Blk
## Cd163l1 1.282855e-22  1.4835616 0.862 0.453 3.314385e-18 Cd163l1
## Maf     1.273286e-15  0.9257445 0.952 0.872 3.289662e-11     Maf
## Sox13   6.512354e-13  2.0922815 0.459 0.157 1.682532e-08   Sox13
## Rorc    3.619342e-11  1.1955170 0.682 0.494 9.350932e-07    Rorc
## Ckb     7.290784e-10  0.7004056 0.910 0.826 1.883647e-05     Ckb
## Itm2b   7.385718e-09  0.3646948 0.997 1.000 1.908174e-04   Itm2b
## Rora    1.073310e-08  0.7584389 0.859 0.733 2.773005e-04    Rora
## Cd3e    1.099119e-08  0.5545632 0.979 0.994 2.839683e-04    Cd3e
## Kcnk1   2.433354e-08  1.1511850 0.556 0.343 6.286814e-04   Kcnk1
## S100a11 1.872645e-07  0.5329455 0.961 0.983 4.838165e-03 S100a11
## Cd3g    3.364725e-07  0.4758110 0.973 0.994 8.693105e-03    Cd3g
## Fxyd5   1.205322e-06  0.4706570 0.961 0.977 3.114071e-02   Fxyd5
## Cxcr6   1.690806e-06  0.4733319 0.967 0.948 4.368366e-02   Cxcr6
## Zbtb16  2.342841e-06  1.9675745 0.297 0.122 6.052964e-02  Zbtb16
## 
## γδIFN-enriched:
##               p_val avg_log2FC pct.1 pct.2    p_val_adj   gene
## Ccl5   6.360731e-39 -10.257477 0.006 0.453 1.643359e-34   Ccl5
## Nkg7   2.237279e-31 -10.152163 0.000 0.360 5.780235e-27   Nkg7
## Lck    4.613691e-29  -3.136096 0.045 0.465 1.191993e-24    Lck
## Cd27   2.018768e-26  -5.772819 0.009 0.331 5.215689e-22   Cd27
## Ms4a4b 1.990522e-21  -3.603915 0.051 0.378 5.142712e-17 Ms4a4b
## Cd28   1.374854e-19  -3.980542 0.021 0.291 3.552073e-15   Cd28
## Cd2    1.366790e-18  -2.349671 0.117 0.465 3.531239e-14    Cd2
## Fyb    4.295831e-18  -2.119705 0.102 0.442 1.109871e-13    Fyb
## Ctla2a 5.321546e-15  -2.216405 0.078 0.360 1.374875e-10 Ctla2a
## Tbx21  3.504392e-13  -7.091212 0.000 0.151 9.053948e-09  Tbx21
## Slfn1  5.760703e-13  -2.874069 0.009 0.180 1.488335e-08  Slfn1
## Prkch  2.287851e-12  -2.402402 0.024 0.215 5.910892e-08  Prkch
## Gm8369 2.451908e-12  -5.288303 0.003 0.151 6.334750e-08 Gm8369
## Ms4a6b 2.538169e-12  -1.602127 0.204 0.512 6.557614e-08 Ms4a6b
## Prf1   3.081005e-12  -6.696618 0.000 0.140 7.960083e-08   Prf1
if (exists("de_sub") && nrow(de_sub) > 0) {
  hl2 <- c(gd17_sig, gdifn_sig, "Il17a", "Il17f", "Ifng", "Ccr6",
           "Cd27", "Il23r")
  de_sub <- de_sub %>%
    mutate(sig = case_when(p_val_adj < 0.05 & avg_log2FC > 0.5  ~ "gd17",
                           p_val_adj < 0.05 & avg_log2FC < -0.5 ~ "gdIFN",
                           TRUE ~ "NS"),
           label = ifelse(gene %in% hl2 | (p_val_adj < 1e-10 & abs(avg_log2FC) > 1),
                          gene, NA))
  
  ggplot(de_sub, aes(x = avg_log2FC, y = -log10(p_val_adj), colour = sig)) +
    geom_point(alpha = 0.6, size = 1.5) +
    geom_text_repel(aes(label = label), size = 3, max.overlaps = 25,
                    fontface = "italic") +
    scale_colour_manual(values = c("gd17" = "#CC0000",
                                   "gdIFN" = "#0066CC", "NS" = "grey70")) +
    theme_minimal() +
    labs(title = "γδ17 vs γδIFN", x = "log2 FC (+ = γδ17)", y = "-log10(adj p)")
}

9. Age-Dependent Changes — Enhanced Analysis

9.1 Score Distributions by Age

age_order <- c("E15.5", "E17.5", "2 months", "3 months", "8-12 weeks",
               "8-14 weeks", "9 weeks", "16 months",
               "20-25 months", "22 months", "24 months")
gdt$age_ordered <- factor(gdt$age_info,
                          levels = intersect(age_order, unique(gdt$age_info)))

# Create a broad age category for statistical testing
gdt$age_broad <- case_when(
  gdt$age_info %in% c("E15.5", "E17.5")                            ~ "Embryonic",
  gdt$age_info %in% c("2 months", "3 months", "8-12 weeks",
                       "8-14 weeks", "9 weeks")                     ~ "Young Adult",
  gdt$age_info %in% c("16 months", "20-25 months",
                       "22 months", "24 months")                    ~ "Aged",
  TRUE                                                              ~ NA_character_
)
gdt$age_broad <- factor(gdt$age_broad, levels = c("Embryonic", "Young Adult", "Aged"))

cat("Broad age categories:\n")
## Broad age categories:
print(table(gdt$age_broad, useNA = "ifany"))
## 
##   Embryonic Young Adult        Aged 
##           1         605         386
p1 <- gdt@meta.data %>% filter(!is.na(age_ordered)) %>%
  ggplot(aes(x = age_ordered, y = gd17_score1, fill = age_ordered)) +
  geom_boxplot(outlier.size = 0.5) + theme_minimal() +
  labs(title = "γδ17 Score by Age", x = NULL, y = "Score") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = "none")

p2 <- gdt@meta.data %>% filter(!is.na(age_ordered)) %>%
  ggplot(aes(x = age_ordered, y = IL17_score1, fill = age_ordered)) +
  geom_boxplot(outlier.size = 0.5) + theme_minimal() +
  labs(title = "IL-17 Score by Age", x = NULL, y = "Score") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = "none")

p3 <- gdt@meta.data %>% filter(!is.na(age_ordered)) %>%
  ggplot(aes(x = age_ordered, y = gdIFN_score1, fill = age_ordered)) +
  geom_boxplot(outlier.size = 0.5) + theme_minimal() +
  labs(title = "γδIFN Score by Age", x = NULL, y = "Score") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = "none")

p1 / p2 / p3

9.2 Proportion Shift — γδ17 vs γδIFN with Age

gdt@meta.data %>%
  filter(!is.na(age_ordered), gd_subtype %in% c("gd17", "gdIFN")) %>%
  ggplot(aes(x = age_ordered, fill = gd_subtype)) +
  geom_bar(position = "fill") +
  scale_fill_manual(values = c("gd17" = "#CC0000", "gdIFN" = "#0066CC")) +
  theme_minimal() +
  labs(title = "γδ17 vs γδIFN Across Development & Aging",
       x = NULL, y = "Proportion") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

9.3 Statistical Tests — Age-Dependent IL-17 Decline

# ── Focus on meningeal cells only for clean age comparison ──
gdt_men <- subset(gdt, tissue_compartment == "Meningeal")
cat("Meningeal γδ T cells for age analysis:", ncol(gdt_men), "\n\n")
## Meningeal γδ T cells for age analysis: 866
# Contingency table: broad age × subtype (gd17 vs gdIFN)
age_sub <- gdt_men@meta.data %>%
  filter(!is.na(age_broad), gd_subtype %in% c("gd17", "gdIFN"))

cat("Cells per age × subtype:\n")
## Cells per age × subtype:
age_table <- table(age_sub$age_broad, age_sub$gd_subtype)
print(age_table)
##              
##               gd17 gdIFN
##   Embryonic      0     0
##   Young Adult  166    72
##   Aged         126    73
cat("\n")
# Proportions
age_props <- prop.table(age_table, margin = 1)
cat("Proportions (row-normalised):\n")
## Proportions (row-normalised):
print(round(age_props, 3))
##              
##                gd17 gdIFN
##   Embryonic              
##   Young Adult 0.697 0.303
##   Aged        0.633 0.367
cat("\n")
# ── Pairwise Fisher's exact tests ──
if (nrow(age_table) >= 2 & ncol(age_table) >= 2) {
  # Overall chi-squared / Fisher test
  cat("=== Overall test: age_broad × subtype ===\n")
  if (min(age_table) >= 5) {
    overall_test <- chisq.test(age_table)
    cat("Chi-squared test: X² =", round(overall_test$statistic, 2),
        ", df =", overall_test$parameter,
        ", p =", format.pval(overall_test$p.value, digits = 3), "\n\n")
  } else {
    overall_test <- fisher.test(age_table, simulate.p.value = TRUE, B = 10000)
    cat("Fisher's exact test (simulated): p =",
        format.pval(overall_test$p.value, digits = 3), "\n\n")
  }
  
  # Pairwise comparisons
  age_levels <- levels(droplevels(factor(age_sub$age_broad)))
  if (length(age_levels) >= 2) {
    cat("=== Pairwise Fisher's exact tests ===\n")
    pairwise_results <- list()
    
    for (i in 1:(length(age_levels) - 1)) {
      for (j in (i + 1):length(age_levels)) {
        a1 <- age_levels[i]
        a2 <- age_levels[j]
        sub_tab <- age_table[c(a1, a2), ]
        
        # Only test if both groups have cells
        if (all(rowSums(sub_tab) > 0)) {
          ft <- fisher.test(sub_tab)
          pairwise_results[[paste(a1, "vs", a2)]] <- data.frame(
            Comparison  = paste(a1, "vs", a2),
            gd17_pct_1  = round(age_props[a1, "gd17"] * 100, 1),
            gd17_pct_2  = round(age_props[a2, "gd17"] * 100, 1),
            OR          = round(ft$estimate, 3),
            p_value     = ft$p.value,
            stringsAsFactors = FALSE
          )
        }
      }
    }
    
    pairwise_df <- bind_rows(pairwise_results)
    pairwise_df$p_adj <- p.adjust(pairwise_df$p_value, method = "BH")
    pairwise_df$sig   <- ifelse(pairwise_df$p_adj < 0.05, "*", "ns")
    
    print(pairwise_df)
  }
}
## === Overall test: age_broad × subtype ===
## Fisher's exact test (simulated): p = 0.193 
## 
## === Pairwise Fisher's exact tests ===
##                     Comparison gd17_pct_1 gd17_pct_2    OR   p_value     p_adj
## odds ratio Young Adult vs Aged       69.7       63.3 1.335 0.1847454 0.1847454
##            sig
## odds ratio  ns
# ── Logistic regression: P(gd17) ~ age_broad, meningeal cells only ──
age_sub_men <- gdt_men@meta.data %>%
  filter(!is.na(age_broad), gd_subtype %in% c("gd17", "gdIFN")) %>%
  mutate(is_gd17 = as.integer(gd_subtype == "gd17"))

if (nrow(age_sub_men) >= 20 & length(unique(age_sub_men$age_broad)) >= 2) {
  glm_fit <- glm(is_gd17 ~ age_broad, data = age_sub_men, family = binomial)
  
  cat("\n=== Logistic regression: P(γδ17) ~ age (meningeal only) ===\n")
  print(summary(glm_fit))
  
  # Odds ratios with 95% CI
  or_table <- data.frame(
    Term = names(coef(glm_fit)),
    OR   = exp(coef(glm_fit)),
    CI_lower = exp(confint(glm_fit))[, 1],
    CI_upper = exp(confint(glm_fit))[, 2],
    p_value  = summary(glm_fit)$coefficients[, 4]
  )
  cat("\nOdds ratios:\n")
  print(or_table)
}
## 
## === Logistic regression: P(γδ17) ~ age (meningeal only) ===
## 
## Call:
## glm(formula = is_gd17 ~ age_broad, family = binomial, data = age_sub_men)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     0.8353     0.1411    5.92 3.23e-09 ***
## age_broadAged  -0.2895     0.2038   -1.42    0.156    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 555.38  on 436  degrees of freedom
## Residual deviance: 553.37  on 435  degrees of freedom
## AIC: 557.37
## 
## Number of Fisher Scoring iterations: 4
## 
## Odds ratios:
##                        Term        OR  CI_lower CI_upper      p_value
## (Intercept)     (Intercept) 2.3055556 1.7565143 3.056946 3.229022e-09
## age_broadAged age_broadAged 0.7486384 0.5016056 1.116233 1.555283e-01
# ── Kruskal-Wallis + pairwise Wilcoxon for continuous scores ──
men_age <- gdt_men@meta.data %>% filter(!is.na(age_broad))

if (length(unique(men_age$age_broad)) >= 2) {
  cat("\n=== Kruskal-Wallis: γδ17 score ~ age_broad (meningeal) ===\n")
  kw_17 <- kruskal.test(gd17_score1 ~ age_broad, data = men_age)
  cat("H =", round(kw_17$statistic, 2), ", p =",
      format.pval(kw_17$p.value, digits = 3), "\n")
  
  cat("\n=== Kruskal-Wallis: IL-17 score ~ age_broad (meningeal) ===\n")
  kw_il17 <- kruskal.test(IL17_score1 ~ age_broad, data = men_age)
  cat("H =", round(kw_il17$statistic, 2), ", p =",
      format.pval(kw_il17$p.value, digits = 3), "\n")
  
  cat("\n=== Kruskal-Wallis: γδIFN score ~ age_broad (meningeal) ===\n")
  kw_ifn <- kruskal.test(gdIFN_score1 ~ age_broad, data = men_age)
  cat("H =", round(kw_ifn$statistic, 2), ", p =",
      format.pval(kw_ifn$p.value, digits = 3), "\n")
  
  # Pairwise Wilcoxon for gd17 score
  cat("\n=== Pairwise Wilcoxon: γδ17 score ===\n")
  pw_17 <- pairwise.wilcox.test(men_age$gd17_score1, men_age$age_broad,
                                 p.adjust.method = "BH")
  print(pw_17)
  
  cat("\n=== Pairwise Wilcoxon: IL-17 score ===\n")
  pw_il17 <- pairwise.wilcox.test(men_age$IL17_score1, men_age$age_broad,
                                   p.adjust.method = "BH")
  print(pw_il17)
  
  cat("\n=== Pairwise Wilcoxon: γδIFN score ===\n")
  pw_ifn <- pairwise.wilcox.test(men_age$gdIFN_score1, men_age$age_broad,
                                  p.adjust.method = "BH")
  print(pw_ifn)
}
## 
## === Kruskal-Wallis: γδ17 score ~ age_broad (meningeal) ===
## H = 2.07 , p = 0.15 
## 
## === Kruskal-Wallis: IL-17 score ~ age_broad (meningeal) ===
## H = 0.45 , p = 0.504 
## 
## === Kruskal-Wallis: γδIFN score ~ age_broad (meningeal) ===
## H = 44.94 , p = 2.03e-11 
## 
## === Pairwise Wilcoxon: γδ17 score ===
## 
##  Pairwise comparisons using Wilcoxon rank sum test with continuity correction 
## 
## data:  men_age$gd17_score1 and men_age$age_broad 
## 
##      Young Adult
## Aged 0.15       
## 
## P value adjustment method: BH 
## 
## === Pairwise Wilcoxon: IL-17 score ===
## 
##  Pairwise comparisons using Wilcoxon rank sum test with continuity correction 
## 
## data:  men_age$IL17_score1 and men_age$age_broad 
## 
##      Young Adult
## Aged 0.5        
## 
## P value adjustment method: BH 
## 
## === Pairwise Wilcoxon: γδIFN score ===
## 
##  Pairwise comparisons using Wilcoxon rank sum test with continuity correction 
## 
## data:  men_age$gdIFN_score1 and men_age$age_broad 
## 
##      Young Adult
## Aged 2e-11      
## 
## P value adjustment method: BH

9.4 Summary Visualisation — Age Effects (Meningeal Only)

men_age_sub <- gdt_men@meta.data %>%
  filter(!is.na(age_broad), gd_subtype %in% c("gd17", "gdIFN"))

# Panel A: proportion bar
p_prop <- men_age_sub %>%
  ggplot(aes(x = age_broad, fill = gd_subtype)) +
  geom_bar(position = "fill") +
  scale_fill_manual(values = c("gd17" = "#CC0000", "gdIFN" = "#0066CC")) +
  theme_minimal(base_size = 13) +
  labs(title = "A. γδ17 vs γδIFN Proportion by Age (Meningeal)",
       x = NULL, y = "Proportion", fill = "Subtype") +
  theme(legend.position = "bottom")

# Panel B: score boxplots by broad age
score_long <- gdt_men@meta.data %>%
  filter(!is.na(age_broad)) %>%
  select(age_broad, gd17_score1, gdIFN_score1, IL17_score1) %>%
  pivot_longer(cols = c(gd17_score1, gdIFN_score1, IL17_score1),
               names_to = "score_type", values_to = "score") %>%
  mutate(score_type = recode(score_type,
                             "gd17_score1"  = "γδ17 Score",
                             "gdIFN_score1" = "γδIFN Score",
                             "IL17_score1"  = "IL-17 Score"))

p_scores <- score_long %>%
  ggplot(aes(x = age_broad, y = score, fill = age_broad)) +
  geom_boxplot(outlier.size = 0.5) +
  facet_wrap(~ score_type, scales = "free_y", ncol = 3) +
  theme_minimal(base_size = 13) +
  labs(title = "B. Module Scores by Age (Meningeal)",
       x = NULL, y = "Score") +
  theme(legend.position = "none",
        axis.text.x = element_text(angle = 30, hjust = 1))

# Panel C: detailed age (fine-grained)
p_detail <- gdt_men@meta.data %>%
  filter(!is.na(age_ordered), gd_subtype %in% c("gd17", "gdIFN")) %>%
  ggplot(aes(x = age_ordered, fill = gd_subtype)) +
  geom_bar(position = "fill") +
  scale_fill_manual(values = c("gd17" = "#CC0000", "gdIFN" = "#0066CC")) +
  theme_minimal(base_size = 13) +
  labs(title = "C. Fine-Grained Age Breakdown (Meningeal)",
       x = NULL, y = "Proportion", fill = "Subtype") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = "bottom")

# Panel D: cell counts per age
p_counts <- gdt_men@meta.data %>%
  filter(!is.na(age_broad)) %>%
  ggplot(aes(x = age_broad, fill = gd_subtype)) +
  geom_bar(position = "stack") +
  scale_fill_manual(values = c("gd17" = "#CC0000", "gdIFN" = "#0066CC",
                               "Mixed" = "#9933CC", "Unassigned" = "grey80")) +
  theme_minimal(base_size = 13) +
  labs(title = "D. Cell Counts by Age (Meningeal)",
       x = NULL, y = "Cells", fill = "Subtype") +
  theme(legend.position = "bottom")

(p_prop + p_counts) / (p_scores) / (p_detail) +
  plot_layout(heights = c(1, 1, 1))

10. IL-17 Niche — Response in All Cell Types

il17_niche <- c("Il17ra", "Il17rb", "Il17rc", "Il17rd", "Il17re",
                "Cxcl1", "Cxcl2", "Cxcl5", "Ccl20",
                "Lcn2", "S100a8", "S100a9",
                "Mmp9", "Mmp13", "Cebpb", "Cebpd",
                "Csf3", "Csf2", "Nfkbia")
il17_niche_present <- il17_niche[il17_niche %in% rownames(seurat_annotated)]

if (length(il17_niche_present) >= 2) {
  DotPlot(seurat_annotated, features = il17_niche_present,
          group.by = "original_annotation",
          cols = c("lightgrey", "#FF6600"), dot.scale = 5) +
    RotatedAxis() +
    ggtitle("IL-17 Response in All Annotated Cell Types") +
    theme(axis.text.x = element_text(face = "italic", size = 9),
          axis.text.y = element_text(size = 8))
}

11. Summary Heatmap

hm_genes <- c("Cd3e", "Cd3d", "Trdc",
              "Rorc", "Sox13", "Maf", "Blk", "Rora", "Zbtb16", "Scart2",
              "Il17a", "Il17f", "Il22", "Il23r", "Il1r1", "Ccr6",
              "Tbx21", "Eomes", "Ifng", "Gzmb", "Nkg7", "Cd27",
              "Cd44", "Cd69", "Cxcr6", "Mki67", "Top2a")
hm_present <- unique(hm_genes[hm_genes %in% rownames(gdt)])

if (length(hm_present) >= 5) {
  avg     <- AverageExpression(gdt, features = hm_present,
                               group.by = "seurat_clusters", return.seurat = FALSE)
  avg_mat <- as.matrix(avg[[1]])
  avg_sc  <- t(scale(t(avg_mat)))
  
  gene_cat <- data.frame(
    Category = case_when(
      hm_present %in% c("Cd3e", "Cd3d", "Trdc")                        ~ "1_Identity",
      hm_present %in% c("Rorc", "Sox13", "Maf", "Blk", "Rora",
                         "Zbtb16", "Scart2")                            ~ "2_gd17_TFs",
      hm_present %in% c("Il17a", "Il17f", "Il22", "Il23r",
                         "Il1r1", "Ccr6")                               ~ "3_IL17",
      hm_present %in% c("Tbx21", "Eomes", "Ifng", "Gzmb",
                         "Nkg7", "Cd27")                                ~ "4_gdIFN",
      hm_present %in% c("Cd44", "Cd69", "Cxcr6")                      ~ "5_Activation",
      TRUE                                                              ~ "6_Proliferation"),
    row.names = hm_present)
  
  pheatmap(avg_sc[hm_present, ],
           annotation_row = gene_cat,
           cluster_cols = TRUE, cluster_rows = FALSE,
           color = colorRampPalette(c("#0066CC", "white", "#CC0000"))(100),
           main = "γδ T Cell Programme by Cluster",
           fontsize_row = 9, fontsize_col = 10)
}

12. Prepare for Integration with miR-128a Data

12.1 Create Clean Meningeal γδ T Object

# ── Subset to meningeal γδ T cells (the population of interest) ──
gdt_meningeal <- subset(gdt, tissue_compartment == "Meningeal")
cat("Meningeal γδ T cells for integration:", ncol(gdt_meningeal), "\n\n")
## Meningeal γδ T cells for integration: 866
# Summary of what we're handing off
cat("Subtype composition:\n")
## Subtype composition:
print(table(gdt_meningeal$gd_subtype))
## 
##       gd17      gdIFN      Mixed Unassigned 
##        292        145         63        366
cat("\nAge distribution:\n")
## 
## Age distribution:
print(table(gdt_meningeal$age_broad, useNA = "ifany"))
## 
##   Embryonic Young Adult        Aged 
##           0         525         341
cat("\nSite breakdown:\n")
## 
## Site breakdown:
print(table(gdt_meningeal$site))
## 
## brain meninges     dura mater dural meninges 
##             24            504            338

12.2 Normalise & Prepare for SCTransform Integration

# Re-normalise the meningeal subset fresh for integration
# Using SCTransform for better integration with the miR-128a dataset
gdt_meningeal <- NormalizeData(gdt_meningeal)
gdt_meningeal <- FindVariableFeatures(gdt_meningeal, nfeatures = 3000)
gdt_meningeal <- ScaleData(gdt_meningeal, features = rownames(gdt_meningeal))
gdt_meningeal <- RunPCA(gdt_meningeal, npcs = 30, verbose = FALSE)
gdt_meningeal <- RunUMAP(gdt_meningeal, dims = 1:15)

p1 <- DimPlot(gdt_meningeal, group.by = "gd_subtype",
              cols = c("gd17" = "#CC0000", "gdIFN" = "#0066CC",
                       "Mixed" = "#9933CC", "Unassigned" = "grey80")) +
  ggtitle(paste0("Meningeal γδ T (n=", ncol(gdt_meningeal), ")"))

p2 <- DimPlot(gdt_meningeal, group.by = "age_broad") +
  ggtitle("By Age Group")

p1 + p2

12.3 Add Integration Metadata

# Tag this dataset for integration
gdt_meningeal$dataset       <- "CNS_border_atlas"
gdt_meningeal$species       <- "mouse"
gdt_meningeal$cell_type     <- "gdT"
gdt_meningeal$compartment   <- "meninges"
gdt_meningeal$mir128a_status <- "wildtype"   # these are WT reference cells

# Ensure key metadata is clean
gdt_meningeal$subtype_gd    <- gdt_meningeal$gd_subtype
gdt_meningeal$il17_score    <- gdt_meningeal$IL17_score1
gdt_meningeal$gd17_score    <- gdt_meningeal$gd17_score1
gdt_meningeal$gdifn_score   <- gdt_meningeal$gdIFN_score1

# Print final metadata summary
cat("Final integration object summary:\n")
## Final integration object summary:
cat("  Cells:", ncol(gdt_meningeal), "\n")
##   Cells: 866
cat("  Genes:", nrow(gdt_meningeal), "\n")
##   Genes: 25836
cat("  Assay:", Assays(gdt_meningeal), "\n")
##   Assay: originalexp
cat("\n  Key metadata columns:\n")
## 
##   Key metadata columns:
cat("    - gd_subtype / subtype_gd (gd17, gdIFN, Mixed, Unassigned)\n")
##     - gd_subtype / subtype_gd (gd17, gdIFN, Mixed, Unassigned)
cat("    - age_broad (Embryonic, Young Adult, Aged)\n")
##     - age_broad (Embryonic, Young Adult, Aged)
cat("    - age_info (detailed age)\n")
##     - age_info (detailed age)
cat("    - tissue_compartment (Meningeal)\n")
##     - tissue_compartment (Meningeal)
cat("    - site (specific meningeal site)\n")
##     - site (specific meningeal site)
cat("    - il17_score, gd17_score, gdifn_score\n")
##     - il17_score, gd17_score, gdifn_score
cat("    - dataset, species, mir128a_status\n")
##     - dataset, species, mir128a_status

12.4 DotPlot — Reference Markers for Integration QC

# Markers to check after integration with miR-128a data
integration_markers <- c(
  # γδ identity
  "Cd3e", "Cd3d",
  # γδ17 programme — expect changes with miR-128a
  "Rorc", "Sox13", "Maf", "Blk", "Rora",
  "Il17a", "Il17f", "Il23r", "Il1r1", "Ccr6",
  # γδIFN programme
  "Tbx21", "Ifng", "Gzmb", "Nkg7", "Cd27",
  # miR-128a relevant targets (curated from literature)
  "Bmi1", "Ezh2", "Kdm5a", "Suz12",
  "Rictor", "Pik3r1",
  # Activation / homing
  "Cd44", "Cd69", "Cxcr6", "S1pr1"
)
integration_markers_present <- integration_markers[integration_markers %in% rownames(gdt_meningeal)]

DotPlot(gdt_meningeal, features = integration_markers_present,
        group.by = "gd_subtype",
        cols = c("lightgrey", "darkred"), dot.scale = 6) +
  RotatedAxis() +
  ggtitle("Reference Expression — Meningeal γδ T Subtypes") +
  theme(axis.text.x = element_text(face = "italic", size = 9))

13. Save

out_dir <- "~/Documents/Projects/CNS_border_atlas/results/gdT_analysis"
dir.create(out_dir, showWarnings = FALSE, recursive = TRUE)

# Full γδ T object (all compartments, post-contaminant removal)
saveRDS(gdt, file.path(out_dir, "gdT_subset_clean.rds"))

# Meningeal γδ T object — ready for integration
saveRDS(gdt_meningeal, file.path(out_dir, "gdT_meningeal_for_integration.rds"))

# Markers and DE tables
if (exists("gdt_markers"))
  write_csv(gdt_markers, file.path(out_dir, "gdT_cluster_markers.csv"))
if (exists("de_sub"))
  write_csv(de_sub, file.path(out_dir, "DE_gd17_vs_gdIFN.csv"))
if (exists("de_mb"))
  write_csv(de_mb, file.path(out_dir, "DE_meningeal_vs_BM_gdT.csv"))

# Metadata
write_csv(gdt@meta.data %>% rownames_to_column("cell_barcode"),
          file.path(out_dir, "gdT_metadata_clean.csv"))
write_csv(gdt_meningeal@meta.data %>% rownames_to_column("cell_barcode"),
          file.path(out_dir, "gdT_meningeal_metadata.csv"))

# Age stats summary
if (exists("pairwise_df")) {
  write_csv(pairwise_df, file.path(out_dir, "age_pairwise_fisher_tests.csv"))
}

cat("Saved to:", out_dir, "\n")
## Saved to: ~/Documents/Projects/CNS_border_atlas/results/gdT_analysis
cat("\nKey outputs for integration:\n")
## 
## Key outputs for integration:
cat("  1. gdT_meningeal_for_integration.rds — Seurat object, meningeal γδ T cells\n")
##   1. gdT_meningeal_for_integration.rds — Seurat object, meningeal γδ T cells
cat("  2. gdT_subset_clean.rds — full clean γδ T object (all sites)\n")
##   2. gdT_subset_clean.rds — full clean γδ T object (all sites)
cat("  3. gdT_meningeal_metadata.csv — metadata table\n")
##   3. gdT_meningeal_metadata.csv — metadata table
sessionInfo()
## R version 4.4.2 (2024-10-31)
## Platform: aarch64-apple-darwin20
## Running under: macOS 26.2
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: Europe/London
## tzcode source: internal
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] future_1.69.0               RColorBrewer_1.1-3         
##  [3] pheatmap_1.0.13             ggrepel_0.9.6              
##  [5] qs2_0.1.7                   Seurat_5.4.0               
##  [7] SeuratObject_5.3.0          sp_2.2-1                   
##  [9] scater_1.34.1               zellkonverter_1.16.0       
## [11] scran_1.34.0                scuttle_1.16.0             
## [13] SingleCellExperiment_1.28.1 SummarizedExperiment_1.36.0
## [15] Biobase_2.66.0              GenomicRanges_1.58.0       
## [17] GenomeInfoDb_1.42.3         IRanges_2.40.1             
## [19] S4Vectors_0.44.0            BiocGenerics_0.52.0        
## [21] MatrixGenerics_1.18.1       matrixStats_1.5.0          
## [23] viridis_0.6.5               viridisLite_0.4.3          
## [25] patchwork_1.3.2             lubridate_1.9.5            
## [27] forcats_1.0.1               stringr_1.6.0              
## [29] dplyr_1.2.0                 purrr_1.2.1                
## [31] readr_2.1.6                 tidyr_1.3.2                
## [33] tibble_3.3.1                ggplot2_4.0.2              
## [35] tidyverse_2.0.0            
## 
## loaded via a namespace (and not attached):
##   [1] RcppAnnoy_0.0.23        splines_4.4.2           later_1.4.6            
##   [4] filelock_1.0.3          polyclip_1.10-7         basilisk.utils_1.18.0  
##   [7] fastDummies_1.7.5       lifecycle_1.0.5         edgeR_4.4.2            
##  [10] vroom_1.7.0             globals_0.19.0          lattice_0.22-9         
##  [13] MASS_7.3-65             magrittr_2.0.4          limma_3.62.2           
##  [16] plotly_4.12.0           sass_0.4.10             rmarkdown_2.30         
##  [19] jquerylib_0.1.4         yaml_2.3.12             metapod_1.14.0         
##  [22] httpuv_1.6.16           otel_0.2.0              sctransform_0.4.3      
##  [25] spam_2.11-3             spatstat.sparse_3.1-0   reticulate_1.45.0      
##  [28] cowplot_1.2.0           pbapply_1.7-4           abind_1.4-8            
##  [31] zlibbioc_1.52.0         Rtsne_0.17              presto_1.0.0           
##  [34] GenomeInfoDbData_1.2.13 irlba_2.3.7             listenv_0.10.0         
##  [37] spatstat.utils_3.2-1    goftest_1.2-3           RSpectra_0.16-2        
##  [40] spatstat.random_3.4-4   dqrng_0.4.1             fitdistrplus_1.2-6     
##  [43] parallelly_1.46.1       codetools_0.2-20        DelayedArray_0.32.0    
##  [46] tidyselect_1.2.1        UCSC.utils_1.2.0        farver_2.1.2           
##  [49] ScaledMatrix_1.14.0     spatstat.explore_3.7-0  jsonlite_2.0.0         
##  [52] BiocNeighbors_2.0.1     progressr_0.18.0        ggridges_0.5.7         
##  [55] survival_3.8-6          tools_4.4.2             ica_1.0-3              
##  [58] Rcpp_1.1.1              glue_1.8.0              gridExtra_2.3          
##  [61] SparseArray_1.6.2       xfun_0.56               withr_3.0.2            
##  [64] fastmap_1.2.0           basilisk_1.18.0         bluster_1.16.0         
##  [67] digest_0.6.39           rsvd_1.0.5              timechange_0.4.0       
##  [70] R6_2.6.1                mime_0.13               scattermore_1.2        
##  [73] tensor_1.5.1            dichromat_2.0-0.1       spatstat.data_3.1-9    
##  [76] utf8_1.2.6              generics_0.1.4          data.table_1.18.2.1    
##  [79] httr_1.4.8              htmlwidgets_1.6.4       S4Arrays_1.6.0         
##  [82] uwot_0.2.4              pkgconfig_2.0.3         gtable_0.3.6           
##  [85] lmtest_0.9-40           S7_0.2.1                XVector_0.46.0         
##  [88] htmltools_0.5.9         dotCall64_1.2           scales_1.4.0           
##  [91] png_0.1-8               spatstat.univar_3.1-6   knitr_1.51             
##  [94] rstudioapi_0.18.0       tzdb_0.5.0              reshape2_1.4.5         
##  [97] nlme_3.1-168            cachem_1.1.0            zoo_1.8-15             
## [100] KernSmooth_2.23-26      parallel_4.4.2          miniUI_0.1.2           
## [103] vipor_0.4.7             ggrastr_1.0.2           pillar_1.11.1          
## [106] grid_4.4.2              vctrs_0.7.1             RANN_2.6.2             
## [109] promises_1.5.0          stringfish_0.18.0       BiocSingular_1.22.0    
## [112] beachmat_2.22.0         xtable_1.8-4            cluster_2.1.8.2        
## [115] beeswarm_0.4.0          evaluate_1.0.5          cli_3.6.5              
## [118] locfit_1.5-9.12         compiler_4.4.2          rlang_1.1.7            
## [121] crayon_1.5.3            future.apply_1.20.1     labeling_0.4.3         
## [124] plyr_1.8.9              ggbeeswarm_0.7.3        stringi_1.8.7          
## [127] deldir_2.0-4            BiocParallel_1.40.2     lazyeval_0.2.2         
## [130] spatstat.geom_3.7-0     Matrix_1.7-4            dir.expiry_1.14.0      
## [133] RcppHNSW_0.6.0          hms_1.1.4               bit64_4.6.0-1          
## [136] statmod_1.5.1           shiny_1.12.1            ROCR_1.0-12            
## [139] igraph_2.2.2            RcppParallel_5.1.11-1   bslib_0.10.0           
## [142] bit_4.6.0