Introduction

This script performs GSEA using gmt files of genes upregulated by each cluster of a human thymus single-cell atlas (source: https://www.science.org/doi/10.1126/science.aay3224) and various stages of thymocyte development from the Molecular Signatures Database (https://www.gsea-msigdb.org/gsea/msigdb).

Note about some of the cell types in the Park et al. human thymus single-cell atlas:

T agonist = a population that shared expression modules with differentiating Treg cells, but not with terminally differentiated Treg cells. This population was defined by expression of a noncoding RNA, MIR155HG.
MEMP = megakaryocyte/erythrocyte/mast cell progenitor
NMP = neutrophil-myeloid progenitor * Some cell types were annotated on multiple UMAPs (i.e., a UMAP of all thymus cells together vs a UMAP of just T-cell populations). These are distinguished by a Roman numeral (e.g., Early Thymic Progenitor and Early Thymic Progenitor II). These numerals do not imply any biologic difference, such as later stages of differentiation.

Software

library(Seurat)
library(tidyverse)
library(patchwork)
library(pheatmap)
library(RColorBrewer)
library(SingleR)
library(celldex) # To install: BiocManager::install("celldex")
library(data.table)
library(knitr)
library(clusterProfiler)

Data

Input: Pre-processed Seurat object of integrated thymocyte and T-cell subsets from canine thymus and lymph node.

setwd("C:/Users/edlarsen/Documents/240828_scRNAseq/T_Cells")
all_Tcells <- readRDS(file = "integrated_Tcells.Rdata")
all_Tcells <- FindNeighbors(all_Tcells, dims = 1:10)
all_Tcells <- FindClusters(all_Tcells, resolution = 0.2)
all_Tcells <- RunUMAP(all_Tcells, dims = 1:10)

p1 <- DimPlot(all_Tcells, 
        reduction = "umap",
        label = TRUE,
        label.size = 3,
        label.box = TRUE) + plot_annotation(title = "Merged Canine T-cells from Lymph Node and Thymus")

p2 <- DimPlot(all_Tcells, 
        reduction = "umap",
        group.by = "OriginalClusters",
        label = TRUE,
        label.size = 3,
        label.box = TRUE) + plot_annotation(title = "Merged Canine T-cells from Lymph Node and Thymus,\nOriginal Cluster IDs")

p1+p2

Gene Set Enrichment Analysis

# significant results only
t.cell.markers <- FindAllMarkers(all_Tcells, only.pos=TRUE)
sig.markers <- subset(t.cell.markers, p_val_adj < 0.05)

# loop through each cluster and create a ranked gene list for each
for (clust in unique(sig.markers$cluster)){
  name <- paste("cluster", clust, "rankedGenes", sep="")
  rankedGenes <- sig.markers %>%
    dplyr::filter(cluster == clust) %>%
    dplyr::select(gene, avg_log2FC) %>%
    arrange(desc(avg_log2FC))
  rankedGeneList <- rankedGenes$avg_log2FC
  names(rankedGeneList) <- rankedGenes$gene
  assign(name, rankedGeneList)
}

# import gmt file(s)
## multiple gmt files
setwd("C:/Users/edlarsen/Documents/240828_scRNAseq/Cluster_Annotation/T_cell_GMTs")
gmtFiles <- list.files(pattern = "\\.gmt", full.names = TRUE) # Get list of all .gmt files in this directory
gmtTables <- lapply(gmtFiles, read.gmt) # Apply the read.gmt function to the list of .gmt files and save as a variable
gmt <- do.call(rbind, gmtTables) # Rbind files
setwd("C:/Users/edlarsen/Documents/240828_scRNAseq/T_Cells")

# loop through each cluster's ranked gene list and run GSEA for gene lists in the gmt file(s) on each
for (geneList in ls(pattern = "cluster")){
  name <- paste(geneList, "GSEA", sep="_")
  input <- get(geneList)
  gse <- GSEA(input,
              exponent = 1,
              pvalueCutoff = 1,
              pAdjustMethod = "BH",
              TERM2GENE = gmt,
              verbose = TRUE,
              by = "fgsea")
  assign(name, gse)
}

# Remove any objects with no significantly enriched gene sets from the global environment
for (GSEAresult in ls(pattern = "_GSEA")){
  obj <- get(GSEAresult)
  res <- obj@result
  if (nrow(res) == 0) {
    rm(list = ls(pattern = GSEAresult))
  }
}

# Loop through each GSEA object, export results as csv, and plot results as an enrichment dot plot
plot_list <- list() # initiate empty list for individual plots

for (GSEAresult in ls(pattern = "_GSEA")){
  name <- paste(GSEAresult)
  obj <- get(GSEAresult)
  write.csv(obj, file=paste(name, "TCellGeneSets", "csv", sep="."))
  
  eplot <- obj %>%
    dotplot(showCategory = 10, x = "NES") +
    scale_color_viridis_c(name = "Adjusted\nP-value",
                          option = "H") +
    scale_y_discrete(labels = ~ str_wrap(gsub('_', ' ', .x), 40)) +
    geom_vline(xintercept = 0, linetype = 2) +
    labs(title = name, y = "Gene Set") +
    theme(plot.title = element_text(hjust = 0.5))
  
  plot_list[[name]] <- eplot # add plot to list
}

# combine plots into single layout
combined_plot <- wrap_plots(plot_list, ncol = 3)
print(combined_plot)

Assigning cell type identity to clusters

To be done once confident cluster assignment has been achieved.

new.cluster.ids <- c("Naive_T_2", "DP_Thymocytes_1", "Activated_T", "DP_Thymocytes_2", "SP_Thymocytes", "Naive_T_1", "Proliferating_DP_Thymocytes", "CD8_NK", "ETP_DN_ISP_Thymocytes", "Late_SP_Thymocytes_1", "Late_SP_Thymocytes_2")
names(new.cluster.ids) <- levels(all_Tcells)
all_Tcells <- RenameIdents(all_Tcells, new.cluster.ids)

DimPlot(all_Tcells, reduction = "umap", label = TRUE, label.size = 3, label.box = TRUE, pt.size = 0.5) + ggtitle("Canine Merged Lymph Node and Thymus T Cells, Resolution: 0.2")

# save as new column in metadata in addition to reassigning idents
all_Tcells$IntegratedClusterIDs <- Idents(all_Tcells)

# Export
saveRDS(all_Tcells, file = "IntegThymAndLN_Annotated.Rdata")

Citations

sessionInfo()

## R version 4.4.0 (2024-04-24 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 22631)
## 
## Matrix products: default
## 
## 
## locale:
## [1] LC_COLLATE=English_United States.utf8 
## [2] LC_CTYPE=English_United States.utf8   
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## 
## time zone: America/Denver
## tzcode source: internal
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] clusterProfiler_4.12.6      knitr_1.49                 
##  [3] data.table_1.16.4           celldex_1.14.0             
##  [5] SingleR_2.6.0               SummarizedExperiment_1.34.0
##  [7] Biobase_2.64.0              GenomicRanges_1.56.2       
##  [9] GenomeInfoDb_1.40.1         IRanges_2.38.1             
## [11] S4Vectors_0.42.1            BiocGenerics_0.50.0        
## [13] MatrixGenerics_1.16.0       matrixStats_1.5.0          
## [15] RColorBrewer_1.1-3          pheatmap_1.0.12            
## [17] patchwork_1.3.0             lubridate_1.9.4            
## [19] forcats_1.0.0               stringr_1.5.1              
## [21] dplyr_1.1.4                 purrr_1.0.2                
## [23] readr_2.1.5                 tidyr_1.3.1                
## [25] tibble_3.2.1                ggplot2_3.5.1              
## [27] tidyverse_2.0.0             Seurat_5.2.1               
## [29] SeuratObject_5.0.2          sp_2.1-4                   
## 
## loaded via a namespace (and not attached):
##   [1] fs_1.6.5                  spatstat.sparse_3.1-0    
##   [3] enrichplot_1.24.4         httr_1.4.7               
##   [5] tools_4.4.0               sctransform_0.4.1        
##   [7] alabaster.base_1.4.2      R6_2.5.1                 
##   [9] HDF5Array_1.32.1          lazyeval_0.2.2           
##  [11] uwot_0.2.2                rhdf5filters_1.16.0      
##  [13] withr_3.0.2               gridExtra_2.3            
##  [15] progressr_0.15.1          cli_3.6.2                
##  [17] spatstat.explore_3.3-4    fastDummies_1.7.5        
##  [19] scatterpie_0.2.4          labeling_0.4.3           
##  [21] alabaster.se_1.4.1        sass_0.4.9               
##  [23] spatstat.data_3.1-4       ggridges_0.5.6           
##  [25] pbapply_1.7-2             yulab.utils_0.2.0        
##  [27] gson_0.1.0                DOSE_3.30.5              
##  [29] R.utils_2.12.3            parallelly_1.42.0        
##  [31] limma_3.60.6              rstudioapi_0.17.1        
##  [33] RSQLite_2.3.9             gridGraphics_0.5-1       
##  [35] generics_0.1.3            ica_1.0-3                
##  [37] spatstat.random_3.3-2     GO.db_3.19.1             
##  [39] Matrix_1.7-0              abind_1.4-8              
##  [41] R.methodsS3_1.8.2         lifecycle_1.0.4          
##  [43] yaml_2.3.10               qvalue_2.36.0            
##  [45] rhdf5_2.48.0              SparseArray_1.4.8        
##  [47] BiocFileCache_2.12.0      Rtsne_0.17               
##  [49] grid_4.4.0                blob_1.2.4               
##  [51] promises_1.3.2            ExperimentHub_2.12.0     
##  [53] crayon_1.5.3              miniUI_0.1.1.1           
##  [55] lattice_0.22-6            beachmat_2.20.0          
##  [57] cowplot_1.1.3             KEGGREST_1.44.1          
##  [59] pillar_1.10.1             fgsea_1.30.0             
##  [61] future.apply_1.11.3       codetools_0.2-20         
##  [63] fastmatch_1.1-6           glue_1.8.0               
##  [65] ggfun_0.1.8               spatstat.univar_3.1-1    
##  [67] treeio_1.28.0             vctrs_0.6.5              
##  [69] png_0.1-8                 gypsum_1.0.1             
##  [71] spam_2.11-1               gtable_0.3.6             
##  [73] cachem_1.1.0              xfun_0.49                
##  [75] S4Arrays_1.4.1            mime_0.12                
##  [77] tidygraph_1.3.1           survival_3.5-8           
##  [79] statmod_1.5.0             fitdistrplus_1.2-2       
##  [81] ROCR_1.0-11               nlme_3.1-164             
##  [83] ggtree_3.12.0             bit64_4.6.0-1            
##  [85] alabaster.ranges_1.4.2    filelock_1.0.3           
##  [87] RcppAnnoy_0.0.22          bslib_0.8.0              
##  [89] irlba_2.3.5.1             KernSmooth_2.23-22       
##  [91] colorspace_2.1-1          DBI_1.2.3                
##  [93] tidyselect_1.2.1          bit_4.5.0.1              
##  [95] compiler_4.4.0            curl_6.2.0               
##  [97] httr2_1.1.0               DelayedArray_0.30.1      
##  [99] plotly_4.10.4             shadowtext_0.1.4         
## [101] scales_1.3.0              lmtest_0.9-40            
## [103] rappdirs_0.3.3            digest_0.6.35            
## [105] goftest_1.2-3             presto_1.0.0             
## [107] spatstat.utils_3.1-2      alabaster.matrix_1.4.2   
## [109] rmarkdown_2.29            XVector_0.44.0           
## [111] htmltools_0.5.8.1         pkgconfig_2.0.3          
## [113] sparseMatrixStats_1.16.0  dbplyr_2.5.0             
## [115] fastmap_1.2.0             rlang_1.1.3              
## [117] htmlwidgets_1.6.4         UCSC.utils_1.0.0         
## [119] shiny_1.10.0              DelayedMatrixStats_1.26.0
## [121] farver_2.1.2              jquerylib_0.1.4          
## [123] zoo_1.8-12                jsonlite_1.8.9           
## [125] BiocParallel_1.38.0       GOSemSim_2.30.2          
## [127] R.oo_1.27.0               BiocSingular_1.20.0      
## [129] magrittr_2.0.3            ggplotify_0.1.2          
## [131] GenomeInfoDbData_1.2.12   dotCall64_1.2            
## [133] Rhdf5lib_1.26.0           munsell_0.5.1            
## [135] Rcpp_1.0.14               ape_5.8-1                
## [137] viridis_0.6.5             reticulate_1.40.0        
## [139] stringi_1.8.4             alabaster.schemas_1.4.0  
## [141] ggraph_2.2.1              zlibbioc_1.50.0          
## [143] MASS_7.3-60.2             AnnotationHub_3.12.0     
## [145] plyr_1.8.9                parallel_4.4.0           
## [147] listenv_0.9.1             ggrepel_0.9.6            
## [149] deldir_2.0-4              graphlayouts_1.2.2       
## [151] Biostrings_2.72.1         splines_4.4.0            
## [153] tensor_1.5                hms_1.1.3                
## [155] igraph_2.1.4              spatstat.geom_3.3-5      
## [157] RcppHNSW_0.6.0            reshape2_1.4.4           
## [159] ScaledMatrix_1.12.0       BiocVersion_3.19.1       
## [161] evaluate_1.0.3            BiocManager_1.30.25      
## [163] tweenr_2.0.3              tzdb_0.4.0               
## [165] httpuv_1.6.15             RANN_2.6.2               
## [167] polyclip_1.10-7           future_1.34.0            
## [169] scattermore_1.2           ggforce_0.4.2            
## [171] rsvd_1.0.5                xtable_1.8-4             
## [173] tidytree_0.4.6            RSpectra_0.16-2          
## [175] later_1.4.1               viridisLite_0.4.2        
## [177] snow_0.4-4                aplot_0.2.4              
## [179] memoise_2.0.1             AnnotationDbi_1.66.0     
## [181] cluster_2.1.6             timechange_0.3.0         
## [183] globals_0.16.3

citation()

## To cite R in publications use:
## 
##   R Core Team (2024). _R: A Language and Environment for Statistical
##   Computing_. R Foundation for Statistical Computing, Vienna, Austria.
##   <https://www.R-project.org/>.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {R: A Language and Environment for Statistical Computing},
##     author = {{R Core Team}},
##     organization = {R Foundation for Statistical Computing},
##     address = {Vienna, Austria},
##     year = {2024},
##     url = {https://www.R-project.org/},
##   }
## 
## We have invested a lot of time and effort in creating R, please cite it
## when using it for data analysis. See also 'citation("pkgname")' for
## citing R packages.

Merging T Cell Clusters Part 5: GSEA for Human & Murine Naive and Activated T cell Gene Sets from the Molecular Signatures Database and Gene Signatures Derived from the Conde et al. Human Immune Single-cell Atlas and the Ammons et al. Canine Circulating Leukocyte Single-Cell Atlas

Eileen Owens

2025-02-01