Introduction

The purpose of this script is to annotate single-cell RNA-seq clusters following filtering, normalization, and clustering of the data with Seurat. This script has been adapted from the Seurat documentation (https://satijalab.org/seurat/articles/pbmc3k_tutorial#assigning-cell-type-identity-to-clusters).

Software

library(Seurat)
library(tidyverse)
library(patchwork)
library(pheatmap)
library(RColorBrewer)
library(SingleR)
library(celldex) # To install: BiocManager::install("celldex")
library(data.table)
library(knitr)
library(clusterProfiler)

Data

Input: Pre-processed Seurat object.

setwd("/scratch/alpine/edlarsen@colostate.edu/project_scrna_01/240828_scRNAseq/Cluster_Annotation")
thym_seurat <- readRDS(file = "../Normalization_and_Clustering/THYM_NormalizedAndClustered.RData")

# Differentially expressed features
## identifies all positive and negative markers of a single cluster compared to all other cells.
thym_seurat <- FindClusters(thym_seurat, resolution = 0.2)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 22654
## Number of edges: 677273
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9488
## Number of communities: 11
## Elapsed time: 4 seconds
thym_seurat <- RunUMAP(thym_seurat, dims = 1:10)
thym.markers <- FindAllMarkers(thym_seurat, only.pos=TRUE)

# umap
thym_seurat <- RunUMAP(thym_seurat, 
                           dims = 1:10,
                           n.neighbors = 50, # default is 30
                           min.dist = 0.5) # default is 0.3

DimPlot(thym_seurat,
        reduction = "umap",
                   label = TRUE,
                   label.size = 6) + 
  plot_annotation(title = "Canine Thymus, Resolution: 0.2, \nn.neighbors = 50, min.dist = 0.5", theme = theme(plot.title = element_text(hjust = 0.5, size = 20)))

Gene Set Enrichment Analysis

Comparison with gene signatures from a human single-cell thymus atlas (https://cells.ucsc.edu/?ds=fetal-thymus+all; https://www.science.org/doi/10.1126/science.aay3224) and genes upregulated at various stages of thymocyte development from the Molecular Signatures Database (https://www.gsea-msigdb.org/gsea/msigdb).

Note about some of the cell types in the Park et al. human thymus single-cell atlas:

# significant results only
sig.thym.markers <- subset(thym.markers, p_val_adj < 0.05)

# loop through each cluster and create a ranked gene list for each
for (clust in unique(sig.thym.markers$cluster)){
  name <- paste("cluster", clust, "rankedGenes", sep="")
  rankedGenes <- sig.thym.markers %>%
    dplyr::filter(cluster == clust) %>%
    dplyr::select(gene, avg_log2FC) %>%
    arrange(desc(avg_log2FC))
  rankedGeneList <- rankedGenes$avg_log2FC
  names(rankedGeneList) <- rankedGenes$gene
  assign(name, rankedGeneList)
}

# import gmt file(s)
## multiple gmt files
setwd("/scratch/alpine/edlarsen@colostate.edu/project_scrna_01/240828_scRNAseq/Cluster_Annotation/Thymus_GMTs")
gmtFiles <- list.files(pattern = "\\.gmt", full.names = TRUE) # Get list of all .gmt files in this directory
gmtTables <- lapply(gmtFiles, read.gmt) # Apply the read.gmt function to the list of .gmt files and save as a variable
gmt <- do.call(rbind, gmtTables) # Rbind files
setwd("/scratch/alpine/edlarsen@colostate.edu/project_scrna_01/240828_scRNAseq/Cluster_Annotation")

## multiple gmt files
# gmtFiles <- list.files(pattern = "\\.gmt", full.names = TRUE) # Get list of all .gmt files in this directory
# gmtTables <- lapply(gmtFiles, read.gmt) # Apply the read.gmt function to the list of .gmt files and save as a variable
# gmt <- do.call(rbind, gmtTables) # Rbind files

# loop through each cluster's ranked gene list and run GSEA for gene lists in the gmt file(s) on each
for (geneList in ls(pattern = "cluster")){
  name <- paste(geneList, "GSEA", sep="_")
  input <- get(geneList)
  gse <- GSEA(input,
              exponent = 1,
              pvalueCutoff = 1,
              pAdjustMethod = "BH",
              TERM2GENE = gmt,
              verbose = TRUE,
              by = "fgsea")
  assign(name, gse)
}

# Remove any objects with no significantly enriched gene sets from the global environment
for (GSEAresult in ls(pattern = "_GSEA")){
  obj <- get(GSEAresult)
  res <- obj@result
  if (nrow(res) == 0) {
    rm(list = ls(pattern = GSEAresult))
  }
}
# Loop through each GSEA object, export results as csv, and plot results as an enrichment dot plot
plot_list <- list() # initiate empty list for individual plots

for (GSEAresult in ls(pattern = "_GSEA")){
  name <- paste("thym", GSEAresult, sep="_")
  obj <- get(GSEAresult)
  write.csv(obj, file=paste(name, "csv", sep="."))
  
  eplot <- obj %>%
    dotplot(showCategory = 10, x = "NES") +
    scale_color_viridis_c(name = "Adjusted\nP-value",
                          option = "H") +
    scale_y_discrete(labels = ~ str_wrap(gsub('_', ' ', .x), 40)) +
    geom_vline(xintercept = 0, linetype = 2) +
    labs(title = name, y = "Gene Set") +
    theme(plot.title = element_text(hjust = 0.5))
  
  plot_list[[name]] <- eplot # add plot to list
}

# combine plots into single layout
combined_plot <- wrap_plots(plot_list, ncol = 3)
print(combined_plot)

Differential Expression Analysis

# find markers distinguishing clusters
## cluster 0 vs 1
Cluster0_vs_Cluster1 <- FindMarkers(thym_seurat, ident.1 = 0, ident.2 = 1)
write.csv(Cluster0_vs_Cluster1, file="thym_cluster0vscluster1_DEG.csv")
Cluster0_vs_Cluster1 %>%
  dplyr::arrange(avg_log2FC)

## cluster 1 vs 0
Cluster1_vs_Cluster0 <- FindMarkers(thym_seurat, ident.1 = 1, ident.2 = 0)
write.csv(Cluster1_vs_Cluster0, file="thym_cluster1vscluster0_DEG.csv")
Cluster1_vs_Cluster0 %>%
  dplyr::arrange(avg_log2FC)

## cluster 4 vs 8
Cluster4_vs_Cluster8 <- FindMarkers(thym_seurat, ident.1 = 4, ident.2 = 8)
write.csv(Cluster4_vs_Cluster8, file="thym_cluster4vscluster8_DEG.csv")
Cluster4_vs_Cluster8 %>%
  dplyr::arrange(avg_log2FC)

## cluster 5 vs 6
Cluster5_vs_Cluster6 <- FindMarkers(thym_seurat, ident.1 = 4, ident.2 = 8)
write.csv(Cluster5_vs_Cluster6, file="thym_cluster5vscluster6_DEG.csv")
Cluster5_vs_Cluster6 %>%
  dplyr::arrange(avg_log2FC)
kable(head(Cluster0_vs_Cluster1, n=20), caption = "Cluster 0 vs Cluster 1, Canine Thymus, Resolution: 0.2")
Cluster 0 vs Cluster 1, Canine Thymus, Resolution: 0.2
p_val avg_log2FC pct.1 pct.2 p_val_adj
RPL13A 0 2.845589 0.927 0.239 0
RPL14 0 2.466876 0.916 0.235 0
CFL1 0 2.517685 0.822 0.151 0
RPL27 0 2.369931 0.910 0.251 0
MYO7A 0 2.863546 0.946 0.292 0
RPL36AL 0 2.409323 0.794 0.141 0
SERF2 0 2.232048 0.900 0.247 0
EEF1A1 0 2.527629 0.914 0.262 0
RPLP0 0 2.340771 0.857 0.206 0
RPL34 0 2.358186 0.870 0.222 0
RPL7 0 2.375933 0.908 0.266 0
ENSCAFG00000037735 0 2.469168 0.909 0.270 0
RPS3 0 2.327889 0.875 0.236 0
ENSCAFG00000043730 0 2.569764 0.761 0.123 0
RPS5 0 2.294508 0.925 0.288 0
ENSCAFG00000006102 0 2.288204 0.900 0.264 0
ENSCAFG00000016065 0 2.531191 0.823 0.187 0
RPL5 0 2.209198 0.919 0.284 0
EIF3K 0 2.194524 0.807 0.173 0
FTL 0 2.521577 0.750 0.117 0
kable(head(Cluster1_vs_Cluster0, n=20), caption = "Cluster 1 vs Cluster 0, Canine Thymus, Resolution: 0.2")
Cluster 1 vs Cluster 0, Canine Thymus, Resolution: 0.2
p_val avg_log2FC pct.1 pct.2 p_val_adj
RPL13A 0 -2.845589 0.239 0.927 0
RPL14 0 -2.466876 0.235 0.916 0
CFL1 0 -2.517685 0.151 0.822 0
RPL27 0 -2.369931 0.251 0.910 0
MYO7A 0 -2.863546 0.292 0.946 0
RPL36AL 0 -2.409323 0.141 0.794 0
SERF2 0 -2.232048 0.247 0.900 0
EEF1A1 0 -2.527629 0.262 0.914 0
RPLP0 0 -2.340771 0.206 0.857 0
RPL34 0 -2.358186 0.222 0.870 0
RPL7 0 -2.375933 0.266 0.908 0
ENSCAFG00000037735 0 -2.469168 0.270 0.909 0
RPS3 0 -2.327889 0.236 0.875 0
ENSCAFG00000043730 0 -2.569764 0.123 0.761 0
RPS5 0 -2.294508 0.288 0.925 0
ENSCAFG00000006102 0 -2.288204 0.264 0.900 0
ENSCAFG00000016065 0 -2.531191 0.187 0.823 0
RPL5 0 -2.209198 0.284 0.919 0
EIF3K 0 -2.194524 0.173 0.807 0
FTL 0 -2.521577 0.117 0.750 0
kable(head(Cluster4_vs_Cluster8, n=20), caption = "Cluster 4 vs Cluster 8, Canine Thymus, Resolution: 0.2")
Cluster 4 vs Cluster 8, Canine Thymus, Resolution: 0.2
p_val avg_log2FC pct.1 pct.2 p_val_adj
PTMA 0 2.542254 0.999 0.955 0
ENSCAFG00000032392 0 3.463476 0.997 0.559 0
STMN1 0 2.530064 0.999 0.795 0
H2AZ1 0 3.015989 0.998 0.706 0
CFL1 0 3.148784 0.995 0.444 0
TPT1 0 2.611699 1.000 0.840 0
RPS8 0 2.895787 0.999 0.722 0
RPL8 0 2.789826 0.999 0.685 0
ENSCAFG00000009523 0 2.625405 0.998 0.772 0
DHDDS 0 2.910188 0.998 0.785 0
RPS4X 0 2.781179 1.000 0.759 0
ENSCAFG00000045333 0 3.186249 0.995 0.525 0
RPS6 0 3.015724 0.998 0.677 0
H3-3A 0 2.432530 0.997 0.625 0
RPS7 0 2.570644 1.000 0.730 0
ENSCAFG00000019044 0 2.980221 0.999 0.622 0
RPS25 0 2.951160 0.996 0.609 0
RPL11 0 2.643341 0.999 0.711 0
ENSCAFG00000049043 0 2.754056 0.998 0.682 0
RPL13 0 2.820418 0.998 0.577 0
kable(head(Cluster5_vs_Cluster6, n=20), caption = "Cluster 5 vs Cluster 6, Canine Thymus, Resolution: 0.2")
Cluster 5 vs Cluster 6, Canine Thymus, Resolution: 0.2
p_val avg_log2FC pct.1 pct.2 p_val_adj
PTMA 0 2.542254 0.999 0.955 0
ENSCAFG00000032392 0 3.463476 0.997 0.559 0
STMN1 0 2.530064 0.999 0.795 0
H2AZ1 0 3.015989 0.998 0.706 0
CFL1 0 3.148784 0.995 0.444 0
TPT1 0 2.611699 1.000 0.840 0
RPS8 0 2.895787 0.999 0.722 0
RPL8 0 2.789826 0.999 0.685 0
ENSCAFG00000009523 0 2.625405 0.998 0.772 0
DHDDS 0 2.910188 0.998 0.785 0
RPS4X 0 2.781179 1.000 0.759 0
ENSCAFG00000045333 0 3.186249 0.995 0.525 0
RPS6 0 3.015724 0.998 0.677 0
H3-3A 0 2.432530 0.997 0.625 0
RPS7 0 2.570644 1.000 0.730 0
ENSCAFG00000019044 0 2.980221 0.999 0.622 0
RPS25 0 2.951160 0.996 0.609 0
RPL11 0 2.643341 0.999 0.711 0
ENSCAFG00000049043 0 2.754056 0.998 0.682 0
RPL13 0 2.820418 0.998 0.577 0

Citations

sessionInfo()
## R version 4.4.1 (2024-06-14)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 22.04.4 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so;  LAPACK version 3.10.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: Etc/UTC
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] clusterProfiler_4.14.4      knitr_1.49                 
##  [3] data.table_1.16.4           celldex_1.16.0             
##  [5] SingleR_2.8.0               SummarizedExperiment_1.36.0
##  [7] Biobase_2.66.0              GenomicRanges_1.58.0       
##  [9] GenomeInfoDb_1.42.1         IRanges_2.40.1             
## [11] S4Vectors_0.44.0            BiocGenerics_0.52.0        
## [13] MatrixGenerics_1.18.1       matrixStats_1.5.0          
## [15] RColorBrewer_1.1-3          pheatmap_1.0.12            
## [17] patchwork_1.3.0             lubridate_1.9.4            
## [19] forcats_1.0.0               stringr_1.5.1              
## [21] dplyr_1.1.4                 purrr_1.0.2                
## [23] readr_2.1.5                 tidyr_1.3.1                
## [25] tibble_3.2.1                ggplot2_3.5.1              
## [27] tidyverse_2.0.0             Seurat_5.2.1               
## [29] SeuratObject_5.0.2          sp_2.1-4                   
## 
## loaded via a namespace (and not attached):
##   [1] fs_1.6.5                  spatstat.sparse_3.1-0    
##   [3] enrichplot_1.26.6         httr_1.4.7               
##   [5] tools_4.4.1               sctransform_0.4.1        
##   [7] alabaster.base_1.6.1      R6_2.5.1                 
##   [9] HDF5Array_1.34.0          lazyeval_0.2.2           
##  [11] uwot_0.2.2                rhdf5filters_1.18.0      
##  [13] withr_3.0.2               gridExtra_2.3            
##  [15] progressr_0.15.1          cli_3.6.3                
##  [17] spatstat.explore_3.3-3    fastDummies_1.7.5        
##  [19] labeling_0.4.3            alabaster.se_1.6.0       
##  [21] sass_0.4.9                spatstat.data_3.1-4      
##  [23] ggridges_0.5.6            pbapply_1.7-2            
##  [25] yulab.utils_0.1.9         gson_0.1.0               
##  [27] DOSE_4.0.0                R.utils_2.12.3           
##  [29] parallelly_1.40.1         limma_3.62.2             
##  [31] rstudioapi_0.17.1         RSQLite_2.3.9            
##  [33] gridGraphics_0.5-1        generics_0.1.3           
##  [35] ica_1.0-3                 spatstat.random_3.3-2    
##  [37] GO.db_3.20.0              Matrix_1.7-0             
##  [39] abind_1.4-8               R.methodsS3_1.8.2        
##  [41] lifecycle_1.0.4           yaml_2.3.10              
##  [43] qvalue_2.38.0             rhdf5_2.50.2             
##  [45] SparseArray_1.6.1         BiocFileCache_2.14.0     
##  [47] Rtsne_0.17                grid_4.4.1               
##  [49] blob_1.2.4                promises_1.3.2           
##  [51] ExperimentHub_2.14.0      crayon_1.5.3             
##  [53] ggtangle_0.0.6            miniUI_0.1.1.1           
##  [55] lattice_0.22-6            beachmat_2.22.0          
##  [57] cowplot_1.1.3             KEGGREST_1.46.0          
##  [59] pillar_1.10.1             fgsea_1.32.2             
##  [61] future.apply_1.11.3       codetools_0.2-20         
##  [63] fastmatch_1.1-6           glue_1.8.0               
##  [65] ggfun_0.1.8               spatstat.univar_3.1-1    
##  [67] treeio_1.30.0             vctrs_0.6.5              
##  [69] png_0.1-8                 gypsum_1.2.0             
##  [71] spam_2.11-1               gtable_0.3.6             
##  [73] cachem_1.1.0              xfun_0.49                
##  [75] S4Arrays_1.6.0            mime_0.12                
##  [77] survival_3.6-4            statmod_1.5.0            
##  [79] fitdistrplus_1.2-2        ROCR_1.0-11              
##  [81] nlme_3.1-164              ggtree_3.14.0            
##  [83] bit64_4.6.0-1             alabaster.ranges_1.6.0   
##  [85] filelock_1.0.3            RcppAnnoy_0.0.22         
##  [87] bslib_0.8.0               irlba_2.3.5.1            
##  [89] KernSmooth_2.23-24        colorspace_2.1-1         
##  [91] DBI_1.2.3                 tidyselect_1.2.1         
##  [93] bit_4.5.0.1               compiler_4.4.1           
##  [95] curl_6.1.0                httr2_1.1.0              
##  [97] BiocNeighbors_2.0.1       DelayedArray_0.32.0      
##  [99] plotly_4.10.4             scales_1.3.0             
## [101] lmtest_0.9-40             rappdirs_0.3.3           
## [103] digest_0.6.37             goftest_1.2-3            
## [105] presto_1.0.0              spatstat.utils_3.1-1     
## [107] alabaster.matrix_1.6.1    rmarkdown_2.29           
## [109] XVector_0.46.0            htmltools_0.5.8.1        
## [111] pkgconfig_2.0.3           sparseMatrixStats_1.18.0 
## [113] dbplyr_2.5.0              fastmap_1.2.0            
## [115] rlang_1.1.5               htmlwidgets_1.6.4        
## [117] UCSC.utils_1.2.0          shiny_1.10.0             
## [119] DelayedMatrixStats_1.28.1 farver_2.1.2             
## [121] jquerylib_0.1.4           zoo_1.8-12               
## [123] jsonlite_1.8.9            BiocParallel_1.40.0      
## [125] GOSemSim_2.32.0           R.oo_1.27.0              
## [127] BiocSingular_1.22.0       magrittr_2.0.3           
## [129] ggplotify_0.1.2           GenomeInfoDbData_1.2.13  
## [131] dotCall64_1.2             Rhdf5lib_1.28.0          
## [133] munsell_0.5.1             Rcpp_1.0.13-1            
## [135] ape_5.8-1                 reticulate_1.40.0        
## [137] stringi_1.8.4             alabaster.schemas_1.6.0  
## [139] zlibbioc_1.52.0           MASS_7.3-60.2            
## [141] AnnotationHub_3.14.0      plyr_1.8.9               
## [143] parallel_4.4.1            listenv_0.9.1            
## [145] ggrepel_0.9.6             deldir_2.0-4             
## [147] Biostrings_2.74.1         splines_4.4.1            
## [149] tensor_1.5                hms_1.1.3                
## [151] igraph_2.1.2              spatstat.geom_3.3-4      
## [153] RcppHNSW_0.6.0            reshape2_1.4.4           
## [155] ScaledMatrix_1.14.0       BiocVersion_3.20.0       
## [157] evaluate_1.0.3            BiocManager_1.30.25      
## [159] tzdb_0.4.0                httpuv_1.6.15            
## [161] RANN_2.6.2                polyclip_1.10-7          
## [163] future_1.34.0             scattermore_1.2          
## [165] rsvd_1.0.5                xtable_1.8-4             
## [167] tidytree_0.4.6            RSpectra_0.16-2          
## [169] later_1.4.1               viridisLite_0.4.2        
## [171] aplot_0.2.4               memoise_2.0.1            
## [173] AnnotationDbi_1.68.0      cluster_2.1.6            
## [175] timechange_0.3.0          globals_0.16.3
citation()
## To cite R in publications use:
## 
##   R Core Team (2024). _R: A Language and Environment for Statistical
##   Computing_. R Foundation for Statistical Computing, Vienna, Austria.
##   <https://www.R-project.org/>.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {R: A Language and Environment for Statistical Computing},
##     author = {{R Core Team}},
##     organization = {R Foundation for Statistical Computing},
##     address = {Vienna, Austria},
##     year = {2024},
##     url = {https://www.R-project.org/},
##   }
## 
## We have invested a lot of time and effort in creating R, please cite it
## when using it for data analysis. See also 'citation("pkgname")' for
## citing R packages.