The purpose of this script is to annotate single-cell RNA-seq clusters following filtering, normalization, and clustering of the data with Seurat. This script has been adapted from the Seurat documentation (https://satijalab.org/seurat/articles/pbmc3k_tutorial#assigning-cell-type-identity-to-clusters).
library(Seurat)
library(tidyverse)
library(patchwork)
library(pheatmap)
library(RColorBrewer)
library(SingleR)
library(celldex) # To install: BiocManager::install("celldex")
library(data.table)
library(knitr)
library(clusterProfiler)
Input: Pre-processed Seurat object.
setwd("/scratch/alpine/edlarsen@colostate.edu/project_scrna_01/240828_scRNAseq/Cluster_Annotation")
thym_seurat <- readRDS(file = "../Normalization_and_Clustering/THYM_NormalizedAndClustered.RData")
# Differentially expressed features
## identifies all positive and negative markers of a single cluster compared to all other cells.
thym_seurat <- FindClusters(thym_seurat, resolution = 0.2)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 22654
## Number of edges: 677273
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9488
## Number of communities: 11
## Elapsed time: 4 seconds
thym_seurat <- RunUMAP(thym_seurat, dims = 1:10)
thym.markers <- FindAllMarkers(thym_seurat, only.pos=TRUE)
# umap
thym_seurat <- RunUMAP(thym_seurat,
dims = 1:10,
n.neighbors = 50, # default is 30
min.dist = 0.5) # default is 0.3
DimPlot(thym_seurat,
reduction = "umap",
label = TRUE,
label.size = 6) +
plot_annotation(title = "Canine Thymus, Resolution: 0.2, \nn.neighbors = 50, min.dist = 0.5", theme = theme(plot.title = element_text(hjust = 0.5, size = 20)))
Comparison with gene signatures from a human single-cell thymus atlas (https://cells.ucsc.edu/?ds=fetal-thymus+all; https://www.science.org/doi/10.1126/science.aay3224) and genes upregulated at various stages of thymocyte development from the Molecular Signatures Database (https://www.gsea-msigdb.org/gsea/msigdb).
Note about some of the cell types in the Park et al. human thymus single-cell atlas:
# significant results only
sig.thym.markers <- subset(thym.markers, p_val_adj < 0.05)
# loop through each cluster and create a ranked gene list for each
for (clust in unique(sig.thym.markers$cluster)){
name <- paste("cluster", clust, "rankedGenes", sep="")
rankedGenes <- sig.thym.markers %>%
dplyr::filter(cluster == clust) %>%
dplyr::select(gene, avg_log2FC) %>%
arrange(desc(avg_log2FC))
rankedGeneList <- rankedGenes$avg_log2FC
names(rankedGeneList) <- rankedGenes$gene
assign(name, rankedGeneList)
}
# import gmt file(s)
## multiple gmt files
setwd("/scratch/alpine/edlarsen@colostate.edu/project_scrna_01/240828_scRNAseq/Cluster_Annotation/Thymus_GMTs")
gmtFiles <- list.files(pattern = "\\.gmt", full.names = TRUE) # Get list of all .gmt files in this directory
gmtTables <- lapply(gmtFiles, read.gmt) # Apply the read.gmt function to the list of .gmt files and save as a variable
gmt <- do.call(rbind, gmtTables) # Rbind files
setwd("/scratch/alpine/edlarsen@colostate.edu/project_scrna_01/240828_scRNAseq/Cluster_Annotation")
## multiple gmt files
# gmtFiles <- list.files(pattern = "\\.gmt", full.names = TRUE) # Get list of all .gmt files in this directory
# gmtTables <- lapply(gmtFiles, read.gmt) # Apply the read.gmt function to the list of .gmt files and save as a variable
# gmt <- do.call(rbind, gmtTables) # Rbind files
# loop through each cluster's ranked gene list and run GSEA for gene lists in the gmt file(s) on each
for (geneList in ls(pattern = "cluster")){
name <- paste(geneList, "GSEA", sep="_")
input <- get(geneList)
gse <- GSEA(input,
exponent = 1,
pvalueCutoff = 1,
pAdjustMethod = "BH",
TERM2GENE = gmt,
verbose = TRUE,
by = "fgsea")
assign(name, gse)
}
# Remove any objects with no significantly enriched gene sets from the global environment
for (GSEAresult in ls(pattern = "_GSEA")){
obj <- get(GSEAresult)
res <- obj@result
if (nrow(res) == 0) {
rm(list = ls(pattern = GSEAresult))
}
}
# Loop through each GSEA object, export results as csv, and plot results as an enrichment dot plot
plot_list <- list() # initiate empty list for individual plots
for (GSEAresult in ls(pattern = "_GSEA")){
name <- paste("thym", GSEAresult, sep="_")
obj <- get(GSEAresult)
write.csv(obj, file=paste(name, "csv", sep="."))
eplot <- obj %>%
dotplot(showCategory = 10, x = "NES") +
scale_color_viridis_c(name = "Adjusted\nP-value",
option = "H") +
scale_y_discrete(labels = ~ str_wrap(gsub('_', ' ', .x), 40)) +
geom_vline(xintercept = 0, linetype = 2) +
labs(title = name, y = "Gene Set") +
theme(plot.title = element_text(hjust = 0.5))
plot_list[[name]] <- eplot # add plot to list
}
# combine plots into single layout
combined_plot <- wrap_plots(plot_list, ncol = 3)
print(combined_plot)
# find markers distinguishing clusters
## cluster 0 vs 1
Cluster0_vs_Cluster1 <- FindMarkers(thym_seurat, ident.1 = 0, ident.2 = 1)
write.csv(Cluster0_vs_Cluster1, file="thym_cluster0vscluster1_DEG.csv")
Cluster0_vs_Cluster1 %>%
dplyr::arrange(avg_log2FC)
## cluster 1 vs 0
Cluster1_vs_Cluster0 <- FindMarkers(thym_seurat, ident.1 = 1, ident.2 = 0)
write.csv(Cluster1_vs_Cluster0, file="thym_cluster1vscluster0_DEG.csv")
Cluster1_vs_Cluster0 %>%
dplyr::arrange(avg_log2FC)
## cluster 4 vs 8
Cluster4_vs_Cluster8 <- FindMarkers(thym_seurat, ident.1 = 4, ident.2 = 8)
write.csv(Cluster4_vs_Cluster8, file="thym_cluster4vscluster8_DEG.csv")
Cluster4_vs_Cluster8 %>%
dplyr::arrange(avg_log2FC)
## cluster 5 vs 6
Cluster5_vs_Cluster6 <- FindMarkers(thym_seurat, ident.1 = 4, ident.2 = 8)
write.csv(Cluster5_vs_Cluster6, file="thym_cluster5vscluster6_DEG.csv")
Cluster5_vs_Cluster6 %>%
dplyr::arrange(avg_log2FC)
kable(head(Cluster0_vs_Cluster1, n=20), caption = "Cluster 0 vs Cluster 1, Canine Thymus, Resolution: 0.2")
p_val | avg_log2FC | pct.1 | pct.2 | p_val_adj | |
---|---|---|---|---|---|
RPL13A | 0 | 2.845589 | 0.927 | 0.239 | 0 |
RPL14 | 0 | 2.466876 | 0.916 | 0.235 | 0 |
CFL1 | 0 | 2.517685 | 0.822 | 0.151 | 0 |
RPL27 | 0 | 2.369931 | 0.910 | 0.251 | 0 |
MYO7A | 0 | 2.863546 | 0.946 | 0.292 | 0 |
RPL36AL | 0 | 2.409323 | 0.794 | 0.141 | 0 |
SERF2 | 0 | 2.232048 | 0.900 | 0.247 | 0 |
EEF1A1 | 0 | 2.527629 | 0.914 | 0.262 | 0 |
RPLP0 | 0 | 2.340771 | 0.857 | 0.206 | 0 |
RPL34 | 0 | 2.358186 | 0.870 | 0.222 | 0 |
RPL7 | 0 | 2.375933 | 0.908 | 0.266 | 0 |
ENSCAFG00000037735 | 0 | 2.469168 | 0.909 | 0.270 | 0 |
RPS3 | 0 | 2.327889 | 0.875 | 0.236 | 0 |
ENSCAFG00000043730 | 0 | 2.569764 | 0.761 | 0.123 | 0 |
RPS5 | 0 | 2.294508 | 0.925 | 0.288 | 0 |
ENSCAFG00000006102 | 0 | 2.288204 | 0.900 | 0.264 | 0 |
ENSCAFG00000016065 | 0 | 2.531191 | 0.823 | 0.187 | 0 |
RPL5 | 0 | 2.209198 | 0.919 | 0.284 | 0 |
EIF3K | 0 | 2.194524 | 0.807 | 0.173 | 0 |
FTL | 0 | 2.521577 | 0.750 | 0.117 | 0 |
kable(head(Cluster1_vs_Cluster0, n=20), caption = "Cluster 1 vs Cluster 0, Canine Thymus, Resolution: 0.2")
p_val | avg_log2FC | pct.1 | pct.2 | p_val_adj | |
---|---|---|---|---|---|
RPL13A | 0 | -2.845589 | 0.239 | 0.927 | 0 |
RPL14 | 0 | -2.466876 | 0.235 | 0.916 | 0 |
CFL1 | 0 | -2.517685 | 0.151 | 0.822 | 0 |
RPL27 | 0 | -2.369931 | 0.251 | 0.910 | 0 |
MYO7A | 0 | -2.863546 | 0.292 | 0.946 | 0 |
RPL36AL | 0 | -2.409323 | 0.141 | 0.794 | 0 |
SERF2 | 0 | -2.232048 | 0.247 | 0.900 | 0 |
EEF1A1 | 0 | -2.527629 | 0.262 | 0.914 | 0 |
RPLP0 | 0 | -2.340771 | 0.206 | 0.857 | 0 |
RPL34 | 0 | -2.358186 | 0.222 | 0.870 | 0 |
RPL7 | 0 | -2.375933 | 0.266 | 0.908 | 0 |
ENSCAFG00000037735 | 0 | -2.469168 | 0.270 | 0.909 | 0 |
RPS3 | 0 | -2.327889 | 0.236 | 0.875 | 0 |
ENSCAFG00000043730 | 0 | -2.569764 | 0.123 | 0.761 | 0 |
RPS5 | 0 | -2.294508 | 0.288 | 0.925 | 0 |
ENSCAFG00000006102 | 0 | -2.288204 | 0.264 | 0.900 | 0 |
ENSCAFG00000016065 | 0 | -2.531191 | 0.187 | 0.823 | 0 |
RPL5 | 0 | -2.209198 | 0.284 | 0.919 | 0 |
EIF3K | 0 | -2.194524 | 0.173 | 0.807 | 0 |
FTL | 0 | -2.521577 | 0.117 | 0.750 | 0 |
kable(head(Cluster4_vs_Cluster8, n=20), caption = "Cluster 4 vs Cluster 8, Canine Thymus, Resolution: 0.2")
p_val | avg_log2FC | pct.1 | pct.2 | p_val_adj | |
---|---|---|---|---|---|
PTMA | 0 | 2.542254 | 0.999 | 0.955 | 0 |
ENSCAFG00000032392 | 0 | 3.463476 | 0.997 | 0.559 | 0 |
STMN1 | 0 | 2.530064 | 0.999 | 0.795 | 0 |
H2AZ1 | 0 | 3.015989 | 0.998 | 0.706 | 0 |
CFL1 | 0 | 3.148784 | 0.995 | 0.444 | 0 |
TPT1 | 0 | 2.611699 | 1.000 | 0.840 | 0 |
RPS8 | 0 | 2.895787 | 0.999 | 0.722 | 0 |
RPL8 | 0 | 2.789826 | 0.999 | 0.685 | 0 |
ENSCAFG00000009523 | 0 | 2.625405 | 0.998 | 0.772 | 0 |
DHDDS | 0 | 2.910188 | 0.998 | 0.785 | 0 |
RPS4X | 0 | 2.781179 | 1.000 | 0.759 | 0 |
ENSCAFG00000045333 | 0 | 3.186249 | 0.995 | 0.525 | 0 |
RPS6 | 0 | 3.015724 | 0.998 | 0.677 | 0 |
H3-3A | 0 | 2.432530 | 0.997 | 0.625 | 0 |
RPS7 | 0 | 2.570644 | 1.000 | 0.730 | 0 |
ENSCAFG00000019044 | 0 | 2.980221 | 0.999 | 0.622 | 0 |
RPS25 | 0 | 2.951160 | 0.996 | 0.609 | 0 |
RPL11 | 0 | 2.643341 | 0.999 | 0.711 | 0 |
ENSCAFG00000049043 | 0 | 2.754056 | 0.998 | 0.682 | 0 |
RPL13 | 0 | 2.820418 | 0.998 | 0.577 | 0 |
kable(head(Cluster5_vs_Cluster6, n=20), caption = "Cluster 5 vs Cluster 6, Canine Thymus, Resolution: 0.2")
p_val | avg_log2FC | pct.1 | pct.2 | p_val_adj | |
---|---|---|---|---|---|
PTMA | 0 | 2.542254 | 0.999 | 0.955 | 0 |
ENSCAFG00000032392 | 0 | 3.463476 | 0.997 | 0.559 | 0 |
STMN1 | 0 | 2.530064 | 0.999 | 0.795 | 0 |
H2AZ1 | 0 | 3.015989 | 0.998 | 0.706 | 0 |
CFL1 | 0 | 3.148784 | 0.995 | 0.444 | 0 |
TPT1 | 0 | 2.611699 | 1.000 | 0.840 | 0 |
RPS8 | 0 | 2.895787 | 0.999 | 0.722 | 0 |
RPL8 | 0 | 2.789826 | 0.999 | 0.685 | 0 |
ENSCAFG00000009523 | 0 | 2.625405 | 0.998 | 0.772 | 0 |
DHDDS | 0 | 2.910188 | 0.998 | 0.785 | 0 |
RPS4X | 0 | 2.781179 | 1.000 | 0.759 | 0 |
ENSCAFG00000045333 | 0 | 3.186249 | 0.995 | 0.525 | 0 |
RPS6 | 0 | 3.015724 | 0.998 | 0.677 | 0 |
H3-3A | 0 | 2.432530 | 0.997 | 0.625 | 0 |
RPS7 | 0 | 2.570644 | 1.000 | 0.730 | 0 |
ENSCAFG00000019044 | 0 | 2.980221 | 0.999 | 0.622 | 0 |
RPS25 | 0 | 2.951160 | 0.996 | 0.609 | 0 |
RPL11 | 0 | 2.643341 | 0.999 | 0.711 | 0 |
ENSCAFG00000049043 | 0 | 2.754056 | 0.998 | 0.682 | 0 |
RPL13 | 0 | 2.820418 | 0.998 | 0.577 | 0 |
sessionInfo()
## R version 4.4.1 (2024-06-14)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 22.04.4 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so; LAPACK version 3.10.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## time zone: Etc/UTC
## tzcode source: system (glibc)
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] clusterProfiler_4.14.4 knitr_1.49
## [3] data.table_1.16.4 celldex_1.16.0
## [5] SingleR_2.8.0 SummarizedExperiment_1.36.0
## [7] Biobase_2.66.0 GenomicRanges_1.58.0
## [9] GenomeInfoDb_1.42.1 IRanges_2.40.1
## [11] S4Vectors_0.44.0 BiocGenerics_0.52.0
## [13] MatrixGenerics_1.18.1 matrixStats_1.5.0
## [15] RColorBrewer_1.1-3 pheatmap_1.0.12
## [17] patchwork_1.3.0 lubridate_1.9.4
## [19] forcats_1.0.0 stringr_1.5.1
## [21] dplyr_1.1.4 purrr_1.0.2
## [23] readr_2.1.5 tidyr_1.3.1
## [25] tibble_3.2.1 ggplot2_3.5.1
## [27] tidyverse_2.0.0 Seurat_5.2.1
## [29] SeuratObject_5.0.2 sp_2.1-4
##
## loaded via a namespace (and not attached):
## [1] fs_1.6.5 spatstat.sparse_3.1-0
## [3] enrichplot_1.26.6 httr_1.4.7
## [5] tools_4.4.1 sctransform_0.4.1
## [7] alabaster.base_1.6.1 R6_2.5.1
## [9] HDF5Array_1.34.0 lazyeval_0.2.2
## [11] uwot_0.2.2 rhdf5filters_1.18.0
## [13] withr_3.0.2 gridExtra_2.3
## [15] progressr_0.15.1 cli_3.6.3
## [17] spatstat.explore_3.3-3 fastDummies_1.7.5
## [19] labeling_0.4.3 alabaster.se_1.6.0
## [21] sass_0.4.9 spatstat.data_3.1-4
## [23] ggridges_0.5.6 pbapply_1.7-2
## [25] yulab.utils_0.1.9 gson_0.1.0
## [27] DOSE_4.0.0 R.utils_2.12.3
## [29] parallelly_1.40.1 limma_3.62.2
## [31] rstudioapi_0.17.1 RSQLite_2.3.9
## [33] gridGraphics_0.5-1 generics_0.1.3
## [35] ica_1.0-3 spatstat.random_3.3-2
## [37] GO.db_3.20.0 Matrix_1.7-0
## [39] abind_1.4-8 R.methodsS3_1.8.2
## [41] lifecycle_1.0.4 yaml_2.3.10
## [43] qvalue_2.38.0 rhdf5_2.50.2
## [45] SparseArray_1.6.1 BiocFileCache_2.14.0
## [47] Rtsne_0.17 grid_4.4.1
## [49] blob_1.2.4 promises_1.3.2
## [51] ExperimentHub_2.14.0 crayon_1.5.3
## [53] ggtangle_0.0.6 miniUI_0.1.1.1
## [55] lattice_0.22-6 beachmat_2.22.0
## [57] cowplot_1.1.3 KEGGREST_1.46.0
## [59] pillar_1.10.1 fgsea_1.32.2
## [61] future.apply_1.11.3 codetools_0.2-20
## [63] fastmatch_1.1-6 glue_1.8.0
## [65] ggfun_0.1.8 spatstat.univar_3.1-1
## [67] treeio_1.30.0 vctrs_0.6.5
## [69] png_0.1-8 gypsum_1.2.0
## [71] spam_2.11-1 gtable_0.3.6
## [73] cachem_1.1.0 xfun_0.49
## [75] S4Arrays_1.6.0 mime_0.12
## [77] survival_3.6-4 statmod_1.5.0
## [79] fitdistrplus_1.2-2 ROCR_1.0-11
## [81] nlme_3.1-164 ggtree_3.14.0
## [83] bit64_4.6.0-1 alabaster.ranges_1.6.0
## [85] filelock_1.0.3 RcppAnnoy_0.0.22
## [87] bslib_0.8.0 irlba_2.3.5.1
## [89] KernSmooth_2.23-24 colorspace_2.1-1
## [91] DBI_1.2.3 tidyselect_1.2.1
## [93] bit_4.5.0.1 compiler_4.4.1
## [95] curl_6.1.0 httr2_1.1.0
## [97] BiocNeighbors_2.0.1 DelayedArray_0.32.0
## [99] plotly_4.10.4 scales_1.3.0
## [101] lmtest_0.9-40 rappdirs_0.3.3
## [103] digest_0.6.37 goftest_1.2-3
## [105] presto_1.0.0 spatstat.utils_3.1-1
## [107] alabaster.matrix_1.6.1 rmarkdown_2.29
## [109] XVector_0.46.0 htmltools_0.5.8.1
## [111] pkgconfig_2.0.3 sparseMatrixStats_1.18.0
## [113] dbplyr_2.5.0 fastmap_1.2.0
## [115] rlang_1.1.5 htmlwidgets_1.6.4
## [117] UCSC.utils_1.2.0 shiny_1.10.0
## [119] DelayedMatrixStats_1.28.1 farver_2.1.2
## [121] jquerylib_0.1.4 zoo_1.8-12
## [123] jsonlite_1.8.9 BiocParallel_1.40.0
## [125] GOSemSim_2.32.0 R.oo_1.27.0
## [127] BiocSingular_1.22.0 magrittr_2.0.3
## [129] ggplotify_0.1.2 GenomeInfoDbData_1.2.13
## [131] dotCall64_1.2 Rhdf5lib_1.28.0
## [133] munsell_0.5.1 Rcpp_1.0.13-1
## [135] ape_5.8-1 reticulate_1.40.0
## [137] stringi_1.8.4 alabaster.schemas_1.6.0
## [139] zlibbioc_1.52.0 MASS_7.3-60.2
## [141] AnnotationHub_3.14.0 plyr_1.8.9
## [143] parallel_4.4.1 listenv_0.9.1
## [145] ggrepel_0.9.6 deldir_2.0-4
## [147] Biostrings_2.74.1 splines_4.4.1
## [149] tensor_1.5 hms_1.1.3
## [151] igraph_2.1.2 spatstat.geom_3.3-4
## [153] RcppHNSW_0.6.0 reshape2_1.4.4
## [155] ScaledMatrix_1.14.0 BiocVersion_3.20.0
## [157] evaluate_1.0.3 BiocManager_1.30.25
## [159] tzdb_0.4.0 httpuv_1.6.15
## [161] RANN_2.6.2 polyclip_1.10-7
## [163] future_1.34.0 scattermore_1.2
## [165] rsvd_1.0.5 xtable_1.8-4
## [167] tidytree_0.4.6 RSpectra_0.16-2
## [169] later_1.4.1 viridisLite_0.4.2
## [171] aplot_0.2.4 memoise_2.0.1
## [173] AnnotationDbi_1.68.0 cluster_2.1.6
## [175] timechange_0.3.0 globals_0.16.3
citation()
## To cite R in publications use:
##
## R Core Team (2024). _R: A Language and Environment for Statistical
## Computing_. R Foundation for Statistical Computing, Vienna, Austria.
## <https://www.R-project.org/>.
##
## A BibTeX entry for LaTeX users is
##
## @Manual{,
## title = {R: A Language and Environment for Statistical Computing},
## author = {{R Core Team}},
## organization = {R Foundation for Statistical Computing},
## address = {Vienna, Austria},
## year = {2024},
## url = {https://www.R-project.org/},
## }
##
## We have invested a lot of time and effort in creating R, please cite it
## when using it for data analysis. See also 'citation("pkgname")' for
## citing R packages.