Introduction

The purpose of this script is to annotate single-cell RNA-seq clusters following filtering, normalization, and clustering of the data with Seurat. This script has been adapted from the Seurat documentation (https://satijalab.org/seurat/articles/pbmc3k_tutorial#assigning-cell-type-identity-to-clusters).

Software

library(Seurat)
library(tidyverse)
library(patchwork)
library(pheatmap)
library(RColorBrewer)
library(SingleR)
library(celldex) # To install: BiocManager::install("celldex")
library(data.table)
library(knitr)
library(clusterProfiler)
library(presto)

Data

Input: Pre-processed Seurat object.

setwd("/scratch/alpine/edlarsen@colostate.edu/project_scrna_01/240828_scRNAseq/Cluster_Annotation")
thym_seurat <- readRDS(file = "../Normalization_and_Clustering/THYM_NormalizedAndClustered.RData")

# Differentially expressed features
## identifies all positive and negative markers of a single cluster compared to all other cells.
thym_seurat <- FindClusters(thym_seurat, resolution = 0.2)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 22654
## Number of edges: 677273
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9488
## Number of communities: 11
## Elapsed time: 4 seconds
thym_seurat <- RunUMAP(thym_seurat, dims = 1:10)
thym.markers <- FindAllMarkers(thym_seurat, only.pos=TRUE)

# umap
thym_seurat <- RunUMAP(thym_seurat, 
                           dims = 1:10,
                           n.neighbors = 50, # default is 30
                           min.dist = 0.5) # default is 0.3

DimPlot(thym_seurat,
        reduction = "umap",
                   label = TRUE,
                   label.size = 6) + 
  plot_annotation(title = "Canine Thymus, Resolution: 0.2, \nn.neighbors = 50, min.dist = 0.5", theme = theme(plot.title = element_text(hjust = 0.5, size = 20)))

Cell-Cycle Scoring

See if cluster differences are associated with differences in cell proliferation.

Assign Cell-Cycle Scores

Each cell will be assigned a score based on its expression of G2/M and S phase markers. The CellCycleStoring() function stores S and G2/M scores in object metadata, along with the predicted classification of each cell in either G2/M, S, or G1 phase.

# segregate G2/M phase and S phase markers from Seurat's built-in list of cell cycle markers
s.genes <- cc.genes$s.genes
g2m.genes <- cc.genes$g2m.genes

# assign each cell a score based on expression of G2/M and S phase markers
thym_seurat_cycle <- CellCycleScoring(thym_seurat,
                                s.features = s.genes, 
                                g2m.features = g2m.genes, 
                                set.ident = TRUE)

Visualize cycling cells on UMAP

thym_seurat_cycle <- RunPCA(thym_seurat_cycle, features = c(s.genes, g2m.genes))
DimPlot(thym_seurat_cycle,
        group.by = "Phase",
        reduction = "umap",
        pt.size=0.5) + 
  ggtitle("Cell Cycle Scoring: Canine Thymus")

Module scoring for Park et al. human thymus feature expression

Calculates the average expression levels of each cluster on a single-cell level, subtracted by the aggregated expression of control feature sets. All analyzed features are binned based on averaged expression, and the control features are randomly selected from each bin.

Top 20 marker genes for each cell type in human thymus atlas

Source: Park et al., Supplementary Table 4 (https://www.science.org/doi/10.1126/science.aay3224#supplementary-materials)

# make gene lists
human_thymus_atlas_top20 <- read.csv("Parketal_SupTable4_Top20MarkersForCellTypes.csv")
colnames(human_thymus_atlas_top20)
##  [1] "B_memory"   "B_naive"    "B_plasma"   "B_pro_pre"  "CD4.CTL"   
##  [6] "CD4.PD1"    "CD4.T"      "CD4.Tmem"   "CD8.T"      "CD8.Tmem"  
## [11] "CD8aa_I"    "CD8aa_II"   "DC1"        "DC2"        "DN_P"      
## [16] "DN_Q"       "DN_early"   "DP_P"       "DP_Q"       "ETP"       
## [21] "Endo"       "Epi_GCM2"   "Ery"        "Fb_1"       "Fb_2"      
## [26] "Fb_cycling" "ILC3"       "Lymph"      "Mac"        "Mast"      
## [31] "Mgk"        "Mono"       "NK"         "NKT"        "NMP"       
## [36] "T_agonist"  "TEC_myo"    "TEC_neuro"  "Tfh"        "Th17"      
## [41] "Treg"       "Treg_diff"  "VSMC"       "aDC1"       "aDC2"      
## [46] "aDC3"       "cTEC"       "mTEC_I"     "mTEC_II"    "mTEC_III"  
## [51] "mTEC_IV"    "mcTEC"      "pDC"        "abT_entry"  "gdT"
plot_list <- list() # initiate empty list for individual plots

for (column in colnames(human_thymus_atlas_top20)){
  genes <- list(human_thymus_atlas_top20[[column]])
  name <- paste(column, "Score", sep="_")
  thym_seurat <- AddModuleScore(thym_seurat, features = genes, name = name)
  newname <- paste(name, "1", sep="")
  plotname <- paste(name, "modscoreplot", sep="_")
  modscoreplot <- FeaturePlot(thym_seurat, features = newname) + scale_color_gradientn(colours = brewer.pal(name = "RdPu", n=11))
  plot_list[[name]] <- modscoreplot # add plot to list
}

# split plot_list into groups of 4 and handle any remaining plots
plot_groups <- split(plot_list, ceiling(seq_along(plot_list) / 6))

# combined plot with dynamic number of columns/rows
combine_plots <- function(group, ncol = 3){
  n_plots <- length(group)
  nrow <- ceiling(n_plots / ncol)
  wrap_plots(group, ncol = ncol, nrow = nrow)
}
combined_plots <- lapply(plot_groups, function(group) {
  combine_plots(group, ncol = 3)
})

# display each combined plot
for (i in seq_along(combined_plots)) {
  print(combined_plots[[i]])
}

Top 51 marker genes for each cell type in human thymus atlas

Source: Park et al. data accessed through UCSC Cell Browser

# make gene lists
human_thymus_atlas_top51 <- read.csv("Parketal_Top51DEGenesForThymusClusters_fromUCSCCellBrowser.csv")
colnames(human_thymus_atlas_top51)
##  [1] "DN_Quiescent"                                 
##  [2] "DN_early"                                     
##  [3] "DN_Proliferating"                             
##  [4] "DP_Proliferating"                             
##  [5] "DP_Quiescent"                                 
##  [6] "T_agonist"                                    
##  [7] "Differentiating_Tregs"                        
##  [8] "Treg"                                         
##  [9] "CD4_T"                                        
## [10] "CD8_T"                                        
## [11] "Th17"                                         
## [12] "HSC"                                          
## [13] "Early_Thymic_Progenitor"                      
## [14] "T_DN"                                         
## [15] "Megakaryocyte"                                
## [16] "Erythrocyte"                                  
## [17] "MastCell"                                     
## [18] "Megakaryocyte_Erythrocyte_MastCell_Progenitor"
## [19] "Neutrophil_Myeloid_Progenitor"                
## [20] "PreProB"                                      
## [21] "Plasmacytoid_DCs"                             
## [22] "DN_all"                                       
## [23] "DP_all"                                       
## [24] "ILC3"                                         
## [25] "NK"                                           
## [26] "CD4_T_Memory"                                 
## [27] "CD8_T_Memory"                                 
## [28] "NKT"                                          
## [29] "B_memory"
plot_list <- list() # initiate empty list for individual plots

for (column in colnames(human_thymus_atlas_top51)){
  genes <- list(human_thymus_atlas_top51[[column]])
  name <- paste(column, "Score", sep="_")
  thym_seurat <- AddModuleScore(thym_seurat, features = genes, name = name)
  newname <- paste(name, "1", sep="")
  modscoreplot <- FeaturePlot(thym_seurat, features = newname) + scale_color_gradientn(colours = brewer.pal(name = "RdPu", n=11))
  plot_list[[name]] <- modscoreplot # add plot to list
}

# split plot_list into groups of 6 and handle any remaining plots
plot_groups <- split(plot_list, ceiling(seq_along(plot_list) / 6))

# combined plot with dynamic number of columns/rows
combine_plots <- function(group, ncol = 3){
  n_plots <- length(group)
  nrow <- ceiling(n_plots / ncol)
  wrap_plots(group, ncol = ncol, nrow = nrow)
}
combined_plots <- lapply(plot_groups, function(group) {
  combine_plots(group, ncol = 3)
})

# display each combined plot
for (i in seq_along(combined_plots)) {
  print(combined_plots[[i]])
}

Assigning cell type identity to clusters

To be done once confident cluster assignment has been achieved.

new.thym.cluster.ids <- c("T0_DP", "T1_DP", "T2_EarlySP", "T3_DN", "T4_Cycling", "T5_LateSP1", "T6_LateSP2", "T7_Bcell", "T8_Cycling", "T9_MonoMacDC", "T10_Granulocytes")
names(new.thym.cluster.ids) <- levels(thym_seurat)
thym_seurat <- RenameIdents(thym_seurat, new.thym.cluster.ids)
DimPlot(thym_seurat, reduction = "umap", label = TRUE, label.size = 3, label.box = TRUE, pt.size = 0.5) + ggtitle("Canine Thymus, Resolution: 0.2")

saveRDS(thym_seurat, file="THYM_Annotated.RData")