Introduction

The purpose of this script is to perform gene expression analysis following pseudotime trajectory analysis on single-cell RNA-seq data of T-cell populations within the normal canine thymus and lymph node, to track how gene expression changes as T cells progress through development.

Acknowledgements

This script was adapted from the Monocle 3 documentation (https://cole-trapnell-lab.github.io/monocle3/docs/trajectories/)

Software

library(Seurat)
library(knitr)
library(tidyverse)
library(patchwork)
library(monocle3)
library(SeuratWrappers)
library(pheatmap)
library(stringr)
library(RColorBrewer)

Data

Input: A Seurat object of T-cell subsets from normal canine lymph node and thymus.

setwd("C:/Users/edlarsen/Documents/240828_scRNAseq/Pseudotime")
seu <- readRDS(file = "C:/Users/edlarsen/Documents/240828_scRNAseq/T_Cells/IntegThymAndLN_Annotated.RData")

# convert seurat to cds
cds <- as.cell_data_set(seu, assay = "RNA")
cds <- estimate_size_factors(cds)
fData(cds)$gene_short_name <- rownames(fData(cds))

# preprocessing and trajectory analysis with Monocle 3
cds <- cluster_cells(cds, reduction_method = "UMAP")
colData(cds)$assigned_cell_type <- as.character(clusters(cds))
colData(cds)$assigned_cell_type <- dplyr::recode(colData(cds)$assigned_cell_type,
                                                 "1" = "DP_Thymocytes_2",
                                                 "2" = "DP_Thymocytes_1",
                                                 "3" = "Naive_to_Activated_T",
                                                 "4" = "Naive_T_2",
                                                 "5" = "SP_Thym_and_Naive_T",
                                                 "6" = "Naive_T_1",
                                                 "7" = "Activated_T",
                                                 "8" = "SP_Thym",
                                                 "9" = "Proliferating_DP_Thym",
                                                 "10" = "CD8_NK",
                                                 "11" = "ETP_DN_ISP_Thym",
                                                 "12" = "Late_SP_Thym")
cds <- reduce_dimension(cds)
cds <- cluster_cells(cds, reduction_method = "UMAP")
cds <- learn_graph(cds, use_partition = F)
cds <- order_cells(cds, reduction_method = "UMAP", root_cells = colnames(cds[, clusters(cds) == 10]))
cds$monocle3_pseudotime <- pseudotime(cds)

Top markers of each cluster

Plot the top gene for each cluster by pseudo R-squared value, a specificity metric ranging from 0 to 1.

marker_test_res <- top_markers(cds, group_cells_by = 'assigned_cell_type', reference_cells = 1000)
write.csv(marker_test_res, file = "topMarkers_Monocle3_clusters.csv")

top_specific_markers <- marker_test_res %>%
  # expressed by at least 85% of cells in the cluster
  filter(fraction_expressing >= 0.85) %>%
  group_by(cell_group) %>%
  # top 3 genes by pseudo R-squared value for specificity
  top_n(3, pseudo_R2)

top_specific_markers_ids <- unique(top_specific_markers %>% pull(gene_id))

plot_genes_by_group(cds,
                    top_specific_markers_ids,
                    group_cells_by = "assigned_cell_type",
                    ordering_type = "cluster_row_col",
                    max.size = 5)

About these genes:

Find genes that change as a function of pseudotime

Calculate Morans I values

The graph_test() function uses a statistic from spatial autocorrelation analysis (Moran’s I) to find genes that vary between groups of cells in UMAP space. A Moran’s I value of 0 indicates no effect, while +1 indicates perfect positive autocorrelation and suggests that nearby cells have very similar values of a gene’s expression. Significant values much less than zero are generally rare. Positive values indicate a gene is expressed in a focal region of the UMAP space (i.e., specific to one or more clusters).

deg <- graph_test(cds, neighbor_graph = "principal_graph") # the data frame 'deg' will contain the Moran's I test results for each gene in cds.
deg_ids <- row.names(subset(deg, q_value < 0.05))
write.csv(deg, file="monocle3_pseudotimeGeneExpressionAnalysis.csv")

Plot differentially expressed genes across pseudotime

Plot markers of interest that varied the most across pseudotime or were one of the top markers for a significant stage of thymocyte and T-cell development.

cds <- cluster_cells(cds, reduction_method = "UMAP")
cds <- learn_graph(cds, use_partition = F)
cds <- order_cells(cds, reduction_method = "UMAP", root_cells = colnames(cds[, clusters(cds) == 10]))

genes <- c("PAK3", "FBN2", "CDH1", "DPP4", "TBC1D9", "MRC1", "BAHCC1", "ALPL", "PTPRF", "CHN2", "NEDD4L", "CD1C", "ADAM12", "DHDDS", "DNTT", "ARPP21", "MAML3", "FSTL4", "INPP4B", "SCML4", "TOX2", "BCL2", "RIPOR2", "TGFBR2", "FAU", "RPL13", "RPL19", "RPS11", "RPS15A", "SELL", "LTB", "SCML4", "DLA-DRA", "TCF12", "RAG1", "FLT3", "CCR9", "CD44", "GATA3", "NOTCH1", "NOTCH3", "SATB1")

for (gene in genes){
  p <- plot_cells(cds,
             genes=gene,
             label_cell_groups = FALSE,
             show_trajectory_graph = FALSE) &
    scale_color_gradientn(colours = rev(brewer.pal(name="Spectral", n=11)))
  print(p)
}

for (gene in genes){
  cds_subset <- cds[rowData(cds)$gene_short_name == gene]
  cds_subset <- order_cells(cds_subset, reduction_method = "UMAP", root_cells = colnames(cds[, clusters(cds) == 10]))
  p <- plot_genes_in_pseudotime(cds_subset, color_cells_by = "assigned_cell_type") + theme(text = element_text(size=20)) + guides(color = guide_legend(override.aes = list(size=5)))
  print(p)
}

plot_genes_by_group(cds,
                    genes,
                    group_cells_by = "assigned_cell_type",
                    ordering_type = "cluster_row_col",
                    max.size = 5)

Assign genes to modules that have similar patterns of expression

find_gene_modules() runs UMAP on genes (as opposed to cells) to group them based on their Moran’s I value into modules using Louvain community analysis.

Repeat normalization and preprocessing steps on cds object

This is required for find_gene_modules to work; see https://github.com/cole-trapnell-lab/monocle3/issues/623 and https://github.com/cole-trapnell-lab/monocle3/issues/655.

cds <- preprocess_cds(cds)
cds <- reduce_dimension(cds)
cds <- cluster_cells(cds, reduction_method = "UMAP", resolution = 1e-4)

p1 <- plot_cells(cds,
                 color_cells_by = "assigned_cell_type",
                 label_groups_by_cluster = T,
                 label_branch_points = T,
                 show_trajectory_graph = F,
                 group_label_size = 5) + ggtitle("Assigned Cell Types from Seurat Clusters") + theme(legend.position = "right")

p2 <- plot_cells(cds, 
                 group_label_size = 5, 
                 show_trajectory_graph = FALSE) + ggtitle("Monocle3 Cluster IDs")
p1+p2

Associate gene modules with clusters

Visualize which gene modules are expressed in which clusters.

gene_module_df <- find_gene_modules(cds[deg_ids, ], resolution=1e-2) # creates data frame that contains a row for each gene and identifies the module it belongs to
write.csv(gene_module_df, file = "gene_module_df.csv")
kable(head(gene_module_df, n = 20))
id module supermodule dim_1 dim_2
ZNF157 44 1 0.4639565 0.5635854
EMD 25 1 -3.4970799 -1.2860831
SMC1A 41 1 1.2785526 -3.4609367
CENPI 41 1 1.2040056 -4.2394809
AFF2 74 1 0.0797271 2.9250151
ZNF41 18 1 -1.6303992 1.7328247
FLNA 27 1 -3.5133209 0.6423115
MORC4 36 1 4.3998816 -0.2569925
ENSCAFG00000013554 64 1 -5.1286199 -2.2687923
ARAF 40 1 2.8898923 0.6751892
RBM41 2 1 4.6193874 1.1333959
TIMM8A 33 1 -1.8862363 -0.9665346
SYN1 71 2 -5.8734577 2.9314056
NUP62CL 41 1 1.4067100 -4.0201897
TIMP1 19 1 -3.0838256 4.0108839
BTK 62 2 -6.1455238 4.2697926
ENSCAFG00000045333 29 1 -4.5652788 -1.9048082
PRPS1 56 1 -1.0661348 -3.4851185
IDS 19 1 -3.3845995 3.5496195
ELK1 60 1 0.6169909 1.7827289
# table aggregating expression of all genes in each module across all Monocle3 clusters
m3_cell_group_df <- tibble::tibble(
  cell=row.names(colData(cds)),
  cell_group=clusters(cds)[colnames(cds)])
agg_mat_m3 <- aggregate_gene_expression(cds, gene_module_df, m3_cell_group_df)
row.names(agg_mat_m3) <- stringr::str_c("Module ", row.names(agg_mat_m3))
colnames(agg_mat_m3) <- stringr::str_c("Cluster ", colnames(agg_mat_m3))
write.csv(agg_mat_m3, file = "GeneModuleAssignment_Monocle3Clusters.csv")

# table aggregating expression of all genes in each module across all cell type assignments from our Seurat object
ann_cell_group_df <- tibble::tibble(
  cell = colnames(cds),
  cell_group = colData(cds)$assigned_cell_type
)
agg_mat_ann <- aggregate_gene_expression(cds, gene_module_df, ann_cell_group_df)
row.names(agg_mat_ann) <- stringr::str_c("Module ", row.names(agg_mat_ann))
colnames(agg_mat_ann) <- stringr::str_c(unique(ann_cell_group_df$cell_group))
write.csv(agg_mat_ann, file = "GeneModuleAssignment_SeuratAnnotations.csv")
# heatmaps
pheatmap::pheatmap(agg_mat_m3,
                   cluster_rows = TRUE,
                   cluster_cols = TRUE,
                   scale = "column",
                   clustering_method = "ward.D2",
                   fontsize = 10,
                   main = "Expression of Gene Modules Across Monocle3 Clusters")

pheatmap::pheatmap(agg_mat_ann,
                   cluster_rows = TRUE,
                   cluster_cols = TRUE,
                   scale = "column",
                   clustering_method = "ward.D2",
                   fontsize = 10,
                   main = "Expression of Gene Modules Across Assigned Cell Types")

Citations

sessionInfo()
## R version 4.4.0 (2024-04-24 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 22631)
## 
## Matrix products: default
## 
## 
## locale:
## [1] LC_COLLATE=English_United States.utf8 
## [2] LC_CTYPE=English_United States.utf8   
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## 
## time zone: America/Denver
## tzcode source: internal
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] RColorBrewer_1.1-3          pheatmap_1.0.12            
##  [3] SeuratWrappers_0.4.0        monocle3_1.3.7             
##  [5] SingleCellExperiment_1.26.0 SummarizedExperiment_1.34.0
##  [7] GenomicRanges_1.56.2        GenomeInfoDb_1.40.1        
##  [9] IRanges_2.38.1              S4Vectors_0.42.1           
## [11] MatrixGenerics_1.16.0       matrixStats_1.5.0          
## [13] Biobase_2.64.0              BiocGenerics_0.50.0        
## [15] patchwork_1.3.0             lubridate_1.9.4            
## [17] forcats_1.0.0               stringr_1.5.1              
## [19] dplyr_1.1.4                 purrr_1.0.2                
## [21] readr_2.1.5                 tidyr_1.3.1                
## [23] tibble_3.2.1                ggplot2_3.5.1              
## [25] tidyverse_2.0.0             knitr_1.49                 
## [27] Seurat_5.2.1                SeuratObject_5.0.2         
## [29] sp_2.1-4                   
## 
## loaded via a namespace (and not attached):
##   [1] RcppAnnoy_0.0.22          splines_4.4.0            
##   [3] later_1.4.1               R.oo_1.27.0              
##   [5] polyclip_1.10-7           fastDummies_1.7.5        
##   [7] lifecycle_1.0.4           sf_1.0-19                
##   [9] Rdpack_2.6.2              pbmcapply_1.5.1          
##  [11] globals_0.16.3            lattice_0.22-6           
##  [13] MASS_7.3-60.2             magrittr_2.0.3           
##  [15] speedglm_0.3-5            plotly_4.10.4            
##  [17] sass_0.4.9                rmarkdown_2.29           
##  [19] jquerylib_0.1.4           yaml_2.3.10              
##  [21] remotes_2.5.0             httpuv_1.6.15            
##  [23] sctransform_0.4.1         spam_2.11-1              
##  [25] spatstat.sparse_3.1-0     reticulate_1.40.0        
##  [27] DBI_1.2.3                 cowplot_1.1.3            
##  [29] pbapply_1.7-2             minqa_1.2.8              
##  [31] abind_1.4-8               zlibbioc_1.50.0          
##  [33] Rtsne_0.17                R.utils_2.12.3           
##  [35] GenomeInfoDbData_1.2.12   ggrepel_0.9.6            
##  [37] irlba_2.3.5.1             listenv_0.9.1            
##  [39] spatstat.utils_3.1-2      units_0.8-5              
##  [41] goftest_1.2-3             RSpectra_0.16-2          
##  [43] spatstat.random_3.3-2     fitdistrplus_1.2-2       
##  [45] parallelly_1.42.0         DelayedMatrixStats_1.26.0
##  [47] codetools_0.2-20          DelayedArray_0.30.1      
##  [49] tidyselect_1.2.1          UCSC.utils_1.0.0         
##  [51] farver_2.1.2              viridis_0.6.5            
##  [53] lme4_1.1-36               spatstat.explore_3.3-4   
##  [55] jsonlite_1.8.9            e1071_1.7-16             
##  [57] progressr_0.15.1          ggridges_0.5.6           
##  [59] survival_3.5-8            tools_4.4.0              
##  [61] ica_1.0-3                 Rcpp_1.0.14              
##  [63] glue_1.8.0                gridExtra_2.3            
##  [65] SparseArray_1.4.8         xfun_0.49                
##  [67] withr_3.0.2               BiocManager_1.30.25      
##  [69] fastmap_1.2.0             boot_1.3-30              
##  [71] spData_2.3.4              digest_0.6.35            
##  [73] rsvd_1.0.5                timechange_0.3.0         
##  [75] R6_2.5.1                  mime_0.12                
##  [77] wk_0.9.4                  colorspace_2.1-1         
##  [79] scattermore_1.2           tensor_1.5               
##  [81] spatstat.data_3.1-4       R.methodsS3_1.8.2        
##  [83] RhpcBLASctl_0.23-42       generics_0.1.3           
##  [85] data.table_1.16.4         class_7.3-22             
##  [87] httr_1.4.7                htmlwidgets_1.6.4        
##  [89] S4Arrays_1.4.1            spdep_1.3-10             
##  [91] uwot_0.2.2                pkgconfig_2.0.3          
##  [93] gtable_0.3.6              lmtest_0.9-40            
##  [95] XVector_0.44.0            htmltools_0.5.8.1        
##  [97] dotCall64_1.2             scales_1.3.0             
##  [99] png_0.1-8                 spatstat.univar_3.1-1    
## [101] reformulas_0.4.0          rstudioapi_0.17.1        
## [103] tzdb_0.4.0                reshape2_1.4.4           
## [105] nlme_3.1-164              nloptr_2.1.1             
## [107] proxy_0.4-27              cachem_1.1.0             
## [109] zoo_1.8-12                KernSmooth_2.23-22       
## [111] parallel_4.4.0            miniUI_0.1.1.1           
## [113] s2_1.1.7                  pillar_1.10.1            
## [115] grid_4.4.0                vctrs_0.6.5              
## [117] RANN_2.6.2                slam_0.1-55              
## [119] promises_1.3.2            xtable_1.8-4             
## [121] cluster_2.1.6             evaluate_1.0.3           
## [123] cli_3.6.2                 compiler_4.4.0           
## [125] rlang_1.1.3               crayon_1.5.3             
## [127] grr_0.9.5                 leidenbase_0.1.31        
## [129] future.apply_1.11.3       labeling_0.4.3           
## [131] classInt_0.4-11           plyr_1.8.9               
## [133] stringi_1.8.4             viridisLite_0.4.2        
## [135] deldir_2.0-4              assertthat_0.2.1         
## [137] munsell_0.5.1             lazyeval_0.2.2           
## [139] spatstat.geom_3.3-5       Matrix_1.7-0             
## [141] RcppHNSW_0.6.0            hms_1.1.3                
## [143] sparseMatrixStats_1.16.0  future_1.34.0            
## [145] shiny_1.10.0              rbibutils_2.3            
## [147] ROCR_1.0-11               igraph_2.1.4             
## [149] bslib_0.8.0               biglm_0.9-3
citation()
## To cite R in publications use:
## 
##   R Core Team (2024). _R: A Language and Environment for Statistical
##   Computing_. R Foundation for Statistical Computing, Vienna, Austria.
##   <https://www.R-project.org/>.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {R: A Language and Environment for Statistical Computing},
##     author = {{R Core Team}},
##     organization = {R Foundation for Statistical Computing},
##     address = {Vienna, Austria},
##     year = {2024},
##     url = {https://www.R-project.org/},
##   }
## 
## We have invested a lot of time and effort in creating R, please cite it
## when using it for data analysis. See also 'citation("pkgname")' for
## citing R packages.