Introduction

The purpose of this script it to utilize gene set variation analysis (GSVA) and module scoring to compare the gene expression profile of canine CD4+ PTCL to the gene signatures of normal canine thymocytes and nodal T cells at various stages of development derived from a single-cell transcriptomic atlas.

Software

Gene Set Variation Analysis

GSVA evaluates the enrichment of each PTCL sample for gene signatures derived from a single-cell transcriptomic atlas of normal canine thymocytes and nodal T cells at various stages of development.

Data

Input: Bulk RNA-seq counts normalized and variance stabilized transformed by DESeq2, and a gmt file of the top 100 significantly upregulated genes in each cluster of the transcriptomic atlas (based on Seurat::FindAllMarkers).

setwd("C:/Users/edlarsen/Documents/240828_scRNAseq/PTCL_GSVA_GSEA")

# GMT file
gmt <- getGmt("gene-signatures-of-seurat-integrated-ln-and-thym-clusters-top100.gmt", geneIdType = SymbolIdentifier())

# VST counts
## Cohort 1
vsd1 <- read.csv("VST_NormalizedCounts_CD4sOnly_Cohort1.csv")
vsd1 <- vsd1[!duplicated(vsd1$gene_name), ] # Remove rows with duplicate gene symbols.
rownames(vsd1) <- vsd1$gene_name # make gene name rownames
vsd1 <- dplyr::select(vsd1, -c("X", "probe_id", "gene_name", "description")) # Remove all columns that don't contain counts

# filter out cell cycle genes
vsd1 <- vsd1 %>%
  filter(!row.names(vsd1) %in% cc.genes$s.genes)
vsd1 <- vsd1 %>%
  filter(!row.names(vsd1) %in% cc.genes$g2m.genes)

vsd1_matrix <- data.matrix(vsd1) # Convert to a matrix prior to analysis.


## Cohort 2
vsd2 <- read.csv("VST_NormalizedCounts_Cohort2.csv")
vsd2 <- vsd2[!duplicated(vsd2$gene_name), ] # Remove rows with duplicate gene symbols.
rownames(vsd2) <- vsd2$gene_name # make gene name rownames
vsd2 <- dplyr::select(vsd2, -c("X", "probe_id", "gene_name", "description")) # Remove all columns that don't contain counts

# filter out cell cycle genes
vsd2 <- vsd2 %>%
  filter(!row.names(vsd2) %in% cc.genes$s.genes)
vsd2 <- vsd2 %>%
  filter(!row.names(vsd2) %in% cc.genes$g2m.genes)

vsd2_matrix <- data.matrix(vsd2) # Convert to a matrix prior to analysis.

# metadata
## Cohort 1
metadata1 <- read.table("metadata_Cohort1.txt")
colnames(metadata1) <- c("avery_number", "sample_name", "phenotype")
rownames(metadata1) <- metadata1$sample_name # make sample names the rownames
metadata1 <- dplyr::select(metadata1, c("phenotype")) # Select columns in metadata1 to be annotated on the GSVA heatmap
metadata1 <- metadata1[row.names(metadata1) != "CF21", , drop = FALSE]

## keep only CD4 PTCL and CD4 CTRL samples from Cohort 1
keepGroups <- c("CD4_PTCL", "CD4_CTRL")
metadata1 <- metadata1 %>%
  filter(phenotype %in% keepGroups)
keepList <- rownames(metadata1)
vsd1_matrix <- vsd1_matrix[, colnames(vsd1_matrix) %in% keepList]

## Cohort 2
metadata2 <- read.table("metadata_Cohort2.txt")
colnames(metadata2) <- c("avery_number", "sample_name", "phenotype")
rownames(metadata2) <- metadata2$sample_name # make sample names the rownames
metadata2 <- dplyr::select(metadata2, c("phenotype")) # Select columns in metadata2 to be annotated on the GSVA heatmap

Reference transcriptomic atlas:

seu <- readRDS(file = "../T_Cells/IntegThymAndLN_Annotated.Rdata")

kable(table(Idents(seu)))

Var1	Freq
Naive_T_2	9962
DP_Thymocytes_1	7452
Activated_T	5139
DP_Thymocytes_2	4602
SP_Thymocytes	3349
Naive_T_1	3151
Proliferating_DP_Thymocytes	2157
CD8_NK	1962
ETP_DN_Thymocytes	1962
Late_SP_Thymocytes_1	321
Late_SP_Thymocytes_2	254

DimPlot(seu, 
        reduction = "umap", 
        label = TRUE, 
        label.size = 3, 
        label.box = TRUE, 
        pt.size = 0.5) + 
  ggtitle("Canine Merged Lymph Node and Thymus T Cells, Resolution: 0.2")

GSVA Analysis

# set parameters for GSVA
gsva_param1 <- gsvaParam(
  vsd1_matrix,
  gmt,
  kcdf = "Gaussian", # Compute Gaussian-distributed scores
  minSize = 10, # Minimum gene set size
  maxSize = 500, # Maximum gene set size
  maxDiff = TRUE,
  absRanking = FALSE
)

gsva_param2 <- gsvaParam(
  vsd2_matrix,
  gmt,
  kcdf = "Gaussian", # Compute Gaussian-distributed scores
  minSize = 10, # Minimum gene set size
  maxSize = 500, # Maximum gene set size
  maxDiff = TRUE,
  absRanking = FALSE
)

# perform GSVA
gsva_results1 <- gsva(
  gsva_param1,
  verbose=FALSE,   # Don't print out the progress bar
)

gsva_results2 <- gsva(
  gsva_param2,
  verbose=FALSE,   # Don't print out the progress bar
)

gsva_heatmap1 <- pheatmap::pheatmap(gsva_results1,
                                      annotation_col = metadata1,
                                      show_colnames = TRUE,
                                      show_rownames = TRUE,
                                      fontsize_row = 10,
                                      cluster_cols = TRUE, 
                                      cluster_rows = TRUE,
                                      cutree_rows = 2,
                                      cutree_cols = 2,
                                      main = "GSVA (Seurat). Input: Vst transformed normalized DESeq2 counts, Cohort 1\nClustering: Ward, Distance: Euclidean",
                                      clustering_distance_rows = "euclidean",
                                      clustering_distance_cols = "euclidean",
                                      clustering_method = "ward.D2") + 
 scale_fill_gradient(c('dodgerblue', 'black', "yellow"))# Shrink the pathway labels a tad

gsva_heatmap2 <- pheatmap::pheatmap(gsva_results2,
                                      annotation_col = metadata2,
                                      show_colnames = TRUE,
                                      show_rownames = TRUE,
                                      fontsize_row = 10,
                                      cluster_cols = TRUE, 
                                      cluster_rows = TRUE,
                                      cutree_rows = 3,
                                      cutree_cols = 3,
                                      main = "GSVA (Seurat). Input: Vst transformed normalized DESeq2 counts, Cohort 2\nClustering: Ward, Distance: Euclidean",
                                      clustering_distance_rows = "euclidean",
                                      clustering_distance_cols = "euclidean",
                                      clustering_method = "ward.D2") + 
 scale_fill_gradient(c('dodgerblue', 'black', "yellow"))# Shrink the pathway labels a tad

Module scoring

Map the gene expression profiles of CD4+ PTCL and control CD4+ nodal lymphocytes (from bulk RNA-seq) to the single-cell atlas of normal canine thymocyte and T-cell subsets. ## Data Input: DESeq2 results of CD4+ PTCL vs control CD4+ nodal lymphocytes.

# DESeq2 results
df1 = read.csv("C:/Users/edlarsen/Documents/Jan2025 PTCLRNASeq for Dissertation/Cohort_1/Output/CD4_PTCLvsCD4_CTRL_DESeq2res.csv", header=TRUE)
df2 = read.csv("C:/Users/edlarsen/Documents/Jan2025 PTCLRNASeq for Dissertation/Cohort_2/Output/CD4_PTCLvsCD4_LN_CTRL_DESeq2res.csv", header=TRUE)
df3 = read.csv("C:/Users/edlarsen/Documents/Jan2025 PTCLRNASeq for Dissertation/Cohort_2/Output/CD4THYM_CTRLvsCD4_LN_CTRL_DESeq2res.csv", header=TRUE)
df4 = read.csv("C:/Users/edlarsen/Documents/Jan2025 PTCLRNASeq for Dissertation/Cohort_2/Output/CD4_PTCLvsCD4THYM_CTRL_DESeq2res.csv", header=TRUE)

# omit entries with log2fc of 'NA'
df1 <- df1[!is.na(df1$log2FoldChange),]
df2 <- df2[!is.na(df2$log2FoldChange),]
df3 <- df3[!is.na(df3$log2FoldChange),]
df4 <- df4[!is.na(df4$log2FoldChange),]

# filter out cell cycle genes
df1_filtered <- df1 %>%
  filter(!gene_name %in% cc.genes$s.genes)
df1_filtered <- df1 %>%
  filter(!gene_name %in% cc.genes$g2m.genes)
df1_filtered <- df1 %>%
  filter(!gene_name %in% cc.genes.updated.2019$s.genes)
df1_filtered <- df1 %>%
  filter(!gene_name %in% cc.genes.updated.2019$g2m.genes)

df2_filtered <- df2 %>%
  filter(!gene_name %in% cc.genes$s.genes)
df2_filtered <- df2 %>%
  filter(!gene_name %in% cc.genes$g2m.genes)
df2_filtered <- df2 %>%
  filter(!gene_name %in% cc.genes.updated.2019$s.genes)
df2_filtered <- df2 %>%
  filter(!gene_name %in% cc.genes.updated.2019$g2m.genes)

df3_filtered <- df3 %>%
  filter(!gene_name %in% cc.genes$s.genes)
df3_filtered <- df3 %>%
  filter(!gene_name %in% cc.genes$g2m.genes)
df3_filtered <- df3 %>%
  filter(!gene_name %in% cc.genes.updated.2019$s.genes)
df3_filtered <- df3 %>%
  filter(!gene_name %in% cc.genes.updated.2019$g2m.genes)

df4_filtered <- df4 %>%
  filter(!gene_name %in% cc.genes$s.genes)
df4_filtered <- df4 %>%
  filter(!gene_name %in% cc.genes$g2m.genes)
df4_filtered <- df4 %>%
  filter(!gene_name %in% cc.genes.updated.2019$s.genes)
df4_filtered <- df4 %>%
  filter(!gene_name %in% cc.genes.updated.2019$g2m.genes)

# take only significantly upregulated genes (log2fc > 1, padj < 0.05)
df1_filtered_up <- df1_filtered %>%
  filter(log2FoldChange >= 1) %>%
  filter(padj < 0.05)
  
df2_filtered_up <- df2_filtered %>%
  filter(log2FoldChange >= 1) %>%
  filter(padj < 0.05)

df3_filtered_up <- df3_filtered %>%
  filter(log2FoldChange >= 1) %>%
  filter(padj < 0.05)

df4_filtered_up <- df4_filtered %>%
  filter(log2FoldChange >= 1) %>%
  filter(padj < 0.05)

# arrange in ascending order by stat column and take only genes significantly upregulated in controls (log2fc < -1, padj < 0.05)
df1_filtered_dn <- df1_filtered %>%
  filter(log2FoldChange <= -1) %>%
  filter(padj < 0.05)

df2_filtered_dn <- df2_filtered %>%
  filter(log2FoldChange <= -1) %>%
  filter(padj < 0.05)

df3_filtered_dn <- df3_filtered %>%
  filter(log2FoldChange <= -1) %>%
  filter(padj < 0.05)

df4_filtered_dn <- df4_filtered %>%
  filter(log2FoldChange <= -1) %>%
  filter(padj < 0.05)

cohort1_ptcl_genes_up <- list(df1_filtered_up$gene_name)
cohort1_ctrl_genes_up <- list(df1_filtered_dn$gene_name)
cohort2_ptcl_genes_up <- list(df2_filtered_up$gene_name)
cohort2_ctrl_genes_up <- list(df2_filtered_dn$gene_name)
cohort2_thym_genes_up <- list(df3_filtered_up$gene_name)
cohort2_ctrlvsthym_genes_up <- list(df3_filtered_dn$gene_name)
cohort2_ptclvsthym_genes_up <- list(df4_filtered_up$gene_name)
cohort2_thymvsPTCL_genes_up <- list(df4_filtered_dn$gene_name)

# perform module scoring
seu <- AddModuleScore(seu, features = cohort1_ptcl_genes_up, name = "CD4PTCLCohort1_Score")
seu <- AddModuleScore(seu, features = cohort1_ctrl_genes_up, name = "CD4CTRLCohort1_Score")
seu <- AddModuleScore(seu, features = cohort2_ptcl_genes_up, name = "CD4PTCLCohort2_Score")
seu <- AddModuleScore(seu, features = cohort2_ctrl_genes_up, name = "CD4CTRLCohort2_Score")
seu <- AddModuleScore(seu, features = cohort2_thym_genes_up, name = "CD4THYMCohort2_Score")
seu <- AddModuleScore(seu, features = cohort2_ctrlvsthym_genes_up, name = "CD4LNCTRLvsTHYMCohort2_Score")
seu <- AddModuleScore(seu, features = cohort2_ptclvsthym_genes_up, name = "CD4PTCLvsThymCohort2_Score")
seu <- AddModuleScore(seu, features = cohort2_thymvsPTCL_genes_up, name = "CD4THYMvsPTCLCohort2_Score")

p1 <- FeaturePlot(seu, features = "CD4PTCLCohort1_Score1") + scale_color_gradientn(colours = rev(brewer.pal(name = "Spectral", n=11)))
p2 <- FeaturePlot(seu, features = "CD4CTRLCohort1_Score1") + scale_color_gradientn(colours = rev(brewer.pal(name = "Spectral", n=11)))
p1+p2

p3 <- FeaturePlot(seu, features = "CD4PTCLCohort2_Score1") + scale_color_gradientn(colours = rev(brewer.pal(name = "Spectral", n=11)))
p4 <- FeaturePlot(seu, features = "CD4CTRLCohort2_Score1") + scale_color_gradientn(colours = rev(brewer.pal(name = "Spectral", n=11)))
p3+p4

p5 <- FeaturePlot(seu, features = "CD4THYMCohort2_Score1") + scale_color_gradientn(colours = rev(brewer.pal(name = "Spectral", n=11)))
p6 <- FeaturePlot(seu, features = "CD4LNCTRLvsTHYMCohort2_Score1") + scale_color_gradientn(colours = rev(brewer.pal(name = "Spectral", n=11)))
p5+p6

p7 <- FeaturePlot(seu, features = "CD4PTCLvsThymCohort2_Score1") + scale_color_gradientn(colours = rev(brewer.pal(name = "Spectral", n=11)))
p8 <- FeaturePlot(seu, features = "CD4THYMvsPTCLCohort2_Score1") + scale_color_gradientn(colours = rev(brewer.pal(name = "Spectral", n=11)))
p7+p8

Citations

sessionInfo()

## R version 4.4.0 (2024-04-24 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 22631)
## 
## Matrix products: default
## 
## 
## locale:
## [1] LC_COLLATE=English_United States.utf8 
## [2] LC_CTYPE=English_United States.utf8   
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## 
## time zone: America/Denver
## tzcode source: internal
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] Seurat_5.2.1                SeuratObject_5.0.2         
##  [3] sp_2.1-4                    gplots_3.2.0               
##  [5] limma_3.60.6                GSEABase_1.66.0            
##  [7] graph_1.82.0                annotate_1.82.0            
##  [9] XML_3.99-0.18               AnnotationDbi_1.66.0       
## [11] GSVA_1.52.3                 ggplot2_3.5.1              
## [13] knitr_1.49                  readr_2.1.5                
## [15] dplyr_1.1.4                 pheatmap_1.0.12            
## [17] RColorBrewer_1.1-3          DESeq2_1.44.0              
## [19] SummarizedExperiment_1.34.0 Biobase_2.64.0             
## [21] MatrixGenerics_1.16.0       matrixStats_1.5.0          
## [23] GenomicRanges_1.56.2        GenomeInfoDb_1.40.1        
## [25] IRanges_2.38.1              S4Vectors_0.42.1           
## [27] BiocGenerics_0.50.0        
## 
## loaded via a namespace (and not attached):
##   [1] RcppAnnoy_0.0.22            splines_4.4.0              
##   [3] later_1.4.1                 bitops_1.0-9               
##   [5] tibble_3.2.1                polyclip_1.10-7            
##   [7] fastDummies_1.7.5           lifecycle_1.0.4            
##   [9] globals_0.16.3              lattice_0.22-6             
##  [11] MASS_7.3-60.2               magrittr_2.0.3             
##  [13] plotly_4.10.4               sass_0.4.9                 
##  [15] rmarkdown_2.29              jquerylib_0.1.4            
##  [17] yaml_2.3.10                 httpuv_1.6.15              
##  [19] sctransform_0.4.1           spam_2.11-1                
##  [21] spatstat.sparse_3.1-0       reticulate_1.40.0          
##  [23] cowplot_1.1.3               pbapply_1.7-2              
##  [25] DBI_1.2.3                   abind_1.4-8                
##  [27] zlibbioc_1.50.0             Rtsne_0.17                 
##  [29] purrr_1.0.2                 GenomeInfoDbData_1.2.12    
##  [31] ggrepel_0.9.6               irlba_2.3.5.1              
##  [33] spatstat.utils_3.1-2        listenv_0.9.1              
##  [35] goftest_1.2-3               RSpectra_0.16-2            
##  [37] spatstat.random_3.3-2       fitdistrplus_1.2-2         
##  [39] parallelly_1.42.0           codetools_0.2-20           
##  [41] DelayedArray_0.30.1         tidyselect_1.2.1           
##  [43] UCSC.utils_1.0.0            farver_2.1.2               
##  [45] ScaledMatrix_1.12.0         spatstat.explore_3.3-4     
##  [47] jsonlite_1.8.9              progressr_0.15.1           
##  [49] ggridges_0.5.6              survival_3.5-8             
##  [51] tools_4.4.0                 ica_1.0-3                  
##  [53] Rcpp_1.0.14                 glue_1.8.0                 
##  [55] gridExtra_2.3               SparseArray_1.4.8          
##  [57] xfun_0.49                   HDF5Array_1.32.1           
##  [59] withr_3.0.2                 fastmap_1.2.0              
##  [61] rhdf5filters_1.16.0         caTools_1.18.3             
##  [63] digest_0.6.35               rsvd_1.0.5                 
##  [65] R6_2.5.1                    mime_0.12                  
##  [67] colorspace_2.1-1            scattermore_1.2            
##  [69] tensor_1.5                  gtools_3.9.5               
##  [71] spatstat.data_3.1-4         RSQLite_2.3.9              
##  [73] tidyr_1.3.1                 generics_0.1.3             
##  [75] data.table_1.16.4           httr_1.4.7                 
##  [77] htmlwidgets_1.6.4           S4Arrays_1.4.1             
##  [79] uwot_0.2.2                  pkgconfig_2.0.3            
##  [81] gtable_0.3.6                blob_1.2.4                 
##  [83] lmtest_0.9-40               SingleCellExperiment_1.26.0
##  [85] XVector_0.44.0              htmltools_0.5.8.1          
##  [87] dotCall64_1.2               scales_1.3.0               
##  [89] png_0.1-8                   SpatialExperiment_1.14.0   
##  [91] spatstat.univar_3.1-1       rstudioapi_0.17.1          
##  [93] tzdb_0.4.0                  reshape2_1.4.4             
##  [95] rjson_0.2.23                nlme_3.1-164               
##  [97] cachem_1.1.0                zoo_1.8-12                 
##  [99] rhdf5_2.48.0                stringr_1.5.1              
## [101] KernSmooth_2.23-22          parallel_4.4.0             
## [103] miniUI_0.1.1.1              pillar_1.10.1              
## [105] grid_4.4.0                  vctrs_0.6.5                
## [107] RANN_2.6.2                  promises_1.3.2             
## [109] BiocSingular_1.20.0         beachmat_2.20.0            
## [111] xtable_1.8-4                cluster_2.1.6              
## [113] evaluate_1.0.3              magick_2.8.5               
## [115] cli_3.6.2                   locfit_1.5-9.10            
## [117] compiler_4.4.0              rlang_1.1.3                
## [119] crayon_1.5.3                future.apply_1.11.3        
## [121] labeling_0.4.3              plyr_1.8.9                 
## [123] stringi_1.8.4               deldir_2.0-4               
## [125] viridisLite_0.4.2           BiocParallel_1.38.0        
## [127] munsell_0.5.1               Biostrings_2.72.1          
## [129] lazyeval_0.2.2              spatstat.geom_3.3-5        
## [131] Matrix_1.7-0                RcppHNSW_0.6.0             
## [133] hms_1.1.3                   patchwork_1.3.0            
## [135] sparseMatrixStats_1.16.0    bit64_4.6.0-1              
## [137] future_1.34.0               Rhdf5lib_1.26.0            
## [139] KEGGREST_1.44.1             statmod_1.5.0              
## [141] shiny_1.10.0                ROCR_1.0-11                
## [143] igraph_2.1.4                memoise_2.0.1              
## [145] bslib_0.8.0                 bit_4.5.0.1

citation()

## To cite R in publications use:
## 
##   R Core Team (2024). _R: A Language and Environment for Statistical
##   Computing_. R Foundation for Statistical Computing, Vienna, Austria.
##   <https://www.R-project.org/>.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {R: A Language and Environment for Statistical Computing},
##     author = {{R Core Team}},
##     organization = {R Foundation for Statistical Computing},
##     address = {Vienna, Austria},
##     year = {2024},
##     url = {https://www.R-project.org/},
##   }
## 
## We have invested a lot of time and effort in creating R, please cite it
## when using it for data analysis. See also 'citation("pkgname")' for
## citing R packages.

Comparing the GEP of Canine CD4+ PTCL to the Seurat Transcriptomic Atlas of Normal Canine Thymocytes and T cells

Eileen Owens

2025-02-03