The purpose of this script it to utilize gene set variation analysis (GSVA) and module scoring to compare the gene expression profile of canine CD4+ PTCL to the gene signatures of normal canine thymocytes and nodal T cells at various stages of development derived from a single-cell transcriptomic atlas.
GSVA evaluates the enrichment of each PTCL sample for gene signatures derived from a single-cell transcriptomic atlas of normal canine thymocytes and nodal T cells at various stages of development.
Input: Bulk RNA-seq counts normalized and variance stabilized transformed by DESeq2, and a gmt file of the top 100 significantly upregulated genes in each cluster of the transcriptomic atlas (based on Seurat::FindAllMarkers).
setwd("C:/Users/edlarsen/Documents/240828_scRNAseq/PTCL_GSVA_GSEA")
# GMT file
gmt <- getGmt("gene-signatures-of-seurat-integrated-ln-and-thym-clusters-top100.gmt", geneIdType = SymbolIdentifier())
# VST counts
## Cohort 1
vsd1 <- read.csv("VST_NormalizedCounts_CD4sOnly_Cohort1.csv")
vsd1 <- vsd1[!duplicated(vsd1$gene_name), ] # Remove rows with duplicate gene symbols.
rownames(vsd1) <- vsd1$gene_name # make gene name rownames
vsd1 <- dplyr::select(vsd1, -c("X", "probe_id", "gene_name", "description")) # Remove all columns that don't contain counts
# filter out cell cycle genes
vsd1 <- vsd1 %>%
filter(!row.names(vsd1) %in% cc.genes$s.genes)
vsd1 <- vsd1 %>%
filter(!row.names(vsd1) %in% cc.genes$g2m.genes)
vsd1_matrix <- data.matrix(vsd1) # Convert to a matrix prior to analysis.
## Cohort 2
vsd2 <- read.csv("VST_NormalizedCounts_Cohort2.csv")
vsd2 <- vsd2[!duplicated(vsd2$gene_name), ] # Remove rows with duplicate gene symbols.
rownames(vsd2) <- vsd2$gene_name # make gene name rownames
vsd2 <- dplyr::select(vsd2, -c("X", "probe_id", "gene_name", "description")) # Remove all columns that don't contain counts
# filter out cell cycle genes
vsd2 <- vsd2 %>%
filter(!row.names(vsd2) %in% cc.genes$s.genes)
vsd2 <- vsd2 %>%
filter(!row.names(vsd2) %in% cc.genes$g2m.genes)
vsd2_matrix <- data.matrix(vsd2) # Convert to a matrix prior to analysis.
# metadata
## Cohort 1
metadata1 <- read.table("metadata_Cohort1.txt")
colnames(metadata1) <- c("avery_number", "sample_name", "phenotype")
rownames(metadata1) <- metadata1$sample_name # make sample names the rownames
metadata1 <- dplyr::select(metadata1, c("phenotype")) # Select columns in metadata1 to be annotated on the GSVA heatmap
metadata1 <- metadata1[row.names(metadata1) != "CF21", , drop = FALSE]
## keep only CD4 PTCL and CD4 CTRL samples from Cohort 1
keepGroups <- c("CD4_PTCL", "CD4_CTRL")
metadata1 <- metadata1 %>%
filter(phenotype %in% keepGroups)
keepList <- rownames(metadata1)
vsd1_matrix <- vsd1_matrix[, colnames(vsd1_matrix) %in% keepList]
## Cohort 2
metadata2 <- read.table("metadata_Cohort2.txt")
colnames(metadata2) <- c("avery_number", "sample_name", "phenotype")
rownames(metadata2) <- metadata2$sample_name # make sample names the rownames
metadata2 <- dplyr::select(metadata2, c("phenotype")) # Select columns in metadata2 to be annotated on the GSVA heatmap
seu <- readRDS(file = "../T_Cells/IntegThymAndLN_Annotated.Rdata")
kable(table(Idents(seu)))
Var1 | Freq |
---|---|
Naive_T_2 | 9962 |
DP_Thymocytes_1 | 7452 |
Activated_T | 5139 |
DP_Thymocytes_2 | 4602 |
SP_Thymocytes | 3349 |
Naive_T_1 | 3151 |
Proliferating_DP_Thymocytes | 2157 |
CD8_NK | 1962 |
ETP_DN_Thymocytes | 1962 |
Late_SP_Thymocytes_1 | 321 |
Late_SP_Thymocytes_2 | 254 |
DimPlot(seu,
reduction = "umap",
label = TRUE,
label.size = 3,
label.box = TRUE,
pt.size = 0.5) +
ggtitle("Canine Merged Lymph Node and Thymus T Cells, Resolution: 0.2")
# set parameters for GSVA
gsva_param1 <- gsvaParam(
vsd1_matrix,
gmt,
kcdf = "Gaussian", # Compute Gaussian-distributed scores
minSize = 10, # Minimum gene set size
maxSize = 500, # Maximum gene set size
maxDiff = TRUE,
absRanking = FALSE
)
gsva_param2 <- gsvaParam(
vsd2_matrix,
gmt,
kcdf = "Gaussian", # Compute Gaussian-distributed scores
minSize = 10, # Minimum gene set size
maxSize = 500, # Maximum gene set size
maxDiff = TRUE,
absRanking = FALSE
)
# perform GSVA
gsva_results1 <- gsva(
gsva_param1,
verbose=FALSE, # Don't print out the progress bar
)
gsva_results2 <- gsva(
gsva_param2,
verbose=FALSE, # Don't print out the progress bar
)
gsva_heatmap1 <- pheatmap::pheatmap(gsva_results1,
annotation_col = metadata1,
show_colnames = TRUE,
show_rownames = TRUE,
fontsize_row = 10,
cluster_cols = TRUE,
cluster_rows = TRUE,
cutree_rows = 2,
cutree_cols = 2,
main = "GSVA (Seurat). Input: Vst transformed normalized DESeq2 counts, Cohort 1\nClustering: Ward, Distance: Euclidean",
clustering_distance_rows = "euclidean",
clustering_distance_cols = "euclidean",
clustering_method = "ward.D2") +
scale_fill_gradient(c('dodgerblue', 'black', "yellow"))# Shrink the pathway labels a tad
gsva_heatmap2 <- pheatmap::pheatmap(gsva_results2,
annotation_col = metadata2,
show_colnames = TRUE,
show_rownames = TRUE,
fontsize_row = 10,
cluster_cols = TRUE,
cluster_rows = TRUE,
cutree_rows = 3,
cutree_cols = 3,
main = "GSVA (Seurat). Input: Vst transformed normalized DESeq2 counts, Cohort 2\nClustering: Ward, Distance: Euclidean",
clustering_distance_rows = "euclidean",
clustering_distance_cols = "euclidean",
clustering_method = "ward.D2") +
scale_fill_gradient(c('dodgerblue', 'black', "yellow"))# Shrink the pathway labels a tad
Map the gene expression profiles of CD4+ PTCL and control CD4+ nodal lymphocytes (from bulk RNA-seq) to the single-cell atlas of normal canine thymocyte and T-cell subsets. ## Data Input: DESeq2 results of CD4+ PTCL vs control CD4+ nodal lymphocytes.
# DESeq2 results
df1 = read.csv("C:/Users/edlarsen/Documents/Jan2025 PTCLRNASeq for Dissertation/Cohort_1/Output/CD4_PTCLvsCD4_CTRL_DESeq2res.csv", header=TRUE)
df2 = read.csv("C:/Users/edlarsen/Documents/Jan2025 PTCLRNASeq for Dissertation/Cohort_2/Output/CD4_PTCLvsCD4_LN_CTRL_DESeq2res.csv", header=TRUE)
df3 = read.csv("C:/Users/edlarsen/Documents/Jan2025 PTCLRNASeq for Dissertation/Cohort_2/Output/CD4THYM_CTRLvsCD4_LN_CTRL_DESeq2res.csv", header=TRUE)
df4 = read.csv("C:/Users/edlarsen/Documents/Jan2025 PTCLRNASeq for Dissertation/Cohort_2/Output/CD4_PTCLvsCD4THYM_CTRL_DESeq2res.csv", header=TRUE)
# omit entries with log2fc of 'NA'
df1 <- df1[!is.na(df1$log2FoldChange),]
df2 <- df2[!is.na(df2$log2FoldChange),]
df3 <- df3[!is.na(df3$log2FoldChange),]
df4 <- df4[!is.na(df4$log2FoldChange),]
# filter out cell cycle genes
df1_filtered <- df1 %>%
filter(!gene_name %in% cc.genes$s.genes)
df1_filtered <- df1 %>%
filter(!gene_name %in% cc.genes$g2m.genes)
df1_filtered <- df1 %>%
filter(!gene_name %in% cc.genes.updated.2019$s.genes)
df1_filtered <- df1 %>%
filter(!gene_name %in% cc.genes.updated.2019$g2m.genes)
df2_filtered <- df2 %>%
filter(!gene_name %in% cc.genes$s.genes)
df2_filtered <- df2 %>%
filter(!gene_name %in% cc.genes$g2m.genes)
df2_filtered <- df2 %>%
filter(!gene_name %in% cc.genes.updated.2019$s.genes)
df2_filtered <- df2 %>%
filter(!gene_name %in% cc.genes.updated.2019$g2m.genes)
df3_filtered <- df3 %>%
filter(!gene_name %in% cc.genes$s.genes)
df3_filtered <- df3 %>%
filter(!gene_name %in% cc.genes$g2m.genes)
df3_filtered <- df3 %>%
filter(!gene_name %in% cc.genes.updated.2019$s.genes)
df3_filtered <- df3 %>%
filter(!gene_name %in% cc.genes.updated.2019$g2m.genes)
df4_filtered <- df4 %>%
filter(!gene_name %in% cc.genes$s.genes)
df4_filtered <- df4 %>%
filter(!gene_name %in% cc.genes$g2m.genes)
df4_filtered <- df4 %>%
filter(!gene_name %in% cc.genes.updated.2019$s.genes)
df4_filtered <- df4 %>%
filter(!gene_name %in% cc.genes.updated.2019$g2m.genes)
# take only significantly upregulated genes (log2fc > 1, padj < 0.05)
df1_filtered_up <- df1_filtered %>%
filter(log2FoldChange >= 1) %>%
filter(padj < 0.05)
df2_filtered_up <- df2_filtered %>%
filter(log2FoldChange >= 1) %>%
filter(padj < 0.05)
df3_filtered_up <- df3_filtered %>%
filter(log2FoldChange >= 1) %>%
filter(padj < 0.05)
df4_filtered_up <- df4_filtered %>%
filter(log2FoldChange >= 1) %>%
filter(padj < 0.05)
# arrange in ascending order by stat column and take only genes significantly upregulated in controls (log2fc < -1, padj < 0.05)
df1_filtered_dn <- df1_filtered %>%
filter(log2FoldChange <= -1) %>%
filter(padj < 0.05)
df2_filtered_dn <- df2_filtered %>%
filter(log2FoldChange <= -1) %>%
filter(padj < 0.05)
df3_filtered_dn <- df3_filtered %>%
filter(log2FoldChange <= -1) %>%
filter(padj < 0.05)
df4_filtered_dn <- df4_filtered %>%
filter(log2FoldChange <= -1) %>%
filter(padj < 0.05)
cohort1_ptcl_genes_up <- list(df1_filtered_up$gene_name)
cohort1_ctrl_genes_up <- list(df1_filtered_dn$gene_name)
cohort2_ptcl_genes_up <- list(df2_filtered_up$gene_name)
cohort2_ctrl_genes_up <- list(df2_filtered_dn$gene_name)
cohort2_thym_genes_up <- list(df3_filtered_up$gene_name)
cohort2_ctrlvsthym_genes_up <- list(df3_filtered_dn$gene_name)
cohort2_ptclvsthym_genes_up <- list(df4_filtered_up$gene_name)
cohort2_thymvsPTCL_genes_up <- list(df4_filtered_dn$gene_name)
# perform module scoring
seu <- AddModuleScore(seu, features = cohort1_ptcl_genes_up, name = "CD4PTCLCohort1_Score")
seu <- AddModuleScore(seu, features = cohort1_ctrl_genes_up, name = "CD4CTRLCohort1_Score")
seu <- AddModuleScore(seu, features = cohort2_ptcl_genes_up, name = "CD4PTCLCohort2_Score")
seu <- AddModuleScore(seu, features = cohort2_ctrl_genes_up, name = "CD4CTRLCohort2_Score")
seu <- AddModuleScore(seu, features = cohort2_thym_genes_up, name = "CD4THYMCohort2_Score")
seu <- AddModuleScore(seu, features = cohort2_ctrlvsthym_genes_up, name = "CD4LNCTRLvsTHYMCohort2_Score")
seu <- AddModuleScore(seu, features = cohort2_ptclvsthym_genes_up, name = "CD4PTCLvsThymCohort2_Score")
seu <- AddModuleScore(seu, features = cohort2_thymvsPTCL_genes_up, name = "CD4THYMvsPTCLCohort2_Score")
p1 <- FeaturePlot(seu, features = "CD4PTCLCohort1_Score1") + scale_color_gradientn(colours = rev(brewer.pal(name = "Spectral", n=11)))
p2 <- FeaturePlot(seu, features = "CD4CTRLCohort1_Score1") + scale_color_gradientn(colours = rev(brewer.pal(name = "Spectral", n=11)))
p1+p2
p3 <- FeaturePlot(seu, features = "CD4PTCLCohort2_Score1") + scale_color_gradientn(colours = rev(brewer.pal(name = "Spectral", n=11)))
p4 <- FeaturePlot(seu, features = "CD4CTRLCohort2_Score1") + scale_color_gradientn(colours = rev(brewer.pal(name = "Spectral", n=11)))
p3+p4
p5 <- FeaturePlot(seu, features = "CD4THYMCohort2_Score1") + scale_color_gradientn(colours = rev(brewer.pal(name = "Spectral", n=11)))
p6 <- FeaturePlot(seu, features = "CD4LNCTRLvsTHYMCohort2_Score1") + scale_color_gradientn(colours = rev(brewer.pal(name = "Spectral", n=11)))
p5+p6
p7 <- FeaturePlot(seu, features = "CD4PTCLvsThymCohort2_Score1") + scale_color_gradientn(colours = rev(brewer.pal(name = "Spectral", n=11)))
p8 <- FeaturePlot(seu, features = "CD4THYMvsPTCLCohort2_Score1") + scale_color_gradientn(colours = rev(brewer.pal(name = "Spectral", n=11)))
p7+p8
sessionInfo()
## R version 4.4.0 (2024-04-24 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 22631)
##
## Matrix products: default
##
##
## locale:
## [1] LC_COLLATE=English_United States.utf8
## [2] LC_CTYPE=English_United States.utf8
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.utf8
##
## time zone: America/Denver
## tzcode source: internal
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] Seurat_5.2.1 SeuratObject_5.0.2
## [3] sp_2.1-4 gplots_3.2.0
## [5] limma_3.60.6 GSEABase_1.66.0
## [7] graph_1.82.0 annotate_1.82.0
## [9] XML_3.99-0.18 AnnotationDbi_1.66.0
## [11] GSVA_1.52.3 ggplot2_3.5.1
## [13] knitr_1.49 readr_2.1.5
## [15] dplyr_1.1.4 pheatmap_1.0.12
## [17] RColorBrewer_1.1-3 DESeq2_1.44.0
## [19] SummarizedExperiment_1.34.0 Biobase_2.64.0
## [21] MatrixGenerics_1.16.0 matrixStats_1.5.0
## [23] GenomicRanges_1.56.2 GenomeInfoDb_1.40.1
## [25] IRanges_2.38.1 S4Vectors_0.42.1
## [27] BiocGenerics_0.50.0
##
## loaded via a namespace (and not attached):
## [1] RcppAnnoy_0.0.22 splines_4.4.0
## [3] later_1.4.1 bitops_1.0-9
## [5] tibble_3.2.1 polyclip_1.10-7
## [7] fastDummies_1.7.5 lifecycle_1.0.4
## [9] globals_0.16.3 lattice_0.22-6
## [11] MASS_7.3-60.2 magrittr_2.0.3
## [13] plotly_4.10.4 sass_0.4.9
## [15] rmarkdown_2.29 jquerylib_0.1.4
## [17] yaml_2.3.10 httpuv_1.6.15
## [19] sctransform_0.4.1 spam_2.11-1
## [21] spatstat.sparse_3.1-0 reticulate_1.40.0
## [23] cowplot_1.1.3 pbapply_1.7-2
## [25] DBI_1.2.3 abind_1.4-8
## [27] zlibbioc_1.50.0 Rtsne_0.17
## [29] purrr_1.0.2 GenomeInfoDbData_1.2.12
## [31] ggrepel_0.9.6 irlba_2.3.5.1
## [33] spatstat.utils_3.1-2 listenv_0.9.1
## [35] goftest_1.2-3 RSpectra_0.16-2
## [37] spatstat.random_3.3-2 fitdistrplus_1.2-2
## [39] parallelly_1.42.0 codetools_0.2-20
## [41] DelayedArray_0.30.1 tidyselect_1.2.1
## [43] UCSC.utils_1.0.0 farver_2.1.2
## [45] ScaledMatrix_1.12.0 spatstat.explore_3.3-4
## [47] jsonlite_1.8.9 progressr_0.15.1
## [49] ggridges_0.5.6 survival_3.5-8
## [51] tools_4.4.0 ica_1.0-3
## [53] Rcpp_1.0.14 glue_1.8.0
## [55] gridExtra_2.3 SparseArray_1.4.8
## [57] xfun_0.49 HDF5Array_1.32.1
## [59] withr_3.0.2 fastmap_1.2.0
## [61] rhdf5filters_1.16.0 caTools_1.18.3
## [63] digest_0.6.35 rsvd_1.0.5
## [65] R6_2.5.1 mime_0.12
## [67] colorspace_2.1-1 scattermore_1.2
## [69] tensor_1.5 gtools_3.9.5
## [71] spatstat.data_3.1-4 RSQLite_2.3.9
## [73] tidyr_1.3.1 generics_0.1.3
## [75] data.table_1.16.4 httr_1.4.7
## [77] htmlwidgets_1.6.4 S4Arrays_1.4.1
## [79] uwot_0.2.2 pkgconfig_2.0.3
## [81] gtable_0.3.6 blob_1.2.4
## [83] lmtest_0.9-40 SingleCellExperiment_1.26.0
## [85] XVector_0.44.0 htmltools_0.5.8.1
## [87] dotCall64_1.2 scales_1.3.0
## [89] png_0.1-8 SpatialExperiment_1.14.0
## [91] spatstat.univar_3.1-1 rstudioapi_0.17.1
## [93] tzdb_0.4.0 reshape2_1.4.4
## [95] rjson_0.2.23 nlme_3.1-164
## [97] cachem_1.1.0 zoo_1.8-12
## [99] rhdf5_2.48.0 stringr_1.5.1
## [101] KernSmooth_2.23-22 parallel_4.4.0
## [103] miniUI_0.1.1.1 pillar_1.10.1
## [105] grid_4.4.0 vctrs_0.6.5
## [107] RANN_2.6.2 promises_1.3.2
## [109] BiocSingular_1.20.0 beachmat_2.20.0
## [111] xtable_1.8-4 cluster_2.1.6
## [113] evaluate_1.0.3 magick_2.8.5
## [115] cli_3.6.2 locfit_1.5-9.10
## [117] compiler_4.4.0 rlang_1.1.3
## [119] crayon_1.5.3 future.apply_1.11.3
## [121] labeling_0.4.3 plyr_1.8.9
## [123] stringi_1.8.4 deldir_2.0-4
## [125] viridisLite_0.4.2 BiocParallel_1.38.0
## [127] munsell_0.5.1 Biostrings_2.72.1
## [129] lazyeval_0.2.2 spatstat.geom_3.3-5
## [131] Matrix_1.7-0 RcppHNSW_0.6.0
## [133] hms_1.1.3 patchwork_1.3.0
## [135] sparseMatrixStats_1.16.0 bit64_4.6.0-1
## [137] future_1.34.0 Rhdf5lib_1.26.0
## [139] KEGGREST_1.44.1 statmod_1.5.0
## [141] shiny_1.10.0 ROCR_1.0-11
## [143] igraph_2.1.4 memoise_2.0.1
## [145] bslib_0.8.0 bit_4.5.0.1
citation()
## To cite R in publications use:
##
## R Core Team (2024). _R: A Language and Environment for Statistical
## Computing_. R Foundation for Statistical Computing, Vienna, Austria.
## <https://www.R-project.org/>.
##
## A BibTeX entry for LaTeX users is
##
## @Manual{,
## title = {R: A Language and Environment for Statistical Computing},
## author = {{R Core Team}},
## organization = {R Foundation for Statistical Computing},
## address = {Vienna, Austria},
## year = {2024},
## url = {https://www.R-project.org/},
## }
##
## We have invested a lot of time and effort in creating R, please cite it
## when using it for data analysis. See also 'citation("pkgname")' for
## citing R packages.