Oh S, Geistlinger L, Ramos M, Blankenberg D, van den Beek M, Taroni JN, Carey VJ, Waldron L, Davis S. GenomicSuperSignature facilitates interpretation of RNA-seq experiments through robust, efficient comparison to public databases. Nature Communications 2022;13: 3695. doi: 10.1038/s41467-022-31411-3
# Install GenomicSuperSignature from GitHub only when it is not already
# available, so re-running this script does not reinstall over the network
# every time. BiocManager itself is bootstrapped from CRAN if it is missing.
if (!requireNamespace("GenomicSuperSignature", quietly = TRUE)) {
  if (!requireNamespace("BiocManager", quietly = TRUE)) {
    install.packages("BiocManager")
  }
  BiocManager::install("shbrief/GenomicSuperSignature")
}
# Attach the packages used below without printing their startup messages.
suppressPackageStartupMessages({
  library(GenomicSuperSignature)
  library(bcellViper)
  library(dplyr)
  library(AnVIL)
})
## getModel
# Fetch the "PLIERpriors" and "C2" RAVmodels; system.time() reports how long
# each fetch takes.
system.time({
  RAVmodel <- getModel("PLIERpriors")
})
## [1] "downloading"
## user system elapsed
## 6.824 1.919 16.506
system.time({
  RAVmodel_C2 <- getModel("C2")
})
## [1] "downloading"
## user system elapsed
## 6.878 1.849 18.133
## Check the model
# Printing the object shows its class, dimensions, and metadata slots.
RAVmodel
## class: PCAGenomicSignatures
## dim: 13934 4764
## metadata(8): cluster size ... version geneSets
## assays(1): RAVindex
## rownames(13934): CASKIN1 DDX3Y ... CTC-457E21.9 AC007966.1
## rowData names(0):
## colnames(4764): RAV1 RAV2 ... RAV4763 RAV4764
## colData names(4): RAV studies silhouetteWidth gsea
## trainingData(2): PCAsummary MeSH
## trainingData names(536): DRP000987 SRP059172 ... SRP164913 SRP188526
# Report which gene sets this model was annotated with.
geneSets(RAVmodel)
## [1] "Three priors from PLIER (bloodCellMarkersIRISDMAP, svmMarkers, and canonicalPathways)"
## Input data
# Load the bcellViper example data; `dset` is presumably provided by this
# data() call, since it is not defined anywhere else in the script.
data(bcellViper)
dset
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 6249 features, 211 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames: GSM44075 GSM44078 ... GSM44302 (211 total)
## varLabels: sampleID description detailed_description
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## Annotation:
## validate
# Validate the dataset against the RAVmodel and time the call.
system.time({
  val_all <- validate(dset, RAVmodel)
})
## user system elapsed
## 0.395 0.125 0.594
# Plot the validation result as a heatmap table.
heatmapTable(val_all, RAVmodel)
# Keep only the indices of the validated RAVs for the lookups that follow.
val_ind <- validatedSignatures(val_all, RAVmodel, indexOnly = TRUE)
## MeSH terms
# To draw a word cloud for every validated RAV rather than a single one:
# for (i in val_ind) {drawWordcloud(RAVmodel, ind = i)}
drawWordcloud(RAVmodel, ind = val_ind[5])
## GSEA
# Enriched pathways for the same RAV, printed as a plain data frame.
as.data.frame(subsetEnrichedPathways(RAVmodel, val_ind[5]))
## RAV1139.Description
## Up_1 DMAP_ERY3
## Up_2 KEGG_ALZHEIMERS_DISEASE
## Up_3 REACTOME_POST_TRANSLATIONAL_PROTEIN_MODIFICATION
## Up_4 REACTOME_APOPTOSIS
## Up_5 KEGG_HUNTINGTONS_DISEASE
## Up_6 REACTOME_MRNA_PROCESSING
## Up_7 REACTOME_HOST_INTERACTIONS_OF_HIV_FACTORS
## Up_8 PID_E2F_PATHWAY
## Up_9 KEGG_PYRIMIDINE_METABOLISM
## Up_10 REACTOME_CDK_MEDIATED_PHOSPHORYLATION_AND_REMOVAL_OF_CDC6
## Relevant studies
# List the studies (and their PCs) that belong to the fifth validated RAV;
# studyTitle = TRUE adds each study's title to the printed table.
findStudiesInCluster(RAVmodel, val_ind[5], studyTitle = TRUE)
## studyName PC Variance explained (%)
## 1 SRP028567 2 14.76
## 2 SRP059057 3 7.45
## 3 SRP095405 1 37.27
## 4 SRP144647 1 32.82
## title
## 1 RNA-Seq analysis of primary AML specimens exposed to AhR modulating agents
## 2 Transcriptome analysis of CD4+ T cells reveals imprint of BACH2 and IFN? regulation
## 3 Identification of genes induced by NOTCH1 in a chronic lymphocytic leukaemia (CLL) cell line and tracking of these genes in primary CLL patients
## 4 Transcriptomes from naïve CD4+ T-cells from infants and children with and without food allergy [RNA-seq]
## Misc metadata
# Summarise the same RAV: cluster size, silhouette width, number of enriched
# pathways, and its member studies/PCs.
getRAVInfo(RAVmodel, val_ind[5])
## $clusterSize
## [1] 4
##
## $silhouetteWidth
## [1] 0.09
##
## $enrichedPathways
## [1] 56
##
## $members
## studyName PC Variance explained (%)
## 1 SRP028567 2 14.76
## 2 SRP059057 3 7.45
## 3 SRP095405 1 37.27
## 4 SRP144647 1 32.82
# Look up a single study by its accession: the printed list gives the study
# title, the study size, and the RAV assigned to each of its PCs.
getStudyInfo(RAVmodel, "SRP095405")
## $studyTitle
## [1] "Identification of genes induced by NOTCH1 in a chronic lymphocytic leukaemia (CLL) cell line and tracking of these genes in primary CLL patients"
##
## $studySize
## [1] 52
##
## $RAVs
## PC RAV Variance explained (%)
## 1 1 1139 37.27
## 2 2 1046 8.08
## 3 3 1554 5.13
## 4 4 203 4.23
## 5 5 564 2.34
## 6 6 1602 2.16
## 7 7 1601 1.93
## 8 8 1604 1.88
## 9 9 703 1.56
## 10 10 2502 1.50
## 11 11 1605 1.33
## 12 12 2878 1.28
## 13 13 1438 1.23
## 14 14 2986 1.19
## 15 15 2242 1.14
## 16 16 1605 1.13
## 17 17 2987 1.08
## 18 18 723 1.06
## 19 19 261 1.01
## 20 20 2988 1.00
The TCGA_validationDatasets.rda file loaded below was prepared using a separate script.
## The data file is stored in a Google Cloud bucket
## log-transformed raw data
dir <- "gs://genomic_super_signature"
fpath <- file.path(dir, "TCGA_validationDatasets.rda")
## Load the data
# gsutil_pipe() is provided by the AnVIL package; load() reads the .rda from
# the connection it returns.
load(AnVIL::gsutil_pipe(fpath))
## Panel B
# Validate the BRCA dataset against the C2 model and time the call.
brca <- TCGA_validationDatasets[["BRCA"]]
system.time({
  val_brca <- validate(brca, RAVmodel_C2)
})
## user system elapsed
## 1.029 0.158 1.195
heatmapTable(val_brca, RAVmodel_C2)
## Panel C
# Word cloud for RAV221.
drawWordcloud(RAVmodel, ind = 221)
## Panel D
# Studies contributing to RAV221, with their titles.
findStudiesInCluster(RAVmodel, 221, studyTitle = TRUE)
## studyName PC Variance explained (%)
## 1 ERP016798 2 8.25
## 2 SRP023262 9 1.07
## 3 SRP111343 3 4.46
## title
## 1 Whole transcriptome profiling of 63 breast cancer tumours
## 2 A shared transcriptional program in early breast neoplasias despite genetic and clinical distinctions
## 3 RNAseq analysis of chemotherapy and radiation therapy-naïve breast tumors
## Panel E
# Enriched pathways for RAV221, including their NES values, as a data frame.
as.data.frame(subsetEnrichedPathways(RAVmodel, 221, include_nes = TRUE))
## RAV221.Description RAV221.NES
## Up_1 REACTOME_CYTOKINE_SIGNALING_IN_IMMUNE_SYSTEM -2.272304
## Up_2 REACTOME_ADAPTIVE_IMMUNE_SYSTEM -2.287344
## Up_3 IRIS_DendriticCell-LPSstimulated -2.304916
## Up_4 REACTOME_MITOTIC_G1_G1_S_PHASES -2.368176
## Up_5 IRIS_Monocyte-Day7 -2.407126
## Up_6 REACTOME_MITOTIC_M_M_G1_PHASES -2.450146
## Up_7 REACTOME_DNA_REPLICATION -2.491313
## Up_8 REACTOME_CELL_CYCLE -2.499352
## Up_9 REACTOME_CELL_CYCLE_MITOTIC -2.546613
## Up_10 KEGG_CELL_CYCLE -2.548161
# annotateRAV(RAVmodel, 221, n = 10)
E-MTAB-2452 dataset
## The data file is stored in a Google Cloud bucket and read via the AnVIL package
dir <- "gs://genomic_super_signature"
fpath <- file.path(dir, "E-MTAB-2452_hugene11st_SCANfast_with_GeneSymbol.pcl")
x <- gsutil_pipe(fpath, open = "rb")
## Load the data
# The connection was opened explicitly above, so it must be closed explicitly
# too; `finally` releases it even when parsing fails.
annot.dat <- tryCatch(
  readr::read_tsv(x, show_col_types = FALSE) %>% as.data.frame,
  finally = close(x)
)
# NOTE(review): column 2 is presumably the gene symbol column; the matrix row
# names are set from `GeneSymbol` again below - confirm both refer to the same
# column.
rownames(annot.dat) <- annot.dat[, 2]
# Columns from the third onward are the sample columns (names end in .CEL in
# the preview below); the first two columns are left out of the matrix.
dataset <- as.matrix(annot.dat[, 3:ncol(annot.dat)])
rownames(dataset) <- annot.dat$GeneSymbol
# Preview the top-left corner of the expression matrix.
dataset[1:3, 1:3]
## CD14_triad0058_1.CEL CD14_triad0058_2.CEL CD14_triad0058_3.CEL
## A1BG 1.154439e-01 0.17987252 0.17031624
## NAT2 -7.464545e-06 -0.09915562 -0.02097987
## ADA 5.338218e-01 0.67212096 0.76375038
# Validate the E-MTAB-2452 matrix against the RAVmodel and time the call.
# This overwrites the `val_all` computed for `dset` earlier in the script.
system.time({
  val_all <- validate(dataset, RAVmodel)
})
## user system elapsed
## 0.842 0.052 0.896
# Full annotation table (NES, p-value, q-value) for PC2.
annotatePC(2, val_all, RAVmodel, simplify = FALSE)
## $`PC2-RAV1552`
## Description NES pvalue qvalues
## 1 IRIS_Monocyte-Day0 2.586697 1e-10 2.680702e-09
## 2 IRIS_DendriticCell-Control 2.433219 1e-10 2.680702e-09
## 3 DMAP_MONO2 2.376574 1e-10 2.680702e-09
## 4 IRIS_Monocyte-Day7 2.366122 1e-10 2.680702e-09
## 5 SVM Monocytes 2.314221 1e-10 2.680702e-09
# Simplified side-by-side annotation of PC1 to PC3, with the score cutoff set to 0.
annotatePC(1:3, val_all, RAVmodel, scoreCutoff = 0)
## PC1.RAV23 PC2.RAV1552
## 1 SVM T cells CD8 IRIS_Monocyte-Day0
## 2 SVM T cells CD4 naive IRIS_DendriticCell-Control
## 3 SVM T cells follicular helper DMAP_MONO2
## 4 SVM T cells regulatory (Tregs) IRIS_Monocyte-Day7
## 5 SVM T cells gamma delta SVM Monocytes
## PC3.RAV1387
## 1 MIPS_55S_RIBOSOME_MITOCHONDRIAL
## 2 REACTOME_RESPIRATORY_ELECTRON_TRANSPORT_ATP_S...
## 3 MIPS_39S_RIBOSOMAL_SUBUNIT_MITOCHONDRIAL
## 4 REACTOME_TCA_CYCLE_AND_RESPIRATORY_ELECTRON_T...
## 5 REACTOME_RESPIRATORY_ELECTRON_TRANSPORT
Label each sample with its known cell type.
# Take the part of each sample name before the first underscore as its marker,
# then expand the marker into a "marker,cell type" label.
cellType <- sub("_.*$", "", colnames(dataset))
cellType <- gsub("CD4", "CD4,T cell", cellType, fixed = TRUE)
cellType <- gsub("CD14", "CD14,monocyte", cellType, fixed = TRUE)
cellType <- gsub("CD16", "CD16,neutrophil", cellType, fixed = TRUE)
# Name the labels by sample so they can be matched back to the columns.
cellType <- setNames(cellType, colnames(dataset))
# Annotated PCA plot of PC2 vs PC3, coloured by the cell-type labels.
plotAnnotatedPCA(
  dataset, RAVmodel, c(2, 3), val_all,
  scoreCutoff = 0.3,
  color_by = cellType,
  color_lab = "Cell Type"
)
# Record the R version, platform, and package versions that produced the
# outputs above, for reproducibility.
sessionInfo()
## R version 4.1.2 (2021-11-01)
## Platform: aarch64-apple-darwin20 (64-bit)
## Running under: macOS Monterey 12.2.1
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.1-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.1-arm64/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] AnVIL_1.9.5 dplyr_1.0.9
## [3] bcellViper_1.30.0 GenomicSuperSignature_1.5.3
## [5] SummarizedExperiment_1.24.0 Biobase_2.54.0
## [7] GenomicRanges_1.46.1 GenomeInfoDb_1.30.1
## [9] IRanges_2.28.0 S4Vectors_0.32.4
## [11] BiocGenerics_0.40.0 MatrixGenerics_1.6.0
## [13] matrixStats_0.62.0 BiocStyle_2.22.0
##
## loaded via a namespace (and not attached):
## [1] colorspace_2.0-3 ggsignif_0.6.3 rjson_0.2.21
## [4] ellipsis_0.3.2 flextable_0.7.2 circlize_0.4.15
## [7] futile.logger_1.4.3 XVector_0.34.0 base64enc_0.1-3
## [10] GlobalOptions_0.1.2 clue_0.3-61 rstudioapi_0.13
## [13] farver_2.1.1 ggpubr_0.4.0 remotes_2.4.2
## [16] bit64_4.0.5 fansi_1.0.3 xml2_1.3.3
## [19] codetools_0.2-18 doParallel_1.0.17 cachem_1.0.6
## [22] knitr_1.39 jsonlite_1.8.0 broom_1.0.0
## [25] cluster_2.1.3 dbplyr_2.2.1 png_0.1-7
## [28] BiocManager_1.30.18 readr_2.1.2 compiler_4.1.2
## [31] httr_1.4.3 backports_1.4.1 assertthat_0.2.1
## [34] Matrix_1.4-1 fastmap_1.1.0 cli_3.3.0
## [37] formatR_1.12 htmltools_0.5.3 tools_4.1.2
## [40] gtable_0.3.0 glue_1.6.2 GenomeInfoDbData_1.2.7
## [43] rappdirs_0.3.3 Rcpp_1.0.9 rapiclient_0.1.3
## [46] carData_3.0-5 jquerylib_0.1.4 vctrs_0.4.1
## [49] iterators_1.0.14 xfun_0.31 stringr_1.4.0
## [52] ps_1.7.1 lifecycle_1.0.1 irlba_2.3.5
## [55] rstatix_0.7.0 zlibbioc_1.40.0 scales_1.2.0
## [58] vroom_1.5.7 hms_1.1.1 parallel_4.1.2
## [61] lambda.r_1.2.4 RColorBrewer_1.1-3 ComplexHeatmap_2.10.0
## [64] yaml_2.3.5 curl_4.3.2 memoise_2.0.1
## [67] ggplot2_3.3.6 gdtools_0.2.4 sass_0.4.2
## [70] stringi_1.7.8 RSQLite_2.2.15 highr_0.9
## [73] foreach_1.5.2 filelock_1.0.2 zip_2.2.0
## [76] shape_1.4.6 systemfonts_1.0.4 rlang_1.0.4
## [79] pkgconfig_2.0.3 bitops_1.0-7 evaluate_0.15
## [82] lattice_0.20-45 purrr_0.3.4 labeling_0.4.2
## [85] cowplot_1.1.1 processx_3.7.0 bit_4.0.4
## [88] tidyselect_1.1.2 magrittr_2.0.3 bookdown_0.27
## [91] R6_2.5.1 magick_2.7.3 generics_0.1.3
## [94] DelayedArray_0.20.0 DBI_1.1.3 pillar_1.8.0
## [97] abind_1.4-5 RCurl_1.98-1.7 tibble_3.1.8
## [100] crayon_1.5.1 car_3.1-0 futile.options_1.0.1
## [103] uuid_1.1-0 wordcloud_2.6 utf8_1.2.2
## [106] BiocFileCache_2.2.1 officer_0.4.3 tzdb_0.3.0
## [109] rmarkdown_2.14 GetoptLong_1.0.5 grid_4.1.2
## [112] data.table_1.14.2 callr_3.7.1 blob_1.2.3
## [115] webshot_0.5.3 digest_0.6.29 tidyr_1.2.0
## [118] munsell_0.5.0 bslib_0.4.0