Contents

suppressPackageStartupMessages({
    library(SummarizedExperiment)
    library(ExperimentHub)
    library(tissueTreg)
    library(GenomicSuperSignature)
})
RAVmodel <- readRDS("~/GSS/GenomicSuperSignaturePaper/inst/extdata/RAVmodel_C2.rds")
RAVmodel_mmu <- readRDS("/mnt/STORE1/16tb_b/refinebio_mice/mouse_colon/refinebioRseq_mmu_colon_RAVmodel_MSigDB_mmu_C2_20220810.rds")

Input data

Use the data from tissueTreg package, RNA-seq data from tissue T regulatory cells from mice.

The package provides ready to use epigenomes (obtained from TWGBS) and transcriptomes (RNA-seq) from various tissues as obtained in the study (Delacher and Imbusch 2017, PMID: 28783152). Regulatory T cells (Treg cells) perform two distinct functions: they maintain self-tolerance, and they support organ homeostasis by differentiating into specialized tissue Treg cells. The underlying dataset characterises the epigenetic and transcriptomic modifications for specialized tissue Treg cells.

eh <- ExperimentHub()
se_rpkms <- eh[["EH1074"]]

Gene symbols for this dataset:

head(rowData(se_rpkms))
#> DataFrame with 6 rows and 1 column
#>                      id_symbol
#>                    <character>
#> ENSMUSG00000030105       Arl8b
#> ENSMUSG00000042428       Mgat3
#> ENSMUSG00000096054       Syne1
#> ENSMUSG00000046532          Ar
#> ENSMUSG00000020495        Smg8
#> ENSMUSG00000058979       Cecr5

Remove low-expressing genes

## Use the same default filter as `GSEABenchmarkeR::loadEData`
## Exclude genes with cpm < 2 in more than half of the samples
ind <- which(rowSums(assay(se_rpkms)<2) >=8)   # 12,126 low-expressing genes
filtered <- se_rpkms[-ind,]   # 8,235 gene x 15 samples

Log transformation to make the data follow normal distribution

dat <- log2(assay(filtered) + 1)
hist(dat)

Change EnsemblID to gene symbols

rownames(dat) <- rowData(filtered)$id_symbol
dat[1:6, 1:6]
#>       Fat-Treg-R1 Fat-Treg-R2 Fat-Treg-R3 Liver-Treg-R1 Liver-Treg-R2
#> Arl8b    4.906891    3.584963    4.321928      2.807355      3.584963
#> Smg8     1.584963    0.000000    1.584963      1.000000      1.584963
#> Cecr5    1.584963    2.321928    2.321928      3.000000      3.906891
#> Nol8     2.584963    2.807355    3.459432      3.807355      3.584963
#> Smox     2.321928    2.000000    2.321928      2.000000      2.321928
#> Nop9     2.807355    1.000000    1.584963      1.584963      1.000000
#>       Liver-Treg-R3
#> Arl8b      4.087463
#> Smg8       1.584963
#> Cecr5      3.000000
#> Nol8       3.000000
#> Smox       2.321928
#> Nop9       1.584963

Ortholog mapping

Check orthologs between human and mice:

library(Orthology.eg.db)

## Get the list of Gene IDs for mice
mKeys <- keys(Orthology.eg.db, "Mus.musculus")

## Map all the available mouse genes to human genes
mice2human <- select(Orthology.eg.db, mKeys, "Homo.sapiens", "Mus.musculus")
library(org.Hs.eg.db)
library(org.Mm.eg.db)

Mm_obj <- as.character(mice2human$Mus.musculus) %>% as.list
mice2human$Mm_GS <- EnrichmentBrowser::idMap(obj = Mm_obj, org = "mmu", 
                                             from = "ENTREZID", to = "SYMBOL")

Hs_obj <- as.character(mice2human$Homo.sapiens) %>% as.list
mice2human$Hs_GS <- EnrichmentBrowser::idMap(obj = Hs_obj, org = "hsa", 
                                             from = "ENTREZID", to = "SYMBOL")

# saveRDS(mice2human, "miceHumanOrthologs.rds")
## Convert mouse genes to human gene ortholog
Mm_genes <- rowData(filtered)$id_symbol   # 8,235 genes in filtered matrix

## Mouse genes that were used in the input and have human orthologs
cg_ind <- which(Mm_genes %in% mice2human$Mm_GS & !isEmpty(mice2human$Hs_GS)) 
cg <- Mm_genes[cg_ind]   # 7,184 mice genes have human ortholog info

## Make a named list, `orth_map`
orth_map <- as.list(mice2human$Hs_GS)   
sum(sapply(orth_map, isEmpty))   # 45 empty entries
#> [1] 45
names(orth_map) <- mice2human$Mm_GS   # 16,730 mice-human orthologs
## Subset the input data to the genes having human orthologs
dat_sub <- filtered[cg_ind,]
rowData(dat_sub)$Hs_ortholog <- orth_map[cg]

assay_sub <- assay(dat_sub)
rownames(assay_sub) <- rowData(dat_sub)$Hs_ortholog

Apply RAVmodel on re-labeled mouse data

val_all <- validate(log2(assay_sub + 1), RAVmodel)
#> Warning in irlba::irlba(as.matrix(t(dat[gene_common, ])), nv = 8, center =
#> TRUE, : You're computing too large a percentage of total singular values, use a
#> standard svd instead.
heatmapTable(val_all, RAVmodel, num.out = 5, swCutoff = 0)

ind <- 2244
drawWordcloud(RAVmodel, ind)

subsetEnrichedPathways(RAVmodel, ind, n = 5, both = TRUE) %>% as.data.frame()
#>                                          RAV2244.Description
#> Up_1                                  FEVR_CTNNB1_TARGETS_UP
#> Up_2                           WONG_ADULT_TISSUE_STEM_MODULE
#> Up_3                    KRIGE_RESPONSE_TO_TOSEDOSTAT_24HR_UP
#> Up_4                         REACTOME_ADAPTIVE_IMMUNE_SYSTEM
#> Up_5                  BOQUEST_STEM_CELL_CULTURED_VS_FRESH_UP
#> Down_1 ALTEMEIER_RESPONSE_TO_LPS_WITH_MECHANICAL_VENTILATION
#> Down_2                            MCLACHLAN_DENTAL_CARIES_UP
#> Down_3                     LI_INDUCED_T_TO_NATURAL_KILLER_UP
#> Down_4      SMIRNOV_CIRCULATING_ENDOTHELIOCYTES_IN_CANCER_UP
#> Down_5                      VERHAAK_GLIOBLASTOMA_MESENCHYMAL
findStudiesInCluster(RAVmodel, ind, studyTitle = TRUE)
#>   studyName PC Variance explained (%)
#> 1 SRP072980  3                   4.43
#> 2 SRP076719  2                   6.81
#> 3 SRP119636  6                   1.59
#>                                                                                                                                    title
#> 1                                                                            Stochastic Principles Governing Alternative Splicing of RNA
#> 2                                                          Multiple Origins of Virus Persistence during Natural Control of HIV Infection
#> 3 Integration of genome-wide analysis to characterize long noncoding RNAs in diverse immune cell types of the stage IV melanoma patients
val_ind <- validatedSignatures(val_all, RAVmodel, indexOnly = TRUE)
for (i in val_ind) {
    print(paste("Enriched MeSH terms for RAV", i))
    drawWordcloud(RAVmodel, i)
}
#> [1] "Enriched MeSH terms for RAV 2244"

#> [1] "Enriched MeSH terms for RAV 522"

#> [1] "Enriched MeSH terms for RAV 4260"

#> [1] "Enriched MeSH terms for RAV 203"

#> [1] "Enriched MeSH terms for RAV 1455"

for (i in val_ind) {
    print(paste("Associated studies for RAV", i))
    res <- findStudiesInCluster(RAVmodel, i, studyTitle = TRUE)
    print(res)
}
#> [1] "Associated studies for RAV 2244"
#>   studyName PC Variance explained (%)
#> 1 SRP072980  3                   4.43
#> 2 SRP076719  2                   6.81
#> 3 SRP119636  6                   1.59
#>                                                                                                                                    title
#> 1                                                                            Stochastic Principles Governing Alternative Splicing of RNA
#> 2                                                          Multiple Origins of Virus Persistence during Natural Control of HIV Infection
#> 3 Integration of genome-wide analysis to characterize long noncoding RNAs in diverse immune cell types of the stage IV melanoma patients
#> [1] "Associated studies for RAV 522"
#>   studyName PC Variance explained (%)
#> 1 ERP104864 11                   0.86
#> 2 SRP075396  3                   3.79
#> 3 SRP097735  3                   3.71
#> 4 SRP103099 16                   0.99
#>                                                                                                                                                                         title
#> 1 Comprehensive transcriptome analysis of synovium and peripheral blood reveals major disease heterogeneity early in the RA disease process prior to therapeutic intervention
#> 2                                                                                            Xenograft sequencing to elucidate molecular events following anti-CD44 treatment
#> 3                                           Neuroblastoma cells undergo transcriptomic alterations during dissemination into the bone marrow and subsequent tumor progression
#> 4                                                                                    Patient derived xenograft models of leukemia and lymphoma whole transcriptome sequencing
#> [1] "Associated studies for RAV 4260"
#>   studyName PC Variance explained (%)
#> 1 SRP149978  2                   3.28
#> 2 SRP155483  3                   3.18
#> 3 SRP158994  3                   3.75
#>                                                                                                      title
#> 1                                                         RNA-seq of HIV bnAb and control individuals PBMC
#> 2                          White blood cells from rheumatoid arthritis patients and matched healthy donors
#> 3 Longitudinal transcriptomic characterization of the immune response to acute hepatitis C virus infection
#> [1] "Associated studies for RAV 203"
#>   studyName PC Variance explained (%)
#> 1 ERP013260  4                   5.09
#> 2 SRP049605  2                   3.63
#> 3 SRP055390  3                   4.83
#> 4 SRP066197  2                   2.74
#> 5 SRP095405  4                   4.23
#>                                                                                                                                              title
#> 1                               Transcriptional landscape of human tissue lymphocytes unveils\nuniqueness of tumor-infiltrating T regulatory cells
#> 2                                                  Identification of a Molecular Signature for Acute Lyme Disease by Human Transcriptome Profiling
#> 3                                                      Transcriptome analysis in chronic lymphocytic leukemia cells using RNA sequencing (RNA-seq)
#> 4                                                     Transcriptional profiling of TH2 cells identifies pathogenic features associated with asthma
#> 5 Identification of genes induced by NOTCH1 in a chronic lymphocytic leukaemia (CLL) cell line and tracking of these genes in primary CLL patients
#> [1] "Associated studies for RAV 1455"
#>   studyName PC Variance explained (%)
#> 1 SRP050272  3                   3.34
#> 2 SRP056197  1                  21.16
#> 3 SRP056197  2                  10.33
#> 4 SRP056295  2                   7.50
#> 5 SRP062144  1                  39.70
#>                                                                                                                                          title
#> 1                                                                        Expression and Prognostic impact of LncRNAs in Acute Myeloid Leukemia
#> 2                                                                                                           Leucegene: AML sequencing (part 4)
#> 3                                                                                                           Leucegene: AML sequencing (part 4)
#> 4                                                                                                           Leucegene: AML sequencing (part 5)
#> 5 Integrated analysis of MLL-AF9 AML patients and model leukemias highlights RET and other novel therapeutic targets (RNA-seq AML development)

GSEA

org <- c("hsa",   # Homo sapiens (human)
         "mmu",   # Mus musculus (mouse)
         "cel",   # Caenorhabditis elegans (nematode)
         "dme",   # Drosophila melanogaster (fruit fly)
         "dre",   # Danio rerio (zebrafish)
         "sce")   # Saccharomyces cereviseae (budding yeast)