R Notebook

instalaciones

BiocManager::install("AnnotationHub", checkBuilt = T)
BiocManager::install(c("GOSemSim","clusterProfiler","DOSE", "ape", "ggtree", "org.Hs.eg.db","org.Mn.eg.db" ))
BiocManager::install("STRINGdb", checkBuilt = T)
install.packages("igraph", dependencies = T)
BiocManager::install("DESeq2", checkBuilt = T)
BiocManager::install("apeglm", checkBuilt = T)
BiocManager::install("pathview", checkBuilt = T)

Data

library(tidyverse)
data0 <- GEOquery::getGEO('GSE148349',GSEMatrix = T, AnnotGPL = T, getGPL = T)
data0 %>% names %>% str_extract('GSE\\d+.GPL\\d+') %>% set_names(data0,.) -> data

data0     %>% names %>% str_extract('GSE\\d+.GPL\\d+') %>% set_names(data,.) -> data
my.GEOfile <- GEOquery::getGEOfile('GSE148349',AnnotGPL = T, destdir = getwd())

if(!'GSE148349' %in% list.files()){
              suppfiles  <- GEOquery::getGEOSuppFiles('GSE148349',makeDirectory = TRUE, baseDir = getwd())
               suppfiles %>% row.names() -> full.path.raw_tar
                }else{  
                full.path.raw_tar <- file.path(getwd(),'GSE148349','GSE148349_RAW.tar')}


full.path.raw_tar %>% str_match('.*(?=GSE\\d+.RAW.*)') -> gse.folder
untar(tarfile = full.path.raw_tar ,exdir =  gse.folder)

na.cleaner      <- purrr::compose(na.omit,as.character)
my.gunzip       <- function(a.file){GEOquery::gunzip(a.file,remove = F, overwrite = T)}
get.my.gz.files <- function(a.gse.folder){a.gse.folder %>% list.files(full.names = T ) %>% 
                                          str_extract('.*gz') %>% na.cleaner %>% map(my.gunzip)
                                          }

gse.folder %>% get.my.gz.files

## [[1]]
## [1] 4539917
## 
## [[2]]
## [1] 4480568
## 
## [[3]]
## [1] 4460881
## 
## [[4]]
## [1] 4505095
## 
## [[5]]
## [1] 4444373
## 
## [[6]]
## [1] 4447812
## 
## [[7]]
## [1] 4490769
## 
## [[8]]
## [1] 4558121
## 
## [[9]]
## [1] 4471386
## 
## [[10]]
## [1] 4428544
## 
## [[11]]
## [1] 4514839
## 
## [[12]]
## [1] 4492808
## 
## [[13]]
## [1] 4457462
## 
## [[14]]
## [1] 4535590
## 
## [[15]]
## [1] 4491637
## 
## [[16]]
## [1] 4523600
## 
## [[17]]
## [1] 4526613
## 
## [[18]]
## [1] 4485652
## 
## [[19]]
## [1] 4441697
## 
## [[20]]
## [1] 4479779
## 
## [[21]]
## [1] 4449736
## 
## [[22]]
## [1] 4464040
## 
## [[23]]
## [1] 6150604
## 
## [[24]]
## [1] 6144594
## 
## [[25]]
## [1] 6109133
## 
## [[26]]
## [1] 6141559
## 
## [[27]]
## [1] 6057293
## 
## [[28]]
## [1] 6060900
## 
## [[29]]
## [1] 6012731
## 
## [[30]]
## [1] 6039165
## 
## [[31]]
## [1] 5951896
## 
## [[32]]
## [1] 6031094
## 
## [[33]]
## [1] 6060183
## 
## [[34]]
## [1] 6053800
## 
## [[35]]
## [1] 6086705
## 
## [[36]]
## [1] 6100447
## 
## [[37]]
## [1] 6029777
## 
## [[38]]
## [1] 4352214
## 
## [[39]]
## [1] 4388185
## 
## [[40]]
## [1] 4371904
## 
## [[41]]
## [1] 4391209
## 
## [[42]]
## [1] 4376523
## 
## [[43]]
## [1] 4386717
## 
## [[44]]
## [1] 4327284
## 
## [[45]]
## [1] 4383123
## 
## [[46]]
## [1] 4346199
## 
## [[47]]
## [1] 4384989
## 
## [[48]]
## [1] 4388084
## 
## [[49]]
## [1] 4377429
## 
## [[50]]
## [1] 4376469
## 
## [[51]]
## [1] 4370958
## 
## [[52]]
## [1] 4374562
## 
## [[53]]
## [1] 4355669
## 
## [[54]]
## [1] 4357262
## 
## [[55]]
## [1] 4385908
## 
## [[56]]
## [1] 6035214
## 
## [[57]]
## [1] 6080089
## 
## [[58]]
## [1] 6051445
## 
## [[59]]
## [1] 6059527
## 
## [[60]]
## [1] 6091559
## 
## [[61]]
## [1] 6084302
## 
## [[62]]
## [1] 6072122
## 
## [[63]]
## [1] 6026022
## 
## [[64]]
## [1] 6104480
## 
## [[65]]
## [1] 6116598
## 
## [[66]]
## [1] 6109975

gse.folder %>% list.files(full.names = T ) %>% str_extract('.*tsv$') %>%  na.cleaner -> tsv.files.path

tsv.files.path %>%
      map(function(files.path00){readr::read_delim(files.path00, delim = '\t', skip = 0,col_names = TRUE)}) -> df_list #

tsv.files.path %>% str_extract('GSM\\d+') %>% set_names(df_list, .) -> df_GSM_list

get_gsm <- function(my.list.item){gsm_code <- names(my.list.item)
                                   my.list.item  %>% .[[1]] %>% dplyr::select(target_id,est_counts) %>% 
                                   set_names(c('target_id',gsm_code)) -> a_gsm; return(a_gsm)}

list_of_gsms <- list()
for(i in 1:length(df_GSM_list)){
                                df_GSM_list[i] %>% get_gsm -> one.gsm
                                list_of_gsms[[i]] <- one.gsm}



list_of_gsms %>%reduce(full_join) -> all_gsms

library(Biobase)
data$`GSE148349-GPL24247` %>% exprs %>% colnames -> GSE148349_GPL24247
data$`GSE148349-GPL24676` %>% exprs %>% colnames -> GSE148349_GPL24676

library(magrittr)
all_gsms %>% select(GSE148349_GPL24247) %>% as.matrix %>% set_rownames(all_gsms$target_id) %>% as.data.frame %>% drop_na -> exprs_GSE148349_GPL24247
all_gsms %>% select(GSE148349_GPL24676) %>% as.matrix %>% set_rownames(all_gsms$target_id) %>% as.data.frame %>% drop_na -> exprs_GSE148349_GPL24676

exp_set_GSE148349_GPL24247 <- ExpressionSet(assayData = as.matrix(exprs_GSE148349_GPL24247),
              phenoData  = data$`GSE148349-GPL24247`@phenoData,
              Annotation = data$`GSE148349-GPL24247`@annotation,
          experimentData = data$`GSE148349-GPL24247`@experimentData,
          protocolData   = data$`GSE148349-GPL24247`@protocolData)



exp_set_GSE148349_GPL24676 <- ExpressionSet(assayData = as.matrix(exprs_GSE148349_GPL24676),
              phenoData  = data$`GSE148349-GPL24676`@phenoData,
              Annotation = data$`GSE148349-GPL24676`@annotation,
          experimentData = data$`GSE148349-GPL24676`@experimentData,
          protocolData   = data$`GSE148349-GPL24676`@protocolData)

exp_set_GSE148349_GPL24247 %>% exprs() %>% round() -> countdata #Por qué redondear

coldata <- pData(exp_set_GSE148349_GPL24247)

coldata$title %>% str_detect('Untreated') -> untreated_bool

coldata$title[untreated_bool] <- 'control'
coldata$title[!untreated_bool] <- 'treatment'

#BiocManager::install("DESeq2", checkBuilt = T)

library(DESeq2)
ddsMat <- DESeqDataSetFromMatrix(countData = countdata,
                                 colData   = coldata,
                                 design    = ~ title)
keep <- rowSums(counts(ddsMat)) > 1
dds  <- ddsMat[keep,]
dds  <- estimateSizeFactors(dds)

#BiocManager::install("apeglm", checkBuilt = T)
DE.analysis  <- DESeq(dds)

## using pre-existing size factors

## estimating dispersions

## gene-wise dispersion estimates

## mean-dispersion relationship

## final dispersion estimates

## fitting model and testing

## -- replacing outliers and refitting for 1827 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)

## estimating dispersions

## fitting model and testing

res0         <- results(DE.analysis,   contrast = c('title','treatment','control'))

library(EnhancedVolcano)

## Loading required package: ggrepel

## Registered S3 methods overwritten by 'ggalt':
##   method                  from   
##   grid.draw.absoluteGrob  ggplot2
##   grobHeight.absoluteGrob ggplot2
##   grobWidth.absoluteGrob  ggplot2
##   grobX.absoluteGrob      ggplot2
##   grobY.absoluteGrob      ggplot2

EnhancedVolcano(res0,
    lab = rownames(res0),
    x = 'log2FoldChange',
    y = 'padj',
    xlim = c(-6,6),
    xlab = bquote(~Log[2]~ 'fold change'),
    pCutoff = 1e-5,
    FCcutoff = 2.0,
    pointSize = 2.0,
    labSize = 4.0,
    colAlpha = .8,
    legendPosition = 'right',
    legendLabSize = 12,
    legendIconSize = 4.0,
    drawConnectors = TRUE,
    widthConnectors = 0.2,
    colConnectors = 'grey30')

################### ### Mining the biological knowledge of the differentially expressed genes

res0 %>% as.data.frame %>%rownames_to_column('genes') %>%  select(genes,padj,log2FoldChange) %>% filter(padj < 1.0e-07 & abs(log2FoldChange)>4 ) -> my_DEGs
my_DEGs

Obtener especie, Symbol y Entrez ID

library(org.Mm.eg.db)

## Loading required package: AnnotationDbi

## 
## Attaching package: 'AnnotationDbi'

## The following object is masked from 'package:dplyr':
## 
##     select

##

org.Mm.eg.db %>% keytypes

##  [1] "ACCNUM"       "ALIAS"        "ENSEMBL"      "ENSEMBLPROT"  "ENSEMBLTRANS"
##  [6] "ENTREZID"     "ENZYME"       "EVIDENCE"     "EVIDENCEALL"  "GENENAME"    
## [11] "GO"           "GOALL"        "IPI"          "MGI"          "ONTOLOGY"    
## [16] "ONTOLOGYALL"  "PATH"         "PFAM"         "PMID"         "PROSITE"     
## [21] "REFSEQ"       "SYMBOL"       "UNIGENE"      "UNIPROT"

#Especie
exp_set_GSE148349_GPL24247 %>% pData() %>% dplyr::select(starts_with('orga')) %>% .[[1]] %>% unique

## [1] "Mus musculus"

genes.ENTREZID <- clusterProfiler::bitr(my_DEGs$genes, "ENSEMBLTRANS", "ENTREZID", "org.Mm.eg.db")

##

## 'select()' returned 1:many mapping between keys and columns

## Warning in clusterProfiler::bitr(my_DEGs$genes, "ENSEMBLTRANS", "ENTREZID", :
## 89.08% of input gene IDs are fail to map...

DEGs0          <- inner_join(my_DEGs, genes.ENTREZID, by = c('genes'='ENSEMBLTRANS'))
genes.SYMBOL   <- clusterProfiler::bitr(my_DEGs$genes, "ENSEMBLTRANS", "SYMBOL", "org.Mm.eg.db")

## 'select()' returned 1:many mapping between keys and columns

## Warning in clusterProfiler::bitr(my_DEGs$genes, "ENSEMBLTRANS", "SYMBOL", :
## 89.08% of input gene IDs are fail to map...

DEGs           <- inner_join(DEGs0, genes.SYMBOL, by = c('genes'='ENSEMBLTRANS'))
DEGs

library(GOSemSim)

## GOSemSim v2.16.1  For help: https://guangchuangyu.github.io/GOSemSim
## 
## If you use GOSemSim in published research, please cite:
## [36m-[39m Guangchuang Yu. Gene Ontology Semantic Similarity Analysis Using GOSemSim. In: Kidder B. (eds) Stem Cell Transcriptional Networks. Methods in Molecular Biology, 2020, 2117:207-215. Humana, New York, NY. doi:10.1007/978-1-0716-0301-7_11
## [36m-[39m Guangchuang Yu, Fei Li, Yide Qin, Xiaochen Bo, Yibo Wu, Shengqi Wang. GOSemSim: an R package for measuring semantic similarity among GO terms and gene products Bioinformatics 2010, 26(7):976-978. doi:10.1093/bioinformatics/btq064

MnGO <- godata('org.Mm.eg.db', ont="BP")

## preparing gene to GO mapping data...

## preparing IC data...

#eg  <- DEGs$SYMBOL
eg  <- DEGs$ENTREZID
sim  <- mgeneSim(eg, semData = MnGO, measure = "Wang", combine = "BMA", verbose=FALSE)
DOSE::simplot(sim)

## Using ID as id variables

sim %>% row.names() -> my.entrez.ids
DEGs %>% filter(ENTREZID == my.entrez.ids) -> my.dict #%>% .[['SYMBOL']] -> my.symbols

## Warning in ENTREZID == my.entrez.ids: longitud de objeto mayor no es múltiplo de
## la longitud de uno menor

sim[my.dict$ENTREZID,my.dict$ENTREZID] -> sim2
row.names(sim2) <- my.dict$SYMBOL
colnames(sim2) <- my.dict$SYMBOL
DOSE::simplot(sim2)

## Using ID as id variables

### Gene cluster semantic similarity measurement

DEGs %>% filter(log2FoldChange < 15) %>% .[["ENTREZID"]] -> gs1
DEGs %>% filter(log2FoldChange > 20) %>% .[["ENTREZID"]] -> gs2

clusterSim(gs1, gs2, semData=MnGO, measure="Wang", combine="BMA")

## [1] 0.345

GO enrichment analysis

library(clusterProfiler)

## clusterProfiler v3.18.0  For help: https://guangchuangyu.github.io/software/clusterProfiler
## 
## If you use clusterProfiler in published research, please cite:
## Guangchuang Yu, Li-Gen Wang, Yanyan Han, Qing-Yu He. clusterProfiler: an R package for comparing biological themes among gene clusters. OMICS: A Journal of Integrative Biology. 2012, 16(5):284-287.

## 
## Attaching package: 'clusterProfiler'

## The following object is masked from 'package:AnnotationDbi':
## 
##     select

## The following object is masked from 'package:IRanges':
## 
##     slice

## The following object is masked from 'package:S4Vectors':
## 
##     rename

## The following object is masked from 'package:purrr':
## 
##     simplify

## The following object is masked from 'package:stats':
## 
##     filter

library(org.Mm.eg.db)

ggo <- groupGO(gene     = DEGs$ENTREZID,
               OrgDb    = org.Mm.eg.db,
               keyType = "ENTREZID",
               ont      = "CC",
               level    = 3,
               readable = TRUE)

ggo %>% as.data.frame()

GO enrichment analysis

library(clusterProfiler)
data(geneList, package="DOSE")
gene <- names(geneList)[abs(geneList) > 2]
# Entrez gene ID
head(gene)

## [1] "4312"  "8318"  "10874" "55143" "55388" "991"

library(org.Hs.eg.db)

##

glimpse(gene)

##  chr [1:207] "4312" "8318" "10874" "55143" "55388" "991" "6280" "2305" ...

glimpse(names(geneList))

##  chr [1:12495] "4312" "8318" "10874" "55143" "55388" "991" "6280" "2305" ...

ego <- enrichGO(gene          = gene,
                universe      = names(geneList),
                OrgDb         = org.Hs.eg.db,
                keyType       = "ENTREZID",
                ont           = "CC",
                pAdjustMethod = "BH",
                pvalueCutoff  = 0.01,
                qvalueCutoff  = 0.05,
        readable      = TRUE)
head(ego)

gene.df <- bitr(gene, fromType = "ENTREZID",
        toType = c("ENSEMBL", "SYMBOL"),
        OrgDb = org.Hs.eg.db)

## 'select()' returned 1:many mapping between keys and columns

## Warning in bitr(gene, fromType = "ENTREZID", toType = c("ENSEMBL", "SYMBOL"), :
## 0.48% of input gene IDs are fail to map...

ego2 <- enrichGO(gene         = gene.df$ENSEMBL,
                OrgDb         = org.Hs.eg.db,
                keyType       = 'ENSEMBL',
                ont           = "CC",
                pAdjustMethod = "BH",
                pvalueCutoff  = 0.01,
                qvalueCutoff  = 0.05)
head(ego2, 3)

GO Gene Set Enrichment Analysis

ego3 <- gseGO(geneList     = geneList,
              OrgDb        = org.Hs.eg.db,
              ont          = "CC",
              minGSSize    = 100,
              maxGSSize    = 500,
              pvalueCutoff = 0.05,
              verbose      = FALSE)

## Warning in fgseaMultilevel(...): For some pathways, in reality P-values are less
## than 1e-10. You can set the `eps` argument to zero for better estimation.

geneList %>% length

## [1] 12495

ego3@result %>% select(setSize) %>% colSums

## setSize 
##    7710

goplot(ego)

data(geneList, package="DOSE")
gene <- names(geneList)[abs(geneList) > 2]

kk <- enrichKEGG(gene         = gene,
                 organism     = 'hsa',
                 pvalueCutoff = 0.05)

## Reading KEGG annotation online:
## 
## Reading KEGG annotation online:

kk@result

kk2 <- gseKEGG(geneList     = geneList,
               organism     = 'hsa',
               minGSSize    = 120,
               pvalueCutoff = 0.05,
               verbose      = FALSE)
kk2@result

KEGG module over-representation analysis

KEGG Module is a collection of manually defined function units. In some situation, KEGG Modules have a more straightforward interpretation.

mkk <- enrichMKEGG(gene = gene,
                   organism = 'hsa',
                   pvalueCutoff = 1,
                   qvalueCutoff = 1)

## Reading KEGG annotation online:
## 
## Reading KEGG annotation online:

mkk@result

mkk2 <- gseMKEGG(geneList = geneList,
                 organism = 'hsa',
                 pvalueCutoff = 1)

## preparing geneSet collections...

## GSEA analysis...

## leading edge analysis...

## done...

mkk2@result

Visualize enriched KEGG pathways

clusterProfiler::browseKEGG(kk, 'hsa04110')

#BiocManager::install("pathview", checkBuilt = T)

library("pathview")

## ##############################################################################
## Pathview is an open source software package distributed under GNU General
## Public License version 3 (GPLv3). Details of GPLv3 is available at
## http://www.gnu.org/licenses/gpl-3.0.html. Particullary, users are required to
## formally cite the original Pathview paper (not just mention it) in publications
## or products. For details, do citation("pathview") within R.
## 
## The pathview downloads and uses KEGG data. Non-academic uses may require a KEGG
## license agreement (details at http://www.kegg.jp/kegg/legal.html).
## ##############################################################################

hsa04110 <- pathview(gene.data  = geneList,
                     pathway.id = "hsa04110",
                     species    = "hsa",
                     limit      = list(gene=max(abs(geneList)), cpd=1))

## 'select()' returned 1:1 mapping between keys and columns

## Info: Working in directory /home/alejandro/DeepenData/Taller_sept_oct_nov_dic_2020/clase_08_AA_analisis_funcional_y_redes

## Info: Writing image file hsa04110.pathview.png

GETTING INTERACTIONS WITH STRING (https://www.string-db.org) is a database of known and predicted protein-protein interactions

library(STRINGdb)
data(interactions_example) 
data(diff_exp_example1)

#To begin, you should first know the NCBI taxonomy identifiers of the organism on which you have performed the experiment (e.g. 9606 for Human, 10090 for mouse). 
# create a new STRING_db object
string_db <- STRINGdb$new( version="11", species=9606, score_threshold=0, input_directory="")



tp53 = string_db$mp( "tp53" )
atm = string_db$mp( "atm" )


string_db$get_neighbors( c(tp53, atm) )


string_db$get_neighbors(tp53)