BiocManager::install("AnnotationHub", checkBuilt = T)
BiocManager::install(c("GOSemSim","clusterProfiler","DOSE", "ape", "ggtree", "org.Hs.eg.db","org.Mn.eg.db" ))
BiocManager::install("STRINGdb", checkBuilt = T)
install.packages("igraph", dependencies = T)
BiocManager::install("DESeq2", checkBuilt = T)
BiocManager::install("apeglm", checkBuilt = T)
BiocManager::install("pathview", checkBuilt = T)
library(tidyverse)
data0 <- GEOquery::getGEO('GSE148349',GSEMatrix = T, AnnotGPL = T, getGPL = T)
data0 %>% names %>% str_extract('GSE\\d+.GPL\\d+') %>% set_names(data0,.) -> data
data0 %>% names %>% str_extract('GSE\\d+.GPL\\d+') %>% set_names(data,.) -> data
my.GEOfile <- GEOquery::getGEOfile('GSE148349',AnnotGPL = T, destdir = getwd())
if(!'GSE148349' %in% list.files()){
suppfiles <- GEOquery::getGEOSuppFiles('GSE148349',makeDirectory = TRUE, baseDir = getwd())
suppfiles %>% row.names() -> full.path.raw_tar
}else{
full.path.raw_tar <- file.path(getwd(),'GSE148349','GSE148349_RAW.tar')}
full.path.raw_tar %>% str_match('.*(?=GSE\\d+.RAW.*)') -> gse.folder
untar(tarfile = full.path.raw_tar ,exdir = gse.folder)
na.cleaner <- purrr::compose(na.omit,as.character)
my.gunzip <- function(a.file){GEOquery::gunzip(a.file,remove = F, overwrite = T)}
get.my.gz.files <- function(a.gse.folder){a.gse.folder %>% list.files(full.names = T ) %>%
str_extract('.*gz') %>% na.cleaner %>% map(my.gunzip)
}
gse.folder %>% get.my.gz.files
## [[1]]
## [1] 4539917
##
## [[2]]
## [1] 4480568
##
## [[3]]
## [1] 4460881
##
## [[4]]
## [1] 4505095
##
## [[5]]
## [1] 4444373
##
## [[6]]
## [1] 4447812
##
## [[7]]
## [1] 4490769
##
## [[8]]
## [1] 4558121
##
## [[9]]
## [1] 4471386
##
## [[10]]
## [1] 4428544
##
## [[11]]
## [1] 4514839
##
## [[12]]
## [1] 4492808
##
## [[13]]
## [1] 4457462
##
## [[14]]
## [1] 4535590
##
## [[15]]
## [1] 4491637
##
## [[16]]
## [1] 4523600
##
## [[17]]
## [1] 4526613
##
## [[18]]
## [1] 4485652
##
## [[19]]
## [1] 4441697
##
## [[20]]
## [1] 4479779
##
## [[21]]
## [1] 4449736
##
## [[22]]
## [1] 4464040
##
## [[23]]
## [1] 6150604
##
## [[24]]
## [1] 6144594
##
## [[25]]
## [1] 6109133
##
## [[26]]
## [1] 6141559
##
## [[27]]
## [1] 6057293
##
## [[28]]
## [1] 6060900
##
## [[29]]
## [1] 6012731
##
## [[30]]
## [1] 6039165
##
## [[31]]
## [1] 5951896
##
## [[32]]
## [1] 6031094
##
## [[33]]
## [1] 6060183
##
## [[34]]
## [1] 6053800
##
## [[35]]
## [1] 6086705
##
## [[36]]
## [1] 6100447
##
## [[37]]
## [1] 6029777
##
## [[38]]
## [1] 4352214
##
## [[39]]
## [1] 4388185
##
## [[40]]
## [1] 4371904
##
## [[41]]
## [1] 4391209
##
## [[42]]
## [1] 4376523
##
## [[43]]
## [1] 4386717
##
## [[44]]
## [1] 4327284
##
## [[45]]
## [1] 4383123
##
## [[46]]
## [1] 4346199
##
## [[47]]
## [1] 4384989
##
## [[48]]
## [1] 4388084
##
## [[49]]
## [1] 4377429
##
## [[50]]
## [1] 4376469
##
## [[51]]
## [1] 4370958
##
## [[52]]
## [1] 4374562
##
## [[53]]
## [1] 4355669
##
## [[54]]
## [1] 4357262
##
## [[55]]
## [1] 4385908
##
## [[56]]
## [1] 6035214
##
## [[57]]
## [1] 6080089
##
## [[58]]
## [1] 6051445
##
## [[59]]
## [1] 6059527
##
## [[60]]
## [1] 6091559
##
## [[61]]
## [1] 6084302
##
## [[62]]
## [1] 6072122
##
## [[63]]
## [1] 6026022
##
## [[64]]
## [1] 6104480
##
## [[65]]
## [1] 6116598
##
## [[66]]
## [1] 6109975
gse.folder %>% list.files(full.names = T ) %>% str_extract('.*tsv$') %>% na.cleaner -> tsv.files.path
tsv.files.path %>%
map(function(files.path00){readr::read_delim(files.path00, delim = '\t', skip = 0,col_names = TRUE)}) -> df_list #
tsv.files.path %>% str_extract('GSM\\d+') %>% set_names(df_list, .) -> df_GSM_list
get_gsm <- function(my.list.item){gsm_code <- names(my.list.item)
my.list.item %>% .[[1]] %>% dplyr::select(target_id,est_counts) %>%
set_names(c('target_id',gsm_code)) -> a_gsm; return(a_gsm)}
list_of_gsms <- list()
for(i in 1:length(df_GSM_list)){
df_GSM_list[i] %>% get_gsm -> one.gsm
list_of_gsms[[i]] <- one.gsm}
list_of_gsms %>%reduce(full_join) -> all_gsms
library(Biobase)
data$`GSE148349-GPL24247` %>% exprs %>% colnames -> GSE148349_GPL24247
data$`GSE148349-GPL24676` %>% exprs %>% colnames -> GSE148349_GPL24676
library(magrittr)
all_gsms %>% select(GSE148349_GPL24247) %>% as.matrix %>% set_rownames(all_gsms$target_id) %>% as.data.frame %>% drop_na -> exprs_GSE148349_GPL24247
all_gsms %>% select(GSE148349_GPL24676) %>% as.matrix %>% set_rownames(all_gsms$target_id) %>% as.data.frame %>% drop_na -> exprs_GSE148349_GPL24676
exp_set_GSE148349_GPL24247 <- ExpressionSet(assayData = as.matrix(exprs_GSE148349_GPL24247),
phenoData = data$`GSE148349-GPL24247`@phenoData,
Annotation = data$`GSE148349-GPL24247`@annotation,
experimentData = data$`GSE148349-GPL24247`@experimentData,
protocolData = data$`GSE148349-GPL24247`@protocolData)
exp_set_GSE148349_GPL24676 <- ExpressionSet(assayData = as.matrix(exprs_GSE148349_GPL24676),
phenoData = data$`GSE148349-GPL24676`@phenoData,
Annotation = data$`GSE148349-GPL24676`@annotation,
experimentData = data$`GSE148349-GPL24676`@experimentData,
protocolData = data$`GSE148349-GPL24676`@protocolData)
exp_set_GSE148349_GPL24247 %>% exprs() %>% round() -> countdata #Por qué redondear
coldata <- pData(exp_set_GSE148349_GPL24247)
coldata$title %>% str_detect('Untreated') -> untreated_bool
coldata$title[untreated_bool] <- 'control'
coldata$title[!untreated_bool] <- 'treatment'
#BiocManager::install("DESeq2", checkBuilt = T)
library(DESeq2)
ddsMat <- DESeqDataSetFromMatrix(countData = countdata,
colData = coldata,
design = ~ title)
keep <- rowSums(counts(ddsMat)) > 1
dds <- ddsMat[keep,]
dds <- estimateSizeFactors(dds)
#BiocManager::install("apeglm", checkBuilt = T)
DE.analysis <- DESeq(dds)
## using pre-existing size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 1827 genes
## -- DESeq argument 'minReplicatesForReplace' = 7
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
res0 <- results(DE.analysis, contrast = c('title','treatment','control'))
library(EnhancedVolcano)
## Loading required package: ggrepel
## Registered S3 methods overwritten by 'ggalt':
## method from
## grid.draw.absoluteGrob ggplot2
## grobHeight.absoluteGrob ggplot2
## grobWidth.absoluteGrob ggplot2
## grobX.absoluteGrob ggplot2
## grobY.absoluteGrob ggplot2
EnhancedVolcano(res0,
lab = rownames(res0),
x = 'log2FoldChange',
y = 'padj',
xlim = c(-6,6),
xlab = bquote(~Log[2]~ 'fold change'),
pCutoff = 1e-5,
FCcutoff = 2.0,
pointSize = 2.0,
labSize = 4.0,
colAlpha = .8,
legendPosition = 'right',
legendLabSize = 12,
legendIconSize = 4.0,
drawConnectors = TRUE,
widthConnectors = 0.2,
colConnectors = 'grey30')
################### ### Mining the biological knowledge of the differentially expressed genes
res0 %>% as.data.frame %>%rownames_to_column('genes') %>% select(genes,padj,log2FoldChange) %>% filter(padj < 1.0e-07 & abs(log2FoldChange)>4 ) -> my_DEGs
my_DEGs
library(org.Mm.eg.db)
## Loading required package: AnnotationDbi
##
## Attaching package: 'AnnotationDbi'
## The following object is masked from 'package:dplyr':
##
## select
##
org.Mm.eg.db %>% keytypes
## [1] "ACCNUM" "ALIAS" "ENSEMBL" "ENSEMBLPROT" "ENSEMBLTRANS"
## [6] "ENTREZID" "ENZYME" "EVIDENCE" "EVIDENCEALL" "GENENAME"
## [11] "GO" "GOALL" "IPI" "MGI" "ONTOLOGY"
## [16] "ONTOLOGYALL" "PATH" "PFAM" "PMID" "PROSITE"
## [21] "REFSEQ" "SYMBOL" "UNIGENE" "UNIPROT"
#Especie
exp_set_GSE148349_GPL24247 %>% pData() %>% dplyr::select(starts_with('orga')) %>% .[[1]] %>% unique
## [1] "Mus musculus"
genes.ENTREZID <- clusterProfiler::bitr(my_DEGs$genes, "ENSEMBLTRANS", "ENTREZID", "org.Mm.eg.db")
##
## 'select()' returned 1:many mapping between keys and columns
## Warning in clusterProfiler::bitr(my_DEGs$genes, "ENSEMBLTRANS", "ENTREZID", :
## 89.08% of input gene IDs are fail to map...
DEGs0 <- inner_join(my_DEGs, genes.ENTREZID, by = c('genes'='ENSEMBLTRANS'))
genes.SYMBOL <- clusterProfiler::bitr(my_DEGs$genes, "ENSEMBLTRANS", "SYMBOL", "org.Mm.eg.db")
## 'select()' returned 1:many mapping between keys and columns
## Warning in clusterProfiler::bitr(my_DEGs$genes, "ENSEMBLTRANS", "SYMBOL", :
## 89.08% of input gene IDs are fail to map...
DEGs <- inner_join(DEGs0, genes.SYMBOL, by = c('genes'='ENSEMBLTRANS'))
DEGs
library(GOSemSim)
## GOSemSim v2.16.1 For help: https://guangchuangyu.github.io/GOSemSim
##
## If you use GOSemSim in published research, please cite:
## [36m-[39m Guangchuang Yu. Gene Ontology Semantic Similarity Analysis Using GOSemSim. In: Kidder B. (eds) Stem Cell Transcriptional Networks. Methods in Molecular Biology, 2020, 2117:207-215. Humana, New York, NY. doi:10.1007/978-1-0716-0301-7_11
## [36m-[39m Guangchuang Yu, Fei Li, Yide Qin, Xiaochen Bo, Yibo Wu, Shengqi Wang. GOSemSim: an R package for measuring semantic similarity among GO terms and gene products Bioinformatics 2010, 26(7):976-978. doi:10.1093/bioinformatics/btq064
MnGO <- godata('org.Mm.eg.db', ont="BP")
## preparing gene to GO mapping data...
## preparing IC data...
#eg <- DEGs$SYMBOL
eg <- DEGs$ENTREZID
sim <- mgeneSim(eg, semData = MnGO, measure = "Wang", combine = "BMA", verbose=FALSE)
DOSE::simplot(sim)
## Using ID as id variables
sim %>% row.names() -> my.entrez.ids
DEGs %>% filter(ENTREZID == my.entrez.ids) -> my.dict #%>% .[['SYMBOL']] -> my.symbols
## Warning in ENTREZID == my.entrez.ids: longitud de objeto mayor no es múltiplo de
## la longitud de uno menor
sim[my.dict$ENTREZID,my.dict$ENTREZID] -> sim2
row.names(sim2) <- my.dict$SYMBOL
colnames(sim2) <- my.dict$SYMBOL
DOSE::simplot(sim2)
## Using ID as id variables
### Gene cluster semantic similarity measurement
DEGs %>% filter(log2FoldChange < 15) %>% .[["ENTREZID"]] -> gs1
DEGs %>% filter(log2FoldChange > 20) %>% .[["ENTREZID"]] -> gs2
clusterSim(gs1, gs2, semData=MnGO, measure="Wang", combine="BMA")
## [1] 0.345
library(clusterProfiler)
## clusterProfiler v3.18.0 For help: https://guangchuangyu.github.io/software/clusterProfiler
##
## If you use clusterProfiler in published research, please cite:
## Guangchuang Yu, Li-Gen Wang, Yanyan Han, Qing-Yu He. clusterProfiler: an R package for comparing biological themes among gene clusters. OMICS: A Journal of Integrative Biology. 2012, 16(5):284-287.
##
## Attaching package: 'clusterProfiler'
## The following object is masked from 'package:AnnotationDbi':
##
## select
## The following object is masked from 'package:IRanges':
##
## slice
## The following object is masked from 'package:S4Vectors':
##
## rename
## The following object is masked from 'package:purrr':
##
## simplify
## The following object is masked from 'package:stats':
##
## filter
library(org.Mm.eg.db)
ggo <- groupGO(gene = DEGs$ENTREZID,
OrgDb = org.Mm.eg.db,
keyType = "ENTREZID",
ont = "CC",
level = 3,
readable = TRUE)
ggo %>% as.data.frame()
library(clusterProfiler)
data(geneList, package="DOSE")
gene <- names(geneList)[abs(geneList) > 2]
# Entrez gene ID
head(gene)
## [1] "4312" "8318" "10874" "55143" "55388" "991"
library(org.Hs.eg.db)
##
glimpse(gene)
## chr [1:207] "4312" "8318" "10874" "55143" "55388" "991" "6280" "2305" ...
glimpse(names(geneList))
## chr [1:12495] "4312" "8318" "10874" "55143" "55388" "991" "6280" "2305" ...
ego <- enrichGO(gene = gene,
universe = names(geneList),
OrgDb = org.Hs.eg.db,
keyType = "ENTREZID",
ont = "CC",
pAdjustMethod = "BH",
pvalueCutoff = 0.01,
qvalueCutoff = 0.05,
readable = TRUE)
head(ego)
gene.df <- bitr(gene, fromType = "ENTREZID",
toType = c("ENSEMBL", "SYMBOL"),
OrgDb = org.Hs.eg.db)
## 'select()' returned 1:many mapping between keys and columns
## Warning in bitr(gene, fromType = "ENTREZID", toType = c("ENSEMBL", "SYMBOL"), :
## 0.48% of input gene IDs are fail to map...
ego2 <- enrichGO(gene = gene.df$ENSEMBL,
OrgDb = org.Hs.eg.db,
keyType = 'ENSEMBL',
ont = "CC",
pAdjustMethod = "BH",
pvalueCutoff = 0.01,
qvalueCutoff = 0.05)
head(ego2, 3)
GO Gene Set Enrichment Analysis
ego3 <- gseGO(geneList = geneList,
OrgDb = org.Hs.eg.db,
ont = "CC",
minGSSize = 100,
maxGSSize = 500,
pvalueCutoff = 0.05,
verbose = FALSE)
## Warning in fgseaMultilevel(...): For some pathways, in reality P-values are less
## than 1e-10. You can set the `eps` argument to zero for better estimation.
geneList %>% length
## [1] 12495
ego3@result %>% select(setSize) %>% colSums
## setSize
## 7710
goplot(ego)
data(geneList, package="DOSE")
gene <- names(geneList)[abs(geneList) > 2]
kk <- enrichKEGG(gene = gene,
organism = 'hsa',
pvalueCutoff = 0.05)
## Reading KEGG annotation online:
##
## Reading KEGG annotation online:
kk@result
kk2 <- gseKEGG(geneList = geneList,
organism = 'hsa',
minGSSize = 120,
pvalueCutoff = 0.05,
verbose = FALSE)
kk2@result
KEGG Module is a collection of manually defined function units. In some situation, KEGG Modules have a more straightforward interpretation.
mkk <- enrichMKEGG(gene = gene,
organism = 'hsa',
pvalueCutoff = 1,
qvalueCutoff = 1)
## Reading KEGG annotation online:
##
## Reading KEGG annotation online:
mkk@result
mkk2 <- gseMKEGG(geneList = geneList,
organism = 'hsa',
pvalueCutoff = 1)
## preparing geneSet collections...
## GSEA analysis...
## leading edge analysis...
## done...
mkk2@result
clusterProfiler::browseKEGG(kk, 'hsa04110')
#BiocManager::install("pathview", checkBuilt = T)
library("pathview")
## ##############################################################################
## Pathview is an open source software package distributed under GNU General
## Public License version 3 (GPLv3). Details of GPLv3 is available at
## http://www.gnu.org/licenses/gpl-3.0.html. Particullary, users are required to
## formally cite the original Pathview paper (not just mention it) in publications
## or products. For details, do citation("pathview") within R.
##
## The pathview downloads and uses KEGG data. Non-academic uses may require a KEGG
## license agreement (details at http://www.kegg.jp/kegg/legal.html).
## ##############################################################################
hsa04110 <- pathview(gene.data = geneList,
pathway.id = "hsa04110",
species = "hsa",
limit = list(gene=max(abs(geneList)), cpd=1))
## 'select()' returned 1:1 mapping between keys and columns
## Info: Working in directory /home/alejandro/DeepenData/Taller_sept_oct_nov_dic_2020/clase_08_AA_analisis_funcional_y_redes
## Info: Writing image file hsa04110.pathview.png
library(STRINGdb)
data(interactions_example)
data(diff_exp_example1)
#To begin, you should first know the NCBI taxonomy identifiers of the organism on which you have performed the experiment (e.g. 9606 for Human, 10090 for mouse).
# create a new STRING_db object
string_db <- STRINGdb$new( version="11", species=9606, score_threshold=0, input_directory="")
tp53 = string_db$mp( "tp53" )
atm = string_db$mp( "atm" )
string_db$get_neighbors( c(tp53, atm) )
string_db$get_neighbors(tp53)