Video
Comienza en 4:13. Clase 04 junio 2020. Meta-análisis y Clustering.
Cargamos las variables generadas en la clase del 01 de junio Esto se indicó en el final del notebook del 01 de junio
setwd("~/clase_04_junio_2020")
load("~/clase_04_junio_2020/work_space_anterior_del_1_junio.RData")
setwd("~/clase_04_junio_2020")
library(tidyverse)
mis.DEGs.studies %>% names()
## [1] "GSE100382" "GSE55536" "GSE82227"
mis.DEGs.studies$GSE82227
## log2 fold change (MAP): my.experiments stimulated vs control
## Wald test p-value: my.experiments stimulated vs control
## DataFrame with 33554 rows and 6 columns
## baseMean log2FoldChange lfcSE
## <numeric> <numeric> <numeric>
## ENSG00000000003 5.6157277089734 -0.619789533712436 0.389855839366189
## ENSG00000000419 1021.89616650713 -0.00326102532892136 0.10741228082964
## ENSG00000000457 268.134719086845 0.641466250197953 0.263280732803794
## ENSG00000000460 124.422307504729 0.0792926248306727 0.144037237294177
## ENSG00000000938 19028.9406659109 -0.341919060417635 0.175195335305818
## ... ... ... ...
## ENSG00000266865 20.8849740896683 0.067908844670179 0.228226939260565
## ENSG00000266867 1.22147329440195 -0.12338446475705 0.440594706959507
## ENSG00000266876 143.390534160037 0.122547895270455 0.120260027893951
## ENSG00000266877 4.59797859426845 1.80382056043122 0.402564643191592
## ENSG00000266891 27.7147477249008 0.270792378646866 0.385980217799908
## stat pvalue padj
## <numeric> <numeric> <numeric>
## ENSG00000000003 -1.58900908255543 0.112058342182824 0.336090283663031
## ENSG00000000419 -0.0303593540280242 0.975780460688283 0.992214661042102
## ENSG00000000457 2.4369976061114 0.0148097774056293 0.0872241937902766
## ENSG00000000460 0.550495962299751 0.581979245536643 0.813845480397272
## ENSG00000000938 -1.95159101614597 0.050986780705295 0.203981411174042
## ... ... ... ...
## ENSG00000266865 0.297532852052367 0.766059730647689 0.908599149173973
## ENSG00000266867 -0.280341684574176 0.779215372727847 0.914310108946344
## ENSG00000266876 1.01900850536632 0.308198928641972 0.599349168796754
## ENSG00000266877 4.37146644817847 1.23414817462436e-05 0.00033221481859006
## ENSG00000266891 0.701963730835245 0.482701782380699 0.746158997890809
mis.DEGs.studies$GSE82227 %>% rownames() -> ENSGs
library(biomaRt)
ensembl <- useMart("ensembl")
searchDatasets(mart = ensembl, pattern = "sapiens")
Hs.ensembl = useMart("ensembl",dataset="hsapiens_gene_ensembl")
listAttributes(Hs.ensembl)
searchAttributes(mart = Hs.ensembl, pattern = 'symbol')
my.genes <- c('ACSL1','ACSL4','ADM','ADORA2A') #estos sÃmbolos vienen de los otros rnaseq hgnc_symbol
my.genes.symbols <- getBM(
attributes=c('ensembl_gene_id','hgnc_symbol','entrezgene_description'),filters ='hgnc_symbol',values = my.genes,mart = Hs.ensembl)
my.genes.symbols
my.genes = ENSGs
my.genes.symbols = getBM(attributes=c('ensembl_gene_id','hgnc_symbol','entrezgene_description'),filters ='ensembl_gene_id',values = my.genes,mart = Hs.ensembl)
my.genes.symbols %>% tail()
mis.DEGs.studies %>% glimpse()
## List of 3
## $ GSE100382:Formal class 'DESeqResults' [package "DESeq2"] with 7 slots
## .. ..@ priorInfo :List of 4
## .. ..@ rownames : chr [1:12555] "A1BG" "A1BG-AS1" "A2M" "A2M-AS1" ...
## .. ..@ nrows : int 12555
## .. ..@ listData :List of 6
## .. ..@ elementType : chr "ANY"
## .. ..@ elementMetadata:Formal class 'DFrame' [package "S4Vectors"] with 6 slots
## .. ..@ metadata :List of 6
## $ GSE55536 :Formal class 'DESeqResults' [package "DESeq2"] with 7 slots
## .. ..@ priorInfo :List of 4
## .. ..@ rownames : chr [1:11951] "A1BG" "A2M" "A4GALT" "AAAS" ...
## .. ..@ nrows : int 11951
## .. ..@ listData :List of 6
## .. ..@ elementType : chr "ANY"
## .. ..@ elementMetadata:Formal class 'DFrame' [package "S4Vectors"] with 6 slots
## .. ..@ metadata :List of 6
## $ GSE82227 :Formal class 'DESeqResults' [package "DESeq2"] with 7 slots
## .. ..@ priorInfo :List of 4
## .. ..@ rownames : chr [1:33554] "ENSG00000000003" "ENSG00000000419" "ENSG00000000457" "ENSG00000000460" ...
## .. ..@ nrows : int 33554
## .. ..@ listData :List of 6
## .. ..@ elementType : chr "ANY"
## .. ..@ elementMetadata:Formal class 'DFrame' [package "S4Vectors"] with 6 slots
## .. ..@ metadata :List of 6
library(DESeq2)
un.estudio <- 'GSE100382'
mis.DEGs.studies[[un.estudio]]$baseMean %>% as.numeric.Array() -> mis.means
mis.DEGs.studies[[un.estudio]]$padj %>% as.numeric.Array() -> mis.padjs
mis.DEGs.studies[[un.estudio]]$log2FoldChange %>% as.numeric.Array() -> mis.lfcs
mis.DEGs.studies[[un.estudio]] %>% rownames() %>% as.character.Array() -> mis.filas
mis.DEGs.studies[[un.estudio]] %>% rownames() %>% as.character.Array() -> mis.simbolos
para.metaanal <- list()
para.metaanal[['GSE100382']] <- data.frame(symbols= mis.simbolos ,padj = mis.padjs, log2FoldChange = mis.lfcs, baseMean = mis.means, row.names = mis.filas)
####################################################################################################
un.estudio <- 'GSE55536'
mis.DEGs.studies[[un.estudio]]$baseMean %>% as.numeric.Array() -> mis.means
mis.DEGs.studies[[un.estudio]]$padj %>% as.numeric.Array() -> mis.padjs
mis.DEGs.studies[[un.estudio]]$log2FoldChange %>% as.numeric.Array() -> mis.lfcs
mis.DEGs.studies[[un.estudio]] %>% rownames() %>% as.character.Array() -> mis.filas
mis.DEGs.studies[[un.estudio]] %>% rownames() %>% as.character.Array() -> mis.simbolos
para.metaanal[['GSE55536']] <- data.frame(symbols= mis.simbolos ,padj = mis.padjs, log2FoldChange = mis.lfcs, baseMean = mis.means, row.names = mis.filas)
##################################################################################################
un.estudio <- 'GSE82227'
mis.DEGs.studies[[un.estudio]]$baseMean %>% as.numeric.Array() -> mis.means
mis.DEGs.studies[[un.estudio]]$padj %>% as.numeric.Array() -> mis.padjs
mis.DEGs.studies[[un.estudio]]$log2FoldChange %>% as.numeric.Array() -> mis.lfcs
mis.DEGs.studies[[un.estudio]] %>% rownames() %>% as.character.Array() -> mis.filas
mis.DEGs.studies[[un.estudio]] %>% rownames() %>% as.character.Array() -> mis.simbolos
para.metaanal[['GSE82227']] <- data.frame(symbols= mis.simbolos ,padj = mis.padjs, log2FoldChange = mis.lfcs, baseMean = mis.means, row.names = mis.filas)
################################################################################################
para.metaanal %>% glimpse()
## List of 3
## $ GSE100382:'data.frame': 12555 obs. of 4 variables:
## ..$ symbols : Factor w/ 12555 levels "1-Mar","1-Sep",..: 20 21 22 23 24 25 26 27 28 29 ...
## ..$ padj : num [1:12555] 0.415 0.32 0.306 NA 0.809 ...
## ..$ log2FoldChange: num [1:12555] -0.647 -0.609 -1.168 -1.322 0.349 ...
## ..$ baseMean : num [1:12555] 1.435 2.475 29.927 0.365 7.522 ...
## $ GSE55536 :'data.frame': 11951 obs. of 4 variables:
## ..$ symbols : Factor w/ 11951 levels "A1BG","A2M","A4GALT",..: 1 2 3 4 5 6 7 8 9 10 ...
## ..$ padj : num [1:11951] 5.60e-01 8.38e-01 1.19e-03 8.39e-05 2.83e-02 ...
## ..$ log2FoldChange: num [1:11951] 0.249 0.187 2.261 -0.753 -0.624 ...
## ..$ baseMean : num [1:11951] 3.05 120.03 5.53 12.53 5.71 ...
## $ GSE82227 :'data.frame': 33554 obs. of 4 variables:
## ..$ symbols : Factor w/ 33554 levels "ENSG00000000003",..: 1 2 3 4 5 6 7 8 9 10 ...
## ..$ padj : num [1:33554] 0.3361 0.9922 0.0872 0.8138 0.204 ...
## ..$ log2FoldChange: num [1:33554] -0.61979 -0.00326 0.64147 0.07929 -0.34192 ...
## ..$ baseMean : num [1:33554] 5.62 1021.9 268.13 124.42 19028.94 ...
para.metaanal$GSE82227
my.genes.symbols
GSE82227.con.symbols <- inner_join(para.metaanal$GSE82227,my.genes.symbols, by = c("symbols" = "ensembl_gene_id"))
para.metaanal[['GSE82227']] <- data.frame(symbols= GSE82227.con.symbols$hgnc_symbol ,padj = GSE82227.con.symbols$padj, log2FoldChange = GSE82227.con.symbols$log2FoldChange, baseMean = GSE82227.con.symbols$baseMean)
para.metaanal %>% glimpse()
## List of 3
## $ GSE100382:'data.frame': 12555 obs. of 4 variables:
## ..$ symbols : Factor w/ 12555 levels "1-Mar","1-Sep",..: 20 21 22 23 24 25 26 27 28 29 ...
## ..$ padj : num [1:12555] 0.415 0.32 0.306 NA 0.809 ...
## ..$ log2FoldChange: num [1:12555] -0.647 -0.609 -1.168 -1.322 0.349 ...
## ..$ baseMean : num [1:12555] 1.435 2.475 29.927 0.365 7.522 ...
## $ GSE55536 :'data.frame': 11951 obs. of 4 variables:
## ..$ symbols : Factor w/ 11951 levels "A1BG","A2M","A4GALT",..: 1 2 3 4 5 6 7 8 9 10 ...
## ..$ padj : num [1:11951] 5.60e-01 8.38e-01 1.19e-03 8.39e-05 2.83e-02 ...
## ..$ log2FoldChange: num [1:11951] 0.249 0.187 2.261 -0.753 -0.624 ...
## ..$ baseMean : num [1:11951] 3.05 120.03 5.53 12.53 5.71 ...
## $ GSE82227 :'data.frame': 31075 obs. of 4 variables:
## ..$ symbols : Factor w/ 24555 levels "","A1BG","A2M",..: 22606 4982 19029 2110 6414 3286 6687 6915 13191 20913 ...
## ..$ padj : num [1:31075] 0.3361 0.9922 0.0872 0.8138 0.204 ...
## ..$ log2FoldChange: num [1:31075] -0.61979 -0.00326 0.64147 0.07929 -0.34192 ...
## ..$ baseMean : num [1:31075] 5.62 1021.9 268.13 124.42 19028.94 ...
library(MetaVolcanoR)
meta_degs_combining <- combining_mv(diffexp = para.metaanal, metathr=0.05,metafc = "Median", pcriteria= 'padj', foldchangecol= 'log2FoldChange', genenamecol='symbols', collaps = T)
meta_degs_votecount <- votecount_mv(diffexp=para.metaanal, pcriteria='padj', foldchangecol= 'log2FoldChange', pvalue=0.01, foldchange=1, genenamecol='symbols',metathr=1,collaps=T)
mis.resultados <- meta_degs_combining@metaresult
rownames(mis.resultados) <- mis.resultados$symbols
colnames(mis.resultados) <- c("symbols", "padj", "log2FoldChange" , "idx" )
mis.resultados[c('padj','log2FoldChange')] -> meta.rest
library(EnhancedVolcano)
EnhancedVolcano(meta.rest,
lab = rownames(meta.rest),
x = 'log2FoldChange',
y = 'padj',
xlim = c(-4,4),
xlab = bquote(~Log[2]~ 'fold change'),
pCutoff = 10e-15,
FCcutoff = 1,
pointSize = 3.0,
labSize = 3.0,
colAlpha = .8,
legendPosition = 'right',
legendLabSize = 12,
legendIconSize = 4.0,
drawConnectors = TRUE,
widthConnectors = 0.2,
colConnectors = 'grey30')
meta_degs_votecount@metaresult
meta_degs_votecount@MetaVolcano
subset(meta.rest, (padj < 1e-5)&(log2FoldChange > 2)) %>% rownames()%>% as.character.Array() -> up0
subset(meta.rest, (padj < 1e-5)&(log2FoldChange < -1)) %>% rownames() %>% as.character.Array() -> down0
meta_degs_votecount@metaresult$degvcount %>% unique()
## [1] "2.Up-regulated" "0.Down-regulated" "1.Unperturbed"
votes <- meta_degs_votecount@metaresult
subset(votes, degvcount == "2.Up-regulated")$symbols %>% as.character.Array() -> up1
subset(votes, degvcount == "0.Down-regulated")$symbols %>% as.character.Array() -> down1
mis.up <- intersect(up0, up1) %>% sort()
mis.down <- intersect(down0, down1)
Clusters en studio GSE55536
library(Biobase)
library(stringr)
library(DESeq2)
selected.ready.set = datasets.ready$GSE55536
selected.ready.set %>% exprs() %>% round() -> countdata #Por qué redondear
coldata <- pData(selected.ready.set)
coldata$title %>% as.character() %>% str_replace_all('.*(M1|M2).*','stimulated') %>% str_replace_all('.*rep.*','control') -> coldata$title
ddsMat <- DESeqDataSetFromMatrix(countData = countdata,
colData = coldata,
design = ~ title)
keep <- rowSums(counts(ddsMat)) > 1
dds <- ddsMat[keep,]
dds <- estimateSizeFactors(dds)
dds %>% assay() %>% head(2)
## GSM1338794 GSM1338795 GSM1338796 GSM1338797 GSM1338798 GSM1338799
## A1BG 3 2 2 2 2 2
## A2M 222 140 94 124 154 88
## GSM1338800 GSM1338801 GSM1338802 GSM1338803 GSM1338804 GSM1338805
## A1BG 2 1 2 1 5 6
## A2M 0 0 0 0 120 111
## GSM1338806 GSM1338807 GSM1338808 GSM1338809 GSM1338810 GSM1338811
## A1BG 3 3 3 3 1 3
## A2M 64 94 79 109 152 158
## GSM1338812 GSM1338813 GSM1338814 GSM1338815 GSM1925959 GSM1925960
## A1BG 7 5 3 4 4 2
## A2M 54 164 70 110 202 212
## GSM1925961 GSM1925962 GSM1925963 GSM1925964 GSM1925965 GSM1925966
## A1BG 4 1 1 4 3 6
## A2M 244 0 0 547 201 53
## GSM1925967 GSM1925968 GSM1925969
## A1BG 2 6 4
## A2M 25 303 177
The transformation rlog is useful when checking for outliers or as input for machine-learning techniques.
rld <- rlog(dds, blind = T)
head(assay(rld), 3)
## GSM1338794 GSM1338795 GSM1338796 GSM1338797 GSM1338798 GSM1338799
## A1BG 1.5154054 1.2703177 1.261567 1.281655 1.2717388 1.2730265
## A2M 7.2551483 6.7745657 6.367324 6.672748 6.8733804 6.3231464
## A4GALT 0.4486667 0.4462147 2.050554 1.921290 0.4471108 0.4479227
## GSM1338800 GSM1338801 GSM1338802 GSM1338803 GSM1338804 GSM1338805
## A1BG 1.338073 1.013382 1.337116 1.013349 1.9153102 2.0658371
## A2M 3.762988 3.759428 3.762598 3.759412 6.6703880 6.5966831
## A4GALT 1.318446 1.308824 1.811264 1.579819 0.4644316 0.9353351
## GSM1338806 GSM1338807 GSM1338808 GSM1338809 GSM1338810 GSM1338811
## A1BG 1.5541342 1.5419504 1.566525 1.559669 0.9980332 1.556639
## A2M 6.0811771 6.4277961 6.296657 6.599101 6.9278184 6.968060
## A4GALT 0.9418591 0.4634461 3.098864 3.021129 2.2617316 3.016629
## GSM1338812 GSM1338813 GSM1338814 GSM1338815 GSM1925959 GSM1925960
## A1BG 2.197760 1.8891060 1.5501390 1.736408 1.712720 1.264301
## A2M 5.917214 6.9503440 6.1588597 6.572502 7.154551 7.189030
## A4GALT 1.279721 0.9147644 0.9386108 0.927088 1.246817 3.521241
## GSM1925961 GSM1925962 GSM1925963 GSM1925964 GSM1925965 GSM1925966
## A1BG 1.6817191 1.0057500 1.007113 1.7193671 1.503986 2.001921
## A2M 7.3060139 3.7556161 3.756295 8.2378664 7.133240 5.824421
## A4GALT 0.8858898 0.9522876 1.300302 0.4513675 1.714973 2.987765
## GSM1925967 GSM1925968 GSM1925969
## A1BG 1.229633 2.0290831 1.687741
## A2M 5.169843 7.5846292 6.981541
## A4GALT 4.225060 0.4486102 1.485671
Extracción de genes UP desde metaanalisis
datos.exprs.norm <- assay(rld) %>% as.data.frame()
rownames(datos.exprs.norm) %in% mis.up -> bool.up
datos.exprs.norm[bool.up,] -> exprs.up
Clustering
km.res <- kmeans(exprs.up, centers = 5, iter.max = 10, nstart = 1)
dd <- cbind(exprs.up, cluster = km.res$cluster)
head(dd)
library(factoextra)
fviz_cluster(km.res,data = exprs.up, labelsize = 5)
res.hk <-hkmeans(exprs.up, 5)
fviz_cluster(res.hk,labelsize = 5)