Notebook actualizado al 04-junio-2020
Video:
Comienza en 11:03. Clase 01 junio 2020: Queries tipo SQL & expresión diferencial:
Descargar y descomprimir la base de datos
library(GEOquery)
library(SRAdb)
library(DBI)
setwd("~/clase_01_junio_2020")
#download.file('https://gbnci-abcc.ncifcrf.gov/backup/SRAmetadb.sqlite.gz',destfile = 'SRAmetadb.sqlite.gz')
#gunzip(filename= 'SRAmetadb.sqlite.gz', destname = 'SRAmetadb.sqlite', overwrite = T, remove = F)
setwd("~/clase_01_junio_2020")
library(GEOquery)
library(SRAdb)
library(DBI)
sqlfile <- "/home/rstudio/clase_01_junio_2020/SRAmetadb.sqlite" #catálogo!!!!!
sra_con <- dbConnect(SQLite(),sqlfile)
#Interrogar al catálogo
sra_tables <- dbListTables(sra_con)
sra_tables
## [1] "col_desc" "experiment" "fastq" "metaInfo"
## [5] "run" "sample" "sra" "sra_ft"
## [9] "sra_ft_content" "sra_ft_segdir" "sra_ft_segments" "study"
## [13] "submission"
dbListFields(sra_con,"submission")
## [1] "submission_ID" "submission_alias" "submission_accession"
## [4] "submission_comment" "files" "broker_name"
## [7] "center_name" "lab_name" "submission_date"
## [10] "sra_link" "submission_url_link" "xref_link"
## [13] "submission_entrez_link" "ddbj_link" "ena_link"
## [16] "submission_attribute" "sradb_updated"
Desde una Run Accesion
#Una query (consulta) tipo sql
dbGetQuery(
sra_con, paste( "SELECT submission_alias, lab_name,broker_name,submission_comment,sradb_updated
FROM submission
WHERE","submission_accession
LIKE 'SRA114259'",sep=" "))
archivos.relacionados <- listSRAfile( c("SRA114259"), sra_con, fileType ='sra')
archivos.relacionados
Obtener otros campos
conversion <- sraConvert(c('SRA114259'), sra_con = sra_con )
archivos.relacionados <- apply(conversion, 2, unique)
archivos.relacionados
## $submission
## [1] "SRA114259"
##
## $study
## [1] "SRP191205" "SRP028271" "SRP046387" "SRP040281" "SRP038896" "SRP033351"
## [7] "SRP228460"
##
## $sample
## [1] "SRS5607937" "SRS5608040" "SRS5607923" "SRS5607949" "SRS5607933"
## [6] "SRS5608057" "SRS5608055" "SRS5607936" "SRS5608037" "SRS5608046"
## [11] "SRS5607931" "SRS5608058" "SRS508575" "SRS5607935" "SRS5608039"
## [16] "SRS5607930" "SRS5607942" "SRS5608052" "SRS5608056" "SRS5608041"
## [21] "SRS5608044" "SRS5608051" "SRS5608036" "SRS508581" "SRS5607957"
## [26] "SRS5607920" "SRS5607934" "SRS508569" "SRS508572" "SRS5608061"
## [31] "SRS5608064" "SRS5607926" "SRS5608038" "SRS5607932" "SRS5608048"
## [36] "SRS5607928" "SRS508576" "SRS5607922" "SRS5607946" "SRS5607925"
## [41] "SRS5607927" "SRS508573" "SRS5608053" "SRS5607918" "SRS5607938"
## [46] "SRS5608045" "SRS5607939" "SRS5607953" "SRS508570" "SRS5608035"
## [51] "SRS5607956" "SRS5607963" "SRS5607954" "SRS5607952" "SRS5607943"
## [56] "SRS5608071" "SRS5608054" "SRS508571" "SRS5607940" "SRS5608043"
## [61] "SRS5607947" "SRS5608060" "SRS5608063" "SRS5608050" "SRS5607965"
## [66] "SRS5607959" "SRS5607960" "SRS5607962" "SRS5607941" "SRS5607919"
## [71] "SRS5608049" "SRS508574" "SRS508577" "SRS5607924" "SRS5607950"
## [76] "SRS508579" "SRS5607951" "SRS508580" "SRS5608072" "SRS5608042"
## [81] "SRS5607955" "SRS5607929" "SRS5607944" "SRS508582" "SRS5607921"
## [86] "SRS5608067" "SRS5608065" "SRS5608074" "SRS5608059" "SRS508567"
## [91] "SRS508568" "SRS5607948" "SRS5608077" "SRS5607945" "SRS508578"
## [96] "SRS5608070" "SRS5608073" "SRS5608069" "SRS5608047" "SRS5607958"
## [101] "SRS5607961" "SRS5608066" "SRS5607964" "SRS5608068"
##
## $experiment
## [1] "SRX7095476" "SRX7095580" "SRX7095462" "SRX7095488" "SRX7095472"
## [6] "SRX7095597" "SRX7095595" "SRX7095475" "SRX7095577" "SRX7095586"
## [11] "SRX7095470" "SRX7095598" "SRX384353" "SRX7095474" "SRX7095579"
## [16] "SRX7095469" "SRX7095481" "SRX7095592" "SRX7095596" "SRX7095581"
## [21] "SRX7095584" "SRX7095591" "SRX7095576" "SRX384360" "SRX7095496"
## [26] "SRX7095459" "SRX7095473" "SRX384348" "SRX384350" "SRX7095601"
## [31] "SRX7095604" "SRX7095465" "SRX7095578" "SRX7095471" "SRX7095588"
## [36] "SRX7095467" "SRX384354" "SRX7095461" "SRX7095485" "SRX7095464"
## [41] "SRX7095466" "SRX384352" "SRX7095593" "SRX7095457" "SRX7095477"
## [46] "SRX7095585" "SRX7095478" "SRX7095493" "SRX384347" "SRX7095575"
## [51] "SRX7095497" "SRX7095503" "SRX7095494" "SRX7095492" "SRX7095482"
## [56] "SRX7095611" "SRX7095594" "SRX384349" "SRX7095479" "SRX7095583"
## [61] "SRX7095486" "SRX7095600" "SRX7095603" "SRX7095590" "SRX7095505"
## [66] "SRX7095499" "SRX7095500" "SRX7095502" "SRX7095480" "SRX7095458"
## [71] "SRX7095589" "SRX384351" "SRX384356" "SRX7095463" "SRX7095490"
## [76] "SRX384357" "SRX7095491" "SRX384358" "SRX7095612" "SRX7095582"
## [81] "SRX7095495" "SRX7095468" "SRX7095483" "SRX384359" "SRX7095460"
## [86] "SRX7095607" "SRX7095605" "SRX7095614" "SRX7095599" "SRX384346"
## [91] "SRX384345" "SRX7095487" "SRX7095617" "SRX7095484" "SRX384355"
## [96] "SRX7095610" "SRX7095613" "SRX7095489" "SRX7095609" "SRX7095587"
## [101] "SRX7095498" "SRX7095501" "SRX7095606" "SRX7095504" "SRX7095608"
##
## $run
## [1] "SRR1039511" "SRR1039520" "SRR1039512" "SRR1039509" "SRR1039522"
## [6] "SRR1039521" "SRR1039516" "SRR1039510" "SRR1039523" "SRR1039513"
## [11] "SRR1039517" "SRR1039515" "SRR1039508" "SRR1039519" "SRR1039514"
## [16] "SRR1039518"
estudios.relacionados <- archivos.relacionados$study
estudios.relacionados
## [1] "SRP191205" "SRP028271" "SRP046387" "SRP040281" "SRP038896" "SRP033351"
## [7] "SRP228460"
dbListFields(sra_con,"study")
## [1] "study_ID" "study_alias" "study_accession"
## [4] "study_title" "study_type" "study_abstract"
## [7] "broker_name" "center_name" "center_project_name"
## [10] "study_description" "related_studies" "primary_study"
## [13] "sra_link" "study_url_link" "xref_link"
## [16] "study_entrez_link" "ddbj_link" "ena_link"
## [19] "study_attribute" "submission_accession" "sradb_updated"
dbListFields(sra_con,"run")
## [1] "run_ID" "bamFile" "run_alias"
## [4] "run_accession" "broker_name" "instrument_name"
## [7] "run_date" "run_file" "run_center"
## [10] "total_data_blocks" "experiment_accession" "experiment_name"
## [13] "sra_link" "run_url_link" "xref_link"
## [16] "run_entrez_link" "ddbj_link" "ena_link"
## [19] "run_attribute" "submission_accession" "sradb_updated"
mi.studio.1 <-dbGetQuery(
sra_con,
paste("SELECT study_alias,study_accession,study_title,study_abstract,study_description,study_type",
"FROM study
WHERE", "study_accession
LIKE 'SRP191205'"
,sep=" "))
mi.studio.1
Obtención de estudios desde keywords buscar otras bases de datos: data-base query sql proteomics -> sqlite + R
mis.studios <-
dbGetQuery(
sra_con,
paste("SELECT center_name,broker_name,study_alias,study_accession,study_title,study_abstract,study_description,study_type",
"FROM study WHERE", "((study_title LIKE '%transcriptomic%' AND
study_title LIKE '%characterization%' AND
study_title LIKE '%ipsc%Macrophages%')
OR
(study_title LIKE '%monocyte-derived%' AND
study_title LIKE '%TLR2/1 ligand%' AND
study_title LIKE '%MDMs%')
OR
(study_title LIKE '%Type I IFNs%' AND
study_title LIKE '%Macrophages%' AND
study_title LIKE '%Epigenomic Landscape%'))
AND
(study_type LIKE 'transcr%')"
,sep=" "))
mis.studios
mis.studios$study_alias
## [1] "GSE55536" "GSE82227" "GSE100382"
RNAseqs <- mis.studios$study_alias
RNAseqs
## [1] "GSE55536" "GSE82227" "GSE100382"
Es particular para GEO
Primer estudio
RNA-seq GSE100382 ##################
#En geo esto es solo metadata: nombres de las muestas, protocolos, etc. no pero datos de expresión.
library(GEOquery)
library(dplyr)
library(Biobase)
selected.set = 'GSE100382'
i = which(selected.set == RNAseqs)
downloaded.sets = list()
downloaded.sets[[i]] = getGEO(RNAseqs[i], GSEMatrix =TRUE, getGPL= T, AnnotGPL = T)
my.RNAseq = downloaded.sets[[i]]
sampleNames = my.RNAseq[[1]]$title
sampleNamesGEO = sampleNames(phenoData(my.RNAseq[[1]]))
data.frame(sampleNames,sampleNamesGEO) %>% arrange(sampleNames) -> names.to.comvert
#Bajamos los datos de expresión
library(stringr)
library(tibble)
#setwd("~/RNAseq_meta_analysis")
manydirectories = list.dirs()
directorynames = basename(manydirectories)
if(selected.set %in% directorynames){unlink(selected.set, recursive = TRUE)}
files = getGEOSuppFiles('GSE100382')
rownames(files) %>% str_detect('.txt.gz') -> my.txt.gz
rownames(files)[my.txt.gz] %>% str_replace_all('.gz','') -> expr.file.path
gunzip(rownames(files)[my.txt.gz])
#Igualar nombres de las muestras, que son los nombres de las columnas de la matriz de expresióm
seqdata = read.table(expr.file.path,header = T, sep = '\t')
first.col.genes = colnames(seqdata)[1]
seqdata = seqdata[!duplicated(seqdata[first.col.genes]), ]
seqdata%>%remove_rownames()%>%column_to_rownames(first.col.genes)%>%data.matrix()%>% as.matrix() -> rna.seq.exprs.mat
rna.seq.exprs.mat[,order(colnames(rna.seq.exprs.mat))] -> rna.seq.exprs.mat
cols.main.data = as.character(names.to.comvert$sampleNames)
cols.supplem.data = as.character(colnames(rna.seq.exprs.mat))
if (cols.main.data!=cols.supplem.data){print('Nombres de columnas diferentes!')} else {print('Cols OK')}
## [1] "Nombres de columnas diferentes!"
###########################################################################
names.to.comvert$sampleNames <- as.character(colnames(rna.seq.exprs.mat))
cols.main.data = as.character(names.to.comvert$sampleNames)
cols.supplem.data = as.character(colnames(rna.seq.exprs.mat))
if (cols.main.data!=cols.supplem.data){print('Nombres de columnas diferentes!')} else {print('Cols OK')}
## [1] "Cols OK"
##########################################################################
colnames(rna.seq.exprs.mat) <-names.to.comvert$sampleNamesGEO
#Integración de la metadata con los datos de expresión
ok.data = rna.seq.exprs.mat[, (rownames(as.data.frame(my.RNAseq[[1]])))]
datasets.ready = list()
datasets.ready[[RNAseqs[i]]] = ExpressionSet(assayData = ok.data,
phenoData = phenoData(my.RNAseq[[1]]),
featureData = annotatedDataFrameFrom(ok.data, byrow=TRUE),
Annotation = annotation(my.RNAseq[[1]]),
experimentData = experimentData(my.RNAseq[[1]]),
protocolData = protocolData(my.RNAseq[[1]]))
datasets.ready %>% glimpse()
## List of 1
## $ GSE100382:Formal class 'ExpressionSet' [package "Biobase"] with 7 slots
## .. ..@ experimentData :Formal class 'MIAME' [package "Biobase"] with 13 slots
## .. ..@ assayData :<environment: 0x5632efa77ea0>
## .. ..@ phenoData :Formal class 'AnnotatedDataFrame' [package "Biobase"] with 4 slots
## .. ..@ featureData :Formal class 'AnnotatedDataFrame' [package "Biobase"] with 4 slots
## .. ..@ annotation : chr(0)
## .. ..@ protocolData :Formal class 'AnnotatedDataFrame' [package "Biobase"] with 4 slots
## .. ..@ .__classVersion__:Formal class 'Versions' [package "Biobase"] with 1 slot
Segundo estudio
RNA-seq GSE55536 #####################
library(GEOquery)
library(dplyr)
library(Biobase)
selected.set = 'GSE55536'
i = which(selected.set == RNAseqs)
downloaded.sets = list()
downloaded.sets[[i]] = getGEO(RNAseqs[i], GSEMatrix =TRUE, getGPL= T, AnnotGPL = T)
my.RNAseq = downloaded.sets[[i]]
sampleNames = my.RNAseq[[1]]$title
sampleNamesGEO = sampleNames(phenoData(my.RNAseq[[1]]))
data.frame(sampleNames,sampleNamesGEO) %>% arrange(sampleNames) -> names.to.comvert
library(stringr)
library(tibble)
#setwd("~/RNAseq_meta_analysis")
manydirectories = list.dirs()
directorynames = basename(manydirectories)
if(selected.set %in% directorynames){unlink(selected.set, recursive = TRUE)}
files = getGEOSuppFiles(selected.set)
rownames(files) %>% str_detect('.txt.gz') -> my.txt.gz
rownames(files)[my.txt.gz] %>% str_replace_all('.gz','') -> expr.file.path
gunzip(rownames(files)[my.txt.gz])
seqdata = read.table(expr.file.path,header = T, sep = '\t')
first.col.genes = colnames(seqdata)[1]
seqdata = seqdata[!duplicated(seqdata[first.col.genes]), ]
seqdata%>%remove_rownames()%>%column_to_rownames(first.col.genes)%>%data.matrix()%>% as.matrix() -> rna.seq.exprs.mat
rna.seq.exprs.mat[,order(colnames(rna.seq.exprs.mat))] -> rna.seq.exprs.mat
cols.main.data = as.character(names.to.comvert$sampleNames)
cols.supplem.data = as.character(colnames(rna.seq.exprs.mat))
if (cols.main.data!=cols.supplem.data){print('Nombres de columnas diferentes!')} else {print('Cols OK')}
## [1] "Nombres de columnas diferentes!"
###########################################################################
colnames(rna.seq.exprs.mat) <- as.character(names.to.comvert$sampleNames)
###########################################################################
cols.main.data = as.character(names.to.comvert$sampleNames)
cols.supplem.data = as.character(colnames(rna.seq.exprs.mat))
if (cols.main.data!=cols.supplem.data){print('Nombres de columnas diferentes!')} else {print('Cols OK')}
## [1] "Cols OK"
##########################################################################
colnames(rna.seq.exprs.mat) <-names.to.comvert$sampleNamesGEO
##########################################################################
ok.data = rna.seq.exprs.mat[, ( rownames(as.data.frame(my.RNAseq[[1]])) )]
###################################################################################################
#datasets.ready = list()
###################################################################################################
datasets.ready[[RNAseqs[i]]] = ExpressionSet(assayData = ok.data,
phenoData = phenoData(my.RNAseq[[1]]),
featureData = annotatedDataFrameFrom(ok.data, byrow=TRUE),
Annotation = annotation(my.RNAseq[[1]]),
experimentData = experimentData(my.RNAseq[[1]]),
protocolData = protocolData(my.RNAseq[[1]]))
datasets.ready %>% glimpse()
## List of 2
## $ GSE100382:Formal class 'ExpressionSet' [package "Biobase"] with 7 slots
## .. ..@ experimentData :Formal class 'MIAME' [package "Biobase"] with 13 slots
## .. ..@ assayData :<environment: 0x5632efa77ea0>
## .. ..@ phenoData :Formal class 'AnnotatedDataFrame' [package "Biobase"] with 4 slots
## .. ..@ featureData :Formal class 'AnnotatedDataFrame' [package "Biobase"] with 4 slots
## .. ..@ annotation : chr(0)
## .. ..@ protocolData :Formal class 'AnnotatedDataFrame' [package "Biobase"] with 4 slots
## .. ..@ .__classVersion__:Formal class 'Versions' [package "Biobase"] with 1 slot
## $ GSE55536 :Formal class 'ExpressionSet' [package "Biobase"] with 7 slots
## .. ..@ experimentData :Formal class 'MIAME' [package "Biobase"] with 13 slots
## .. ..@ assayData :<environment: 0x5632f17b6148>
## .. ..@ phenoData :Formal class 'AnnotatedDataFrame' [package "Biobase"] with 4 slots
## .. ..@ featureData :Formal class 'AnnotatedDataFrame' [package "Biobase"] with 4 slots
## .. ..@ annotation : chr(0)
## .. ..@ protocolData :Formal class 'AnnotatedDataFrame' [package "Biobase"] with 4 slots
## .. ..@ .__classVersion__:Formal class 'Versions' [package "Biobase"] with 1 slot
Tercer estudio
RNA-seq GSE82227 ##################
library(GEOquery)
library(dplyr)
library(Biobase)
selected.set = 'GSE82227'
i = which(selected.set == RNAseqs)
downloaded.sets = list()
downloaded.sets[[i]] = getGEO(RNAseqs[i], GSEMatrix =TRUE, getGPL= T, AnnotGPL = T)
my.RNAseq = downloaded.sets[[i]]
sampleNames = my.RNAseq[[1]]$title
sampleNamesGEO = sampleNames(phenoData(my.RNAseq[[1]]))
data.frame(sampleNames,sampleNamesGEO) %>% arrange(sampleNames) -> names.to.comvert
library(stringr)
library(tibble)
#etwd("~/RNAseq_meta_analysis")
manydirectories = list.dirs()
directorynames = basename(manydirectories)
if(selected.set %in% directorynames){unlink(selected.set, recursive = TRUE)}
files = getGEOSuppFiles(selected.set)
rownames(files) %>% str_detect('.gz') -> my.txt.gz
rownames(files)[my.txt.gz] %>% str_replace_all('.gz','') -> expr.file.path
gunzip(rownames(files)[my.txt.gz])
library(readr)
#############################################################################
#seqdata = read.table(expr.file.path,header = T, sep = '\t')
################################################################################
seqdata = read_csv(expr.file.path)
first.col.genes = colnames(seqdata)[1]
seqdata = seqdata[!duplicated(seqdata[first.col.genes]), ]
seqdata%>%remove_rownames()%>%column_to_rownames(first.col.genes)%>%data.matrix()%>% as.matrix() -> rna.seq.exprs.mat
rna.seq.exprs.mat[,order(colnames(rna.seq.exprs.mat))] -> rna.seq.exprs.mat
cols.main.data = as.character(names.to.comvert$sampleNames)
cols.supplem.data = as.character(colnames(rna.seq.exprs.mat))
if (cols.main.data!=cols.supplem.data){print('Nombres de columnas diferentes!')} else {print('Cols OK')}
## [1] "Nombres de columnas diferentes!"
###########################################################################
colnames(rna.seq.exprs.mat) <- as.character(names.to.comvert$sampleNames)
###########################################################################
cols.main.data = as.character(names.to.comvert$sampleNames)
cols.supplem.data = as.character(colnames(rna.seq.exprs.mat))
if (cols.main.data!=cols.supplem.data){print('Nombres de columnas diferentes!')} else {print('Cols OK')}
## [1] "Cols OK"
##########################################################################
colnames(rna.seq.exprs.mat) <-names.to.comvert$sampleNamesGEO
##########################################################################
ok.data = rna.seq.exprs.mat[, ( rownames(as.data.frame(my.RNAseq[[1]])) )]
###################################################################################################
#datasets.ready = list()
###################################################################################################
datasets.ready[[RNAseqs[i]]] = ExpressionSet(assayData = ok.data,
phenoData = phenoData(my.RNAseq[[1]]),
featureData = annotatedDataFrameFrom(ok.data, byrow=TRUE),
Annotation = annotation(my.RNAseq[[1]]),
experimentData = experimentData(my.RNAseq[[1]]),
protocolData = protocolData(my.RNAseq[[1]]))
datasets.ready %>% glimpse()
## List of 3
## $ GSE100382:Formal class 'ExpressionSet' [package "Biobase"] with 7 slots
## .. ..@ experimentData :Formal class 'MIAME' [package "Biobase"] with 13 slots
## .. ..@ assayData :<environment: 0x5632efa77ea0>
## .. ..@ phenoData :Formal class 'AnnotatedDataFrame' [package "Biobase"] with 4 slots
## .. ..@ featureData :Formal class 'AnnotatedDataFrame' [package "Biobase"] with 4 slots
## .. ..@ annotation : chr(0)
## .. ..@ protocolData :Formal class 'AnnotatedDataFrame' [package "Biobase"] with 4 slots
## .. ..@ .__classVersion__:Formal class 'Versions' [package "Biobase"] with 1 slot
## $ GSE55536 :Formal class 'ExpressionSet' [package "Biobase"] with 7 slots
## .. ..@ experimentData :Formal class 'MIAME' [package "Biobase"] with 13 slots
## .. ..@ assayData :<environment: 0x5632f17b6148>
## .. ..@ phenoData :Formal class 'AnnotatedDataFrame' [package "Biobase"] with 4 slots
## .. ..@ featureData :Formal class 'AnnotatedDataFrame' [package "Biobase"] with 4 slots
## .. ..@ annotation : chr(0)
## .. ..@ protocolData :Formal class 'AnnotatedDataFrame' [package "Biobase"] with 4 slots
## .. ..@ .__classVersion__:Formal class 'Versions' [package "Biobase"] with 1 slot
## $ GSE82227 :Formal class 'ExpressionSet' [package "Biobase"] with 7 slots
## .. ..@ experimentData :Formal class 'MIAME' [package "Biobase"] with 13 slots
## .. ..@ assayData :<environment: 0x5632f3ae0608>
## .. ..@ phenoData :Formal class 'AnnotatedDataFrame' [package "Biobase"] with 4 slots
## .. ..@ featureData :Formal class 'AnnotatedDataFrame' [package "Biobase"] with 4 slots
## .. ..@ annotation : chr(0)
## .. ..@ protocolData :Formal class 'AnnotatedDataFrame' [package "Biobase"] with 4 slots
## .. ..@ .__classVersion__:Formal class 'Versions' [package "Biobase"] with 1 slot
Hasta ahora bajamos tablas de conteo (forma pre-procesada, ya con el mapeo listo! del RNA-seq). En nuestro caso los tres son illumina.
Si sabemos que las normalizaciones no son comparables hay que bajar el dato crudo.
Los ‘reads’ son datos crudos, que nosotros no hemos bajado ni bajamos a trabajar por requieren mucho tiempo y recursos.
Pre-procesamiento (mínimo y no recomendable, sólo fines proof-of-concept) y análisis de expresión diferencial
RNA-seq GSE100382 ##################
#Terminar con números enteros positivos >=0
library(Biobase)
selected.ready.set <- datasets.ready$GSE100382
selected.ready.set %>% exprs() %>% round() -> countdata
head(countdata,2)
## GSM2679941 GSM2679942 GSM2679943 GSM2679944 GSM2679945 GSM2679946
## A1BG 3 2 1 2 1 0
## A1BG-AS1 3 3 3 2 1 1
## GSM2679947 GSM2679948 GSM2679949 GSM2679950 GSM2679951 GSM2679952
## A1BG 2 2 2 1 1 2
## A1BG-AS1 4 3 4 3 2 3
## GSM2679953 GSM2679954 GSM2679955 GSM2679956 GSM2679957 GSM2679958
## A1BG 1 1 1 1 1 1
## A1BG-AS1 3 2 2 2 1 2
## GSM2679959 GSM2679960 GSM2679961 GSM2679962 GSM2679963 GSM2679964
## A1BG 2 1 2 2 1 2
## A1BG-AS1 3 4 3 2 2 2
library(stringr)
library(DESeq2)
coldata <- pData(selected.ready.set)
coldata$title
## V2 V3 V4 V5 V6
## RNA_N_rep1 RNA_N_rep2 RNA_N_rep3 RNA_L_rep1 RNA_L_rep2
## V7 V8 V9 V10 V11
## RNA_L_rep3 RNA_T_rep1 RNA_T_rep2 RNA_T_rep3 RNA_TL_rep1
## V12 V13 V14 V15 V16
## RNA_TL_rep2 RNA_TL_rep3 RNA_IFN_rep1 RNA_IFN_rep2 RNA_IFN_rep3
## V17 V18 V19 V20 V21
## RNA_IFN_L_rep1 RNA_IFN_L_rep2 RNA_IFN_L_rep3 RNA_IFN_T_rep1 RNA_IFN_T_rep2
## V22 V23 V24 V25
## RNA_IFN_T_rep3 RNA_IFN_TL_rep1 RNA_IFN_TL_rep2 RNA_IFN_TL_rep3
## 24 Levels: RNA_IFN_L_rep1 RNA_IFN_L_rep2 RNA_IFN_L_rep3 ... RNA_TL_rep3
coldata$title %>% as.character() %>% str_replace_all('.*(_L_|_TL_|_IFN_).*','stimulated') %>% str_replace_all('.*(_N_|_T_).*','control') -> coldata$title
coldata$title
## [1] "control" "control" "control" "stimulated" "stimulated"
## [6] "stimulated" "control" "control" "control" "stimulated"
## [11] "stimulated" "stimulated" "stimulated" "stimulated" "stimulated"
## [16] "stimulated" "stimulated" "stimulated" "stimulated" "stimulated"
## [21] "stimulated" "stimulated" "stimulated" "stimulated"
ddsMat <- DESeqDataSetFromMatrix(countData = countdata,
colData = coldata,
design = ~ title)
keep <- rowSums(counts(ddsMat)) > 1
dds <- ddsMat[keep,]
dds <- estimateSizeFactors(dds)
library(DESeq2)
DE.analysis <- DESeq(dds)
res0 <- results(DE.analysis, contrast = c('title','stimulated','control'))
Resul <- lfcShrink(DE.analysis, contrast = c('title','stimulated','control'), res=res0)
library(EnhancedVolcano)
EnhancedVolcano(Resul,
lab = rownames(Resul),
x = 'log2FoldChange',
y = 'pvalue',
xlim = c(-6,6),
xlab = bquote(~Log[2]~ 'fold change'),
pCutoff = 10e-10,
FCcutoff = 2.0,
pointSize = 2.0,
labSize = 4.0,
colAlpha = .8,
legendPosition = 'right',
legendLabSize = 12,
legendIconSize = 4.0,
drawConnectors = TRUE,
widthConnectors = 0.2,
colConnectors = 'grey30')
mis.DEGs.studies <- list()
mis.DEGs.studies[['GSE100382']] <- Resul
mis.DEGs.studies %>% .[['GSE100382']] %>% glimpse()
## Formal class 'DESeqResults' [package "DESeq2"] with 7 slots
## ..@ priorInfo :List of 4
## .. ..$ type : chr "normal"
## .. ..$ package : chr "DESeq2"
## .. ..$ version :Classes 'package_version', 'numeric_version' hidden list of 1
## .. ..$ betaPriorVar: Named num [1:3] 1.00e+06 2.05 2.05
## .. .. ..- attr(*, "names")= chr [1:3] "Intercept" "titlecontrol" "titlestimulated"
## ..@ rownames : chr [1:12555] "A1BG" "A1BG-AS1" "A2M" "A2M-AS1" ...
## ..@ nrows : int 12555
## ..@ listData :List of 6
## .. ..$ baseMean : num [1:12555] 1.435 2.475 29.927 0.365 7.522 ...
## .. ..$ log2FoldChange: num [1:12555] -0.647 -0.609 -1.168 -1.322 0.349 ...
## .. ..$ lfcSE : num [1:12555] 0.528 0.419 0.779 0.97 0.784 ...
## .. ..$ stat : num [1:12555] -1.231 -1.457 -1.489 -1.347 0.445 ...
## .. ..$ pvalue : num [1:12555] 0.218 0.145 0.136 0.178 0.656 ...
## .. ..$ padj : num [1:12555] 0.415 0.32 0.306 NA 0.809 ...
## ..@ elementType : chr "ANY"
## ..@ elementMetadata:Formal class 'DFrame' [package "S4Vectors"] with 6 slots
## ..@ metadata :List of 6
## .. ..$ filterThreshold: Named num 0.629
## .. .. ..- attr(*, "names")= chr "13.58508%"
## .. ..$ filterTheta : num 0.136
## .. ..$ filterNumRej :'data.frame': 50 obs. of 2 variables:
## .. ..$ lo.fit :List of 2
## .. ..$ alpha : num 0.1
## .. ..$ lfcThreshold : num 0
RNA-seq GSE55536 ##################
library(Biobase)
library(stringr)
selected.ready.set = datasets.ready$GSE55536
selected.ready.set %>% exprs() %>% round() -> countdata #Por qué redondear
coldata <- pData(selected.ready.set)
coldata$title
## V2 V3 V4 V5 V6
## HMDM MAC rep1 HMDM MAC rep2 HMDM M1 rep1 HMDM M1 rep2 HMDM M2 rep1
## V7 V8 V9 V10 V11
## HMDM M2 rep2 iPS rep1 iPS rep2 iPS rep3 iPS rep4
## V12 V13 V14 V15 V16
## IPSDM MAC rep1 IPSDM MAC rep2 IPSDM MAC rep3 IPSDM MAC rep4 IPSDM M1 rep1
## V17 V18 V19 V20 V21
## IPSDM M1 rep2 IPSDM M1 rep3 IPSDM M1 rep4 IPSDM M2 rep1 IPSDM M2 rep2
## V22 V23 V24 V25 V26
## IPSDM M2 rep3 IPSDM M2 rep4 HMDM MAC rep3 HMDM M1 rep3 HMDM M2 rep3
## V27 V28 V29 V30 V31
## iPS rep5 iPS rep6 IPSDM MAC rep5 IPSDM MAC rep6 IPSDM M1 rep5
## V32 V33 V34
## IPSDM M1 rep6 IPSDM M2 rep5 IPSDM M2 rep6
## 33 Levels: HMDM M1 rep1 HMDM M1 rep2 HMDM M1 rep3 HMDM M2 rep1 ... IPSDM MAC rep6
coldata$title %>% as.character() %>% str_replace_all('.*(M1|M2).*','stimulated') %>% str_replace_all('.*rep.*','control') -> coldata$title
coldata$title
## [1] "control" "control" "stimulated" "stimulated" "stimulated"
## [6] "stimulated" "control" "control" "control" "control"
## [11] "control" "control" "control" "control" "stimulated"
## [16] "stimulated" "stimulated" "stimulated" "stimulated" "stimulated"
## [21] "stimulated" "stimulated" "control" "stimulated" "stimulated"
## [26] "control" "control" "control" "control" "stimulated"
## [31] "stimulated" "stimulated" "stimulated"
library(DESeq2)
ddsMat <- DESeqDataSetFromMatrix(countData = countdata,
colData = coldata,
design = ~ title)
keep <- rowSums(counts(ddsMat)) > 1
dds <- ddsMat[keep,]
dds <- estimateSizeFactors(dds)
library(DESeq2)
DE.analysis <- DESeq(dds)
res0 <- results(DE.analysis, contrast = c('title','stimulated','control'))
Resul <- lfcShrink(DE.analysis, contrast = c('title','stimulated','control'), res=res0)
library(EnhancedVolcano)
EnhancedVolcano(Resul,
lab = rownames(Resul),
x = 'log2FoldChange',
y = 'pvalue',
xlim = c(-6,6),
xlab = bquote(~Log[2]~ 'fold change'),
pCutoff = 10e-10,
FCcutoff = 2.0,
pointSize = 2.0,
labSize = 4.0,
colAlpha = .8,
legendPosition = 'right',
legendLabSize = 12,
legendIconSize = 4.0,
drawConnectors = TRUE,
widthConnectors = 0.2,
colConnectors = 'grey30')
mis.DEGs.studies[['GSE55536']] <- Resul
mis.DEGs.studies %>% glimpse()
## List of 2
## $ GSE100382:Formal class 'DESeqResults' [package "DESeq2"] with 7 slots
## .. ..@ priorInfo :List of 4
## .. ..@ rownames : chr [1:12555] "A1BG" "A1BG-AS1" "A2M" "A2M-AS1" ...
## .. ..@ nrows : int 12555
## .. ..@ listData :List of 6
## .. ..@ elementType : chr "ANY"
## .. ..@ elementMetadata:Formal class 'DFrame' [package "S4Vectors"] with 6 slots
## .. ..@ metadata :List of 6
## $ GSE55536 :Formal class 'DESeqResults' [package "DESeq2"] with 7 slots
## .. ..@ priorInfo :List of 4
## .. ..@ rownames : chr [1:11951] "A1BG" "A2M" "A4GALT" "AAAS" ...
## .. ..@ nrows : int 11951
## .. ..@ listData :List of 6
## .. ..@ elementType : chr "ANY"
## .. ..@ elementMetadata:Formal class 'DFrame' [package "S4Vectors"] with 6 slots
## .. ..@ metadata :List of 6
RNA-seq GSE82227 ##################
library(Biobase)
library(stringr)
selected.ready.set = datasets.ready$GSE82227
selected.ready.set %>% exprs() %>% round() -> countdata #Por qué redondear
coldata <- pData(selected.ready.set)
coldata$characteristics_ch1.2
## V2 V3
## stimulation: media alone stimulation: TLR2/1 ligand
## V4 V5
## stimulation: interferon gamma stimulation: media alone
## V6 V7
## stimulation: TLR2/1 ligand stimulation: interferon gamma
## V8 V9
## stimulation: media alone stimulation: TLR2/1 ligand
## V10 V11
## stimulation: interferon gamma stimulation: media alone
## V12 V13
## stimulation: TLR2/1 ligand stimulation: interferon gamma
## V14 V15
## stimulation: media alone stimulation: TLR2/1 ligand
## V16 V17
## stimulation: interferon gamma stimulation: media alone
## V18 V19
## stimulation: TLR2/1 ligand stimulation: interferon gamma
## V20 V21
## stimulation: media alone stimulation: TLR2/1 ligand
## V22 V23
## stimulation: interferon gamma stimulation: media alone
## V24 V25
## stimulation: TLR2/1 ligand stimulation: interferon gamma
## V26 V27
## stimulation: media alone stimulation: TLR2/1 ligand
## V28 V29
## stimulation: interferon gamma stimulation: media alone
## V30 V31
## stimulation: TLR2/1 ligand stimulation: interferon gamma
## V32 V33
## stimulation: media alone stimulation: TLR2/1 ligand
## V34 V35
## stimulation: interferon gamma stimulation: media alone
## V36 V37
## stimulation: TLR2/1 ligand stimulation: interferon gamma
## V38 V39
## stimulation: media alone stimulation: TLR2/1 ligand
## V40 V41
## stimulation: interferon gamma stimulation: media alone
## V42 V43
## stimulation: TLR2/1 ligand stimulation: interferon gamma
## V44 V45
## stimulation: media alone stimulation: TLR2/1 ligand
## V46
## stimulation: interferon gamma
## 3 Levels: stimulation: interferon gamma ... stimulation: TLR2/1 ligand
coldata$characteristics_ch1.2 %>%as.character()%>%str_replace_all('.*alone.*','control') %>%
str_replace_all('.*(ligand|gamma).*','stimulated') -> coldata$my.experiments
coldata$my.experiments
## [1] "control" "stimulated" "stimulated" "control" "stimulated"
## [6] "stimulated" "control" "stimulated" "stimulated" "control"
## [11] "stimulated" "stimulated" "control" "stimulated" "stimulated"
## [16] "control" "stimulated" "stimulated" "control" "stimulated"
## [21] "stimulated" "control" "stimulated" "stimulated" "control"
## [26] "stimulated" "stimulated" "control" "stimulated" "stimulated"
## [31] "control" "stimulated" "stimulated" "control" "stimulated"
## [36] "stimulated" "control" "stimulated" "stimulated" "control"
## [41] "stimulated" "stimulated" "control" "stimulated" "stimulated"
library(DESeq2)
ddsMat <- DESeqDataSetFromMatrix(countData = countdata,
colData = coldata,
design = ~ my.experiments)
keep <- rowSums(counts(ddsMat)) > 1
dds <- ddsMat[keep,]
dds <- estimateSizeFactors(dds)
library(DESeq2)
DE.analysis <- DESeq(dds)
res0 <- results(DE.analysis, contrast = c('my.experiments','stimulated','control'))
Resul <- lfcShrink(DE.analysis, contrast = c('my.experiments','stimulated','control'), res=res0)
library(EnhancedVolcano)
EnhancedVolcano(Resul,
lab = rownames(Resul),
x = 'log2FoldChange',
y = 'pvalue',
xlim = c(-6,6),
xlab = bquote(~Log[2]~ 'fold change'),
pCutoff = 10e-10,
FCcutoff = 2.0,
pointSize = 2.0,
labSize = 2.0,
colAlpha = .8,
legendPosition = 'right',
legendLabSize = 12,
legendIconSize = 4.0,
drawConnectors = TRUE,
widthConnectors = 0.2,
colConnectors = 'grey30')
mis.DEGs.studies[['GSE82227']] <- Resul
mis.DEGs.studies %>% glimpse()
## List of 3
## $ GSE100382:Formal class 'DESeqResults' [package "DESeq2"] with 7 slots
## .. ..@ priorInfo :List of 4
## .. ..@ rownames : chr [1:12555] "A1BG" "A1BG-AS1" "A2M" "A2M-AS1" ...
## .. ..@ nrows : int 12555
## .. ..@ listData :List of 6
## .. ..@ elementType : chr "ANY"
## .. ..@ elementMetadata:Formal class 'DFrame' [package "S4Vectors"] with 6 slots
## .. ..@ metadata :List of 6
## $ GSE55536 :Formal class 'DESeqResults' [package "DESeq2"] with 7 slots
## .. ..@ priorInfo :List of 4
## .. ..@ rownames : chr [1:11951] "A1BG" "A2M" "A4GALT" "AAAS" ...
## .. ..@ nrows : int 11951
## .. ..@ listData :List of 6
## .. ..@ elementType : chr "ANY"
## .. ..@ elementMetadata:Formal class 'DFrame' [package "S4Vectors"] with 6 slots
## .. ..@ metadata :List of 6
## $ GSE82227 :Formal class 'DESeqResults' [package "DESeq2"] with 7 slots
## .. ..@ priorInfo :List of 4
## .. ..@ rownames : chr [1:33554] "ENSG00000000003" "ENSG00000000419" "ENSG00000000457" "ENSG00000000460" ...
## .. ..@ nrows : int 33554
## .. ..@ listData :List of 6
## .. ..@ elementType : chr "ANY"
## .. ..@ elementMetadata:Formal class 'DFrame' [package "S4Vectors"] with 6 slots
## .. ..@ metadata :List of 6
Después de correr todo el notebook hay que guardar todas las variable generadas en este notebook, esto se hace haciendo click en el diskette del ‘Environment’. El archivo guardado se debe cargar al comienzo del notebook del 04 de junio.