getwd() # Obtiene el directorio de trabajo
paste0(getwd(),'/hola') # Convierte getwkd() a caracter y concatena
file.path(getwd(),'hola') # Hace direcciones independientes del SO
file.path(getwd(),'hola','hola') # mtcars:
mpg: Miles/(US) gallon
cyl: Number of cylinders
disp: Displacement (cu.in.)
hp: Gross horsepower
drat: Rear axle ratio
wt: Weight (1000 lbs)
qsec: 1/4 mile time
vs: V/S
am: Transmission (0 = automatic, 1 = manual)
gear: Number of forward gears
carb: Number of carburetors
#install.packages("devtools", dependencies = TRUE)
#install.packages("curl", dependencies = TRUE)
#install.packages("tidyverse", dependencies = TRUE)
library(readr)
my.data <- mtcars
my.data # La data de mtcars# Exportando data
write_csv(my.data, path = 'mtcars.csv' ) # Coma Separated Value
write_tsv(my.data, path = 'mtcars.tsv') # Tab Separated Value
write_delim(my.data,path = 'mtcars.txt', delim = ";") # Texto con delimitador ";"
write_delim(my.data,path = 'mtcars.hola', delim = "*_*") # Texto con delimitador "*_*"
openxlsx::write.xlsx(my.data, file = "mtcars.xlsx") # Excellibrary(tibble) # Un tibble es una actualización del dataframe, para
rownames_to_column(my.data, var = "Car brand") -> my.data.2
library(magrittr) #
my.data %>% rownames_to_column(var = "Car brand")-> my.data.2
# Exportando data con pipes (%>%)
my.data.2 %>% write_csv('mtcars.csv') # Coma Separated Value
my.data.2 %>% write_tsv('mtcars.tsv') # Tabulation Separated Value
my.data.2 %>% write_delim('mtcars.txt', delim = ";")
my.data.2 %>% write_delim('mtcars.hola', delim = "*_*")
my.data.2 %>% openxlsx::write.xlsx("mtcars.xlsx") # Excel
# Lectura de los archivos
read_csv('mtcars.csv', skip = 0) ->mtcars.csv
read_tsv('mtcars.tsv', skip = 0) ->mtcars.tsv
read_delim('mtcars.txt', delim = ";", skip = 0) ->mtcars.txt
read_delim('mtcars.hola', delim = "*_*", skip = 0) ->mtcars.hola
read_excel('mtcars.xlsx', skip = 0) ->mtcars.xlsx
########Chequear igualdad###############
library(dplyr)
all_equal(mtcars.csv,mtcars.tsv) # La data del csv deberia ser igual a la del tsv## [1] TRUE
## [1] TRUE
## [1] TRUE
list(mtcars.csv, mtcars.tsv, mtcars.txt, mtcars.hola,mtcars.xlsx) %>% unique() %>% length() # TODO: porque da dos salidas? Una es una tabla y el otro es una consola?## [1] 2
R cuenta con multiples metodos para descargar archivos. El paquete utils usa programas del sistema como metodos para download.file(), como curl, wget, wininet, etc. los cuales pueden no estar instalados en el sistema. Por ejemplo, download.file(URL, method = "wget") usualmente falla en Windows. Especificando otro metodo, download.file(URL, method = "libcurl") funciona.
library(utils) # Usar download.file() para descargar
# Descargando data de Dropbox
my.dropbox.url <- 'https://www.dropbox.com/s/j3kiivpcbghpb4v/log2FC.csv'
#Para LINUX usar:
download.file(url= my.dropbox.url, destfile= 'log2FC.csv', method = "wget")
#Para WINDOWS usar:
download.file(url= my.dropbox.url, destfile= 'log2FC.csv', method = "libcurl")
hola.dropbox <- read_csv("log2FC.csv")
hola.dropbox # Abriendo la dataOtro paquete, httr, usa metodos del protocolo http independientes del sistema operativo. De aqui, usamos el metodo GET, asociado con la función del mismo nombre.
library(httr) # Usa GET() para descargar
# Descargando data de Github
my.github.url <- 'https://github.com/DeepenData/Computational-Biology-and-Bioinformatics/raw/master/Wang2018_supplemental_DEGs.xlsx'
#Para LINUX y WINDOMS usar:
GET(my.github.url, write_disk(tf <- tempfile(fileext = ".xlsx"))) ## Response [https://raw.githubusercontent.com/DeepenData/Computational-Biology-and-Bioinformatics/master/Wang2018_supplemental_DEGs.xlsx]
## Date: 2020-11-05 22:23
## Status: 200
## Content-Type: application/octet-stream
## Size: 75.6 kB
## <ON DISK> C:\Users\manuel\AppData\Local\Temp\RtmpKC5wr9\file219029032133.xlsx
Por motivos de compatibilidad, puede ser preferible usar GET(), pero en general los metodos de download.file() suelen ser más rapidos y la mayor parte de la gente en bioinformatica usa sistemas basados en Linux.
library(utils) # Usar download.file() para descargar
library(httr) # Usa GET() para descargar
# Sitio web arbitrario
my.supplementary.url <- 'https://www.pnas.org/highwire/filestream/794560/field_highwire_adjunct_files/0/pnas.1800165115.sd01.xlsx'
# Para LINUX y WINDOMS usar httr::GET()
GET(my.supplementary.url, # Dirección de descarga
write_disk(tf <- tempfile(fileext = ".xlsx"))) # Guardar como "tf" temporal
hola.supplementary <- read_excel(tf, skip = 1) # Lee "tf", omite linea 1
hola.supplementary # Abriendo la data
# Para LINUX USAR
download.file(url= my.supplementary.url, # Dirección de descarga
destfile= 'pnas.1800165115.sd01.xlsx', # Guardar como...
method = "wget") # Usando wget, incluido en Linux
hola.supplementary <- read_excel("pnas.1800165115.sd01.xlsx")
hola.supplementary # Abriendo la dataTambien, es posible omitir la parte intermedia de la descarga e inmediatamente guardar un objeto al ambiente de R desde la fuente web donde esta publicada.
hola.live <- read_csv('https://raw.githubusercontent.com/DeepenData/Computational-Biology-and-Bioinformatics/master/labels.csv')
hola.live # Abriendo la dataEsto es especialmente util si tenemos data accesible online que se actualiza frecuentemente, como las cifras actualizadas de Covid19 o un sistema de monitoreo IoT.
biomartr tutorials:
install.packages("BiocManager", dependencies = TRUE)
BiocManager::install()
BiocManager::install("Biostrings")
BiocManager::install("biomaRt")
install.packages("biomartr")#, dependencies = TRUE)library(biomartr)
library(magrittr)
#Objetos clase DNAStringSet
inmortal <- getGenome( db = "refseq", #Más bases de datos
organism = "Thermococcus gammatolerans",
path = file.path("_ncbi_downloads","genomes") ) %>% read_genome()
inmortal## DNAStringSet object of length 1:
## width seq names
## [1] 2045438 GTTGATTACCCAATCTTCGCCT...GGAAGATGATATTATAAGGCAG NC_012804.1 Therm...
Dato impresionante: se pueden leer genomas directamente desde el url del NCBI
url.NCBI <- 'ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/GCF_000001405.35_GRCh38.p9/GCF_000001405.35_GRCh38.p9_cds_from_genomic.fna.gz'
human <- read_genome(url.NCBI)
human %>% names() %>% sample(10)## [1] "lcl|NC_000003.12_cds_XP_011531769.1_22155 [gene=DNAH12] [db_xref=GeneID:201625] [protein=dynein heavy chain 12, axonemal isoform X2] [protein_id=XP_011531769.1] [location=complement(join(57293781..57293971,57295525..57295592,57296344..57296435,57296847..57296984,57301735..57301939,57309151..57309254,57309666..57309854,57310717..57310950,57314494..57314631,57322343..57322483,57323007..57323260,57323469..57323619,57334465..57334609,57334782..57334940,57352085..57352225,57357176..57357348,57363594..57363786,57366729..57366921,57368046..57368260,57375371..57375516,57375818..57375965,57376981..57377222,57379158..57379298,57380278..57380367,57382262..57382393,57384829..57385005,57385349..57385429,57386441..57386603,57387086..57387219,57391872..57392066,57394171..57394332,57403309..57403501,57404969..57405147,57405653..57405952,57408280..57408535,57413746..57413912,57415426..57415564,57419367..57419518,57421518..57421706,57425022..57425141,57428633..57428821,57429691..57429774,57433367..57433507,57433645..57433828,57436951..57437060,57444697..57444816,57445174..57445419,57446031..57446270,57446537..57446689,57452843..57453015,57453247..57453403,57454775..57454894,57457721..57458003,57458099..57458220,57459592..57459786,57461489..57461689,57462690..57462875,57468736..57468979,57470443..57470636,57471472..57471606,57472546..57472671,57483376..57483511,57489509..57489687,57501321..57501412,57502323..57502479,57504016..57504204,57507643..57507838,57508382..57508507,57509140..57509212,57510790..57510979,57523583..57523609,57523803..57523884,57542701..57542870))]"
## [2] "lcl|NC_000016.10_cds_XP_011521209.1_82341 [gene=CYLD] [db_xref=GeneID:1540] [protein=ubiquitin carboxyl-terminal hydrolase CYLD isoform X1] [protein_id=XP_011521209.1] [location=join(50749699..50750202,50751604..50751906,50754319..50754424,50776179..50776277,50777825..50777941,50779665..50780044,50781246..50781411,50782325..50782466,50784329..50784451,50786855..50786946,50787786..50787852,50791558..50791690,50792597..50792705,50793546..50793664,50794212..50794428,50796324..50796508)]"
## [3] "lcl|NC_000015.10_cds_NP_115622.2_79125 [gene=MEX3B] [db_xref=CCDS:CCDS10319.1,GeneID:84206] [protein=RNA-binding protein MEX3B] [protein_id=NP_115622.2] [location=complement(join(82043160..82044613,82045450..82045705))]"
## [4] "lcl|NC_000010.11_cds_XP_011517937.1_54729 [gene=SVIL] [db_xref=GeneID:6840] [protein=supervillin isoform X8] [protein_id=XP_011517937.1] [location=complement(join(29458247..29458333,29458434..29458589,29462277..29462401,29463492..29463635,29465595..29465750,29467742..29467875,29470276..29470483,29471138..29471243,29473838..29473989,29480537..29480813,29481584..29481728,29484656..29484831,29486085..29486230,29486410..29486557,29487163..29487299,29488601..29488756,29490847..29491019,29493214..29493391,29494914..29495000,29495092..29495181,29499116..29499263,29507737..29507814,29508342..29508389,29512735..29512861,29522410..29522635,29523451..29524027,29524472..29524715,29526961..29527056,29529705..29529844,29530607..29530668,29531254..29531288,29532002..29532172,29532529..29533458,29535989..29536069,29550597..29551263,29554783..29554934,29555051..29555058))]"
## [5] "lcl|NC_000010.11_cds_NP_056446.4_57188 [gene=TCTN3] [db_xref=CCDS:CCDS31258.2,GeneID:26123] [protein=tectonic-3 isoform a precursor] [protein_id=NP_056446.4] [location=complement(join(95664067..95664300,95680472..95680609,95682651..95682804,95683101..95683195,95683522..95683629,95684499..95684624,95685556..95685636,95686495..95686530,95687044..95687159,95687247..95687355,95687592..95687719,95692920..95693038,95693353..95693476,95693644..95693899))]"
## [6] "lcl|NT_187675.1_cds_NP_055034.2_114789 [gene=KIR2DL2] [db_xref=GeneID:3803] [protein=killer cell immunoglobulin-like receptor 2DL2 precursor] [exception=annotated by transcript or proteomic data] [protein_id=NP_055034.2] [location=join(53265..53298,54207..54242,56679..56978,58493..58786,62064..62114,66380..66481,66944..66996,67095..67271)]"
## [7] "lcl|NC_000010.11_cds_XP_006717924.1_55662 [gene=MBL2] [db_xref=GeneID:4153] [protein=mannose-binding protein C isoform X1] [protein_id=XP_006717924.1] [location=complement(join(52768137..52768510,52769247..52769315,52770670..52770786,52771449..52771635))]"
## [8] "lcl|NC_000008.11_cds_XP_011515883.1_47350 [gene=TCEB1] [db_xref=GeneID:6921] [protein=transcription elongation factor B polypeptide 1 isoform X1] [protein_id=XP_011515883.1] [location=complement(join(73946630..73946820,73955911..73956054,73959765..73959768))]"
## [9] "lcl|NC_000001.11_cds_XP_011539995.1_1239 [gene=ARHGEF10L] [db_xref=GeneID:55160] [protein=rho guanine nucleotide exchange factor 10-like protein isoform X6] [protein_id=XP_011539995.1] [location=join(17580596..17580632,17587460..17587645,17588446..17588479,17602127..17602218,17603508..17603591,17607799..17607977,17616094..17616202,17619339..17619445,17621864..17621941,17622996..17623175,17624387..17624503,17625956..17626048,17627330..17627503,17632321..17632466,17634835..17635016,17637888..17638003,17638562..17638689,17640202..17640302,17648554..17648675,17654636..17654722,17655879..17656102,17656554..17656708,17664447..17664595,17687573..17687747,17695158..17695280,17696848..17697380)]"
## [10] "lcl|NC_000020.11_cds_XP_016883353.1_99983 [gene=PHF20] [db_xref=GeneID:51230] [protein=PHD finger protein 20 isoform X1] [protein_id=XP_016883353.1] [location=join(35801523..35801605,35842573..35842744,35847350..35847434,35858302..35858381,35863013..35863400,35869438..35869551,35870955..35871134,35871650..35871829,35899370..35899648,35913249..35913347,35914033..35914197,35917484..35917662,35927780..35927879,35931249..35931444,35938697..35939108,35940864..35941047,35947485..35947627)]"
## DNAStringSet object of length 114967:
## width seq names
## [1] 918 ATGGTGACTGAATTCATTTT...ATTCTAGTGTAAAGTTTTAG lcl|NC_000001.11_...
## [2] 402 ATGAGTGACAGCATCAACTT...GACCCAGGCACAGGCATTAG lcl|NC_000001.11_...
## [3] 402 ATGAGTGACAGCATCAACTT...GACCCAGGCACAGGCATTAG lcl|NC_000001.11_...
## [4] 402 ATGAGTGACAGCATCAACTT...GACCCAGGCACAGGCATTAG lcl|NC_000001.11_...
## [5] 402 ATGAGTGACAGCATCAACTT...GACCCAGGCACAGGCATTAG lcl|NC_000001.11_...
## ... ... ...
## [114963] 297 ATGCCCCTCATTTACATAAA...TAAACCTACTCCAATGCTAA lcl|NC_012920.1_c...
## [114964] 1378 ATGCTAAAACTAATCGTCCC...CATTACCGGGTTTTCCTCTT lcl|NC_012920.1_c...
## [114965] 1812 ATAACCATGCACACTACTAT...CCCTACTCCTAATCACATAA lcl|NC_012920.1_c...
## [114966] 525 ATGATGTATGCTTTGTTTCT...AGATTGCTCGGGGGAATAGG lcl|NC_012920.1_c...
## [114967] 1141 ATGACCCCAATACGCAAAAC...CAAAATACTCAAATGGGCCT lcl|NC_012920.1_c...
Para más información sobre GEOquery y consultas de bases de datos de expresión génica ver la siguiente clase y rpub:
download_species <- c("Thermococcus gammatolerans",
"Thermotoga maritima")
# retrieve these three species from NCBI RefSeq
mis.proteomas <- getProteomeSet("refseq", organisms = download_species, path = "set_proteomes")
read_proteome(mis.proteomas[1])
read_proteome(mis.proteomas[2])Proteómica Importing de ProteomExchange R for Proteomics
BiocManager::install()
BiocManager::install("RforProteomics",
ask = F,
dependencies = TRUE,
type = "source",
checkBuilt = TRUE)Para instalar MSnbase:
Dentro de la terminal:
Usar la siguiente página omitiendo los pasos de instalación de R 4.0
#BiocManager::install("MSnbase", ask = T, dependencies = TRUE, type = "source", checkBuilt = TRUE)
library(magrittr)
library(Biobase)
library(MSnbase)
## Downloading the mzTab data
mztab <- pxget(px1, "PXD000001_mztab.txt")
qnt <- readMzTabData(mztab, what = "PEP", version = "0.9")TMT: https://en.wikipedia.org/wiki/Tandem_mass_tag
sampleNames(qnt) <- reporterNames(TMT6)
#head(exprs(qnt))
qnt <- filterNA(qnt)
processingData(qnt)
## combine into proteins
## - using the 'accession' feature meta data
## - sum the peptide intensities
protqnt <- combineFeatures(qnt,
groupBy = fData(qnt)$accession,
fun = sum)
protqnt %>% exprs %>% tail()library("RColorBrewer") ## Color palettes
library("ggplot2") ## Convenient and nice plotting
library("reshape2") ## Flexibly reshape data
cls <- brewer.pal(5, "Set1")
matplot(t(tail(exprs(protqnt), n = 5)), type = "b",
lty = 1, col = cls,
ylab = "Protein intensity (summed peptides)",
xlab = "TMT reporters")
legend("topright", tail(featureNames(protqnt), n=5),
lty = 1, bty = "n", cex = .8, col = cls)Multi-omic
Collection Retrieval
The automated retrieval of collections (= Genome, Proteome, CDS, RNA, GFF, Repeat Masker, AssemblyStats) will make sure that the genome file of an organism will match the CDS, proteome, RNA, GFF, etc file and was generated using the same genome assembly version. One aspect of why genomics studies fail in computational and biological reproducibility is that it is not clear whether CDS, proteome, RNA, GFF, etc files used in a proposed analysis were generated using the same genome assembly file denoting the same genome assembly version. To avoid this seemingly trivial mistake we encourage users to retrieve genome file collections using the biomartr function getCollection() and attach the corresponding output as Supplementary Data to the respective genomics study to ensure computational and biological reproducibility.
By specifying the scientific name of an organism of interest a collection consisting of the genome file, proteome file, CDS file, RNA file, GFF file, Repeat Masker file, AssemblyStats file of the organism of interest can be downloaded and stored locally.
FONDECYT Postdoctoral Fellow, Universidad de Chile, deepen.data@gmail.com↩︎
Pregrado, Universidad de Chile↩︎