Importación y escritura

RPub de esta clase

getwd()                          # Obtiene el directorio de trabajo
paste0(getwd(),'/hola')          # Convierte getwkd() a caracter y concatena
file.path(getwd(),'hola')        # Hace direcciones independientes del SO
file.path(getwd(),'hola','hola') # 

Manipulando data local

mtcars:
    mpg: Miles/(US) gallon
    cyl: Number of cylinders
    disp: Displacement (cu.in.)
    hp: Gross horsepower
    drat: Rear axle ratio
    wt: Weight (1000 lbs)
    qsec: 1/4 mile time
    vs: V/S
    am: Transmission (0 = automatic, 1 = manual)
    gear: Number of forward gears
    carb: Number of carburetors
#install.packages("devtools", dependencies = TRUE)
#install.packages("curl", dependencies = TRUE)
#install.packages("tidyverse", dependencies = TRUE)

library(readr)
my.data <- mtcars
my.data # La data de mtcars
# Exportando data
write_csv(my.data,  path = 'mtcars.csv' ) # Coma Separated Value
write_tsv(my.data,  path = 'mtcars.tsv')  # Tab Separated Value
write_delim(my.data,path = 'mtcars.txt', delim = ";") # Texto con delimitador ";"
write_delim(my.data,path = 'mtcars.hola', delim = "*_*") # Texto con delimitador "*_*"

openxlsx::write.xlsx(my.data, file = "mtcars.xlsx") # Excel
# (Re)Importando data
mtcars.csv<-read_csv('mtcars.csv')
mtcars.tsv<-read_tsv('mtcars.tsv')
mtcars.txt<-read_delim('mtcars.txt', delim = ";")
mtcars.hola<-read_delim('mtcars.hola', delim = "*")

library(readxl) # Importando un Excel
mtcars.xlsx<-read_excel('mtcars.xlsx')

Manipulando columna rownames

library(tibble)   # Un tibble es una actualización del dataframe, para 
rownames_to_column(my.data, var = "Car brand")   -> my.data.2


library(magrittr) # 
my.data %>% rownames_to_column(var = "Car brand")-> my.data.2

# Exportando data con pipes (%>%)
my.data.2 %>%   write_csv('mtcars.csv') # Coma Separated Value
my.data.2 %>%   write_tsv('mtcars.tsv') # Tabulation Separated Value
my.data.2 %>% write_delim('mtcars.txt',  delim = ";")
my.data.2 %>% write_delim('mtcars.hola', delim = "*_*")
my.data.2 %>% openxlsx::write.xlsx("mtcars.xlsx") # Excel

# Lectura de los archivos
read_csv('mtcars.csv', skip = 0)                   ->mtcars.csv
read_tsv('mtcars.tsv', skip = 0)                   ->mtcars.tsv
read_delim('mtcars.txt', delim = ";", skip = 0)    ->mtcars.txt
read_delim('mtcars.hola', delim = "*_*", skip = 0) ->mtcars.hola
read_excel('mtcars.xlsx', skip = 0)                ->mtcars.xlsx

########Chequear igualdad###############
library(dplyr)
all_equal(mtcars.csv,mtcars.tsv)   # La data del csv deberia ser igual a la del tsv
## [1] TRUE
all_equal(mtcars.txt,mtcars.hola)  # La data del txt deberia ser igual a la del .hola
## [1] TRUE
all_equal(mtcars.hola,mtcars.xlsx) # La data del txt deberia ser igual a la del Excel
## [1] TRUE
list(mtcars.csv, mtcars.tsv, mtcars.txt, mtcars.hola,mtcars.xlsx) %>% unique() %>% length() # TODO: porque da dos salidas? Una es una tabla y el otro es una consola?
## [1] 2

Descargando data online

R cuenta con multiples metodos para descargar archivos. El paquete utils usa programas del sistema como metodos para download.file(), como curl, wget, wininet, etc. los cuales pueden no estar instalados en el sistema. Por ejemplo, download.file(URL, method = "wget") usualmente falla en Windows. Especificando otro metodo, download.file(URL, method = "libcurl") funciona.

library(utils) # Usar download.file() para descargar

# Descargando data de Dropbox
my.dropbox.url <- 'https://www.dropbox.com/s/j3kiivpcbghpb4v/log2FC.csv'

#Para LINUX usar:
download.file(url= my.dropbox.url,  destfile= 'log2FC.csv', method = "wget") 

#Para WINDOWS usar:
download.file(url= my.dropbox.url,  destfile= 'log2FC.csv', method = "libcurl") 

hola.dropbox <- read_csv("log2FC.csv")
hola.dropbox # Abriendo la data

Otro paquete, httr, usa metodos del protocolo http independientes del sistema operativo. De aqui, usamos el metodo GET, asociado con la función del mismo nombre.

library(httr)  # Usa GET() para descargar

# Descargando data de Github
my.github.url <- 'https://github.com/DeepenData/Computational-Biology-and-Bioinformatics/raw/master/Wang2018_supplemental_DEGs.xlsx'

#Para LINUX y WINDOMS  usar:
GET(my.github.url, write_disk(tf <- tempfile(fileext = ".xlsx"))) 
## Response [https://raw.githubusercontent.com/DeepenData/Computational-Biology-and-Bioinformatics/master/Wang2018_supplemental_DEGs.xlsx]
##   Date: 2020-11-05 22:23
##   Status: 200
##   Content-Type: application/octet-stream
##   Size: 75.6 kB
## <ON DISK>  C:\Users\manuel\AppData\Local\Temp\RtmpKC5wr9\file219029032133.xlsx
hola.github <- read_excel(tf, skip = 1) 
hola.github # Abriendo la data

Por motivos de compatibilidad, puede ser preferible usar GET(), pero en general los metodos de download.file() suelen ser más rapidos y la mayor parte de la gente en bioinformatica usa sistemas basados en Linux.

library(utils) # Usar download.file() para descargar
library(httr)  # Usa GET() para descargar

# Sitio web arbitrario

my.supplementary.url <- 'https://www.pnas.org/highwire/filestream/794560/field_highwire_adjunct_files/0/pnas.1800165115.sd01.xlsx'

# Para LINUX y WINDOMS  usar httr::GET()
GET(my.supplementary.url, # Dirección de descarga
    write_disk(tf <- tempfile(fileext = ".xlsx"))) # Guardar como "tf" temporal
hola.supplementary <- read_excel(tf, skip = 1) # Lee "tf", omite linea 1

hola.supplementary # Abriendo la data

# Para LINUX USAR
download.file(url= my.supplementary.url,  # Dirección de descarga
              destfile= 'pnas.1800165115.sd01.xlsx', # Guardar como...
              method = "wget")  # Usando wget, incluido en Linux
hola.supplementary <- read_excel("pnas.1800165115.sd01.xlsx")

hola.supplementary # Abriendo la data

Tambien, es posible omitir la parte intermedia de la descarga e inmediatamente guardar un objeto al ambiente de R desde la fuente web donde esta publicada.

hola.live <- read_csv('https://raw.githubusercontent.com/DeepenData/Computational-Biology-and-Bioinformatics/master/labels.csv')
hola.live # Abriendo la data

Esto es especialmente util si tenemos data accesible online que se actualiza frecuentemente, como las cifras actualizadas de Covid19 o un sistema de monitoreo IoT.

Analisis omicos en Biología

Genomica

biomartr tutorials:

install.packages("BiocManager", dependencies = TRUE)
BiocManager::install()
BiocManager::install("Biostrings")
BiocManager::install("biomaRt")
install.packages("biomartr")#, dependencies = TRUE)
library(biomartr)
library(magrittr)
#Objetos clase DNAStringSet
inmortal <- getGenome( db       = "refseq", #Más bases de datos
                       organism = "Thermococcus gammatolerans",
                       path     = file.path("_ncbi_downloads","genomes") ) %>%  read_genome()
inmortal
## DNAStringSet object of length 1:
##       width seq                                             names               
## [1] 2045438 GTTGATTACCCAATCTTCGCCT...GGAAGATGATATTATAAGGCAG NC_012804.1 Therm...

Dato impresionante: se pueden leer genomas directamente desde el url del NCBI

url.NCBI <- 'ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/GCF_000001405.35_GRCh38.p9/GCF_000001405.35_GRCh38.p9_cds_from_genomic.fna.gz'
human    <- read_genome(url.NCBI)
human %>% names() %>% sample(10)
##  [1] "lcl|NC_000003.12_cds_XP_011531769.1_22155 [gene=DNAH12] [db_xref=GeneID:201625] [protein=dynein heavy chain 12, axonemal isoform X2] [protein_id=XP_011531769.1] [location=complement(join(57293781..57293971,57295525..57295592,57296344..57296435,57296847..57296984,57301735..57301939,57309151..57309254,57309666..57309854,57310717..57310950,57314494..57314631,57322343..57322483,57323007..57323260,57323469..57323619,57334465..57334609,57334782..57334940,57352085..57352225,57357176..57357348,57363594..57363786,57366729..57366921,57368046..57368260,57375371..57375516,57375818..57375965,57376981..57377222,57379158..57379298,57380278..57380367,57382262..57382393,57384829..57385005,57385349..57385429,57386441..57386603,57387086..57387219,57391872..57392066,57394171..57394332,57403309..57403501,57404969..57405147,57405653..57405952,57408280..57408535,57413746..57413912,57415426..57415564,57419367..57419518,57421518..57421706,57425022..57425141,57428633..57428821,57429691..57429774,57433367..57433507,57433645..57433828,57436951..57437060,57444697..57444816,57445174..57445419,57446031..57446270,57446537..57446689,57452843..57453015,57453247..57453403,57454775..57454894,57457721..57458003,57458099..57458220,57459592..57459786,57461489..57461689,57462690..57462875,57468736..57468979,57470443..57470636,57471472..57471606,57472546..57472671,57483376..57483511,57489509..57489687,57501321..57501412,57502323..57502479,57504016..57504204,57507643..57507838,57508382..57508507,57509140..57509212,57510790..57510979,57523583..57523609,57523803..57523884,57542701..57542870))]"
##  [2] "lcl|NC_000016.10_cds_XP_011521209.1_82341 [gene=CYLD] [db_xref=GeneID:1540] [protein=ubiquitin carboxyl-terminal hydrolase CYLD isoform X1] [protein_id=XP_011521209.1] [location=join(50749699..50750202,50751604..50751906,50754319..50754424,50776179..50776277,50777825..50777941,50779665..50780044,50781246..50781411,50782325..50782466,50784329..50784451,50786855..50786946,50787786..50787852,50791558..50791690,50792597..50792705,50793546..50793664,50794212..50794428,50796324..50796508)]"                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                
##  [3] "lcl|NC_000015.10_cds_NP_115622.2_79125 [gene=MEX3B] [db_xref=CCDS:CCDS10319.1,GeneID:84206] [protein=RNA-binding protein MEX3B] [protein_id=NP_115622.2] [location=complement(join(82043160..82044613,82045450..82045705))]"                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             
##  [4] "lcl|NC_000010.11_cds_XP_011517937.1_54729 [gene=SVIL] [db_xref=GeneID:6840] [protein=supervillin isoform X8] [protein_id=XP_011517937.1] [location=complement(join(29458247..29458333,29458434..29458589,29462277..29462401,29463492..29463635,29465595..29465750,29467742..29467875,29470276..29470483,29471138..29471243,29473838..29473989,29480537..29480813,29481584..29481728,29484656..29484831,29486085..29486230,29486410..29486557,29487163..29487299,29488601..29488756,29490847..29491019,29493214..29493391,29494914..29495000,29495092..29495181,29499116..29499263,29507737..29507814,29508342..29508389,29512735..29512861,29522410..29522635,29523451..29524027,29524472..29524715,29526961..29527056,29529705..29529844,29530607..29530668,29531254..29531288,29532002..29532172,29532529..29533458,29535989..29536069,29550597..29551263,29554783..29554934,29555051..29555058))]"                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    
##  [5] "lcl|NC_000010.11_cds_NP_056446.4_57188 [gene=TCTN3] [db_xref=CCDS:CCDS31258.2,GeneID:26123] [protein=tectonic-3 isoform a precursor] [protein_id=NP_056446.4] [location=complement(join(95664067..95664300,95680472..95680609,95682651..95682804,95683101..95683195,95683522..95683629,95684499..95684624,95685556..95685636,95686495..95686530,95687044..95687159,95687247..95687355,95687592..95687719,95692920..95693038,95693353..95693476,95693644..95693899))]"                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    
##  [6] "lcl|NT_187675.1_cds_NP_055034.2_114789 [gene=KIR2DL2] [db_xref=GeneID:3803] [protein=killer cell immunoglobulin-like receptor 2DL2 precursor] [exception=annotated by transcript or proteomic data] [protein_id=NP_055034.2] [location=join(53265..53298,54207..54242,56679..56978,58493..58786,62064..62114,66380..66481,66944..66996,67095..67271)]"                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   
##  [7] "lcl|NC_000010.11_cds_XP_006717924.1_55662 [gene=MBL2] [db_xref=GeneID:4153] [protein=mannose-binding protein C isoform X1] [protein_id=XP_006717924.1] [location=complement(join(52768137..52768510,52769247..52769315,52770670..52770786,52771449..52771635))]"                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         
##  [8] "lcl|NC_000008.11_cds_XP_011515883.1_47350 [gene=TCEB1] [db_xref=GeneID:6921] [protein=transcription elongation factor B polypeptide 1 isoform X1] [protein_id=XP_011515883.1] [location=complement(join(73946630..73946820,73955911..73956054,73959765..73959768))]"                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     
##  [9] "lcl|NC_000001.11_cds_XP_011539995.1_1239 [gene=ARHGEF10L] [db_xref=GeneID:55160] [protein=rho guanine nucleotide exchange factor 10-like protein isoform X6] [protein_id=XP_011539995.1] [location=join(17580596..17580632,17587460..17587645,17588446..17588479,17602127..17602218,17603508..17603591,17607799..17607977,17616094..17616202,17619339..17619445,17621864..17621941,17622996..17623175,17624387..17624503,17625956..17626048,17627330..17627503,17632321..17632466,17634835..17635016,17637888..17638003,17638562..17638689,17640202..17640302,17648554..17648675,17654636..17654722,17655879..17656102,17656554..17656708,17664447..17664595,17687573..17687747,17695158..17695280,17696848..17697380)]"                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 
## [10] "lcl|NC_000020.11_cds_XP_016883353.1_99983 [gene=PHF20] [db_xref=GeneID:51230] [protein=PHD finger protein 20 isoform X1] [protein_id=XP_016883353.1] [location=join(35801523..35801605,35842573..35842744,35847350..35847434,35858302..35858381,35863013..35863400,35869438..35869551,35870955..35871134,35871650..35871829,35899370..35899648,35913249..35913347,35914033..35914197,35917484..35917662,35927780..35927879,35931249..35931444,35938697..35939108,35940864..35941047,35947485..35947627)]"
human 
## DNAStringSet object of length 114967:
##           width seq                                         names               
##      [1]    918 ATGGTGACTGAATTCATTTT...ATTCTAGTGTAAAGTTTTAG lcl|NC_000001.11_...
##      [2]    402 ATGAGTGACAGCATCAACTT...GACCCAGGCACAGGCATTAG lcl|NC_000001.11_...
##      [3]    402 ATGAGTGACAGCATCAACTT...GACCCAGGCACAGGCATTAG lcl|NC_000001.11_...
##      [4]    402 ATGAGTGACAGCATCAACTT...GACCCAGGCACAGGCATTAG lcl|NC_000001.11_...
##      [5]    402 ATGAGTGACAGCATCAACTT...GACCCAGGCACAGGCATTAG lcl|NC_000001.11_...
##      ...    ... ...
## [114963]    297 ATGCCCCTCATTTACATAAA...TAAACCTACTCCAATGCTAA lcl|NC_012920.1_c...
## [114964]   1378 ATGCTAAAACTAATCGTCCC...CATTACCGGGTTTTCCTCTT lcl|NC_012920.1_c...
## [114965]   1812 ATAACCATGCACACTACTAT...CCCTACTCCTAATCACATAA lcl|NC_012920.1_c...
## [114966]    525 ATGATGTATGCTTTGTTTCT...AGATTGCTCGGGGGAATAGG lcl|NC_012920.1_c...
## [114967]   1141 ATGACCCCAATACGCAAAAC...CAAAATACTCAAATGGGCCT lcl|NC_012920.1_c...

Epigenomica

Para más información sobre GEOquery y consultas de bases de datos de expresión génica ver la siguiente clase y rpub:

https://rpubs.com/DeepenData/622645

# Instalación de GEOquery
if (!requireNamespace("BiocManager", quietly = TRUE))
    install.packages("BiocManager")
BiocManager::install("GEOquery")
library(GEOquery) # Utilizada para analizar datos de expresion
gset = getGEO("GSE36278") # Descarga un dataset de
fData(gset[[1]])          # Lista las sondas del set

Transcriptomica

inmortal <- getRNA( db       = "refseq", #Más bases de datos
                       organism = "Thermococcus gammatolerans",
                       path     = file.path("_ncbi_downloads","transcriptome") ) %>%  read_rna()

inmortal
inmortal %>% names() %>% sample(10)

Proteomica

download_species <- c("Thermococcus gammatolerans", 
                      "Thermotoga maritima")
# retrieve these three species from NCBI RefSeq                       
mis.proteomas <- getProteomeSet("refseq", organisms = download_species, path = "set_proteomes")

read_proteome(mis.proteomas[1])
read_proteome(mis.proteomas[2])

Proteómica Importing de ProteomExchange R for Proteomics

Estudios diferenciales

BiocManager::install()
BiocManager::install("RforProteomics",
                     ask = F, 
                     dependencies = TRUE,
                     type = "source",
                     checkBuilt = TRUE)
## Experiment information
library("rpx")
px1 <- PXDataset("PXD000001")
pxfiles(px1)^

Para instalar MSnbase:

Dentro de la terminal:

sudo apt-cache search libnetcdf 
sudo apt-get update 
sudo apt-get install libnetcdf-c++4-1

Usar la siguiente página omitiendo los pasos de instalación de R 4.0

Cartografos

#BiocManager::install("MSnbase", ask = T, dependencies = TRUE,   type = "source", checkBuilt = TRUE)
library(magrittr)
library(Biobase)
library(MSnbase)

## Downloading the mzTab data
mztab <- pxget(px1, "PXD000001_mztab.txt")
qnt <- readMzTabData(mztab, what = "PEP", version = "0.9")

TMT: https://en.wikipedia.org/wiki/Tandem_mass_tag

sampleNames(qnt) <- reporterNames(TMT6)
#head(exprs(qnt))
qnt <- filterNA(qnt)
processingData(qnt)
## combine into proteins
## - using the 'accession' feature meta data
## - sum the peptide intensities
protqnt <- combineFeatures(qnt,
                           groupBy = fData(qnt)$accession,
                           fun = sum)


protqnt %>% exprs %>% tail()
library("RColorBrewer") ## Color palettes
library("ggplot2")  ## Convenient and nice plotting
library("reshape2") ## Flexibly reshape data

cls <- brewer.pal(5, "Set1")
matplot(t(tail(exprs(protqnt), n = 5)), type = "b",
        lty = 1, col = cls,
        ylab = "Protein intensity (summed peptides)",
        xlab = "TMT reporters")
legend("topright", tail(featureNames(protqnt), n=5),
       lty = 1, bty = "n", cex = .8, col = cls)

Multi-omic

Collection Retrieval

The automated retrieval of collections (= Genome, Proteome, CDS, RNA, GFF, Repeat Masker, AssemblyStats) will make sure that the genome file of an organism will match the CDS, proteome, RNA, GFF, etc file and was generated using the same genome assembly version. One aspect of why genomics studies fail in computational and biological reproducibility is that it is not clear whether CDS, proteome, RNA, GFF, etc files used in a proposed analysis were generated using the same genome assembly file denoting the same genome assembly version. To avoid this seemingly trivial mistake we encourage users to retrieve genome file collections using the biomartr function getCollection() and attach the corresponding output as Supplementary Data to the respective genomics study to ensure computational and biological reproducibility.

By specifying the scientific name of an organism of interest a collection consisting of the genome file, proteome file, CDS file, RNA file, GFF file, Repeat Masker file, AssemblyStats file of the organism of interest can be downloaded and stored locally.

inmortal_collect <- getCollection( db = "genbank", 
               organism = "Thermococcus gammatolerans", 
               path = file.path("refseq","Collections"))

  1. FONDECYT Postdoctoral Fellow, Universidad de Chile, ↩︎

  2. Pregrado, Universidad de Chile↩︎