Importación y escritura.

RPub de esta clase

# Show the current working directory.
getwd()
## [1] "/home/alejandro/DeepenData/Taller_sept_oct_nov_dic_2020"
# Build a path below it by plain string concatenation ...
paste0(getwd(), "/hola")
## [1] "/home/alejandro/DeepenData/Taller_sept_oct_nov_dic_2020/hola"
# ... or with file.path(), passing each path component as its own argument.
file.path(getwd(), "hola")
## [1] "/home/alejandro/DeepenData/Taller_sept_oct_nov_dic_2020/hola"
file.path(getwd(), "hola", "hola")
## [1] "/home/alejandro/DeepenData/Taller_sept_oct_nov_dic_2020/hola/hola"

Manipulando data local

mtcars:

    mpg: Miles/(US) gallon
    cyl: Number of cylinders
    disp: Displacement (cu.in.)
    hp: Gross horsepower
    drat: Rear axle ratio
    wt: Weight (1000 lbs)
    qsec: 1/4 mile time
    vs: V/S
    am: Transmission (0 = automatic, 1 = manual)
    gear: Number of forward gears
    carb: Number of carburetors
#install.packages("devtools", dependencies = TRUE)
#install.packages("curl", dependencies = TRUE)
#install.packages("tidyverse", dependencies = TRUE)
library(readr)
# Work on a copy of the built-in mtcars data set and print it.
my.data <- mtcars
my.data
# Export the data in several formats.
# The output file is passed positionally rather than as `path =`: readr 1.4.0
# renamed that argument to `file` and deprecated `path`, and the positional
# form is accepted on both sides of that change.
write_csv(my.data, "mtcars.csv") # comma-separated values
write_tsv(my.data, "mtcars.tsv") # tab-separated values
write_delim(my.data, "mtcars.txt", delim = ";") # text with ";" as delimiter
# readr documents `delim` as a single character, and the re-import step reads
# this file with delim = "*", so write it with that same single character.
write_delim(my.data, "mtcars.hola", delim = "*") # text with "*" as delimiter

openxlsx::write.xlsx(my.data, file = "mtcars.xlsx") # Excel workbook
# (Re)import the exported files, one object per format.
mtcars.csv <- read_csv("mtcars.csv")
mtcars.tsv <- read_tsv("mtcars.tsv")
mtcars.txt <- read_delim("mtcars.txt", delim = ";")
mtcars.hola <- read_delim("mtcars.hola", delim = "*")

library(readxl) # needed for read_excel()
mtcars.xlsx <- read_excel("mtcars.xlsx")

Manipulando columna rownames

library(tibble)
# Move the row names of my.data into a regular column named "Car brand".
my.data.2 <- rownames_to_column(my.data, var = "Car brand")
library(magrittr) # provides the %>% pipe
# The same operation written as a pipeline.
my.data.2 <- my.data %>% rownames_to_column(var = "Car brand")
##### With pipes #############
my.data.2 %>% write_csv("mtcars.csv")
my.data.2 %>% write_tsv("mtcars.tsv")
my.data.2 %>% write_delim("mtcars.txt", delim = ";")
# readr documents `delim` as a single character, so the .hola file is written
# and read back below with the same one-character delimiter "*".
my.data.2 %>% write_delim("mtcars.hola", delim = "*")
my.data.2 %>% openxlsx::write.xlsx("mtcars.xlsx")

########### Reading ##################
# Left-assignment (`<-`) is used instead of `->` so the target name comes first.
mtcars.csv <- read_csv("mtcars.csv", skip = 0)
mtcars.tsv <- read_tsv("mtcars.tsv", skip = 0)
mtcars.txt <- read_delim("mtcars.txt", delim = ";", skip = 0)
mtcars.hola <- read_delim("mtcars.hola", delim = "*", skip = 0)
mtcars.xlsx <- read_excel("mtcars.xlsx", skip = 0)

########Chequear igualdad###############
library(dplyr)
all_equal(mtcars.csv, mtcars.tsv) # the csv data should equal the tsv data
## [1] TRUE
all_equal(mtcars.txt, mtcars.hola) # the txt data should equal the .hola data
## [1] TRUE
all_equal(mtcars.hola, mtcars.xlsx) # the .hola data should equal the Excel data
## [1] TRUE
# Number of distinct objects among the five imports.
# TODO: why does this produce two outputs? Is one a table and the other console output?
# NOTE(review): presumably the imports differ in attributes rather than in
# values, since all_equal() reports TRUE above -- confirm.
length(unique(list(mtcars.csv, mtcars.tsv, mtcars.txt, mtcars.hola, mtcars.xlsx)))
## [1] 2

Descargando data online

library(utils)
library(httr)
# Works on both Linux and Windows:
my.github.url <- "https://github.com/DeepenData/Computational-Biology-and-Bioinformatics/raw/master/Wang2018_supplemental_DEGs.xlsx"
# Reserve a temporary .xlsx path, then have GET() write the response to it.
tf <- tempfile(fileext = ".xlsx")
GET(my.github.url, write_disk(tf))
## Response [https://raw.githubusercontent.com/DeepenData/Computational-Biology-and-Bioinformatics/master/Wang2018_supplemental_DEGs.xlsx]
##   Date: 2020-09-26 00:12
##   Status: 200
##   Content-Type: application/octet-stream
##   Size: 75.6 kB
## <ON DISK>  /tmp/RtmpLnqSq0/filecf511adf424.xlsx
# Read the downloaded workbook, skipping its first row, and print it.
hola.github <- read_excel(tf, skip = 1)
hola.github

Descargando data en línea

### Dropbox
my.dropbox.url <- "https://www.dropbox.com/s/j3kiivpcbghpb4v/log2FC.csv"
# The lesson prescribes method "libcurl" on Windows and "wget" on Linux.
# Choose the method from the running platform instead of issuing both calls
# back to back, which downloaded the file twice and invoked wget on Windows.
dropbox.method <- if (.Platform$OS.type == "windows") "libcurl" else "wget"
download.file(
  url = my.dropbox.url,
  destfile = "log2FC.csv",
  method = dropbox.method
)
# Read the downloaded CSV and print it.
hola.dropbox <- read_csv("log2FC.csv")
hola.dropbox
library(utils)
library(httr)
### Arbitrary web site
my.supplementary.url <- "https://www.pnas.org/highwire/filestream/794560/field_highwire_adjunct_files/0/pnas.1800165115.sd01.xlsx"
# Linux only: direct download with wget. The lesson marks this attempt as
# failing ("ERROR!"), so it is wrapped in tryCatch(): an error raised by the
# download or by reading the resulting file is reported as a message instead
# of aborting the script before the httr route below gets a chance to run.
hola.supplementary <- tryCatch(
  {
    download.file(
      url = my.supplementary.url,
      destfile = "pnas.1800165115.sd01.xlsx",
      method = "wget"
    )
    read_excel("pnas.1800165115.sd01.xlsx")
  },
  error = function(e) {
    message("wget route failed: ", conditionMessage(e))
    NULL
  }
)
# Works on both Linux and Windows: fetch with httr into a temporary file.
GET(my.supplementary.url, write_disk(tf <- tempfile(fileext = ".xlsx")))
## Response [https://www.pnas.org/content/pnas/suppl/2018/02/07/1800165115.DCSupplemental/pnas.1800165115.sd01.xlsx]
##   Date: 2020-09-26 00:12
##   Status: 200
##   Content-Type: application/vnd.openxmlformats-officedocument.spreadsheetml.sheet
##   Size: 61.2 kB
## <ON DISK>  /tmp/RtmpLnqSq0/filecf525a4caf8.xlsx
# Read the temporary workbook, skipping its first row, and print it.
hola.supplementary <- read_excel(tf, skip = 1)
hola.supplementary
# A CSV can also be read directly from its URL.
read_csv("https://raw.githubusercontent.com/DeepenData/Computational-Biology-and-Bioinformatics/master/labels.csv")

Datos ómicos

Genómica: tutoriales de biomartr:
- https://docs.ropensci.org/biomartr/articles/
- https://cran.r-project.org/web/packages/biomartr/vignettes/Sequence_Retrieval.html
- https://cran.r-project.org/web/packages/biomartr/readme/README.html

#install.packages("BiocManager", dependencies = TRUE)
#BiocManager::install()
#BiocManager::install("Biostrings")
#BiocManager::install("biomaRt")
#install.packages("biomartr")#, dependencies = TRUE)
library(biomartr)
library(magrittr)
# Objects of class DNAStringSet
# The result of getGenome() is handed straight to read_genome().
inmortal <- read_genome(
  getGenome(
    db = "refseq", # more databases are available
    organism = "Thermococcus gammatolerans",
    path = file.path("_ncbi_downloads", "genomes")
  )
)
inmortal
## DNAStringSet object of length 1:
##       width seq                                             names               
## [1] 2045438 GTTGATTACCCAATCTTCGCCT...GGAAGATGATATTATAAGGCAG NC_012804.1 Therm...

Dato impresionante: se pueden leer genomas directamente desde el url del NCBI

# Read the sequences directly from the NCBI FTP URL.
url.NCBI <- "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/GCF_000001405.35_GRCh38.p9/GCF_000001405.35_GRCh38.p9_cds_from_genomic.fna.gz"
human <- read_genome(url.NCBI)
# Show ten randomly chosen sequence names.
sample(names(human), 10)
##  [1] "lcl|NC_000003.12_cds_XP_016862545.1_23624 [gene=CCDC14] [db_xref=GeneID:64770] [protein=coiled-coil domain-containing protein 14 isoform X4] [protein_id=XP_016862545.1] [location=complement(join(123914779..123915718,123931102..123931234,123931308..123931526,123933673..123933755,123944849..123944990,123946803..123947319,123948691..123948785,123948896..123949132,123952595..123952610))]"                                                                                       
##  [2] "lcl|NC_000017.11_cds_XP_006721793.1_88054 [gene=TMEM92] [db_xref=GeneID:162461] [protein=transmembrane protein 92 isoform X1] [protein_id=XP_006721793.1] [location=join(50274502..50274570,50277715..50277740,50278556..50278630,50278801..50278954,50279195..50279308)]"                                                                                                                                                                                                                
##  [3] "lcl|NC_000010.11_cds_XP_005269439.1_58024 [gene=SMNDC1] [db_xref=GeneID:10285] [protein=survival of motor neuron-related-splicing factor 30 isoform X1] [protein_id=XP_005269439.1] [location=complement(join(110294150..110294287,110295228..110295381,110297567..110297728,110298648..110298790,110300547..110300549))]"                                                                                                                                                                
##  [4] "lcl|NC_000005.10_cds_NP_061746.1_33467 [gene=PCDHGB2] [db_xref=CCDS:CCDS54924.1,GeneID:56103] [protein=protocadherin gamma-B2 isoform 1 precursor] [protein_id=NP_061746.1] [location=join(141360136..141362556,141494807..141494865,141505393..141505481,141510947..141511173)]"                                                                                                                                                                                                         
##  [5] "lcl|NC_000022.11_cds_NP_060299.4_102607 [gene=CECR5] [db_xref=CCDS:CCDS13741.1,GeneID:27440] [protein=cat eye syndrome critical region protein 5 isoform 1] [protein_id=NP_060299.4] [location=complement(join(17138021..17138357,17138550..17138738,17141059..17141233,17143098..17143131,17145024..17145117,17148448..17148560,17149542..17149745,17165209..17165244))]"                                                                                                                
##  [6] "lcl|NC_000005.10_cds_NP_057728.1_32629 [gene=PRR16] [db_xref=CCDS:CCDS4127.1,GeneID:51334] [protein=protein Largen isoform 2] [protein_id=NP_057728.1] [location=join(120481204..120481293,120685954..120686709)]"                                                                                                                                                                                                                                                                        
##  [7] "lcl|NC_000003.12_cds_NP_001186908.1_24546 [gene=PLSCR2] [db_xref=CCDS:CCDS75029.1,GeneID:57047] [protein=phospholipid scramblase 2 isoform 2] [protein_id=NP_001186908.1] [location=complement(join(146441792..146441821,146449206..146449367,146454002..146454163,146455239..146455459,146458411..146458453,146459848..146460083,146495895..146495922))]"                                                                                                                                
##  [8] "lcl|NC_000004.12_cds_XP_016863462.1_28014 [gene=C4orf22] [db_xref=GeneID:255119] [protein=uncharacterized protein C4orf22 isoform X2] [protein_id=XP_016863462.1] [location=join(80321435..80321463,80335744..80335879,80362754..80362884,80583093..80583183,80869993..80870135,80963517..80963556)]"                                                                                                                                                                                     
##  [9] "lcl|NC_000008.11_cds_XP_016869227.1_45966 [gene=BMP1] [db_xref=GeneID:649] [protein=bone morphogenetic protein 1 isoform X2] [protein_id=XP_016869227.1] [location=join(22165406..22165553,22173602..22173715,22176143..22176313,22176533..22176650,22176961..22177139,22177852..22177957,22179705..22179829,22180368..22180483,22192049..22192151,22194058..22194174,22194445..22194590,22194724..22194919,22195462..22195587,22196680..22196840,22197240..22197420,22199264..22199310)]"
## [10] "lcl|NC_000015.10_cds_XP_016877777.1_77468 [gene=MYEF2] [db_xref=GeneID:50804] [protein=myelin expression factor 2 isoform X9] [protein_id=XP_016877777.1] [location=complement(join(48142908..48143071,48149032..48149083,48149163..48149371,48151100..48151171,48151473..48151571,48151874..48151942,48153792..48153893,48157993..48158056,48158175..48158224,48158769..48158922,48159613..48159804,48165933..48165977))]"
# Print the object read into `human` above.
human 
## DNAStringSet object of length 114967:
##           width seq                                         names               
##      [1]    918 ATGGTGACTGAATTCATTTT...ATTCTAGTGTAAAGTTTTAG lcl|NC_000001.11_...
##      [2]    402 ATGAGTGACAGCATCAACTT...GACCCAGGCACAGGCATTAG lcl|NC_000001.11_...
##      [3]    402 ATGAGTGACAGCATCAACTT...GACCCAGGCACAGGCATTAG lcl|NC_000001.11_...
##      [4]    402 ATGAGTGACAGCATCAACTT...GACCCAGGCACAGGCATTAG lcl|NC_000001.11_...
##      [5]    402 ATGAGTGACAGCATCAACTT...GACCCAGGCACAGGCATTAG lcl|NC_000001.11_...
##      ...    ... ...
## [114963]    297 ATGCCCCTCATTTACATAAA...TAAACCTACTCCAATGCTAA lcl|NC_012920.1_c...
## [114964]   1378 ATGCTAAAACTAATCGTCCC...CATTACCGGGTTTTCCTCTT lcl|NC_012920.1_c...
## [114965]   1812 ATAACCATGCACACTACTAT...CCCTACTCCTAATCACATAA lcl|NC_012920.1_c...
## [114966]    525 ATGATGTATGCTTTGTTTCT...AGATTGCTCGGGGGAATAGG lcl|NC_012920.1_c...
## [114967]   1141 ATGACCCCAATACGCAAAAC...CAAAATACTCAAATGGGCCT lcl|NC_012920.1_c...

Epigenómica

Para más información sobre GEOquery y consultas de bases de datos de expresión génica ver la siguiente clase y rpub:

https://rpubs.com/DeepenData/622645

#if (!requireNamespace("BiocManager", quietly = TRUE))
#    install.packages("BiocManager")
#BiocManager::install("GEOquery")
library(GEOquery)
gset <- getGEO("GSE36278") # download the GEO data set GSE36278
## Found 1 file(s)
## GSE36278_series_matrix.txt.gz
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   ID_REF = col_character()
## )
## See spec(...) for full column specifications.
## File stored at:
## /tmp/RtmpLnqSq0/GPL13534.soft
## Warning: 65 parsing failures.
##    row     col           expected     actual         file
## 485513 SPOT_ID 1/0/T/F/TRUE/FALSE rs10796216 literal data
## 485514 SPOT_ID 1/0/T/F/TRUE/FALSE rs715359   literal data
## 485515 SPOT_ID 1/0/T/F/TRUE/FALSE rs1040870  literal data
## 485516 SPOT_ID 1/0/T/F/TRUE/FALSE rs10936224 literal data
## 485517 SPOT_ID 1/0/T/F/TRUE/FALSE rs213028   literal data
## ...... ....... .................. .......... ............
## See problems(...) for more details.
fData(gset[[1]]) # list the probes of the first element of the set

Transcriptómica

# The result of getRNA() is handed straight to read_rna().
# NOTE: this reassigns `inmortal`, which held the genome loaded earlier.
inmortal <- read_rna(
  getRNA(
    db = "refseq", # more databases are available
    organism = "Thermococcus gammatolerans",
    path = file.path("_ncbi_downloads", "transcriptome")
  )
)

inmortal
## DNAStringSet object of length 52:
##      width seq                                              names               
##  [1]    78 CCCGCGGTAGCCTAGCCTGGGAG...TCAAATCCGGGCCGCGGGACCA lcl|NC_012804.1_t...
##  [2]    77 GCCGGGGTGGTGTAGCCTGGTCA...TCAAATCCCGGCCCCGGCCCCA lcl|NC_012804.1_t...
##  [3]    78 GGGGGCGTGGTGTAGCCTGGTCC...TCAAATCCCCGCGCCCCCACCA lcl|NC_012804.1_t...
##  [4]    76 AGCCCCGTGGTGTAGCGGCCAAG...TCGAATCCCGGCGGGGCTACCA lcl|NC_012804.1_t...
##  [5]    77 GGGCCCGTGGTCTAGACGGTTAT...TCGAATCCCCGCGGGCCCACCA lcl|NC_012804.1_t...
##  ...   ... ...
## [48]    85 GCGGGGGTTGCCGAGCCTGGTCA...GGTTCAAATCCCCGCCCCCGCA lcl|NC_012804.1_t...
## [49]    87 GCCGGGATTGCCTAGCCTGGGAA...TCAAATCCCCGTCCCGGCGCCA lcl|NC_012804.1_t...
## [50]  2876 GCAACTAAGCCGCCTGGTGGATG...GTCCCCGAAAATAGGCGGCCAT lcl|NC_012804.1_r...
## [51]    77 GGGCCGGTAGCTCAGCCTGGGAG...TCGAATCCCGGCCGGTCCACCA lcl|NC_012804.1_t...
## [52]  1513 GTACTCCTTGCTCAATTCCGGTT...CGATCACCTCCTATCGCCGGAA lcl|NC_012804.1_r...
# Show ten randomly chosen sequence names.
sample(names(inmortal), 10)
##  [1] "lcl|NC_012804.1_trna_41 [locus_tag=TGAM_RS09680] [db_xref=GeneID:7987160] [product=tRNA-Ser] [location=complement(1835756..1835841)] [gbkey=tRNA]"         
##  [2] "lcl|NC_012804.1_trna_25 [locus_tag=TGAM_RS05685] [db_xref=GeneID:7986995] [product=tRNA-Met] [location=1066698..1066775] [gbkey=tRNA]"                     
##  [3] "lcl|NC_012804.1_trna_38 [locus_tag=TGAM_RS09660] [db_xref=GeneID:7988314] [product=tRNA-Leu] [location=complement(1834145..1834232)] [gbkey=tRNA]"         
##  [4] "lcl|NC_012804.1_rrna_52 [locus_tag=TGAM_RS10410] [db_xref=GeneID:7987171] [product=16S ribosomal RNA] [location=complement(1948874..1950386)] [gbkey=rRNA]"
##  [5] "lcl|NC_012804.1_trna_37 [locus_tag=TGAM_RS09290] [db_xref=GeneID:7987156] [product=tRNA-Ile] [location=1769527..1769604] [gbkey=tRNA]"                     
##  [6] "lcl|NC_012804.1_trna_33 [locus_tag=TGAM_RS09030] [db_xref=GeneID:7987614] [product=tRNA-Ala] [location=1724892..1724969] [gbkey=tRNA]"                     
##  [7] "lcl|NC_012804.1_trna_11 [locus_tag=TGAM_RS02060] [db_xref=GeneID:7987952] [product=tRNA-Ile] [location=complement(391152..391229)] [gbkey=tRNA]"           
##  [8] "lcl|NC_012804.1_trna_42 [locus_tag=TGAM_RS09765] [db_xref=GeneID:7988332] [product=tRNA-Gly] [location=1853611..1853688] [gbkey=tRNA]"                     
##  [9] "lcl|NC_012804.1_trna_51 [locus_tag=TGAM_RS10405] [db_xref=GeneID:7987170] [product=tRNA-Ala] [location=complement(1948755..1948831)] [gbkey=tRNA]"         
## [10] "lcl|NC_012804.1_trna_21 [locus_tag=TGAM_RS03500] [db_xref=GeneID:7987138] [product=tRNA-Glu] [location=644930..645007] [gbkey=tRNA]"

Proteómica

# Species whose proteomes are requested.
download_species <- c(
  "Thermococcus gammatolerans",
  "Thermotoga maritima"
)
# Retrieve these two species from NCBI RefSeq.
mis.proteomas <- getProteomeSet(
  "refseq",
  organisms = download_species,
  path = "set_proteomes"
)

# Read the first entry of the result.
read_proteome(mis.proteomas[1])
## AAStringSet object of length 2137:
##        width seq                                            names               
##    [1]    51 MARNKPLAKKLRLAKAAKQNRR...TNRKVMTHPKRRHWRRTKLKE WP_010477181.1 MU...
##    [2]    91 MAEEHVVYIGKKPVMNYVLAVI...LPTADGRTANTSTIEIVLEKP WP_010477671.1 MU...
##    [3]    56 MAKADYNKRKPRKFGKGARRCM...LMLCRHCFREVAPKLGFKKYE WP_011250478.1 MU...
##    [4]   102 MQKARIKLASTNIKALNEVTDQ...MRQIMRIRVPEDVTIEIELIS WP_012571711.1 MU...
##    [5]   208 MKTWRRYEEYLLTGEWHVHTNY...EVKTVRGGDVHGVGEFKYFWG WP_012751161.1 PH...
##    ...   ... ...
## [2133]   267 MIGIIFDMDGVVYRGNRPIDGA...PDLVFPSIKELKDYLSTVLGD WP_169302034.1 HA...
## [2134]   246 MRVIFLDLDGTLLGDDYSPENA...KLSHPKAKHISSIEELLGVIP WP_169302035.1 ma...
## [2135]   108 MAQKPIGNSRAKQLPKVFNSEE...AYPKAGSAVAKYVSTIGRWVS WP_169302036.1 hy...
## [2136]   425 MILTRHDTTIQSLAKEYFLKSD...VQEWRDAAKAKYQKVLQEVNG WP_169302037.1 ex...
## [2137]   123 MEVNGRKVAGFALYFLGIALGI...GVVLTTLGLFVYKIGGREDAR WP_169302038.1 hy...
# Read the second entry of mis.proteomas.
read_proteome(mis.proteomas[2])
## AAStringSet object of length 1808:
##        width seq                                            names               
##    [1]    72 MRKIFTAIIEYDPEKKQYVGMV...EAGDEINLQEFVALEMIEVET WP_004079900.1 MU...
##    [2]    77 MSYLPIVDPKTMEKVLLKLGFQ...RKIIREAGISVEEFKKVLENL WP_004079902.1 ty...
##    [3]    70 METQKEIVFIAVESEDGGYIEK...FDEGAPKYVHARFVKDVTIAV WP_004079908.1 MU...
##    [4]   580 MNLRSIQKILRFYSLIRKRFLV...TFRRIIETYVNESKRIADKDV WP_004079909.1 MU...
##    [5]   144 MEAAVVVAYSYFVLKLEFAISN...LLGHVLFKKIERKSRAEGELV WP_004079914.1 MU...
##    ...   ... ...
## [1804]   247 MGGGKMIKRVKTGIPGMDEILH...HPFEITDKGIVIYPSEGGEGR WP_162487497.1 Ka...
## [1805]   262 MGPVDIGLIQLLSAYIFVVVLM...VSVFLYLGYKAFFNRENQLIV WP_164924970.1 ir...
## [1806]   280 MKKILTIVRYILIAICLIFFLF...PVVVFALVAQRYLIRGLTSER WP_164924971.1 AB...
## [1807]   246 MLKWLDSNPSIQELKKFAKSLG...FPSGQGMMAQGIMWEIFRSGR WP_164924972.1 DU...
## [1808]   525 MTGRFLKIIIKKATENLLKHRD...NLDLEIYEGGQPHYPYLMLLQ WP_164924973.1 DA...

Proteómica: importación desde ProteomeXchange (R for Proteomics)

Estudios diferenciales

#BiocManager::install()
#BiocManager::install("RforProteomics",
#                     ask = F, 
#                     dependencies = TRUE,
#                     type = "source",
#                     checkBuilt = TRUE)
## Experiment information
library("rpx")
# Create a PXDataset object for the identifier "PXD000001".
px1 <- PXDataset("PXD000001")
# List the files that belong to px1.
pxfiles(px1)
##  [1] "F063721.dat"                                                         
##  [2] "F063721.dat-mztab.txt"                                               
##  [3] "PRIDE_Exp_Complete_Ac_22134.xml.gz"                                  
##  [4] "PRIDE_Exp_mzData_Ac_22134.xml.gz"                                    
##  [5] "PXD000001_mztab.txt"                                                 
##  [6] "README.txt"                                                          
##  [7] "TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01-20141210.mzML" 
##  [8] "TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01-20141210.mzXML"
##  [9] "TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01.mzXML"         
## [10] "TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01.raw"           
## [11] "erwinia_carotovora.fasta"                                            
## [12] "generated"

Para instalar MSnbase:

Dentro de la terminal:

sudo apt-cache search libnetcdf 
sudo apt-get update 
sudo apt-get install libnetcdf-c++4-1

Usar la siguiente página omitiendo los pasos de instalación de R 4.0

Para_instalar

#BiocManager::install("MSnbase", ask = T, dependencies = TRUE,   type = "source", checkBuilt = TRUE)
library(magrittr)
library(Biobase)
library(MSnbase)
## Loading required package: mzR
## Loading required package: Rcpp
## Loading required package: S4Vectors
## Loading required package: stats4
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:dplyr':
## 
##     first, rename
## The following object is masked from 'package:base':
## 
##     expand.grid
## Loading required package: ProtGenerics
## 
## Attaching package: 'ProtGenerics'
## The following object is masked from 'package:stats':
## 
##     smooth
## 
## This is MSnbase version 2.15.6 
##   Visit https://lgatto.github.io/MSnbase/ to get started.
## 
## Attaching package: 'MSnbase'
## The following object is masked from 'package:base':
## 
##     trimws
## Downloading the mzTab data
# Fetch the file "PXD000001_mztab.txt" of px1; the result is kept in `mztab`.
mztab <- pxget(px1, "PXD000001_mztab.txt")
## Downloading 1 file
## /home/alejandro/DeepenData/Taller_sept_oct_nov_dic_2020/PXD000001_mztab.txt already present.
# Read the mzTab data with what = "PEP" and version = "0.9"; the warning
# echoed right after this call reports that version 0.9 is deprecated.
qnt <- readMzTabData(mztab, what = "PEP", version = "0.9")
## Warning: Version 0.9 is deprecated. Please see '?readMzTabData' and '?MzTab' for
## details.

TMT: https://en.wikipedia.org/wiki/Tandem_mass_tag

# Name the samples after the reporters of TMT6.
sampleNames(qnt) <- reporterNames(TMT6)
# head(exprs(qnt))
# Remove features with missing values, then show the processing log.
qnt <- filterNA(qnt)
processingData(qnt)
## - - - Processing information - - -
## mzTab read: Fri Sep 25 21:15:46 2020 
## Subset [2351,6][1504,6] Fri Sep 25 21:15:46 2020 
## Removed features with more than 0 NAs: Fri Sep 25 21:15:46 2020 
## Dropped featureData's levels Fri Sep 25 21:15:46 2020 
##  MSnbase version: 2.15.6
## combine into proteins
## - using the 'accession' feature meta data
## - sum the peptide intensities
protqnt <- combineFeatures(
  qnt,
  groupBy = fData(qnt)$accession,
  fun = sum
)


# Show the last rows of the combined expression matrix.
tail(exprs(protqnt))
##          TMT6.126 TMT6.127  TMT6.128  TMT6.129 TMT6.130   TMT6.131
## ECA4517  371367.4   376656  439160.7  371917.9   337498   376772.2
## P00489  5086802.6  4970118 5563334.7 5471237.4  2778741  2905920.8
## P00761  3076535.9  3190179 3833943.6 3835475.1  2701778  3646836.3
## P00924  5484890.4  3269353 1850660.8 1062968.7  1596193  5199268.0
## P02769   837322.6  1486133 2796767.6 6210387.7  3511954  1126899.3
## P62894  5498893.0  7031520 6517750.7 6773717.8  7095527 14682351.1
library("RColorBrewer") ## Color palettes
library("ggplot2")  ## Convenient and nice plotting
library("reshape2") ## Flexibly reshape data

# Five colours from the "Set1" palette, one per plotted protein.
cls <- brewer.pal(5, "Set1")
# Plot the last five rows of the expression matrix, transposed so that each
# protein becomes one line across the reporters.
matplot(
  t(tail(exprs(protqnt), n = 5)),
  type = "b",
  lty = 1,
  col = cls,
  xlab = "TMT reporters",
  ylab = "Protein intensity (summed peptides)"
)
# Legend labelled with the names of those same five features.
legend(
  "topright",
  legend = tail(featureNames(protqnt), n = 5),
  col = cls,
  lty = 1,
  cex = 0.8,
  bty = "n"
)

Importación de colección multi-ómica

Collection Retrieval

The automated retrieval of collections (= Genome, Proteome, CDS, RNA, GFF, Repeat Masker, AssemblyStats) will make sure that the genome file of an organism will match the CDS, proteome, RNA, GFF, etc file and was generated using the same genome assembly version. One aspect of why genomics studies fail in computational and biological reproducibility is that it is not clear whether CDS, proteome, RNA, GFF, etc files used in a proposed analysis were generated using the same genome assembly file denoting the same genome assembly version. To avoid this seemingly trivial mistake we encourage users to retrieve genome file collections using the biomartr function getCollection() and attach the corresponding output as Supplementary Data to the respective genomics study to ensure computational and biological reproducibility.

By specifying the scientific name of an organism of interest a collection consisting of the genome file, proteome file, CDS file, RNA file, GFF file, Repeat Masker file, AssemblyStats file of the organism of interest can be downloaded and stored locally.

# Retrieve the whole file collection for the organism in one call.
# The download directory is named after the database that is queried: the
# request uses db = "genbank", so storing the result under a "refseq" folder
# would mislabel where the files came from.
inmortal_collect <- getCollection(
  db = "genbank",
  organism = "Thermococcus gammatolerans",
  path = file.path("genbank", "Collections")
)