# Show the current working directory.
getwd()
## [1] "/home/alejandro/DeepenData/Taller_sept_oct_nov_dic_2020"
# Build a sub-path by plain string concatenation (separator written by hand).
paste0(getwd(), "/hola")
## [1] "/home/alejandro/DeepenData/Taller_sept_oct_nov_dic_2020/hola"
# Build the same sub-path with file.path(), which inserts the separator itself.
file.path(getwd(), "hola")
## [1] "/home/alejandro/DeepenData/Taller_sept_oct_nov_dic_2020/hola"
# file.path() accepts any number of components.
file.path(getwd(), "hola", "hola")
## [1] "/home/alejandro/DeepenData/Taller_sept_oct_nov_dic_2020/hola/hola"
mtcars:
mpg: Miles/(US) gallon
cyl: Number of cylinders
disp: Displacement (cu.in.)
hp: Gross horsepower
drat: Rear axle ratio
wt: Weight (1000 lbs)
qsec: 1/4 mile time
vs: V/S
am: Transmission (0 = automatic, 1 = manual)
gear: Number of forward gears
carb: Number of carburetors
#install.packages("devtools", dependencies = TRUE)
#install.packages("curl", dependencies = TRUE)
#install.packages("tidyverse", dependencies = TRUE)
library(readr)
# Work on a copy of the built-in mtcars data set and print it.
my.data <- mtcars
my.data
# Exporting data
# The destination file is passed positionally: readr 1.4.0 renamed the `path`
# argument of the write_*() functions to `file` (and deprecated `path`), so the
# positional form is the one that works on both older and newer readr versions.
write_csv(my.data, "mtcars.csv") # Comma-separated values
write_tsv(my.data, "mtcars.tsv") # Tab-separated values
write_delim(my.data, "mtcars.txt", delim = ";") # Text with ";" as delimiter
write_delim(my.data, "mtcars.hola", delim = "*_*") # Text with "*_*" as delimiter
openxlsx::write.xlsx(my.data, file = "mtcars.xlsx") # Excel
# (Re)importing data
mtcars.csv <- read_csv("mtcars.csv")
mtcars.tsv <- read_tsv("mtcars.tsv")
mtcars.txt <- read_delim("mtcars.txt", delim = ";")
# Read the file back with the same "*_*" delimiter it was written with; the
# previous delim = "*" did not match the writer (nor the later re-read of the
# same file in this script).
mtcars.hola <- read_delim("mtcars.hola", delim = "*_*")
library(readxl) # Importing an Excel file
mtcars.xlsx <- read_excel("mtcars.xlsx")
library(tibble)
# Move the row names of the data frame into a regular column named "Car brand".
my.data.2 <- rownames_to_column(my.data, var = "Car brand")
library(magrittr) # provides the %>% pipe
# Same operation, written with the pipe.
my.data.2 <- my.data %>% rownames_to_column(var = "Car brand")
##### With pipes #############
# Export the table (now carrying the "Car brand" column) to every format.
my.data.2 %>%
  write_csv("mtcars.csv")
my.data.2 %>%
  write_tsv("mtcars.tsv")
my.data.2 %>%
  write_delim("mtcars.txt", delim = ";")
my.data.2 %>%
  write_delim("mtcars.hola", delim = "*_*")
my.data.2 %>%
  openxlsx::write.xlsx("mtcars.xlsx")
########### Reading ##################
# Read every file back; skip = 0 means no leading lines are skipped.
mtcars.csv <- read_csv("mtcars.csv", skip = 0)
mtcars.tsv <- read_tsv("mtcars.tsv", skip = 0)
mtcars.txt <- read_delim("mtcars.txt", delim = ";", skip = 0)
mtcars.hola <- read_delim("mtcars.hola", delim = "*_*", skip = 0)
mtcars.xlsx <- read_excel("mtcars.xlsx", skip = 0)
######## Check equality ###############
library(dplyr)
# The data read from the csv should equal the data read from the tsv.
all_equal(mtcars.csv, mtcars.tsv)
## [1] TRUE
# The data read from the txt should equal the data read from the .hola file.
all_equal(mtcars.txt, mtcars.hola)
## [1] TRUE
# The data read from the .hola file should equal the data read from Excel.
all_equal(mtcars.hola, mtcars.xlsx)
## [1] TRUE
# Count how many distinct objects remain among the five imported tables.
# TODO: why are there two distinct results? Is one a table and the other
# console output?
# NOTE(review): presumably one table differs only in class/attributes (e.g. the
# readxl result versus the readr results), which all_equal() does not flag but
# unique() does; confirm.
length(unique(list(mtcars.csv, mtcars.tsv, mtcars.txt, mtcars.hola, mtcars.xlsx)))
## [1] 2
library(utils)
library(httr)
# Works on both Linux and Windows:
my.github.url <- "https://github.com/DeepenData/Computational-Biology-and-Bioinformatics/raw/master/Wang2018_supplemental_DEGs.xlsx"
# Create the temporary .xlsx destination first, then let httr write the
# response body to it.
tf <- tempfile(fileext = ".xlsx")
GET(my.github.url, write_disk(tf))
## Response [https://raw.githubusercontent.com/DeepenData/Computational-Biology-and-Bioinformatics/master/Wang2018_supplemental_DEGs.xlsx]
## Date: 2020-09-26 00:12
## Status: 200
## Content-Type: application/octet-stream
## Size: 75.6 kB
## <ON DISK> /tmp/RtmpLnqSq0/filecf511adf424.xlsx
# Read the downloaded workbook, skipping its first row.
hola.github <- read_excel(tf, skip = 1)
hola.github
Descargando data en línea
### Dropbox
# Shared Dropbox link to the CSV file.
# NOTE(review): Dropbox share links may need a "?dl=1" suffix to return the raw
# file rather than a preview page, depending on the client; confirm.
my.dropbox.url <- 'https://www.dropbox.com/s/j3kiivpcbghpb4v/log2FC.csv'
# Download once, picking the backend for the current platform ("libcurl" on
# Windows, "wget" elsewhere). Previously both calls ran unconditionally, so the
# "wget" call failed on Windows and the file was fetched twice on Linux.
download.file(
  url = my.dropbox.url,
  destfile = "log2FC.csv",
  method = if (.Platform$OS.type == "windows") "libcurl" else "wget"
)
# Read the downloaded CSV and print it.
hola.dropbox <- read_csv("log2FC.csv")
hola.dropbox
library(utils)
library(httr)
### Arbitrary web site
# On Linux:
my.supplementary.url <- "https://www.pnas.org/highwire/filestream/794560/field_highwire_adjunct_files/0/pnas.1800165115.sd01.xlsx"
# ERROR! This wget-based attempt is the one marked as failing.
download.file(
  url = my.supplementary.url,
  destfile = "pnas.1800165115.sd01.xlsx",
  method = "wget"
)
hola.supplementary <- read_excel("pnas.1800165115.sd01.xlsx")
# Works on both Linux and Windows: fetch with httr into a temporary file.
tf <- tempfile(fileext = ".xlsx")
GET(my.supplementary.url, write_disk(tf))
## Response [https://www.pnas.org/content/pnas/suppl/2018/02/07/1800165115.DCSupplemental/pnas.1800165115.sd01.xlsx]
## Date: 2020-09-26 00:12
## Status: 200
## Content-Type: application/vnd.openxmlformats-officedocument.spreadsheetml.sheet
## Size: 61.2 kB
## <ON DISK> /tmp/RtmpLnqSq0/filecf525a4caf8.xlsx
# Read the downloaded workbook, skipping its first row.
hola.supplementary <- read_excel(tf, skip = 1)
hola.supplementary
# read_csv() can also read straight from a URL.
read_csv("https://raw.githubusercontent.com/DeepenData/Computational-Biology-and-Bioinformatics/master/labels.csv")
#install.packages("BiocManager", dependencies = TRUE)
#BiocManager::install()
#BiocManager::install("Biostrings")
#BiocManager::install("biomaRt")
#install.packages("biomartr")#, dependencies = TRUE)
library(biomartr)
library(magrittr)
# Objects of class DNAStringSet
# Download the genome from RefSeq (other databases are available) into
# _ncbi_downloads/genomes and read the downloaded file.
inmortal <- read_genome(
  getGenome(
    db       = "refseq",
    organism = "Thermococcus gammatolerans",
    path     = file.path("_ncbi_downloads", "genomes")
  )
)
inmortal
## DNAStringSet object of length 1:
## width seq names
## [1] 2045438 GTTGATTACCCAATCTTCGCCT...GGAAGATGATATTATAAGGCAG NC_012804.1 Therm...
Dato impresionante: se pueden leer genomas directamente desde una URL del NCBI
# Read a compressed FASTA file directly from its NCBI FTP URL.
url.NCBI <- "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/GCF_000001405.35_GRCh38.p9/GCF_000001405.35_GRCh38.p9_cds_from_genomic.fna.gz"
human <- read_genome(url.NCBI)
# Show ten randomly chosen sequence names.
sample(names(human), 10)
## [1] "lcl|NC_000003.12_cds_XP_016862545.1_23624 [gene=CCDC14] [db_xref=GeneID:64770] [protein=coiled-coil domain-containing protein 14 isoform X4] [protein_id=XP_016862545.1] [location=complement(join(123914779..123915718,123931102..123931234,123931308..123931526,123933673..123933755,123944849..123944990,123946803..123947319,123948691..123948785,123948896..123949132,123952595..123952610))]"
## [2] "lcl|NC_000017.11_cds_XP_006721793.1_88054 [gene=TMEM92] [db_xref=GeneID:162461] [protein=transmembrane protein 92 isoform X1] [protein_id=XP_006721793.1] [location=join(50274502..50274570,50277715..50277740,50278556..50278630,50278801..50278954,50279195..50279308)]"
## [3] "lcl|NC_000010.11_cds_XP_005269439.1_58024 [gene=SMNDC1] [db_xref=GeneID:10285] [protein=survival of motor neuron-related-splicing factor 30 isoform X1] [protein_id=XP_005269439.1] [location=complement(join(110294150..110294287,110295228..110295381,110297567..110297728,110298648..110298790,110300547..110300549))]"
## [4] "lcl|NC_000005.10_cds_NP_061746.1_33467 [gene=PCDHGB2] [db_xref=CCDS:CCDS54924.1,GeneID:56103] [protein=protocadherin gamma-B2 isoform 1 precursor] [protein_id=NP_061746.1] [location=join(141360136..141362556,141494807..141494865,141505393..141505481,141510947..141511173)]"
## [5] "lcl|NC_000022.11_cds_NP_060299.4_102607 [gene=CECR5] [db_xref=CCDS:CCDS13741.1,GeneID:27440] [protein=cat eye syndrome critical region protein 5 isoform 1] [protein_id=NP_060299.4] [location=complement(join(17138021..17138357,17138550..17138738,17141059..17141233,17143098..17143131,17145024..17145117,17148448..17148560,17149542..17149745,17165209..17165244))]"
## [6] "lcl|NC_000005.10_cds_NP_057728.1_32629 [gene=PRR16] [db_xref=CCDS:CCDS4127.1,GeneID:51334] [protein=protein Largen isoform 2] [protein_id=NP_057728.1] [location=join(120481204..120481293,120685954..120686709)]"
## [7] "lcl|NC_000003.12_cds_NP_001186908.1_24546 [gene=PLSCR2] [db_xref=CCDS:CCDS75029.1,GeneID:57047] [protein=phospholipid scramblase 2 isoform 2] [protein_id=NP_001186908.1] [location=complement(join(146441792..146441821,146449206..146449367,146454002..146454163,146455239..146455459,146458411..146458453,146459848..146460083,146495895..146495922))]"
## [8] "lcl|NC_000004.12_cds_XP_016863462.1_28014 [gene=C4orf22] [db_xref=GeneID:255119] [protein=uncharacterized protein C4orf22 isoform X2] [protein_id=XP_016863462.1] [location=join(80321435..80321463,80335744..80335879,80362754..80362884,80583093..80583183,80869993..80870135,80963517..80963556)]"
## [9] "lcl|NC_000008.11_cds_XP_016869227.1_45966 [gene=BMP1] [db_xref=GeneID:649] [protein=bone morphogenetic protein 1 isoform X2] [protein_id=XP_016869227.1] [location=join(22165406..22165553,22173602..22173715,22176143..22176313,22176533..22176650,22176961..22177139,22177852..22177957,22179705..22179829,22180368..22180483,22192049..22192151,22194058..22194174,22194445..22194590,22194724..22194919,22195462..22195587,22196680..22196840,22197240..22197420,22199264..22199310)]"
## [10] "lcl|NC_000015.10_cds_XP_016877777.1_77468 [gene=MYEF2] [db_xref=GeneID:50804] [protein=myelin expression factor 2 isoform X9] [protein_id=XP_016877777.1] [location=complement(join(48142908..48143071,48149032..48149083,48149163..48149371,48151100..48151171,48151473..48151571,48151874..48151942,48153792..48153893,48157993..48158056,48158175..48158224,48158769..48158922,48159613..48159804,48165933..48165977))]"
# Print the object read from the NCBI URL.
human
## DNAStringSet object of length 114967:
## width seq names
## [1] 918 ATGGTGACTGAATTCATTTT...ATTCTAGTGTAAAGTTTTAG lcl|NC_000001.11_...
## [2] 402 ATGAGTGACAGCATCAACTT...GACCCAGGCACAGGCATTAG lcl|NC_000001.11_...
## [3] 402 ATGAGTGACAGCATCAACTT...GACCCAGGCACAGGCATTAG lcl|NC_000001.11_...
## [4] 402 ATGAGTGACAGCATCAACTT...GACCCAGGCACAGGCATTAG lcl|NC_000001.11_...
## [5] 402 ATGAGTGACAGCATCAACTT...GACCCAGGCACAGGCATTAG lcl|NC_000001.11_...
## ... ... ...
## [114963] 297 ATGCCCCTCATTTACATAAA...TAAACCTACTCCAATGCTAA lcl|NC_012920.1_c...
## [114964] 1378 ATGCTAAAACTAATCGTCCC...CATTACCGGGTTTTCCTCTT lcl|NC_012920.1_c...
## [114965] 1812 ATAACCATGCACACTACTAT...CCCTACTCCTAATCACATAA lcl|NC_012920.1_c...
## [114966] 525 ATGATGTATGCTTTGTTTCT...AGATTGCTCGGGGGAATAGG lcl|NC_012920.1_c...
## [114967] 1141 ATGACCCCAATACGCAAAAC...CAAAATACTCAAATGGGCCT lcl|NC_012920.1_c...
Para más información sobre GEOquery y consultas de bases de datos de expresión génica ver la siguiente clase y rpub:
https://rpubs.com/DeepenData/622645
#if (!requireNamespace("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
#BiocManager::install("GEOquery")
library(GEOquery)
# Download the GEO series GSE36278.
gset <- getGEO("GSE36278")
## Found 1 file(s)
## GSE36278_series_matrix.txt.gz
## Parsed with column specification:
## cols(
## .default = col_double(),
## ID_REF = col_character()
## )
## See spec(...) for full column specifications.
## File stored at:
## /tmp/RtmpLnqSq0/GPL13534.soft
## Warning: 65 parsing failures.
## row col expected actual file
## 485513 SPOT_ID 1/0/T/F/TRUE/FALSE rs10796216 literal data
## 485514 SPOT_ID 1/0/T/F/TRUE/FALSE rs715359 literal data
## 485515 SPOT_ID 1/0/T/F/TRUE/FALSE rs1040870 literal data
## 485516 SPOT_ID 1/0/T/F/TRUE/FALSE rs10936224 literal data
## 485517 SPOT_ID 1/0/T/F/TRUE/FALSE rs213028 literal data
## ...... ....... .................. .......... ............
## See problems(...) for more details.
# List the probes (feature data) of the first element of the downloaded set.
fData(gset[[1]])
# Download the RNA sequences from RefSeq (other databases are available) into
# _ncbi_downloads/transcriptome and read them. The result is stored under the
# name `inmortal`, replacing whatever that name held before.
inmortal <- read_rna(
  getRNA(
    db       = "refseq",
    organism = "Thermococcus gammatolerans",
    path     = file.path("_ncbi_downloads", "transcriptome")
  )
)
inmortal
## DNAStringSet object of length 52:
## width seq names
## [1] 78 CCCGCGGTAGCCTAGCCTGGGAG...TCAAATCCGGGCCGCGGGACCA lcl|NC_012804.1_t...
## [2] 77 GCCGGGGTGGTGTAGCCTGGTCA...TCAAATCCCGGCCCCGGCCCCA lcl|NC_012804.1_t...
## [3] 78 GGGGGCGTGGTGTAGCCTGGTCC...TCAAATCCCCGCGCCCCCACCA lcl|NC_012804.1_t...
## [4] 76 AGCCCCGTGGTGTAGCGGCCAAG...TCGAATCCCGGCGGGGCTACCA lcl|NC_012804.1_t...
## [5] 77 GGGCCCGTGGTCTAGACGGTTAT...TCGAATCCCCGCGGGCCCACCA lcl|NC_012804.1_t...
## ... ... ...
## [48] 85 GCGGGGGTTGCCGAGCCTGGTCA...GGTTCAAATCCCCGCCCCCGCA lcl|NC_012804.1_t...
## [49] 87 GCCGGGATTGCCTAGCCTGGGAA...TCAAATCCCCGTCCCGGCGCCA lcl|NC_012804.1_t...
## [50] 2876 GCAACTAAGCCGCCTGGTGGATG...GTCCCCGAAAATAGGCGGCCAT lcl|NC_012804.1_r...
## [51] 77 GGGCCGGTAGCTCAGCCTGGGAG...TCGAATCCCGGCCGGTCCACCA lcl|NC_012804.1_t...
## [52] 1513 GTACTCCTTGCTCAATTCCGGTT...CGATCACCTCCTATCGCCGGAA lcl|NC_012804.1_r...
# Show ten randomly chosen sequence names.
sample(names(inmortal), 10)
## [1] "lcl|NC_012804.1_trna_41 [locus_tag=TGAM_RS09680] [db_xref=GeneID:7987160] [product=tRNA-Ser] [location=complement(1835756..1835841)] [gbkey=tRNA]"
## [2] "lcl|NC_012804.1_trna_25 [locus_tag=TGAM_RS05685] [db_xref=GeneID:7986995] [product=tRNA-Met] [location=1066698..1066775] [gbkey=tRNA]"
## [3] "lcl|NC_012804.1_trna_38 [locus_tag=TGAM_RS09660] [db_xref=GeneID:7988314] [product=tRNA-Leu] [location=complement(1834145..1834232)] [gbkey=tRNA]"
## [4] "lcl|NC_012804.1_rrna_52 [locus_tag=TGAM_RS10410] [db_xref=GeneID:7987171] [product=16S ribosomal RNA] [location=complement(1948874..1950386)] [gbkey=rRNA]"
## [5] "lcl|NC_012804.1_trna_37 [locus_tag=TGAM_RS09290] [db_xref=GeneID:7987156] [product=tRNA-Ile] [location=1769527..1769604] [gbkey=tRNA]"
## [6] "lcl|NC_012804.1_trna_33 [locus_tag=TGAM_RS09030] [db_xref=GeneID:7987614] [product=tRNA-Ala] [location=1724892..1724969] [gbkey=tRNA]"
## [7] "lcl|NC_012804.1_trna_11 [locus_tag=TGAM_RS02060] [db_xref=GeneID:7987952] [product=tRNA-Ile] [location=complement(391152..391229)] [gbkey=tRNA]"
## [8] "lcl|NC_012804.1_trna_42 [locus_tag=TGAM_RS09765] [db_xref=GeneID:7988332] [product=tRNA-Gly] [location=1853611..1853688] [gbkey=tRNA]"
## [9] "lcl|NC_012804.1_trna_51 [locus_tag=TGAM_RS10405] [db_xref=GeneID:7987170] [product=tRNA-Ala] [location=complement(1948755..1948831)] [gbkey=tRNA]"
## [10] "lcl|NC_012804.1_trna_21 [locus_tag=TGAM_RS03500] [db_xref=GeneID:7987138] [product=tRNA-Glu] [location=644930..645007] [gbkey=tRNA]"
# Species whose proteomes will be downloaded.
download_species <- c("Thermococcus gammatolerans", "Thermotoga maritima")
# Retrieve these two species from NCBI RefSeq into the "set_proteomes" folder.
mis.proteomas <- getProteomeSet(
  "refseq",
  organisms = download_species,
  path = "set_proteomes"
)
# Read the first downloaded proteome.
read_proteome(mis.proteomas[1])
## AAStringSet object of length 2137:
## width seq names
## [1] 51 MARNKPLAKKLRLAKAAKQNRR...TNRKVMTHPKRRHWRRTKLKE WP_010477181.1 MU...
## [2] 91 MAEEHVVYIGKKPVMNYVLAVI...LPTADGRTANTSTIEIVLEKP WP_010477671.1 MU...
## [3] 56 MAKADYNKRKPRKFGKGARRCM...LMLCRHCFREVAPKLGFKKYE WP_011250478.1 MU...
## [4] 102 MQKARIKLASTNIKALNEVTDQ...MRQIMRIRVPEDVTIEIELIS WP_012571711.1 MU...
## [5] 208 MKTWRRYEEYLLTGEWHVHTNY...EVKTVRGGDVHGVGEFKYFWG WP_012751161.1 PH...
## ... ... ...
## [2133] 267 MIGIIFDMDGVVYRGNRPIDGA...PDLVFPSIKELKDYLSTVLGD WP_169302034.1 HA...
## [2134] 246 MRVIFLDLDGTLLGDDYSPENA...KLSHPKAKHISSIEELLGVIP WP_169302035.1 ma...
## [2135] 108 MAQKPIGNSRAKQLPKVFNSEE...AYPKAGSAVAKYVSTIGRWVS WP_169302036.1 hy...
## [2136] 425 MILTRHDTTIQSLAKEYFLKSD...VQEWRDAAKAKYQKVLQEVNG WP_169302037.1 ex...
## [2137] 123 MEVNGRKVAGFALYFLGIALGI...GVVLTTLGLFVYKIGGREDAR WP_169302038.1 hy...
# Read the second downloaded proteome.
read_proteome(mis.proteomas[2])
## AAStringSet object of length 1808:
## width seq names
## [1] 72 MRKIFTAIIEYDPEKKQYVGMV...EAGDEINLQEFVALEMIEVET WP_004079900.1 MU...
## [2] 77 MSYLPIVDPKTMEKVLLKLGFQ...RKIIREAGISVEEFKKVLENL WP_004079902.1 ty...
## [3] 70 METQKEIVFIAVESEDGGYIEK...FDEGAPKYVHARFVKDVTIAV WP_004079908.1 MU...
## [4] 580 MNLRSIQKILRFYSLIRKRFLV...TFRRIIETYVNESKRIADKDV WP_004079909.1 MU...
## [5] 144 MEAAVVVAYSYFVLKLEFAISN...LLGHVLFKKIERKSRAEGELV WP_004079914.1 MU...
## ... ... ...
## [1804] 247 MGGGKMIKRVKTGIPGMDEILH...HPFEITDKGIVIYPSEGGEGR WP_162487497.1 Ka...
## [1805] 262 MGPVDIGLIQLLSAYIFVVVLM...VSVFLYLGYKAFFNRENQLIV WP_164924970.1 ir...
## [1806] 280 MKKILTIVRYILIAICLIFFLF...PVVVFALVAQRYLIRGLTSER WP_164924971.1 AB...
## [1807] 246 MLKWLDSNPSIQELKKFAKSLG...FPSGQGMMAQGIMWEIFRSGR WP_164924972.1 DU...
## [1808] 525 MTGRFLKIIIKKATENLLKHRD...NLDLEIYEGGQPHYPYLMLLQ WP_164924973.1 DA...
Proteómica: importando datos de ProteomeXchange (R for Proteomics)
#BiocManager::install()
#BiocManager::install("RforProteomics",
# ask = F,
# dependencies = TRUE,
# type = "source",
# checkBuilt = TRUE)
## Experiment information
library(rpx)
# Create a handle for the ProteomeXchange dataset PXD000001 ...
px1 <- PXDataset("PXD000001")
# ... and list the files it provides.
pxfiles(px1)
## [1] "F063721.dat"
## [2] "F063721.dat-mztab.txt"
## [3] "PRIDE_Exp_Complete_Ac_22134.xml.gz"
## [4] "PRIDE_Exp_mzData_Ac_22134.xml.gz"
## [5] "PXD000001_mztab.txt"
## [6] "README.txt"
## [7] "TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01-20141210.mzML"
## [8] "TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01-20141210.mzXML"
## [9] "TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01.mzXML"
## [10] "TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01.raw"
## [11] "erwinia_carotovora.fasta"
## [12] "generated"
Para instalar MSnbase:
Dentro de la terminal:
sudo apt-cache search libnetcdf
sudo apt-get update
sudo apt-get install libnetcdf-c++4-1
Usar la siguiente página omitiendo los pasos de instalación de R 4.0
#BiocManager::install("MSnbase", ask = T, dependencies = TRUE, type = "source", checkBuilt = TRUE)
library(magrittr)
library(Biobase)
library(MSnbase)
## Loading required package: mzR
## Loading required package: Rcpp
## Loading required package: S4Vectors
## Loading required package: stats4
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:dplyr':
##
## first, rename
## The following object is masked from 'package:base':
##
## expand.grid
## Loading required package: ProtGenerics
##
## Attaching package: 'ProtGenerics'
## The following object is masked from 'package:stats':
##
## smooth
##
## This is MSnbase version 2.15.6
## Visit https://lgatto.github.io/MSnbase/ to get started.
##
## Attaching package: 'MSnbase'
## The following object is masked from 'package:base':
##
## trimws
## Downloading the mzTab data
# Fetch the mzTab file of the dataset; pxget() reports when the file is
# already present locally (see the echoed output below).
mztab <- pxget(px1, "PXD000001_mztab.txt")
## Downloading 1 file
## /home/alejandro/DeepenData/Taller_sept_oct_nov_dic_2020/PXD000001_mztab.txt already present.
# Read the "PEP" section of the mzTab file using format version 0.9.
# NOTE(review): the run recorded in this file warns that version 0.9 is
# deprecated; see ?readMzTabData before changing it.
qnt <- readMzTabData(mztab, what = "PEP", version = "0.9")
## Warning: Version 0.9 is deprecated. Please see '?readMzTabData' and '?MzTab' for
## details.
TMT: https://en.wikipedia.org/wiki/Tandem_mass_tag
# Name the samples after the TMT6 reporters.
sampleNames(qnt) <- reporterNames(TMT6)
#head(exprs(qnt))
# Drop the features that contain missing values.
qnt <- filterNA(qnt)
# Show the processing log recorded on the object.
processingData(qnt)
## - - - Processing information - - -
## mzTab read: Fri Sep 25 21:15:46 2020
## Subset [2351,6][1504,6] Fri Sep 25 21:15:46 2020
## Removed features with more than 0 NAs: Fri Sep 25 21:15:46 2020
## Dropped featureData's levels Fri Sep 25 21:15:46 2020
## MSnbase version: 2.15.6
## Combine peptides into proteins:
## - group by the 'accession' feature meta data
## - sum the peptide intensities within each group
# NOTE(review): newer MSnbase releases may prefer `method =` over `fun =`;
# confirm against ?combineFeatures before changing.
protqnt <- combineFeatures(
  qnt,
  groupBy = fData(qnt)$accession,
  fun = sum
)
# Show the last rows of the protein-level expression matrix.
tail(exprs(protqnt))
## TMT6.126 TMT6.127 TMT6.128 TMT6.129 TMT6.130 TMT6.131
## ECA4517 371367.4 376656 439160.7 371917.9 337498 376772.2
## P00489 5086802.6 4970118 5563334.7 5471237.4 2778741 2905920.8
## P00761 3076535.9 3190179 3833943.6 3835475.1 2701778 3646836.3
## P00924 5484890.4 3269353 1850660.8 1062968.7 1596193 5199268.0
## P02769 837322.6 1486133 2796767.6 6210387.7 3511954 1126899.3
## P62894 5498893.0 7031520 6517750.7 6773717.8 7095527 14682351.1
library(RColorBrewer) ## Color palettes
library(ggplot2) ## Convenient and nice plotting
library(reshape2) ## Flexibly reshape data
# Five colours, one per plotted protein.
cls <- brewer.pal(5, "Set1")
# Plot the last five proteins: one line per protein, one point per reporter.
matplot(
  t(tail(exprs(protqnt), n = 5)),
  type = "b",
  lty = 1,
  col = cls,
  ylab = "Protein intensity (summed peptides)",
  xlab = "TMT reporters"
)
# Legend with the names of those same five proteins.
legend(
  "topright",
  tail(featureNames(protqnt), n = 5),
  lty = 1,
  bty = "n",
  cex = .8,
  col = cls
)
Collection Retrieval
The automated retrieval of collections (= Genome, Proteome, CDS, RNA, GFF, Repeat Masker, AssemblyStats) ensures that the genome file of an organism matches its CDS, proteome, RNA, GFF, etc. files, i.e. that all of them were generated from the same genome assembly version. One reason genomics studies fail in computational and biological reproducibility is that it is often unclear whether the CDS, proteome, RNA, GFF, etc. files used in a proposed analysis were generated from the same genome assembly file (and hence the same genome assembly version). To avoid this seemingly trivial mistake, we encourage users to retrieve genome file collections using the biomartr function getCollection() and to attach the corresponding output as Supplementary Data to the respective genomics study to ensure computational and biological reproducibility.
By specifying the scientific name of an organism of interest a collection consisting of the genome file, proteome file, CDS file, RNA file, GFF file, Repeat Masker file, AssemblyStats file of the organism of interest can be downloaded and stored locally.
# Download the whole file collection for the organism in a single call.
# NOTE(review): db is "genbank" while the destination folder is named
# "refseq"; confirm which of the two is intended.
inmortal_collect <- getCollection(
  db       = "genbank",
  organism = "Thermococcus gammatolerans",
  path     = file.path("refseq", "Collections")
)