Primera clase - Taller DeepenData: R intermedio-avanzado

Contenidos:

Item 1
Item 2
Item 3
- Item 3a
- Item 3b

getwd()

## [1] "/home/alejandro/DeepenData/Taller_sept_oct_nov_dic_2020"

paste0(getwd(),'/hola')

## [1] "/home/alejandro/DeepenData/Taller_sept_oct_nov_dic_2020/hola"

file.path(getwd(),'hola')

## [1] "/home/alejandro/DeepenData/Taller_sept_oct_nov_dic_2020/hola"

file.path(getwd(),'hola','hola')

## [1] "/home/alejandro/DeepenData/Taller_sept_oct_nov_dic_2020/hola/hola"

mtcars:

mpg: Miles/(US) gallon
cyl: Number of cylinders
disp: Displacement (cu.in.)
hp: Gross horsepower
drat: Rear axle ratio
wt: Weight (1000 lbs)
qsec: 1/4 mile time
vs: V/S
am: Transmission (0 = automatic, 1 = manual)
gear: Number of forward gears
carb: Number of carburetors

#install.packages("devtools", dependencies = TRUE)
#install.packages("curl", dependencies = TRUE)
#install.packages("tidyverse", dependencies = TRUE)
library(readr)
my.data <- mtcars
write_csv(my.data,  path = 'mtcars.csv' )
write_tsv(my.data,  path = 'mtcars.tsv')
write_delim(my.data,path = 'mtcars.txt', delim = ";")
write_delim(my.data,path = 'mtcars.hola', delim = "*_*")
openxlsx::write.xlsx(my.data, file = "mtcars.xlsx")

Lectura

mtcars.csv<-read_csv('mtcars.csv')
mtcars.tsv<-read_tsv('mtcars.tsv')
mtcars.txt<-read_delim('mtcars.txt', delim = ";")
mtcars.hola<-read_delim('mtcars.hola', delim = "*_*")
library(readxl)
mtcars.xlsx<-read_excel('mtcars.xlsx')

Escritura y lectura con columna de rownames

library(tibble)
rownames_to_column(my.data, var = "Car brand")   -> my.data.2
library(magrittr)
my.data %>% rownames_to_column(var = "Car brand")-> my.data.2
#####Con pipes#############
my.data.2 %>%   write_csv('mtcars.csv')
my.data.2 %>%   write_tsv('mtcars.tsv')
my.data.2 %>% write_delim('mtcars.txt',  delim = ";")
my.data.2 %>% write_delim('mtcars.hola', delim = "*_*")
my.data.2 %>% openxlsx::write.xlsx("mtcars.xlsx")
###########Lectura##################
read_csv('mtcars.csv', skip = 0)                         ->mtcars.csv
read_tsv('mtcars.tsv', skip = 0)                         ->mtcars.tsv
read_delim('mtcars.txt', delim = ";", skip = 0)          ->mtcars.txt
read_delim('mtcars.hola', delim = "*_*", skip = 0)       ->mtcars.hola
read_excel('mtcars.xlsx', skip = 0)                      ->mtcars.xlsx
str(mtcars.xlsx)

## tibble [32 × 12] (S3: tbl_df/tbl/data.frame)
##  $ Car brand: chr [1:32] "Mazda RX4" "Mazda RX4 Wag" "Datsun 710" "Hornet 4 Drive" ...
##  $ mpg      : num [1:32] 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
##  $ cyl      : num [1:32] 6 6 4 6 8 6 8 4 4 6 ...
##  $ disp     : num [1:32] 160 160 108 258 360 ...
##  $ hp       : num [1:32] 110 110 93 110 175 105 245 62 95 123 ...
##  $ drat     : num [1:32] 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
##  $ wt       : num [1:32] 2.62 2.88 2.32 3.21 3.44 ...
##  $ qsec     : num [1:32] 16.5 17 18.6 19.4 17 ...
##  $ vs       : num [1:32] 0 0 1 1 0 1 0 1 1 1 ...
##  $ am       : num [1:32] 1 1 1 0 0 0 0 0 0 0 ...
##  $ gear     : num [1:32] 4 4 4 3 3 3 3 4 4 4 ...
##  $ carb     : num [1:32] 4 4 1 1 2 1 4 2 2 4 ...

########Chequear igualdad###############
library(dplyr)
all_equal(mtcars.csv,mtcars.tsv)

## [1] TRUE

all_equal(mtcars.txt,mtcars.hola)

## [1] TRUE

all_equal(mtcars.hola,mtcars.xlsx)

## [1] TRUE

list(mtcars.csv, mtcars.tsv, mtcars.txt, mtcars.hola,mtcars.xlsx) %>% unique() %>% length()

## [1] 2

Descargar

library(utils)
#getwd()
###github
my.github.url <- 'https://github.com/DeepenData/Computational-Biology-and-Bioinformatics/raw/master/Wang2018_supplemental_DEGs.xlsx'
download.file(url= my.github.url,  destfile= 'Wang2018_supplemental_DEGs.xlsx')
hola.github <- read_excel("Wang2018_supplemental_DEGs.xlsx", skip = 1)
###dropbox
my.dropbox.url <- 'https://www.dropbox.com/s/j3kiivpcbghpb4v/log2FC.csv'
download.file(url= my.dropbox.url,  destfile= 'log2FC.csv', method = "wget")
hola.dropbox <- read_csv("log2FC.csv")
###Site
my.supplementary.url <- 'https://www.pnas.org/highwire/filestream/794560/field_highwire_adjunct_files/0/pnas.1800165115.sd01.xlsx'
download.file(url= my.supplementary.url,  destfile= 'pnas.1800165115.sd01.xlsx', method = "wget")
hola.supplementary <- read_excel("pnas.1800165115.sd01.xlsx")

Leer desde url

read_csv('https://raw.githubusercontent.com/DeepenData/Computational-Biology-and-Bioinformatics/master/labels.csv')

Datos ómicos

genómica biomartr tutorials: https://docs.ropensci.org/biomartr/articles/ https://cran.r-project.org/web/packages/biomartr/vignettes/Sequence_Retrieval.html https://cran.r-project.org/web/packages/biomartr/readme/README.html

#install.packages("BiocManager", dependencies = TRUE)
#BiocManager::install()
#BiocManager::install("Biostrings")
#BiocManager::install("biomaRt")
#install.packages("biomartr", dependencies = TRUE)
library(biomartr)
library(magrittr)
#Objetos clase DNAStringSet
inmortal <- getGenome( db       = "refseq", #Más bases de datos
                       organism = "Thermococcus gammatolerans",
                       path     = file.path("_ncbi_downloads","genomes") ) %>%  read_genome()

human <- read_genome("/home/alejandro/DeepenData/Taller_sept_oct_nov_dic_2020/_ncbi_downloads/genomes/Homo_sapiens_genomic_refseq.fna.gz")
human

## DNAStringSet object of length 639:
##           width seq                                         names               
##   [1] 248956422 NNNNNNNNNNNNNNNNNNNN...NNNNNNNNNNNNNNNNNNNN NC_000001.11 Homo...
##   [2]    175055 GAATTCAGCTGAGAAGAACA...TTGTCAGTCACATAGAATTC NT_187361.1 Homo ...
##   [3]     32032 AGGGGTCTGCTTAGAGAGGG...CTTACGTTGACGTGGAATTC NT_187362.1 Homo ...
##   [4]    127682 GATCGAGACTATCCTGGCTA...GTCAATTGGGACCTTTGATC NT_187363.1 Homo ...
##   [5]     66860 GAATTCATTCGATGACGATT...AAACTCTCAGCCACGAATTC NT_187364.1 Homo ...
##   ...       ... ...
## [635]    170148 TTTCTTTCTTTTTTTTTTTT...ACAGGACTCATGGGGAATTC NT_187685.1 Homo ...
## [636]    215732 TGTGGTGAGGACCCTTAAGA...ACAGGACTCATGGGGAATTC NT_187686.1 Homo ...
## [637]    170537 TCTACTCTCCCATGCTTGCC...ACAGGACTCATGGGGAATTC NT_187687.1 Homo ...
## [638]    177381 GATCTATCTGTATCTCCACA...ACAGGACTCATGGGGAATTC NT_113949.2 Homo ...
## [639]     16569 GATCACAGGTCTATCACCCT...TTAAATAAGACATCACGATG NC_012920.1 Homo ...

human %>% names() %>% sample(10)

##  [1] "NT_187688.1 Homo sapiens chromosome 3 genomic scaffold, GRCh38.p13 alternate locus group ALT_REF_LOCI_4 HSCHR3_5_CTG3"             
##  [2] "NW_003571061.2 Homo sapiens chromosome 19 genomic scaffold, GRCh38.p13 alternate locus group ALT_REF_LOCI_8 HSCHR19LRC_PGF2_CTG3_1"
##  [3] "NT_187659.1 Homo sapiens chromosome 15 genomic scaffold, GRCh38.p13 alternate locus group ALT_REF_LOCI_2 HSCHR15_2_CTG3"           
##  [4] "NW_021159988.1 Homo sapiens chromosome 2 genomic patch of type FIX, GRCh38.p13 PATCHES HG1384_PATCH"                               
##  [5] "NC_000014.9 Homo sapiens chromosome 14, GRCh38.p13 Primary Assembly"                                                               
##  [6] "NW_003315956.1 Homo sapiens chromosome 18 genomic scaffold, GRCh38.p13 alternate locus group ALT_REF_LOCI_1 HSCHR18_1_CTG1_1"      
##  [7] "NT_187430.1 Homo sapiens unplaced genomic scaffold, GRCh38.p13 Primary Assembly HSCHRUN_RANDOM_134"                                
##  [8] "NT_187680.1 Homo sapiens chromosome 8 genomic scaffold, GRCh38.p13 alternate locus group ALT_REF_LOCI_3 HSCHR8_7_CTG1"             
##  [9] "NT_187502.1 Homo sapiens unplaced genomic scaffold, GRCh38.p13 Primary Assembly HSCHRUN_RANDOM_CTG24"                              
## [10] "NW_018654712.1 Homo sapiens chromosome 5 genomic patch of type NOVEL, GRCh38.p13 PATCHES HSCHR5_9_CTG1"

Epigenómica

library(GEOquery)

## Loading required package: Biobase

## Loading required package: BiocGenerics

## Loading required package: parallel

## 
## Attaching package: 'BiocGenerics'

## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB

## The following objects are masked from 'package:dplyr':
## 
##     combine, intersect, setdiff, union

## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs

## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which.max, which.min

## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.

## Setting options('download.file.method.GEOquery'='auto')

## Setting options('GEOquery.inmemory.gpl'=FALSE)

gset = getGEO("GSE36278")

## Found 1 file(s)

## GSE36278_series_matrix.txt.gz

## Parsed with column specification:
## cols(
##   .default = col_double(),
##   ID_REF = col_character()
## )

## See spec(...) for full column specifications.

## File stored at:

## /tmp/RtmpwijQye/GPL13534.soft

## Warning: 65 parsing failures.
##    row     col           expected     actual         file
## 485513 SPOT_ID 1/0/T/F/TRUE/FALSE rs10796216 literal data
## 485514 SPOT_ID 1/0/T/F/TRUE/FALSE rs715359   literal data
## 485515 SPOT_ID 1/0/T/F/TRUE/FALSE rs1040870  literal data
## 485516 SPOT_ID 1/0/T/F/TRUE/FALSE rs10936224 literal data
## 485517 SPOT_ID 1/0/T/F/TRUE/FALSE rs213028   literal data
## ...... ....... .................. .......... ............
## See problems(...) for more details.

fData(gset[[1]])

Transcriptómica

inmortal <- getRNA( db       = "refseq", #Más bases de datos
                       organism = "Thermococcus gammatolerans",
                       path     = file.path("_ncbi_downloads","transcriptome") ) %>%  read_rna()

inmortal %>% names() %>% sample(10)

##  [1] "lcl|NC_012804.1_trna_51 [locus_tag=TGAM_RS10405] [db_xref=GeneID:7987170] [product=tRNA-Ala] [location=complement(1948755..1948831)] [gbkey=tRNA]"                                                       
##  [2] "lcl|NC_012804.1_trna_18 [locus_tag=TGAM_RS03305] [db_xref=GeneID:7987135] [product=tRNA-Asp] [location=619903..619980] [gbkey=tRNA]"                                                                     
##  [3] "lcl|NC_012804.1_trna_23 [locus_tag=TGAM_RS04715] [db_xref=GeneID:7986882] [product=tRNA-Val] [location=complement(857890..857967)] [gbkey=tRNA]"                                                         
##  [4] "lcl|NC_012804.1_trna_29 [locus_tag=TGAM_RS08255] [db_xref=GeneID:7987541] [product=tRNA-Arg] [location=complement(1540634..1540711)] [gbkey=tRNA]"                                                       
##  [5] "lcl|NC_012804.1_trna_31 [locus_tag=TGAM_RS08935] [db_xref=GeneID:7987488] [product=tRNA-Leu] [location=1706698..1706785] [gbkey=tRNA]"                                                                   
##  [6] "lcl|NC_012804.1_ncrna_10 [gene=ffs] [locus_tag=TGAM_RS11005] [db_xref=RFAM:RF01857] [product=signal recognition particle sRNA] [ncRNA_class=SRP_RNA] [location=complement(337485..337798)] [gbkey=ncRNA]"
##  [7] "lcl|NC_012804.1_trna_25 [locus_tag=TGAM_RS05685] [db_xref=GeneID:7986995] [product=tRNA-Met] [location=1066698..1066775] [gbkey=tRNA]"                                                                   
##  [8] "lcl|NC_012804.1_trna_32 [locus_tag=TGAM_RS08995] [db_xref=GeneID:7987525] [product=tRNA-Leu] [location=complement(1720505..1720592)] [gbkey=tRNA]"                                                       
##  [9] "lcl|NC_012804.1_trna_22 [locus_tag=TGAM_RS04365] [db_xref=GeneID:7989023] [product=tRNA-Ser] [location=801210..801296] [gbkey=tRNA]"                                                                     
## [10] "lcl|NC_012804.1_trna_30 [locus_tag=TGAM_RS08405] [db_xref=GeneID:7987570] [product=tRNA-Pro] [location=complement(1566053..1566130)] [gbkey=tRNA]"

Proteoma

download_species <- c("Thermococcus gammatolerans", 
                      "Thermotoga maritima")
# retrieve these three species from NCBI RefSeq                       
biomartr::getProteomeSet("refseq", organisms = download_species, path = "set_proteomes")

## [1] "set_proteomes/Tgammatolerans.faa" "set_proteomes/Tmaritima.faa"

read_proteome("/home/alejandro/DeepenData/Taller_sept_oct_nov_dic_2020/set_proteomes/Tmaritima.faa")

## AAStringSet object of length 1808:
##        width seq                                            names               
##    [1]    72 MRKIFTAIIEYDPEKKQYVGMV...EAGDEINLQEFVALEMIEVET WP_004079900.1 MU...
##    [2]    77 MSYLPIVDPKTMEKVLLKLGFQ...RKIIREAGISVEEFKKVLENL WP_004079902.1 ty...
##    [3]    70 METQKEIVFIAVESEDGGYIEK...FDEGAPKYVHARFVKDVTIAV WP_004079908.1 MU...
##    [4]   580 MNLRSIQKILRFYSLIRKRFLV...TFRRIIETYVNESKRIADKDV WP_004079909.1 MU...
##    [5]   144 MEAAVVVAYSYFVLKLEFAISN...LLGHVLFKKIERKSRAEGELV WP_004079914.1 MU...
##    ...   ... ...
## [1804]   247 MGGGKMIKRVKTGIPGMDEILH...HPFEITDKGIVIYPSEGGEGR WP_162487497.1 Ka...
## [1805]   262 MGPVDIGLIQLLSAYIFVVVLM...VSVFLYLGYKAFFNRENQLIV WP_164924970.1 ir...
## [1806]   280 MKKILTIVRYILIAICLIFFLF...PVVVFALVAQRYLIRGLTSER WP_164924971.1 AB...
## [1807]   246 MLKWLDSNPSIQELKKFAKSLG...FPSGQGMMAQGIMWEIFRSGR WP_164924972.1 DU...
## [1808]   525 MTGRFLKIIIKKATENLLKHRD...NLDLEIYEGGQPHYPYLMLLQ WP_164924973.1 DA...

Proteómica Importing from http://www.proteomexchange.org/ tutorial: https://bioconductor.org/packages/release/data/experiment/vignettes/RforProteomics/inst/doc/RforProteomics.html

#BiocManager::install()
#BiocManager::install("RforProteomics", ask = F, dependencies = TRUE,   type = "source", checkBuilt = TRUE)
## Experiment information
library("rpx")
px1 <- PXDataset("PXD000001")
pxfiles(px1)

##  [1] "F063721.dat"                                                         
##  [2] "F063721.dat-mztab.txt"                                               
##  [3] "PRIDE_Exp_Complete_Ac_22134.xml.gz"                                  
##  [4] "PRIDE_Exp_mzData_Ac_22134.xml.gz"                                    
##  [5] "PXD000001_mztab.txt"                                                 
##  [6] "README.txt"                                                          
##  [7] "TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01-20141210.mzML" 
##  [8] "TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01-20141210.mzXML"
##  [9] "TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01.mzXML"         
## [10] "TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01.raw"           
## [11] "erwinia_carotovora.fasta"                                            
## [12] "generated"

follow: sudo apt-cache search libnetcdf sudo apt-get update sudo apt-get install libnetcdf-c++4-1

https://rtask.thinkr.fr/installation-of-r-4-0-on-ubuntu-20-04-lts-and-tips-for-spatial-packages/

#BiocManager::install("MSnbase", ask = T, dependencies = TRUE,   type = "source", checkBuilt = TRUE)
library(magrittr)
library(Biobase)
library(MSnbase)
## Downloading the mzTab data
mztab <- pxget(px1, "PXD000001_mztab.txt")
qnt <- readMzTabData(mztab, what = "PEP", version = "0.9")

TMT: https://en.wikipedia.org/wiki/Tandem_mass_tag

sampleNames(qnt) <- reporterNames(TMT6)
#head(exprs(qnt))
qnt <- filterNA(qnt)
processingData(qnt)

## - - - Processing information - - -
## mzTab read: Thu Sep 24 10:50:18 2020 
## Subset [2351,6][1504,6] Thu Sep 24 10:50:18 2020 
## Removed features with more than 0 NAs: Thu Sep 24 10:50:18 2020 
## Dropped featureData's levels Thu Sep 24 10:50:18 2020 
##  MSnbase version: 2.15.6

## combine into proteins
## - using the 'accession' feature meta data
## - sum the peptide intensities
protqnt <- combineFeatures(qnt,
                           groupBy = fData(qnt)$accession,
                           fun = sum)


protqnt %>% exprs %>% tail()

##          TMT6.126 TMT6.127  TMT6.128  TMT6.129 TMT6.130   TMT6.131
## ECA4517  371367.4   376656  439160.7  371917.9   337498   376772.2
## P00489  5086802.6  4970118 5563334.7 5471237.4  2778741  2905920.8
## P00761  3076535.9  3190179 3833943.6 3835475.1  2701778  3646836.3
## P00924  5484890.4  3269353 1850660.8 1062968.7  1596193  5199268.0
## P02769   837322.6  1486133 2796767.6 6210387.7  3511954  1126899.3
## P62894  5498893.0  7031520 6517750.7 6773717.8  7095527 14682351.1

library("RColorBrewer") ## Color palettes
library("ggplot2")  ## Convenient and nice plotting
library("reshape2") ## Flexibly reshape data

cls <- brewer.pal(5, "Set1")
matplot(t(tail(exprs(protqnt), n = 5)), type = "b",
        lty = 1, col = cls,
        ylab = "Protein intensity (summed peptides)",
        xlab = "TMT reporters")
legend("topright", tail(featureNames(protqnt), n=5),
       lty = 1, bty = "n", cex = .8, col = cls)

Multi-omic

Collection Retrieval

The automated retrieval of collections (= Genome, Proteome, CDS, RNA, GFF, Repeat Masker, AssemblyStats) will make sure that the genome file of an organism will match the CDS, proteome, RNA, GFF, etc file and was generated using the same genome assembly version. One aspect of why genomics studies fail in computational and biological reproducibility is that it is not clear whether CDS, proteome, RNA, GFF, etc files used in a proposed analysis were generated using the same genome assembly file denoting the same genome assembly version. To avoid this seemingly trivial mistake we encourage users to retrieve genome file collections using the biomartr function getCollection() and attach the corresponding output as Supplementary Data to the respective genomics study to ensure computational and biological reproducibility.

By specifying the scientific name of an organism of interest a collection consisting of the genome file, proteome file, CDS file, RNA file, GFF file, Repeat Masker file, AssemblyStats file of the organism of interest can be downloaded and stored locally.

inmortal_collect <- getCollection( db = "refseq", 
               organism = "Thermococcus gammatolerans", 
               path = file.path("refseq","Collections"))

Primera clase - Taller DeepenData: R intermedio-avanzado

Acevedo A., Alejandro & Muñoz, Manuel. Universidad de Chile. Corresponding: manuel.munoz.g@ug.uchile.cl

24-09-2020