# Bioconductor
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("RforProteomics")
BiocManager::install("MSnbase")
BiocManager::install("msdata")
BiocManager::install("IPPD")
BiocManager::install("BRAIN")
BiocManager::install("Rdisop")
BiocManager::install("cleaver")
BiocManager::install("UniProt.ws")
BiocManager::install("rTANDEM")
# CRAN
install.packages(c("scales", "RUnit", "shiny", "R.utils", "reshape2", "proto", "plyr"))
library(plyr)
library(proto)
library(reshape2)
library(scales)
library(MSnbase)
library(RUnit)
library(shiny)
library(R.utils)
library(Rcpp)
library(RforProteomics)
library(RColorBrewer)
library(ggplot2)
library(reshape2)
library(dplyr)
library(MALDIquant)
library(MALDIquantForeign)
library(mzR)
library(msdata)
library(Rcpp)
library(IPPD)
library(BRAIN)
library(Rdisop)
library(OrgMassSpecR)
library(cleaver)
library(UniProt.ws)
library(rTANDEM)
### Getting Help
## List all the vignettes in the RforProteomics package
vignette(package = "RforProteomics")
browseVignettes("RforProteomics")
## Open the vignette called RforProteomics
vignette("RforProteomics", package = "RforProteomics")
vignette("RProtVis",package="RforProteomics")
browseVignettes("mzR")
## Creates a connection to the mzXML file
filepath <- system.file("threonine/threonine_i2_e35_pH_tree.mzXML", package = "msdata")
mz <- openMSfile(filepath)
## Demonstraction of data access
basename(fileName(mz))
[1] "threonine_i2_e35_pH_tree.mzXML"
runInfo(mz)
$scanCount
[1] 55
$lowMz
[1] 50.0036
$highMz
[1] 298.673
$dStartTime
[1] 0.3485
$dEndTime
[1] 390.027
$msLevels
[1] 1 2 3 4
$startTimeStamp
[1] NA
instrumentInfo(mz)
$manufacturer
[1] "Thermo Scientific"
$model
[1] "LTQ Orbitrap"
$ionisation
[1] "electrospray ionization"
$analyzer
[1] "fourier transform ion cyclotron resonance mass spectrometer"
$detector
[1] "unknown"
$software
[1] "Xcalibur software 2.2 SP1"
$sample
[1] ""
$source
[1] ""
pIX <- peaks(mz,9)
head(pIX)
[,1] [,2]
[1,] 50.09395 8352.145
[2,] 50.50436 7364.491
[3,] 50.54242 6773.922
[4,] 50.84211 7098.680
[5,] 50.95626 8522.258
[6,] 51.04808 8100.911
plot(pIX[,1],pIX[,2], type = "h") # Histogram like (or high-density) vertical lines
## Close the connection to release memory of cached content
close(mz)
## Creates a connection to the mzID file
file <- system.file("mzid", "Tandem.mzid.gz", package="msdata")
(mzid <- openIDfile(file))
Identification file handle.
Filename: Tandem.mzid.gz
Number of psms: 171
mzidInfo(mzid)
$FileProvider
[1] "researcher"
$CreationDate
[1] "2012-07-25T14:03:16"
$software
[1] "xtandem x! tandem CYCLONE (2010.06.01.5) "
[2] "ProteoWizard MzIdentML 3.0.501 ProteoWizard"
$ModificationSearched
[1] "Oxidation" "Carbamidomethyl"
$FragmentTolerance
[1] "0.8 dalton"
$ParentTolerance
[1] "1.5 dalton"
$enzymes
$enzymes$name
[1] "Trypsin"
$enzymes$nTermGain
[1] "H"
$enzymes$cTermGain
[1] "OH"
$enzymes$minDistance
[1] "0"
$enzymes$missedCleavages
[1] "1"
$SpectraSource
[1] "D:/TestSpace/NeoTestMarch2011/55merge.mgf"
softwareInfo(mzid)
[1] "xtandem x! tandem CYCLONE (2010.06.01.5) "
[2] "ProteoWizard MzIdentML 3.0.501 ProteoWizard"
enzymes(mzid)
## Details about each peptide-spectrum-match
names(psms(mzid))
[1] "spectrumID" "chargeState" "rank"
[4] "passThreshold" "experimentalMassToCharge" "calculatedMassToCharge"
[7] "sequence" "modNum" "isDecoy"
[10] "post" "pre" "start"
[13] "end" "DatabaseAccess" "DBseqLength"
[16] "DatabaseSeq" "DatabaseDescription" "spectrum.title"
[19] "acquisitionNum"
head(psms(mzid))[,1:13]
mod <- modifications(mzid)
head(mod)
##Raw data abstraction with MSnExp objects
## uses a simple dummy test included in the package
mzXML <- dir(system.file(package="MSnbase",dir="extdata"),
full.name=TRUE,
pattern="mzXML$")
basename(mzXML)
[1] "dummyiTRAQ.mzXML"
(raw <- readMSData(mzXML, verbose = FALSE, centroided = TRUE))
MSn experiment data ("MSnExp")
Object size in memory: 0.18 Mb
- - - Spectra data - - -
MS level(s): 2
Number of spectra: 5
MSn retention times: 25:1 - 25:2 minutes
- - - Processing information - - -
Data loaded: Wed Dec 25 11:09:29 2019
MSnbase version: 2.12.0
- - - Meta data - - -
phenoData
rowNames: dummyiTRAQ.mzXML
varLabels: sampleNames
varMetadata: labelDescription
Loaded from:
dummyiTRAQ.mzXML
protocolData: none
featureData
featureNames: F1.S1 F1.S2 ... F1.S5 (5 total)
fvarLabels: spectrum
fvarMetadata: labelDescription
experimentData: use 'experimentData(object)'
raw[[3]]
Object of class "Spectrum2"
Precursor: 645.3741
Retention time: 25:2
Charge: 2
MSn level: 2
Peaks count: 2125
Total ion count: 150838188
plot(raw, full = TRUE)
plot(raw[[5]], full = TRUE, reporters = iTRAQ4)
MALDIquant::intensity(s)
[1] 1 2 3 4 5 6 7 8 9 10
data(fiedler2009subset)
length(fiedler2009subset)
[1] 16
fiedler2009subset[1:2]
$sPankreas_HB_L_061019_G10.M19.T_0209513_0020740_18
S4 class type : MassSpectrum
Number of m/z values : 42388
Range of m/z values : 1000.015 - 9999.734
Range of intensity values: 5 - 101840
Memory usage : 506.359 KiB
Name : Pankreas_HB_L_061019_G10.M19
File : /data/set A - discovery leipzig/control/Pankreas_HB_L_061019_G10/0_m19/1/1SLin/fid
$sPankreas_HB_L_061019_G10.M20.T_0209513_0020740_18
S4 class type : MassSpectrum
Number of m/z values : 42388
Range of m/z values : 1000.015 - 9999.734
Range of intensity values: 6 - 111862
Memory usage : 506.359 KiB
Name : Pankreas_HB_L_061019_G10.M20
File : /data/set A - discovery leipzig/control/Pankreas_HB_L_061019_G10/0_m20/1/1SLin/fid
any(sapply(fiedler2009subset, MALDIquant::isEmpty))
[1] FALSE
table(sapply(fiedler2009subset, length))
42388
16
all(sapply(fiedler2009subset, MALDIquant::isRegular))
[1] TRUE
## Raw data
summary(MALDIquant::mass(fiedler2009subset[[1]]))
Min. 1st Qu. Median Mean 3rd Qu. Max.
1000 2373 4330 4720 6872 10000
summary(MALDIquant::intensity(fiedler2009subset[[1]]))
Min. 1st Qu. Median Mean 3rd Qu. Max.
5 282 923 2131 3033 101840
head(as.matrix(fiedler2009subset[[1]]))
mass intensity
[1,] 1000.015 3149
[2,] 1000.117 3134
[3,] 1000.219 3127
[4,] 1000.321 3131
[5,] 1000.423 3170
[6,] 1000.525 3162
plot(fiedler2009subset[[1]])
plot(fiedler2009subset[[16]])
## 1- Preprocessing: sqrt transform (for variance stabilization)
s1 <- transformIntensity(fiedler2009subset, method = "sqrt")
plot(fiedler2009subset[[1]])
plot(s1[[1]])
## 2- Smoothing - 5 point moving average
s2 <- smoothIntensity(s1, method = "MovingAverage", halfWindowSize = 2)
plot(s2[[1]])
## 3- Baseline subtraction
s3_1 <- estimateBaseline(s2[[1]], method = "SNIP", iterations = 100)
plot(s2[[1]])
lines(s3_1,col = "red", lwd = 2)
s3 <- removeBaseline(s2, method = "SNIP")
plot(s3[[1]])
## 4- Calibration/Normalization
s4 <- calibrateIntensity(s3, method = "TIC")
plot(s4[[1]])
## 5- Chromatogram alignment
s5_1 <- alignSpectra(s4, halfWindowSize = 20, SNR = 2, tolerance = 0.002,
warpingMethod = "lowess")
samples <- sapply(s5_1, function(x) metaData(x)$sampleName) %>% factor
s5 <- averageMassSpectra(s5_1, labels = samples, method = "mean")
length(s5)
[1] 8
plot(s5[[1]])
## 6- Peak picking
noise <- MALDIquant::estimateNoise(s5[[1]])
plot(s5[[1]], xlim = c(1000, 10000), ylim = c(0, 0.002))
lines(noise, col = "red")
lines(noise[,1], noise[, 2]*2, col = "blue")
s6 <- detectPeaks(s5, method = "MAD", halfWindowSize = 20, SNR = 2)
plot(s5[[1]], xlim = c(1000, 10000), ylim = c(0, 0.002))
points(s6[[1]], col = "red", pch = 4)
#####Overview#####
par(mfrow=c(2,3))
xl <- range(MALDIquant::mass(s2[[1]]))
# use same xlim on all plots for better comparison
plot(fiedler2009subset[[1]], sub="", main="1: raw", xlim=xl)
plot(s1[[1]], sub="", main="2: variance stabilisation", xlim=xl)
plot(s2[[1]], sub="", main="3: smoothing", xlim=xl)
plot(s3[[1]], sub="", main="4: base line correction", xlim=xl)
plot(s4[[1]], sub="", main="5: peak detection", xlim=xl)
points(s6[[1]], col="red", pch=4)
#### %in% To be or not to be ####
top20 <- MALDIquant::intensity(s6[[1]]) %in% sort(MALDIquant::intensity(s6[[1]]), decreasing = TRUE)[1:20]
plot(s6[[1]], sub = "", main = "6: peak plot", xlim = xl)
labelPeaks(s6[[1]], index = top20, underline = TRUE)
(atoms <- getAtomsFromSeq("SIVPSGASTGVHEALEMR") %>% unlist)
C H N O S
77 129 23 27 1
molecule <- getMolecule("C2H5OH")
getFormula(molecule)
[1] "C2H6O"
getMass(molecule)
[1] 46.04186
#The relative atomic masses of the isotopes, and the charge state
MonoisotopicMass(formula = list(C = 2, O = 1, H=6))
[1] 46.04186
#Average relative atomic masses of the elements
MolecularWeight(formula = list(C = 2, O = 1, H=6))
[1] 46.068
essentialElements <- initializeCHNOPSMgKCaFe()
chlorophyll <- getMolecule("C55H72MgN4O5H", z = 1,
elements = essentialElements)
(isotopes <- getIsotope(chlorophyll, seq(1,4)))
[,1] [,2] [,3] [,4]
[1,] 893.5431390 894.5459934 895.546247 896.54752708
[2,] 0.4140648 0.3171228 0.178565 0.06773657
plot(t(isotopes), type = "h", xlab = "m/z", ylab = "Intensity")
(molecules <- decomposeMass(46.042, ppm = 20))
$formula
[1] "C2H6O"
$score
[1] 1
$exactmass
[1] 46.04186
$charge
[1] 0
$parity
[1] "e"
$valid
[1] "Valid"
$DBE
[1] 0
$isotopes
$isotopes[[1]]
[,1] [,2] [,3] [,4] [,5] [,6]
[1,] 46.0418648 47.04534542 48.04631711 4.904960e+01 5.005324e+01 5.105929e+01
[2,] 0.9749152 0.02293559 0.00210353 4.540559e-05 2.816635e-07 2.324709e-10
[,7] [,8] [,9] [,10]
[1,] 5.206548e+01 5.307171e+01 5.407796e+01 5.508422e+01
[2,] 8.458096e-14 1.665930e-17 1.857005e-21 1.107409e-25
length(decomposeMass(147.053))
[1] 8
cbind(getFormula(molecules), getScore(molecules), getValid(molecules))
[,1] [,2]
[1,] "C5H9NO4" "0.999999998064664"
[2,] "C3H17P2S" "1.93533578193563e-09"
[,3]
[1,] "Valid"
[2,] "Invalid"
cleave("LAAGKVEDSD", enzym = "trypsin")
$LAAGKVEDSD
[1] "LAAGK" "VEDSD"
## Miss one cleavage position
cleave("LAAGKVEDSD", enzym = "trypsin", missedCleavages = 1)
$LAAGKVEDSD
[1] "LAAGKVEDSD"
cleavageRanges("LAAGKVEDSD", enzym="trypsin", missedCleavages=1)
$LAAGKVEDSD
start end
[1,] 1 10
## miss zero or one cleavage positions
cleave("LAAGKVEDSD", enzym = "trypsin", missedCleavages = 0:1)
$LAAGKVEDSD
[1] "LAAGK" "VEDSD" "LAAGKVEDSD"
## create AAStringSet object and cleave it
p <- AAStringSet(c(gaju="LAAGKVEDSD", pnm="AGEPKLDAGV"))
cleave(p, enzym="trypsin")
AAStringSetList of length 2
[["gaju"]] LAAGK VEDSD
[["pnm"]] AGEPK LDAGV
cleavageRanges(p, enzym="trypsin")
IRangesList object of length 2:
$gaju
IRanges object with 2 ranges and 0 metadata columns:
start end width
<integer> <integer> <integer>
[1] 1 5 5
[2] 6 10 5
$pnm
IRanges object with 2 ranges and 0 metadata columns:
start end width
<integer> <integer> <integer>
[1] 1 5 5
[2] 6 10 5
cleavageSites(p, enzym="trypsin")
$gaju
[1] 5
$pnm
[1] 5
## fragment unlabeled peptide
FragmentPeptide("NECFLQHK")
## fragment peptide with carbon-13 labeled lysine
k.mass <- MonoisotopicMass(formula = list(C = 6, H = 12, N = 2, O = 1),
isotopes = list(C = 13.0033548378))
FragmentPeptide("NECFLQHk", custom = list(code = "k", mass = k.mass))
## fragment peptide with two modifications
m.mass <- MonoisotopicMass(formula = list(C=5, H=9, N=1, O=2, S=1))
FragmentPeptide("NDmELWk", custom = list(code = c("m", "k"), mass = c(m.mass, k.mass)))
NA
#Performing the search
taxonomy <- rTTaxo(taxon = "yeast",
format = "peptide",
URL = system.file(
"extdata/fasta/scd.fasta.pro",
package="rTANDEM"))
param <- rTParam()
param <- setParamValue(param,
'protein', 'taxon',
value="yeast")
param <- setParamValue(param, 'list path',
'taxonomy information', taxonomy)
param <- setParamValue(param,
'list path', 'default parameters',
value = system.file(
"extdata/default_input.xml",
package="rTANDEM"))
param <- setParamValue(param, 'spectrum', 'path',
value = system.file(
"extdata/test_spectra.mgf",
package="rTANDEM"))
param <- setParamValue(param, 'output', 'xsl path',
value = system.file(
"extdata/tandem-input-style.xsl",
package="rTANDEM"))
param <- setParamValue(param, 'output', 'path',
value = paste(getwd(),
"output.xml", sep="/"))
resultPath <- tandem(param)
Loading spectra
(mgf). loaded.
Spectra matching criteria = 242
Starting threads . started.
Computing models:
testin
sequences modelled = 5 ks
Model refinement:
partial cleavage ..... done.
unanticipated cleavage ..... done.
modified N-terminus ..... done.
finishing refinement ... done.
Creating report:
initial calculations ..... done.
sorting ..... done.
finding repeats ..... done.
evaluating results ..... done.
calculating expectations ..... done.
writing results ..... done.
Valid models = 40
Unique models = 41
Estimated false positives = 1 +/- 1
basename(resultPath)
[1] "output.2019_12_25_11_07_50.t.xml"
#Import and analyse results
res <- GetResultsFromXML(resultPath)
## the inferred proteins
proteins <- GetProteins(res,
log.expect = -1.3,
min.peptides = 2)
proteins[, -(4:5), with = FALSE]