library('baySeq')
## Loading required package: GenomicRanges
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
##
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
##
## The following object is masked from 'package:stats':
##
## xtabs
##
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, as.vector, cbind,
## colnames, duplicated, eval, evalq, Filter, Find, get,
## intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rep.int, rownames, sapply, setdiff, sort,
## table, tapply, union, unique, unlist
##
## Loading required package: IRanges
## Loading required package: XVector
##
## Attaching package: 'baySeq'
##
## The following object is masked from 'package:GenomicRanges':
##
## rbind
##
## The following object is masked from 'package:IRanges':
##
## rbind
##
## The following object is masked from 'package:BiocGenerics':
##
## rbind
##
## The following object is masked from 'package:base':
##
## rbind
library('ggplot2')
Because Bayseq is designed to run massive experiments is better to use parallel processing.
cl <- makeCluster(5)
Prepare data
setwd("/home/iar28/DESEQ_ivan/baySeq/Si_SB_Zm/")
reads <- read.delim("RSEMcountsfinal_Si.txt", header=TRUE, sep ="\t")
head(reads)
## gene M1 M2 M3 BS1 BS2 BS3
## 1 Si000001m.g 645 644 616 818 616 756
## 2 Si000002m.g 2102 3658 2466 3219 2877 2209
## 3 Si000003m.g 201 516 349 2345 1699 1361
## 4 Si000006m.g 378 601 476 4405 4559 4435
## 5 Si000007m.g 1170 1851 1320 1246 1315 1046
## 6 Si000008m.g 2309 3986 2647 7187 6101 5292
bayseqdata <- data.frame(row.names=reads$gene,
reads$M1,
reads$M2,
reads$M3,
reads$BS1,
reads$BS2,
reads$BS3)
colnames(bayseqdata) <- c("M1",
"M2",
"M3",
"BS.1",
"BS.2",
"BS.3"
)
bayseqmatrix <- data.matrix(bayseqdata)
Defining our replicate and group structure
replicates <- as.factor(c("M","M","M","BS","BS","BS"))
groups <- list(NDE = factor(c(1,1,1,1,1,1)),
DE = factor(c("m","m","m","bs","bs","bs")))
Combine the count data and user defined groups into a countData object
CD <- new("countData",
data = bayseqmatrix,
replicates = replicates,
groups = groups)
Libray sizes can be inferred from the data if the user is not able to supply Here we take the sum of the reads below the nth percentile of the data( instead of just using the total number of sequenced reads aligning to the genome)
libsizes(CD) <- getLibsizes(CD, estimationType= "quantile")
head(libsizes(CD))
## M1 M2 M3 BS.1 BS.2 BS.3
## 1425735 2007923 1420909 1365417 1292223 1385621
We estimate an empirical distribution on the parameters of the Negative Binomial distribution by bootstraping from the data, taking individual counts and finding quasi-likelihood parameters for a Negative Binomial distribution
CD <- getPriors.NB(CD, samplesize = 10000, estimation = "QL", cl = cl)
## Finding priors...done.
Before getting posterior likelihoods for the data is good to consider the possibility that some loci will not be expressed at all in the data
plotPriors(CD, group= "NDE")
Acquire posterior likelihoods, estimating the proportinos of DE expressed counts
CD <- getLikelihoods.NB(CD, pET = 'BIC', cl = cl)
## Finding posterior likelihoods...
## .
## done.
We can then MA-plot coloring by posterior likelihoods of DE
plotMA.CD(CD, samplesA = "M", samplesB = "BS",
col = rgb(red = exp(CD@posteriors[,2]), green = 0, blue = 0))
plot(CD@posteriors[,2])
Ask for the top candidates for differential expression using the topCounts function and generate a normalized output object we can export
topCounts(CD, group ="DE")
## rowID M1 M2 M3 BS.1 BS.2 BS.3 Likelihood
## Si000068m.g 51 262 560 323 64380 72552 56490 1
## Si000645m.g 525 4359 6002 5288 693137 806481 615658 1
## Si001591m.g 1253 177449 226801 178069 5803 4401 5676 1
## Si001775m.g 1383 929 1524 1151 100577 140687 123496 1
## Si003109m.g 2326 4266 3767 4447 107884 109585 100643 1
## Si003230m.g 2420 184 231 200 11934 8641 7286 1
## Si003343m.g 2503 8906 9412 7772 121248 117839 114563 1
## Si003468m.g 2587 5430 5280 5473 126875 141373 121722 1
## Si003491m.g 2600 377 307 340 21428 24538 15865 1
## Si003564m.g 2641 6026 4927 6010 121506 156397 127958 1
## DE FDR.DE
## Si000068m.g bs>m 0
## Si000645m.g bs>m 0
## Si001591m.g m>bs 0
## Si001775m.g bs>m 0
## Si003109m.g bs>m 0
## Si003230m.g bs>m 0
## Si003343m.g bs>m 0
## Si003468m.g bs>m 0
## Si003491m.g bs>m 0
## Si003564m.g bs>m 0
output_DE <- topCounts(CD, group = "DE", number = 100000, normaliseData=TRUE)
write.table(output_DE, file="Si_bayseq.txt", sep="\t", row.names=TRUE, quote=F)
S
## Error: object 'S' not found