Setaria and Maize differential expression analysis using Bayseq

library('baySeq')

## Loading required package: GenomicRanges
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## 
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## 
## The following object is masked from 'package:stats':
## 
##     xtabs
## 
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, as.vector, cbind,
##     colnames, duplicated, eval, evalq, Filter, Find, get,
##     intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rep.int, rownames, sapply, setdiff, sort,
##     table, tapply, union, unique, unlist
## 
## Loading required package: IRanges
## Loading required package: XVector
## 
## Attaching package: 'baySeq'
## 
## The following object is masked from 'package:GenomicRanges':
## 
##     rbind
## 
## The following object is masked from 'package:IRanges':
## 
##     rbind
## 
## The following object is masked from 'package:BiocGenerics':
## 
##     rbind
## 
## The following object is masked from 'package:base':
## 
##     rbind

library('ggplot2')

Because Bayseq is designed to run massive experiments is better to use parallel processing.

cl <- makeCluster(5)

Prepare data

setwd("/home/iar28/DESEQ_ivan/baySeq/Si_SB_Zm/")
reads <- read.delim("RSEMcountsfinal_Si.txt", header=TRUE, sep ="\t")
head(reads)

##          gene   M1   M2   M3  BS1  BS2  BS3
## 1 Si000001m.g  645  644  616  818  616  756
## 2 Si000002m.g 2102 3658 2466 3219 2877 2209
## 3 Si000003m.g  201  516  349 2345 1699 1361
## 4 Si000006m.g  378  601  476 4405 4559 4435
## 5 Si000007m.g 1170 1851 1320 1246 1315 1046
## 6 Si000008m.g 2309 3986 2647 7187 6101 5292

bayseqdata <- data.frame(row.names=reads$gene, 
                        reads$M1,
                        reads$M2,
                        reads$M3,
                        reads$BS1,
                        reads$BS2,
                        reads$BS3)
colnames(bayseqdata) <- c("M1",
                         "M2",
                         "M3",
                         "BS.1",
                         "BS.2",
                         "BS.3"
                         )
bayseqmatrix <- data.matrix(bayseqdata)

Defining our replicate and group structure

replicates <- as.factor(c("M","M","M","BS","BS","BS")) 
groups <- list(NDE = factor(c(1,1,1,1,1,1)),
               DE = factor(c("m","m","m","bs","bs","bs")))

Combine the count data and user defined groups into a countData object

CD <- new("countData",
          data = bayseqmatrix,
          replicates = replicates, 
          groups = groups)

Libray sizes can be inferred from the data if the user is not able to supply Here we take the sum of the reads below the nth percentile of the data( instead of just using the total number of sequenced reads aligning to the genome)

libsizes(CD) <- getLibsizes(CD, estimationType= "quantile")
head(libsizes(CD))

##      M1      M2      M3    BS.1    BS.2    BS.3 
## 1425735 2007923 1420909 1365417 1292223 1385621

We estimate an empirical distribution on the parameters of the Negative Binomial distribution by bootstraping from the data, taking individual counts and finding quasi-likelihood parameters for a Negative Binomial distribution

CD <- getPriors.NB(CD, samplesize = 10000, estimation = "QL", cl = cl)

## Finding priors...done.

Before getting posterior likelihoods for the data is good to consider the possibility that some loci will not be expressed at all in the data

plotPriors(CD, group= "NDE")

plot of chunk unnamed-chunk-9

Acquire posterior likelihoods, estimating the proportinos of DE expressed counts

CD <- getLikelihoods.NB(CD, pET = 'BIC', cl = cl)

## Finding posterior likelihoods...

## .

## done.

We can then MA-plot coloring by posterior likelihoods of DE

plotMA.CD(CD, samplesA = "M", samplesB = "BS",
          col = rgb(red = exp(CD@posteriors[,2]), green = 0, blue = 0))

plot of chunk unnamed-chunk-11

plot(CD@posteriors[,2])

plot of chunk unnamed-chunk-11

Ask for the top candidates for differential expression using the topCounts function and generate a normalized output object we can export

topCounts(CD, group ="DE")

##             rowID     M1     M2     M3   BS.1   BS.2   BS.3 Likelihood
## Si000068m.g    51    262    560    323  64380  72552  56490          1
## Si000645m.g   525   4359   6002   5288 693137 806481 615658          1
## Si001591m.g  1253 177449 226801 178069   5803   4401   5676          1
## Si001775m.g  1383    929   1524   1151 100577 140687 123496          1
## Si003109m.g  2326   4266   3767   4447 107884 109585 100643          1
## Si003230m.g  2420    184    231    200  11934   8641   7286          1
## Si003343m.g  2503   8906   9412   7772 121248 117839 114563          1
## Si003468m.g  2587   5430   5280   5473 126875 141373 121722          1
## Si003491m.g  2600    377    307    340  21428  24538  15865          1
## Si003564m.g  2641   6026   4927   6010 121506 156397 127958          1
##               DE FDR.DE
## Si000068m.g bs>m      0
## Si000645m.g bs>m      0
## Si001591m.g m>bs      0
## Si001775m.g bs>m      0
## Si003109m.g bs>m      0
## Si003230m.g bs>m      0
## Si003343m.g bs>m      0
## Si003468m.g bs>m      0
## Si003491m.g bs>m      0
## Si003564m.g bs>m      0

output_DE <- topCounts(CD, group = "DE", number = 100000, normaliseData=TRUE)
write.table(output_DE, file="Si_bayseq.txt", sep="\t", row.names=TRUE, quote=F)
S

## Error: object 'S' not found