New Data

This describes the analysis that has been done for the new data and in addition compare the results that was obtained in the earlier analysis of the data. As I reported in my emails I do not belive that the data is actually stranded as both the pattern seen in IGV and the counts suggest that the reads are comming from unstranded libraries. Replicate 3 for both 24D and 39F have been sequenced twice, and I am not sure howto bost treat this statistically, for now I have just treated them as if they were independent libraries, which is not the optimal way to take the data structure into account.

Sample	Treatment	Replicate	Number of sequenced fragments
24D-1	24D	R01	51714243
24D-2	24D	R02	45382927
24D-3.1	24D	R03.1	21065731
24D-3.2	24D	R03.2	13137632
39F-1	39F	R01	39186674
39F-2	39F	R02	39651752
39F-3.1	39F	R03.1	26049145
39F-3.2	39F	R03.2	9389168
MSOR-1	MSOR	R01	35119014
MSOR-2	MSOR	R02	45784879
MSOR-3	MSOR	R03	45615768

Short read mapping

The reads were mapped using the short read aligner [Subreadalign] [Subread]. The analysis were done from within R with the code below. To convert read data to counts per gene eg. gene expreession the function featureCounts from the the same package was used. All these results were then saved as ‘Analysis_New.data’ that I shared with you via Uppsala University Dropbox service.

library(Rsubread)
library(limma)
library(edgeR)

# setwd("/proj/b2014273/private/BILS/NewData/Mapping")
targets <- readTargets('../References/TargetsNewData.txt', sep = ' ')
treat <- factor(targets$Treatment)
design <- model.matrix(~treat)

buildindex(basename = "TAIR10", reference = '../References/TAIR10_chr_all.fasta')

align(index = 'TAIR10', readfile1 = targets$Read1, input_format = 'gzFASTQ', output_format = 'BAM', output_file = targets$OutputFile, tieBreakHamming=TRUE, unique=TRUE, indels=5, nthreads = 16)

fc.newdata <- featureCounts(files=targets$OutputFile, annot.ext = "/proj/b2014273/private/BILS/References/TAIR4.gtf", isGTFAnnotationFile = TRUE, strandSpecific = 0, GTF.featureType="exon", GTF.attrType='gene_id')

Newdata.dge.list <- DGEList(counts=fc.newdata$counts, genes=fc.newdata$annotation[,c("GeneID","Length")])
Newdata.dge.list.rpkm <- rpkm(Newdata.dge.list,Newdata.dge.list$genes$Length)
#Filtering. Only keep in the analysis those genes which had >10 reads per million mapped reads in at least two libraries.

setwd("~/Dropbox/BilsWork/1699_b2014273/R-analysis")
load('Analysis.newdata.rdata') # Since the actual analysis has been done on Uppmax I load data here
library(Rsubread)
library(limma)
library(edgeR)

Below code creates three different data sets

Aligned with subreadalign and counted with featurecounts (like the previous data)
Aligned with Star and counted with featurecounts
Aligned with Star and counted with HTseqcount

This was done as I saw big difference in the number of significant genes depending compared to the other analysis. As it turns out the difference might not have been that big as I thought since they added a filter on fold-change that I normally do not do. Adding the same cut-off to my analysis gives more similar results, but as we use different methods to estimate the expected variance the results are not identical.

targets <- readTargets('../References/TargetsNewData.txt', sep = ' ')
star.map <- list.files(".", pattern = "Starmap")

buildindex(basename = "TAIR10", reference = '../References/TAIR10_chr_all.fasta')
align(index = 'TAIR10', readfile1 = targets$Read1, input_format = 'gzFASTQ', output_format = 'BAM', output_file = targets$OutputFile, tieBreakHamming=TRUE, unique=TRUE, indels=5, nthreads = 16)
fc.newdata <- featureCounts(files=targets$OutputFile, annot.ext = "/proj/b2014273/private/BILS/References/TAIR4.gtf",isGTFAnnotationFile = TRUE, strandSpecific = 0, GTF.featureType="exon", GTF.attrType='gene_id')
fc.newdata2 <- featureCounts(files=star.map, annot.ext = "/proj/b2014273/private/BILS/References/TAIR4.gtf",isGTFAnnotationFile = TRUE, strandSpecific = 0, GTF.featureType="exon", GTF.attrType='gene_id')
star.counts <- list.files("Counts/", full.names = TRUE, pattern = "Starmap")
fc.newdata3 <- do.call("cbind",lapply(star.counts, FUN = function(files){read.table(files, header = FALSE, sep = "\t", row.names = 1, comment.char = "_")}))

In addition to the 3 data set I analysed all data first with the libraries corresponding to how they were sequenced and then again when the re-sequenced library were added and treated as a single sequence event (the objects ending with 9 are the merged ones containing 9 samples).

treat <- factor(targets$Treatment)
treat <- relevel(treat, ref = "DMSO")
design <- model.matrix(~treat)

targets2 <- targets[c(1:3,5:7,9:11),]
treat2 <- factor(targets2$Treatment)
treat2 <- relevel(treat2, ref = "DMSO")
design2 <- model.matrix(~treat2)

fc.newdata9 <- fc.newdata
fc.newdata9$counts[,3] <- fc.newdata9$counts[,3] + fc.newdata9$counts[,4]
fc.newdata9$counts[,7] <- fc.newdata9$counts[,7] + fc.newdata9$counts[,8]
fc.newdata9$counts <- fc.newdata9$counts[,-c(4,8)]

fc.newdata29 <- fc.newdata2
fc.newdata29$counts[,3] <- fc.newdata29$counts[,3] + fc.newdata29$counts[,4]
fc.newdata29$counts[,7] <- fc.newdata29$counts[,7] + fc.newdata29$counts[,8]
fc.newdata29$counts <- fc.newdata29$counts[,-c(4,8)]

colnames(fc.newdata3) <- colnames(Newdata.dge.list$counts)
fc.newdata39 <- fc.newdata3
fc.newdata39[,3] <- fc.newdata39[,3] + fc.newdata39[,4]
fc.newdata39[,7] <- fc.newdata39[,7] + fc.newdata39[,8]
fc.newdata39 <- fc.newdata39[,-c(4,8)]

Newdata.dge.list <- DGEList(counts=fc.newdata$counts, genes=fc.newdata$annotation[,c("GeneID","Length")])
Newdata9.dge.list <- DGEList(counts=fc.newdata9$counts, genes=fc.newdata9$annotation[,c("GeneID","Length")])
Newdata2.dge.list <- DGEList(counts=fc.newdata2$counts, genes=fc.newdata2$annotation[,c("GeneID","Length")])
Newdata29.dge.list <- DGEList(counts=fc.newdata29$counts, genes=fc.newdata29$annotation[,c("GeneID","Length")])
Newdata3.dge.list <- DGEList(counts=fc.newdata3, genes=row.names(fc.newdata3))
Newdata39.dge.list <- DGEList(counts=fc.newdata39, genes=row.names(fc.newdata39))

filterlowE <- function(DGE.obj) {
    isexpr <- rowSums(cpm(DGE.obj) > 3) >= 3
    calcNormFactors(DGE.obj[isexpr,keep.lib.size = FALSE])
}

Newdata.dge.list <- filterlowE(Newdata.dge.list)
Newdata9.dge.list <- filterlowE(Newdata9.dge.list)
Newdata2.dge.list <- filterlowE(Newdata2.dge.list)
Newdata29.dge.list <- filterlowE(Newdata29.dge.list)
Newdata3.dge.list <- filterlowE(Newdata3.dge.list)
Newdata39.dge.list <- filterlowE(Newdata39.dge.list)


y.newdata <- voom(Newdata.dge.list,design)
y.newdata9 <- voom(Newdata9.dge.list, design2)
y.newdata2 <- voom(Newdata2.dge.list,design)
y.newdata29 <- voom(Newdata29.dge.list, design2)
y.newdata3 <- voom(Newdata3.dge.list,design)
y.newdata39 <- voom(Newdata39.dge.list, design2)

plotMDS(y.newdata)

plotMDS(y.newdata2, labels = colnames(Newdata.dge.list$counts))
plotMDS(y.newdata3)

plotMDS(y.newdata9)

plotMDS(y.newdata29, labels = colnames(Newdata9.dge.list$counts))
plotMDS(y.newdata39)

fit.newdata <- lmFit(y.newdata,design)
fit.newdata <- eBayes(fit.newdata)
summary(decideTests(fit.newdata[,2:3]))

##    treat24D treat39F
## -1     3044      313
## 0      6904    11496
## 1      2619      758

summary(decideTests(fit.newdata[,2:3], lfc = 0.5))

##    treat24D treat39F
## -1      748       21
## 0     11567    12318
## 1       252      228

fit.newdata2 <- lmFit(y.newdata2,design)
fit.newdata2 <- eBayes(fit.newdata2)
summary(decideTests(fit.newdata2[,2:3]))

##    treat24D treat39F
## -1     3034      338
## 0      6674    11341
## 1      2711      740

summary(decideTests(fit.newdata2[,2:3], lfc = 0.5))

##    treat24D treat39F
## -1      745       26
## 0     11420    12167
## 1       254      226

fit.newdata3 <- lmFit(y.newdata3,design)
fit.newdata3 <- eBayes(fit.newdata3)
summary(decideTests(fit.newdata3[,2:3]))

##    treat24D treat39F
## -1     3034      338
## 0      6674    11341
## 1      2711      740

summary(decideTests(fit.newdata3[,2:3], lfc = 0.5))

##    treat24D treat39F
## -1      745       26
## 0     11420    12167
## 1       254      226

fit.newdata9 <- lmFit(y.newdata9,design2)
fit.newdata9 <- eBayes(fit.newdata9)
summary(r9 <- decideTests(fit.newdata9[,2:3]))

##    treat224D treat239F
## -1      2928       236
## 0       7120     11584
## 1       2449       677

summary(decideTests(fit.newdata9[,2:3], lfc = 0.5))

##    treat224D treat239F
## -1       736        21
## 0      11512     12258
## 1        249       218

fit.newdata29 <- lmFit(y.newdata29,design2)
fit.newdata29 <- eBayes(fit.newdata29)
summary(r29 <- decideTests(fit.newdata29[,2:3]))

##    treat224D treat239F
## -1      2936       255
## 0       6900     11433
## 1       2513       661

summary(decideTests(fit.newdata29[,2:3], lfc = 0.5))

##    treat224D treat239F
## -1       737        25
## 0      11363     12109
## 1        249       215

fit.newdata39 <- lmFit(y.newdata39,design2)
fit.newdata39 <- eBayes(fit.newdata39)
summary(r39 <- decideTests(fit.newdata39[,2:3]))

##    treat224D treat239F
## -1      2936       255
## 0       6900     11433
## 1       2513       661

summary(decideTests(fit.newdata39[,2:3], lfc = 0.5))

##    treat224D treat239F
## -1       737        25
## 0      11363     12109
## 1        249       215

write.fit(fit.newdata9[,2:3], results = r9, file = "fit9.txt", sep = "\t", digits = 30, adjust = "BH")

write.fit(fit.newdata29[,2:3], results = r29, file = "fit29.txt", sep = "\t", digits = 30, adjust = "BH")

write.fit(fit.newdata39[,2:3], results = r39, file = "fit39.txt", sep = "\t", digits = 30, adjust = "BH")

New Data

Thomas Källman, BILS

9 Sep 2015

Data

Short read mapping