This describes the analysis that has been done for the new data and in addition compare the results that was obtained in the earlier analysis of the data. As I reported in my emails I do not belive that the data is actually stranded as both the pattern seen in IGV and the counts suggest that the reads are comming from unstranded libraries. Replicate 3 for both 24D and 39F have been sequenced twice, and I am not sure howto bost treat this statistically, for now I have just treated them as if they were independent libraries, which is not the optimal way to take the data structure into account.

Data

Sample Treatment Replicate Number of sequenced fragments
24D-1 24D R01 51714243
24D-2 24D R02 45382927
24D-3.1 24D R03.1 21065731
24D-3.2 24D R03.2 13137632
39F-1 39F R01 39186674
39F-2 39F R02 39651752
39F-3.1 39F R03.1 26049145
39F-3.2 39F R03.2 9389168
MSOR-1 MSOR R01 35119014
MSOR-2 MSOR R02 45784879
MSOR-3 MSOR R03 45615768

Short read mapping

The reads were mapped using the short read aligner [Subreadalign] [Subread]. The analysis were done from within R with the code below. To convert read data to counts per gene eg. gene expreession the function featureCounts from the the same package was used. All these results were then saved as ‘Analysis_New.data’ that I shared with you via Uppsala University Dropbox service.

library(Rsubread)
library(limma)
library(edgeR)

# setwd("/proj/b2014273/private/BILS/NewData/Mapping")
targets <- readTargets('../References/TargetsNewData.txt', sep = ' ')
treat <- factor(targets$Treatment)
design <- model.matrix(~treat)

buildindex(basename = "TAIR10", reference = '../References/TAIR10_chr_all.fasta')

align(index = 'TAIR10', readfile1 = targets$Read1, input_format = 'gzFASTQ', output_format = 'BAM', output_file = targets$OutputFile, tieBreakHamming=TRUE, unique=TRUE, indels=5, nthreads = 16)

fc.newdata <- featureCounts(files=targets$OutputFile, annot.ext = "/proj/b2014273/private/BILS/References/TAIR4.gtf", isGTFAnnotationFile = TRUE, strandSpecific = 0, GTF.featureType="exon", GTF.attrType='gene_id')

Newdata.dge.list <- DGEList(counts=fc.newdata$counts, genes=fc.newdata$annotation[,c("GeneID","Length")])
Newdata.dge.list.rpkm <- rpkm(Newdata.dge.list,Newdata.dge.list$genes$Length)
#Filtering. Only keep in the analysis those genes which had >10 reads per million mapped reads in at least two libraries.
setwd("~/Dropbox/BilsWork/1699_b2014273/R-analysis")
load('Analysis.newdata.rdata') # Since the actual analysis has been done on Uppmax I load data here
library(Rsubread)
library(limma)
library(edgeR)

Below code creates three different data sets

  1. Aligned with subreadalign and counted with featurecounts (like the previous data)
  2. Aligned with Star and counted with featurecounts
  3. Aligned with Star and counted with HTseqcount

This was done as I saw big difference in the number of significant genes depending compared to the other analysis. As it turns out the difference might not have been that big as I thought since they added a filter on fold-change that I normally do not do. Adding the same cut-off to my analysis gives more similar results, but as we use different methods to estimate the expected variance the results are not identical.

targets <- readTargets('../References/TargetsNewData.txt', sep = ' ')
star.map <- list.files(".", pattern = "Starmap")

buildindex(basename = "TAIR10", reference = '../References/TAIR10_chr_all.fasta')
align(index = 'TAIR10', readfile1 = targets$Read1, input_format = 'gzFASTQ', output_format = 'BAM', output_file = targets$OutputFile, tieBreakHamming=TRUE, unique=TRUE, indels=5, nthreads = 16)
fc.newdata <- featureCounts(files=targets$OutputFile, annot.ext = "/proj/b2014273/private/BILS/References/TAIR4.gtf",isGTFAnnotationFile = TRUE, strandSpecific = 0, GTF.featureType="exon", GTF.attrType='gene_id')
fc.newdata2 <- featureCounts(files=star.map, annot.ext = "/proj/b2014273/private/BILS/References/TAIR4.gtf",isGTFAnnotationFile = TRUE, strandSpecific = 0, GTF.featureType="exon", GTF.attrType='gene_id')
star.counts <- list.files("Counts/", full.names = TRUE, pattern = "Starmap")
fc.newdata3 <- do.call("cbind",lapply(star.counts, FUN = function(files){read.table(files, header = FALSE, sep = "\t", row.names = 1, comment.char = "_")}))

In addition to the 3 data set I analysed all data first with the libraries corresponding to how they were sequenced and then again when the re-sequenced library were added and treated as a single sequence event (the objects ending with 9 are the merged ones containing 9 samples).

treat <- factor(targets$Treatment)
treat <- relevel(treat, ref = "DMSO")
design <- model.matrix(~treat)

targets2 <- targets[c(1:3,5:7,9:11),]
treat2 <- factor(targets2$Treatment)
treat2 <- relevel(treat2, ref = "DMSO")
design2 <- model.matrix(~treat2)

fc.newdata9 <- fc.newdata
fc.newdata9$counts[,3] <- fc.newdata9$counts[,3] + fc.newdata9$counts[,4]
fc.newdata9$counts[,7] <- fc.newdata9$counts[,7] + fc.newdata9$counts[,8]
fc.newdata9$counts <- fc.newdata9$counts[,-c(4,8)]

fc.newdata29 <- fc.newdata2
fc.newdata29$counts[,3] <- fc.newdata29$counts[,3] + fc.newdata29$counts[,4]
fc.newdata29$counts[,7] <- fc.newdata29$counts[,7] + fc.newdata29$counts[,8]
fc.newdata29$counts <- fc.newdata29$counts[,-c(4,8)]

colnames(fc.newdata3) <- colnames(Newdata.dge.list$counts)
fc.newdata39 <- fc.newdata3
fc.newdata39[,3] <- fc.newdata39[,3] + fc.newdata39[,4]
fc.newdata39[,7] <- fc.newdata39[,7] + fc.newdata39[,8]
fc.newdata39 <- fc.newdata39[,-c(4,8)]

Newdata.dge.list <- DGEList(counts=fc.newdata$counts, genes=fc.newdata$annotation[,c("GeneID","Length")])
Newdata9.dge.list <- DGEList(counts=fc.newdata9$counts, genes=fc.newdata9$annotation[,c("GeneID","Length")])
Newdata2.dge.list <- DGEList(counts=fc.newdata2$counts, genes=fc.newdata2$annotation[,c("GeneID","Length")])
Newdata29.dge.list <- DGEList(counts=fc.newdata29$counts, genes=fc.newdata29$annotation[,c("GeneID","Length")])
Newdata3.dge.list <- DGEList(counts=fc.newdata3, genes=row.names(fc.newdata3))
Newdata39.dge.list <- DGEList(counts=fc.newdata39, genes=row.names(fc.newdata39))
filterlowE <- function(DGE.obj) {
    isexpr <- rowSums(cpm(DGE.obj) > 3) >= 3
    calcNormFactors(DGE.obj[isexpr,keep.lib.size = FALSE])
}

Newdata.dge.list <- filterlowE(Newdata.dge.list)
Newdata9.dge.list <- filterlowE(Newdata9.dge.list)
Newdata2.dge.list <- filterlowE(Newdata2.dge.list)
Newdata29.dge.list <- filterlowE(Newdata29.dge.list)
Newdata3.dge.list <- filterlowE(Newdata3.dge.list)
Newdata39.dge.list <- filterlowE(Newdata39.dge.list)


y.newdata <- voom(Newdata.dge.list,design)
y.newdata9 <- voom(Newdata9.dge.list, design2)
y.newdata2 <- voom(Newdata2.dge.list,design)
y.newdata29 <- voom(Newdata29.dge.list, design2)
y.newdata3 <- voom(Newdata3.dge.list,design)
y.newdata39 <- voom(Newdata39.dge.list, design2)

plotMDS(y.newdata)

plotMDS(y.newdata2, labels = colnames(Newdata.dge.list$counts))
plotMDS(y.newdata3)

plotMDS(y.newdata9)

plotMDS(y.newdata29, labels = colnames(Newdata9.dge.list$counts))
plotMDS(y.newdata39)

fit.newdata <- lmFit(y.newdata,design)
fit.newdata <- eBayes(fit.newdata)
summary(decideTests(fit.newdata[,2:3]))
##    treat24D treat39F
## -1     3044      313
## 0      6904    11496
## 1      2619      758
summary(decideTests(fit.newdata[,2:3], lfc = 0.5))
##    treat24D treat39F
## -1      748       21
## 0     11567    12318
## 1       252      228
fit.newdata2 <- lmFit(y.newdata2,design)
fit.newdata2 <- eBayes(fit.newdata2)
summary(decideTests(fit.newdata2[,2:3]))
##    treat24D treat39F
## -1     3034      338
## 0      6674    11341
## 1      2711      740
summary(decideTests(fit.newdata2[,2:3], lfc = 0.5))
##    treat24D treat39F
## -1      745       26
## 0     11420    12167
## 1       254      226
fit.newdata3 <- lmFit(y.newdata3,design)
fit.newdata3 <- eBayes(fit.newdata3)
summary(decideTests(fit.newdata3[,2:3]))
##    treat24D treat39F
## -1     3034      338
## 0      6674    11341
## 1      2711      740
summary(decideTests(fit.newdata3[,2:3], lfc = 0.5))
##    treat24D treat39F
## -1      745       26
## 0     11420    12167
## 1       254      226
fit.newdata9 <- lmFit(y.newdata9,design2)
fit.newdata9 <- eBayes(fit.newdata9)
summary(r9 <- decideTests(fit.newdata9[,2:3]))
##    treat224D treat239F
## -1      2928       236
## 0       7120     11584
## 1       2449       677
summary(decideTests(fit.newdata9[,2:3], lfc = 0.5))
##    treat224D treat239F
## -1       736        21
## 0      11512     12258
## 1        249       218
fit.newdata29 <- lmFit(y.newdata29,design2)
fit.newdata29 <- eBayes(fit.newdata29)
summary(r29 <- decideTests(fit.newdata29[,2:3]))
##    treat224D treat239F
## -1      2936       255
## 0       6900     11433
## 1       2513       661
summary(decideTests(fit.newdata29[,2:3], lfc = 0.5))
##    treat224D treat239F
## -1       737        25
## 0      11363     12109
## 1        249       215
fit.newdata39 <- lmFit(y.newdata39,design2)
fit.newdata39 <- eBayes(fit.newdata39)
summary(r39 <- decideTests(fit.newdata39[,2:3]))
##    treat224D treat239F
## -1      2936       255
## 0       6900     11433
## 1       2513       661
summary(decideTests(fit.newdata39[,2:3], lfc = 0.5))
##    treat224D treat239F
## -1       737        25
## 0      11363     12109
## 1        249       215
write.fit(fit.newdata9[,2:3], results = r9, file = "fit9.txt", sep = "\t", digits = 30, adjust = "BH")

write.fit(fit.newdata29[,2:3], results = r29, file = "fit29.txt", sep = "\t", digits = 30, adjust = "BH")

write.fit(fit.newdata39[,2:3], results = r39, file = "fit39.txt", sep = "\t", digits = 30, adjust = "BH")