This document describes the analysis of the new data and compares the results with those obtained in the earlier analysis. As I reported in my emails, I do not believe that the data are actually stranded: both the pattern seen in IGV and the counts suggest that the reads come from unstranded libraries (a sketch of the counting-based check follows the sample table below). Replicate 3 for both 24D and 39F has been sequenced twice, and I am not sure how to best treat this statistically; for now I have treated the two runs as if they were independent libraries, which is not the optimal way to take the data structure into account.
Sample | Treatment | Replicate | Number of sequenced fragments |
---|---|---|---|
24D-1 | 24D | R01 | 51714243 |
24D-2 | 24D | R02 | 45382927 |
24D-3.1 | 24D | R03.1 | 21065731 |
24D-3.2 | 24D | R03.2 | 13137632 |
39F-1 | 39F | R01 | 39186674 |
39F-2 | 39F | R02 | 39651752 |
39F-3.1 | 39F | R03.1 | 26049145 |
39F-3.2 | 39F | R03.2 | 9389168 |
MSOR-1 | MSOR | R01 | 35119014 |
MSOR-2 | MSOR | R02 | 45784879 |
MSOR-3 | MSOR | R03 | 45615768 |
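For reference, the counting-based check of strandedness mentioned above can be done along the following lines. This is only a sketch of the idea, not the exact code that was run; it borrows the annotation path and the targets object from the mapping code further down.
# Count one library with all three strandSpecific settings and compare the
# fraction of assigned reads. For an unstranded library the two stranded
# settings (1 and 2) give roughly equal, clearly reduced fractions; for a
# stranded library one of them assigns far more reads than the other.
library(Rsubread)
chk <- lapply(0:2, function(s)
  featureCounts(files = targets$OutputFile[1],
                annot.ext = "/proj/b2014273/private/BILS/References/TAIR4.gtf",
                isGTFAnnotationFile = TRUE, strandSpecific = s,
                GTF.featureType = "exon", GTF.attrType = "gene_id"))
sapply(chk, function(x) x$stat[x$stat$Status == "Assigned", 2] / sum(x$stat[, 2]))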
The reads were mapped with the short read aligner [Subread align][Subread]. The analysis was done from within R with the code below. To convert the read data to counts per gene, i.e. gene expression, the featureCounts function from the same package was used. All these results were then saved as ‘Analysis_New.data’, which I shared with you via the Uppsala University Dropbox service.
library(Rsubread)
library(limma)
library(edgeR)
# setwd("/proj/b2014273/private/BILS/NewData/Mapping")
targets <- readTargets('../References/TargetsNewData.txt', sep = ' ')
treat <- factor(targets$Treatment)
design <- model.matrix(~treat)
buildindex(basename = "TAIR10", reference = '../References/TAIR10_chr_all.fasta')
align(index = 'TAIR10', readfile1 = targets$Read1, input_format = 'gzFASTQ', output_format = 'BAM', output_file = targets$OutputFile, tieBreakHamming=TRUE, unique=TRUE, indels=5, nthreads = 16)
fc.newdata <- featureCounts(files=targets$OutputFile, annot.ext = "/proj/b2014273/private/BILS/References/TAIR4.gtf", isGTFAnnotationFile = TRUE, strandSpecific = 0, GTF.featureType="exon", GTF.attrType='gene_id')
Newdata.dge.list <- DGEList(counts=fc.newdata$counts, genes=fc.newdata$annotation[,c("GeneID","Length")])
Newdata.dge.list.rpkm <- rpkm(Newdata.dge.list,Newdata.dge.list$genes$Length)
#Filtering. Only keep in the analysis those genes which had >10 reads per million mapped reads in at least two libraries.
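The save step mentioned above is not shown in the code; it would have been a call along the lines below, where the exact list of saved objects is my assumption.
# Hypothetical save call; the object names are assumptions based on the code above.
save(fc.newdata, Newdata.dge.list, Newdata.dge.list.rpkm, file = 'Analysis_New.data')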
setwd("~/Dropbox/BilsWork/1699_b2014273/R-analysis")
load('Analysis.newdata.rdata') # The actual analysis was run on Uppmax, so the saved results are loaded here
library(Rsubread)
library(limma)
library(edgeR)
The code below creates three different data sets: featureCounts on the Subread-aligned BAM files, featureCounts on the STAR-aligned BAM files, and the pre-computed count files from the STAR-based pipeline. This was done because I saw a big difference in the number of significant genes compared to the other analysis. As it turns out, the difference might not be as big as I thought, since they applied a fold-change filter that I normally do not use. Adding the same cut-off to my analysis (the lfc = 0.5 calls to decideTests() below) gives more similar results, but since we use different methods to estimate the expected variance the results are not identical.
targets <- readTargets('../References/TargetsNewData.txt', sep = ' ')
star.map <- list.files(".", pattern = "Starmap") # alignment (BAM) files from the STAR mapping, counted with featureCounts() below
buildindex(basename = "TAIR10", reference = '../References/TAIR10_chr_all.fasta')
align(index = 'TAIR10', readfile1 = targets$Read1, input_format = 'gzFASTQ', output_format = 'BAM', output_file = targets$OutputFile, tieBreakHamming=TRUE, unique=TRUE, indels=5, nthreads = 16)
fc.newdata <- featureCounts(files=targets$OutputFile, annot.ext = "/proj/b2014273/private/BILS/References/TAIR4.gtf",isGTFAnnotationFile = TRUE, strandSpecific = 0, GTF.featureType="exon", GTF.attrType='gene_id')
fc.newdata2 <- featureCounts(files=star.map, annot.ext = "/proj/b2014273/private/BILS/References/TAIR4.gtf",isGTFAnnotationFile = TRUE, strandSpecific = 0, GTF.featureType="exon", GTF.attrType='gene_id')
star.counts <- list.files("Counts/", full.names = TRUE, pattern = "Starmap") # per-sample count files from the STAR-based pipeline
# Read the count files and combine them column-wise; comment.char = "_" drops the summary lines whose names start with "_".
fc.newdata3 <- do.call("cbind", lapply(star.counts, FUN = function(files){read.table(files, header = FALSE, sep = "\t", row.names = 1, comment.char = "_")}))
In addition to the three data sets, I analysed all data first with the libraries treated exactly as they were sequenced, and then again with the re-sequenced libraries merged and counted as a single sequencing event (the objects whose names end in 9 are the merged ones, containing 9 samples).
treat <- factor(targets$Treatment)
treat <- relevel(treat, ref = "DMSO")
design <- model.matrix(~treat)
targets2 <- targets[c(1:3,5:7,9:11),]
treat2 <- factor(targets2$Treatment)
treat2 <- relevel(treat2, ref = "DMSO")
design2 <- model.matrix(~treat2)
# Merge the two sequencing runs of replicate 3 (columns 3 + 4 and 7 + 8, i.e. 24D-3.1/24D-3.2 and 39F-3.1/39F-3.2) into single libraries, for each of the three data sets.
fc.newdata9 <- fc.newdata
fc.newdata9$counts[,3] <- fc.newdata9$counts[,3] + fc.newdata9$counts[,4]
fc.newdata9$counts[,7] <- fc.newdata9$counts[,7] + fc.newdata9$counts[,8]
fc.newdata9$counts <- fc.newdata9$counts[,-c(4,8)]
fc.newdata29 <- fc.newdata2
fc.newdata29$counts[,3] <- fc.newdata29$counts[,3] + fc.newdata29$counts[,4]
fc.newdata29$counts[,7] <- fc.newdata29$counts[,7] + fc.newdata29$counts[,8]
fc.newdata29$counts <- fc.newdata29$counts[,-c(4,8)]
colnames(fc.newdata3) <- colnames(Newdata.dge.list$counts)
fc.newdata39 <- fc.newdata3
fc.newdata39[,3] <- fc.newdata39[,3] + fc.newdata39[,4]
fc.newdata39[,7] <- fc.newdata39[,7] + fc.newdata39[,8]
fc.newdata39 <- fc.newdata39[,-c(4,8)]
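As an aside, edgeR's sumTechReps() does the same technical-replicate pooling as the manual column sums above. A minimal sketch, where the grouping vector is my assumption based on the sample table at the top and on the columns of fc.newdata$counts being in that order:
# Pool technical replicates by a grouping ID (assumed to match the column order).
techrep.id <- c("24D-1", "24D-2", "24D-3", "24D-3",
                "39F-1", "39F-2", "39F-3", "39F-3",
                "MSOR-1", "MSOR-2", "MSOR-3")
counts9.alt <- sumTechReps(fc.newdata$counts, ID = techrep.id)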
Newdata.dge.list <- DGEList(counts=fc.newdata$counts, genes=fc.newdata$annotation[,c("GeneID","Length")])
Newdata9.dge.list <- DGEList(counts=fc.newdata9$counts, genes=fc.newdata9$annotation[,c("GeneID","Length")])
Newdata2.dge.list <- DGEList(counts=fc.newdata2$counts, genes=fc.newdata2$annotation[,c("GeneID","Length")])
Newdata29.dge.list <- DGEList(counts=fc.newdata29$counts, genes=fc.newdata29$annotation[,c("GeneID","Length")])
Newdata3.dge.list <- DGEList(counts=fc.newdata3, genes=row.names(fc.newdata3))
Newdata39.dge.list <- DGEList(counts=fc.newdata39, genes=row.names(fc.newdata39))
# Keep only genes with more than 3 counts per million in at least 3 libraries,
# recompute the library sizes for the retained genes and calculate TMM normalisation factors.
filterlowE <- function(DGE.obj) {
  isexpr <- rowSums(cpm(DGE.obj) > 3) >= 3
  calcNormFactors(DGE.obj[isexpr, , keep.lib.sizes = FALSE])
}
Newdata.dge.list <- filterlowE(Newdata.dge.list)
Newdata9.dge.list <- filterlowE(Newdata9.dge.list)
Newdata2.dge.list <- filterlowE(Newdata2.dge.list)
Newdata29.dge.list <- filterlowE(Newdata29.dge.list)
Newdata3.dge.list <- filterlowE(Newdata3.dge.list)
Newdata39.dge.list <- filterlowE(Newdata39.dge.list)
y.newdata <- voom(Newdata.dge.list,design)
y.newdata9 <- voom(Newdata9.dge.list, design2)
y.newdata2 <- voom(Newdata2.dge.list,design)
y.newdata29 <- voom(Newdata29.dge.list, design2)
y.newdata3 <- voom(Newdata3.dge.list,design)
y.newdata39 <- voom(Newdata39.dge.list, design2)
plotMDS(y.newdata)
plotMDS(y.newdata2, labels = colnames(Newdata.dge.list$counts))
plotMDS(y.newdata3)
plotMDS(y.newdata9)
plotMDS(y.newdata29, labels = colnames(Newdata9.dge.list$counts))
plotMDS(y.newdata39)
fit.newdata <- lmFit(y.newdata,design)
fit.newdata <- eBayes(fit.newdata)
summary(decideTests(fit.newdata[,2:3]))
## treat24D treat39F
## -1 3044 313
## 0 6904 11496
## 1 2619 758
summary(decideTests(fit.newdata[,2:3], lfc = 0.5))
## treat24D treat39F
## -1 748 21
## 0 11567 12318
## 1 252 228
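As an aside, the fold-change requirement can also be built into the test itself with limma's treat() instead of the post-hoc lfc argument to decideTests(). A minimal sketch, which I did not run as part of the analysis above; the numbers will differ somewhat because treat() tests against the threshold rather than filtering on the estimated fold-change afterwards.
# Sketch: require |log2 fold-change| > 0.5 as part of the moderated test.
fit.treat <- treat(lmFit(y.newdata, design), lfc = 0.5)
summary(decideTests(fit.treat[, 2:3]))
topTreat(fit.treat, coef = 2, number = 10)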
fit.newdata2 <- lmFit(y.newdata2,design)
fit.newdata2 <- eBayes(fit.newdata2)
summary(decideTests(fit.newdata2[,2:3]))
## treat24D treat39F
## -1 3034 338
## 0 6674 11341
## 1 2711 740
summary(decideTests(fit.newdata2[,2:3], lfc = 0.5))
## treat24D treat39F
## -1 745 26
## 0 11420 12167
## 1 254 226
fit.newdata3 <- lmFit(y.newdata3,design)
fit.newdata3 <- eBayes(fit.newdata3)
summary(decideTests(fit.newdata3[,2:3]))
## treat24D treat39F
## -1 3034 338
## 0 6674 11341
## 1 2711 740
summary(decideTests(fit.newdata3[,2:3], lfc = 0.5))
## treat24D treat39F
## -1 745 26
## 0 11420 12167
## 1 254 226
fit.newdata9 <- lmFit(y.newdata9,design2)
fit.newdata9 <- eBayes(fit.newdata9)
summary(r9 <- decideTests(fit.newdata9[,2:3]))
## treat224D treat239F
## -1 2928 236
## 0 7120 11584
## 1 2449 677
summary(decideTests(fit.newdata9[,2:3], lfc = 0.5))
## treat224D treat239F
## -1 736 21
## 0 11512 12258
## 1 249 218
fit.newdata29 <- lmFit(y.newdata29,design2)
fit.newdata29 <- eBayes(fit.newdata29)
summary(r29 <- decideTests(fit.newdata29[,2:3]))
## treat224D treat239F
## -1 2936 255
## 0 6900 11433
## 1 2513 661
summary(decideTests(fit.newdata29[,2:3], lfc = 0.5))
## treat224D treat239F
## -1 737 25
## 0 11363 12109
## 1 249 215
fit.newdata39 <- lmFit(y.newdata39,design2)
fit.newdata39 <- eBayes(fit.newdata39)
summary(r39 <- decideTests(fit.newdata39[,2:3]))
## treat224D treat239F
## -1 2936 255
## 0 6900 11433
## 1 2513 661
summary(decideTests(fit.newdata39[,2:3], lfc = 0.5))
## treat224D treat239F
## -1 737 25
## 0 11363 12109
## 1 249 215
write.fit(fit.newdata9[,2:3], results = r9, file = "fit9.txt", sep = "\t", digits = 30, adjust = "BH")
write.fit(fit.newdata29[,2:3], results = r29, file = "fit29.txt", sep = "\t", digits = 30, adjust = "BH")
write.fit(fit.newdata39[,2:3], results = r39, file = "fit39.txt", sep = "\t", digits = 30, adjust = "BH")
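Finally, one way to compare the three merged analyses directly for the 24D contrast would be along these lines (a sketch, not run here); since the independently filtered data sets contain slightly different gene sets, only the genes present in all three are used.
# Overlap of the 24D calls between the Subread counts, featureCounts on the
# STAR BAMs, and the pre-computed STAR-pipeline counts.
common <- Reduce(intersect, list(rownames(r9), rownames(r29), rownames(r39)))
venn.24D <- cbind(Subread = r9[common, 1],
                  STAR.fc = r29[common, 1],
                  STAR.counts = r39[common, 1])
vennDiagram(venn.24D, include = c("up", "down"))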