Author: Shannon Erdelyi
Date: Feb 27, 2014
# libraries
library(edgeR)
library(DESeq)
library(limma)
# data
dat <- read.table("data/bottomly_count_table.tsv", header = T, row.names = 1)
des <- read.table("data/bottomly_phenodata.tsv", header = T, row.names = 1)
str(dat)
str(des)
edgeRRedo the above analysis but first filter the data and remove any gene that has
1. count equal to zero across all samples
2. count equal to zero in at least one sample in each genotype group
# filter genes
keepGenes <- apply(dat, 1, sum) != 0
dat1 <- dat[keepGenes, ]
# create an object storing the data and design information (sample) sample
# data only contains grouping by strain
dge <- DGEList(counts = dat1, group = as.numeric(des$strain))
# design matrix
dm <- model.matrix(~group, dge$samples)
# dispersion
comDisp <- estimateGLMCommonDisp(dge, dm, verbose = T)
## Disp = 0.03893 , BCV = 0.1973
trendDisp <- estimateGLMTrendedDisp(comDisp)
tagDisp <- estimateGLMTagwiseDisp(trendDisp, dm) ## EBayes
# plot the tagwise dispersion against log2-CPM (counts per million)
plotBCV(tagDisp)
# fit glm
fit <- glmFit(tagDisp, dm)
lrt <- glmLRT(fit, coef = grep("group", colnames(coef(fit))))
groupTags <- topTags(lrt, n = Inf)
# genes with FDR < 0.01
nrow(groupTags$table[groupTags$table$FDR < 0.01, ])
## [1] 696
# interesting genes
coolGenes <- rownames(groupTags$table[groupTags$table$FDR < 1e-50, ])
heatmap(cpm(tagDisp)[coolGenes, ], Rowv = NA, Colv = NA, margins = c(10, 10))
# direction of differential expression
de <- decideTestsDGE(lrt, p = 0.05, adjust = "BH")
table(de)
## de
## -1 0 1
## 591 12762 579
# plotting the tagwise log fold changes against log-cpm
tags <- rownames(tagDisp)[abs(de) == 1]
plotSmear(lrt, de.tags = tags)
abline(h = c(-2, 2), col = "blue")
# filter genes
keepGenes <- !(apply(dat[, des$strain == "C57BL/6J"], 1, function(x) sum(x ==
0)) > 0 & apply(dat[, des$strain == "DBA/2J"], 1, function(x) sum(x == 0)) >
0)
dat2 <- dat[keepGenes, ]
# create an object storing the data and design information (sample) sample
# data only contains grouping by strain
dge <- DGEList(counts = dat2, group = as.numeric(des$strain))
# design matrix
dm <- model.matrix(~group, dge$samples)
# dispersion
comDisp <- estimateGLMCommonDisp(dge, dm, verbose = T)
## Disp = 0.0383 , BCV = 0.1957
trendDisp <- estimateGLMTrendedDisp(comDisp)
tagDisp <- estimateGLMTagwiseDisp(trendDisp, dm) ## EBayes
# plot the tagwise dispersion against log2-CPM (counts per million)
plotBCV(tagDisp)
# fit glm
fit <- glmFit(tagDisp, dm)
lrt <- glmLRT(fit, coef = grep("group", colnames(coef(fit))))
groupTags <- topTags(lrt, n = Inf)
# genes with FDR < 0.01
nrow(groupTags$table[groupTags$table$FDR < 0.01, ])
## [1] 669
# interesting genes
coolGenes <- rownames(groupTags$table[groupTags$table$FDR < 1e-50, ])
heatmap(cpm(tagDisp)[coolGenes, ], Rowv = NA, Colv = NA, margins = c(10, 10))
# direction of differential expression
de <- decideTestsDGE(lrt, p = 0.05, adjust = "BH")
table(de)
## de
## -1 0 1
## 553 8854 566
# plotting the tagwise log fold changes against log-cpm
tags <- rownames(tagDisp)[abs(de) == 1]
plotSmear(lrt, de.tags = tags)
abline(h = c(-2, 2), col = "blue")
Choose a specific threshold for the adjusted p value, find the genes identified as differentially expressed using each of edgeR, DESeq and voom+limma. Compare the number of genes in these 3 lists, and draw a venn digram demonstrating the overlap (if any!).
pThresh <- 1e-10
edgeR# samples and counts
dge <- DGEList(counts = dat, group = as.numeric(des$strain))
# design matrix
dm <- model.matrix(~group, dge$samples)
# dispersion
comDisp <- estimateGLMCommonDisp(dge, dm, verbose = T)
## Disp = 0.03893 , BCV = 0.1973
trendDisp <- estimateGLMTrendedDisp(comDisp)
tagDisp <- estimateGLMTagwiseDisp(trendDisp, dm)
# fit
fit <- glmFit(tagDisp, dm)
lrt <- glmLRT(fit, coef = grep("group", colnames(coef(fit))))
groupTags <- topTags(lrt, n = Inf)
# differentially expressed genes with FDR < pThresh
deEdge <- groupTags$table[groupTags$table$FDR < pThresh, ]
edgeR found 105 differentially expressed genes.
DESeq# data
seqDat <- newCountDataSet(dat, des$strain)
seqDat <- estimateSizeFactors(seqDat)
seqDat <- estimateDispersions(seqDat)
# fit
seqResults <- nbinomTest(seqDat, levels(des$strain)[1], levels(des$strain)[2])
# differentially expressed genes with FDR < pThresh
deSeq <- na.omit(seqResults[seqResults$padj < pThresh, ])
DESeq found 76 differentially expressed genes.
voom + limma# data
norm.factor <- calcNormFactors(dat)
dat.voomed <- voom(dat, dm, lib.size = colSums(dat) * norm.factor)
# fit
fit <- lmFit(dat.voomed, dm)
fit <- eBayes(fit)
voomResults <- topTable(fit, coef = "group2", n = Inf)
# differentially expressed genes with FDR < pThresh
deVoom <- voomResults[voomResults$adj.P.Val < pThresh, ]
voom + limma found 18 differentially expressed genes.
# differential expression matrix
deMat <- matrix(c(as.numeric(rownames(dat) %in% rownames(deEdge)), as.numeric(rownames(dat) %in%
deSeq$id), as.numeric(rownames(dat) %in% deVoom$ID)), nrow = nrow(dat),
ncol = 3)
colnames(deMat) <- c("edgeR", "DESeq", "voom + limma")
vennDiagram(vennCounts(deMat))