Perform differential expression analysis on the provided dataset but first filter the data to remove genes with 1) count equal to zero across all samples and 2) count equal to zero in at least one sample in each genotype group.
We will start by filtering out the unwanted genes from our dataset.
dat1 <- dat[(rowSums(dat) != 0), ] # filter genes with count zero across all samples
keepGenes <- !(apply(dat1[, des$strain == "C57BL/6J"], 1, function(x) sum(x ==
0)) > 0 & apply(dat1[, des$strain == "DBA/2J"], 1, function(x) sum(x ==
0)) > 0) # filter genes with count zero for at least one sample in each genotype group
dat.filtered <- dat1[keepGenes, ]
group <- factor(c(rep("1", 10), rep("2", 11)))
dge.glm <- DGEList(counts = dat.filtered, group = group)
design.matrix <- model.matrix(~group)
dge.glm.com.disp <- estimateGLMCommonDisp(dge.glm, design.matrix, verbose = TRUE)
## Disp = 0.0383 , BCV = 0.1957
dge.glm.trend.disp <- estimateGLMTrendedDisp(dge.glm.com.disp)
## Loading required package: splines
dge.glm.tag.disp <- estimateGLMTagwiseDisp(dge.glm.trend.disp, design.matrix)
plotBCV(dge.glm.tag.disp)
fit <- glmFit(dge.glm.tag.disp, design.matrix)
lrt <- glmLRT(fit, coef = 2)
tt.glm <- topTags(lrt, n = Inf)
nrow(tt.glm$table[tt.glm$table$FDR < 0.01, ]) # number of hits with FDR < 0.01
## [1] 731
There are 731 hits with FDR < 0.01.
Let's get the direction of differential expression
diff.exp <- decideTestsDGE(lrt, p = 0.05, adjust = "BH")
table(diff.exp)
## diff.exp
## -1 0 1
## 588 8773 612
We can see that 588 genes are under-expressed in group 2 (DBA/2J) compared with group 1 (C57BL/6J), 8773 show no differences in expression while 612 genes are over-expressed in group 2 versus group 1.
tags.glm <- rownames(dge.glm.tag.disp)[as.logical(diff.exp)]
plotSmear(lrt, de.tags = tags.glm)
abline(h = c(-2, 2), col = "blue")
Choose a specific threshold for the adjusted p value, find the genes identified as differentially expressed using each of edgeR, DESeq and voom+limma. Compare the number of genes in these 3 lists, and draw a venn digram demonstrating the overlap (if any!). We will select an adjusted p-value threshold of 1e-08.
adj.threshold <- 1e-08
group <- factor(c(rep("1", 10), rep("2", 11)))
dge.glm <- DGEList(counts = dat, group = group)
design <- model.matrix(~group)
dge.glm.com.disp <- estimateGLMCommonDisp(dge.glm, design, verbose = TRUE)
## Disp = 0.03893 , BCV = 0.1973
dge.glm.trend.disp <- estimateGLMTrendedDisp(dge.glm.com.disp)
dge.glm.tag.disp <- estimateGLMTagwiseDisp(dge.glm.trend.disp, design)
fitE <- glmFit(dge.glm.tag.disp, design)
lrt <- glmLRT(fitE, coef = grep("group", colnames(fitE)))
top.tags <- topTags(lrt, n = Inf)
edgeR.hits <- top.tags$table[top.tags$table$FDR < adj.threshold, ]
nrow(edgeR.hits)
## [1] 141
There are 141 edgeR hits with FDR < 1e-08.
deSeqDat <- newCountDataSet(dat, group)
deSeqDat <- estimateSizeFactors(deSeqDat)
sizeFactors(deSeqDat)
## SRX033480 SRX033488 SRX033481 SRX033489 SRX033482 SRX033490 SRX033483
## 0.6439 1.3454 0.5785 1.4295 0.6355 1.5240 0.7933
## SRX033476 SRX033478 SRX033479 SRX033472 SRX033473 SRX033474 SRX033475
## 1.1272 1.0772 0.8984 0.8886 1.0255 0.7987 0.7796
## SRX033491 SRX033484 SRX033492 SRX033485 SRX033493 SRX033486 SRX033494
## 1.6162 0.9882 1.5720 0.7558 1.5922 0.8264 1.4715
deSeqDat <- estimateDispersions(deSeqDat)
results <- nbinomTest(deSeqDat, levels(group)[1], levels(group)[2])
DESeq.hits <- na.omit(results[results$padj < adj.threshold, ])
nrow(DESeq.hits)
## [1] 99
There are 99 DESeq hits with FDR < 1e-08.
norm.factor <- calcNormFactors(dat)
dat.voomed <- voom(dat, design, plot = TRUE, lib.size = colSums(dat) * norm.factor)
fitV <- lmFit(dat.voomed, design)
fitV <- eBayes(fitV)
Voom.hits <- topTable(fitV, coef = "group2", n = Inf, p.value = adj.threshold)
nrow(Voom.hits)
## [1] 50
There are 99 DESeq hits with FDR < 1e-08.
hits <- list(edgeR = rownames(edgeR.hits), DESeq = rownames(DESeq.hits), Voom.Limma = rownames(Voom.hits))
venn(hits)
We can see that there are 50 shared hits between the edgeR and voom methods at our adjusted p-value threshold of 1e-08. However, there are no shared hits between DESeq and the other two methods.