Shaun Jackman — Mar 1, 2013, 12:17 PM
# STAT 540 Seminar 8
# Shaun Jackman
# edgeR
library(edgeR)
Loading required package: limma
dat <- read.table("../data/bottomly/bottomly_count_table.txt",
header = TRUE, row.names = 1)
des <- read.table("../data/bottomly/bottomly_phenodata.txt",
header = TRUE, row.names = 1)
all(rownames(des) == colnames(dat))
[1] TRUE
# GLM edgeR
table(des$strain)
C57BL/6J DBA/2J
10 11
group <- factor(c(rep("1", 10), rep(2, 11)))
dge.glm <- DGEList(counts = dat, group = group)
Calculating library sizes from column totals.
design <- model.matrix(~group)
dge.glm.com.disp <- estimateGLMCommonDisp(dge.glm, design, verbose = TRUE)
Disp = 0.03893 , BCV = 0.1973
dge.glm.trend.disp <- estimateGLMTrendedDisp(dge.glm.com.disp)
Loading required package: splines
dge.glm.tag.disp <- estimateGLMTagwiseDisp(dge.glm.trend.disp, design)
plotBCV(dge.glm.tag.disp)
fit <- glmFit(dge.glm.tag.disp, design)
lrt <- glmLRT(fit, coef = 2)
topTags(lrt)
Coefficient: group2
logFC logCPM LR PValue FDR
ENSMUSG00000020912 -5.248 3.090 407.6 1.230e-90 4.495e-86
ENSMUSG00000050141 -5.491 2.200 309.5 2.743e-69 5.010e-65
ENSMUSG00000035775 -4.599 2.583 302.6 9.121e-68 1.111e-63
ENSMUSG00000024248 -3.164 3.409 271.5 5.301e-61 4.842e-57
ENSMUSG00000054354 -6.623 1.790 257.2 6.990e-58 5.108e-54
ENSMUSG00000050824 3.725 3.301 243.1 8.382e-55 5.104e-51
ENSMUSG00000015484 -1.968 4.273 230.4 4.843e-52 2.528e-48
ENSMUSG00000030532 1.548 5.565 219.6 1.123e-49 5.128e-46
ENSMUSG00000028393 1.803 6.103 214.5 1.444e-48 5.860e-45
ENSMUSG00000023236 1.426 7.064 207.5 4.920e-47 1.798e-43
tt.glm <- topTags(lrt, n = Inf)
nrow(tt.glm$table[tt.glm$table$FDR < 0.01, ])
[1] 566
interestingSamples <- rownames(tt.glm$table[tt.glm$table$FDR < 1e-50, ])
cpm(dge.glm.tag.disp)[interestingSamples, ]
SRX033480 SRX033488 SRX033481 SRX033489 SRX033482
ENSMUSG00000020912 20.3928 12.6910 15.83 14.819 19.2296
ENSMUSG00000050141 14.1434 10.1528 10.67 6.264 9.9464
ENSMUSG00000035775 14.4723 10.6287 19.51 11.305 12.9303
ENSMUSG00000024248 14.8012 26.0166 17.30 23.527 18.5665
ENSMUSG00000054354 10.8542 7.6146 11.41 8.708 8.2886
ENSMUSG00000050824 0.6578 0.6346 1.84 1.528 0.6631
SRX033490 SRX033483 SRX033476 SRX033478 SRX033479
ENSMUSG00000020912 17.330 26.4301 13.359 23.6211 14.310
ENSMUSG00000050141 7.890 8.3605 6.583 11.9115 12.402
ENSMUSG00000035775 12.540 12.1363 6.389 13.5266 11.210
ENSMUSG00000024248 25.784 18.8786 24.781 14.3342 14.310
ENSMUSG00000054354 7.608 7.0121 3.291 4.4416 5.963
ENSMUSG00000050824 1.409 0.2697 3.485 0.8076 1.431
SRX033472 SRX033473 SRX033474 SRX033475 SRX033491
ENSMUSG00000020912 0.5037 0.4226 1.0805 0.2802 1.0995
ENSMUSG00000050141 0.2518 0.0000 0.0000 0.2802 0.8246
ENSMUSG00000035775 0.7555 0.0000 0.2701 0.2802 0.6872
ENSMUSG00000024248 2.2666 1.2677 1.8908 3.3618 3.1610
ENSMUSG00000054354 0.0000 0.0000 0.0000 0.0000 0.5497
ENSMUSG00000050824 19.6437 24.2975 12.6957 15.6885 19.6531
SRX033484 SRX033492 SRX033485 SRX033493 SRX033486
ENSMUSG00000020912 0.0000 0.7027 0.8651 0.1362 0.0000
ENSMUSG00000050141 0.0000 0.1405 0.0000 0.1362 0.2578
ENSMUSG00000035775 0.6784 0.1405 0.5767 0.1362 1.8045
ENSMUSG00000024248 1.5829 1.6864 1.7302 1.7712 2.5779
ENSMUSG00000054354 0.0000 0.0000 0.0000 0.0000 0.0000
ENSMUSG00000050824 19.2209 13.3505 12.1117 14.5780 13.1473
SRX033494
ENSMUSG00000020912 0.0000
ENSMUSG00000050141 0.1477
ENSMUSG00000035775 0.5907
ENSMUSG00000024248 3.1012
ENSMUSG00000054354 0.0000
ENSMUSG00000050824 26.4336
summary(de.glm <- decideTestsDGE(lrt, p = 0.05, adjust = "BH"))
[,1]
-1 437
0 35698
1 401
tags.glm <- rownames(dge.glm.tag.disp)[as.logical(de.glm)]
plotSmear(lrt, de.tags = tags.glm)
abline(h = c(-2, 2), col = "blue")
# Mini exercise:
# Redo the above analysis but first filter the data and remove any gene that
# has: 1. count equal tot zero across all samples 2. count equal to zero in at
# least one sample in each genotype group
do.edgeR <- function(dat) {
dge.glm <- DGEList(counts = dat, group = group)
design <- model.matrix(~group)
dge.glm.com.disp <- estimateGLMCommonDisp(dge.glm, design, verbose = TRUE)
dge.glm.trend.disp <- estimateGLMTrendedDisp(dge.glm.com.disp)
dge.glm.tag.disp <- estimateGLMTagwiseDisp(dge.glm.trend.disp, design)
plotBCV(dge.glm.tag.disp)
fit <- glmFit(dge.glm.tag.disp, design)
lrt <- glmLRT(fit, coef = 2)
cat('Top genes:')
print(topTags(lrt))
tt.glm <- topTags(lrt, n = Inf)
cat('Number of genes with FDR < 0.01:')
print(sum(tt.glm$table$FDR < 0.01))
interestingSamples <- rownames(tt.glm$table[tt.glm$table$FDR < 1e-50, ])
cat('Interesting genes:')
print(cpm(dge.glm.tag.disp)[interestingSamples, ])
cat('Interesting genes with BH FDR < 0.05:')
print(table(de.glm <- decideTestsDGE(lrt, p = 0.05, adjust = "BH")))
tags.glm <- rownames(dge.glm.tag.disp)[as.logical(de.glm)]
plotSmear(lrt, de.tags = tags.glm)
abline(h = c(-2, 2), col = "blue")
de.glm
}
edgeR.hits <- do.edgeR(subset(dat, apply(dat, 1, function(x) !all(x == 0))))
Calculating library sizes from column totals.
Disp = 0.03893 , BCV = 0.1973
Top genes:Coefficient: group2
logFC logCPM LR PValue FDR
ENSMUSG00000020912 -5.248 3.091 404.9 4.715e-90 6.569e-86
ENSMUSG00000050141 -5.490 2.199 313.3 4.212e-70 2.934e-66
ENSMUSG00000035775 -4.599 2.583 303.2 6.482e-68 3.010e-64
ENSMUSG00000024248 -3.164 3.409 270.5 8.807e-61 3.068e-57
ENSMUSG00000054354 -6.623 1.789 259.3 2.380e-58 6.631e-55
ENSMUSG00000050824 3.725 3.301 241.4 1.996e-54 4.635e-51
ENSMUSG00000015484 -1.968 4.274 228.1 1.547e-51 3.078e-48
ENSMUSG00000030532 1.548 5.565 220.3 7.756e-50 1.351e-46
ENSMUSG00000028393 1.803 6.103 216.3 5.703e-49 8.828e-46
ENSMUSG00000023236 1.426 7.064 210.8 9.040e-48 1.259e-44
Number of genes with FDR < 0.01:[1] 696
Interesting genes: SRX033480 SRX033488 SRX033481 SRX033489 SRX033482
ENSMUSG00000020912 20.3928 12.6910 15.83 14.819 19.2296
ENSMUSG00000050141 14.1434 10.1528 10.67 6.264 9.9464
ENSMUSG00000035775 14.4723 10.6287 19.51 11.305 12.9303
ENSMUSG00000024248 14.8012 26.0166 17.30 23.527 18.5665
ENSMUSG00000054354 10.8542 7.6146 11.41 8.708 8.2886
ENSMUSG00000050824 0.6578 0.6346 1.84 1.528 0.6631
SRX033490 SRX033483 SRX033476 SRX033478 SRX033479
ENSMUSG00000020912 17.330 26.4301 13.359 23.6211 14.310
ENSMUSG00000050141 7.890 8.3605 6.583 11.9115 12.402
ENSMUSG00000035775 12.540 12.1363 6.389 13.5266 11.210
ENSMUSG00000024248 25.784 18.8786 24.781 14.3342 14.310
ENSMUSG00000054354 7.608 7.0121 3.291 4.4416 5.963
ENSMUSG00000050824 1.409 0.2697 3.485 0.8076 1.431
SRX033472 SRX033473 SRX033474 SRX033475 SRX033491
ENSMUSG00000020912 0.5037 0.4226 1.0805 0.2802 1.0995
ENSMUSG00000050141 0.2518 0.0000 0.0000 0.2802 0.8246
ENSMUSG00000035775 0.7555 0.0000 0.2701 0.2802 0.6872
ENSMUSG00000024248 2.2666 1.2677 1.8908 3.3618 3.1610
ENSMUSG00000054354 0.0000 0.0000 0.0000 0.0000 0.5497
ENSMUSG00000050824 19.6437 24.2975 12.6957 15.6885 19.6531
SRX033484 SRX033492 SRX033485 SRX033493 SRX033486
ENSMUSG00000020912 0.0000 0.7027 0.8651 0.1362 0.0000
ENSMUSG00000050141 0.0000 0.1405 0.0000 0.1362 0.2578
ENSMUSG00000035775 0.6784 0.1405 0.5767 0.1362 1.8045
ENSMUSG00000024248 1.5829 1.6864 1.7302 1.7712 2.5779
ENSMUSG00000054354 0.0000 0.0000 0.0000 0.0000 0.0000
ENSMUSG00000050824 19.2209 13.3505 12.1117 14.5780 13.1473
SRX033494
ENSMUSG00000020912 0.0000
ENSMUSG00000050141 0.1477
ENSMUSG00000035775 0.5907
ENSMUSG00000024248 3.1012
ENSMUSG00000054354 0.0000
ENSMUSG00000050824 26.4336
Interesting genes with BH FDR < 0.05:
-1 0 1
591 12762 579
edgeR.hits <- do.edgeR(subset(dat, apply(dat, 1, function(x) !any(x == 0))))
Calculating library sizes from column totals.
Disp = 0.03704 , BCV = 0.1925
Top genes:Coefficient: group2
logFC logCPM LR PValue FDR
ENSMUSG00000024248 -3.164 3.412 274.4 1.216e-61 1.119e-57
ENSMUSG00000050824 3.724 3.304 246.2 1.759e-55 8.090e-52
ENSMUSG00000015484 -1.969 4.276 236.1 2.828e-53 8.668e-50
ENSMUSG00000023236 1.426 7.067 218.0 2.503e-49 5.753e-46
ENSMUSG00000030532 1.547 5.568 214.1 1.762e-48 3.241e-45
ENSMUSG00000028393 1.803 6.106 211.1 7.905e-48 1.212e-44
ENSMUSG00000015852 -2.371 3.068 191.8 1.264e-43 1.661e-40
ENSMUSG00000066800 -2.328 3.467 176.8 2.402e-40 2.761e-37
ENSMUSG00000026697 -3.725 3.097 175.1 5.630e-40 5.753e-37
ENSMUSG00000072572 -2.016 3.661 170.7 5.289e-39 4.864e-36
Number of genes with FDR < 0.01:[1] 533
Interesting genes: SRX033480 SRX033488 SRX033481 SRX033489 SRX033482
ENSMUSG00000024248 14.8221 26.0799 17.323 23.57 18.5943
ENSMUSG00000050824 0.6588 0.6361 1.843 1.53 0.6641
SRX033490 SRX033483 SRX033476 SRX033478 SRX033479
ENSMUSG00000024248 25.834 18.9053 24.832 14.3890 14.333
ENSMUSG00000050824 1.412 0.2701 3.492 0.8106 1.433
SRX033472 SRX033473 SRX033474 SRX033475 SRX033491
ENSMUSG00000024248 2.27 1.27 1.894 3.367 3.166
ENSMUSG00000050824 19.68 24.33 12.715 15.711 19.683
SRX033484 SRX033492 SRX033485 SRX033493 SRX033486
ENSMUSG00000024248 1.586 1.689 1.733 1.775 2.582
ENSMUSG00000050824 19.256 13.370 12.132 14.606 13.167
SRX033494
ENSMUSG00000024248 3.108
ENSMUSG00000050824 26.490
Interesting genes with BH FDR < 0.05:
-1 0 1
474 8249 473
# DESeq
library(DESeq)
Loading required package: Biobase
Loading required package: BiocGenerics
Attaching package: 'BiocGenerics'
The following object(s) are masked from 'package:stats':
xtabs
The following object(s) are masked from 'package:base':
anyDuplicated, cbind, colnames, duplicated, eval, Filter, Find, get,
intersect, lapply, Map, mapply, mget, order, paste, pmax, pmax.int, pmin,
pmin.int, Position, rbind, Reduce, rep.int, rownames, sapply, setdiff,
table, tapply, union, unique
Welcome to Bioconductor
Vignettes contain introductory material; view with 'browseVignettes()'. To
cite Bioconductor, see 'citation("Biobase")', and for packages
'citation("pkgname")'.
Loading required package: locfit
locfit 1.5-8 2012-04-25
Loading required package: lattice
Attaching package: 'DESeq'
The following object(s) are masked from 'package:limma':
plotMA
deSeqDat <- newCountDataSet(dat, group)
deSeqDat <- estimateSizeFactors(deSeqDat)
deSeqDat <- estimateDispersions(deSeqDat)
plotDispEsts(deSeqDat)
deseq.results <- nbinomTest(deSeqDat, levels(group)[1], levels(group)[2])
plotMA(deseq.results)
# Voom & limma
library(limma)
dat.voomed <- voom(dat, design)
fit <- eBayes(lmFit(dat.voomed, design))
# Take Home Problem
# Choose a specific threshold for the adjusted p value, find the genes
# identified as differentially expressed using each of edgeR, DESeq and
# voom+limma. Compare the number of genes in these 3 lists, and draw a venn
# digram demonstrating the overlap (if any!).
edgeR.hits <- do.edgeR(dat)
Calculating library sizes from column totals.
Disp = 0.03893 , BCV = 0.1973
Top genes:Coefficient: group2
logFC logCPM LR PValue FDR
ENSMUSG00000020912 -5.248 3.090 407.6 1.230e-90 4.495e-86
ENSMUSG00000050141 -5.491 2.200 309.5 2.743e-69 5.010e-65
ENSMUSG00000035775 -4.599 2.583 302.6 9.121e-68 1.111e-63
ENSMUSG00000024248 -3.164 3.409 271.5 5.301e-61 4.842e-57
ENSMUSG00000054354 -6.623 1.790 257.2 6.990e-58 5.108e-54
ENSMUSG00000050824 3.725 3.301 243.1 8.382e-55 5.104e-51
ENSMUSG00000015484 -1.968 4.273 230.4 4.843e-52 2.528e-48
ENSMUSG00000030532 1.548 5.565 219.6 1.123e-49 5.128e-46
ENSMUSG00000028393 1.803 6.103 214.5 1.444e-48 5.860e-45
ENSMUSG00000023236 1.426 7.064 207.5 4.920e-47 1.798e-43
Number of genes with FDR < 0.01:[1] 566
Interesting genes: SRX033480 SRX033488 SRX033481 SRX033489 SRX033482
ENSMUSG00000020912 20.3928 12.6910 15.83 14.819 19.2296
ENSMUSG00000050141 14.1434 10.1528 10.67 6.264 9.9464
ENSMUSG00000035775 14.4723 10.6287 19.51 11.305 12.9303
ENSMUSG00000024248 14.8012 26.0166 17.30 23.527 18.5665
ENSMUSG00000054354 10.8542 7.6146 11.41 8.708 8.2886
ENSMUSG00000050824 0.6578 0.6346 1.84 1.528 0.6631
SRX033490 SRX033483 SRX033476 SRX033478 SRX033479
ENSMUSG00000020912 17.330 26.4301 13.359 23.6211 14.310
ENSMUSG00000050141 7.890 8.3605 6.583 11.9115 12.402
ENSMUSG00000035775 12.540 12.1363 6.389 13.5266 11.210
ENSMUSG00000024248 25.784 18.8786 24.781 14.3342 14.310
ENSMUSG00000054354 7.608 7.0121 3.291 4.4416 5.963
ENSMUSG00000050824 1.409 0.2697 3.485 0.8076 1.431
SRX033472 SRX033473 SRX033474 SRX033475 SRX033491
ENSMUSG00000020912 0.5037 0.4226 1.0805 0.2802 1.0995
ENSMUSG00000050141 0.2518 0.0000 0.0000 0.2802 0.8246
ENSMUSG00000035775 0.7555 0.0000 0.2701 0.2802 0.6872
ENSMUSG00000024248 2.2666 1.2677 1.8908 3.3618 3.1610
ENSMUSG00000054354 0.0000 0.0000 0.0000 0.0000 0.5497
ENSMUSG00000050824 19.6437 24.2975 12.6957 15.6885 19.6531
SRX033484 SRX033492 SRX033485 SRX033493 SRX033486
ENSMUSG00000020912 0.0000 0.7027 0.8651 0.1362 0.0000
ENSMUSG00000050141 0.0000 0.1405 0.0000 0.1362 0.2578
ENSMUSG00000035775 0.6784 0.1405 0.5767 0.1362 1.8045
ENSMUSG00000024248 1.5829 1.6864 1.7302 1.7712 2.5779
ENSMUSG00000054354 0.0000 0.0000 0.0000 0.0000 0.0000
ENSMUSG00000050824 19.2209 13.3505 12.1117 14.5780 13.1473
SRX033494
ENSMUSG00000020912 0.0000
ENSMUSG00000050141 0.1477
ENSMUSG00000035775 0.5907
ENSMUSG00000024248 3.1012
ENSMUSG00000054354 0.0000
ENSMUSG00000050824 26.4336
Interesting genes with BH FDR < 0.05:
-1 0 1
437 35698 401
table(deseq.results$padj < 0.05)
FALSE TRUE
13353 579
voom.hits <- topTable(fit, coef='group2', num=Inf, sort.by='none')
table(voom.hits$adj.P.Val < 0.05)
FALSE TRUE
35663 873
results <- data.frame(
edgeR = edgeR.hits != 0,
DESeq = deseq.results$padj < 0.05,
voom.limma = voom.hits$adj.P.Val < 0.05)
table(results)
, , voom.limma = FALSE
DESeq
edgeR FALSE TRUE
FALSE 12943 8
TRUE 92 16
, , voom.limma = TRUE
DESeq
edgeR FALSE TRUE
FALSE 141 2
TRUE 177 553
vennDiagram(results)