Abrar — Feb 27, 2014, 11:36 PM
# Seminar 07
# Created by: Abrar Wafa
# Date: Feb.27, 2013
library(edgeR)
Loading required package: limma
library(DESeq)
Loading required package: BiocGenerics
Loading required package: parallel
Attaching package: 'BiocGenerics'
The following objects are masked from 'package:parallel':
clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
clusterExport, clusterMap, parApply, parCapply, parLapply,
parLapplyLB, parRapply, parSapply, parSapplyLB
The following object is masked from 'package:limma':
plotMA
The following object is masked from 'package:stats':
xtabs
The following objects are masked from 'package:base':
anyDuplicated, append, as.data.frame, as.vector, cbind,
colnames, duplicated, eval, evalq, Filter, Find, get,
intersect, is.unsorted, lapply, Map, mapply, match, mget,
order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
rbind, Reduce, rep.int, rownames, sapply, setdiff, sort,
table, tapply, union, unique, unlist
Loading required package: Biobase
Welcome to Bioconductor
Vignettes contain introductory material; view with
'browseVignettes()'. To cite Bioconductor, see
'citation("Biobase")', and for packages 'citation("pkgname")'.
Loading required package: locfit
locfit 1.5-9.1 2013-03-22
Loading required package: lattice
Welcome to 'DESeq'. For improved performance, usability and
functionality, please consider migrating to 'DESeq2'.
library(limma)
library(gplots)
KernSmooth 2.23 loaded
Copyright M. P. Wand 1997-2009
Attaching package: 'gplots'
The following object is masked from 'package:stats':
lowess
dat <- read.table("../Data_RNAseq/bottomly_count_table.tsv", header = TRUE, row.names = 1)
des <- read.table("../Data_RNAseq/bottomly_phenodata.tsv", header = TRUE, row.names = 1)
############################################################################################
# edgeR
############################################################################################
# Mini exercise:
all(rownames(des) == colnames(dat))
[1] TRUE
# remove any gene that has count equal to zero across all samples
filDat <- dat[(rowSums(dat) !=0),]
# remove any gene that has count equal to zero in at least one sample in each genotype group
selrow <- apply(filDat, 1, function(row) all(row[1:10]!=0) | all(row[11:21]!=0))
fdat <- filDat [selrow, ]
group <- factor(c(rep("1", 10), rep("2", 11)))
group
[1] 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
Levels: 1 2
dge.glm <- DGEList(counts = fdat, group = group)
str(dge.glm)
Formal class 'DGEList' [package "edgeR"] with 1 slots
..@ .Data:List of 2
.. ..$ : int [1:9973, 1:21] 369 0 21 15 517 273 11 19 19 97 ...
.. .. ..- attr(*, "dimnames")=List of 2
.. .. .. ..$ : chr [1:9973] "ENSMUSG00000000001" "ENSMUSG00000000028" "ENSMUSG00000000056" "ENSMUSG00000000058" ...
.. .. .. ..$ : chr [1:21] "SRX033480" "SRX033488" "SRX033481" "SRX033489" ...
.. ..$ :'data.frame': 21 obs. of 3 variables:
.. .. ..$ group : Factor w/ 2 levels "1","2": 1 1 1 1 1 1 1 1 1 1 ...
.. .. ..$ lib.size : num [1:21] 3038489 6299265 2715366 6541434 3014237 ...
.. .. ..$ norm.factors: num [1:21] 1 1 1 1 1 1 1 1 1 1 ...
names(dge.glm)
[1] "counts" "samples"
dge.glm[["samples"]]
group lib.size norm.factors
SRX033480 1 3038489 1
SRX033488 1 6299265 1
SRX033481 1 2715366 1
SRX033489 1 6541434 1
SRX033482 1 3014237 1
SRX033490 1 7091978 1
SRX033483 1 3705596 1
SRX033476 1 5161116 1
SRX033478 1 4949177 1
SRX033479 1 4189929 1
SRX033472 2 3967975 1
SRX033473 2 4729997 1
SRX033474 2 3699671 1
SRX033475 2 3567255 1
SRX033491 2 7271072 1
SRX033484 2 4418407 1
SRX033492 2 7111173 1
SRX033485 2 3465132 1
SRX033493 2 7334966 1
SRX033486 2 3876563 1
SRX033494 2 6767129 1
nrow(dge.glm[[1]])
[1] 9973
ncol(dge.glm[[2]])
[1] 3
design <- model.matrix(~group)
design
(Intercept) group2
1 1 0
2 1 0
3 1 0
4 1 0
5 1 0
6 1 0
7 1 0
8 1 0
9 1 0
10 1 0
11 1 1
12 1 1
13 1 1
14 1 1
15 1 1
16 1 1
17 1 1
18 1 1
19 1 1
20 1 1
21 1 1
attr(,"assign")
[1] 0 1
attr(,"contrasts")
attr(,"contrasts")$group
[1] "contr.treatment"
dge.glm.com.disp <- estimateGLMCommonDisp(dge.glm, design, verbose = TRUE)
Disp = 0.0383 , BCV = 0.1957
dge.glm.trend.disp <- estimateGLMTrendedDisp(dge.glm.com.disp)
Loading required package: splines
dge.glm.tag.disp <- estimateGLMTagwiseDisp(dge.glm.trend.disp, design)
# plot the tagwise dispersion against log2-CPM (counts per million)
plotBCV(dge.glm.tag.disp)
fit <- glmFit(dge.glm.tag.disp, design)
colnames(coef(fit))
[1] "(Intercept)" "group2"
lrt <- glmLRT(fit, coef = 2)
topTags(lrt)
Coefficient: group2
logFC logCPM LR PValue FDR
ENSMUSG00000020912 -5.187 3.156 407.7 1.182e-90 1.179e-86
ENSMUSG00000050141 -5.363 2.320 311.5 1.045e-69 5.213e-66
ENSMUSG00000035775 -4.543 2.675 299.1 5.219e-67 1.735e-63
ENSMUSG00000015484 -1.968 4.308 285.7 4.282e-64 1.068e-60
ENSMUSG00000024248 -3.152 3.464 280.6 5.618e-63 1.121e-59
ENSMUSG00000030532 1.547 5.573 263.3 3.291e-59 5.470e-56
ENSMUSG00000023236 1.426 7.068 248.5 5.419e-56 7.720e-53
ENSMUSG00000054354 -6.284 1.941 245.3 2.743e-55 3.419e-52
ENSMUSG00000050824 3.705 3.366 232.7 1.545e-52 1.712e-49
ENSMUSG00000015852 -2.362 3.139 217.0 4.109e-49 4.097e-46
tt.glm <- topTags(lrt, n = Inf)
class(tt.glm)
[1] "TopTags"
attr(,"package")
[1] "edgeR"
nrow(tt.glm$table[tt.glm$table$FDR < 0.01, ])
[1] 731
interestingSamples <- rownames(tt.glm$table[tt.glm$table$FDR < 1e-50, ])
cpm(dge.glm.tag.disp)[interestingSamples, ]
SRX033480 SRX033488 SRX033481 SRX033489 SRX033482
ENSMUSG00000020912 20.40 12.70 15.84 14.829 19.242
ENSMUSG00000050141 14.15 10.16 10.68 6.268 9.953
ENSMUSG00000035775 14.48 10.64 19.52 11.313 12.939
ENSMUSG00000015484 26.99 33.97 32.78 36.231 31.517
ENSMUSG00000024248 14.81 26.03 17.31 23.542 18.578
ENSMUSG00000030532 27.97 24.92 21.36 25.377 18.578
ENSMUSG00000023236 65.49 78.74 72.55 72.461 76.636
ENSMUSG00000054354 10.86 7.62 11.42 8.714 8.294
SRX033490 SRX033483 SRX033476 SRX033478 SRX033479
ENSMUSG00000020912 17.344 26.446 13.369 23.640 14.320
ENSMUSG00000050141 7.896 8.366 6.588 11.921 12.411
ENSMUSG00000035775 12.549 12.144 6.394 13.538 11.217
ENSMUSG00000015484 33.418 30.764 34.682 26.873 31.504
ENSMUSG00000024248 25.804 18.890 24.801 14.346 14.320
ENSMUSG00000030532 26.791 26.446 19.182 23.236 20.048
ENSMUSG00000023236 80.937 83.387 55.221 57.787 66.350
ENSMUSG00000054354 7.614 7.016 3.294 4.445 5.967
SRX033472 SRX033473 SRX033474 SRX033475 SRX033491
ENSMUSG00000020912 0.5040 0.4228 1.0812 0.2803 1.1003
ENSMUSG00000050141 0.2520 0.0000 0.0000 0.2803 0.8252
ENSMUSG00000035775 0.7561 0.0000 0.2703 0.2803 0.6877
ENSMUSG00000015484 7.0565 4.8626 9.1900 7.8492 8.5269
ENSMUSG00000024248 2.2682 1.2685 1.8921 3.3639 3.1632
ENSMUSG00000030532 69.8089 70.6132 63.2489 63.0737 73.8543
ENSMUSG00000023236 195.5657 183.7210 176.2319 156.4228 207.1221
ENSMUSG00000054354 0.0000 0.0000 0.0000 0.0000 0.5501
SRX033484 SRX033492 SRX033485 SRX033493 SRX033486
ENSMUSG00000020912 0.000 0.7031 0.8658 0.1363 0.000
ENSMUSG00000050141 0.000 0.1406 0.0000 0.1363 0.258
ENSMUSG00000035775 0.679 0.1406 0.5772 0.1363 1.806
ENSMUSG00000015484 9.506 8.4374 8.9463 6.6803 10.318
ENSMUSG00000024248 1.584 1.6875 1.7315 1.7723 2.580
ENSMUSG00000030532 70.161 70.7338 62.6239 78.6643 59.331
ENSMUSG00000023236 178.118 192.9358 207.2071 201.7733 204.821
ENSMUSG00000054354 0.000 0.0000 0.0000 0.0000 0.000
SRX033494
ENSMUSG00000020912 0.0000
ENSMUSG00000050141 0.1478
ENSMUSG00000035775 0.5911
ENSMUSG00000015484 8.8664
ENSMUSG00000024248 3.1032
ENSMUSG00000030532 71.3744
ENSMUSG00000023236 192.8440
ENSMUSG00000054354 0.0000
summary(de.glm <- decideTestsDGE(lrt, p = 0.05, adjust = "BH"))
[,1]
-1 588
0 8773
1 612
tags.glm <- rownames(dge.glm.tag.disp)[as.logical(de.glm)]
plotSmear(lrt, de.tags = tags.glm)
abline(h = c(-2, 2), col = "blue")
############################################################################################
# DESeq
############################################################################################
deSeqDat <- newCountDataSet(dat, group)
head(counts(deSeqDat))
SRX033480 SRX033488 SRX033481 SRX033489 SRX033482
ENSMUSG00000000001 369 744 287 769 348
ENSMUSG00000000003 0 0 0 0 0
ENSMUSG00000000028 0 1 0 1 1
ENSMUSG00000000031 0 0 0 0 0
ENSMUSG00000000037 0 1 1 5 0
ENSMUSG00000000049 0 1 0 1 0
SRX033490 SRX033483 SRX033476 SRX033478 SRX033479
ENSMUSG00000000001 803 433 469 585 321
ENSMUSG00000000003 0 0 0 0 0
ENSMUSG00000000028 1 0 7 6 1
ENSMUSG00000000031 0 0 0 0 0
ENSMUSG00000000037 4 0 0 0 0
ENSMUSG00000000049 0 0 0 0 0
SRX033472 SRX033473 SRX033474 SRX033475 SRX033491
ENSMUSG00000000001 301 461 309 374 781
ENSMUSG00000000003 0 0 0 0 0
ENSMUSG00000000028 1 1 1 1 1
ENSMUSG00000000031 0 0 0 0 0
ENSMUSG00000000037 4 1 1 0 1
ENSMUSG00000000049 0 0 0 0 0
SRX033484 SRX033492 SRX033485 SRX033493 SRX033486
ENSMUSG00000000001 555 820 294 758 419
ENSMUSG00000000003 0 0 0 0 0
ENSMUSG00000000028 2 1 1 4 1
ENSMUSG00000000031 0 0 0 0 0
ENSMUSG00000000037 2 1 1 1 1
ENSMUSG00000000049 0 0 0 0 0
SRX033494
ENSMUSG00000000001 857
ENSMUSG00000000003 0
ENSMUSG00000000028 5
ENSMUSG00000000031 0
ENSMUSG00000000037 2
ENSMUSG00000000049 0
deSeqDat <- estimateSizeFactors(deSeqDat)
sizeFactors(deSeqDat)
SRX033480 SRX033488 SRX033481 SRX033489 SRX033482 SRX033490 SRX033483
0.6439 1.3454 0.5785 1.4295 0.6355 1.5240 0.7933
SRX033476 SRX033478 SRX033479 SRX033472 SRX033473 SRX033474 SRX033475
1.1272 1.0772 0.8984 0.8886 1.0255 0.7987 0.7796
SRX033491 SRX033484 SRX033492 SRX033485 SRX033493 SRX033486 SRX033494
1.6162 0.9882 1.5720 0.7558 1.5922 0.8264 1.4715
deSeqDat <- estimateDispersions(deSeqDat)
# plotting the estimated dispersions against the mean normalized counts
plotDispEsts(deSeqDat)
results <- nbinomTest(deSeqDat, levels(group)[1], levels(group)[2])
str(results)
'data.frame': 36536 obs. of 8 variables:
$ id : chr "ENSMUSG00000000001" "ENSMUSG00000000003" "ENSMUSG00000000028" "ENSMUSG00000000031" ...
$ baseMean : num 489.18 0 1.57 0 1.1 ...
$ baseMeanA : num 509.685 0 1.657 0 0.859 ...
$ baseMeanB : num 470.53 0 1.49 0 1.32 ...
$ foldChange : num 0.923 NaN 0.898 NaN 1.537 ...
$ log2FoldChange: num -0.115 NaN -0.156 NaN 0.62 ...
$ pval : num 0.498 NA 0.829 NA 0.968 ...
$ padj : num 1 NA 1 NA 1 ...
nrow(results[results$padj<1e-4,])
[1] 22807
plotMA(results)
############################################################################################
# Voom & limma
############################################################################################
norm.factor <- calcNormFactors(dat)
dat.voomed <- voom(dat, design, plot = TRUE, lib.size = colSums(dat) * norm.factor)
vfit <- lmFit(dat.voomed, design)
vfit <- eBayes(vfit)
vhits <- topTable(vfit)
str(vhits)
'data.frame': 10 obs. of 6 variables:
$ X.Intercept.: num 12.5 12.3 12.3 10.7 10.3 ...
$ group2 : num -0.014 -0.0865 -0.2375 -0.048 0.0207 ...
$ AveExpr : num 12.5 12.2 12.2 10.7 10.3 ...
$ F : num 154234 138459 134826 133973 121795 ...
$ P.Value : num 2.58e-55 1.10e-54 1.58e-54 1.72e-54 6.18e-54 ...
$ adj.P.Val : num 9.44e-51 1.57e-50 1.57e-50 1.57e-50 4.42e-50 ...
############################################################################################
# Take Home Problem
############################################################################################
# edgR
dge.glm <- DGEList(counts = dat, group = group)
dge.glm[["samples"]]
group lib.size norm.factors
SRX033480 1 3040296 1
SRX033488 1 6303665 1
SRX033481 1 2717092 1
SRX033489 1 6545795 1
SRX033482 1 3016179 1
SRX033490 1 7097379 1
SRX033483 1 3707895 1
SRX033476 1 5165144 1
SRX033478 1 4953201 1
SRX033479 1 4192872 1
SRX033472 2 3970729 1
SRX033473 2 4733003 1
SRX033474 2 3702051 1
SRX033475 2 3569483 1
SRX033491 2 7276198 1
SRX033484 2 4422272 1
SRX033492 2 7115851 1
SRX033485 2 3467730 1
SRX033493 2 7339817 1
SRX033486 2 3879114 1
SRX033494 2 6771680 1
design <- model.matrix(~group)
dge.glm.com.disp <- estimateGLMCommonDisp(dge.glm, design, verbose = TRUE)
Disp = 0.03893 , BCV = 0.1973
dge.glm.trend.disp <- estimateGLMTrendedDisp(dge.glm.com.disp)
dge.glm.tag.disp <- estimateGLMTagwiseDisp(dge.glm.trend.disp, design)
edgfit <- glmFit(dge.glm.tag.disp, design)
lrt <- glmLRT(edgfit, coef = 2)
topTags(lrt, adjust.method="BH")
Coefficient: group2
logFC logCPM LR PValue FDR
ENSMUSG00000020912 -5.187 3.155 403.2 1.105e-89 4.037e-85
ENSMUSG00000050141 -5.363 2.319 311.6 9.670e-70 1.767e-65
ENSMUSG00000035775 -4.543 2.674 298.7 6.232e-67 7.589e-63
ENSMUSG00000015484 -1.968 4.307 281.8 3.049e-63 2.785e-59
ENSMUSG00000024248 -3.152 3.463 277.6 2.489e-62 1.819e-58
ENSMUSG00000030532 1.547 5.572 268.9 1.961e-60 1.194e-56
ENSMUSG00000054354 -6.283 1.940 248.7 4.924e-56 2.570e-52
ENSMUSG00000023236 1.426 7.067 244.9 3.292e-55 1.503e-51
ENSMUSG00000050824 3.705 3.365 230.0 6.054e-52 2.458e-48
ENSMUSG00000015852 -2.362 3.138 214.8 1.212e-48 4.051e-45
tt.glm <- topTags(lrt, n = Inf)
class(tt.glm)
[1] "TopTags"
attr(,"package")
[1] "edgeR"
nrow(ehits <- tt.glm$table[tt.glm$table$FDR < 1e-4, ])
[1] 294
#deSeq
deSeqDat <- newCountDataSet(dat, group)
head(counts(deSeqDat))
SRX033480 SRX033488 SRX033481 SRX033489 SRX033482
ENSMUSG00000000001 369 744 287 769 348
ENSMUSG00000000003 0 0 0 0 0
ENSMUSG00000000028 0 1 0 1 1
ENSMUSG00000000031 0 0 0 0 0
ENSMUSG00000000037 0 1 1 5 0
ENSMUSG00000000049 0 1 0 1 0
SRX033490 SRX033483 SRX033476 SRX033478 SRX033479
ENSMUSG00000000001 803 433 469 585 321
ENSMUSG00000000003 0 0 0 0 0
ENSMUSG00000000028 1 0 7 6 1
ENSMUSG00000000031 0 0 0 0 0
ENSMUSG00000000037 4 0 0 0 0
ENSMUSG00000000049 0 0 0 0 0
SRX033472 SRX033473 SRX033474 SRX033475 SRX033491
ENSMUSG00000000001 301 461 309 374 781
ENSMUSG00000000003 0 0 0 0 0
ENSMUSG00000000028 1 1 1 1 1
ENSMUSG00000000031 0 0 0 0 0
ENSMUSG00000000037 4 1 1 0 1
ENSMUSG00000000049 0 0 0 0 0
SRX033484 SRX033492 SRX033485 SRX033493 SRX033486
ENSMUSG00000000001 555 820 294 758 419
ENSMUSG00000000003 0 0 0 0 0
ENSMUSG00000000028 2 1 1 4 1
ENSMUSG00000000031 0 0 0 0 0
ENSMUSG00000000037 2 1 1 1 1
ENSMUSG00000000049 0 0 0 0 0
SRX033494
ENSMUSG00000000001 857
ENSMUSG00000000003 0
ENSMUSG00000000028 5
ENSMUSG00000000031 0
ENSMUSG00000000037 2
ENSMUSG00000000049 0
deSeqDat <- estimateSizeFactors(deSeqDat)
sizeFactors(deSeqDat)
SRX033480 SRX033488 SRX033481 SRX033489 SRX033482 SRX033490 SRX033483
0.6439 1.3454 0.5785 1.4295 0.6355 1.5240 0.7933
SRX033476 SRX033478 SRX033479 SRX033472 SRX033473 SRX033474 SRX033475
1.1272 1.0772 0.8984 0.8886 1.0255 0.7987 0.7796
SRX033491 SRX033484 SRX033492 SRX033485 SRX033493 SRX033486 SRX033494
1.6162 0.9882 1.5720 0.7558 1.5922 0.8264 1.4715
deSeqDat <- estimateDispersions(deSeqDat)
results <- nbinomTest(deSeqDat, levels(group)[1], levels(group)[2])
dhits <- (results[results$padj<1e-4,])
dhits <- na.exclude(dhits)
rownames(dhits) <- dhits$id
# Voom & limma
vhits <- topTable(vfit, coef= 2, p.value = 1e-4, number = nrow(dat))
nrow(vhits)
[1] 210
diagram <- list(rownames(ehits), rownames(dhits), rownames(vhits))
venn(diagram)