After loading the required libraries we import the dataset.
# Import data
dat <- read.table("bottomly_count_table.tsv", header = TRUE, row.names = 1)
des <- read.table("bottomly_phenodata.tsv", header = TRUE, row.names = 1)
str(dat)
## 'data.frame': 36536 obs. of 21 variables:
## $ SRX033480: int 369 0 0 0 0 0 21 15 517 0 ...
## $ SRX033488: int 744 0 1 0 1 1 46 43 874 0 ...
## $ SRX033481: int 287 0 0 0 1 0 20 12 340 0 ...
## $ SRX033489: int 769 0 1 0 5 1 36 34 813 0 ...
## $ SRX033482: int 348 0 1 0 0 0 12 14 378 0 ...
## $ SRX033490: int 803 0 1 0 4 0 55 32 860 0 ...
## $ SRX033483: int 433 0 0 0 0 0 27 19 528 0 ...
## $ SRX033476: int 469 0 7 0 0 0 44 18 401 0 ...
## $ SRX033478: int 585 0 6 0 0 0 32 44 584 0 ...
## $ SRX033479: int 321 0 1 0 0 0 47 22 401 0 ...
## $ SRX033472: int 301 0 1 0 4 0 40 17 331 0 ...
## $ SRX033473: int 461 0 1 0 1 0 40 24 431 0 ...
## $ SRX033474: int 309 0 1 0 1 0 30 29 341 0 ...
## $ SRX033475: int 374 0 1 0 0 0 27 15 480 0 ...
## $ SRX033491: int 781 0 1 0 1 0 46 34 930 0 ...
## $ SRX033484: int 555 0 2 0 2 0 28 23 585 0 ...
## $ SRX033492: int 820 0 1 0 1 0 40 38 1137 0 ...
## $ SRX033485: int 294 0 1 0 1 0 21 17 490 0 ...
## $ SRX033493: int 758 0 4 0 1 0 52 29 1079 0 ...
## $ SRX033486: int 419 0 1 0 1 0 27 12 565 0 ...
## $ SRX033494: int 857 0 5 0 2 0 45 28 726 0 ...
str(des)
## 'data.frame': 21 obs. of 4 variables:
## $ num.tech.reps : int 1 1 1 1 1 1 1 1 1 1 ...
## $ strain : Factor w/ 2 levels "C57BL/6J","DBA/2J": 1 1 1 1 1 1 1 1 1 1 ...
## $ experiment.number: int 6 7 6 7 6 7 6 4 4 4 ...
## $ lane.number : int 1 1 2 2 3 3 5 6 7 8 ...
GLM edge R
with(des, table(strain))
## strain
## C57BL/6J DBA/2J
## 10 11
group <- factor(c(rep("1", 10), rep("2", 11)))
group
## [1] 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
## Levels: 1 2
dge.glm <- DGEList(counts = dat, group = group)
names(dge.glm)
## [1] "counts" "samples"
dge.glm[["samples"]]
## group lib.size norm.factors
## SRX033480 1 3040296 1
## SRX033488 1 6303665 1
## SRX033481 1 2717092 1
## SRX033489 1 6545795 1
## SRX033482 1 3016179 1
## SRX033490 1 7097379 1
## SRX033483 1 3707895 1
## SRX033476 1 5165144 1
## SRX033478 1 4953201 1
## SRX033479 1 4192872 1
## SRX033472 2 3970729 1
## SRX033473 2 4733003 1
## SRX033474 2 3702051 1
## SRX033475 2 3569483 1
## SRX033491 2 7276198 1
## SRX033484 2 4422272 1
## SRX033492 2 7115851 1
## SRX033485 2 3467730 1
## SRX033493 2 7339817 1
## SRX033486 2 3879114 1
## SRX033494 2 6771680 1
design <- model.matrix(~group)
design
## (Intercept) group2
## 1 1 0
## 2 1 0
## 3 1 0
## 4 1 0
## 5 1 0
## 6 1 0
## 7 1 0
## 8 1 0
## 9 1 0
## 10 1 0
## 11 1 1
## 12 1 1
## 13 1 1
## 14 1 1
## 15 1 1
## 16 1 1
## 17 1 1
## 18 1 1
## 19 1 1
## 20 1 1
## 21 1 1
## attr(,"assign")
## [1] 0 1
## attr(,"contrasts")
## attr(,"contrasts")$group
## [1] "contr.treatment"
dge.glm.com.disp <- estimateGLMCommonDisp(dge.glm, design, verbose = TRUE)
## Disp = 0.03893 , BCV = 0.1973
dge.glm.trend.disp <- estimateGLMTrendedDisp(dge.glm.com.disp)
## Loading required package: splines
dge.glm.tag.disp <- estimateGLMTagwiseDisp(dge.glm.trend.disp, design)
Plot the tagwise dispersion against log2-CPM (counts per million):
plotBCV(dge.glm.tag.disp)
fit <- glmFit(dge.glm.tag.disp, design)
colnames(coef(fit))
## [1] "(Intercept)" "group2"
lrt <- glmLRT(fit, coef = 2)
topTags(lrt)
## Coefficient: group2
## logFC logCPM LR PValue FDR
## ENSMUSG00000020912 -5.187 3.155 403.2 1.105e-89 4.037e-85
## ENSMUSG00000050141 -5.363 2.319 311.6 9.670e-70 1.767e-65
## ENSMUSG00000035775 -4.543 2.674 298.7 6.232e-67 7.589e-63
## ENSMUSG00000015484 -1.968 4.307 281.8 3.049e-63 2.785e-59
## ENSMUSG00000024248 -3.152 3.463 277.6 2.489e-62 1.819e-58
## ENSMUSG00000030532 1.547 5.572 268.9 1.961e-60 1.194e-56
## ENSMUSG00000054354 -6.283 1.940 248.7 4.924e-56 2.570e-52
## ENSMUSG00000023236 1.426 7.067 244.9 3.292e-55 1.503e-51
## ENSMUSG00000050824 3.705 3.365 230.0 6.054e-52 2.458e-48
## ENSMUSG00000015852 -2.362 3.138 214.8 1.212e-48 4.051e-45
tt.glm <- topTags(lrt, n = Inf)
class(tt.glm)
## [1] "TopTags"
## attr(,"package")
## [1] "edgeR"
nrow(tt.glm$table[tt.glm$table$FDR < 0.01, ])
## [1] 600
interestingSamples <- rownames(tt.glm$table[tt.glm$table$FDR < 1e-50, ])
summary(de.glm <- decideTestsDGE(lrt, p = 0.05, adjust = "BH"))
## [,1]
## -1 451
## 0 35660
## 1 425
Plotting the tagwise log fold changes against log-cpm:
tags.glm <- rownames(dge.glm.tag.disp)[as.logical(de.glm)]
plotSmear(lrt, de.tags = tags.glm)
abline(h = c(-2, 2), col = "blue")
Mini exercise
First we delete count equal tot 0 across all sample, then we delete count equal to 0 in at least one sample in each genetype group. The resulting dataset is call final:
new1 <- dat[rowSums(dat) > 0, ]
group1 <- subset(des, strain == "C57BL/6J")
group2 <- subset(des, strain == "DBA/2J")
noZero <- apply(new1, 1, function(row) all(row[1:nrow(group1)] != 0) | all(row[nrow(group1) +
1:nrow(group2)]) != 0)
final <- new1[noZero, ]
Then we repeat the analysis:
dge.glm.new <- DGEList(counts = final, group = group)
dge.glm.com.disp.new <- estimateGLMCommonDisp(dge.glm.new, design, verbose = TRUE)
## Disp = 0.0383 , BCV = 0.1957
dge.glm.trend.disp.new <- estimateGLMTrendedDisp(dge.glm.com.disp.new)
dge.glm.tag.disp.new <- estimateGLMTagwiseDisp(dge.glm.trend.disp.new, design)
# plot the tagwise dispersion against log2-CPM (counts per million)
plotBCV(dge.glm.tag.disp.new)
fit.new <- glmFit(dge.glm.tag.disp.new, design)
lrt.new <- glmLRT(fit.new, coef = 2)
tt.glm.new <- topTags(lrt.new, n = Inf)
nrow(tt.glm.new$table[tt.glm.new$table$FDR < 0.01, ])
## [1] 731
731 genes have FDR less than 0.01
interestingSamples.new <- rownames(tt.glm.new$table[tt.glm.new$table$FDR < 1e-50,
])
summary(de.glm.new <- decideTestsDGE(lrt.new, p = 0.05, adjust = "BH"))
## [,1]
## -1 588
## 0 8773
## 1 612
588 genes are under-expressed in group 2 (DBA/2J) compared with group 1 (C57BL/6J), 8773 show no differences in expression while 612 genes are over-expressed.
Plotting the tagwise log fold changes against log-cpm:
tags.glm.new <- rownames(dge.glm.tag.disp.new)[as.logical(de.glm.new)]
plotSmear(lrt.new, de.tags = tags.glm.new)
abline(h = c(-2, 2), col = "blue")
DESeq
deSeqDat <- newCountDataSet(dat, group)
head(counts(deSeqDat))
## SRX033480 SRX033488 SRX033481 SRX033489 SRX033482
## ENSMUSG00000000001 369 744 287 769 348
## ENSMUSG00000000003 0 0 0 0 0
## ENSMUSG00000000028 0 1 0 1 1
## ENSMUSG00000000031 0 0 0 0 0
## ENSMUSG00000000037 0 1 1 5 0
## ENSMUSG00000000049 0 1 0 1 0
## SRX033490 SRX033483 SRX033476 SRX033478 SRX033479
## ENSMUSG00000000001 803 433 469 585 321
## ENSMUSG00000000003 0 0 0 0 0
## ENSMUSG00000000028 1 0 7 6 1
## ENSMUSG00000000031 0 0 0 0 0
## ENSMUSG00000000037 4 0 0 0 0
## ENSMUSG00000000049 0 0 0 0 0
## SRX033472 SRX033473 SRX033474 SRX033475 SRX033491
## ENSMUSG00000000001 301 461 309 374 781
## ENSMUSG00000000003 0 0 0 0 0
## ENSMUSG00000000028 1 1 1 1 1
## ENSMUSG00000000031 0 0 0 0 0
## ENSMUSG00000000037 4 1 1 0 1
## ENSMUSG00000000049 0 0 0 0 0
## SRX033484 SRX033492 SRX033485 SRX033493 SRX033486
## ENSMUSG00000000001 555 820 294 758 419
## ENSMUSG00000000003 0 0 0 0 0
## ENSMUSG00000000028 2 1 1 4 1
## ENSMUSG00000000031 0 0 0 0 0
## ENSMUSG00000000037 2 1 1 1 1
## ENSMUSG00000000049 0 0 0 0 0
## SRX033494
## ENSMUSG00000000001 857
## ENSMUSG00000000003 0
## ENSMUSG00000000028 5
## ENSMUSG00000000031 0
## ENSMUSG00000000037 2
## ENSMUSG00000000049 0
deSeqDat <- estimateSizeFactors(deSeqDat)
sizeFactors(deSeqDat)
## SRX033480 SRX033488 SRX033481 SRX033489 SRX033482 SRX033490 SRX033483
## 0.6439 1.3454 0.5785 1.4295 0.6355 1.5240 0.7933
## SRX033476 SRX033478 SRX033479 SRX033472 SRX033473 SRX033474 SRX033475
## 1.1272 1.0772 0.8984 0.8886 1.0255 0.7987 0.7796
## SRX033491 SRX033484 SRX033492 SRX033485 SRX033493 SRX033486 SRX033494
## 1.6162 0.9882 1.5720 0.7558 1.5922 0.8264 1.4715
deSeqDat <- estimateDispersions(deSeqDat)
Plotting the estimated dispersions against the mean normalized counts:
plotDispEsts(deSeqDat)
results <- nbinomTest(deSeqDat, levels(group)[1], levels(group)[2])
str(results)
## 'data.frame': 36536 obs. of 8 variables:
## $ id : chr "ENSMUSG00000000001" "ENSMUSG00000000003" "ENSMUSG00000000028" "ENSMUSG00000000031" ...
## $ baseMean : num 489.18 0 1.57 0 1.1 ...
## $ baseMeanA : num 509.685 0 1.657 0 0.859 ...
## $ baseMeanB : num 470.53 0 1.49 0 1.32 ...
## $ foldChange : num 0.923 NaN 0.898 NaN 1.537 ...
## $ log2FoldChange: num -0.115 NaN -0.156 NaN 0.62 ...
## $ pval : num 0.498 NA 0.829 NA 0.968 ...
## $ padj : num 1 NA 1 NA 1 ...
plotMA(results)
Voom and limma
norm.factor <- calcNormFactors(dat)
dat.voomed <- voom(dat, design, plot = TRUE, lib.size = colSums(dat) * norm.factor)
fit <- lmFit(dat.voomed, design)
fit <- eBayes(fit)
topTable(fit)
## ID X.Intercept. group2 AveExpr
## ENSMUSG00000025867 ENSMUSG00000025867 12.477 -0.013979 12.471
## ENSMUSG00000022892 ENSMUSG00000022892 12.268 -0.086479 12.223
## ENSMUSG00000037852 ENSMUSG00000037852 12.321 -0.237470 12.194
## ENSMUSG00000042700 ENSMUSG00000042700 10.716 -0.048029 10.692
## ENSMUSG00000029461 ENSMUSG00000029461 10.300 0.020715 10.311
## ENSMUSG00000020658 ENSMUSG00000020658 9.610 -0.019628 9.601
## ENSMUSG00000060261 ENSMUSG00000060261 9.469 -0.015743 9.461
## ENSMUSG00000032549 ENSMUSG00000032549 11.904 0.003545 11.905
## ENSMUSG00000024462 ENSMUSG00000024462 10.227 -0.138929 10.153
## ENSMUSG00000030102 ENSMUSG00000030102 12.085 -0.026149 12.073
## F P.Value adj.P.Val
## ENSMUSG00000025867 154234 2.584e-55 9.442e-51
## ENSMUSG00000022892 138459 1.103e-54 1.568e-50
## ENSMUSG00000037852 134826 1.576e-54 1.568e-50
## ENSMUSG00000042700 133973 1.717e-54 1.568e-50
## ENSMUSG00000029461 121795 6.182e-54 4.424e-50
## ENSMUSG00000020658 119935 7.604e-54 4.424e-50
## ENSMUSG00000060261 117841 9.635e-54 4.424e-50
## ENSMUSG00000032549 117324 1.022e-53 4.424e-50
## ENSMUSG00000024462 116767 1.090e-53 4.424e-50
## ENSMUSG00000030102 112155 1.874e-53 6.417e-50
Take home exercise
cutoff <- 1e-04
# GLM edgeR
edgeRhits <- tt.glm$table[tt.glm$table$FDR < cutoff, ]
head(edgeRhits)
## logFC logCPM LR PValue FDR
## ENSMUSG00000020912 -5.187 3.155 403.2 1.105e-89 4.037e-85
## ENSMUSG00000050141 -5.363 2.319 311.6 9.670e-70 1.767e-65
## ENSMUSG00000035775 -4.543 2.674 298.7 6.232e-67 7.589e-63
## ENSMUSG00000015484 -1.968 4.307 281.8 3.049e-63 2.785e-59
## ENSMUSG00000024248 -3.152 3.463 277.6 2.489e-62 1.819e-58
## ENSMUSG00000030532 1.547 5.572 268.9 1.961e-60 1.194e-56
nrow(edgeRhits)
## [1] 294
# DEseq
DESeqhits <- results[!is.na(results$padj), ]
DESeqhits <- DESeqhits[DESeqhits$padj < cutoff, ]
nrow(DESeqhits)
## [1] 203
head(DESeqhits)
## id baseMean baseMeanA baseMeanB foldChange
## 13 ENSMUSG00000000094 1.444 3.032 0.00 0.0000
## 78 ENSMUSG00000000402 21.584 32.528 11.64 0.3577
## 96 ENSMUSG00000000560 921.853 609.185 1206.10 1.9799
## 139 ENSMUSG00000000792 104.244 139.251 72.42 0.5201
## 165 ENSMUSG00000000958 25.887 35.943 16.75 0.4659
## 261 ENSMUSG00000001473 35.355 51.563 20.62 0.3999
## log2FoldChange pval padj
## 13 -Inf 7.391e-07 5.507e-05
## 78 -1.4832 7.778e-11 1.073e-08
## 96 0.9854 4.746e-08 4.690e-06
## 139 -0.9432 3.921e-11 5.750e-09
## 165 -1.1020 1.024e-07 9.382e-06
## 261 -1.3222 1.496e-11 2.290e-09
# voom
hitsVoom <- topTable(fit, p.value = cutoff, coef = "group2", n = Inf)
nrow(hitsVoom)
## [1] 210
head(hitsVoom)
## ID logFC AveExpr t P.Value adj.P.Val B
## 1623 ENSMUSG00000015484 -1.995 3.9635 -20.49 6.099e-18 8.457e-14 30.79
## 6391 ENSMUSG00000027855 5.343 -0.3132 22.23 7.634e-19 2.789e-14 30.24
## 7936 ENSMUSG00000030532 1.539 5.3545 19.71 1.626e-17 1.485e-13 30.04
## 4125 ENSMUSG00000023236 1.420 6.8879 19.36 2.548e-17 1.552e-13 29.64
## 16165 ENSMUSG00000054354 -5.824 -0.2156 -20.39 6.944e-18 8.457e-14 28.84
## 15017 ENSMUSG00000050141 -5.334 0.4628 -19.38 2.495e-17 1.552e-13 27.78
# Venn Diagram
input <- data.frame(edgeR = rownames(dat) %in% rownames(edgeRhits), DESeq = rownames(dat) %in%
DESeqhits$id, Voom = rownames(dat) %in% hitsVoom$ID)
vennDiagram(input)