Seminar 7: RNA-seq - Differential Expression Analysis

After loading the required libraries we import the dataset.

# Import data
dat <- read.table("bottomly_count_table.tsv", header = TRUE, row.names = 1)
des <- read.table("bottomly_phenodata.tsv", header = TRUE, row.names = 1)
str(dat)

## 'data.frame':    36536 obs. of  21 variables:
##  $ SRX033480: int  369 0 0 0 0 0 21 15 517 0 ...
##  $ SRX033488: int  744 0 1 0 1 1 46 43 874 0 ...
##  $ SRX033481: int  287 0 0 0 1 0 20 12 340 0 ...
##  $ SRX033489: int  769 0 1 0 5 1 36 34 813 0 ...
##  $ SRX033482: int  348 0 1 0 0 0 12 14 378 0 ...
##  $ SRX033490: int  803 0 1 0 4 0 55 32 860 0 ...
##  $ SRX033483: int  433 0 0 0 0 0 27 19 528 0 ...
##  $ SRX033476: int  469 0 7 0 0 0 44 18 401 0 ...
##  $ SRX033478: int  585 0 6 0 0 0 32 44 584 0 ...
##  $ SRX033479: int  321 0 1 0 0 0 47 22 401 0 ...
##  $ SRX033472: int  301 0 1 0 4 0 40 17 331 0 ...
##  $ SRX033473: int  461 0 1 0 1 0 40 24 431 0 ...
##  $ SRX033474: int  309 0 1 0 1 0 30 29 341 0 ...
##  $ SRX033475: int  374 0 1 0 0 0 27 15 480 0 ...
##  $ SRX033491: int  781 0 1 0 1 0 46 34 930 0 ...
##  $ SRX033484: int  555 0 2 0 2 0 28 23 585 0 ...
##  $ SRX033492: int  820 0 1 0 1 0 40 38 1137 0 ...
##  $ SRX033485: int  294 0 1 0 1 0 21 17 490 0 ...
##  $ SRX033493: int  758 0 4 0 1 0 52 29 1079 0 ...
##  $ SRX033486: int  419 0 1 0 1 0 27 12 565 0 ...
##  $ SRX033494: int  857 0 5 0 2 0 45 28 726 0 ...

str(des)

## 'data.frame':    21 obs. of  4 variables:
##  $ num.tech.reps    : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ strain           : Factor w/ 2 levels "C57BL/6J","DBA/2J": 1 1 1 1 1 1 1 1 1 1 ...
##  $ experiment.number: int  6 7 6 7 6 7 6 4 4 4 ...
##  $ lane.number      : int  1 1 2 2 3 3 5 6 7 8 ...

GLM edge R

with(des, table(strain))

## strain
## C57BL/6J   DBA/2J 
##       10       11

group <- factor(c(rep("1", 10), rep("2", 11)))
group

##  [1] 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
## Levels: 1 2

dge.glm <- DGEList(counts = dat, group = group)
names(dge.glm)

## [1] "counts"  "samples"

dge.glm[["samples"]]

##           group lib.size norm.factors
## SRX033480     1  3040296            1
## SRX033488     1  6303665            1
## SRX033481     1  2717092            1
## SRX033489     1  6545795            1
## SRX033482     1  3016179            1
## SRX033490     1  7097379            1
## SRX033483     1  3707895            1
## SRX033476     1  5165144            1
## SRX033478     1  4953201            1
## SRX033479     1  4192872            1
## SRX033472     2  3970729            1
## SRX033473     2  4733003            1
## SRX033474     2  3702051            1
## SRX033475     2  3569483            1
## SRX033491     2  7276198            1
## SRX033484     2  4422272            1
## SRX033492     2  7115851            1
## SRX033485     2  3467730            1
## SRX033493     2  7339817            1
## SRX033486     2  3879114            1
## SRX033494     2  6771680            1

design <- model.matrix(~group)
design

##    (Intercept) group2
## 1            1      0
## 2            1      0
## 3            1      0
## 4            1      0
## 5            1      0
## 6            1      0
## 7            1      0
## 8            1      0
## 9            1      0
## 10           1      0
## 11           1      1
## 12           1      1
## 13           1      1
## 14           1      1
## 15           1      1
## 16           1      1
## 17           1      1
## 18           1      1
## 19           1      1
## 20           1      1
## 21           1      1
## attr(,"assign")
## [1] 0 1
## attr(,"contrasts")
## attr(,"contrasts")$group
## [1] "contr.treatment"

dge.glm.com.disp <- estimateGLMCommonDisp(dge.glm, design, verbose = TRUE)

## Disp = 0.03893 , BCV = 0.1973

dge.glm.trend.disp <- estimateGLMTrendedDisp(dge.glm.com.disp)

## Loading required package: splines

dge.glm.tag.disp <- estimateGLMTagwiseDisp(dge.glm.trend.disp, design)

Plot the tagwise dispersion against log2-CPM (counts per million):

plotBCV(dge.glm.tag.disp)

plot of chunk unnamed-chunk-4

fit <- glmFit(dge.glm.tag.disp, design)
colnames(coef(fit))

## [1] "(Intercept)" "group2"

lrt <- glmLRT(fit, coef = 2)
topTags(lrt)

## Coefficient:  group2 
##                     logFC logCPM    LR    PValue       FDR
## ENSMUSG00000020912 -5.187  3.155 403.2 1.105e-89 4.037e-85
## ENSMUSG00000050141 -5.363  2.319 311.6 9.670e-70 1.767e-65
## ENSMUSG00000035775 -4.543  2.674 298.7 6.232e-67 7.589e-63
## ENSMUSG00000015484 -1.968  4.307 281.8 3.049e-63 2.785e-59
## ENSMUSG00000024248 -3.152  3.463 277.6 2.489e-62 1.819e-58
## ENSMUSG00000030532  1.547  5.572 268.9 1.961e-60 1.194e-56
## ENSMUSG00000054354 -6.283  1.940 248.7 4.924e-56 2.570e-52
## ENSMUSG00000023236  1.426  7.067 244.9 3.292e-55 1.503e-51
## ENSMUSG00000050824  3.705  3.365 230.0 6.054e-52 2.458e-48
## ENSMUSG00000015852 -2.362  3.138 214.8 1.212e-48 4.051e-45

tt.glm <- topTags(lrt, n = Inf)
class(tt.glm)

## [1] "TopTags"
## attr(,"package")
## [1] "edgeR"

nrow(tt.glm$table[tt.glm$table$FDR < 0.01, ])

## [1] 600

interestingSamples <- rownames(tt.glm$table[tt.glm$table$FDR < 1e-50, ])
summary(de.glm <- decideTestsDGE(lrt, p = 0.05, adjust = "BH"))

##    [,1] 
## -1   451
## 0  35660
## 1    425

Plotting the tagwise log fold changes against log-cpm:

tags.glm <- rownames(dge.glm.tag.disp)[as.logical(de.glm)]
plotSmear(lrt, de.tags = tags.glm)
abline(h = c(-2, 2), col = "blue")

plot of chunk unnamed-chunk-6

Mini exercise

First we delete count equal tot 0 across all sample, then we delete count equal to 0 in at least one sample in each genetype group. The resulting dataset is call final:

new1 <- dat[rowSums(dat) > 0, ]
group1 <- subset(des, strain == "C57BL/6J")
group2 <- subset(des, strain == "DBA/2J")
noZero <- apply(new1, 1, function(row) all(row[1:nrow(group1)] != 0) | all(row[nrow(group1) + 
    1:nrow(group2)]) != 0)
final <- new1[noZero, ]

Then we repeat the analysis:

dge.glm.new <- DGEList(counts = final, group = group)
dge.glm.com.disp.new <- estimateGLMCommonDisp(dge.glm.new, design, verbose = TRUE)

## Disp = 0.0383 , BCV = 0.1957

dge.glm.trend.disp.new <- estimateGLMTrendedDisp(dge.glm.com.disp.new)
dge.glm.tag.disp.new <- estimateGLMTagwiseDisp(dge.glm.trend.disp.new, design)
# plot the tagwise dispersion against log2-CPM (counts per million)
plotBCV(dge.glm.tag.disp.new)

plot of chunk unnamed-chunk-8

fit.new <- glmFit(dge.glm.tag.disp.new, design)
lrt.new <- glmLRT(fit.new, coef = 2)
tt.glm.new <- topTags(lrt.new, n = Inf)
nrow(tt.glm.new$table[tt.glm.new$table$FDR < 0.01, ])

## [1] 731

731 genes have FDR less than 0.01

interestingSamples.new <- rownames(tt.glm.new$table[tt.glm.new$table$FDR < 1e-50, 
    ])
summary(de.glm.new <- decideTestsDGE(lrt.new, p = 0.05, adjust = "BH"))

##    [,1]
## -1  588
## 0  8773
## 1   612

588 genes are under-expressed in group 2 (DBA/2J) compared with group 1 (C57BL/6J), 8773 show no differences in expression while 612 genes are over-expressed.

Plotting the tagwise log fold changes against log-cpm:

tags.glm.new <- rownames(dge.glm.tag.disp.new)[as.logical(de.glm.new)]
plotSmear(lrt.new, de.tags = tags.glm.new)
abline(h = c(-2, 2), col = "blue")

plot of chunk unnamed-chunk-10

DESeq

deSeqDat <- newCountDataSet(dat, group)
head(counts(deSeqDat))

##                    SRX033480 SRX033488 SRX033481 SRX033489 SRX033482
## ENSMUSG00000000001       369       744       287       769       348
## ENSMUSG00000000003         0         0         0         0         0
## ENSMUSG00000000028         0         1         0         1         1
## ENSMUSG00000000031         0         0         0         0         0
## ENSMUSG00000000037         0         1         1         5         0
## ENSMUSG00000000049         0         1         0         1         0
##                    SRX033490 SRX033483 SRX033476 SRX033478 SRX033479
## ENSMUSG00000000001       803       433       469       585       321
## ENSMUSG00000000003         0         0         0         0         0
## ENSMUSG00000000028         1         0         7         6         1
## ENSMUSG00000000031         0         0         0         0         0
## ENSMUSG00000000037         4         0         0         0         0
## ENSMUSG00000000049         0         0         0         0         0
##                    SRX033472 SRX033473 SRX033474 SRX033475 SRX033491
## ENSMUSG00000000001       301       461       309       374       781
## ENSMUSG00000000003         0         0         0         0         0
## ENSMUSG00000000028         1         1         1         1         1
## ENSMUSG00000000031         0         0         0         0         0
## ENSMUSG00000000037         4         1         1         0         1
## ENSMUSG00000000049         0         0         0         0         0
##                    SRX033484 SRX033492 SRX033485 SRX033493 SRX033486
## ENSMUSG00000000001       555       820       294       758       419
## ENSMUSG00000000003         0         0         0         0         0
## ENSMUSG00000000028         2         1         1         4         1
## ENSMUSG00000000031         0         0         0         0         0
## ENSMUSG00000000037         2         1         1         1         1
## ENSMUSG00000000049         0         0         0         0         0
##                    SRX033494
## ENSMUSG00000000001       857
## ENSMUSG00000000003         0
## ENSMUSG00000000028         5
## ENSMUSG00000000031         0
## ENSMUSG00000000037         2
## ENSMUSG00000000049         0

deSeqDat <- estimateSizeFactors(deSeqDat)
sizeFactors(deSeqDat)

## SRX033480 SRX033488 SRX033481 SRX033489 SRX033482 SRX033490 SRX033483 
##    0.6439    1.3454    0.5785    1.4295    0.6355    1.5240    0.7933 
## SRX033476 SRX033478 SRX033479 SRX033472 SRX033473 SRX033474 SRX033475 
##    1.1272    1.0772    0.8984    0.8886    1.0255    0.7987    0.7796 
## SRX033491 SRX033484 SRX033492 SRX033485 SRX033493 SRX033486 SRX033494 
##    1.6162    0.9882    1.5720    0.7558    1.5922    0.8264    1.4715

deSeqDat <- estimateDispersions(deSeqDat)

Plotting the estimated dispersions against the mean normalized counts:

plotDispEsts(deSeqDat)

plot of chunk unnamed-chunk-12

results <- nbinomTest(deSeqDat, levels(group)[1], levels(group)[2])
str(results)

## 'data.frame':    36536 obs. of  8 variables:
##  $ id            : chr  "ENSMUSG00000000001" "ENSMUSG00000000003" "ENSMUSG00000000028" "ENSMUSG00000000031" ...
##  $ baseMean      : num  489.18 0 1.57 0 1.1 ...
##  $ baseMeanA     : num  509.685 0 1.657 0 0.859 ...
##  $ baseMeanB     : num  470.53 0 1.49 0 1.32 ...
##  $ foldChange    : num  0.923 NaN 0.898 NaN 1.537 ...
##  $ log2FoldChange: num  -0.115 NaN -0.156 NaN 0.62 ...
##  $ pval          : num  0.498 NA 0.829 NA 0.968 ...
##  $ padj          : num  1 NA 1 NA 1 ...

plotMA(results)

plot of chunk unnamed-chunk-12

Voom and limma

norm.factor <- calcNormFactors(dat)
dat.voomed <- voom(dat, design, plot = TRUE, lib.size = colSums(dat) * norm.factor)

plot of chunk unnamed-chunk-13

fit <- lmFit(dat.voomed, design)
fit <- eBayes(fit)
topTable(fit)

##                                    ID X.Intercept.    group2 AveExpr
## ENSMUSG00000025867 ENSMUSG00000025867       12.477 -0.013979  12.471
## ENSMUSG00000022892 ENSMUSG00000022892       12.268 -0.086479  12.223
## ENSMUSG00000037852 ENSMUSG00000037852       12.321 -0.237470  12.194
## ENSMUSG00000042700 ENSMUSG00000042700       10.716 -0.048029  10.692
## ENSMUSG00000029461 ENSMUSG00000029461       10.300  0.020715  10.311
## ENSMUSG00000020658 ENSMUSG00000020658        9.610 -0.019628   9.601
## ENSMUSG00000060261 ENSMUSG00000060261        9.469 -0.015743   9.461
## ENSMUSG00000032549 ENSMUSG00000032549       11.904  0.003545  11.905
## ENSMUSG00000024462 ENSMUSG00000024462       10.227 -0.138929  10.153
## ENSMUSG00000030102 ENSMUSG00000030102       12.085 -0.026149  12.073
##                         F   P.Value adj.P.Val
## ENSMUSG00000025867 154234 2.584e-55 9.442e-51
## ENSMUSG00000022892 138459 1.103e-54 1.568e-50
## ENSMUSG00000037852 134826 1.576e-54 1.568e-50
## ENSMUSG00000042700 133973 1.717e-54 1.568e-50
## ENSMUSG00000029461 121795 6.182e-54 4.424e-50
## ENSMUSG00000020658 119935 7.604e-54 4.424e-50
## ENSMUSG00000060261 117841 9.635e-54 4.424e-50
## ENSMUSG00000032549 117324 1.022e-53 4.424e-50
## ENSMUSG00000024462 116767 1.090e-53 4.424e-50
## ENSMUSG00000030102 112155 1.874e-53 6.417e-50

Take home exercise

cutoff <- 1e-04
# GLM edgeR
edgeRhits <- tt.glm$table[tt.glm$table$FDR < cutoff, ]
head(edgeRhits)

##                     logFC logCPM    LR    PValue       FDR
## ENSMUSG00000020912 -5.187  3.155 403.2 1.105e-89 4.037e-85
## ENSMUSG00000050141 -5.363  2.319 311.6 9.670e-70 1.767e-65
## ENSMUSG00000035775 -4.543  2.674 298.7 6.232e-67 7.589e-63
## ENSMUSG00000015484 -1.968  4.307 281.8 3.049e-63 2.785e-59
## ENSMUSG00000024248 -3.152  3.463 277.6 2.489e-62 1.819e-58
## ENSMUSG00000030532  1.547  5.572 268.9 1.961e-60 1.194e-56

nrow(edgeRhits)

## [1] 294

# DEseq
DESeqhits <- results[!is.na(results$padj), ]
DESeqhits <- DESeqhits[DESeqhits$padj < cutoff, ]
nrow(DESeqhits)

## [1] 203

head(DESeqhits)

##                     id baseMean baseMeanA baseMeanB foldChange
## 13  ENSMUSG00000000094    1.444     3.032      0.00     0.0000
## 78  ENSMUSG00000000402   21.584    32.528     11.64     0.3577
## 96  ENSMUSG00000000560  921.853   609.185   1206.10     1.9799
## 139 ENSMUSG00000000792  104.244   139.251     72.42     0.5201
## 165 ENSMUSG00000000958   25.887    35.943     16.75     0.4659
## 261 ENSMUSG00000001473   35.355    51.563     20.62     0.3999
##     log2FoldChange      pval      padj
## 13            -Inf 7.391e-07 5.507e-05
## 78         -1.4832 7.778e-11 1.073e-08
## 96          0.9854 4.746e-08 4.690e-06
## 139        -0.9432 3.921e-11 5.750e-09
## 165        -1.1020 1.024e-07 9.382e-06
## 261        -1.3222 1.496e-11 2.290e-09

# voom
hitsVoom <- topTable(fit, p.value = cutoff, coef = "group2", n = Inf)
nrow(hitsVoom)

## [1] 210

head(hitsVoom)

##                       ID  logFC AveExpr      t   P.Value adj.P.Val     B
## 1623  ENSMUSG00000015484 -1.995  3.9635 -20.49 6.099e-18 8.457e-14 30.79
## 6391  ENSMUSG00000027855  5.343 -0.3132  22.23 7.634e-19 2.789e-14 30.24
## 7936  ENSMUSG00000030532  1.539  5.3545  19.71 1.626e-17 1.485e-13 30.04
## 4125  ENSMUSG00000023236  1.420  6.8879  19.36 2.548e-17 1.552e-13 29.64
## 16165 ENSMUSG00000054354 -5.824 -0.2156 -20.39 6.944e-18 8.457e-14 28.84
## 15017 ENSMUSG00000050141 -5.334  0.4628 -19.38 2.495e-17 1.552e-13 27.78

# Venn Diagram
input <- data.frame(edgeR = rownames(dat) %in% rownames(edgeRhits), DESeq = rownames(dat) %in% 
    DESeqhits$id, Voom = rownames(dat) %in% hitsVoom$ID)
vennDiagram(input)

plot of chunk unnamed-chunk-14