Seminar07.R

Abrar — Feb 27, 2014, 11:36 PM

# Seminar 07
# Created by: Abrar Wafa
# Date: Feb.27, 2013

library(edgeR)

Loading required package: limma

library(DESeq)

Loading required package: BiocGenerics
Loading required package: parallel

Attaching package: 'BiocGenerics'

The following objects are masked from 'package:parallel':

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB

The following object is masked from 'package:limma':

    plotMA

The following object is masked from 'package:stats':

    xtabs

The following objects are masked from 'package:base':

    anyDuplicated, append, as.data.frame, as.vector, cbind,
    colnames, duplicated, eval, evalq, Filter, Find, get,
    intersect, is.unsorted, lapply, Map, mapply, match, mget,
    order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
    rbind, Reduce, rep.int, rownames, sapply, setdiff, sort,
    table, tapply, union, unique, unlist

Loading required package: Biobase
Welcome to Bioconductor

    Vignettes contain introductory material; view with
    'browseVignettes()'. To cite Bioconductor, see
    'citation("Biobase")', and for packages 'citation("pkgname")'.

Loading required package: locfit
locfit 1.5-9.1   2013-03-22
Loading required package: lattice
    Welcome to 'DESeq'. For improved performance, usability and
    functionality, please consider migrating to 'DESeq2'.

library(limma)
library(gplots)

KernSmooth 2.23 loaded
Copyright M. P. Wand 1997-2009

Attaching package: 'gplots'

The following object is masked from 'package:stats':

    lowess


dat <- read.table("../Data_RNAseq/bottomly_count_table.tsv", header = TRUE, row.names = 1)
des <- read.table("../Data_RNAseq/bottomly_phenodata.tsv", header = TRUE, row.names = 1)

############################################################################################
# edgeR
############################################################################################
# Mini exercise:
all(rownames(des) == colnames(dat))

[1] TRUE

# remove any gene that has count equal to zero across all samples
filDat <- dat[(rowSums(dat) !=0),]
# remove any gene that has count equal to zero in at least one sample in each genotype group
selrow <- apply(filDat, 1, function(row) all(row[1:10]!=0) | all(row[11:21]!=0))
fdat <- filDat [selrow, ]

group <- factor(c(rep("1", 10), rep("2", 11)))
group

 [1] 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
Levels: 1 2

dge.glm <- DGEList(counts = fdat, group = group)
str(dge.glm)

Formal class 'DGEList' [package "edgeR"] with 1 slots
  ..@ .Data:List of 2
  .. ..$ : int [1:9973, 1:21] 369 0 21 15 517 273 11 19 19 97 ...
  .. .. ..- attr(*, "dimnames")=List of 2
  .. .. .. ..$ : chr [1:9973] "ENSMUSG00000000001" "ENSMUSG00000000028" "ENSMUSG00000000056" "ENSMUSG00000000058" ...
  .. .. .. ..$ : chr [1:21] "SRX033480" "SRX033488" "SRX033481" "SRX033489" ...
  .. ..$ :'data.frame': 21 obs. of  3 variables:
  .. .. ..$ group       : Factor w/ 2 levels "1","2": 1 1 1 1 1 1 1 1 1 1 ...
  .. .. ..$ lib.size    : num [1:21] 3038489 6299265 2715366 6541434 3014237 ...
  .. .. ..$ norm.factors: num [1:21] 1 1 1 1 1 1 1 1 1 1 ...

names(dge.glm)

[1] "counts"  "samples"

dge.glm[["samples"]]

          group lib.size norm.factors
SRX033480     1  3038489            1
SRX033488     1  6299265            1
SRX033481     1  2715366            1
SRX033489     1  6541434            1
SRX033482     1  3014237            1
SRX033490     1  7091978            1
SRX033483     1  3705596            1
SRX033476     1  5161116            1
SRX033478     1  4949177            1
SRX033479     1  4189929            1
SRX033472     2  3967975            1
SRX033473     2  4729997            1
SRX033474     2  3699671            1
SRX033475     2  3567255            1
SRX033491     2  7271072            1
SRX033484     2  4418407            1
SRX033492     2  7111173            1
SRX033485     2  3465132            1
SRX033493     2  7334966            1
SRX033486     2  3876563            1
SRX033494     2  6767129            1

nrow(dge.glm[[1]])

[1] 9973

ncol(dge.glm[[2]])

[1] 3


design <- model.matrix(~group)
design

   (Intercept) group2
1            1      0
2            1      0
3            1      0
4            1      0
5            1      0
6            1      0
7            1      0
8            1      0
9            1      0
10           1      0
11           1      1
12           1      1
13           1      1
14           1      1
15           1      1
16           1      1
17           1      1
18           1      1
19           1      1
20           1      1
21           1      1
attr(,"assign")
[1] 0 1
attr(,"contrasts")
attr(,"contrasts")$group
[1] "contr.treatment"


dge.glm.com.disp <- estimateGLMCommonDisp(dge.glm, design, verbose = TRUE)

Disp = 0.0383 , BCV = 0.1957


dge.glm.trend.disp <- estimateGLMTrendedDisp(dge.glm.com.disp)

Loading required package: splines

dge.glm.tag.disp <- estimateGLMTagwiseDisp(dge.glm.trend.disp, design)
# plot the tagwise dispersion against log2-CPM (counts per million)
plotBCV(dge.glm.tag.disp)

plot of chunk unnamed-chunk-1

fit <- glmFit(dge.glm.tag.disp, design)
colnames(coef(fit))

[1] "(Intercept)" "group2"

lrt <- glmLRT(fit, coef = 2)
topTags(lrt)

Coefficient:  group2 
                    logFC logCPM    LR    PValue       FDR
ENSMUSG00000020912 -5.187  3.156 407.7 1.182e-90 1.179e-86
ENSMUSG00000050141 -5.363  2.320 311.5 1.045e-69 5.213e-66
ENSMUSG00000035775 -4.543  2.675 299.1 5.219e-67 1.735e-63
ENSMUSG00000015484 -1.968  4.308 285.7 4.282e-64 1.068e-60
ENSMUSG00000024248 -3.152  3.464 280.6 5.618e-63 1.121e-59
ENSMUSG00000030532  1.547  5.573 263.3 3.291e-59 5.470e-56
ENSMUSG00000023236  1.426  7.068 248.5 5.419e-56 7.720e-53
ENSMUSG00000054354 -6.284  1.941 245.3 2.743e-55 3.419e-52
ENSMUSG00000050824  3.705  3.366 232.7 1.545e-52 1.712e-49
ENSMUSG00000015852 -2.362  3.139 217.0 4.109e-49 4.097e-46

tt.glm <- topTags(lrt, n = Inf)
class(tt.glm)

[1] "TopTags"
attr(,"package")
[1] "edgeR"

nrow(tt.glm$table[tt.glm$table$FDR < 0.01, ])

[1] 731

interestingSamples <- rownames(tt.glm$table[tt.glm$table$FDR < 1e-50, ])
cpm(dge.glm.tag.disp)[interestingSamples, ]

                   SRX033480 SRX033488 SRX033481 SRX033489 SRX033482
ENSMUSG00000020912     20.40     12.70     15.84    14.829    19.242
ENSMUSG00000050141     14.15     10.16     10.68     6.268     9.953
ENSMUSG00000035775     14.48     10.64     19.52    11.313    12.939
ENSMUSG00000015484     26.99     33.97     32.78    36.231    31.517
ENSMUSG00000024248     14.81     26.03     17.31    23.542    18.578
ENSMUSG00000030532     27.97     24.92     21.36    25.377    18.578
ENSMUSG00000023236     65.49     78.74     72.55    72.461    76.636
ENSMUSG00000054354     10.86      7.62     11.42     8.714     8.294
                   SRX033490 SRX033483 SRX033476 SRX033478 SRX033479
ENSMUSG00000020912    17.344    26.446    13.369    23.640    14.320
ENSMUSG00000050141     7.896     8.366     6.588    11.921    12.411
ENSMUSG00000035775    12.549    12.144     6.394    13.538    11.217
ENSMUSG00000015484    33.418    30.764    34.682    26.873    31.504
ENSMUSG00000024248    25.804    18.890    24.801    14.346    14.320
ENSMUSG00000030532    26.791    26.446    19.182    23.236    20.048
ENSMUSG00000023236    80.937    83.387    55.221    57.787    66.350
ENSMUSG00000054354     7.614     7.016     3.294     4.445     5.967
                   SRX033472 SRX033473 SRX033474 SRX033475 SRX033491
ENSMUSG00000020912    0.5040    0.4228    1.0812    0.2803    1.1003
ENSMUSG00000050141    0.2520    0.0000    0.0000    0.2803    0.8252
ENSMUSG00000035775    0.7561    0.0000    0.2703    0.2803    0.6877
ENSMUSG00000015484    7.0565    4.8626    9.1900    7.8492    8.5269
ENSMUSG00000024248    2.2682    1.2685    1.8921    3.3639    3.1632
ENSMUSG00000030532   69.8089   70.6132   63.2489   63.0737   73.8543
ENSMUSG00000023236  195.5657  183.7210  176.2319  156.4228  207.1221
ENSMUSG00000054354    0.0000    0.0000    0.0000    0.0000    0.5501
                   SRX033484 SRX033492 SRX033485 SRX033493 SRX033486
ENSMUSG00000020912     0.000    0.7031    0.8658    0.1363     0.000
ENSMUSG00000050141     0.000    0.1406    0.0000    0.1363     0.258
ENSMUSG00000035775     0.679    0.1406    0.5772    0.1363     1.806
ENSMUSG00000015484     9.506    8.4374    8.9463    6.6803    10.318
ENSMUSG00000024248     1.584    1.6875    1.7315    1.7723     2.580
ENSMUSG00000030532    70.161   70.7338   62.6239   78.6643    59.331
ENSMUSG00000023236   178.118  192.9358  207.2071  201.7733   204.821
ENSMUSG00000054354     0.000    0.0000    0.0000    0.0000     0.000
                   SRX033494
ENSMUSG00000020912    0.0000
ENSMUSG00000050141    0.1478
ENSMUSG00000035775    0.5911
ENSMUSG00000015484    8.8664
ENSMUSG00000024248    3.1032
ENSMUSG00000030532   71.3744
ENSMUSG00000023236  192.8440
ENSMUSG00000054354    0.0000

summary(de.glm <- decideTestsDGE(lrt, p = 0.05, adjust = "BH"))

tags.glm <- rownames(dge.glm.tag.disp)[as.logical(de.glm)]
plotSmear(lrt, de.tags = tags.glm)
abline(h = c(-2, 2), col = "blue")

plot of chunk unnamed-chunk-1


############################################################################################
# DESeq
############################################################################################
deSeqDat <- newCountDataSet(dat, group)
head(counts(deSeqDat))

                   SRX033480 SRX033488 SRX033481 SRX033489 SRX033482
ENSMUSG00000000001       369       744       287       769       348
ENSMUSG00000000003         0         0         0         0         0
ENSMUSG00000000028         0         1         0         1         1
ENSMUSG00000000031         0         0         0         0         0
ENSMUSG00000000037         0         1         1         5         0
ENSMUSG00000000049         0         1         0         1         0
                   SRX033490 SRX033483 SRX033476 SRX033478 SRX033479
ENSMUSG00000000001       803       433       469       585       321
ENSMUSG00000000003         0         0         0         0         0
ENSMUSG00000000028         1         0         7         6         1
ENSMUSG00000000031         0         0         0         0         0
ENSMUSG00000000037         4         0         0         0         0
ENSMUSG00000000049         0         0         0         0         0
                   SRX033472 SRX033473 SRX033474 SRX033475 SRX033491
ENSMUSG00000000001       301       461       309       374       781
ENSMUSG00000000003         0         0         0         0         0
ENSMUSG00000000028         1         1         1         1         1
ENSMUSG00000000031         0         0         0         0         0
ENSMUSG00000000037         4         1         1         0         1
ENSMUSG00000000049         0         0         0         0         0
                   SRX033484 SRX033492 SRX033485 SRX033493 SRX033486
ENSMUSG00000000001       555       820       294       758       419
ENSMUSG00000000003         0         0         0         0         0
ENSMUSG00000000028         2         1         1         4         1
ENSMUSG00000000031         0         0         0         0         0
ENSMUSG00000000037         2         1         1         1         1
ENSMUSG00000000049         0         0         0         0         0
                   SRX033494
ENSMUSG00000000001       857
ENSMUSG00000000003         0
ENSMUSG00000000028         5
ENSMUSG00000000031         0
ENSMUSG00000000037         2
ENSMUSG00000000049         0

deSeqDat <- estimateSizeFactors(deSeqDat)
sizeFactors(deSeqDat)

SRX033480 SRX033488 SRX033481 SRX033489 SRX033482 SRX033490 SRX033483 
   0.6439    1.3454    0.5785    1.4295    0.6355    1.5240    0.7933 
SRX033476 SRX033478 SRX033479 SRX033472 SRX033473 SRX033474 SRX033475 
   1.1272    1.0772    0.8984    0.8886    1.0255    0.7987    0.7796 
SRX033491 SRX033484 SRX033492 SRX033485 SRX033493 SRX033486 SRX033494 
   1.6162    0.9882    1.5720    0.7558    1.5922    0.8264    1.4715

deSeqDat <- estimateDispersions(deSeqDat)
# plotting the estimated dispersions against the mean normalized counts
plotDispEsts(deSeqDat)

plot of chunk unnamed-chunk-1

results <- nbinomTest(deSeqDat, levels(group)[1], levels(group)[2])
str(results)

'data.frame':   36536 obs. of  8 variables:
 $ id            : chr  "ENSMUSG00000000001" "ENSMUSG00000000003" "ENSMUSG00000000028" "ENSMUSG00000000031" ...
 $ baseMean      : num  489.18 0 1.57 0 1.1 ...
 $ baseMeanA     : num  509.685 0 1.657 0 0.859 ...
 $ baseMeanB     : num  470.53 0 1.49 0 1.32 ...
 $ foldChange    : num  0.923 NaN 0.898 NaN 1.537 ...
 $ log2FoldChange: num  -0.115 NaN -0.156 NaN 0.62 ...
 $ pval          : num  0.498 NA 0.829 NA 0.968 ...
 $ padj          : num  1 NA 1 NA 1 ...

nrow(results[results$padj<1e-4,])

[1] 22807

plotMA(results)

plot of chunk unnamed-chunk-1


############################################################################################
# Voom & limma
############################################################################################
norm.factor <- calcNormFactors(dat)
dat.voomed <- voom(dat, design, plot = TRUE, lib.size = colSums(dat) * norm.factor)

plot of chunk unnamed-chunk-1

vfit <- lmFit(dat.voomed, design)
vfit <- eBayes(vfit)
vhits <- topTable(vfit)
str(vhits)

'data.frame':   10 obs. of  6 variables:
 $ X.Intercept.: num  12.5 12.3 12.3 10.7 10.3 ...
 $ group2      : num  -0.014 -0.0865 -0.2375 -0.048 0.0207 ...
 $ AveExpr     : num  12.5 12.2 12.2 10.7 10.3 ...
 $ F           : num  154234 138459 134826 133973 121795 ...
 $ P.Value     : num  2.58e-55 1.10e-54 1.58e-54 1.72e-54 6.18e-54 ...
 $ adj.P.Val   : num  9.44e-51 1.57e-50 1.57e-50 1.57e-50 4.42e-50 ...


############################################################################################
# Take Home Problem
############################################################################################

# edgR
dge.glm <- DGEList(counts = dat, group = group)
dge.glm[["samples"]]

          group lib.size norm.factors
SRX033480     1  3040296            1
SRX033488     1  6303665            1
SRX033481     1  2717092            1
SRX033489     1  6545795            1
SRX033482     1  3016179            1
SRX033490     1  7097379            1
SRX033483     1  3707895            1
SRX033476     1  5165144            1
SRX033478     1  4953201            1
SRX033479     1  4192872            1
SRX033472     2  3970729            1
SRX033473     2  4733003            1
SRX033474     2  3702051            1
SRX033475     2  3569483            1
SRX033491     2  7276198            1
SRX033484     2  4422272            1
SRX033492     2  7115851            1
SRX033485     2  3467730            1
SRX033493     2  7339817            1
SRX033486     2  3879114            1
SRX033494     2  6771680            1

design <- model.matrix(~group)
dge.glm.com.disp <- estimateGLMCommonDisp(dge.glm, design, verbose = TRUE)

Disp = 0.03893 , BCV = 0.1973

dge.glm.trend.disp <- estimateGLMTrendedDisp(dge.glm.com.disp)
dge.glm.tag.disp <- estimateGLMTagwiseDisp(dge.glm.trend.disp, design)
edgfit <- glmFit(dge.glm.tag.disp, design)
lrt <- glmLRT(edgfit, coef = 2)
topTags(lrt,  adjust.method="BH")

Coefficient:  group2 
                    logFC logCPM    LR    PValue       FDR
ENSMUSG00000020912 -5.187  3.155 403.2 1.105e-89 4.037e-85
ENSMUSG00000050141 -5.363  2.319 311.6 9.670e-70 1.767e-65
ENSMUSG00000035775 -4.543  2.674 298.7 6.232e-67 7.589e-63
ENSMUSG00000015484 -1.968  4.307 281.8 3.049e-63 2.785e-59
ENSMUSG00000024248 -3.152  3.463 277.6 2.489e-62 1.819e-58
ENSMUSG00000030532  1.547  5.572 268.9 1.961e-60 1.194e-56
ENSMUSG00000054354 -6.283  1.940 248.7 4.924e-56 2.570e-52
ENSMUSG00000023236  1.426  7.067 244.9 3.292e-55 1.503e-51
ENSMUSG00000050824  3.705  3.365 230.0 6.054e-52 2.458e-48
ENSMUSG00000015852 -2.362  3.138 214.8 1.212e-48 4.051e-45

tt.glm <- topTags(lrt, n = Inf)
class(tt.glm)

[1] "TopTags"
attr(,"package")
[1] "edgeR"

nrow(ehits <- tt.glm$table[tt.glm$table$FDR < 1e-4, ])

[1] 294


#deSeq
deSeqDat <- newCountDataSet(dat, group)
head(counts(deSeqDat))

                   SRX033480 SRX033488 SRX033481 SRX033489 SRX033482
ENSMUSG00000000001       369       744       287       769       348
ENSMUSG00000000003         0         0         0         0         0
ENSMUSG00000000028         0         1         0         1         1
ENSMUSG00000000031         0         0         0         0         0
ENSMUSG00000000037         0         1         1         5         0
ENSMUSG00000000049         0         1         0         1         0
                   SRX033490 SRX033483 SRX033476 SRX033478 SRX033479
ENSMUSG00000000001       803       433       469       585       321
ENSMUSG00000000003         0         0         0         0         0
ENSMUSG00000000028         1         0         7         6         1
ENSMUSG00000000031         0         0         0         0         0
ENSMUSG00000000037         4         0         0         0         0
ENSMUSG00000000049         0         0         0         0         0
                   SRX033472 SRX033473 SRX033474 SRX033475 SRX033491
ENSMUSG00000000001       301       461       309       374       781
ENSMUSG00000000003         0         0         0         0         0
ENSMUSG00000000028         1         1         1         1         1
ENSMUSG00000000031         0         0         0         0         0
ENSMUSG00000000037         4         1         1         0         1
ENSMUSG00000000049         0         0         0         0         0
                   SRX033484 SRX033492 SRX033485 SRX033493 SRX033486
ENSMUSG00000000001       555       820       294       758       419
ENSMUSG00000000003         0         0         0         0         0
ENSMUSG00000000028         2         1         1         4         1
ENSMUSG00000000031         0         0         0         0         0
ENSMUSG00000000037         2         1         1         1         1
ENSMUSG00000000049         0         0         0         0         0
                   SRX033494
ENSMUSG00000000001       857
ENSMUSG00000000003         0
ENSMUSG00000000028         5
ENSMUSG00000000031         0
ENSMUSG00000000037         2
ENSMUSG00000000049         0

deSeqDat <- estimateSizeFactors(deSeqDat)
sizeFactors(deSeqDat)

SRX033480 SRX033488 SRX033481 SRX033489 SRX033482 SRX033490 SRX033483 
   0.6439    1.3454    0.5785    1.4295    0.6355    1.5240    0.7933 
SRX033476 SRX033478 SRX033479 SRX033472 SRX033473 SRX033474 SRX033475 
   1.1272    1.0772    0.8984    0.8886    1.0255    0.7987    0.7796 
SRX033491 SRX033484 SRX033492 SRX033485 SRX033493 SRX033486 SRX033494 
   1.6162    0.9882    1.5720    0.7558    1.5922    0.8264    1.4715

deSeqDat <- estimateDispersions(deSeqDat)
results <- nbinomTest(deSeqDat, levels(group)[1], levels(group)[2])
dhits <- (results[results$padj<1e-4,])
dhits <- na.exclude(dhits)
rownames(dhits) <- dhits$id

# Voom & limma
vhits <- topTable(vfit, coef= 2, p.value = 1e-4, number = nrow(dat))
nrow(vhits)

[1] 210


diagram <- list(rownames(ehits), rownames(dhits), rownames(vhits))
venn(diagram)

plot of chunk unnamed-chunk-1