EdgeR Self-Learning Note

EdgeR is used for differential expression analyses of read counts data from the RNA-Seq expereiments.

It implements statistical methods based on generalized linear models (glm).

edgeR can be applied to differential expression at the gene, exon, transcript or tag level.

Install and load EdgeR

rm(list=ls())
if (!("edgeR" %in% rownames(installed.packages()))){
  source("http://bioconductor.org/biocLite.R")
  biocLite("edgeR")
} else{
  print("edgeR has been successfully installed")
}

## [1] "edgeR has been successfully installed"

library(edgeR)

## Loading required package: limma

limma package is required - the implementation of linear models might from limma (?)

The PDF manual is within the package

edgeRUsersGuide()

## [1] "/Library/Frameworks/R.framework/Versions/3.1/Resources/library/edgeR/doc/edgeRUsersGuide.pdf"

Quick start for classical analysis

# To create a dataframe contains 100 transcripts * 4 samples - populate a 100*4 matrix
set.seed(1122)
data <- 10*matrix(runif(4*10000),nrow=10000,byrow=T)
rownames(data) <- paste("gene", c(1:10000), sep = "")
group <- factor(c(1,1,2,2))

# use DGElist function to read the expression data and phenotype data

# DGE stands for Digital Gene Expression data

y <- DGEList(counts=data,group=group)
y <- calcNormFactors(y)
y <- estimateCommonDisp(y)
y <- estimateTagwiseDisp(y)
# Maximizes the negative binomial conditional common likelihood to give the estimate of the common dispersion across all tags.

et <- exactTest(y)

topTags(et,sort.by="logFC") # well this is similar to topTable in limma # add n=Inf if want to export all summary data

## Comparison of groups:  2-1 
##           logFC logCPM   PValue FDR
## gene9143  5.186  6.871 0.003430   1
## gene7338  4.964  6.965 0.001684   1
## gene9483  4.866  6.842 0.003164   1
## gene736  -4.657  6.552 0.029106   1
## gene7938 -4.606  6.726 0.008755   1
## gene9348 -4.582  6.946 0.001975   1
## gene2981 -4.548  6.547 0.032387   1
## gene5349  4.501  6.667 0.014205   1
## gene2687  4.447  6.963 0.009319   1
## gene7997  4.381  6.831 0.005892   1

# Make a valcano plot to show the distribution

plot(et$table$logFC,-log10(et$table$PValue),col="red",cex=0.8,xlab="logFC",ylab="-log10(P)",main="Valcano plot for test data")

plot of chunk unnamed-chunk-3

Quick start for linnear regression model edgeR analysis

design <- model.matrix(~group)
design

##   (Intercept) group2
## 1           1      0
## 2           1      0
## 3           1      1
## 4           1      1
## attr(,"assign")
## [1] 0 1
## attr(,"contrasts")
## attr(,"contrasts")$group
## [1] "contr.treatment"

This add intercept of one column of 1

y <- estimateGLMCommonDisp(y,design)
y <- estimateGLMTrendedDisp(y,design)

## Loading required package: splines

y <- estimateGLMTagwiseDisp(y,design)
fit <- glmFit(y,design)
lrt <- glmLRT(fit,coef=2)
topTags(lrt)

## Coefficient:  group2 
##           logFC logCPM     LR    PValue    FDR
## gene7338  4.964  6.965 11.826 0.0005842 0.7946
## gene9143  5.186  6.871 11.603 0.0006584 0.7946
## gene8184 -4.270  7.061 11.289 0.0007798 0.7946
## gene775   3.853  7.124 11.099 0.0008637 0.7946
## gene9348 -4.582  6.946 10.962 0.0009298 0.7946
## gene2687  4.447  6.963 10.909 0.0009568 0.7946
## gene9483  4.866  6.842 10.864 0.0009807 0.7946
## gene6236 -3.904  7.077 10.449 0.0012272 0.7946
## gene8420  3.766  7.088 10.144 0.0014477 0.7946
## gene4640  3.788  7.063  9.918 0.0016368 0.7946

par(mfrow=c(2,1))
plot(et$table$logFC,-log10(et$table$PValue),col="red",cex=0.8,xlab="logFC",ylab="-log10(P)",main="Valcano plot for test data - classical model")

plot(lrt$table$logFC,-log10(lrt$table$PValue),col="red",cex=0.8,xlab="logFC",ylab="-log10(P)",main="Valcano plot for test data - linear model")

plot of chunk unnamed-chunk-5