EdgeR is used for differential expression analyses of read counts data from the RNA-Seq expereiments.
It implements statistical methods based on generalized linear models (glm).
edgeR can be applied to differential expression at the gene, exon, transcript or tag level.
rm(list=ls())
if (!("edgeR" %in% rownames(installed.packages()))){
source("http://bioconductor.org/biocLite.R")
biocLite("edgeR")
} else{
print("edgeR has been successfully installed")
}
## [1] "edgeR has been successfully installed"
library(edgeR)
## Loading required package: limma
limma package is required - the implementation of linear models might from limma (?)
The PDF manual is within the package
edgeRUsersGuide()
## [1] "/Library/Frameworks/R.framework/Versions/3.1/Resources/library/edgeR/doc/edgeRUsersGuide.pdf"
# To create a dataframe contains 100 transcripts * 4 samples - populate a 100*4 matrix
set.seed(1122)
data <- 10*matrix(runif(4*10000),nrow=10000,byrow=T)
rownames(data) <- paste("gene", c(1:10000), sep = "")
group <- factor(c(1,1,2,2))
# use DGElist function to read the expression data and phenotype data
# DGE stands for Digital Gene Expression data
y <- DGEList(counts=data,group=group)
y <- calcNormFactors(y)
y <- estimateCommonDisp(y)
y <- estimateTagwiseDisp(y)
# Maximizes the negative binomial conditional common likelihood to give the estimate of the common dispersion across all tags.
et <- exactTest(y)
topTags(et,sort.by="logFC") # well this is similar to topTable in limma # add n=Inf if want to export all summary data
## Comparison of groups: 2-1
## logFC logCPM PValue FDR
## gene9143 5.186 6.871 0.003430 1
## gene7338 4.964 6.965 0.001684 1
## gene9483 4.866 6.842 0.003164 1
## gene736 -4.657 6.552 0.029106 1
## gene7938 -4.606 6.726 0.008755 1
## gene9348 -4.582 6.946 0.001975 1
## gene2981 -4.548 6.547 0.032387 1
## gene5349 4.501 6.667 0.014205 1
## gene2687 4.447 6.963 0.009319 1
## gene7997 4.381 6.831 0.005892 1
# Make a valcano plot to show the distribution
plot(et$table$logFC,-log10(et$table$PValue),col="red",cex=0.8,xlab="logFC",ylab="-log10(P)",main="Valcano plot for test data")
design <- model.matrix(~group)
design
## (Intercept) group2
## 1 1 0
## 2 1 0
## 3 1 1
## 4 1 1
## attr(,"assign")
## [1] 0 1
## attr(,"contrasts")
## attr(,"contrasts")$group
## [1] "contr.treatment"
This add intercept of one column of 1
y <- estimateGLMCommonDisp(y,design)
y <- estimateGLMTrendedDisp(y,design)
## Loading required package: splines
y <- estimateGLMTagwiseDisp(y,design)
fit <- glmFit(y,design)
lrt <- glmLRT(fit,coef=2)
topTags(lrt)
## Coefficient: group2
## logFC logCPM LR PValue FDR
## gene7338 4.964 6.965 11.826 0.0005842 0.7946
## gene9143 5.186 6.871 11.603 0.0006584 0.7946
## gene8184 -4.270 7.061 11.289 0.0007798 0.7946
## gene775 3.853 7.124 11.099 0.0008637 0.7946
## gene9348 -4.582 6.946 10.962 0.0009298 0.7946
## gene2687 4.447 6.963 10.909 0.0009568 0.7946
## gene9483 4.866 6.842 10.864 0.0009807 0.7946
## gene6236 -3.904 7.077 10.449 0.0012272 0.7946
## gene8420 3.766 7.088 10.144 0.0014477 0.7946
## gene4640 3.788 7.063 9.918 0.0016368 0.7946
par(mfrow=c(2,1))
plot(et$table$logFC,-log10(et$table$PValue),col="red",cex=0.8,xlab="logFC",ylab="-log10(P)",main="Valcano plot for test data - classical model")
plot(lrt$table$logFC,-log10(lrt$table$PValue),col="red",cex=0.8,xlab="logFC",ylab="-log10(P)",main="Valcano plot for test data - linear model")