Author: Charleen D. Adams

setwd("C:/Users/chaadams/Dropbox/COH/Hispanic tumours/MD ER Results")
suppressPackageStartupMessages(library(edgeR))
load("Diff_Analysis_ER_handbook_KEGG_GO.Rdata")

Overview

results <- as.data.frame(topTags(res,n = Inf))
#head(results, n=10)
dim(results)
## [1] 22356   131

edgeR provides a function plotSmear that allows us to visualise the results of a DE analysis. In a similar manner to the MA-plot for microarray data, this plot shows the log-fold change against log-counts per million, with DE genes highlighted:

summary(de <- decideTestsDGE(res))
##        1*NEG -1*POS
## Down           1424
## NotSig        17593
## Up             3339
detags <- rownames(dgeObj)[as.logical(de)]
plotSmear(res, de.tags=detags)

Adding annotation to the edgeR results

There are a number of ways to add annotation, but I did this using the org.Hs.eg.db package. (I used biomaRt to filter our data initially, as it came with ENSEMBL IDs. The pipeline we are using here uses ENTREZ IDs. I selected the genes for which there were unique ENTREZ IDs matched to ENSEMBL IDs in biomaRt.)

source("http://www.bioconductor.org/biocLite.R")
# For Human
biocLite("org.Hs.eg.db")
library(org.Hs.eg.db)
library(org.Hs.eg.db)

I got the gene symbols and full gene names.

ann <- select(org.Hs.eg.db,keys=rownames(results),columns=c("ENTREZID","SYMBOL","GENENAME"))
## 'select()' returned 1:1 mapping between keys and columns
# Have a look at the annotation
head(ann, n=2)
##    ENTREZID    SYMBOL                                   GENENAME
## 1      4103    MAGEA4                      MAGE family member A4
## 2 100874155 LINC00392 long intergenic non-protein coding RNA 392

I double checked that the ENTREZID column matched exactly to our results rownames.

table(ann$ENTREZID==rownames(results)) 
## 
##  TRUE 
## 22356

I bound the annotation information to the results data frame.

results.annotated <- cbind(results, ann)
head(results.annotated$GENENAME, n=2)
## [1] "MAGE family member A4"                     
## [2] "long intergenic non-protein coding RNA 392"

I saved results table using the write.csv function.

write.csv(results.annotated,file="Annotated_ER_Results.csv",row.names=FALSE)

A note about deciding how many genes are significant: decideTests function displays significant genes at 5% FDR.

The volcano plot displays a measure of significance on the y-axis and fold-change on the x-axis.

signif <- -log10(results.annotated$FDR)
plot(results.annotated$logFC,signif,pch=16)
points(results.annotated[detags,"logFC"],-log10(results.annotated[detags,"FDR"]),pch=16,col="red")

Here is an example sanity check to have a look at the expression levels of the individual samples for a genes of interest. I looked at grouped expression using stripchart. I used the normalised log expression values in the dgeCounts object (dgeCounts$counts).

library(RColorBrewer)
par(mfrow=c(1,3))
normCounts <- dgeObj$counts
# Let's look at the most signficiant gene, MAGEA4, which has a rowname 4103
stripchart(normCounts["4103",]~groupER)
# This plot is ugly, let's make it better
stripchart(normCounts["4103",]~groupER,vertical=TRUE,las=2,cex.axis=0.8,pch=16,col=1:6,method="jitter")
# Let's use nicer colours
nice.col <- brewer.pal(6,name="Dark2")
stripchart(normCounts["1",]~groupER,vertical=TRUE,las=2,cex.axis=0.8,pch=16,cex=1.3,col=nice.col,method="jitter",ylab="Normalised log2 expression",main="   MAGEA4\n(MAGE family member A4)")

More MD plots

MD plot showing the log-fold change and average abundance of each gene. Signficantly up and down DE genes are highlighted in red and blue respectively.

library(org.Hs.eg.db)
library(statmod)
design <- model.matrix(~0+groupER)
colnames(design) <- levels(groupER)
y <- DGEList(counts.keep,group=groupER, genes=counts.keep)
options(digits=3)

# Add the gene names to the DGEList object
y$genes$Symbol <- rownames(counts.keep) 
y <- estimateDisp(y,design, robust=TRUE)

## estimation of QL dispersions
fit <- glmQLFit(y, design, robust=TRUE)
head(fit$coefficients)
##           Missing    NEG    POS
## 1          -13.71 -13.62 -13.42
## 10         -16.36 -14.56 -14.97
## 100        -12.39 -11.89 -12.14
## 1000       -12.63 -11.43 -11.45
## 10000       -9.87  -8.94  -9.35
## 100009667  -12.39 -12.90 -12.95
plotQLDisp(fit)

#summary(fit$df.prior)

## Differential expression analysis
B.LvsP <- makeContrasts(NEG-POS, levels=design)
res <- glmQLFTest(fit, contrast=B.LvsP)
#topTags(res)
is.de <- decideTestsDGE(res)
plotMD(res, status=is.de, values=c(1,-1), col=c("red", "blue"),
       legend="topright")

MD plot showing the log-fold change and average abundance of each gene. Genes with fold-changes significantly greater than 1.5 are highlighted

tr <- glmTreat(fit, contrast=B.LvsP, lfc=log2(1.5))
#topTags(tr)

is.de <- decideTestsDGE(tr)
is.de_tr=summary(is.de)
plotMD(tr, status=is.de, values=c(1,-1), col=c("red", "blue"),
       legend="topright")
text(results.annotated$logCPM[1:100],results.annotated$logFC[1:100],labels = results.annotated$SYMBOL[1:100],col="blue")

Heat map clustering

library(gplots)
logCPM <- cpm(y, prior.count=2, log=TRUE)
rownames(logCPM) <- y$genes$Symbol
colnames(logCPM) <- paste(y$samples$group, 1:2, sep="-")
o <- order(tr$table$PValue)
logCPM <- logCPM[o[1:30],]
logCPM <- t(scale(t(logCPM)))

col.pan <- colorpanel(100, "blue", "white", "red")
          heatmap.2(logCPM, col=col.pan, Rowv=TRUE, scale="none",
          trace="none", dendrogram="both", cexRow=1, cexCol=1.4, 
          density.info="none", 
          margin=c(10,9), lhei=c(2,10), lwid=c(2,6))

Gene ontology analysis

go <-  goana(tr, species="Hs")
topGO(go, n=15)
##                                        Term Ont    N  Up Down     P.Up
## GO:0070268                    cornification  BP   78  22    0 3.38e-14
## GO:0032501 multicellular organismal process  BP 6213 322  106 8.42e-14
## GO:0031424                   keratinization  BP   94  23    0 2.49e-13
## GO:0031224  intrinsic component of membrane  CC 4496 245   79 3.01e-12
## GO:0016021   integral component of membrane  CC 4376 238   77 9.02e-12
## GO:0009888               tissue development  BP 1700 116   27 2.63e-11
## GO:0003008                   system process  BP 1567 108   32 7.91e-11
## GO:0044425                    membrane part  CC 5674 287   90 9.90e-11
## GO:0044449           contractile fiber part  CC  208  30    1 1.23e-10
## GO:0043292                contractile fiber  CC  224  31    1 1.74e-10
## GO:0030216     keratinocyte differentiation  BP  162  26    1 1.96e-10
## GO:0030016                        myofibril  CC  213  30    1 2.22e-10
## GO:0048856 anatomical structure development  BP 5128 263   79 2.49e-10
## GO:0030017                        sarcomere  CC  191  28    1 3.46e-10
## GO:0033275    actin-myosin filament sliding  BP   38  13    0 4.13e-10
##              P.Down
## GO:0070268 1.00e+00
## GO:0032501 5.56e-05
## GO:0031424 1.00e+00
## GO:0031224 3.29e-04
## GO:0016021 3.87e-04
## GO:0009888 1.00e-01
## GO:0003008 3.16e-03
## GO:0044425 3.24e-03
## GO:0044449 9.24e-01
## GO:0043292 9.38e-01
## GO:0030216 8.65e-01
## GO:0030016 9.29e-01
## GO:0048856 1.33e-02
## GO:0030017 9.06e-01
## GO:0033275 1.00e+00
topGO=topGO(go, n=15)

KEGG pathway analysis

keg <-  kegga(tr, species="Hs")
topKEGG(keg, n=15, truncate=34)
##                                          Pathway   N Up Down     P.Up
## path:hsa04080 Neuroactive ligand-receptor int... 224 21    7 7.44e-05
## path:hsa00601 Glycosphingolipid biosynthesis ...  26  6    0 2.80e-04
## path:hsa04512           ECM-receptor interaction  80  3    6 5.60e-01
## path:hsa04972               Pancreatic secretion  79 10    4 5.71e-04
## path:hsa04151         PI3K-Akt signaling pathway 319  8   12 8.98e-01
## path:hsa00982 Drug metabolism - cytochrome P4...  57  4    5 1.53e-01
## path:hsa00980 Metabolism of xenobiotics by cy...  58  4    5 1.60e-01
## path:hsa05204            Chemical carcinogenesis  64  6    5 2.86e-02
## path:hsa00983    Drug metabolism - other enzymes  68  4    5 2.34e-01
## path:hsa00053 Ascorbate and aldarate metaboli...  19  2    3 1.51e-01
## path:hsa04974 Protein digestion and absorptio...  76  9    3 1.72e-03
## path:hsa05410 Hypertrophic cardiomyopathy (HC...  79  9    2 2.25e-03
## path:hsa04510                     Focal adhesion 195  9    8 2.80e-01
## path:hsa04923 Regulation of lipolysis in adip...  50  2    4 5.47e-01
## path:hsa05414       Dilated cardiomyopathy (DCM)  85  9    2 3.72e-03
##                 P.Down
## path:hsa04080 0.020886
## path:hsa00601 1.000000
## path:hsa04512 0.000451
## path:hsa04972 0.016186
## path:hsa04151 0.000612
## path:hsa00982 0.000664
## path:hsa00980 0.000719
## path:hsa05204 0.001127
## path:hsa00983 0.001481
## path:hsa00053 0.001526
## path:hsa04974 0.066695
## path:hsa05410 0.252453
## path:hsa04510 0.002879
## path:hsa04923 0.003266
## path:hsa05414 0.279717
topKEGG=topKEGG(keg, n=15, truncate=34)

Extra plots with ggplot2

ggplot2 to generate plots from above

library(ggplot2)

ggplot(results, aes(x = logCPM, y=logFC,col=FDR < 0.05)) + geom_point(alpha=0.4) + scale_colour_manual(values=c("black","red"))

The volcano plot can be constructed in a similar manner

ggplot(results, aes(x = logFC, y=-log10(FDR))) + geom_point()