Author: Charleen D. Adams
setwd("C:/Users/chaadams/Dropbox/COH/Hispanic tumours/MD ER Results")
suppressPackageStartupMessages(library(edgeR))
load("Diff_Analysis_ER_handbook_KEGG_GO.Rdata")
results <- as.data.frame(topTags(res,n = Inf))
#head(results, n=10)
dim(results)
## [1] 22356 131
edgeR
provides a function plotSmear
that allows us to visualise the results of a DE analysis. In a similar manner to the MA-plot for microarray data, this plot shows the log-fold change against log-counts per million, with DE genes highlighted:
summary(de <- decideTestsDGE(res))
## 1*NEG -1*POS
## Down 1424
## NotSig 17593
## Up 3339
detags <- rownames(dgeObj)[as.logical(de)]
plotSmear(res, de.tags=detags)
There are a number of ways to add annotation, but I did this using the org.Hs.eg.db package. (I used biomaRt
to filter our data initially, as it came with ENSEMBL IDs. The pipeline we are using here uses ENTREZ IDs. I selected the genes for which there were unique ENTREZ IDs matched to ENSEMBL IDs in biomaRt
.)
source("http://www.bioconductor.org/biocLite.R")
# For Human
biocLite("org.Hs.eg.db")
library(org.Hs.eg.db)
library(org.Hs.eg.db)
I got the gene symbols and full gene names.
ann <- select(org.Hs.eg.db,keys=rownames(results),columns=c("ENTREZID","SYMBOL","GENENAME"))
## 'select()' returned 1:1 mapping between keys and columns
# Have a look at the annotation
head(ann, n=2)
## ENTREZID SYMBOL GENENAME
## 1 4103 MAGEA4 MAGE family member A4
## 2 100874155 LINC00392 long intergenic non-protein coding RNA 392
I double checked that the ENTREZID
column matched exactly to our results
rownames.
table(ann$ENTREZID==rownames(results))
##
## TRUE
## 22356
I bound the annotation information to the results
data frame.
results.annotated <- cbind(results, ann)
head(results.annotated$GENENAME, n=2)
## [1] "MAGE family member A4"
## [2] "long intergenic non-protein coding RNA 392"
I saved results table using the write.csv
function.
write.csv(results.annotated,file="Annotated_ER_Results.csv",row.names=FALSE)
A note about deciding how many genes are significant: decideTests
function displays significant genes at 5% FDR.
The volcano plot displays a measure of significance on the y-axis and fold-change on the x-axis.
signif <- -log10(results.annotated$FDR)
plot(results.annotated$logFC,signif,pch=16)
points(results.annotated[detags,"logFC"],-log10(results.annotated[detags,"FDR"]),pch=16,col="red")
Here is an example sanity check to have a look at the expression levels of the individual samples for a genes of interest. I looked at grouped expression using stripchart
. I used the normalised log expression values in the dgeCounts
object (dgeCounts$counts
).
library(RColorBrewer)
par(mfrow=c(1,3))
normCounts <- dgeObj$counts
# Let's look at the most signficiant gene, MAGEA4, which has a rowname 4103
stripchart(normCounts["4103",]~groupER)
# This plot is ugly, let's make it better
stripchart(normCounts["4103",]~groupER,vertical=TRUE,las=2,cex.axis=0.8,pch=16,col=1:6,method="jitter")
# Let's use nicer colours
nice.col <- brewer.pal(6,name="Dark2")
stripchart(normCounts["1",]~groupER,vertical=TRUE,las=2,cex.axis=0.8,pch=16,cex=1.3,col=nice.col,method="jitter",ylab="Normalised log2 expression",main=" MAGEA4\n(MAGE family member A4)")
MD plot showing the log-fold change and average abundance of each gene. Signficantly up and down DE genes are highlighted in red and blue respectively.
library(org.Hs.eg.db)
library(statmod)
design <- model.matrix(~0+groupER)
colnames(design) <- levels(groupER)
y <- DGEList(counts.keep,group=groupER, genes=counts.keep)
options(digits=3)
# Add the gene names to the DGEList object
y$genes$Symbol <- rownames(counts.keep)
y <- estimateDisp(y,design, robust=TRUE)
## estimation of QL dispersions
fit <- glmQLFit(y, design, robust=TRUE)
head(fit$coefficients)
## Missing NEG POS
## 1 -13.71 -13.62 -13.42
## 10 -16.36 -14.56 -14.97
## 100 -12.39 -11.89 -12.14
## 1000 -12.63 -11.43 -11.45
## 10000 -9.87 -8.94 -9.35
## 100009667 -12.39 -12.90 -12.95
plotQLDisp(fit)
#summary(fit$df.prior)
## Differential expression analysis
B.LvsP <- makeContrasts(NEG-POS, levels=design)
res <- glmQLFTest(fit, contrast=B.LvsP)
#topTags(res)
is.de <- decideTestsDGE(res)
plotMD(res, status=is.de, values=c(1,-1), col=c("red", "blue"),
legend="topright")
MD plot showing the log-fold change and average abundance of each gene. Genes with fold-changes significantly greater than 1.5 are highlighted
tr <- glmTreat(fit, contrast=B.LvsP, lfc=log2(1.5))
#topTags(tr)
is.de <- decideTestsDGE(tr)
is.de_tr=summary(is.de)
plotMD(tr, status=is.de, values=c(1,-1), col=c("red", "blue"),
legend="topright")
text(results.annotated$logCPM[1:100],results.annotated$logFC[1:100],labels = results.annotated$SYMBOL[1:100],col="blue")
library(gplots)
logCPM <- cpm(y, prior.count=2, log=TRUE)
rownames(logCPM) <- y$genes$Symbol
colnames(logCPM) <- paste(y$samples$group, 1:2, sep="-")
o <- order(tr$table$PValue)
logCPM <- logCPM[o[1:30],]
logCPM <- t(scale(t(logCPM)))
col.pan <- colorpanel(100, "blue", "white", "red")
heatmap.2(logCPM, col=col.pan, Rowv=TRUE, scale="none",
trace="none", dendrogram="both", cexRow=1, cexCol=1.4,
density.info="none",
margin=c(10,9), lhei=c(2,10), lwid=c(2,6))
go <- goana(tr, species="Hs")
topGO(go, n=15)
## Term Ont N Up Down P.Up
## GO:0070268 cornification BP 78 22 0 3.38e-14
## GO:0032501 multicellular organismal process BP 6213 322 106 8.42e-14
## GO:0031424 keratinization BP 94 23 0 2.49e-13
## GO:0031224 intrinsic component of membrane CC 4496 245 79 3.01e-12
## GO:0016021 integral component of membrane CC 4376 238 77 9.02e-12
## GO:0009888 tissue development BP 1700 116 27 2.63e-11
## GO:0003008 system process BP 1567 108 32 7.91e-11
## GO:0044425 membrane part CC 5674 287 90 9.90e-11
## GO:0044449 contractile fiber part CC 208 30 1 1.23e-10
## GO:0043292 contractile fiber CC 224 31 1 1.74e-10
## GO:0030216 keratinocyte differentiation BP 162 26 1 1.96e-10
## GO:0030016 myofibril CC 213 30 1 2.22e-10
## GO:0048856 anatomical structure development BP 5128 263 79 2.49e-10
## GO:0030017 sarcomere CC 191 28 1 3.46e-10
## GO:0033275 actin-myosin filament sliding BP 38 13 0 4.13e-10
## P.Down
## GO:0070268 1.00e+00
## GO:0032501 5.56e-05
## GO:0031424 1.00e+00
## GO:0031224 3.29e-04
## GO:0016021 3.87e-04
## GO:0009888 1.00e-01
## GO:0003008 3.16e-03
## GO:0044425 3.24e-03
## GO:0044449 9.24e-01
## GO:0043292 9.38e-01
## GO:0030216 8.65e-01
## GO:0030016 9.29e-01
## GO:0048856 1.33e-02
## GO:0030017 9.06e-01
## GO:0033275 1.00e+00
topGO=topGO(go, n=15)
keg <- kegga(tr, species="Hs")
topKEGG(keg, n=15, truncate=34)
## Pathway N Up Down P.Up
## path:hsa04080 Neuroactive ligand-receptor int... 224 21 7 7.44e-05
## path:hsa00601 Glycosphingolipid biosynthesis ... 26 6 0 2.80e-04
## path:hsa04512 ECM-receptor interaction 80 3 6 5.60e-01
## path:hsa04972 Pancreatic secretion 79 10 4 5.71e-04
## path:hsa04151 PI3K-Akt signaling pathway 319 8 12 8.98e-01
## path:hsa00982 Drug metabolism - cytochrome P4... 57 4 5 1.53e-01
## path:hsa00980 Metabolism of xenobiotics by cy... 58 4 5 1.60e-01
## path:hsa05204 Chemical carcinogenesis 64 6 5 2.86e-02
## path:hsa00983 Drug metabolism - other enzymes 68 4 5 2.34e-01
## path:hsa00053 Ascorbate and aldarate metaboli... 19 2 3 1.51e-01
## path:hsa04974 Protein digestion and absorptio... 76 9 3 1.72e-03
## path:hsa05410 Hypertrophic cardiomyopathy (HC... 79 9 2 2.25e-03
## path:hsa04510 Focal adhesion 195 9 8 2.80e-01
## path:hsa04923 Regulation of lipolysis in adip... 50 2 4 5.47e-01
## path:hsa05414 Dilated cardiomyopathy (DCM) 85 9 2 3.72e-03
## P.Down
## path:hsa04080 0.020886
## path:hsa00601 1.000000
## path:hsa04512 0.000451
## path:hsa04972 0.016186
## path:hsa04151 0.000612
## path:hsa00982 0.000664
## path:hsa00980 0.000719
## path:hsa05204 0.001127
## path:hsa00983 0.001481
## path:hsa00053 0.001526
## path:hsa04974 0.066695
## path:hsa05410 0.252453
## path:hsa04510 0.002879
## path:hsa04923 0.003266
## path:hsa05414 0.279717
topKEGG=topKEGG(keg, n=15, truncate=34)
ggplot2
to generate plots from above
library(ggplot2)
ggplot(results, aes(x = logCPM, y=logFC,col=FDR < 0.05)) + geom_point(alpha=0.4) + scale_colour_manual(values=c("black","red"))
The volcano plot can be constructed in a similar manner
ggplot(results, aes(x = logFC, y=-log10(FDR))) + geom_point()