3/23/2020
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
rfu <- function(pname,ver="3.10"){
if(! requireNamespace(pname) == TRUE) { BiocManager::install(pname, version = ver)
}}
rfu("limma")
rfu("edgeR")
rfu("DESeq2")
rfu("DESeq")
rfu("pasilla")
rfu("biomaRt")
rfu("pheatmap")
rm(rfu)
library(limma)
library(edgeR)
library(DESeq2)
library(biomaRt)
library(pasilla)
library(pheatmap)
data("pasillaGenes")
#- Observe pasillaGenes
pasillaGenes
## CountDataSet (storageMode: environment)
## assayData: 14470 features, 7 samples
## element names: counts
## protocolData: none
## phenoData
## sampleNames: treated1fb treated2fb ... untreated4fb (7 total)
## varLabels: sizeFactor condition type
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## pubMedIds: 20921232
## Annotation:
pData(pasillaGenes)
## sizeFactor condition type
## treated1fb NA treated single-read
## treated2fb NA treated paired-end
## treated3fb NA treated paired-end
## untreated1fb NA untreated single-read
## untreated2fb NA untreated single-read
## untreated3fb NA untreated paired-end
## untreated4fb NA untreated paired-end
head(rownames(pasillaGenes))
## [1] "FBgn0000003" "FBgn0000008" "FBgn0000014" "FBgn0000015" "FBgn0000017"
## [6] "FBgn0000018"
library(biomaRt)
#- want to get symbols for ensembl IDs
# for ease, we focus on unique names
ensembl = biomaRt::useMart("ensembl")
ensembl = useDataset("dmelanogaster_gene_ensembl",mart=ensembl)
names = getBM(attributes= c("ensembl_gene_id","external_gene_name"),
filters=c("ensembl_gene_id"),
values=rownames(pasillaGenes), mart=ensembl)
## Cache found
pasillaGenes = pasillaGenes[names[,1],]
featureData(pasillaGenes)$ID = names[,1]
featureData(pasillaGenes)$name = names[,2]
#- we make a DGEList to follow the limma guilde
dgel = DGEList(counts(pasillaGenes),genes=fData(pasillaGenes),samples=pData(pasillaGenes))
#- we fileter for genes with >1 cpm and focus on the first six samples
ind = rowSums(cpm(dgel) > 1) >=3
dgel = dgel[ind,1:6]
#- here wer are normalizing for library size globally:
dgel = calcNormFactors(dgel)
dgel
## An object of class "DGEList"
## $counts
## treated1fb treated2fb treated3fb untreated1fb untreated2fb
## FBgn0000008 78 46 43 47 89
## FBgn0000017 3187 1672 1859 2445 4615
## FBgn0000018 369 150 176 288 383
## FBgn0000032 942 465 536 767 956
## FBgn0000037 10 10 11 7 13
## untreated3fb
## FBgn0000008 53
## FBgn0000017 2063
## FBgn0000018 135
## FBgn0000032 464
## FBgn0000037 3
## 7298 more rows ...
##
## $samples
## group lib.size norm.factors sizeFactor condition type
## treated1fb 1 8079057 1.0700723 NA treated single-read
## treated2fb 1 4622960 0.9794198 NA treated paired-end
## treated3fb 1 5311504 0.9771941 NA treated paired-end
## untreated1fb 1 5957089 1.0161526 NA untreated single-read
## untreated2fb 1 9466748 1.0042259 NA untreated single-read
## untreated3fb 1 4306561 0.9568565 NA untreated paired-end
##
## $genes
## ID name
## FBgn0000008 FBgn0000008 a
## FBgn0000017 FBgn0000017 Abl
## FBgn0000018 FBgn0000018 abo
## FBgn0000032 FBgn0000032 Acph-1
## FBgn0000037 FBgn0000037 mAChR-A
## 7298 more rows ...
#- look at PCA/MDS plot
plotMDS(dgel)

#- make a design matrix including batch effect
pasilla = factor(dgel$samples$condition)
batch = factor(c(1,2,3,1,2,3)) #- this may or may not be correct, because the pasilla package does not have this info
design = model.matrix(~ batch + pasilla)
design
## (Intercept) batch2 batch3 pasillauntreated
## 1 1 0 0 0
## 2 1 1 0 0
## 3 1 0 1 0
## 4 1 0 0 1
## 5 1 1 0 1
## 6 1 0 1 1
## attr(,"assign")
## [1] 0 1 1 2
## attr(,"contrasts")
## attr(,"contrasts")$batch
## [1] "contr.treatment"
##
## attr(,"contrasts")$pasilla
## [1] "contr.treatment"
v = voom(dgel,design,plot=TRUE)

#- explicitly remove batch effect from expression
v.plot = removeBatchEffect(v,batch=batch,covariates=log(v$targets$lib.size))
par(mfcol=c(1,2))
plotMDS(v.plot,main="Batch adjusted");
plotMDS(v, main="Unadjusted");

fit = lmFit(v, design)
fit.de = eBayes(fit, robust=TRUE)
topTable(fit.de, coef="pasillauntreated")
## ID name logFC AveExpr t P.Value
## FBgn0029167 FBgn0029167 Hml 2.217986 8.046088 16.68428 4.453998e-10
## FBgn0039155 FBgn0039155 Kal1 4.444618 4.855747 17.94009 2.279703e-10
## FBgn0035085 FBgn0035085 CG3770 2.452097 5.365380 14.48065 2.516949e-09
## FBgn0026562 FBgn0026562 SPARC 2.216102 11.313544 13.47949 8.215201e-09
## FBgn0001226 FBgn0001226 Hsp27 -1.795897 6.718836 -12.81962 1.093380e-08
## FBgn0029896 FBgn0029896 CG3168 2.568955 4.837385 12.78952 1.124560e-08
## FBgn0038832 FBgn0038832 CG15695 2.700017 4.217363 11.97468 2.462383e-08
## FBgn0040091 FBgn0040091 Ugt317A1 1.505602 6.408523 11.30994 4.829463e-08
## FBgn0034736 FBgn0034736 gas 3.279549 3.427036 11.66987 3.340106e-08
## FBgn0000071 FBgn0000071 Ama -2.708966 4.531278 -11.13247 7.784362e-08
## adj.P.Val B
## FBgn0029167 1.626377e-06 13.601256
## FBgn0039155 1.626377e-06 13.215554
## FBgn0035085 6.127092e-06 11.800784
## FBgn0026562 1.368777e-05 10.768462
## FBgn0001226 1.368777e-05 10.490034
## FBgn0029896 1.368777e-05 10.321666
## FBgn0038832 2.568969e-05 9.476004
## FBgn0040091 3.918841e-05 9.015791
## FBgn0034736 3.049099e-05 8.780088
## FBgn0000071 5.207492e-05 8.490541
#- all genes
atab = topTable(fit.de, coef="pasillauntreated",number = nrow(dgel))
#- controling FDR at 5%
stab = atab[atab$adj.P.Val<=.05,]
ord = order(stab$P.Value,decreasing=FALSE)[1:20]
deg = rownames(stab)[ord]
pheatmap(v.plot[deg,],scale="row", border_color = "white",labels_row=v$genes[deg,"name"],fontsize_row=9,cellheight=11,cellwidth=11)
