HWK#4 Passila-practical

Abish Pius

3/23/2020

if (!requireNamespace("BiocManager", quietly = TRUE))
    install.packages("BiocManager")
 
rfu <- function(pname,ver="3.10"){
  if(! requireNamespace(pname) == TRUE) {    BiocManager::install(pname, version = ver)
  }}

rfu("limma")
rfu("edgeR")
rfu("DESeq2")
rfu("DESeq")
rfu("pasilla")
rfu("biomaRt")
rfu("pheatmap")
rm(rfu)

library(limma)
library(edgeR)
library(DESeq2)
library(biomaRt)
library(pasilla)
library(pheatmap)
data("pasillaGenes")

 #- Observe pasillaGenes
pasillaGenes

## CountDataSet (storageMode: environment)
## assayData: 14470 features, 7 samples 
##   element names: counts 
## protocolData: none
## phenoData
##   sampleNames: treated1fb treated2fb ... untreated4fb (7 total)
##   varLabels: sizeFactor condition type
##   varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
##   pubMedIds: 20921232 
## Annotation:

pData(pasillaGenes)

##              sizeFactor condition        type
## treated1fb           NA   treated single-read
## treated2fb           NA   treated  paired-end
## treated3fb           NA   treated  paired-end
## untreated1fb         NA untreated single-read
## untreated2fb         NA untreated single-read
## untreated3fb         NA untreated  paired-end
## untreated4fb         NA untreated  paired-end

head(rownames(pasillaGenes))

## [1] "FBgn0000003" "FBgn0000008" "FBgn0000014" "FBgn0000015" "FBgn0000017"
## [6] "FBgn0000018"

library(biomaRt)

#- want to get symbols for ensembl IDs
#  for ease, we focus on unique names

ensembl = biomaRt::useMart("ensembl")
ensembl = useDataset("dmelanogaster_gene_ensembl",mart=ensembl)
names   = getBM(attributes= c("ensembl_gene_id","external_gene_name"),
                filters=c("ensembl_gene_id"),
                values=rownames(pasillaGenes), mart=ensembl)

## Cache found

pasillaGenes                   = pasillaGenes[names[,1],]
featureData(pasillaGenes)$ID   = names[,1]
featureData(pasillaGenes)$name = names[,2]

#- we make a DGEList to follow the limma guilde
dgel = DGEList(counts(pasillaGenes),genes=fData(pasillaGenes),samples=pData(pasillaGenes))

#- we fileter for genes with >1 cpm and focus on the first six samples
ind  =  rowSums(cpm(dgel) > 1) >=3
dgel = dgel[ind,1:6]
 
#- here wer are normalizing for library size globally:
dgel = calcNormFactors(dgel)
 
dgel

## An object of class "DGEList"
## $counts
##             treated1fb treated2fb treated3fb untreated1fb untreated2fb
## FBgn0000008         78         46         43           47           89
## FBgn0000017       3187       1672       1859         2445         4615
## FBgn0000018        369        150        176          288          383
## FBgn0000032        942        465        536          767          956
## FBgn0000037         10         10         11            7           13
##             untreated3fb
## FBgn0000008           53
## FBgn0000017         2063
## FBgn0000018          135
## FBgn0000032          464
## FBgn0000037            3
## 7298 more rows ...
## 
## $samples
##              group lib.size norm.factors sizeFactor condition        type
## treated1fb       1  8079057    1.0700723         NA   treated single-read
## treated2fb       1  4622960    0.9794198         NA   treated  paired-end
## treated3fb       1  5311504    0.9771941         NA   treated  paired-end
## untreated1fb     1  5957089    1.0161526         NA untreated single-read
## untreated2fb     1  9466748    1.0042259         NA untreated single-read
## untreated3fb     1  4306561    0.9568565         NA untreated  paired-end
## 
## $genes
##                      ID    name
## FBgn0000008 FBgn0000008       a
## FBgn0000017 FBgn0000017     Abl
## FBgn0000018 FBgn0000018     abo
## FBgn0000032 FBgn0000032  Acph-1
## FBgn0000037 FBgn0000037 mAChR-A
## 7298 more rows ...

 #- look at PCA/MDS plot
plotMDS(dgel)

#- make a design matrix including batch effect
 pasilla = factor(dgel$samples$condition)
batch   = factor(c(1,2,3,1,2,3)) #- this may or may not be correct, because the pasilla package does not have this info
design  = model.matrix(~ batch + pasilla)
design

##   (Intercept) batch2 batch3 pasillauntreated
## 1           1      0      0                0
## 2           1      1      0                0
## 3           1      0      1                0
## 4           1      0      0                1
## 5           1      1      0                1
## 6           1      0      1                1
## attr(,"assign")
## [1] 0 1 1 2
## attr(,"contrasts")
## attr(,"contrasts")$batch
## [1] "contr.treatment"
## 
## attr(,"contrasts")$pasilla
## [1] "contr.treatment"

v = voom(dgel,design,plot=TRUE)

#- explicitly remove batch effect from expression
v.plot = removeBatchEffect(v,batch=batch,covariates=log(v$targets$lib.size))
par(mfcol=c(1,2))
plotMDS(v.plot,main="Batch adjusted");
plotMDS(v, main="Unadjusted");

fit    = lmFit(v, design)
fit.de = eBayes(fit, robust=TRUE)
topTable(fit.de, coef="pasillauntreated")

##                      ID     name     logFC   AveExpr         t      P.Value
## FBgn0029167 FBgn0029167      Hml  2.217986  8.046088  16.68428 4.453998e-10
## FBgn0039155 FBgn0039155     Kal1  4.444618  4.855747  17.94009 2.279703e-10
## FBgn0035085 FBgn0035085   CG3770  2.452097  5.365380  14.48065 2.516949e-09
## FBgn0026562 FBgn0026562    SPARC  2.216102 11.313544  13.47949 8.215201e-09
## FBgn0001226 FBgn0001226    Hsp27 -1.795897  6.718836 -12.81962 1.093380e-08
## FBgn0029896 FBgn0029896   CG3168  2.568955  4.837385  12.78952 1.124560e-08
## FBgn0038832 FBgn0038832  CG15695  2.700017  4.217363  11.97468 2.462383e-08
## FBgn0040091 FBgn0040091 Ugt317A1  1.505602  6.408523  11.30994 4.829463e-08
## FBgn0034736 FBgn0034736      gas  3.279549  3.427036  11.66987 3.340106e-08
## FBgn0000071 FBgn0000071      Ama -2.708966  4.531278 -11.13247 7.784362e-08
##                adj.P.Val         B
## FBgn0029167 1.626377e-06 13.601256
## FBgn0039155 1.626377e-06 13.215554
## FBgn0035085 6.127092e-06 11.800784
## FBgn0026562 1.368777e-05 10.768462
## FBgn0001226 1.368777e-05 10.490034
## FBgn0029896 1.368777e-05 10.321666
## FBgn0038832 2.568969e-05  9.476004
## FBgn0040091 3.918841e-05  9.015791
## FBgn0034736 3.049099e-05  8.780088
## FBgn0000071 5.207492e-05  8.490541

#- all genes
atab = topTable(fit.de, coef="pasillauntreated",number = nrow(dgel))
#- controling FDR at 5%
stab = atab[atab$adj.P.Val<=.05,]
ord  = order(stab$P.Value,decreasing=FALSE)[1:20]
deg  = rownames(stab)[ord]
pheatmap(v.plot[deg,],scale="row", border_color = "white",labels_row=v$genes[deg,"name"],fontsize_row=9,cellheight=11,cellwidth=11)