fdata6 = read.delim("jun10_2016_fdata6.txt",
stringsAsFactors = F,
check.names = F)
noness = filter(fdata6, fdata6$essential == 0)
noness = noness %>% arrange(strain)
Preprocessing for low counts:
read the count matrix
filter out rows where all counts are < 50
filter out rows where any control count is < 50
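The two filters are applied below through myall_less50() and mymin50(), which are not defined in this document. A minimal sketch of what they might do, assuming w holds the control column indices and 50 is the count cutoff:
# sketch of myall_less50(): drop rows whose counts are below the cutoff
# in every sample (assumption; the control indices w may be unused here)
myall_less50_sketch = function(counts, w, cutoff = 50) {
  counts[rowSums(counts >= cutoff) > 0, , drop = FALSE]
}
# sketch of mymin50(): drop rows with a count below the cutoff in any
# control column (assumption)
mymin50_sketch = function(counts, w, cutoff = 50) {
  counts[apply(counts[, w, drop = FALSE], 1, min) >= cutoff, , drop = FALSE]
}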
xbar= as.matrix(read.delim('aug6_2016_hom_barseq.txt',header = T,stringsAsFactors =F,check.names = F,strip.white = T))
p11 = read.delim("oct2_phsbar.txt",header = T,stringsAsFactors = F,check.names = F)
w11 = which(p11$type == 'ctrl')
lp11 = p11[-w11,]
#filter out essential strains
wnebar = which(noness$strain %in% rownames(xbar))
cat(paste0(' # of rows before filtering: ',nrow(xbar)))
## # of rows before filtering: 12476
hsbar = myall_less50(xbar[noness$strain[wnebar],p11$name],w11)
cat(paste0(' # of rows after filtering for ALL cnts < 50: ',nrow(hsbar)))
## # of rows after filtering for ALL cnts < 50: 9412
hsbar = mymin50(hsbar,w11)
cat(paste0(' # of rows after filtering cnts less 50 in ANY control: ',nrow(hsbar)))
## # of rows after filtering cnts less 50 in ANY control: 9038
Retrieve normalized counts from edgeR and DESeq2, and also normalize by count without a package.
group = model factor
ref = reference condition
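mydds() and myedgeR() are not defined in the text; minimal sketches of the wrappers they appear to be, assuming each returns a matrix of normalized counts (consistent with the DESeq2 fitting messages and the edgeR dispersion line printed below, but the return values are assumptions):
# sketch of mydds(): wrap DESeq2 and return the normalized counts (assumption)
mydds_sketch = function(counts, groups) {
  suppressMessages(library(DESeq2))
  dds = DESeqDataSetFromMatrix(countData = counts,
                               colData = data.frame(cond = groups),
                               design = ~ cond)
  dds = DESeq(dds)                       # size factors, dispersions, model fit
  DESeq2::counts(dds, normalized = TRUE)
}
# sketch of myedgeR(): wrap edgeR TMM normalization and dispersion
# estimation, reporting the common dispersion and BCV (assumption)
myedgeR_sketch = function(counts, group, ref) {
  suppressMessages(library(edgeR))
  group = relevel(factor(group), ref = ref)
  y = calcNormFactors(DGEList(counts = counts, group = group))
  y = estimateDisp(y)
  cat('Disp =', round(y$common.dispersion, 5),
      ', BCV =', round(sqrt(y$common.dispersion), 4), '\n')
  cpm(y, normalized.lib.sizes = TRUE)
}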
hsbar = hsbar[,p11$name]
p11$cond = factor(p11$cond)
p11$cond = relevel(p11$cond,ref='sc')
hdds = mydds(hsbar,groups = p11$cond)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
w11 = which(p11$type == 'ctrl')
hedge = myedgeR(hsbar,group = p11$cond,ref = 'sc')
## Disp = 0.04536 , BCV = 0.213
Post-process the normalized count matrix, collapsing its dimensions with:
mysumtags = sum counts from the up and down tags
mysumcond = sum counts from each condition
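myproc_normcounts() itself is not shown; a minimal sketch of the collapse it appears to perform, assuming the tag rownames contain 'uptag'/'downtag' (as the grep() calls further down suggest), the two printed dimensions come from before and after the tag collapse, and conditions are collapsed by replicate means:
# sketch of myproc_normcounts() under the assumptions above
myproc_normcounts_sketch = function(normcounts, cond) {
  print(dim(normcounts))
  strain = gsub('_?(uptag|downtag)', '', rownames(normcounts))
  bystrain = rowsum(normcounts, group = strain)        # mysumtags(): sum up + down tags
  print(dim(bystrain))
  sapply(split(seq_along(cond), cond),                 # mysumcond(): mean per condition
         function(j) rowMeans(bystrain[, j, drop = FALSE]))
}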
hdds2 = myproc_normcounts(hdds,p11$cond)
## [1] 9038 11
## [1] 4718 11
hedge2 = myproc_normcounts(hedge,p11$cond)
## [1] 9038 11
## [1] 4718 11
Compare this to the same data generated by arrays.
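mymixEMctrl() is not shown; the mixtools iteration message below suggests it fits a normal mixture to the control intensities. A speculative sketch under that assumption (what the real helper does with the fit is not shown):
library(mixtools)
# speculative sketch: fit a two-component normal mixture to the control
# columns (w = control indices); the real mymixEMctrl() presumably uses
# the fit to flag or adjust background-level measurements (assumption)
mymixEMctrl_sketch = function(x, w) {
  fit = normalmixEM(as.vector(x[, w]), k = 2)   # prints "number of iterations= ..."
  attr(x, 'mixfit') = fit
  x
}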
x11= as.matrix(read.delim('x11.txt',header = T,stringsAsFactors =F,check.names = F,strip.white = T))
x111 = mymixEMctrl(x11,w11)
## number of iterations= 136
btst = myBesttag(x111,bgThresh = 6.3)
## [1] min value of ctrl medians = 7.60542128303907
## [1] 4352 11
hbat = ComBat(btst,mod = model.matrix(~cond,p11),batch = p11$pcr)
## Found 3 batches
## Adjusting for 3 covariate(s) or covariate level(s)
## Standardizing Data across genes
## Fitting L/S model and finding priors
## Finding parametric adjustments
## Adjusting the Data
tsw = mysweep(hbat)
hsum = mysumm(tsw[,-w11],lp11$cond)
hsum is the chip (array) summary.
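mysweep() and mysumm() are not shown either; minimal sketches, assuming mysweep() centers each strain by its row median (the real helper may instead use the control-column median) and mysumm() averages the replicate columns within each condition:
# sketch of mysweep(): subtract each strain's row median (assumption)
mysweep_sketch = function(x) {
  sweep(x, 1, apply(x, 1, median), '-')
}
# sketch of mysumm(): average replicate columns within each condition (assumption)
mysumm_sketch = function(x, cond) {
  sapply(split(seq_along(cond), cond),
         function(j) rowMeans(x[, j, drop = FALSE]))
}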
Comparison to normalizing by counts per experiment.
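mybarnorm() is the package-free, per-experiment normalization; a minimal sketch, assuming it simply rescales each column to a common library size:
# sketch of mybarnorm(): rescale each experiment (column) to the mean
# library size, i.e. normalization by total counts (assumption)
mybarnorm_sketch = function(counts) {
  libsize = colSums(counts)
  sweep(counts, 2, libsize, '/') * mean(libsize)
}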
tst = mybarnorm(hsbar)
tst2 = myproc_normcounts(tst,p11$cond)
## [1] 9038 11
## [1] 4718 11
noRpkg = tst2[,colnames(hsum)]
Plot comparisons.
Get the barseq YPD data.
xobar= as.matrix(read.delim('count_matrix_aug4_noctrl_celnames.txt',
header = T,stringsAsFactors = F,sep = "\t",row.names = 1,check.names = F,as.is=TRUE))
perica = read.delim("jun10_2016_pfinal_erica.txt",header = T,stringsAsFactors=F, check.names = F)
w= which(perica$celfile %in% colnames(xobar))
pbar = perica[w,]
xobar = xobar[which(rownames(xobar) %in% noness$strain),pbar$celfile]
colnames(xobar)=pbar$name
#pjen = data.frame(name= colnames(xobar)[grep('Jen',colnames(xobar))])
phybar = pbar %>% filter(pool == 'hom',media == 'ypd')
poybar = pbar %>% filter(pool == 'oliver',media == 'ypd')
phybar = phybar %>% arrange(type,cond,pcr)
poybar = poybar %>% arrange(type,cond,pcr)
woybar = which(poybar$type == 'ctrl')
whybar = which(phybar$type == 'ctrl')
Filter as above for low counts:
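The filtering code for the YPD pools is not shown; a plausible reconstruction, assuming the same helpers are applied to the hom and oliver sub-matrices with their respective control columns:
# plausible reconstruction (assumption): same low-count filters as above
hybar = mymin50(myall_less50(xobar[, phybar$name], whybar), whybar)
oybar = mymin50(myall_less50(xobar[, poybar$name], woybar), woybar)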
DESeq2 and edgeR evaluation for YPD; check out the FEB17 0083 outlier for hom YPD.
hydds = mydds(hybar,phybar$cond)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
oydds = mydds(oybar,poybar$cond)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
oyedge = myedgeR(oybar,poybar$cond,ref = 'ypd')
## Disp = 0.03942 , BCV = 0.1985
hedge = myedgeR(hsbar,p11$cond,ref = 'sc')
## Disp = 0.04536 , BCV = 0.213
hyedge = myedgeR(hybar,phybar$cond,ref = 'ypd')
## Disp = 0.0503 , BCV = 0.2243
tmp = myedgeR(hybar[,-10],phybar$cond[-10],ref = 'ypd')
## Disp = 0.05012 , BCV = 0.2239
tmp = myedgeR(hybar[,-c(1,10)],phybar$cond[-c(1,10)],ref = 'ypd')
## Disp = 0.041 , BCV = 0.2025
Downstream collapsing of the YPD tags and experiments.
hyd = myproc_normcounts(hydds,phybar$cond)
## [1] 8222 15
## [1] 4444 15
oyd = myproc_normcounts(oydds,poybar$cond)
## [1] 7760 7
## [1] 4427 7
hyj = myproc_normcounts(hyedge,phybar$cond)
## [1] 8222 15
## [1] 4444 15
oyj = myproc_normcounts(oyedge,poybar$cond)
## [1] 7760 7
## [1] 4427 7
YPD array data
xochip= as.matrix(read.delim(
"xochip.txt",header = T,stringsAsFactors = F,check.names = F))
omix = xochip[,poybar$name]
omix = mymixEMctrl(omix,woybar)
## number of iterations= 223
omix = medShift.updn(omix)
omin = min(apply(omix[,woybar],1,median))
hmix = xochip[,phybar$name]
hmix = mymixEMctrl(hmix,whybar)
## number of iterations= 110
hmin = min(apply(hmix[,whybar],1,median))
bomix = myBesttag(omix,bgThresh=omin)
## [1] min value of ctrl medians = 7.38834850891002
## [1] 4423 7
bhmix = myBesttag(hmix,bgThresh=hmin)
## [1] min value of ctrl medians = 7.17596949818443
## [1] 4264 15
#before batch correction
mydendy(bhmix,phybar$pcr)
## 'dendrogram' with 2 branches and 15 members total, at height 46.95969
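mydendy() is not defined in the text; a minimal sketch, assuming it hierarchically clusters the samples (columns), tags the leaves with the supplied labels, plots the dendrogram, and prints its one-line summary:
# sketch of mydendy() under the assumptions above
mydendy_sketch = function(x, labels) {
  colnames(x) = paste(labels, colnames(x), sep = '.')
  dend = as.dendrogram(hclust(dist(t(x))))
  plot(dend)
  print(dend)    # e.g. "'dendrogram' with 2 branches and 15 members total, ..."
  invisible(dend)
}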
bhybat = ComBat(bhmix,mod=model.matrix(~cond,phybar),batch=phybar$pcr)
## Found 7 batches
## Note: one batch has only one sample, setting mean.only=TRUE
## Adjusting for 2 covariate(s) or covariate level(s)
## Standardizing Data across genes
## Fitting L/S model and finding priors
## Finding parametric adjustments
## Adjusting the Data
#after batch correction
mydendy(bhybat,phybar$cond)
## 'dendrogram' with 2 branches and 15 members total, at height 29.75203
#oliver before batch correction
mydendy(bomix,poybar$cond)
## 'dendrogram' with 2 branches and 7 members total, at height 34.75801
boybat = ComBat(bomix,mod=model.matrix(~cond,poybar),batch=poybar$pcr)
## Found 4 batches
## Note: one batch has only one sample, setting mean.only=TRUE
## Adjusting for 1 covariate(s) or covariate level(s)
## Standardizing Data across genes
## Fitting L/S model and finding priors
## Finding parametric adjustments
## Adjusting the Data
#oliver after batch correction
mydendy(boybat,poybar$cond)
## 'dendrogram' with 2 branches and 7 members total, at height 26.96797
hysw= mysweep(bhybat)
lphybar = phybar[-whybar,]
#summarize chip data
hysum = mysumm(hysw[,lphybar$name],lphybar$cond)
oysw= mysweep(boybat)
lpoybar = poybar[-woybar,]
oysum = mysumm(oysw[,lpoybar$name],lpoybar$cond)
normalize YPD data by count
noRhy = mybarnorm(hybar)
noRhy3 = myproc_normcounts(noRhy,phybar$cond)
## [1] 8222 15
## [1] 4444 15
noRhy3 = noRhy3[,colnames(hysum)]
noRoy = mybarnorm(oybar)
noRoy3 = myproc_normcounts(noRoy,poybar$cond)
## [1] 7760 7
## [1] 4427 7
noRoy3 = noRoy3[,colnames(oysum),drop=F]
plot YPD results
hom YPD
opar <- par(pch=19,mfrow=c(1,2),mar=c(2.2,2,1.4,1),cex=1.2)
for (i in 1:ncol(hysum)) {
p1(hysum,i)
p1(noRhy3,i)
p1(hyj,i)
p1(hyd,i)
}
oliver YPD
opar <- par(pch=19,mfrow=c(1,2),mar=c(2.2,2,1.4,1),cex=1.2)
for (i in 1:ncol(oysum)) {
p1(oysum,i)
p1(noRoy3,i,sig =0.7)
p1(oyj,i,sig =0.7)
p1(oyd,i,sig =0.7)
}
Compare to combining biological replicates with standard edgeR:
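mydfedge() is not defined in the text; a minimal sketch of the standard edgeR exact-test pipeline it appears to wrap, assuming it returns one logFC column per non-reference condition plus a strain column (the exact column layout is an assumption):
# sketch of mydfedge(): standard edgeR exact tests of each condition
# against the reference, returned as a data frame (assumptions as above)
mydfedge_sketch = function(counts, group, ref) {
  suppressMessages(library(edgeR))
  group = relevel(factor(group), ref = ref)
  y = estimateDisp(calcNormFactors(DGEList(counts = counts, group = group)))
  lfc = lapply(setdiff(levels(group), ref), function(g)
    topTags(exactTest(y, pair = c(ref, g)), n = Inf, sort.by = 'none')$table$logFC)
  data.frame(setNames(lfc, setdiff(levels(group), ref)),
             strain = rownames(counts), stringsAsFactors = FALSE)
}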
hs_std_edge = mydfedge(hsbar,p11$cond,ref= 'sc')
mhs_std = as.matrix(hs_std_edge[,1:5])
rownames(mhs_std) = hs_std_edge$strain
hs_sumtags = mysumtags(mybarnorm(hsbar))
## [1] 9038 11
## [1] 4718 11
hs_std_sumtags = mydfedge(hs_sumtags,p11$cond,ref= 'sc')
mhs_std_sumtags = as.matrix(hs_std_sumtags[,1:5])
rownames(mhs_std_sumtags) = hs_std_sumtags$strain
mhs_std_sumtags= mhs_std_sumtags[order(rownames(mhs_std_sumtags)),]
hy_std_edge = mydfedge(hybar,phybar$cond,ref= 'ypd')
mhy_std = as.matrix(hy_std_edge[,1:4])
rownames(mhy_std) = hy_std_edge$strain
hy_sumtags = mysumtags(mybarnorm(hybar))
## [1] 8222 15
## [1] 4444 15
hy_std_sumtags = mydfedge(hy_sumtags,phybar$cond,ref= 'ypd')
mhy_std_sumtags = as.matrix(hy_std_sumtags[,1:4])
rownames(mhy_std_sumtags) = hy_std_sumtags$strain
mhy_std_sumtags= mhy_std_sumtags[order(rownames(mhy_std_sumtags)),]
Compare the standard edgeR analysis to the normalized-count analysis from edgeR.
opar <- par(pch=19,mfrow=c(1,2),mar=c(2.2,2,1.4,1),cex=1.2)
colnames(mhs_std_sumtags) = paste0('std_edge.',colnames(mhs_std_sumtags))
colnames(mhy_std_sumtags) = paste0('std_edge.',colnames(mhy_std_sumtags))
for (i in 1:3)
psig(-mhs_std_sumtags,hedge2,i)
for (i in 1:2)
psig(-mhy_std_sumtags,hyj,i)
Compare to pre-normalizing and collapsing the up and down tags separately before the dispersion estimate.
xysum = mysumtags(hybar)
## [1] 8222 15
## [1] 4444 15
xyup = mybarnorm(hybar[grep('uptag',rownames(hybar)),])
xydn = mybarnorm(hybar[grep('downtag',rownames(hybar)),])
xyud = rbind(xyup,xydn)
xynormsum = mysumtags(xyud)
## [1] 8222 15
## [1] 4444 15
dynorm = myedgeR(xynormsum,phybar$cond,ref='ypd')
## Disp = 0.03861 , BCV = 0.1965
dysum = myedgeR(xysum,phybar$cond,ref='ypd')
## Disp = 0.03786 , BCV = 0.1946
xssum = mysumtags(hsbar)
## [1] 9038 11
## [1] 4718 11
dssum = myedgeR(xssum,p11$cond,ref='sc')
## Disp = 0.02983 , BCV = 0.1727
xsup = mybarnorm(hsbar[grep('uptag',rownames(hsbar)),])
xsdn = mybarnorm(hsbar[grep('downtag',rownames(hsbar)),])
xsud = rbind(xsup,xsdn)
xsnormsum = mysumtags(xsud)
## [1] 9038 11
## [1] 4718 11
dsnorm = myedgeR(xsnormsum,p11$cond,ref='sc')
## Disp = 0.02889 , BCV = 0.17
collapse counts by condition
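psnorm, pssum, and pynorm are not defined in the text shown here; a plausible reconstruction, assuming the same myproc_normcounts() collapse is applied to the pre-summed edgeR results above:
# plausible reconstruction (assumption)
psnorm = myproc_normcounts(dsnorm, p11$cond)     # normalized then summed SC tags
pssum  = myproc_normcounts(dssum,  p11$cond)     # summed SC tags, no pre-normalization
pynorm = myproc_normcounts(dynorm, phybar$cond)  # normalized then summed YPD tags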
Plot pre-summed up/down tags vs independent dispersion estimates for SC; it doesn't matter much.
Normalized on the left, myedgeR on the right.
opar <- par(pch=19,mfrow=c(1,2),mar=c(2.2,2,1.4,1),cex=1.2)
for (i in 1:3)
psig(psnorm,hedge2,i)
Plot pre-summed up/down tags vs normalized pre-summed up/down tags for SC; it doesn't matter much.
Normalized on the left, unnormalized on the right.
opar <- par(pch=19,mfrow=c(1,2),mar=c(2.2,2,1.4,1),cex=1.2)
for (i in 1:3)
psig(psnorm,pssum,i)
Plot pre-summed up/down tags vs independent dispersion estimates for YPD; it is better to pre-sum the up and down tags.
Normalized on the left, myedgeR on the right.
opar <- par(pch=19,mfrow=c(1,2),mar=c(2.2,2,1.4,1),cex=1.2)
for (i in 1:2)
psig(pynorm,hyj,i)