fdata6 = read.delim("jun10_2016_fdata6.txt",
    stringsAsFactors = F,
    check.names = F)
    
  noness = filter(fdata6, fdata6$essential == 0)
  noness = noness %>% arrange(strain)

preprocessing for low cnts –

count matrix
filter count matrix for ALL rows < 50
filter count matrix for all CTRLS < 50

example, SC dropouts hom samples

xbar= as.matrix(read.delim('aug6_2016_hom_barseq.txt',header = T,stringsAsFactors =F,check.names = F,strip.white = T))

p11 = read.delim("oct2_phsbar.txt",header = T,stringsAsFactors = F,check.names = F)

w11 = which(p11$type == 'ctrl')
lp11 = p11[-w11,]
#filter out essential strains
wnebar = which(noness$strain %in% rownames(xbar))

cat(paste0(' # of rows before filtering: ',nrow(xbar)))

##  # of rows before filtering: 12476

hsbar = myall_less50(xbar[noness$strain[wnebar],p11$name],w11)

cat(paste0(' # of rows after filtering for ALL cnts < 50: ',nrow(hsbar)))

##  # of rows after filtering for ALL cnts < 50: 9412

hsbar = mymin50(hsbar,w11)

cat(paste0(' # of rows after filtering cnts less 50 in ANY control: ',nrow(hsbar)))

##  # of rows after filtering cnts less 50 in ANY control: 9038

retrieve normalized counts from edgeR and DESeq2, and normalizing by count w/out package

group = model factor

ref = reference condition

hsbar = hsbar[,p11$name]

p11$cond = factor(p11$cond)
p11$cond = relevel(p11$cond,ref='sc')
hdds = mydds(hsbar,groups = p11$cond)

## Loading required package: DESeq2

## Loading required package: S4Vectors

## Loading required package: stats4

## Loading required package: BiocGenerics

## Loading required package: parallel

## 
## Attaching package: 'BiocGenerics'

## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB

## The following objects are masked from 'package:dplyr':
## 
##     combine, intersect, setdiff, union

## The following objects are masked from 'package:stats':
## 
##     IQR, mad, xtabs

## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, cbind, colnames,
##     do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, lengths, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff,
##     sort, table, tapply, union, unique, unsplit

## 
## Attaching package: 'S4Vectors'

## The following object is masked from 'package:dplyr':
## 
##     rename

## Loading required package: IRanges

## 
## Attaching package: 'IRanges'

## The following object is masked from 'package:nlme':
## 
##     collapse

## The following objects are masked from 'package:dplyr':
## 
##     collapse, desc, slice

## Loading required package: GenomicRanges

## Loading required package: GenomeInfoDb

## Loading required package: SummarizedExperiment

## Loading required package: Biobase

## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.

## Loading required package: Rcpp

## Loading required package: RcppArmadillo

## estimating size factors

## estimating dispersions

## gene-wise dispersion estimates

## mean-dispersion relationship

## final dispersion estimates

## fitting model and testing

w11 = which(p11$type == 'ctrl')

hedge = myedgeR(hsbar,group = p11$cond,ref = 'sc')

## Loading required package: limma

## 
## Attaching package: 'limma'

## The following object is masked from 'package:DESeq2':
## 
##     plotMA

## The following object is masked from 'package:BiocGenerics':
## 
##     plotMA

## Disp = 0.04536 , BCV = 0.213

mysumtags = sum counts from up and down tags

mysumcond = sum counts from each condition

post processing normalized count matrix collapse matrix dimensions by:

sum up and downtags
summing replicate condtions

hdds2 = myproc_normcounts(hdds,p11$cond)

## [1] 9038   11
## [1] 4718   11

hedge2 = myproc_normcounts(hedge,p11$cond)

## [1] 9038   11
## [1] 4718   11

compare this to same data generated by arrays

x11= as.matrix(read.delim('x11.txt',header = T,stringsAsFactors =F,check.names = F,strip.white = T))

x111 = mymixEMctrl(x11,w11)

## Loading required package: reshape2

## Loading required package: mixtools

## mixtools package, version 1.0.4, Released 2016-01-11
## This package is based upon work supported by the National Science Foundation under Grant No. SES-0518772.

## number of iterations= 136

btst = myBesttag(x111,bgThresh = 6.3)

## [1] min value of ctrl medians =  7.60542128303907            
## [1] 4352   11

hbat = ComBat(btst,mod = model.matrix(~cond,p11),batch = p11$pcr)

## Found 3 batches
## Adjusting for 3 covariate(s) or covariate level(s)
## Standardizing Data across genes
## Fitting L/S model and finding priors
## Finding parametric adjustments
## Adjusting the Data

tsw = mysweep(hbat)

hsum = mysumm(tsw[,-w11],lp11$cond)

hsum is chip

comparison to normalizing by counts/experiment

tst = mybarnorm(hsbar)

tst2 = myproc_normcounts(tst,p11$cond)

## [1] 9038   11
## [1] 4718   11

noRpkg = tst2[,colnames(hsum)]

plot comparisions

maricio the above is the bulk of the analysis, below i was ###experimenting

YPD oliver and hom

get barseq ypd data

xobar= as.matrix(read.delim('count_matrix_aug4_noctrl_celnames.txt',
header = T,stringsAsFactors = F,sep = "\t",row.names = 1,check.names = F,as.is=TRUE))

perica = read.delim("jun10_2016_pfinal_erica.txt",header = T,stringsAsFactors=F, check.names = F)

w= which(perica$celfile %in% colnames(xobar))

pbar = perica[w,]

xobar = xobar[which(rownames(xobar) %in% noness$strain),pbar$celfile]

colnames(xobar)=pbar$name

#pjen = data.frame(name= colnames(xobar)[grep('Jen',colnames(xobar))])

phybar = pbar %>% filter(pool == 'hom',media == 'ypd')
poybar = pbar %>% filter(pool == 'oliver',media == 'ypd')
phybar = phybar %>% arrange(type,cond,pcr)
poybar = poybar %>% arrange(type,cond,pcr)

woybar = which(poybar$type == 'ctrl')
whybar = which(phybar$type == 'ctrl')

filter as above for low counts

dds and edge evaluation for ypd – check out the FEB17 0083 outlier for hom ypd

hydds = mydds(hybar,phybar$cond)

## estimating size factors

## estimating dispersions

## gene-wise dispersion estimates

## mean-dispersion relationship

## final dispersion estimates

## fitting model and testing

oydds = mydds(oybar,poybar$cond)

## estimating size factors

## estimating dispersions

## gene-wise dispersion estimates

## mean-dispersion relationship

## final dispersion estimates

## fitting model and testing

oliver YPD edgeR

oyedge = myedgeR(oybar,poybar$cond,ref = 'ypd')

## Disp = 0.03942 , BCV = 0.1985

hom SC edgeR

hedge = myedgeR(hsbar,p11$cond,ref = 'sc')

## Disp = 0.04536 , BCV = 0.213

hom YPD edgeR

hyedge = myedgeR(hybar,phybar$cond,ref = 'ypd')

## Disp = 0.0503 , BCV = 0.2243

again after removing Feb17 83 experiment

tmp = myedgeR(hybar[,-10],phybar$cond[-10],ref = 'ypd')

## Disp = 0.05012 , BCV = 0.2239

again after removing 2nd outlier

tmp = myedgeR(hybar[,-c(1,10)],phybar$cond[-c(1,10)],ref = 'ypd')

## Disp = 0.041 , BCV = 0.2025

downstream collapsing YPD tags and experiments

hyd = myproc_normcounts(hydds,phybar$cond)

## [1] 8222   15
## [1] 4444   15

oyd = myproc_normcounts(oydds,poybar$cond)

## [1] 7760    7
## [1] 4427    7

hyj = myproc_normcounts(hyedge,phybar$cond)

## [1] 8222   15
## [1] 4444   15

oyj = myproc_normcounts(oyedge,poybar$cond)

## [1] 7760    7
## [1] 4427    7

YPD array data

xochip= as.matrix(read.delim(
"xochip.txt",header = T,stringsAsFactors = F,check.names = F))

omix = xochip[,poybar$name]
omix = mymixEMctrl(omix,woybar)

## number of iterations= 223

omix = medShift.updn(omix)
omin = min(apply(omix[,woybar],1,median))

hmix = xochip[,phybar$name]
hmix = mymixEMctrl(hmix,whybar)

## number of iterations= 110

hmin = min(apply(hmix[,whybar],1,median))

bomix = myBesttag(omix,bgThresh=omin)

## [1] min value of ctrl medians =  7.38834850891002            
## [1] 4423    7

bhmix = myBesttag(hmix,bgThresh=hmin)

## [1] min value of ctrl medians =  7.17596949818443            
## [1] 4264   15

#before batch correction
mydendy(bhmix,phybar$pcr)

## 'dendrogram' with 2 branches and 15 members total, at height 46.95969

bhybat = ComBat(bhmix,mod=model.matrix(~cond,phybar),batch=phybar$pcr)

## Found 7 batches
## Note: one batch has only one sample, setting mean.only=TRUE
## Adjusting for 2 covariate(s) or covariate level(s)
## Standardizing Data across genes
## Fitting L/S model and finding priors
## Finding parametric adjustments
## Adjusting the Data

#after batch correction
mydendy(bhybat,phybar$cond)

## 'dendrogram' with 2 branches and 15 members total, at height 29.75203

#oliver before batch correction
mydendy(bomix,poybar$cond)

## 'dendrogram' with 2 branches and 7 members total, at height 34.75801

boybat = ComBat(bomix,mod=model.matrix(~cond,poybar),batch=poybar$pcr)

## Found 4 batches
## Note: one batch has only one sample, setting mean.only=TRUE
## Adjusting for 1 covariate(s) or covariate level(s)
## Standardizing Data across genes
## Fitting L/S model and finding priors
## Finding parametric adjustments
## Adjusting the Data

#oliver after batch correction
mydendy(boybat,poybar$cond)

## 'dendrogram' with 2 branches and 7 members total, at height 26.96797

hysw= mysweep(bhybat)
lphybar = phybar[-whybar,]
#summarize chip data
hysum = mysumm(hysw[,lphybar$name],lphybar$cond)

oysw= mysweep(boybat)
lpoybar = poybar[-woybar,]
oysum = mysumm(oysw[,lpoybar$name],lpoybar$cond)

normalize YPD data by count

noRhy = mybarnorm(hybar)

noRhy3 = myproc_normcounts(noRhy,phybar$cond)

## [1] 8222   15
## [1] 4444   15

noRhy3 = noRhy3[,colnames(hysum)]

noRoy = mybarnorm(oybar)

noRoy3 = myproc_normcounts(noRoy,poybar$cond)

## [1] 7760    7
## [1] 4427    7

noRoy3 = noRoy3[,colnames(oysum),drop=F]

plot YPD results

hom YPD

opar <- par(pch=19,mfrow=c(1,2),mar=c(2.2,2,1.4,1),cex=1.2)
for (i in 1:ncol(hysum)) { 
  
  p1(hysum,i)
  p1(noRhy3,i)
  p1(hyj,i)
  p1(hyd,i)
  
}

oliver YPD

opar <- par(pch=19,mfrow=c(1,2),mar=c(2.2,2,1.4,1),cex=1.2)
for (i in 1:ncol(oysum)) { 
  
  p1(oysum,i)
  p1(noRoy3,i,sig =0.7)
  p1(oyj,i,sig =0.7)
  p1(oyd,i,sig =0.7)
  
}

compare to combining biological replicates with standard edge:

hs_std_edge = mydfedge(hsbar,p11$cond,ref= 'sc')
mhs_std = as.matrix(hs_std_edge[,1:5])
rownames(mhs_std) = hs_std_edge$strain

hs_sumtags = mysumtags(mybarnorm(hsbar))

## [1] 9038   11
## [1] 4718   11

hs_std_sumtags = mydfedge(hs_sumtags,p11$cond,ref= 'sc')
mhs_std_sumtags = as.matrix(hs_std_sumtags[,1:5])
rownames(mhs_std_sumtags) = hs_std_sumtags$strain
mhs_std_sumtags= mhs_std_sumtags[order(rownames(mhs_std_sumtags)),]

hy_std_edge = mydfedge(hybar,phybar$cond,ref= 'ypd')
mhy_std = as.matrix(hy_std_edge[,1:4])
rownames(mhy_std) = hy_std_edge$strain


hy_sumtags = mysumtags(mybarnorm(hybar))

## [1] 8222   15
## [1] 4444   15

hy_std_sumtags = mydfedge(hy_sumtags,phybar$cond,ref= 'ypd')
mhy_std_sumtags = as.matrix(hy_std_sumtags[,1:4])
rownames(mhy_std_sumtags) = hy_std_sumtags$strain
mhy_std_sumtags= mhy_std_sumtags[order(rownames(mhy_std_sumtags)),]

compare standard edge vs normalized count analysis from edgeR

opar <- par(pch=19,mfrow=c(1,2),mar=c(2.2,2,1.4,1),cex=1.2)


colnames(mhs_std_sumtags) = paste0('std_edge.',colnames(mhs_std_sumtags))
colnames(mhy_std_sumtags) = paste0('std_edge.',colnames(mhy_std_sumtags))
for (i in 1:3)
psig(-mhs_std_sumtags,hedge2,i)

for (i in 1:2)
psig(-mhy_std_sumtags,hyj,i)

compare to prenormalizing and collapsing up and downtags separately before dispersion estimate

xysum = mysumtags(hybar)

## [1] 8222   15
## [1] 4444   15

xyup = mybarnorm(hybar[grep('uptag',rownames(hybar)),])
xydn = mybarnorm(hybar[grep('downtag',rownames(hybar)),])
xyud = rbind(xyup,xydn)

xynormsum = mysumtags(xyud)

## [1] 8222   15
## [1] 4444   15

dynorm = myedgeR(xynormsum,phybar$cond,ref='ypd')

## Disp = 0.03861 , BCV = 0.1965

dysum = myedgeR(xysum,phybar$cond,ref='ypd')

## Disp = 0.03786 , BCV = 0.1946

xssum = mysumtags(hsbar)

## [1] 9038   11
## [1] 4718   11

dssum = myedgeR(xssum,p11$cond,ref='sc')

## Disp = 0.02983 , BCV = 0.1727

xsup = mybarnorm(hsbar[grep('uptag',rownames(hsbar)),])
xsdn = mybarnorm(hsbar[grep('downtag',rownames(hsbar)),])
xsud = rbind(xsup,xsdn)
xsnormsum = mysumtags(xsud)

## [1] 9038   11
## [1] 4718   11

dsnorm = myedgeR(xsnormsum,p11$cond,ref='sc')

## Disp = 0.02889 , BCV = 0.17

collapse counts by condition

plot pre-summed updn tags vs independent dispersion estimates – SC – doesn’t matter much

norm on left, myedgeR on right

opar <- par(pch=19,mfrow=c(1,2),mar=c(2.2,2,1.4,1),cex=1.2)


for (i in 1:3)
psig(psnorm,hedge2,i)

plot pre-summed updn tags vs normalized pre-summed updn tags – SC – doesn’t matter much

norm on left, unnorm right

opar <- par(pch=19,mfrow=c(1,2),mar=c(2.2,2,1.4,1),cex=1.2)
for (i in 1:3)
psig(psnorm,pssum,i)

plot pre-summed updn tags vs independent dispersion estimates – YPD – better to presum up and down tags

norm on left, myedgeR on right

opar <- par(pch=19,mfrow=c(1,2),mar=c(2.2,2,1.4,1),cex=1.2)
for (i in 1:2)
psig(pynorm,hyj,i)

plot pre-summed updn tags vs post dispersion estimates, pre-summed better – YPD –

norm on left, unnorm right

opar <- par(pch=19,mfrow=c(1,2),mar=c(2.2,2,1.4,1),cex=1.2)
for (i in 1:3)
psig(pynorm,pysum,i)

sessionInfo()

## R version 3.3.1 (2016-06-21)
## Platform: x86_64-redhat-linux-gnu (64-bit)
## Running under: CentOS release 6.6 (Final)
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] dendextend_1.3.0           mixtools_1.0.4            
##  [3] reshape2_1.4.2             edgeR_3.12.1              
##  [5] limma_3.26.9               DESeq2_1.10.1             
##  [7] RcppArmadillo_0.6.500.4.0  Rcpp_0.12.8               
##  [9] SummarizedExperiment_1.0.2 Biobase_2.30.0            
## [11] GenomicRanges_1.22.4       GenomeInfoDb_1.6.3        
## [13] IRanges_2.4.8              S4Vectors_0.8.11          
## [15] BiocGenerics_0.16.1        RColorBrewer_1.1-2        
## [17] sva_3.18.0                 genefilter_1.52.1         
## [19] mgcv_1.8-15                nlme_3.1-128              
## [21] dplyr_0.5.0                knitr_1.15.1              
## 
## loaded via a namespace (and not attached):
##  [1] splines_3.3.1        Formula_1.2-1        assertthat_0.1      
##  [4] latticeExtra_0.6-28  robustbase_0.92-6    yaml_2.1.14         
##  [7] RSQLite_1.0.0        lattice_0.20-34      chron_2.3-47        
## [10] digest_0.6.10        XVector_0.10.0       colorspace_1.2-6    
## [13] htmltools_0.3.5      Matrix_1.2-7.1       plyr_1.8.4          
## [16] XML_3.98-1.4         zlibbioc_1.16.0      mvtnorm_1.0-5       
## [19] xtable_1.8-2         scales_0.4.1         whisker_0.3-2       
## [22] BiocParallel_1.4.3   tibble_1.2           annotate_1.48.0     
## [25] ggplot2_2.2.0        nnet_7.3-12          lazyeval_0.2.0      
## [28] survival_2.39-5      magrittr_1.5         mclust_5.2          
## [31] evaluate_0.10        MASS_7.3-45          segmented_0.5-1.4   
## [34] class_7.3-14         foreign_0.8-67       tools_3.3.1         
## [37] data.table_1.9.6     trimcluster_0.1-2    stringr_1.1.0       
## [40] kernlab_0.9-25       munsell_0.4.3        locfit_1.5-9.1      
## [43] cluster_2.0.5        AnnotationDbi_1.32.3 fpc_2.1-10          
## [46] lambda.r_1.1.9       futile.logger_1.4.3  grid_3.3.1          
## [49] rmarkdown_1.0        boot_1.3-18          gtable_0.2.0        
## [52] flexmix_2.3-13       DBI_0.5-1            R6_2.2.0            
## [55] gridExtra_2.2.1      prabclus_2.2-6       Hmisc_3.17-4        
## [58] futile.options_1.0.0 modeltools_0.2-21    stringi_1.1.2       
## [61] geneplotter_1.48.0   rpart_4.1-10         acepack_1.3-3.3     
## [64] DEoptimR_1.0-6       diptest_0.75-7

message for YPD = better to prenorm up and down separately, then run edgeR, didn’t matter for SC

normalizing separately didn’t matter for SC, did for YPD

normalizing tags separately didn’t matter for SC, did for YPD

chip_barseq.Rmd

ggiaever

02/10/2016