fdata6 = read.delim("jun10_2016_fdata6.txt",
    stringsAsFactors = F,
    check.names = F)
    
  noness = filter(fdata6, fdata6$essential == 0)
  noness = noness %>% arrange(strain)

preprocessing for low cnts – 1. count matrix 2. filter count matrix for ALL rows < 50 3. filter count matrix for all CTRLS < 50

example, SC dropouts hom samples


xbar= as.matrix(read.delim('aug6_2016_hom_barseq.txt',header = T,stringsAsFactors =F,check.names = F,strip.white = T))

p11 = read.delim("oct2_phsbar.txt",header = T,stringsAsFactors = F,check.names = F)

w11 = which(p11$type == 'ctrl')
lp11 = p11[-w11,]
#filter out essential strains
wnebar = which(noness$strain %in% rownames(xbar))

cat(paste0(' # of rows before filtering: ',nrow(xbar)))

hsbar = myall_less50(xbar[noness$strain[wnebar],p11$name])

cat(paste0(' # of rows after filtering for ALL cnts < 50: ',nrow(hsbar)))

hsbar = mymin50(hsbar,w11)

cat(paste0(' # of rows after filtering cnts less 50 in ANY control: ',nrow(hsbar)))

retrieve normalized counts from edgeR and DESeq2, and normalizing by count w/out package

group = model factor

ref = reference condition

hsbar = hsbar[,p11$name]

p11$cond = factor(p11$cond)
p11$cond = relevel(p11$cond,ref='sc')
hdds = mydds(hsbar,groups = p11$cond)

hedge = myedgeR(hsbar,group = p11$cond,ref = 'sc')

mysumtags = sum counts from up and down tags

mysumcond = sum counts from each condition

post processing normalized count matrix collapse matrix dimensions by:

  1. sum up and downtags
  2. summing replicate condtions
hdds2 = myproc_normcounts(hdds,p11$cond)

hedge2 = myproc_normcounts(hedge,p11$cond)

compare this to same data generated by arrays

x11= as.matrix(read.delim('x11.txt',header = T,stringsAsFactors =F,check.names = F,strip.white = T))

x111 = mymixEMctrl(x11,w11)
btst = myBesttag(x111,bgThresh = 6.3)
hbat = ComBat(btst,mod = model.matrix(~cond,p11),batch = p11$pcr)
tsw = mysweep(hbat)

hsum = mysumm(tsw[,-w11],lp11$cond)

comparison to normalizing by counts/experiment without an R package

tst = mybarnorm(hsbar)

tst2 = myproc_normcounts(tst,p11$cond)

noRpkg = tst2[,colnames(hsum)]

plot comparisions

YPD oliver and hom

get barseq ypd data

xobar= as.matrix(read.delim('count_matrix_aug4_noctrl_celnames.txt',
header = T,stringsAsFactors = F,sep = "\t",row.names = 1,check.names = F,as.is=TRUE))

perica = read.delim("jun10_2016_pfinal_erica.txt",header = T,stringsAsFactors=F, check.names = F)

w= which(perica$celfile %in% colnames(xobar))

pbar = perica[w,]

xobar = xobar[which(rownames(xobar) %in% noness$strain),pbar$celfile]

colnames(xobar)=pbar$name

#pjen = data.frame(name= colnames(xobar)[grep('Jen',colnames(xobar))])

phybar = pbar %>% filter(pool == 'hom',media == 'ypd')
poybar = pbar %>% filter(pool == 'oliver',media == 'ypd')
phybar = phybar %>% arrange(type,cond,pcr)
poybar = poybar %>% arrange(type,cond,pcr)

woybar = which(poybar$type == 'ctrl')
whybar = which(phybar$type == 'ctrl')

filter as above for low counts

dds and edge evaluation for ypd – check out the FEB17 0083 outlier for hom ypd


hydds = mydds(hybar,phybar$cond)
oydds = mydds(oybar,poybar$cond)
hyedge = myedgeR(hybar,phybar$cond,ref = 'ypd')
oyedge = myedgeR(oybar,poybar$cond,ref = 'ypd')
hyd = myproc_normcounts(hydds,phybar$cond)
oyd = myproc_normcounts(oydds,poybar$cond)

hyj = myproc_normcounts(hyedge,phybar$cond)
oyj = myproc_normcounts(oyedge,poybar$cond)

ypd array datag


xochip= as.matrix(read.delim(
"xochip.txt",header = T,stringsAsFactors = F,check.names = F))

omix = xochip[,poybar$name]
omix = mymixEMctrl(omix,woybar)
omix = medShift.updn(omix)
omin = min(apply(omix[,woybar],1,median))

hmix = xochip[,phybar$name]
hmix = mymixEMctrl(hmix,whybar)
hmin = min(apply(hmix[,whybar],1,median))

bomix = myBesttag(omix,bgThresh=omin)

bhmix = myBesttag(hmix,bgThresh=hmin)

#before batch correction
mydendy(bhmix,phybar$pcr)

bhybat = ComBat(bhmix,mod=model.matrix(~cond,phybar),batch=phybar$pcr)
#after batch correction
mydendy(bhybat,phybar$cond)

#oliver before batch correction
mydendy(bomix,poybar$cond)
boybat = ComBat(bomix,mod=model.matrix(~cond,poybar),batch=poybar$pcr)

#oliver after batch correction
mydendy(boybat,poybar$cond)

hysw= mysweep(bhybat)
lphybar = phybar[-whybar,]
#summarize chip data
hysum = mysumm(hysw[,lphybar$name],lphybar$cond)

oysw= mysweep(boybat)
lpoybar = poybar[-woybar,]
oysum = mysumm(oysw[,lpoybar$name],lpoybar$cond)

normalize YPD data by count

noRhy = mybarnorm(hybar)

noRhy3 = myproc_normcounts(noRhy,phybar$cond)

noRhy3 = noRhy3[,colnames(hysum)]

noRoy = mybarnorm(oybar)

noRoy3 = myproc_normcounts(noRoy,poybar$cond)

noRoy3 = noRoy3[,colnames(oysum),drop=F]

plot YPD results

hom YPD


opar <- par(pch=19,mfrow=c(1,2),mar=c(2.2,2,1.4,1),cex=1.2)
for (i in 1:ncol(hysum)) { 
  
  p1(hysum,i)
  p1(noRhy3,i)
  p1(hyj,i)
  p1(hyd,i)
  
}

oliver YPD


opar <- par(pch=19,mfrow=c(1,2),mar=c(2.2,2,1.4,1),cex=1.2)
for (i in 1:ncol(oysum)) { 
  
  p1(oysum,i)
  p1(noRoy3,i,sig =0.7)
  p1(oyj,i,sig =0.7)
  p1(oyd,i,sig =0.7)
  
}

compare to combining biological replicates with standard edge:

  1. up and downtags separately, collapsed by summation after dispersion estimates

hs_std_edge = mydfedge(hsbar,p11$cond,ref= 'sc')
mhs_std = as.matrix(hs_std_edge[,1:5])
rownames(mhs_std) = hs_std_edge$strain

hs_sumtags = mysumtags(mybarnorm(hsbar))
hs_std_sumtags = mydfedge(hs_sumtags,p11$cond,ref= 'sc')
mhs_std_sumtags = as.matrix(hs_std_sumtags[,1:5])
rownames(mhs_std_sumtags) = hs_std_sumtags$strain
mhs_std_sumtags= mhs_std_sumtags[order(rownames(mhs_std_sumtags)),]



hy_std_edge = mydfedge(hybar,phybar$cond,ref= 'ypd')
mhy_std = as.matrix(hy_std_edge[,1:4])
rownames(mhy_std) = hy_std_edge$strain


hy_sumtags = mysumtags(mybarnorm(hybar))

hy_std_sumtags = mydfedge(hy_sumtags,phybar$cond,ref= 'ypd')
mhy_std_sumtags = as.matrix(hy_std_sumtags[,1:4])
rownames(mhy_std_sumtags) = hy_std_sumtags$strain
mhy_std_sumtags= mhy_std_sumtags[order(rownames(mhy_std_sumtags)),]
  1. normalize up and downtags separately before combining, predispersion

plot standard edge, pre-summed up and downtags

opar <- par(pch=19,mfrow=c(1,2),mar=c(2.2,2,1.4,1),cex=1.2)
for (i in 1:3)
psig(-mhs_std_sumtags,hedge2,i)

for (i in 1:2)
psig(-mhy_std_sumtags,hyj,i)
---
title: "chip_barseq.Rmd"
author: "ggiaever"
date: "02/10/2016"
output: html_notebook
css: complex-css.css
---


```{r setup libraries,include=F}
library(knitr)
knitr::opts_chunk$set(echo = T)
library(dplyr)
# # library(reshape2)
# library(ggplot2)
library(sva)
# library(DESeq2)
# 
library(RColorBrewer)
# library(edgeR)
palette(brewer.pal(8,'Set2'))

# mycolors = c("darkorange" ,"dodgerblue","limegreen","navy","hotpink",    "plum4","blue" , "magenta2","mediumpurple"  ,  "firebrick" ,"darkolivegreen4","royalblue3")
# palette(mycolors)
#source("071316_funcs_erica.R")
opar <- par(pch=19,mfrow=c(1,1),mar=c(2.2,2,1.4,1),cex=1)

source("aug15_working_chip_processing.R")
source("barseq_functions.R")

```

```{r get gene info}
  fdata6 = read.delim("jun10_2016_fdata6.txt",
    stringsAsFactors = F,
    check.names = F)
    
  noness = filter(fdata6, fdata6$essential == 0)
  noness = noness %>% arrange(strain)

```


preprocessing for low cnts -- 
1. count matrix
2. filter count matrix for ALL rows < 50
3. filter count matrix for all CTRLS < 50

#example, SC dropouts hom samples
```{r SC barseq data}

xbar= as.matrix(read.delim('aug6_2016_hom_barseq.txt',header = T,stringsAsFactors =F,check.names = F,strip.white = T))

p11 = read.delim("oct2_phsbar.txt",header = T,stringsAsFactors = F,check.names = F)

w11 = which(p11$type == 'ctrl')
lp11 = p11[-w11,]
#filter out essential strains
wnebar = which(noness$strain %in% rownames(xbar))

cat(paste0(' # of rows before filtering: ',nrow(xbar)))

hsbar = myall_less50(xbar[noness$strain[wnebar],p11$name])

cat(paste0(' # of rows after filtering for ALL cnts < 50: ',nrow(hsbar)))

hsbar = mymin50(hsbar,w11)

cat(paste0(' # of rows after filtering cnts less 50 in ANY control: ',nrow(hsbar)))


```
retrieve normalized counts from edgeR and DESeq2, and normalizing by count w/out package

group = model factor

ref = reference condition
```{r}
hsbar = hsbar[,p11$name]

p11$cond = factor(p11$cond)
p11$cond = relevel(p11$cond,ref='sc')
hdds = mydds(hsbar,groups = p11$cond)

hedge = myedgeR(hsbar,group = p11$cond,ref = 'sc')
```

mysumtags = sum counts from up and down tags

mysumcond = sum counts from each condition

post processing normalized count matrix
collapse matrix dimensions by:

1. sum up and downtags 
2. summing replicate condtions
```{r post normalized cnt processing}
hdds2 = myproc_normcounts(hdds,p11$cond)

hedge2 = myproc_normcounts(hedge,p11$cond)
```

compare this to same data generated by arrays
```{r SC hom chip data}
x11= as.matrix(read.delim('x11.txt',header = T,stringsAsFactors =F,check.names = F,strip.white = T))

x111 = mymixEMctrl(x11,w11)
btst = myBesttag(x111,bgThresh = 6.3)
hbat = ComBat(btst,mod = model.matrix(~cond,p11),batch = p11$pcr)
tsw = mysweep(hbat)

hsum = mysumm(tsw[,-w11],lp11$cond)
```


comparison to normalizing by counts/experiment without an R package
```{r hom SC noR package}
tst = mybarnorm(hsbar)

tst2 = myproc_normcounts(tst,p11$cond)

noRpkg = tst2[,colnames(hsum)]
```

plot comparisions
```{r, fig.width=12,fig.height=6,echo = F}
hedge2=hedge2[,colnames(hsum)]

hdds2=hdds2[,colnames(hsum)]

colnames(hsum)=paste0('chip.',colnames(hsum))
colnames(hedge2)=paste0('edge.',colnames(hedge2))
colnames(hdds2)=paste0('deseq2.',colnames(hdds2))
colnames(noRpkg)=paste0('noRpkg.',colnames(noRpkg))

opar <- par(pch=19,mfrow=c(1,2),mar=c(2.2,2,1.4,1),cex=1.2)
hsum = hsum[order(rownames(hsum)),]

for (i in 1:ncol(hsum)) { 
  p1(hsum,i)
  p1(noRpkg,i)
  p1(hedge2,i)
  p1(hdds2,i)
}
```




#YPD oliver and hom
get barseq ypd data
```{r, get ypd chipdata}
xobar= as.matrix(read.delim('count_matrix_aug4_noctrl_celnames.txt',
header = T,stringsAsFactors = F,sep = "\t",row.names = 1,check.names = F,as.is=TRUE))

perica = read.delim("jun10_2016_pfinal_erica.txt",header = T,stringsAsFactors=F, check.names = F)

w= which(perica$celfile %in% colnames(xobar))

pbar = perica[w,]

xobar = xobar[which(rownames(xobar) %in% noness$strain),pbar$celfile]

colnames(xobar)=pbar$name

#pjen = data.frame(name= colnames(xobar)[grep('Jen',colnames(xobar))])

phybar = pbar %>% filter(pool == 'hom',media == 'ypd')
poybar = pbar %>% filter(pool == 'oliver',media == 'ypd')
phybar = phybar %>% arrange(type,cond,pcr)
poybar = poybar %>% arrange(type,cond,pcr)

woybar = which(poybar$type == 'ctrl')
whybar = which(phybar$type == 'ctrl')
```

filter as above for low counts
```{r same deal ypd data, eval = T, include = F}
oybar = myall_less50(xobar[,poybar$name])
hybar = myall_less50(xobar[,phybar$name])

print('dim after filtering all cnts less 50')
print(dim(hybar))
print(dim(oybar))
hybar = mymin50(hybar,whybar)
oybar = mymin50(oybar,woybar)
print('dim after filtering cnts less 50 only in contrl')
print(dim(hybar))
print(dim(oybar))
```

dds and edge evaluation for ypd -- check out the FEB17 0083 outlier for hom ypd
```{r normal cnts, eval = T, include = T}

hydds = mydds(hybar,phybar$cond)
oydds = mydds(oybar,poybar$cond)
hyedge = myedgeR(hybar,phybar$cond,ref = 'ypd')
oyedge = myedgeR(oybar,poybar$cond,ref = 'ypd')

```

```{r, eval = T, include = T}
hyd = myproc_normcounts(hydds,phybar$cond)
oyd = myproc_normcounts(oydds,poybar$cond)

hyj = myproc_normcounts(hyedge,phybar$cond)
oyj = myproc_normcounts(oyedge,poybar$cond)

```


ypd array datag
```{r get YPD oliver and hom chip data}

xochip= as.matrix(read.delim(
"xochip.txt",header = T,stringsAsFactors = F,check.names = F))

omix = xochip[,poybar$name]
omix = mymixEMctrl(omix,woybar)
omix = medShift.updn(omix)
omin = min(apply(omix[,woybar],1,median))

hmix = xochip[,phybar$name]
hmix = mymixEMctrl(hmix,whybar)
hmin = min(apply(hmix[,whybar],1,median))

bomix = myBesttag(omix,bgThresh=omin)

bhmix = myBesttag(hmix,bgThresh=hmin)

#before batch correction
mydendy(bhmix,phybar$pcr)

bhybat = ComBat(bhmix,mod=model.matrix(~cond,phybar),batch=phybar$pcr)
#after batch correction
mydendy(bhybat,phybar$cond)

#oliver before batch correction
mydendy(bomix,poybar$cond)
boybat = ComBat(bomix,mod=model.matrix(~cond,poybar),batch=poybar$pcr)

#oliver after batch correction
mydendy(boybat,poybar$cond)

hysw= mysweep(bhybat)
lphybar = phybar[-whybar,]
#summarize chip data
hysum = mysumm(hysw[,lphybar$name],lphybar$cond)

oysw= mysweep(boybat)
lpoybar = poybar[-woybar,]
oysum = mysumm(oysw[,lpoybar$name],lpoybar$cond)

```


normalize YPD data by count
```{r norm by cnts YPD}
noRhy = mybarnorm(hybar)

noRhy3 = myproc_normcounts(noRhy,phybar$cond)

noRhy3 = noRhy3[,colnames(hysum)]

noRoy = mybarnorm(oybar)

noRoy3 = myproc_normcounts(noRoy,poybar$cond)

noRoy3 = noRoy3[,colnames(oysum),drop=F]
```

plot YPD results
```{r, fig.width=12,fig.height=6,echo = F}
oysum = oysum[order(rownames(oysum)),,drop=F]
hysum = hysum[order(rownames(hysum)),,drop=F]
hyj=hyj[,colnames(hysum)]

hyd=hyd[,colnames(hysum)]

colnames(hysum)=paste0('chip.',colnames(hysum))
colnames(hyj)=paste0('edge.',colnames(hyj))
colnames(hyd)=paste0('deseq2.',colnames(hyd))
colnames(noRhy3)=paste0('noRpkg.',colnames(noRhy3))

colnames(oysum)=paste0('chip.',colnames(oysum))
colnames(oyj)=paste0('edge.',colnames(oyj))
colnames(oyd)=paste0('deseq2.',colnames(oyd))
colnames(noRoy3)=paste0('noRpkg.',colnames(noRoy3))

```

hom YPD
```{r, fig.width=12,fig.height=6}

opar <- par(pch=19,mfrow=c(1,2),mar=c(2.2,2,1.4,1),cex=1.2)
for (i in 1:ncol(hysum)) { 
  
  p1(hysum,i)
  p1(noRhy3,i)
  p1(hyj,i)
  p1(hyd,i)
  
}

```

oliver YPD
```{r oliver, fig.width=12,fig.height=6}

opar <- par(pch=19,mfrow=c(1,2),mar=c(2.2,2,1.4,1),cex=1.2)
for (i in 1:ncol(oysum)) { 
  
  p1(oysum,i)
  p1(noRoy3,i,sig =0.7)
  p1(oyj,i,sig =0.7)
  p1(oyd,i,sig =0.7)
  
}
```

compare to combining biological replicates with standard edge:

1. up and downtags separately, collapsed by summation after dispersion estimates
```{r standard edge}

hs_std_edge = mydfedge(hsbar,p11$cond,ref= 'sc')
mhs_std = as.matrix(hs_std_edge[,1:5])
rownames(mhs_std) = hs_std_edge$strain

hs_sumtags = mysumtags(mybarnorm(hsbar))
hs_std_sumtags = mydfedge(hs_sumtags,p11$cond,ref= 'sc')
mhs_std_sumtags = as.matrix(hs_std_sumtags[,1:5])
rownames(mhs_std_sumtags) = hs_std_sumtags$strain
mhs_std_sumtags= mhs_std_sumtags[order(rownames(mhs_std_sumtags)),]



hy_std_edge = mydfedge(hybar,phybar$cond,ref= 'ypd')
mhy_std = as.matrix(hy_std_edge[,1:4])
rownames(mhy_std) = hy_std_edge$strain


hy_sumtags = mysumtags(mybarnorm(hybar))

hy_std_sumtags = mydfedge(hy_sumtags,phybar$cond,ref= 'ypd')
mhy_std_sumtags = as.matrix(hy_std_sumtags[,1:4])
rownames(mhy_std_sumtags) = hy_std_sumtags$strain
mhy_std_sumtags= mhy_std_sumtags[order(rownames(mhy_std_sumtags)),]

```

2. normalize up and downtags separately before combining, predispersion

plot standard edge, pre-summed up and downtags

```{r std edge, fig.width=12,fig.height=6}
opar <- par(pch=19,mfrow=c(1,2),mar=c(2.2,2,1.4,1),cex=1.2)
for (i in 1:3)
psig(-mhs_std_sumtags,hedge2,i)

for (i in 1:2)
psig(-mhy_std_sumtags,hyj,i)

```


