background tag calls batch corrected mixEM SC

hi pat! finally decided on a criteria – posterior > 0.5 in at least half of experiments, works better than just using the controls

here’s our goal

Purpose Develop a procedure for identifying poorly performing tags; specific goal being to distinguish strains NOT in the pool for three YKO derived collection:

auxotrophic YKO (hom)
prototrophic collection made by SGA/back-crossing (caudy)
prototrophic collection made by CEN plasmid complementation of the auxotrophies

this is really a preprocessing step for now.

uber important as this is a major point of this manuscript

erica this sight might me as useful for you as it was for me to understand what mixEM is doing

http://[rstudio-pubs-static.s3.amazonaws.com/1001_3177e85f5e4840be840c84452780db52.html]

pat, my main issue as always is where to draw the cutoff?

initially i enforced the rule that all 3 control experiments had to have a posterior > 0.05 i know why i used 0.05 – thought they were pvalues and wanted to be 95% confident they were background, but its too stringent changed to > 0.5 in ~ 1/2 of experiments for each pool set

i am sure there are more robust ways of calling/refining it. maybe even on a per tag basis rather than a per experiment basis? (eventually). also wasn’t sure if the batch correction was kosher…and not sure how much it mattered

batch correction definitely helps for SC, else the distributions all see to require different values based on plots

we did lose a few precious genes (i hope not too many) and probably need to tweak it, but overall it worked well

primarily we lost ADE genes from oliver – but if you look at these genes they are really flaky and close to background

steps

get SC data for all pools, xsc is the data matrix, psc describes experiments

xsc = read.delim(
  "jul22_2016_allpools_xSC.txt",header = T,stringsAsFactors = F,check.names = F)
   
  psc = read.delim(
  "jul22_2016_allpools_pSC.txt",
  header = T,
  stringsAsFactors = F,
  check.names = F
  )
  
  fdata6 = read.delim("jun10_2016_fdata6.txt",
    stringsAsFactors = F,
    check.names = F)
    
  noness = filter(fdata6, fdata6$essential == 0)
  noness = noness %>% arrange(strain)
  ess = filter(fdata6, fdata6$essential == 1)

assign tag type to each, plot ctrl experiments to compare distributions of unassigned vs assigned tags

# 
# unass = grep("tag_",rownames(xsc))
# 
# rownames(xsc)[1:1800]=gsub(":uptag",":downtag",rownames(xsc)[1:1800])

wess = which(rownames(xsc) %in% ess$strain)

xsc = xsc[-wess,]

xsc = xsc[,psc$name]

opar <- par(pch=19,mfrow=c(1,1),mar=c(2.2,2,1.4,1),cex=1.2)

wnoness = which(rownames(xsc) %in% noness$strain)
xsc$assign[wnoness] = "noness"

xsc$assign[1:3600] = "unass"
xsc$tag = rownames(xsc)

mat = match(xsc$tag, fdata6$strain)
xsc$gene = fdata6$sgd_gene[mat]

wsc = which(psc$type == "ctrl")
sc = psc$name[wsc]


w = which(names(xsc) %in% c(sc,"assign","tag","gene"))



test = xsc[, w]

m <- melt(test, id.vars=c("tag","assign","gene"), variable.name = "exp")

mat = match(m$exp,psc$name)

m$pool = psc$pool[mat]
m$tag = factor(m$tag)
m$pool = factor(m$pool)
m$assign = factor(m$assign)

ggplot(m, aes(value)) + geom_histogram(aes(fill = pool), bins=50) +
  facet_grid(assign~pool) + ggtitle("distribution by pool and tag type")

ggplot(m, aes(value))+geom_freqpoly(aes(colour=pool), bins=50)+facet_grid(assign~.)+theme_bw()+ ggtitle("distribution by pool")

ggplot(m, aes(value)) + geom_histogram(aes(fill = assign), bins=50) +
  facet_wrap(~exp,ncol=3) + ggtitle("distribution by pool and tag type ctrl chips")+theme_bw()

batch correct and normalize first – all tags together

pcs = filter(psc,pool == "caudy")
phs = filter(psc,pool == "hom")
pos = filter(psc,pool == "oliver")



xcs = xsc[,pcs$name]
xhs = xsc[,phs$name]
xos = xsc[,pos$name]

xcs = medShift.updn(as.matrix(xcs))
xhs = medShift.updn(as.matrix(xhs))
xos = medShift.updn(as.matrix(xos))

bcs = ComBat(xcs,model.matrix(~cond,pcs),batch = pcs$pcr)

## Found 3 batches
## Adjusting for 12 covariate(s) or covariate level(s)
## Standardizing Data across genes
## Fitting L/S model and finding priors
## Finding parametric adjustments
## Adjusting the Data

bhs = ComBat(xhs,model.matrix(~cond,phs),batch = phs$pcr)

## Found 3 batches
## Adjusting for 9 covariate(s) or covariate level(s)
## Standardizing Data across genes
## Fitting L/S model and finding priors
## Finding parametric adjustments
## Adjusting the Data

bos = ComBat(xos,model.matrix(~cond,pos),batch = pos$pcr)

## Found 3 batches
## Adjusting for 12 covariate(s) or covariate level(s)
## Standardizing Data across genes
## Fitting L/S model and finding priors
## Finding parametric adjustments
## Adjusting the Data

plot before and after batch correction to see if it helps align distributiosn

density
dendograms

mydendy(xhs,phs$pcr,main="prebatch hom correct color = pcr date")

## 'dendrogram' with 2 branches and 30 members total, at height 120.2446

mydendy(bhs,phs$pcr, main="post batch hom correct color = pcr date")

## 'dendrogram' with 2 branches and 30 members total, at height 74.33607

mydensity(xhs,phs$pcr,main="pre batch hom correct density, color = pcr date")

mydensity(bhs,phs$pcr, main="post batch hom correct density, color = pcr date")

reformat long-2-wide

print('unass included')

## [1] "unass included"

bsc = cbind(bcs,bhs,bos)


bsc = as.data.frame(bsc,stringsAsFactors = F)
w = which(names(xsc) %in% c("assign","tag","gene"))

bsc = cbind(bsc,xsc[,w])



msc <- melt(bsc, id.vars=c("tag","assign","gene"), variable.name = "exp")
mat = match(msc$exp,psc$name)
msc$pool = psc$pool[mat]
msc$cond = psc$cond[mat]
msc$type = psc$type[mat]

estimate of the unassigned means and sd – pretty equal for all pools

ua.mean <- aggregate(value ~ pool, data=msc, FUN=mean, subset=assign=='unass')
cmean = filter(ua.mean,pool == "caudy")
hmean = filter(ua.mean,pool == "hom")
omean = filter(ua.mean,pool == "oliver")

ua.var <- aggregate(value ~ pool, data=msc, FUN=sd, subset=assign=='unass')
cvar = filter(ua.var,pool == "caudy")
hvar = filter(ua.var,pool == "hom")
ovar = filter(ua.var,pool == "oliver")

print(ua.mean)

##     pool    value
## 1  caudy 5.405844
## 2    hom 5.217357
## 3 oliver 5.315958

print(ua.var)

##     pool     value
## 1  caudy 0.6958178
## 2    hom 0.4251037
## 3 oliver 0.7314023

re batch correct without the unassigned tags

bcs = medShift.updn(as.matrix(xcs[noness$strain,pcs$name]))
bhs = medShift.updn(as.matrix(xhs[noness$strain,phs$name]))
bos = medShift.updn(as.matrix(xos[noness$strain,pos$name]))
bcs = ComBat(bcs[noness$strain,pcs$name],model.matrix(~cond,pcs),batch = pcs$pcr)

## Found 3 batches
## Adjusting for 12 covariate(s) or covariate level(s)
## Standardizing Data across genes
## Fitting L/S model and finding priors
## Finding parametric adjustments
## Adjusting the Data

bhs = ComBat(bhs[noness$strain,phs$name],model.matrix(~cond,phs),batch = phs$pcr)

## Found 3 batches
## Adjusting for 9 covariate(s) or covariate level(s)
## Standardizing Data across genes
## Fitting L/S model and finding priors
## Finding parametric adjustments
## Adjusting the Data

bos = ComBat(bos[noness$strain,pos$name],model.matrix(~cond,pos),batch = pos$pcr)

## Found 3 batches
## Adjusting for 12 covariate(s) or covariate level(s)
## Standardizing Data across genes
## Fitting L/S model and finding priors
## Finding parametric adjustments
## Adjusting the Data

print('unass removed')

## [1] "unass removed"

bsc = cbind(bcs,bhs,bos)


bsc = as.data.frame(bsc,stringsAsFactors = F)
bsc$tag = rownames(bsc)
table(bsc$tag == noness$strain)

## 
##  TRUE 
## 10216

bsc$gene = noness$sgd_gene


msc <- melt(bsc, id.vars=c("tag","gene"), variable.name = "exp")
mat = match(msc$exp,psc$name)
msc$pool = psc$pool[mat]
msc$cond = psc$cond[mat]
msc$type = psc$type[mat]

Plot histogram of assigned and unassigned for all experiments. This will generate a tall lattice of plots, so only save to pdf, but do not show in notebook.

It is pretty clear than around June 2 there is a leftward shift in the assigned tags and then a shift back to the right around July 15.

ggnote – i batch corrected this round –

I think we can fit a single gaussian to the unassigned tag distribution. Then fit a mixture of two Gaussians with one of the two fixed at the unassigned tag distribution and the other unconstrained for the assigned tags. Then we can use the posterior distribution over the cluster assignment variables to give an odds-ratio of the tag assignement. Then we can draw a threshold on the odds-ratio.

Fit a 2 component Gaussian mixture model to each experiment’s NONESS assigned tags. Store the posterior probability of the tag assignment to the “good” or “not unassigned” distribution" for each experiment.

noting the difference in number of iterations needed to converge each time this is run – us this something to worry about? takes more iterations without batch correction

caudy = 183 – 282 no batch correction hom = 78 131 oliver = 312 242

batch corrected

print('batch corrected mixEM')

## [1] "batch corrected mixEM"

set.seed(5)

mvn.cau <- apply(bcs[noness$strain,pcs$name], 2, normalmixEM2comp, lambda = 0.5, mu = c(cmean[1,2], 2*cmean[1,2]), sigsqrd = c(cvar[1,2], 5*cvar[1,2]), eps = 1e-08, maxit = 10000, verb = F)

## number of iterations= 152 
## number of iterations= 175 
## number of iterations= 191 
## number of iterations= 175 
## number of iterations= 160 
## number of iterations= 235 
## number of iterations= 202 
## number of iterations= 185 
## number of iterations= 211 
## number of iterations= 169 
## number of iterations= 203 
## number of iterations= 223 
## number of iterations= 281 
## number of iterations= 190 
## number of iterations= 191 
## number of iterations= 199 
## number of iterations= 222 
## number of iterations= 159 
## number of iterations= 187 
## number of iterations= 191 
## number of iterations= 218 
## number of iterations= 192 
## number of iterations= 217 
## number of iterations= 179 
## number of iterations= 182 
## number of iterations= 211 
## number of iterations= 166 
## number of iterations= 180 
## number of iterations= 174 
## number of iterations= 201 
## number of iterations= 168 
## number of iterations= 199 
## number of iterations= 242 
## number of iterations= 183

set.seed(5)

mvn.hom <- apply(bhs[noness$strain,phs$name], 2, normalmixEM2comp, lambda = 0.5, mu = c(hmean[1,2], 2*hmean[1,2]), sigsqrd = c(hvar[1,2], 5*hvar[1,2]), eps = 1e-08, maxit = 10000, verb = F)

## number of iterations= 156 
## number of iterations= 148 
## number of iterations= 166 
## number of iterations= 149 
## number of iterations= 145 
## number of iterations= 178 
## number of iterations= 137 
## number of iterations= 178 
## number of iterations= 180 
## number of iterations= 152 
## number of iterations= 156 
## number of iterations= 122 
## number of iterations= 172 
## number of iterations= 180 
## number of iterations= 167 
## number of iterations= 123 
## number of iterations= 188 
## number of iterations= 171 
## number of iterations= 123 
## number of iterations= 147 
## number of iterations= 187 
## number of iterations= 211 
## number of iterations= 258 
## number of iterations= 218 
## number of iterations= 190 
## number of iterations= 170 
## number of iterations= 125 
## number of iterations= 244 
## number of iterations= 274 
## number of iterations= 78

set.seed(5)

mvn.oliv <- apply(bos[noness$strain,pos$name], 2, normalmixEM2comp, lambda = 0.5, mu = c(omean[1,2], 2*omean[1,2]), sigsqrd = c(ovar[1,2], 5*ovar[1,2]), eps = 1e-08, maxit = 10000, verb = F)

## number of iterations= 451 
## number of iterations= 497 
## number of iterations= 240 
## number of iterations= 410 
## number of iterations= 252 
## number of iterations= 284 
## number of iterations= 362 
## number of iterations= 286 
## number of iterations= 280 
## number of iterations= 409 
## number of iterations= 294 
## number of iterations= 269 
## number of iterations= 251 
## number of iterations= 269 
## number of iterations= 339 
## number of iterations= 552 
## number of iterations= 313 
## number of iterations= 329 
## number of iterations= 276 
## number of iterations= 353 
## number of iterations= 271 
## number of iterations= 362 
## number of iterations= 492 
## number of iterations= 364 
## number of iterations= 333 
## number of iterations= 335 
## number of iterations= 351 
## number of iterations= 345 
## number of iterations= 180 
## number of iterations= 248 
## number of iterations= 211 
## number of iterations= 263 
## number of iterations= 238 
## number of iterations= 327 
## number of iterations= 537 
## number of iterations= 252 
## number of iterations= 244 
## number of iterations= 349 
## number of iterations= 312

cpost <- bcs[noness$strain,pcs$name]
for (i in 1:length(names(mvn.cau))) {
cpost[,i] <- mvn.cau[[i]]$posterior[,2]
}

hpost <- bhs[noness$strain,phs$name]
for (i in 1:length(names(mvn.hom))) {
hpost[,i] <- mvn.hom[[i]]$posterior[,2]
}

opost <- bos[noness$strain,pos$name]
for (i in 1:length(names(mvn.oliv))) {
opost[,i] <- mvn.oliv[[i]]$posterior[,2]
}

post = cbind(cpost,hpost,opost)
post = as.data.frame(post,stringsAsFactors = F)
post$tag = rownames(post)


rm(cpost)
rm(hpost)
rm(opost)

reformat dataframe wide to long for plotting

most<- melt(post, id.vars=c("tag"), variable.name = "exp",value.name = "post")



msc2 <- merge(msc, most, by=c("tag","exp"))

# call bkgnd tags by post.p > 0.
msc2$sig = rep(NA,nrow(msc2))


wbkgnd = which(msc2$post > 0.5)
msc2$sig[wbkgnd]=1
msc2$sig[-wbkgnd]=0

mat = match(msc2$exp,psc$name)
msc2$type = psc$type[mat]
msc2$cond = psc$cond[mat]

grab the fits from the models based on unassigned tags instead: erica we can justify using the lower bounds (mean - 3*sd) from the ctrls for our bg threshold cutoff values – they are higher than before

hom = 6.1 caudy = 6.5 oliver = 6.3

note lower variance and lower means for the hom pool in general

fit.hom = lapply(mvn.hom,with,c(mu,sigma,lambda))
hfit = matrix(unlist(fit.hom),ncol=6,byrow=T)
rownames(hfit)=names(fit.hom)
colnames(hfit)=c("bkg_mean","ass_mean","bkg_sd","ass_sd","bkg_lambda","ass_lambda")


fit.cau = lapply(mvn.cau,with,c(mu,sigma,lambda))
cfit = matrix(unlist(fit.cau),ncol=6,byrow=T)
rownames(cfit)=names(fit.cau)
colnames(cfit)=c("bkg_mean","ass_mean","bkg_sd","ass_sd","bkg_lambda","ass_lambda")

fit.oliv = lapply(mvn.oliv,with,c(mu,sigma,lambda))
ofit = matrix(unlist(fit.oliv),ncol=6,byrow=T)
rownames(ofit)=names(fit.oliv)
colnames(ofit)=c("bkg_mean","ass_mean","bkg_sd","ass_sd","bkg_lambda","ass_lambda")

opar <- par(pch=19,mfrow=c(1,1),mar=c(2.2,2,1.4,1),cex=1.2)

plot(ofit[,1],ylim=c(5,12),col=1,main = "comparing pools mean good vs bad tags",cex.main=0.9,,type="b")

points(ofit[,2],ylim=c(5,12),col=4,type="b")
points(hfit[,1],ylim=c(5,12),col=2,type="b")
points(hfit[,2],ylim=c(5,12),col=5,type="b")
points(cfit[,1],ylim=c(5,12),col=3,type="b")
points(cfit[,2],ylim=c(5,12),col=6,type="b")

legend("topright", inset=.05, title="pool",
    c("oliver.bg","hom.bg","caudy.bg","oliver","hom","caudy"), fill=1:6, cex=0.7)

hfit = apply(hfit,2,mean)
cfit = apply(cfit,2,mean)
ofit = apply(ofit,2,mean)

hom_bgthresh = hfit[2] - 3*hfit[4]
caudy_bgthresh = cfit[2] - 3*cfit[4]
oliv_bgthresh = ofit[2] - 3*ofit[4]

hom_bgthresh = hfit[1]
caudy_bgthresh =cfit[1]
oliv_bgthresh = ofit[1] 
print(noquote("stats for unass/assigned hom,caudy and oliver respectively = "))

## [1] stats for unass/assigned hom,caudy and oliver respectively =

print(hfit)

##   bkg_mean   ass_mean     bkg_sd     ass_sd bkg_lambda ass_lambda 
##  6.1019831  9.5738050  0.9242400  0.8667852  0.2446167  0.7553833

print(cfit)

##   bkg_mean   ass_mean     bkg_sd     ass_sd bkg_lambda ass_lambda 
##  6.5650372  9.8201733  1.0970971  0.7369216  0.4035549  0.5964451

print(ofit)

##   bkg_mean   ass_mean     bkg_sd     ass_sd bkg_lambda ass_lambda 
##  6.3821403  9.7263986  1.0899718  0.7657322  0.3400108  0.6599892

print(noquote("hom,caudy and oliver respectively:  +/- 3 stdev from mean assigned tags = "))

## [1] hom,caudy and oliver respectively:  +/- 3 stdev from mean assigned tags =

print(c(hom_bgthresh,  hfit[2] + 3*hfit[4] ))

##  bkg_mean  ass_mean 
##  6.101983 12.174161

print(c(caudy_bgthresh, cfit[2] + 3*cfit[4]))

##  bkg_mean  ass_mean 
##  6.565037 12.030938

print(c(oliv_bgthresh, ofit[2] + 3*ofit[4]))

## bkg_mean ass_mean 
##  6.38214 12.02360

check out differences in fitted gaussian parameters between pools

the ratio of background (lambda) to assigned is –: 25% ~40% ~30% of all 10216 tags called background for hom caudy and oliver repectively

print(noquote("hom,caudy and oliver percent background  vs present tags"))

## [1] hom,caudy and oliver percent background  vs present tags

print(c(round(hfit[5]*100,2) ,round(hfit[6]*100,2) ))

## bkg_lambda ass_lambda 
##      24.46      75.54

print(c(round(cfit[5]*100,2) ,round(cfit[6]*100,2) ))

## bkg_lambda ass_lambda 
##      40.36      59.64

print(c(round(ofit[5]*100,2) ,round(ofit[6]*100,2) ))

## bkg_lambda ass_lambda 
##         34         66

plot the data with the fitted curves

opar <- par(pch=19,mfrow=c(3,2),mar=c(2.2,0.5,1.4,0.5),cex=1.2)

print('hom_sc_24Sep14')

## [1] "hom_sc_24Sep14"

plot(mvn.hom$hom_sc_24Sep14,density = T)



print('oliver_sc_02Jun15')

## [1] "oliver_sc_02Jun15"

plot(mvn.oliv$oliver_sc_02Jun15,density = T)

print('caudy_sc_16Jun14')

## [1] "caudy_sc_16Jun14"

plot(mvn.cau$caudy_sc_16Jun14,density = T)

plot the assigned tag distribution with the posterior likelihood? of it being a “good” tag. (only ctrl exps shown)

w7 = which(psc$pcr %in% c("2015-07-15"))
w = which(psc$cond[w7] %in% c("arg","trp","sc") )
exps = psc$name[w7][w]

test = msc2 %>% filter(exp %in% exps)  %>% droplevels()

ggplot(test) +
  facet_wrap(pool ~ exp,ncol=3) +
  geom_histogram(aes(x=value,fill=pool),bins=100) + facet_wrap(~cond,nrow=3)

ggplot(test) +
  facet_wrap(pool ~ exp,ncol=3) +
  geom_density(aes(x=value,col=pool)) + facet_wrap(pool~cond,nrow=3)  +  
  
  geom_point(aes(x=value, y=post,col=pool))

mctrl = msc2 %>% filter(type=="ctrl") %>% droplevels()
caudy = msc2 %>% filter(pool== "caudy",type=="ctrl") %>% droplevels()
oliv = msc2 %>% filter(pool== "oliver",type=="ctrl") %>% droplevels()
hom = msc2 %>% filter(pool== "hom",type=="ctrl") %>% droplevels()

o = ggplot(oliv) +
  geom_histogram(aes(x=value,y = ..ncount..,fill=exp), binwidth=0.1,position="dodge") +

  geom_point(aes(x=value, y=post))+ theme_bw() + facet_grid(exp~.)  + ggtitle("oliver ctrls")+
  theme(legend.position="none")
  o

c = ggplot(caudy) +
  geom_histogram(aes(x=value,y = ..ncount..,fill=exp), binwidth=0.1,position="dodge") +

  geom_point(aes(x=value, y=post))+ theme_bw() + facet_grid(exp~.)  + ggtitle("caudy ctrls")+
  theme(legend.position="none")
  c

h = ggplot(hom) +
  geom_histogram(aes(x=value,y = ..ncount..,fill=exp), binwidth=0.1,position="dodge") +

  geom_point(aes(x=value, y=post))+ theme_bw() +  facet_grid(exp~.) + ggtitle("hom ctrls") +
  theme(legend.position="none")
  h

  ggplot(mctrl) +
  geom_histogram(aes(x=value,y = ..ncount..,fill=pool), binwidth=0.1,position="dodge") +

  geom_point(aes(x=value, y=post))+ theme_bw() +  facet_wrap(pool~exp,nrow=3) + ggtitle("hom ctrls") +
  theme(legend.position="none")

from here split out the pools and draw the Venn for tags w/ posterior > 0.5 in ~ 50% of experiments

i hate that i have to make an arbritrary cutoff after all this work! couldn’t we use pnorm to get the probabilty based on the estimated parameters of the gaussian?

actrl= acast(msc2, tag~pool+cond, value.var="sig",sum)

actrl = as.data.frame(actrl,stringsAsFactors = F)

gc = grep('caudy',colnames(actrl))

csum = apply(actrl[,gc],1,sum)

gh = grep('hom',colnames(actrl))

hsum = apply(actrl[,gh],1,sum)

go = grep('oliv',colnames(actrl))

osum = apply(actrl[,go],1,sum)

otab = table(osum)
htab = table(hsum)
ctab = table(csum)

opar <- par(pch=19,mfrow=c(3,2),mar=c(2.2,0.5,1.4,0.5),cex=1)

barplot(htab,col=2, main='count exps above threshold')
barplot(cumsum(prop.table(htab)),col=2, main='hom cumulative sum')

barplot(otab,col=1, main='oliver')
barplot(cumsum(prop.table(otab)),col=1, main='oliver cumulative sum')

barplot(ctab,col=3, main='caudy')
barplot(cumsum(prop.table(ctab)),col=3, main='caudy cumulative sum')

wc = which(csum > nrow(pcs)/2)
wo = which(osum > nrow(pos)/2)
wh = which(hsum > nrow(phs)/2)



#wc = which(actrl[,1]> 0.5)
#wh = which(actrl[,2]> 0.5)
#wo = which(actrl[,3]> 0.5)

ctag = rownames(actrl)[wc]
htag = rownames(actrl)[wh]
otag = rownames(actrl)[wo]

cmix=xcs[ctag,pcs$name]
hmix=xhs[htag,phs$name]
omix=xos[otag,pos$name]


hom_bgthresh = hfit[1]
caudy_bgthresh= cfit[1]
oliv_bgthresh = ofit[1]

bhix = myBeststrain(hmix,bgThresh = hom_bgthresh)

## [1] min value of ctrl medians =  7.46767155284592            
## [1] 4470   30

bcix = myBeststrain(cmix,bgThresh = caudy_bgthresh)

## [1] min value of ctrl medians =  7.59040103547903            
## [1] 3981   34

boix = myBeststrain(omix,bgThresh = oliv_bgthresh)

## [1] min value of ctrl medians =  7.81958360782921            
## [1] 4380   39

from barplots its pretty clear that the significance call is all or none, though different for each pool there are lots of ways to call presence or absence given we have likelihood? odds? across many experiments,

need to check consistency between replicates of absence/presence call

compute batch corrected tag presence using our standard approach for later comparison 1. median normalize by experiment 2. throw out tags that are not about the background threshold 3. pick best tag by the choosing the one with lowest sd/mean (if robust = F) for ctrl chips

wos = which(pos$type == "ctrl")
wcs = which(pcs$type == "ctrl")
whs = which(phs$type == "ctrl")



cbest = myBeststrain(xcs[noness$strain,],bgThresh = caudy_bgthresh)

## [1] min value of ctrl medians =  6.56544558591269            
## [1] 4606   34

hbest = myBeststrain(xhs[noness$strain,],bgThresh = hom_bgthresh)

## [1] min value of ctrl medians =  6.10519630177698            
## [1] 4679   30

obest = myBeststrain(xos[noness$strain,],bgThresh = oliv_bgthresh)

## [1] min value of ctrl medians =  6.3844752063782             
## [1] 4780   39

bghs = computeBgThresh(as.matrix(xsc[1:3600,phs$name]))
bgcs = computeBgThresh(as.matrix(xsc[1:3600,pcs$name]))
bgos = computeBgThresh(as.matrix(xsc[1:3600,pos$name]))

hbest = rmtags(hbest,whs,bghs)

## [1] 4679   30
## [1] 4679   30
## [1] 5.51681
## [1] 5.217231

cbest = rmtags(cbest,wcs,bgcs)

## [1] 4606   34
## [1] 4602   34
## [1] 5.717324
## [1] 5.193772

obest = rmtags(obest,wos,bgos)

## [1] 4780   39
## [1] 4777   39
## [1] 5.926253
## [1] 5.232661

bhix = rmtags(bhix,whs,bghs)

## [1] 4470   30
## [1] 4470   30
## [1] 6.806934
## [1] 5.217231

bcix= rmtags(bcix,wcs,bgcs)

## [1] 3981   34
## [1] 3981   34
## [1] 7.356527
## [1] 5.193772

boix = rmtags(boix,wos,bgos)

## [1] 4380   39
## [1] 4380   39
## [1] 7.269101
## [1] 5.232661

cbat = ComBat(cbest ,model.matrix(~cond,pcs),batch = pcs$pcr)

## Found 3 batches
## Adjusting for 12 covariate(s) or covariate level(s)
## Standardizing Data across genes
## Fitting L/S model and finding priors
## Finding parametric adjustments
## Adjusting the Data

hbat = ComBat(hbest ,model.matrix(~cond,phs),batch = phs$pcr)

## Found 3 batches
## Adjusting for 9 covariate(s) or covariate level(s)
## Standardizing Data across genes
## Fitting L/S model and finding priors
## Finding parametric adjustments
## Adjusting the Data

obat= ComBat(obest ,model.matrix(~cond,pos),batch = pos$pcr)

## Found 3 batches
## Adjusting for 12 covariate(s) or covariate level(s)
## Standardizing Data across genes
## Fitting L/S model and finding priors
## Finding parametric adjustments
## Adjusting the Data

bxxcs = ComBat(bcix ,model.matrix(~cond,pcs),batch = pcs$pcr)

## Found 3 batches
## Adjusting for 12 covariate(s) or covariate level(s)
## Standardizing Data across genes
## Fitting L/S model and finding priors
## Finding parametric adjustments
## Adjusting the Data

bxxhs = ComBat(bhix ,model.matrix(~cond,phs),batch = phs$pcr)

## Found 3 batches
## Adjusting for 9 covariate(s) or covariate level(s)
## Standardizing Data across genes
## Fitting L/S model and finding priors
## Finding parametric adjustments
## Adjusting the Data

bxxos= ComBat(boix,model.matrix(~cond,pos),batch = pos$pcr)

## Found 3 batches
## Adjusting for 12 covariate(s) or covariate level(s)
## Standardizing Data across genes
## Fitting L/S model and finding priors
## Finding parametric adjustments
## Adjusting the Data

compare the GENE lists in standard vs mixEM

hsd = setdiff(rownames(hbat),rownames(bxxhs))
osd = setdiff(rownames(obat),rownames(bxxos))
csd = setdiff(rownames(cbat),rownames(bxxcs))




hsd1 = setdiff(rownames(bxxhs),rownames(hbat))
osd1 = setdiff(rownames(bxxos),rownames(obat))
csd1 = setdiff(rownames(bxxcs),rownames(cbat))



print('# strains rejected by EM, caudy hom oliver')

## [1] "# strains rejected by EM, caudy hom oliver"

print(length(csd))

## [1] 963

print(length(hsd))

## [1] 368

print(length(osd))

## [1] 710

print('# strains rejected by std')

## [1] "# strains rejected by std"

print(length(csd1))

## [1] 342

print(length(hsd1))

## [1] 159

print(length(osd1))

## [1] 313

r compare distributions of rejected strains rejected by mixEM vs standard preprocessing

blue lines are strains called as background with mixEM but NOT in standard

orange lines are strains called as background with standard but NOT in mixEM

opar <- par(pch=19,mfrow=c(1,3),mar=c(2.2,2,1.4,1),cex=1.2)

plot(density(xhs[hsd1,whs[1]]),col=1,lwd=2,xlim=c(4,12),ylim=c(0,0.6),main='hom')
lines(density(xhs[hsd1,whs[2]]),col=1,lwd=2)
lines(density(xhs[hsd1,whs[3]]),col=1,lwd=2)
lines(density(xhs[hsd,whs[3]]),col=2,lwd=2)
lines(density(xhs[hsd,whs[2]]),col=2,lwd=2)
lines(density(xhs[hsd,whs[1]]),col=2,lwd=2)



legend("topright", inset=.05, title="method",
    c("rejected std","rejected EM"), fill=1:2, cex=0.7)

plot(density(xos[osd1,wos[1]]),col=1,lwd=2,xlim=c(4,12),ylim=c(0,0.6),main='oliver')
lines(density(xos[osd1,wos[2]]),col=1,lwd=2)
lines(density(xos[osd1,wos[3]]),col=1,lwd=2)
lines(density(xos[osd,wos[3]]),col=2,lwd=2)
lines(density(xos[osd,wos[2]]),col=2,lwd=2)
lines(density(xos[osd,wos[1]]),col=2,lwd=2)

plot(density(xcs[csd1,wcs[1]]),col=1,lwd=2,xlim=c(4,12),ylim=c(0,0.6),main ='caudy')
lines(density(xcs[csd1,wcs[2]]),col=1,lwd=2)
lines(density(xcs[csd1,wcs[3]]),col=1,lwd=2)
lines(density(xcs[csd,wcs[2]]),col=2)
lines(density(xcs[csd,wcs[3]]),col=2,lwd=2)
lines(density(xcs[csd,wcs[1]]),col=2,lwd=2)

obat = getgene(obat)
cbat = getgene(cbat)
bxxos = getgene(bxxos)
bxxcs = getgene(bxxcs)
bxxhs = getgene(bxxhs)
hbat = getgene(hbat)

cbat = myduprow(cbat)

## [1] 4602   34
## [1] 4546   34

hbat = myduprow(hbat)

## [1] 4679   30
## [1] 4640   30

obat = myduprow(obat)

## [1] 4777   39
## [1] 4669   39

bxxcs = myduprow(bxxcs)

## [1] 3981   34
## [1] 3970   34

bxxhs = myduprow(bxxhs)

## [1] 4470   30
## [1] 4455   30

bxxos = myduprow(bxxos)

## [1] 4380   39
## [1] 4298   39

i hate venns but here goes: make a list of strains in each pool tags absent in all strains were removed

venn universe is the list of tags present in at least one pool

pie charts! think its easier to visualize –

compare this to our standard call of gene presence:

top venn is genes by mixEM, bottom is by standard

cboth= rownames(bxxcs)
hboth= rownames(bxxhs)
oboth= rownames(bxxos)

df = data.frame(gene = unique(noness$sgd_gene),stringsAsFactors = F)

df$mixcau = rep(NA,nrow(df))
df$mixhom = rep(NA,nrow(df))
df$mixoliv = rep(NA,nrow(df))
df$standcau = rep(NA,nrow(df))
df$standhom = rep(NA,nrow(df))
df$standoliv = rep(NA,nrow(df))

w = which(df$gene %in% cboth)
df$mixcau[w]= 1
df$mixcau[-w]= 0

w = which(df$gene %in% hboth)
df$mixhom[w]= 1
df$mixhom[-w]= 0

w =which(df$gene %in% oboth)
df$mixoliv[w]= 1
df$mixoliv[-w]= 0

w = which(df$gene %in% rownames(cbat))
df$standcau [w]= 1
df$standcau [-w]= 0

w = which(df$gene %in% rownames(hbat))
df$standhom[w]= 1
df$standhom [-w]= 0

w = which(df$gene %in% rownames(obat))
df$standoliv[w]= 1
df$standoliv [-w]= 0

df$ssum = df$standcau+df$standoliv+df$standhom

df$xsum = df$mixcau+df$mixoliv+df$mixhom

ws = which(df$ssum == 0)

xdf = df[,c(1:4,9)]
sdf = df[,c(1,5:8)]

ws = which(sdf$ssum == 0)

print(length(ws))

## [1] 110

sdf = sdf[-ws,]
wx = which(xdf$xsum == 0)
print(length(wx))

## [1] 215

xdf = xdf[-wx,]

mix = melt(xdf[,1:4],variable.name = "pool",value.name="call")

## Using gene as id variables

stand = melt(sdf[,1:4],variable.name = "pool",value.name="call")

## Using gene as id variables

piemix = pie_chart1(mix[,2:3], main = "call",  condition = "pool")
piestand = pie_chart1(stand[,2:3], main = "call",  condition = "pool")

name = c("caudy","hom","oliver")

opar <- par(pch=19,mfrow=c(2,1),mar=c(2.2,0.5,1.4,0.5),cex=1.2)

multiplot(piemix, piestand, layout=matrix(c(1,2), nrow=2, byrow=TRUE))

venn strains present by mix EM

t1 = ggvenn(cboth,hboth,oboth,category = name)

present by standard

t2 = ggvenn(rownames(cbat),rownames(hbat),rownames(obat))

venn strains missing by mix EM

univ = unique(c(cboth,hboth,oboth))

xcbg = setdiff(univ,cboth)
xhbg = setdiff(univ,hboth)
xobg = setdiff(univ,oboth)

t1 = ggvenn(xcbg,xhbg,xobg,category = name)

venn strains missing by standard

suniv = unique(c(rownames(obat),rownames(cbat),rownames(hbat)))

cbg = setdiff(suniv,rownames(cbat))
hbg = setdiff(suniv,rownames(hbat))
obg = setdiff(suniv,rownames(obat))
t1 = ggvenn(cbg,hbg,obg,category = name)

proportion venn genes missing by EM vs STD

opar <- par(pch=19,mfrow=c(1,2),mar=c(2.2,0.5,1.4,0.5),cex=1.2)

library(venneuler)

## Loading required package: rJava

mix0 = filter(mix,mix$call == 0)
v0 <- venneuler(mix0[,c(1,2)])
v0$colors=c(0.1,0.5,0.75)


cau = paste('mixcaud',length(xcbg))
hom = paste('mixhom',length(xhbg))
oliv = paste('mixoliv',length(xobg))
v0$labels = c(cau,hom,oliv)
plot(v0)

stand0 = filter(stand,stand$call == 0)

cau = paste('stdcaud',length(cbg))
hom = paste('stdhom',length(hbg))
oliv = paste('stdoliv',length(obg))
v0 <- venneuler(stand0[,c(1,2)])
v0$colors=c(0.1,0.5,0.75)
v0$labels = c(cau,hom,oliv)
plot(v0)

check fitness profiles by preprocessing method mixEM seems to work well in getting rid of flaky tags

whs = which(phs$type == "ctrl")
wcs = which(pcs$type == "ctrl")
wos = which(pos$type == "ctrl")


lobat = getLR(obat,wos)
lbxxos = getLR(bxxos,wos)  

lcbat = getLR(cbat,wcs)
lbxxcs = getLR(bxxcs,wcs) 


lhbat = getLR(hbat,whs)
lbxxhs = getLR(bxxhs,whs) 

whs = which(phs$type == "ctrl")
wcs = which(pcs$type == "ctrl")
wos = which(pos$type == "ctrl")

lphs = phs[-whs,]

lpos = pos[-wos,]

lpcs = pcs[-wcs,]

wgly = which(lphs$cond %in% c("gln","gly",'ser','thr'))
lphs = lphs[-wgly,]
wgly = which(lpcs$cond %in% c("gln","gly",'ser','thr'))
lpcs = lpcs[-wgly,]
wgly = which(lpos$cond %in% c("gln","gly",'ser','thr'))
lpos = lpos[-wgly,]

lphs = lphs %>% arrange(cond,pcr)
lpcs = lpcs %>% arrange(cond,pcr)
lpos = lpos %>% arrange(cond,pcr)

lcbat = lcbat[,lpcs$name]
lbxxcs = lbxxcs[,lpcs$name]

lhbat = lhbat[,lphs$name]
lbxxhs = lbxxhs[,lphs$name]

lobat = lobat[,lpos$name]
lbxxos = lbxxos[,lpos$name]




opar <- par(pch=19,mfrow=c(1,2),mar=c(2.2,2,1.4,1),cex=1.2)

hsub = jul15 = grep("Jul15",colnames(lhbat))

csub = grep("Jul15",colnames(lcbat))
osub = jul15 = grep("Jul15",colnames(lobat))

colnames(lbxxos) = paste("EM.",colnames(lbxxos))
colnames(lbxxcs) = paste("EM.",colnames(lbxxcs))
colnames(lbxxhs) = paste("EM.",colnames(lbxxhs))

compare fitness profiles EM to standard

opar <- par(pch=19,mfrow=c(1,2),mar=c(2.2,2,1.4,1),cex=1.2)


for (i in hsub) psig(lhbat,lbxxhs,i)

for (i in osub) psig(lobat,lbxxos,i)

##check out this nice caudy LEU profile! corey reminded me of LEU2 in SGA strain contruction
for (i in csub) psig(lcbat,lbxxcs,i)

compare on same scales

harg = grep('arg',colnames(lhbat))
carg = grep('arg',colnames(lcbat))
oarg = grep('arg',colnames(lobat))

opar <- par(pch=19,mfrow=c(1,2),mar=c(2.2,2,1.4,1),cex=1.2)


for (i in harg) pscale1(lhbat,i,lbxxhs,i)

for (i in carg) pscale1(lcbat,i,lbxxcs,i)

for (i in oarg) pscale1(lobat,i,lbxxos,i)

harg = grep('lys',colnames(lhbat))
carg = grep('lys',colnames(lcbat))
oarg = grep('lys',colnames(lobat))

opar <- par(pch=19,mfrow=c(1,2),mar=c(2.2,2,1.4,1),cex=1.2)


for (i in harg) pscale1(lhbat,i,lbxxhs,i)

for (i in carg) pscale1(lcbat,i,lbxxcs,i)

for (i in oarg) pscale1(lobat,i,lbxxos,i)

compare differences by strain

hsd = setdiff(rownames(hbat),rownames(bxxhs))
hsd = sort(unique(hsd))

print('hom reject EM')

## [1] "hom reject EM"

print(hsd)

##   [1] "AAH1"      "ACB1"      "ADE5,7"    "ADE8"      "ADK1"     
##   [6] "AIM26"     "AMA1"      "APL6"      "ARG8"      "ARO1"     
##  [11] "ARO7"      "ARP5"      "ARV1"      "ASP1"      "ATP12"    
##  [16] "ATP15"     "BCS1"      "BDS1"      "BTS1"      "BUB3"     
##  [21] "BUD32"     "CAX4"      "CCR4"      "CHK1"      "CNM67"    
##  [26] "COA2"      "COQ6"      "COR1"      "COX10"     "COX20"    
##  [31] "CPS1"      "CTF4"      "CYB2"      "DAL82"     "DEG1"     
##  [36] "DIA4"      "DID4"      "DSC2"      "ECM22"     "ECM31"    
##  [41] "EIS1"      "ERG6"      "EXO5"      "FEN2"      "FLX1"     
##  [46] "FMP52"     "FYV6"      "HAP2"      "HBN1"      "HSL1"     
##  [51] "ICE2"      "IDH1"      "IMD4"      "IMG2"      "IOC4"     
##  [56] "IPK1"      "KAP122"    "KAR3"      "KRE28"     "KRE6"     
##  [61] "LSO1"      "LTV1"      "MDS3"      "MET13"     "MHO1"     
##  [66] "MNN10"     "MOT2"      "MRH4"      "MSS1"      "NOT5"     
##  [71] "NUP84"     "OPI9"      "PDR5"      "PET112"    "PHO85"    
##  [76] "PIN2"      "POG1"      "PRO1"      "PRO2"      "PRS2"     
##  [81] "PSR1"      "QCR9"      "RBK1"      "RLM1"      "RNR4"     
##  [86] "RPB4"      "RPL1B"     "RPL31A"    "RPS16B"    "RPS23B"   
##  [91] "RPS25B"    "RPS6A"     "RRF1"      "RRG8"      "RRT14"    
##  [96] "RSC2"      "RSM24"     "RSM26"     "RTF1"      "SAF1"     
## [101] "SAM3"      "SAN1"      "SIR3"      "SKO1"      "SNF12"    
## [106] "SNF2"      "SOM1"      "SPC1"      "SPT7"      "SRC1"     
## [111] "SSA2"      "SSB2"      "STE11"     "STE3"      "SUS1"     
## [116] "SUV3"      "SYM1"      "TAF14"     "THO2"      "THP1"     
## [121] "TIM8"      "TKL2"      "TRK1"      "UBP15"     "UGO1"     
## [126] "UIP4"      "VAM6"      "VBA5"      "VMA16"     "VMA21"    
## [131] "VMA22"     "VMA7"      "VPS15"     "VPS16"     "VPS33"    
## [136] "VPS53"     "YAL016C-B" "YBR096W"   "YBR221W-A" "YCL021W-A"
## [141] "YDL183C"   "YDL206W"   "YDR524C-B" "YER076C"   "YGL088W"  
## [146] "YGL185C"   "YGL218W"   "YGR035W-A" "YGR121W-A" "YGR161W-C"
## [151] "YGR174W-A" "YGR204C-A" "YGR219W"   "YHL015W-A" "YHR022C-A"
## [156] "YHR086W-A" "YIR018C-A" "YJL136W-A" "YKL018C-A" "YKL106C-A"
## [161] "YLL006W-A" "YLR361C-A" "YLR419W"   "YML007C-A" "YML009W-B"
## [166] "YML079W"   "YMR013W-A" "YMR141C"   "YMR230W-A" "YNG2"     
## [171] "YNL028W"   "YNL097C-B" "YNL184C"   "YOL097W-A" "YOL118C"  
## [176] "YOL159C-A" "YOL164W-A" "YOR161C-C" "YOR316C-A" "YOR331C"  
## [181] "YPL152W-A" "YPR099C"   "YPR159C-A" "YTA12"     "ZAP1"

osd = setdiff(rownames(lobat),rownames(lbxxos))
print('oliv reject EM')

## [1] "oliv reject EM"

osd = sort(osd)
print(osd)

##   [1] "AAH1"         "ADE1"         "ADE3"         "ADE4"        
##   [5] "ADE5,7"       "ADE6"         "ADK1"         "AGP2"        
##   [9] "AIM44"        "ALD5"         "ALG9"         "AMA1"        
##  [13] "ANP1"         "APL6"         "APQ12"        "ARC18"       
##  [17] "ARG80"        "ARL3"         "ARP8"         "ARV1"        
##  [21] "ASC1"         "ASF1"         "ASK10"        "ASP1"        
##  [25] "ATG11"        "ATG4"         "ATP12"        "ATP15"       
##  [29] "AVT5"         "BAR1"         "BEM4"         "BNA7"        
##  [33] "BOR1"         "BTS1"         "BUD14"        "BUD20"       
##  [37] "BUD23"        "CAX4"         "CBS2"         "CCE1"        
##  [41] "CHA1"         "CHC1"         "CKA1"         "CKB1"        
##  [45] "CLA4"         "CNE1"         "CNM67"        "COQ8"        
##  [49] "CRP1"         "CST6"         "CTF8"         "CTK3"        
##  [53] "CTR9"         "CYC3"         "DAK1"         "DAL82"       
##  [57] "DCI1"         "DEF1"         "DEG1"         "DFG16"       
##  [61] "DHH1"         "DON1"         "DRS2"         "DUG3"        
##  [65] "DUN1"         "ECM22"        "ECM31"        "EDE1"        
##  [69] "EIS1"         "ELO2"         "ELO3"         "ELP2"        
##  [73] "EMC6"         "END3"         "ENT3"         "ERG24"       
##  [77] "ERG3"         "ERJ5"         "ERP2"         "EST3"        
##  [81] "ETR1"         "EXO5"         "FMP52"        "FMT1"        
##  [85] "FPR1"         "FRA1"         "FUN14"        "FUN30"       
##  [89] "FYV10"        "FYV5"         "FYV6"         "GCN1"        
##  [93] "GCN4"         "GLE2"         "GLO3"         "GRX8"        
##  [97] "HAP3"         "HCR1"         "HFI1"         "HFM1"        
## [101] "HIR1"         "HMO1"         "HMT1"         "HOM3"        
## [105] "HSM3"         "HXT17"        "IDH2"         "IMD4"        
## [109] "INM2"         "IOC4"         "IRC18"        "IST3"        
## [113] "IZH1"         "JJJ3"         "KAP122"       "KTI11"       
## [117] "KTR1"         "KTR6"         "LAC1"         "LGE1"        
## [121] "LHS1"         "LOA1"         "LOC1"         "LSM1"        
## [125] "LSM6"         "LTV1"         "LYS20"        "MAP1"        
## [129] "MDJ1"         "MDL2"         "MDM10"        "MED1"        
## [133] "MET10"        "MET13"        "MET22"        "MET7"        
## [137] "MFM1"         "MGR2"         "MGR3"         "MGT1"        
## [141] "MIP6"         "MIX17"        "MMM1"         "MMS22"       
## [145] "MNN10"        "MNN2"         "MPO1"         "MRPL1"       
## [149] "MRPL20"       "MRPL4"        "MRPS5"        "MRT4"        
## [153] "MSA1"         "MSB4"         "MSD1"         "MSR1"        
## [157] "MSS1"         "MVB12"        "NCR1"         "NDE1"        
## [161] "NDI1"         "NGG1"         "NHX1"         "NPL3"        
## [165] "NSR1"         "NUC1"         "NUP60"        "OM14"        
## [169] "OPI8"         "OSW7"         "OTU2"         "PDH1"        
## [173] "PEP12"        "PEX12"        "PEX13"        "PEX21"       
## [177] "PEX28"        "PHO89"        "PIM1"         "PMP3"        
## [181] "PMR1"         "PMT1"         "POP2"         "POS5"        
## [185] "PPN1"         "PPT2"         "PPZ1"         "PRM2"        
## [189] "PRM9"         "PRO2"         "PRS3"         "PRY2"        
## [193] "PSR1"         "PUF4"         "PUF6"         "PXA2"        
## [197] "RAD6"         "RAD9"         "RAM1"         "RCN1"        
## [201] "RCO1"         "RCR1"         "RCR2"         "REI1"        
## [205] "REV7"         "RGC1"         "RHB1"         "RIM4"        
## [209] "ROM1"         "RPB4"         "RPL12B"       "RPL21A"      
## [213] "RPL22A"       "RPL27B"       "RPL2B"        "RPL36B"      
## [217] "RPL37A"       "RPL40A"       "RPL42B"       "RPL6A"       
## [221] "RPL7B"        "RPS10A"       "RPS11A"       "RPS17A"      
## [225] "RPS18A"       "RPS19B"       "RPS21A"       "RPS29B"      
## [229] "RPS7A"        "RPS9B"        "RRG8"         "RSA1"        
## [233] "RSM26"        "RTT109"       "SAP1"         "SAP4"        
## [237] "SAS2"         "SDH1"         "SEM1"         "SIC1"        
## [241] "SIN4"         "SIR4"         "SIT4"         "SKG1"        
## [245] "SLS1"         "SMF2"         "SMK1"         "SNF6"        
## [249] "SNG1"         "SNT309"       "SPC72"        "SPF1"        
## [253] "SPL2"         "SPO23"        "SPT10"        "SPT4"        
## [257] "SRB5"         "SSA2"         "SSE1"         "SSF1"        
## [261] "SSH1"         "SSQ1"         "SSZ1"         "STR2_paralog"
## [265] "SUS1"         "SUV3"         "SWC5"         "SWI5"        
## [269] "SWI6"         "SWR1"         "TAT2"         "TGS1"        
## [273] "THI2"         "TIF1"         "TKL2"         "TMA10"       
## [277] "TOD6"         "TOF2"         "TOM1"         "TOP1"        
## [281] "TRM9"         "TSA2"         "TSC3"         "TSR3"        
## [285] "TVP15"        "UBP15"        "UBP3"         "UBP5"        
## [289] "UFO1"         "UME6"         "UMP1"         "URC2"        
## [293] "URM1"         "UTP30"        "UTR4"         "VBA1"        
## [297] "VID27"        "VMA1"         "VMA10"        "VMA11"       
## [301] "VMA16"        "VMA2"         "VMA21"        "VMA5"        
## [305] "VMA7"         "VMA9"         "VPS15"        "VPS28"       
## [309] "VPS3"         "VPS52"        "XRS2"         "YAL066W"     
## [313] "YBL059W"      "YBL071C"      "YBR096W"      "YBR178W"     
## [317] "YBR225W"      "YCL001W-B"    "YCR101C"      "YCR102C"     
## [321] "YDJ1"         "YDL068W"      "YDL129W"      "YDL183C"     
## [325] "YDR199W"      "YDR222W"      "YDR401W"      "YDR433W"     
## [329] "YDR514C"      "YEL045C"      "YER010C"      "YER046W-A"   
## [333] "YFL012W"      "YFL013W-A"    "YFR018C"      "YGK3"        
## [337] "YGL072C"      "YGL108C"      "YGL188C-A"    "YGR015C"     
## [341] "YGR064W"      "YGR117C"      "YGR160W"      "YHC3"        
## [345] "YHK8"         "YHR050W-A"    "YIL025C"      "YJL077W-B"   
## [349] "YJR018W"      "YJR154W"      "YKL136W"      "YKR051W"     
## [353] "YLR269C"      "YML009W-B"    "YMR031W-A"    "YMR242W-A"   
## [357] "YNG2"         "YNL140C"      "YNL171C"      "YOL079W"     
## [361] "YOL118C"      "YOR072W"      "YOR139C"      "YOR199W"     
## [365] "YOR318C"      "YOR331C"      "YPL260W"      "YPR084W"     
## [369] "YPR123C"      "YRR1"         "ZRT2"

csd = setdiff(rownames(cbat),rownames(bxxcs))
csd = sort(csd)

print('caudy reject EM')

## [1] "caudy reject EM"

print(csd)

##   [1] "1-Oct"     "AAH1"      "AAT2"      "ACO2"      "ADE2"     
##   [6] "ADE8"      "ADH3"      "ADK1"      "ADO1"      "AEP1"     
##  [11] "AEP2"      "AFT1"      "AIM10"     "AIM22"     "AIM44"    
##  [16] "ALO1"      "ANP1"      "APL6"      "ARG1"      "ARG2"     
##  [21] "ARG4"      "ARG5,6"    "ARG7"      "ARL3"      "ARO4"     
##  [26] "ARO7"      "ARP8"      "ARV1"      "ARX1"      "ASC1"     
##  [31] "ASF1"      "ASK10"     "ATG11"     "ATG14"     "ATG20"    
##  [36] "ATP1"      "ATP17"     "ATP22"     "ATP3"      "ATP5"     
##  [41] "AVL9"      "AVT5"      "BAS1"      "BBC1"      "BCS1"     
##  [46] "BFR1"      "BNA1"      "BOR1"      "BRE1"      "BTS1"     
##  [51] "BUB3"      "BUD14"     "BUD16"     "BUD19"     "BUD20"    
##  [56] "BUD21"     "BUD27"     "CBC2"      "CBP2"      "CCE1"     
##  [61] "CCM1"      "CCR4"      "CCS1"      "CDA1"      "CDA2"     
##  [66] "CDC10"     "CHA1"      "CHD1"      "CHM7"      "CHO2"     
##  [71] "CIR2"      "CKB1"      "CKB2"      "CKI1"      "CLA4"     
##  [76] "CLC1"      "CNE1"      "CNM67"     "COA3"      "COG1"     
##  [81] "COQ9"      "COR1"      "COX10"     "COX11"     "COX18"    
##  [86] "COX19"     "COX7"      "CPA1_uorf" "CPA2"      "CRP1"     
##  [91] "CTF8"      "CTK2"      "CTK3"      "CTR1"      "CTR9"     
##  [96] "CUS2"      "CYT1"      "DAL82"     "DCI1"      "DDC1"     
## [101] "DFG16"     "DIA2"      "DIA4"      "DID4"      "DOA4"     
## [106] "DOC1"      "DRS2"      "DSS1"      "DST1"      "DUG3"     
## [111] "DYN3"      "ECM27"     "ECM31"     "ECM33"     "ECM38"    
## [116] "EFG1"      "EFT2"      "EIS1"      "ELM1"      "ELO2"     
## [121] "ELO3"      "ELP2"      "ELP3"      "EMC6"      "END3"     
## [126] "ENT3"      "ERJ5"      "ERP2"      "EST1"      "EST2"     
## [131] "EST3"      "ETR1"      "EXO1"      "FAB1"      "FAR11"    
## [136] "FAR7"      "FEN2"      "FMP48"     "FMP52"     "FMT1"     
## [141] "FRA1"      "FUN30"     "FYV5"      "FZO1"      "GAL80"    
## [146] "GAS1"      "GCN1"      "GCN5"      "GCV3"      "GEA2"     
## [151] "GEP3"      "GGC1"      "GLE2"      "GLO3"      "GLR1"     
## [156] "GLY1"      "GPM2"      "GRX8"      "GSF2"      "GTF1"     
## [161] "GTR2"      "HAC1"      "HBN1"      "HEF3"      "HER2"     
## [166] "HFM1"      "HIR1"      "HIS1"      "HIS2"      "HIS4"     
## [171] "HIS5"      "HIS7"      "HIT1"      "HMI1"      "HMO1"     
## [176] "HMT1"      "HOC1"      "HOM2"      "HOM3"      "HOM6"     
## [181] "HOP2"      "HPM1"      "HPR1"      "HRQ1"      "HSL1"     
## [186] "HSM3"      "HTD2"      "HUL4"      "HUR1"      "HXT17"    
## [191] "ICE2"      "ICS2"      "IES1"      "IKI1"      "IMP2'"    
## [196] "INM2"      "INO4"      "IOC4"      "IRC18"     "ISA2"     
## [201] "IST3"      "JIP4"      "JJJ3"      "JNM1"      "KCS1"     
## [206] "KGD1"      "KGD2"      "KIP1"      "KTI11"     "KTR1"     
## [211] "KTR6"      "LAT1"      "LCL2"      "LEA1"      "LHS1"     
## [216] "LIP2"      "LMO1"      "LPD1"      "LRG1"      "LSC2"     
## [221] "LSM1"      "LSM6"      "LSO1"      "LTV1"      "LYP1"     
## [226] "LYS12"     "LYS5"      "LYS9"      "MAF1"      "MAM33"    
## [231] "MCM21"     "MCT1"      "MEF1"      "MEF2"      "MET1"     
## [236] "MET18"     "MET31"     "MET8"      "MFA1"      "MFM1"     
## [241] "MGM101"    "MGR2"      "MGR3"      "MGT1"      "MID1"     
## [246] "MIP1"      "MIP6"      "MMM1"      "MMS22"     "MMS4"     
## [251] "MNN2"      "MRH4"      "MRP1"      "MRP20"     "MRPL1"    
## [256] "MRPL11"    "MRPL13"    "MRPL15"    "MRPL16"    "MRPL17"   
## [261] "MRPL22"    "MRPL27"    "MRPL28"    "MRPL35"    "MRPL37"   
## [266] "MRPL51"    "MRPL9"     "MRPS16"    "MRPS17"    "MRPS28"   
## [271] "MRPS35"    "MRPS9"     "MRX14"     "MSB4"      "MSF1"     
## [276] "MSH1"      "MSM1"      "MSR1"      "MSS1"      "MSS116"   
## [281] "MST1"      "MSW1"      "MTG2"      "MUM2"      "MVB12"    
## [286] "NAT3"      "NCS6"      "NDI1"      "NEW1"      "NGG1"     
## [291] "NOT3"      "NUC1"      "NUP133"    "NUP60"     "NUP84"    
## [296] "OM14"      "OPI3"      "OSW7"      "OXA1"      "PAN5"     
## [301] "PAP2"      "PBP4"      "PBY1"      "PDE2"      "PDH1"     
## [306] "PET111"    "PET123"    "PET130"    "PET309"    "PET54"    
## [311] "PEX12"     "PEX13"     "PEX19"     "PEX21"     "PEX28"    
## [316] "PFD1"      "PFF1"      "PHO89"     "PKR1"      "PLM2"     
## [321] "PMR1"      "PMS1"      "PMT1"      "POP2"      "POR1"     
## [326] "PPA2"      "PPT2"      "PRM9"      "PRP18"     "PRS3"     
## [331] "PSD1"      "PSR1"      "PXA2"      "QCR8"      "QDR2"     
## [336] "RAD9"      "RAI1"      "RCF2"      "RCN1"      "RCR1"     
## [341] "RCR2"      "REI1"      "REV7"      "REX2"      "RFU1"     
## [346] "RGC1"      "RHB1"      "RHO5"      "RIM1"      "RMD6"     
## [351] "RNR4"      "ROM1"      "ROM2"      "RPB4"      "RPB9"     
## [356] "RPL11B"    "RPL13A"    "RPL16A"    "RPL16B"    "RPL19A"   
## [361] "RPL21A"    "RPL22A"    "RPL24A"    "RPL27A"    "RPL2B"    
## [366] "RPL31A"    "RPL34B"    "RPL36B"    "RPL42B"    "RPL43A"   
## [371] "RPL7B"     "RPO41"     "RPP2B"     "RPS0A"     "RPS10A"   
## [376] "RPS11A"    "RPS17A"    "RPS21B"    "RPS23B"    "RPS24B"   
## [381] "RPS29B"    "RPS7A"     "RPS8A"     "RPS9B"     "RRT8"     
## [386] "RSA1"      "RSM19"     "RSM22"     "RSM26"     "RSM27"    
## [391] "RTF1"      "RTS1"      "RTT101"    "RTT109"    "SAC1"     
## [396] "SAM37"     "SAP1"      "SAP4"      "SCO1"      "SCS22"    
## [401] "SDH1"      "SDH4"      "SDS3"      "SEA4"      "SEO1"     
## [406] "SER2"      "SHS1"      "SIC1"      "SIR3"      "SLM5"     
## [411] "SLM6"      "SLT2"      "SLX9"      "SMF2"      "SMK1"     
## [416] "SNA3"      "SNF1"      "SNF3"      "SNF4"      "SNF7"     
## [421] "SNG1"      "SOH1"      "SPL2"      "SPO23"     "SPO77"    
## [426] "SPT3"      "SPT7"      "SRB2"      "SRB5"      "SRN2"     
## [431] "SRO7"      "SSA2"      "SSE1"      "SSH1"      "SST2"     
## [436] "STB5"      "STE11"     "STE5"      "SUS1"      "SVF1"     
## [441] "SWC5"      "SWF1"      "SWH1"      "SWI5"      "SYG1"     
## [446] "TAT2"      "TCO89"     "TEF4"      "TFB5"      "THI2"     
## [451] "THP3"      "THR1"      "THR4"      "TIF1"      "TKL2"     
## [456] "TOF2"      "TPN1"      "TRM9"      "TRP1"      "TRP2"     
## [461] "TRX3"      "TSA2"      "TSC3"      "TSR3"      "UBC4"     
## [466] "UBP3"      "UME6"      "URA1"      "URA7"      "URC2"     
## [471] "URM1"      "UTR1"      "VAM3"      "VBA1"      "VBA2"     
## [476] "VID22"     "VID27"     "VMA11"     "VMA2"      "VMA21"    
## [481] "VMA9"      "VPH2"      "VPS1"      "VPS24"     "VPS28"    
## [486] "VPS29"     "VPS41"     "VPS52"     "VPS53"     "VPS69"    
## [491] "VPS70"     "VPS73"     "WHI3"      "XRS2"      "YAK1"     
## [496] "YAL066W"   "YAP1"      "YBL065W"   "YBL071C"   "YBR096W"  
## [501] "YBR178W"   "YBR225W"   "YCL001W-B" "YCL002C"   "YCL007C"  
## [506] "YCL021W-A" "YCR101C"   "YCR102C"   "YDJ1"      "YDL041W"  
## [511] "YDL118W"   "YDL172C"   "YDL211C"   "YDR048C"   "YDR090C"  
## [516] "YDR114C"   "YDR222W"   "YDR442W"   "YDR455C"   "YDR514C"  
## [521] "YEF1"      "YEL045C"   "YER010C"   "YET3"      "YFL012W"  
## [526] "YFL013W-A" "YFR045W"   "YGK3"      "YGL041C"   "YGL042C"  
## [531] "YGL088W"   "YGL108C"   "YGL188C-A" "YGL218W"   "YGR015C"  
## [536] "YGR064W"   "YGR117C"   "YGR160W"   "YHK8"      "YHR050W-A"
## [541] "YIL025C"   "YJL022W"   "YJL043W"   "YJL055W"   "YJL077W-B"
## [546] "YJR011C"   "YJR107W"   "YJR154W"   "YKL033W-A" "YKL136W"  
## [551] "YKR051W"   "YLR202C"   "YLR297W"   "YLR358C"   "YLR407W"  
## [556] "YML012C-A" "YML079W"   "YML094C-A" "YMR294W-A" "YNL146W"  
## [561] "YNL203C"   "YNL226W"   "YNR068C"   "YOL118C"   "YOR139C"  
## [566] "YOR199W"   "YOR283W"   "YOR376W"   "YPL062W"   "YPL260W"  
## [571] "YPR014C"   "YPR084W"   "YPT32"     "YTP1"      "ZRT2"     
## [576] "ZUO1"

hsd1 = setdiff(rownames(bxxhs),rownames(hbat))

if(length(hsd1)==0) print('no hom genes rejected by standard')

## [1] "no hom genes rejected by standard"

if(length(hsd1) > 0) print(hsd1)  



osd1 = setdiff(rownames(bxxos),rownames(obat))

if(length(osd1)==0) print('no oliver genes rejected by standard')

## [1] "no oliver genes rejected by standard"

if(length(osd1) > 0) print(osd1)

csd1 = setdiff(rownames(bxxcs),rownames(cbat))


if(length(csd1)==0) print('no caudy genes rejected by standard')

## [1] "no caudy genes rejected by standard"

if(length(osd1) > 0) print(csd1)

mix prefix = presence determined by fitting two gaussians stand prefix = presence determined by standard protocol of tag presences

xtab = table(mix$call,mix$pool)

stab = table(stand$call,stand$pool)


pxtab = round(xtab/nrow(xdf),2)
pstab = round(stab/nrow(sdf),2)

print(xtab)

##    
##     mixcau mixhom mixoliv
##   0    814    329     486
##   1   3970   4455    4298

print(stab)

##    
##     standcau standhom standoliv
##   0      343      249       220
##   1     4546     4640      4669

print(pxtab)

##    
##     mixcau mixhom mixoliv
##   0   0.17   0.07    0.10
##   1   0.83   0.93    0.90

print(pstab)

##    
##     standcau standhom standoliv
##   0     0.07     0.05      0.04
##   1     0.93     0.95      0.96

plot median vs mad of tags rejected ONLY by mixEM and not by our standard bkrnd thresholds:

background tag calls batch corrected mixEM SC

ggiaever

07/12/2016