background tag calls batch corrected mixEM YPD

filename = 080116_bkgrnd_edit_batchYPD.Rmd

hi pat! finally decided on a criteria – posterior > 0.5 in at least half of experiments, works better than just using the controls – update, works less well in the YPD drug experiments here, had to use a more stringent threshold

here’s our goal

Purpose Develop a procedure for identifying poorly performing tags; specific goal being to distinguish strains NOT in the pool for three YKO derived collection:

auxotrophic YKO (hom)
prototrophic collection made by SGA/back-crossing (caudy)
prototrophic collection made by CEN plasmid complementation of the auxotrophies

this is really a preprocessing step for now.

uber important as this is a major point of this manuscript

erica this sight might me as useful for you as it was for me to understand what mixEM is doing http://rstudio-pubs-static.s3.amazonaws.com/1001_3177e85f5e4840be840c84452780db52.html

pat, my main issue as always is where to draw the cutoff?

for now i just enforced the rule that all 3 control experiments had to have a posterior > 0.05 i know why i used 0.05 – thought they were pvalues and wanted to be 95% confident they were background, but its too stringent changed to > 0.5 in ~ 1/2 of experiments

i am sure there are more robust ways of calling/refining it. maybe even on a per tag basis rather than a per experiment basis? (eventually). also wasn’t sure if the batch correction was kosher…

batch correction definitely helps for SC, else the distributions all see to require different values based on plots

we did lose a few precious genes (i hope not too many) and probably need to tweak it, but overall it worked well

primarily we lost ADE genes from oliver – but if you look at these genes they are really flaky and close to background

steps

get YPD data for all pools, xypd is the data matrix, pypd describes experiments

  xypd = read.delim(
  "jul24_2016_allpools_YPD.txt",header = T,stringsAsFactors = F,check.names = F)
  
  pypd = read.delim(
  "jul24_2016_allpools_pYPD.txt",
  header = T,
  stringsAsFactors = F,
  check.names = F
  )
  
  
  fdata6 = read.delim("jun10_2016_fdata6.txt",
    stringsAsFactors = F,
    check.names = F)
    
  noness = filter(fdata6, fdata6$essential == 0)
  noness = noness %>% arrange(strain)
  ess = filter(fdata6, fdata6$essential == 1)
  
  pypd = pypd %>% arrange(pool,type,cond,pcr)
  
  xypd = xypd[,pypd$name]

assign tag type to each, plot ctrl experiments to compare distributions of unassigned vs assigned tags

removed unassigned in this run and used essentials instead

# 
# unass = grep("tag_",rownames(xypd))
# 
# rownames(xypd)[1:1800]=gsub(":uptag",":downtag",rownames(xypd)[1:1800])



wnoness = which(rownames(xypd) %in% noness$strain)
xypd$assign[wnoness] = "noness"

xypd$assign[1:3600] = "unass"
xypd$tag = rownames(xypd)

mat = match(xypd$tag, fdata6$strain)
xypd$gene = fdata6$sgd_gene[mat]
pcs = filter(pypd,pool == "caudy")
phs = filter(pypd,pool == "hom")
pos = filter(pypd,pool == "oliver")

whs = which(phs$type == "ctrl")
wcs = which(pcs$type == "ctrl")
wos = which(pos$type == "ctrl")
cs = sample(pcs$name[wcs],3)
hs = sample(phs$name[whs],3)
os = sample(pos$name[wos],3)

ypd = c(cs,hs,os)

w = which(names(xypd) %in% c(ypd,"assign","tag","gene"))
test = xypd[, w]

m <- melt(test, id.vars=c("tag","assign","gene"), variable.name = "exp")

mat = match(m$exp,pypd$name)

m$pool = pypd$pool[mat]
m$tag = factor(m$tag)
m$pool = factor(m$pool)
m$assign = factor(m$assign)

ggplot(m, aes(value)) + geom_histogram(aes(fill = pool), bins=50) +
  facet_grid(assign~pool) + ggtitle("distribution by pool and tag type")

ggplot(m, aes(value))+geom_freqpoly(aes(colour=pool), bins=50)+facet_grid(assign~.)+theme_bw()+ ggtitle("distribution by pool")

ggplot(m, aes(value)) + geom_histogram(aes(fill = assign), bins=50) +
  facet_wrap(~exp,ncol=3) + ggtitle("distribution by pool and tag type ctrl chips")+theme_bw()

batch correct and normalize first – all tags together

plot before and after batch correction to see if it helps align distributiosn

density
dendograms

mydendy(xhs,phs$pcr,main="prebatch hom correct color = pcr date")

## 'dendrogram' with 2 branches and 26 members total, at height 151.3245

mydendy(bhs,phs$pcr, main="post batch hom correct color = pcr date")

## 'dendrogram' with 2 branches and 26 members total, at height 75.7054

 mydendy(xhs,phs$cond,main="prebatch hom correct color = condition")

## 'dendrogram' with 2 branches and 26 members total, at height 151.3245

 mydendy(bhs,phs$cond, main="post batch hom correct color = condition")

## 'dendrogram' with 2 branches and 26 members total, at height 75.7054

mydensity(xhs,phs$pcr,main="pre batch hom correct density, color = pcr date")

mydensity(bhs,phs$pcr, main="post batch hom correct density, color = pcr date")

reformat long-2-wide

print('unass included')

## [1] "unass included"

bsc = cbind(bcs,bhs,bos)


bsc = as.data.frame(bsc,stringsAsFactors = F)
w = which(names(xypd) %in% c("assign","tag","gene"))

bsc = cbind(bsc,xypd[,w])



msc <- melt(bsc, id.vars=c("tag","assign","gene"), variable.name = "exp")
mat = match(msc$exp,pypd$name)
msc$pool = pypd$pool[mat]
msc$cond = pypd$cond[mat]
msc$type = pypd$type[mat]

estimate of the unassigned means and sd – pretty equal for all pools

ua.mean <- aggregate(value ~ pool, data=msc, FUN=mean, subset=assign=='unass')
cmean = filter(ua.mean,pool == "caudy")
hmean = filter(ua.mean,pool == "hom")
omean = filter(ua.mean,pool == "oliver")

ua.var <- aggregate(value ~ pool, data=msc, FUN=sd, subset=assign=='unass')
cvar = filter(ua.var,pool == "caudy")
hvar = filter(ua.var,pool == "hom")
ovar = filter(ua.var,pool == "oliver")

print(ua.mean)

##     pool    value
## 1  caudy 5.251039
## 2    hom 5.268618
## 3 oliver 5.330938

print(ua.var)

##     pool     value
## 1  caudy 0.6937826
## 2    hom 0.4193457
## 3 oliver 0.7380817

re batch correct without the unassigned tags

bcs = medShift.updn(as.matrix(xcs[noness$strain,pcs$name]))
bhs = medShift.updn(as.matrix(xhs[noness$strain,phs$name]))
bos = medShift.updn(as.matrix(xos[noness$strain,pos$name]))
bcs = ComBat(bcs[noness$strain,pcs$name],model.matrix(~cond,pcs),batch = pcs$pcr)

## Found 6 batches
## Adjusting for 5 covariate(s) or covariate level(s)
## Standardizing Data across genes
## Fitting L/S model and finding priors
## Finding parametric adjustments
## Adjusting the Data

bhs = ComBat(bhs[noness$strain,phs$name],model.matrix(~cond,phs),batch = phs$pcr)

## Found 5 batches
## Adjusting for 5 covariate(s) or covariate level(s)
## Standardizing Data across genes
## Fitting L/S model and finding priors
## Finding parametric adjustments
## Adjusting the Data

bos = ComBat(bos[noness$strain,pos$name],model.matrix(~cond,pos),batch = pos$pcr)

## Found 6 batches
## Adjusting for 5 covariate(s) or covariate level(s)
## Standardizing Data across genes
## Fitting L/S model and finding priors
## Finding parametric adjustments
## Adjusting the Data

print('unass removed')

## [1] "unass removed"

bsc = cbind(bcs,bhs,bos)


bsc = as.data.frame(bsc,stringsAsFactors = F)
bsc$tag = rownames(bsc)
table(bsc$tag == noness$strain)

## 
##  TRUE 
## 10216

bsc$gene = noness$sgd_gene


msc <- melt(bsc, id.vars=c("tag","gene"), variable.name = "exp")
mat = match(msc$exp,pypd$name)
msc$pool = pypd$pool[mat]
msc$cond = pypd$cond[mat]
msc$type = pypd$type[mat]

Plot histogram of assigned and unassigned for all experiments. This will generate a tall lattice of plots, so only save to pdf, but do not show in notebook.

It is pretty clear than around June 2 there is a leftward shift in the assigned tags and then a shift back to the right around July 15.

ggnote – i batch corrected this round –

I think we can fit a single gaussian to the unassigned tag distribution. Then fit a mixture of two Gaussians with one of the two fixed at the unassigned tag distribution and the other unconstrained for the assigned tags. Then we can use the posterior distribution over the cluster assignment variables to give an odds-ratio of the tag assignement. Then we can draw a threshold on the odds-ratio.

ggnote: drew a threshold based on 0.05

Fit a 2 component Gaussian mixture model to each experiment’s NONESS assigned tags. Store the posterior probability of the tag assignment to the “good” or “not unassigned” distribution" for each experiment.

note the difference in number of iterations needed to converge each time this is run – us this something to worry about? takes more iterations without batch correction

       SC     no batch             YPD

caudy = 183 – 282 187 hom = 78 131 127 oliver = 312 242 778!!!

reformat dataframe wide to long for plotting

most<- melt(post, id.vars=c("tag"), variable.name = "exp",value.name = "post")



msc2 <- merge(msc, most, by=c("tag","exp"))

# call bkgnd tags by posterior > 0.5
msc2$sig = rep(NA,nrow(msc2))


wbkgnd = which(msc2$post > 0.5)
msc2$sig[wbkgnd]=1
msc2$sig[-wbkgnd]=0

mat = match(msc2$exp,pypd$name)
msc2$type = pypd$type[mat]
msc2$cond = pypd$cond[mat]

grab the fits from the models based on unassigned tags instead: erica we can justify background thresholds using these

note lower variance and lower means for the hom pool in general

fit.hom = lapply(mvn.hom,with,c(mu,sigma,lambda))
hfit = matrix(unlist(fit.hom),ncol=6,byrow=T)
rownames(hfit)=names(fit.hom)
colnames(hfit)=c("bkg_mean","ass_mean","bkg_sd","ass_sd","bkg_lambda","ass_lambda")


fit.cau = lapply(mvn.cau,with,c(mu,sigma,lambda))
cfit = matrix(unlist(fit.cau),ncol=6,byrow=T)
rownames(cfit)=names(fit.cau)
colnames(cfit)=c("bkg_mean","ass_mean","bkg_sd","ass_sd","bkg_lambda","ass_lambda")

fit.oliv = lapply(mvn.oliv,with,c(mu,sigma,lambda))
ofit = matrix(unlist(fit.oliv),ncol=6,byrow=T)
rownames(ofit)=names(fit.oliv)
colnames(ofit)=c("bkg_mean","ass_mean","bkg_sd","ass_sd","bkg_lambda","ass_lambda")

opar <- par(pch=19,mfrow=c(1,1),mar=c(2.2,2,1.4,1),cex=1.2)

plot(ofit[,1],ylim=c(5,12),col=1,main = "comparing pools mean good vs bad tags",cex.main=0.9,,type="b")

points(ofit[,2],ylim=c(5,12),col=4,type="b")
points(hfit[,1],ylim=c(5,12),col=2,type="b")
points(hfit[,2],ylim=c(5,12),col=5,type="b")
points(cfit[,1],ylim=c(5,12),col=3,type="b")
points(cfit[,2],ylim=c(5,12),col=6,type="b")

legend("topright", inset=.05, title="pool",
    c("oliver.bg","hom.bg","caudy.bg","oliver","hom","caudy"), fill=1:6, cex=0.7)

hfit = apply(hfit,2,mean)
cfit = apply(cfit,2,mean)
ofit = apply(ofit,2,mean)


hom_bgthresh = hfit[1]
caudy_bgthresh =cfit[1]
oliv_bgthresh = ofit[1] 
print(noquote("stats for unass/assigned hom,caudy and oliver respectively = "))

## [1] stats for unass/assigned hom,caudy and oliver respectively =

print(hfit)

##   bkg_mean   ass_mean     bkg_sd     ass_sd bkg_lambda ass_lambda 
##  5.7271166  9.1996259  0.5743220  1.0161595  0.2035815  0.7964185

print(cfit)

##   bkg_mean   ass_mean     bkg_sd     ass_sd bkg_lambda ass_lambda 
##  5.8748559  9.5280973  0.7413370  0.9805742  0.3217994  0.6782006

print(ofit)

##   bkg_mean   ass_mean     bkg_sd     ass_sd bkg_lambda ass_lambda 
##  5.6978940  9.3414196  0.6196868  1.0285872  0.2413985  0.7586015

not sure how to justify the threshold here here based on fitted parameters or by the plots below

any way to get a probability

print(noquote("hom,caudy and oliver respectively:  +/- 3 stdev from mean assigned tags = "))

## [1] hom,caudy and oliver respectively:  +/- 3 stdev from mean assigned tags =

print(c(hfit[2] - 3*hfit[4],  hfit[2] + 3*hfit[4] ))

##  ass_mean  ass_mean 
##  6.151147 12.248105

print(c(cfit[2] - 3*cfit[4], cfit[2] + 3*cfit[4]))

##  ass_mean  ass_mean 
##  6.586375 12.469820

print(c(ofit[2] - 3*ofit[4], ofit[2] + 3*ofit[4]))

##  ass_mean  ass_mean 
##  6.255658 12.427181

check out differences in fitted gaussian parameters between pools

the ratio of background (lambda) to assigned is

print(noquote("hom,caudy and oliver percent background  vs present tags"))

## [1] hom,caudy and oliver percent background  vs present tags

print(c(round(hfit[5]*100,2) ,round(hfit[6]*100,2) ))

## bkg_lambda ass_lambda 
##      20.36      79.64

print(c(round(cfit[5]*100,2) ,round(cfit[6]*100,2) ))

## bkg_lambda ass_lambda 
##      32.18      67.82

print(c(round(ofit[5]*100,2) ,round(ofit[6]*100,2) ))

## bkg_lambda ass_lambda 
##      24.14      75.86

opar <- par(pch=19,mfrow=c(3,2),mar=c(2.2,0.5,1.4,0.5),cex=1.2)

print(paste(names(mvn.hom)[1]))

## [1] "hom_ypd_15Jan15"

plot(mvn.hom[[1]],density = T)

print(paste(names(mvn.cau)[1]))

## [1] "caudy_ypd_15Jan15"

plot(mvn.cau[[1]],density = T)

print(paste(names(mvn.oliv)[1]))

## [1] "oliver_ypd_15Jan15"

plot(mvn.oliv[[1]],density = T)

direct comparison of intensity values scaled histograms

library(scales)
diff = ggplot(msc2) +
geom_histogram(aes(x=value,y = ..ncount..,fill=pool), binwidth=0.25,position="dodge")


diff = diff + scale_y_continuous(labels = percent_format())
diff =diff + theme(panel.grid  = element_blank(),
panel.border = element_rect(colour = "black", fill=NA, size=0.5))

diff2=diff + theme(panel.grid  = element_blank(),
panel.background = element_blank(),
panel.border = element_rect(colour = "black", fill=NA, size=0.5))
diff2 + facet_grid(sig~.)+ theme(legend.position=c(0.7, 0.7)+ ggtitle("scaled distrubutions in two pools"))

plot the assigned tag distribution with the posterior probability of it being a “good” tag. (only ctrl exps shown) its tough to see where the cutoff shoud be here for caudy especially

mat = match(msc2$exp,pypd$name)

msc2$pcr = pypd$pcr[mat]
w7 = which(pypd$pcr %in% c("2015-01-15"))


whs = which(phs$type == "ctrl")
wcs = which(pcs$type == "ctrl")
wos = which(pos$type == "ctrl")
cs = sample(pcs$name[wcs],3)
hs = sample(phs$name[whs],3)
os = sample(pos$name[wos],3)
exps = c(cs,hs,os)

test = msc2 %>% filter(exp %in% exps)  %>% droplevels()


ggplot(test) +
  facet_wrap(pool ~ exp,ncol=3) +
  geom_density(aes(x=value,col=pool))   +  
  
  geom_point(aes(x=value, y=post,col=pool))

cs = sample(pcs$name[wcs],3)
hs = sample(phs$name[whs],3)
os = sample(pos$name[wos],3)

mctrl = msc2 %>% filter(type=="ctrl",exp %in% c(cs,hs,os)) %>% droplevels()
caudy = msc2 %>% filter(pool== "caudy",type=="ctrl",exp %in% cs) %>% droplevels()
oliv = msc2 %>% filter(pool== "oliver",type=="ctrl",exp %in% os) %>% droplevels()
hom = msc2 %>% filter(pool== "hom",type=="ctrl",exp %in% hs) %>% droplevels()



o = ggplot(oliv) +
  geom_histogram(aes(x=value,y = ..ncount..,fill=exp), binwidth=0.1,position="dodge") +

  geom_point(aes(x=value, y=post))+ theme_bw() + facet_grid(exp~.)  + ggtitle("oliver ctrls")+
  theme(legend.position="none")
  o

c = ggplot(caudy) +
  geom_histogram(aes(x=value,y = ..ncount..,fill=exp), binwidth=0.1,position="dodge") +

  geom_point(aes(x=value, y=post))+ theme_bw() + facet_grid(exp~.)  + ggtitle("caudy ctrls")+
  theme(legend.position="none")
  c

h = ggplot(hom) +
  geom_histogram(aes(x=value,y = ..ncount..,fill=exp), binwidth=0.1,position="dodge") +

  geom_point(aes(x=value, y=post))+ theme_bw() +  facet_grid(exp~.) + ggtitle("hom ctrls") +
  theme(legend.position="none")
  h

  ggplot(mctrl) +
  geom_histogram(aes(x=value,y = ..ncount..,fill=pool), binwidth=0.1,position="dodge") +

  geom_point(aes(x=value, y=post))+ theme_bw() +  facet_wrap(pool~exp,nrow=3) + ggtitle("sample of ctrls") +
  theme(legend.position="none")

from here split out the pools and draw the Venn for tags w/ posterior > 0.5 in ~ 50% of experiments

because they ypd experiments are so noisy and we have more controls, only use controls

i hate that i have to make an arbritrary cutoff after all this work! couldn’t we use pnorm to get the probabilty based on the estimated parameters of the gaussian?

its really different for YPD compared to SC

increased my arbitraryness by requiring ALL controls > 0.5

mctrl = msc2 %>% filter(type=="ctrl") %>% droplevels()
actrl= acast(mctrl, tag~pool+exp, value.var="sig",sum)

actrl = as.data.frame(actrl,stringsAsFactors = F)

gc = grep('caudy',colnames(actrl))

csum = apply(actrl[,gc],1,sum)

gh = grep('hom',colnames(actrl))

hsum = apply(actrl[,gh],1,sum)

go = grep('oliv',colnames(actrl))

osum = apply(actrl[,go],1,sum)

otab = table(osum)
htab = table(hsum)
ctab = table(csum)

opar <- par(pch=19,mfrow=c(3,2),mar=c(2.2,0.5,1.4,0.5),cex=1)

barplot(htab,col=2, main='count of exps that pass threshold hom')
barplot(cumsum(prop.table(htab)),col=2, main='hom cumulative sum')

barplot(otab,col=1, main='oliver')
barplot(cumsum(prop.table(otab)),col=1, main='oliver cumulative sum')

barplot(ctab,col=3, main='caudy')
barplot(cumsum(prop.table(ctab)),col=3, main='caudy cumulative sum')

wc = which(csum == length(wcs))
wo = which(osum == length(wos))
wh = which(hsum == length(whs))

# wc = which(csum > nrow(pcs[wcs,])/2)
# wo = which(osum > nrow(pos[wos,])/2)
# wh = which(hsum > nrow(phs[whs,])/2)
# 


#wc = which(actrl[,1]> 0.5)
#wh = which(actrl[,2]> 0.5)
#wo = which(actrl[,3]> 0.5)

ctag = rownames(actrl)[wc]
htag = rownames(actrl)[wh]
otag = rownames(actrl)[wo]

cmix=xcs[ctag,pcs$name]
hmix=xhs[htag,phs$name]
omix=xos[otag,pos$name]


hom_bgthresh = hfit[1]
caudy_bgthresh= cfit[1]
oliv_bgthresh = ofit[1]

bhix = myBeststrain(hmix,bgThresh = hom_bgthresh)

## [1] min value of ctrl medians =  6.2917072054257             
## [1] 4391   26

bcix = myBeststrain(cmix,bgThresh = caudy_bgthresh)

## [1] min value of ctrl medians =  8.00201061527352            
## [1] 3884   32

boix = myBeststrain(omix,bgThresh = oliv_bgthresh)

## [1] min value of ctrl medians =  7.45872049969686            
## [1] 4374   29

from barplots its pretty clear that the significance call is all or none, though different for each pool there are lots of ways to call presence or absence given we have likelihood? odds? across many experiments,

need to check consistency between replicates of absence/presence call

compute batch corrected tag presence using our standard approach for later comparison 1. median normalize by experiment 2. throw out tags that are not about the background threshold 3. pick best tag by the choosing the one with lowest sd/mean (if robust = F) for ctrl chips

check the STRAINS rejected by EM vs those rejected by standard preprocessing

hsd = setdiff(rownames(hbat),rownames(bxxhs))
osd = setdiff(rownames(obat),rownames(bxxos))
csd = setdiff(rownames(cbat),rownames(bxxcs))




hsd1 = setdiff(rownames(bxxhs),rownames(hbat))
osd1 = setdiff(rownames(bxxos),rownames(obat))
csd1 = setdiff(rownames(bxxcs),rownames(cbat))



print(' strains rejected by EM, caudy hom oliver')

## [1] " strains rejected by EM, caudy hom oliver"

print(length(csd))

## [1] 1013

print(length(hsd))

## [1] 519

print(length(osd))

## [1] 729

print('# strains rejected by std')

## [1] "# strains rejected by std"

print(length(csd1))

## [1] 311

print(length(hsd1))

## [1] 232

print(length(osd1))

## [1] 268

r compare distributions of rejected strains by mixEM and std

orange = reject by standard preprocessing, blue = rejected by mixEM

opar <- par(pch=19,mfrow=c(1,3),mar=c(2.2,2,1.4,1),cex=1.2)

plot(density(xhs[hsd1,whs[1]]),col=1,lwd=2,xlim=c(4,12),ylim=c(0,0.6),main='hom')
lines(density(xhs[hsd1,whs[2]]),col=1,lwd=2)
lines(density(xhs[hsd1,whs[3]]),col=1,lwd=2)
lines(density(xhs[hsd,whs[3]]),col=2,lwd=2)
lines(density(xhs[hsd,whs[2]]),col=2,lwd=2)
lines(density(xhs[hsd,whs[1]]),col=2,lwd=2)



legend("topright", inset=.05, title="method",
    c("rejected std","rejected EM"), fill=1:2, cex=0.7)

plot(density(xos[osd1,wos[1]]),col=1,lwd=2,xlim=c(4,12),ylim=c(0,0.6),main='oliver')
lines(density(xos[osd1,wos[2]]),col=1,lwd=2)
lines(density(xos[osd1,wos[3]]),col=1,lwd=2)
lines(density(xos[osd,wos[3]]),col=2,lwd=2)
lines(density(xos[osd,wos[2]]),col=2,lwd=2)
lines(density(xos[osd,wos[1]]),col=2,lwd=2)

plot(density(xcs[csd1,wcs[1]]),col=1,lwd=2,xlim=c(4,12),ylim=c(0,0.6),main ='caudy')
lines(density(xcs[csd1,wcs[2]]),col=1,lwd=2)
lines(density(xcs[csd1,wcs[3]]),col=1,lwd=2)
lines(density(xcs[csd,wcs[2]]),col=2)
lines(density(xcs[csd,wcs[3]]),col=2,lwd=2)
lines(density(xcs[csd,wcs[1]]),col=2,lwd=2)

i hate venns but here goes: make a list of strains in each pool tags absent in all strains were removed

pie charts! think its easier to visualize

compare this to our standard call of gene presence:

top venn is genes by mixEM, bottom is by standard

cboth= rownames(bxxcs)
hboth= rownames(bxxhs)
oboth= rownames(bxxos)

df = data.frame(gene = unique(noness$sgd_gene),stringsAsFactors = F)

df$mixcau = rep(NA,nrow(df))
df$mixhom = rep(NA,nrow(df))
df$mixoliv = rep(NA,nrow(df))
df$standcau = rep(NA,nrow(df))
df$standhom = rep(NA,nrow(df))
df$standoliv = rep(NA,nrow(df))

w = which(df$gene %in% cboth)
df$mixcau[w]= 1
df$mixcau[-w]= 0

w = which(df$gene %in% hboth)
df$mixhom[w]= 1
df$mixhom[-w]= 0

w =which(df$gene %in% oboth)
df$mixoliv[w]= 1
df$mixoliv[-w]= 0

w = which(df$gene %in% rownames(cbat))
df$standcau [w]= 1
df$standcau [-w]= 0

w = which(df$gene %in% rownames(hbat))
df$standhom[w]= 1
df$standhom [-w]= 0

w = which(df$gene %in% rownames(obat))
df$standoliv[w]= 1
df$standoliv [-w]= 0

df$ssum = df$standcau+df$standoliv+df$standhom

df$xsum = df$mixcau+df$mixoliv+df$mixhom

ws = which(df$ssum == 0)

xdf = df[,c(1:4,9)]
sdf = df[,c(1,5:8)]

ws = which(sdf$ssum == 0)

print(length(ws))

## [1] 107

sdf = sdf[-ws,]
wx = which(xdf$xsum == 0)
print(length(wx))

## [1] 290

xdf = xdf[-wx,]

mix = melt(xdf[,1:4],variable.name = "pool",value.name="call")

## Using gene as id variables

stand = melt(sdf[,1:4],variable.name = "pool",value.name="call")

## Using gene as id variables

piemix = pie_chart1(mix[,2:3], main = "call",  condition = "pool")
piestand = pie_chart1(stand[,2:3], main = "call",  condition = "pool")

name = c("caudy","hom","oliver")

piecharts each pool vs universe of all strains present in at least one pool

opar <- par(pch=19,mfrow=c(2,1),mar=c(2.2,0.5,1.4,0.5),cex=1.2)

multiplot(piemix, piestand, layout=matrix(c(1,2), nrow=2, byrow=TRUE))

venn strains present by mix EM

t1 = ggvenn(cboth,hboth,oboth,category = name)

present by standard

t2 = ggvenn(rownames(cbat),rownames(hbat),rownames(obat))

venn strains missing by mix EM

univ = unique(c(cboth,hboth,oboth))

xcbg = setdiff(univ,cboth)
xhbg = setdiff(univ,hboth)
xobg = setdiff(univ,oboth)

t1 = ggvenn(xcbg,xhbg,xobg,category = name)

venn strains missing by standard

suniv = unique(c(rownames(obat),rownames(cbat),rownames(hbat)))

cbg = setdiff(suniv,rownames(cbat))
hbg = setdiff(suniv,rownames(hbat))
obg = setdiff(suniv,rownames(obat))
t1 = ggvenn(cbg,hbg,obg,category = name)

proportion venn genes missing by EM vs STD

opar <- par(pch=19,mfrow=c(1,2),mar=c(2.2,0.5,1.4,0.5),cex=1.2)

library(venneuler)

## Loading required package: rJava

mix0 = filter(mix,mix$call == 0)
v0 <- venneuler(mix0[,c(1,2)])
v0$colors=c(0.1,0.5,0.75)


cau = paste('mixcaud',length(xcbg))
hom = paste('mixhom',length(xhbg))
oliv = paste('mixoliv',length(xobg))
v0$labels = c(cau,hom,oliv)
plot(v0)

stand0 = filter(stand,stand$call == 0)

cau = paste('stdcaud',length(cbg))
hom = paste('stdhom',length(hbg))
oliv = paste('stdoliv',length(obg))
v0 <- venneuler(stand0[,c(1,2)])
v0$colors=c(0.1,0.5,0.75)
v0$labels = c(cau,hom,oliv)
plot(v0)

check fitness profiles by preprocessing method mixEM seems to work well in getting rid of flaky tags

whs = which(phs$type == "ctrl")
wcs = which(pcs$type == "ctrl")
wos = which(pos$type == "ctrl")

cbat = cbat[,pcs$name]
bxxcs = bxxcs[,pcs$name]

hbat = hbat[,phs$name]
bxxhs = bxxhs[,phs$name]

obat =obat[,pos$name]
bxxos = bxxos[,pos$name]


lobat = getLR(obat,wos)
lbxxos = getLR(bxxos,wos)  

lcbat = getLR(cbat,wcs)
lbxxcs = getLR(bxxcs,wcs) 


lhbat = getLR(hbat,whs)
lbxxhs = getLR(bxxhs,whs) 

whs = which(phs$type == "ctrl")
wcs = which(pcs$type == "ctrl")
wos = which(pos$type == "ctrl")

lphs = phs[-whs,]

lpos = pos[-wos,]

lpcs = pcs[-wcs,]

lphs = lphs %>% arrange(cond,pcr)
lpcs = lpcs %>% arrange(cond,pcr)
lpos = lpos %>% arrange(cond,pcr)

lcbat = lcbat[,lpcs$name]
lbxxcs = lbxxcs[,lpcs$name]

lhbat = lhbat[,lphs$name]
lbxxhs = lbxxhs[,lphs$name]

lobat = lobat[,lpos$name]
lbxxos = lbxxos[,lpos$name]




opar <- par(pch=19,mfrow=c(1,2),mar=c(2.2,2,1.4,1),cex=1.2)

hsub = jul15 = grep("Jul15",colnames(lhbat))

csub = grep("Jul15",colnames(lcbat))
osub = jul15 = grep("Jul15",colnames(lobat))

colnames(lbxxos) = paste("EM.",colnames(lbxxos))
colnames(lbxxcs) = paste("EM.",colnames(lbxxcs))
colnames(lbxxhs) = paste("EM.",colnames(lbxxhs))

threshold wasn’t harsh enough

compare fitness profiles EM to standard

opar <- par(pch=19,mfrow=c(1,2),mar=c(2.2,2,1.4,1),cex=1.2)


for (i in hsub) psig(lhbat,lbxxhs,i)

for (i in osub) psig(lobat,lbxxos,i)

for (i in csub) psig(lcbat,lbxxcs,i)

compare on same scales

harg = grep('arg',colnames(lhbat))
carg = grep('arg',colnames(lcbat))
oarg = grep('arg',colnames(lobat))

opar <- par(pch=19,mfrow=c(1,2),mar=c(2.2,2,1.4,1),cex=1.2)


for (i in harg) pscale1(lhbat,i,lbxxhs,i)

for (i in carg) pscale1(lcbat,i,lbxxcs,i)

for (i in oarg) pscale1(lobat,i,lbxxos,i)

harg = grep('lys',colnames(lhbat))
carg = grep('lys',colnames(lcbat))
oarg = grep('lys',colnames(lobat))

opar <- par(pch=19,mfrow=c(1,2),mar=c(2.2,2,1.4,1),cex=1.2)


for (i in harg) pscale1(lhbat,i,lbxxhs,i)

for (i in carg) pscale1(lcbat,i,lbxxcs,i)

for (i in oarg) pscale1(lobat,i,lbxxos,i)

compare differences by GENES rejected by EM hom

hsd = setdiff(rownames(hbat),rownames(bxxhs))
hsd = sort(unique(hsd))

print('hom reject EM')

## [1] "hom reject EM"

print(hsd)

##   [1] "ADE5,7"    "ADE8"      "ADK1"      "AIM26"     "AKR1"     
##   [6] "ALD5"      "AMA1"      "APL6"      "APQ12"     "APQ13"    
##  [11] "ARG8"      "ARG82"     "ARO1"      "ARO7"      "ARV1"     
##  [16] "ATP12"     "ATP15"     "AVL9"      "BCS1"      "BDS1"     
##  [21] "BEM1"      "BEM2"      "BOR1"      "BUB3"      "BUD19"    
##  [26] "BUD23"     "BUD32"     "BUR2"      "CAX4"      "CBS2"     
##  [31] "CCR4"      "CCS1"      "CLA4"      "CLN3"      "CNM67"    
##  [36] "COA1"      "COA2"      "COA3"      "COG1"      "COX10"    
##  [41] "COX14"     "COX20"     "CPR7"      "CPS1"      "CTF4"     
##  [46] "CTF8"      "CTR1"      "CYB2"      "CYS4"      "DEG1"     
##  [51] "DGR1"      "DIA2"      "DIA4"      "DID4"      "EAF1"     
##  [56] "ECM31"     "EIS1"      "ELP3"      "EOS1"      "ERG6"     
##  [61] "ERI1"      "EXO5"      "FLX1"      "FMP52"     "FUI1"     
##  [66] "FUN12"     "FYV10"     "FYV4"      "FYV6"      "GCR2"     
##  [71] "GLE2"      "GND1"      "GRR1"      "HAP2"      "HBN1"     
##  [76] "HFI1"      "HHF2"      "HIR1"      "HMO1"      "HNT3"     
##  [81] "HSL1"      "HSL7"      "HTZ1"      "IDH1"      "IDH2"     
##  [86] "IES2"      "IMD4"      "IMG2"      "INM2"      "IRC15"    
##  [91] "IWR1"      "KAP122"    "KAR3"      "KCS1"      "KRE28"    
##  [96] "KRE6"      "KTR5"      "LSO1"      "LTV1"      "MAP1"     
## [101] "MDS3"      "MMS22"     "MNN10"     "MON2"      "MOT2"     
## [106] "MPT5"      "MRP10"     "MRPS5"     "MRT4"      "MSS1"     
## [111] "MSS18"     "NAT3"      "NDI1"      "NGG1"      "NOT5"     
## [116] "NSR1"      "NUP84"     "OPI9"      "PAF1"      "PAU5"     
## [121] "PEP3"      "PET112"    "PFK2"      "PHO2"      "PHO85"    
## [126] "PIF1"      "POG1"      "PPZ1"      "PRO1"      "PRS1"     
## [131] "PRS2"      "PSR1"      "RAD50"     "REF2"      "RLM1"     
## [136] "RNR4"      "RPA12"     "RPB4"      "RPL14B"    "RPL1B"    
## [141] "RPL20A"    "RPL21A"    "RPL27A"    "RPL31A"    "RPL39"    
## [146] "RPS10A"    "RPS16B"    "RPS21B"    "RPS23B"    "RPS25B"   
## [151] "RPS7A"     "RPS9B"     "RRF1"      "RRG8"      "RSM24"    
## [156] "RSM26"     "RTF1"      "RTT109"    "SAM3"      "SAN1"     
## [161] "SFP1"      "SGO1"      "SIN4"      "SIT4"      "SKO1"     
## [166] "SLM6"      "SLO1"      "SLS1"      "SNF12"     "SNF2"     
## [171] "SNF4"      "SNG1"      "SNT309"    "SOD2"      "SOM1"     
## [176] "SPC1"      "SRC1"      "SSE1"      "SSH1"      "SUS1"     
## [181] "SUV3"      "SYM1"      "TAF14"     "THO2"      "THP1"     
## [186] "TIM8"      "TKL2"      "TRK1"      "TRP4"      "UGO1"     
## [191] "UIP4"      "UTP30"     "VAC8"      "VAM6"      "VBA5"     
## [196] "VMA1"      "VMA11"     "VMA16"     "VMA21"     "VMA22"    
## [201] "VMA3"      "VMA4"      "VMA6"      "VMA7"      "VPH2"     
## [206] "VPS1"      "VPS15"     "VPS33"     "VPS53"     "VPS65"    
## [211] "YAF9"      "YAL016C-B" "YAL044W-A" "YAR035C-A" "YBL065W"  
## [216] "YBL100W-C" "YBR085C-A" "YBR096W"   "YDL159W-A" "YDL183C"  
## [221] "YDL206W"   "YDR034W-B" "YDR442W"   "YDR524C-B" "YEL045C"  
## [226] "YER076C"   "YGL088W"   "YGL101W"   "YGL218W"   "YGR016W"  
## [231] "YGR121W-A" "YGR160W"   "YGR161W-C" "YGR204C-A" "YGR219W"  
## [236] "YHR022C-A" "YIL134C-A" "YJL136W-A" "YKL018C-A" "YKL106C-A"
## [241] "YKL118W"   "YLL006W-A" "YLR361C-A" "YML007C-A" "YML009W-B"
## [246] "YML079W"   "YMR013W-A" "YMR182W-A" "YNL097C-B" "YNL184C"  
## [251] "YNL277W-A" "YNR073C"   "YOL097W-A" "YOL118C"   "YOL164W-A"
## [256] "YOR139C"   "YOR161C-C" "YOR293C-A" "YOR316C-A" "YOR331C"  
## [261] "YOR376W-A" "YPR099C"   "YPR159C-A"

oliver

osd = setdiff(rownames(lobat),rownames(lbxxos))
print('oliv reject EM')

## [1] "oliv reject EM"

osd = sort(osd)
print(osd)

##   [1] "ADA2"      "ADE1"      "ADE3"      "ADE6"      "ADE8"     
##   [6] "ADK1"      "AGP2"      "ALF1"      "AMA1"      "ANP1"     
##  [11] "APL6"      "APQ12"     "ARC18"     "ARF1"      "ARG80"    
##  [16] "ARO2"      "ARP6"      "ARP8"      "ARV1"      "ARX1"     
##  [21] "ASC1"      "ASF1"      "ASK10"     "ATP1"      "ATP12"    
##  [26] "ATP15"     "AVT5"      "BAR1"      "BAS1"      "BDF1"     
##  [31] "BEM3"      "BNA7"      "BOR1"      "BUB1"      "BUD14"    
##  [36] "BUD20"     "BUD23"     "BUD25"     "BUD30"     "BUD32"    
##  [41] "CAX4"      "CBS2"      "CCE1"      "CCS1"      "CKB1"     
##  [46] "CLA4"      "CLD1"      "CNE1"      "CNM67"     "COA6"     
##  [51] "COQ3"      "COQ8"      "CPR7"      "CRP1"      "CSE2"     
##  [56] "CST6"      "CTF8"      "CTK2"      "CTR1"      "CTR9"     
##  [61] "CYB2"      "DBP3"      "DBP7"      "DCI1"      "DEF1"     
##  [66] "DEG1"      "DFG16"     "DHH1"      "DIA2"      "DON1"     
##  [71] "DUN1"      "ECM21"     "ECM31"     "EDE1"      "EFM2"     
##  [76] "EIS1"      "ELM1"      "ELP2"      "ELP3"      "END3"     
##  [81] "EOS1"      "ERG24"     "ERG3"      "ERJ5"      "ERP2"     
##  [86] "EST2"      "EST3"      "ETR1"      "EXO5"      "FEN2"     
##  [91] "FMP52"     "FMT1"      "FRA1"      "FUN12"     "FUN14"    
##  [96] "FUN30"     "FYV10"     "FYV6"      "GAS1"      "GCN1"     
## [101] "GCN4"      "GCN5"      "GCR2"      "GEP3"      "GLE2"     
## [106] "GLO3"      "GRX5"      "GRX8"      "GTR2"      "HAP3"     
## [111] "HCR1"      "HFI1"      "HFM1"      "HHY1"      "HIR1"     
## [116] "HMO1"      "HNT3"      "HOM3"      "HSM3"      "HST1"     
## [121] "HTL1"      "HTZ1"      "HXT17"     "IDH2"      "IES2"     
## [126] "IMD4"      "IMG1"      "INM2"      "IOC4"      "IRC18"    
## [131] "IST3"      "ISU1"      "IZH1"      "KAP120"    "KAP122"   
## [136] "KCS1"      "KGD2"      "KIN4"      "KIN82"     "KTI11"    
## [141] "KTR1"      "LAC1"      "LGE1"      "LOA1"      "LOC1"     
## [146] "LRG1"      "LRP1"      "LSM1"      "LSM6"      "LTV1"     
## [151] "LYS20"     "MAF1"      "MAM33"     "MDE1"      "MDJ1"     
## [156] "MDM10"     "MED1"      "MET10"     "MET22"     "MFA1"     
## [161] "MFM1"      "MGR2"      "MGR3"      "MIP6"      "MIX17"    
## [166] "MMS22"     "MNN10"     "MNN2"      "MNN9"      "MPO1"     
## [171] "MRPL1"     "MRPL20"    "MRPL23"    "MRPL3"     "MRPL4"    
## [176] "MRPL49"    "MRPS5"     "MRPS9"     "MRT4"      "MSA1"     
## [181] "MSB4"      "MSD1"      "MSR1"      "MSS1"      "MSY1"     
## [186] "MVB12"     "NCR1"      "NDI1"      "NEW1"      "NGG1"     
## [191] "NHX1"      "NKP2"      "NPL3"      "NPL6"      "NSR1"     
## [196] "NUP60"     "NUP84"     "OM14"      "OSW7"      "OTU2"     
## [201] "PAP2"      "PAU5"      "PDH1"      "PEP12"     "PEX12"    
## [206] "PEX13"     "PHO89"     "PMR1"      "POP2"      "PPT2"     
## [211] "PPZ1"      "PRM2"      "PRM9"      "PRS3"      "PRY2"     
## [216] "PSR1"      "PUB1"      "PUF4"      "PXA2"      "RAD27"    
## [221] "RAD52"     "RAD6"      "RAM1"      "RCR1"      "RCR2"     
## [226] "REF2"      "REI1"      "REV7"      "RFU1"      "RGC1"     
## [231] "RHB1"      "RHO4"      "RHO5"      "RIB1"      "RIM1"     
## [236] "RIM4"      "RNR4"      "ROM1"      "RPB4"      "RPB9"     
## [241] "RPL11B"    "RPL12B"    "RPL21A"    "RPL22A"    "RPL27A"   
## [246] "RPL27B"    "RPL2B"     "RPL36B"    "RPL37A"    "RPL40A"   
## [251] "RPL42B"    "RPL43A"    "RPL6A"     "RPL7B"     "RPP2B"    
## [256] "RPS10A"    "RPS11B"    "RPS17A"    "RPS18A"    "RPS18B"   
## [261] "RPS19B"    "RPS21A"    "RPS23B"    "RPS24A"    "RPS25B"   
## [266] "RPS29B"    "RPS7A"     "RPS9B"     "RRG1"      "RRG8"     
## [271] "RRP8"      "RSA1"      "RSM23"     "RSM26"     "RSM7"     
## [276] "RTF1"      "RTG3"      "RTT109"    "SAF1"      "SAP1"     
## [281] "SAP4"      "SDH1"      "SEO1"      "SHM2"      "SIC1"     
## [286] "SIN4"      "SIR4"      "SIT4"      "SKG1"      "SLS1"     
## [291] "SMF2"      "SMK1"      "SNF2"      "SNF4"      "SNF6"     
## [296] "SNG1"      "SNT309"    "SOD2"      "SPC72"     "SPF1"     
## [301] "SPL2"      "SPO23"     "SPT10"     "SPT4"      "SRB5"     
## [306] "SSE1"      "SSH1"      "SSQ1"      "SST2"      "SSZ1"     
## [311] "SUS1"      "SUV3"      "SWC5"      "SWH1"      "SWI5"     
## [316] "SWI6"      "SWR1"      "TAF14"     "TAL1"      "TAT2"     
## [321] "TDA3"      "TGS1"      "THI2"      "TIF1"      "TKL2"     
## [326] "TMA10"     "TOD6"      "TOF2"      "TOP1"      "TPN1"     
## [331] "TRM9"      "TRP4"      "TRX3"      "TSA2"      "TSR3"     
## [336] "TVP18"     "UBP3"      "UBP5"      "UFO1"      "UMP1"     
## [341] "URC2"      "URM1"      "UTP30"     "UTR4"      "VBA1"     
## [346] "VBA5"      "VID22"     "VID27"     "VMA1"      "VMA10"    
## [351] "VMA11"     "VMA16"     "VMA2"      "VMA21"     "VMA5"     
## [356] "VMA6"      "VMA7"      "VMA9"      "VPH2"      "VPS15"    
## [361] "VPS28"     "VPS52"     "VPS60"     "VPS71"     "XRS2"     
## [366] "YAF9"      "YAL066W"   "YAP1"      "YBL059W"   "YBL065W"  
## [371] "YBL094C"   "YBR096W"   "YBR196C-A" "YCL001W-B" "YCP4"     
## [376] "YCR025C"   "YCR101C"   "YCR102C"   "YDJ1"      "YDL068W"  
## [381] "YDL129W"   "YDL183C"   "YDL211C"   "YDR034C-A" "YDR433W"  
## [386] "YEL045C"   "YER010C"   "YFL013W-A" "YFR018C"   "YGL108C"  
## [391] "YGL118C"   "YGL188C-A" "YGR064W"   "YGR117C"   "YGR160W"  
## [396] "YHC3"      "YHK8"      "YIL025C"   "YIL029C"   "YJL077W-B"
## [401] "YJR018W"   "YJR154W"   "YKE2"      "YKL118W"   "YKL136W"  
## [406] "YKR051W"   "YLR269C"   "YLR283W"   "YLR358C"   "YML009W-B"
## [411] "YML012C-A" "YML079W"   "YMR031W-A" "YMR242W-A" "YNG2"     
## [416] "YNL140C"   "YNL171C"   "YNR073C"   "YOL079W"   "YOR139C"  
## [421] "YOR199W"   "YOR331C"   "YOR376W"   "YPL260W"   "YPR123C"  
## [426] "YPT32"     "YRR1"      "YTP1"      "ZAP1"      "ZRT2"

caudy

csd = setdiff(rownames(cbat),rownames(bxxcs))
csd = sort(csd)

print('caudy reject EM')

## [1] "caudy reject EM"

print(csd)

##   [1] "AAH1"      "ACO2"      "ADE6"      "ADE8"      "ADK1"     
##   [6] "ADO1"      "AFT1"      "AIF1"      "AIM10"     "AIM44"    
##  [11] "ALF1"      "ALO1"      "ANP1"      "APL6"      "APQ12"    
##  [16] "ARC1"      "ARC18"     "ARF1"      "ARG1"      "ARG2"     
##  [21] "ARG3"      "ARG4"      "ARG5,6"    "ARG80"     "ARL1"     
##  [26] "ARL3"      "ARO7"      "ARP6"      "ARP8"      "ARV1"     
##  [31] "ARX1"      "ASC1"      "ASF1"      "ASK10"     "ASP1"     
##  [36] "ATG11"     "ATG14"     "ATG23"     "ATG4"      "ATP17"    
##  [41] "AVL9"      "AVT1"      "AVT5"      "BAS1"      "BBC1"     
##  [46] "BCK1"      "BEM1"      "BEM2"      "BFR1"      "BNA7"     
##  [51] "BOR1"      "BRE1"      "BUB1"      "BUB3"      "BUD14"    
##  [56] "BUD16"     "BUD19"     "BUD20"     "BUD23"     "BUD25"    
##  [61] "BUD28"     "BUD30"     "BUD32"     "CAT5"      "CBP2"     
##  [66] "CCE1"      "CCM1"      "CCR4"      "CCS1"      "CDA1"     
##  [71] "CDA2"      "CDC10"     "CHD1"      "CIK1"      "CIR2"     
##  [76] "CKB1"      "CKI1"      "CLA4"      "CNE1"      "CNM67"    
##  [81] "COA3"      "COQ9"      "COR1"      "COX10"     "COX11"    
##  [86] "COX18"     "COX7"      "COX9"      "CPA2"      "CPR7"     
##  [91] "CRP1"      "CSM1"      "CST6"      "CTF4"      "CTF8"     
##  [96] "CTK2"      "CTK3"      "CTR1"      "CTR9"      "CYT1"     
## [101] "DAK1"      "DAL82"     "DCI1"      "DDC1"      "DEP1"     
## [106] "DFG16"     "DIA2"      "DIA4"      "DIG1"      "DOA4"     
## [111] "DOC1"      "DON1"      "DRS2"      "DSS1"      "DST1"     
## [116] "DYN3"      "EAF3"      "EAP1"      "ECM27"     "ECM31"    
## [121] "ECM33"     "ECM38"     "EFG1"      "EFT2"      "EIS1"     
## [126] "ELM1"      "ELO2"      "ELP2"      "ELP3"      "ELP4"     
## [131] "EMC10"     "END3"      "ENT3"      "ERF2"      "ERG24"    
## [136] "ERJ5"      "ERP2"      "EST1"      "EST2"      "EST3"     
## [141] "ETR1"      "FAB1"      "FAR11"     "FAR7"      "FEN2"     
## [146] "FMP52"     "FMT1"      "FRA1"      "FUN14"     "FUN30"    
## [151] "FUS3"      "FYV5"      "FYV7"      "FZO1"      "GAL80"    
## [156] "GCG1"      "GCN1"      "GCN5"      "GCR2"      "GCV3"     
## [161] "GEP3"      "GET3"      "GIS4"      "GLE2"      "GLR1"     
## [166] "GPM2"      "GRX8"      "GSF2"      "GTF1"      "GTS1"     
## [171] "GUP1"      "HAC1"      "HBN1"      "HEF3"      "HER2"     
## [176] "HFM1"      "HIR1"      "HIS1"      "HIS4"      "HIS5"     
## [181] "HIT1"      "HMO1"      "HMT1"      "HOC1"      "HOM2"     
## [186] "HOM3"      "HOM6"      "HOP2"      "HSL1"      "HSL7"     
## [191] "HSM3"      "HST1"      "HTZ1"      "HUL4"      "HUR1"     
## [196] "HXT17"     "ICS2"      "IDH2"      "IES1"      "IES2"     
## [201] "IKI1"      "IKI3"      "IMD4"      "IMP1"      "IMP2"     
## [206] "IMP2'"     "INH1"      "INM2"      "INO2"      "INO4"     
## [211] "IOC4"      "IPK1"      "IRC18"     "ISA2"      "IST3"     
## [216] "IWR1"      "JJJ3"      "JNM1"      "KAP120"    "KCS1"     
## [221] "KEX2"      "KGD2"      "KIN4"      "KIN82"     "KIP1"     
## [226] "KRE28"     "KTI11"     "KTR1"      "LAC1"      "LAS21"    
## [231] "LAT1"      "LCL2"      "LIP2"      "LMO1"      "LNP1"     
## [236] "LOC1"      "LRG1"      "LSC2"      "LSM1"      "LSM6"     
## [241] "LSO1"      "LTV1"      "LYP1"      "LYS12"     "LYS14"    
## [246] "LYS4"      "LYS5"      "LYS9"      "MAC1"      "MAF1"     
## [251] "MCT1"      "MDL2"      "MDM34"     "MEF1"      "MEF2"     
## [256] "MET18"     "MET22"     "MET3"      "MET6"      "MFA1"     
## [261] "MFM1"      "MGM101"    "MGR2"      "MGR3"      "MGT1"     
## [266] "MHF2"      "MID1"      "MIP1"      "MIP6"      "MIX14"    
## [271] "MIX17"     "MMM1"      "MMS1"      "MMS22"     "MMS4"     
## [276] "MNN2"      "MPC54"     "MPO1"      "MRH4"      "MRP1"     
## [281] "MRPL1"     "MRPL11"    "MRPL15"    "MRPL16"    "MRPL17"   
## [286] "MRPL35"    "MRPL37"    "MRPL40"    "MRPL51"    "MRPL7"    
## [291] "MRPL8"     "MRPL9"     "MRPS16"    "MRPS9"     "MRX14"    
## [296] "MSB4"      "MSE1"      "MSH1"      "MSL1"      "MSM1"     
## [301] "MSS1"      "MSS116"    "MSS51"     "MST1"      "MSW1"     
## [306] "MSY1"      "MTG2"      "MUM2"      "MVB12"     "NCS6"     
## [311] "NDI1"      "NGG1"      "NMA1"      "NPL6"      "NPT1"     
## [316] "NSR1"      "NUP170"    "NUP60"     "NUP84"     "OM14"     
## [321] "OPI11"     "OPI8"      "OSW7"      "OTU2"      "PAN5"     
## [326] "PAU5"      "PBP4"      "PCP1"      "PDE2"      "PDH1"     
## [331] "PDR3"      "PET100"    "PET123"    "PET130"    "PET309"   
## [336] "PET54"     "PEX12"     "PEX13"     "PEX21"     "PFF1"     
## [341] "PHO89"     "PIN2"      "PLM2"      "PMP3"      "PMR1"     
## [346] "PMS1"      "PMT1"      "PMT2"      "POP2"      "POR1"     
## [351] "PPA2"      "PPN1"      "PPT2"      "PPZ1"      "PRM9"     
## [356] "PRP18"     "PRS3"      "PRY2"      "PSR1"      "PTK2"     
## [361] "PXA2"      "QCR2"      "QCR8"      "QDR2"      "RAD27"    
## [366] "RAD54"     "RAD57"     "RAD6"      "RAI1"      "RBK1"     
## [371] "RCO1"      "RCR1"      "RCR2"      "REI1"      "REV7"     
## [376] "REX2"      "RGC1"      "RHB1"      "RHO5"      "RIM4"     
## [381] "RMD6"      "RNR3"      "ROM1"      "RPB4"      "RPB9"     
## [386] "RPL11B"    "RPL13A"    "RPL14A"    "RPL16A"    "RPL19A"   
## [391] "RPL21A"    "RPL22A"    "RPL23A"    "RPL27A"    "RPL2B"    
## [396] "RPL31A"    "RPL34B"    "RPL36B"    "RPL39"     "RPL42B"   
## [401] "RPL43A"    "RPL7B"     "RPO41"     "RPP2B"     "RPS0B"    
## [406] "RPS10A"    "RPS11A"    "RPS11B"    "RPS17A"    "RPS18A"   
## [411] "RPS18B"    "RPS19B"    "RPS21A"    "RPS21B"    "RPS23A"   
## [416] "RPS23B"    "RPS24B"    "RPS25B"    "RPS27B"    "RPS28B"   
## [421] "RPS29B"    "RPS7A"     "RPS8A"     "RPS9B"     "RRG9"     
## [426] "RRT8"      "RSA1"      "RSC2"      "RSM19"     "RSM22"    
## [431] "RSM24"     "RSM25"     "RSM26"     "RSM27"     "RTF1"     
## [436] "RTG3"      "RTS1"      "RTT101"    "RTT109"    "SAC1"     
## [441] "SAF1"      "SAM37"     "SAP1"      "SAP4"      "SBA1"     
## [446] "SCS22"     "SDH1"      "SDS23"     "SDS3"      "SEA4"     
## [451] "SEO1"      "SER2"      "SHS1"      "SIC1"      "SIN3"     
## [456] "SIN4"      "SIP3"      "SIR3"      "SLM6"      "SLT2"     
## [461] "SLX9"      "SMF2"      "SMK1"      "SNA3"      "SNF1"     
## [466] "SNF2"      "SNF3"      "SNF4"      "SNF5"      "SNF7"     
## [471] "SNG1"      "SOD2"      "SOH1"      "SPF1"      "SPL2"     
## [476] "SPO23"     "SPO77"     "SPT2"      "SPT21"     "SPT3"     
## [481] "SPT4"      "SRB2"      "SRN2"      "SRO7"      "SSA2"     
## [486] "SSD1"      "SSE1"      "SSH1"      "SSN8"      "SST2"     
## [491] "STE11"     "STE24"     "STE5"      "STE50"     "STE7"     
## [496] "SUS1"      "SWC5"      "SWD1"      "SWH1"      "SWI5"     
## [501] "SYG1"      "TAT2"      "TCO89"     "TDA3"      "TEF4"     
## [506] "TFB5"      "TGS1"      "THI2"      "THR1"      "THR4"     
## [511] "TIF1"      "TKL2"      "TMA10"     "TOD6"      "TOF2"     
## [516] "TPN1"      "TRM12"     "TRM7"      "TRM9"      "TRP2"     
## [521] "TRP3_dub"  "TRX3"      "TSA2"      "TSR3"      "UBC12"    
## [526] "UBC4"      "UBP5"      "UME6"      "URA1"      "URA7"     
## [531] "URC2"      "URM1"      "UTP30"     "UTR1"      "VBA1"     
## [536] "VBA2"      "VBA5"      "VID22"     "VID27"     "VMA11"    
## [541] "VMA21"     "VPS1"      "VPS41"     "VPS52"     "VPS53"    
## [546] "VPS61"     "VPS69"     "VPS70"     "VPS71"     "VPS73"    
## [551] "XRS2"      "YAF9"      "YAK1"      "YAL042C-A" "YAL066W"  
## [556] "YAP1"      "YBL059W"   "YBL065W"   "YBL071C-B" "YBR096W"  
## [561] "YBR178W"   "YBR225W"   "YCK1"      "YCL001W-B" "YCL002C"  
## [566] "YCL007C"   "YCL021W-A" "YCR101C"   "YCR102C"   "YDJ1"     
## [571] "YDL041W"   "YDL118W"   "YDL129W"   "YDL183C"   "YDL211C"  
## [576] "YDL241W"   "YDR048C"   "YDR149C"   "YDR417C"   "YDR442W"  
## [581] "YDR455C"   "YDR514C"   "YEF1"      "YER010C"   "YER046W-A"
## [586] "YER156C"   "YFL013W-A" "YFR045W"   "YGK3"      "YGL042C"  
## [591] "YGL072C"   "YGL088W"   "YGL108C"   "YGL118C"   "YGL177W"  
## [596] "YGL188C-A" "YGL218W"   "YGR064W"   "YGR117C"   "YGR160W"  
## [601] "YHC3"      "YHK8"      "YHR050W-A" "YIA6"      "YIL025C"  
## [606] "YIL029C"   "YJL043W"   "YJL055W"   "YJL077W-B" "YJR154W"  
## [611] "YKE2"      "YKL033W-A" "YKL136W"   "YKL169C"   "YKR051W"  
## [616] "YLR184W"   "YLR264C-A" "YLR269C"   "YLR283W"   "YLR297W"  
## [621] "YLR311C"   "YLR358C"   "YLR434C"   "YML009W-B" "YML012C-A"
## [626] "YML079W"   "YML094C-A" "YMR001C-A" "YMR114C"   "YMR194C-A"
## [631] "YMR242W-A" "YNG2"      "YNL120C"   "YNL140C"   "YNL146W"  
## [636] "YNL198C"   "YNL226W"   "YNR068C"   "YOL038C-A" "YOL079W"  
## [641] "YOL118C"   "YOL166W-A" "YOR139C"   "YOR199W"   "YOR283W"  
## [646] "YOR376W"   "YPL062W"   "YPL113C"   "YPL260W"   "YPR092W"  
## [651] "YPR098C"   "YPT32"     "YRR1"      "YTP1"      "ZAP1"     
## [656] "ZRT2"      "ZUO1"

compare differences by GENES rejected by STD hom

hsd1 = setdiff(rownames(bxxhs),rownames(hbat))

if(length(hsd1)==0) print('no hom genes rejected by standard')

## [1] "no hom genes rejected by standard"

if(length(hsd1) > 0) print(hsd1)  

osd1 = setdiff(rownames(bxxos),rownames(obat))

if(length(osd1)==0) print('no oliver genes rejected by standard')
if(length(osd1) > 0) print(osd1)

## [1] "PIM1"

csd1 = setdiff(rownames(bxxcs),rownames(cbat))


if(length(csd1)==0) print('no caudy genes rejected by standard')

## [1] "no caudy genes rejected by standard"

if(length(osd1) > 0) print(csd1)

## character(0)

summary tables

xtab = table(mix$call,mix$pool)

stab = table(stand$call,stand$pool)


pxtab = round(xtab/nrow(xdf),2)
pstab = round(stab/nrow(sdf),2)

print(xtab)

##    
##     mixcau mixhom mixoliv
##   0    836    333     419
##   1   3873   4376    4290

print(stab)

##    
##     standcau standhom standoliv
##   0      362      253       173
##   1     4530     4639      4719

print(pxtab)

##    
##     mixcau mixhom mixoliv
##   0   0.18   0.07    0.09
##   1   0.82   0.93    0.91

print(pstab)

##    
##     standcau standhom standoliv
##   0     0.07     0.05      0.04
##   1     0.93     0.95      0.96

plot median vs mad of tags rejected ONLY by mixEM and not by our standard bkrnd thresholds:

pmad1 <- function(mat,...) {
  med <- apply(mat,1,median)
  mad <- apply(mat,1,mad)
  q1 <- quantile(mad,0.25)
  q3 <- quantile(mad,0.75)
  IQR <- q3-q1
  I1 <- q1 - 1.5*IQR
  I2 <- q3 + 1.5*IQR
  plot(med,mad,...)
  wlow <- which(mad < I1)
  wup <- which(mad > I2)
  points(med[wup],mad[wup],col="red")
  text(med,mad,names(mad),pos=4,cex = 0.8)
  wup
}


opar <- par(pch=19,mfrow=c(1,1),mar=c(2.2,2,1.4,1),cex=1.2)

hmad = pmad1(hbat[hsd,whs],main = "hom mixEM reject",cex.main=0.7)

omad= pmad1(obat[osd,wos],main = "oliver mixEM reject",cex.main=0.7)

cmad = pmad1(cbat[csd,wcs],main = "caudy mixEM reject",cex.main=0.7)

background tag calls batch corrected mixEM YPD

ggiaever

07/12/2016