hi pat! finally decided on a criteria – posterior > 0.5 in at least half of experiments, works better than just using the controls
here’s our goal
Purpose Develop a procedure for identifying poorly performing tags; specific goal being to distinguish strains NOT in the pool for three YKO derived collection:
auxotrophic YKO (hom)
prototrophic collection made by SGA/back-crossing (caudy)
prototrophic collection made by CEN plasmid complementation of the auxotrophies
this is really a preprocessing step for now.
uber important as this is a major point of this manuscript
erica this sight might me as useful for you as it was for me to understand what mixEM is doing
http://[rstudio-pubs-static.s3.amazonaws.com/1001_3177e85f5e4840be840c84452780db52.html]
pat, my main issue as always is where to draw the cutoff?
initially i enforced the rule that all 3 control experiments had to have a posterior > 0.05 i know why i used 0.05 – thought they were pvalues and wanted to be 95% confident they were background, but its too stringent changed to > 0.5 in ~ 1/2 of experiments for each pool set
i am sure there are more robust ways of calling/refining it. maybe even on a per tag basis rather than a per experiment basis? (eventually). also wasn’t sure if the batch correction was kosher…and not sure how much it mattered
batch correction definitely helps for SC, else the distributions all see to require different values based on plots
we did lose a few precious genes (i hope not too many) and probably need to tweak it, but overall it worked well
primarily we lost ADE genes from oliver – but if you look at these genes they are really flaky and close to background
steps
get SC data for all pools, xsc is the data matrix, psc describes experiments
xsc = read.delim(
"jul22_2016_allpools_xSC.txt",header = T,stringsAsFactors = F,check.names = F)
psc = read.delim(
"jul22_2016_allpools_pSC.txt",
header = T,
stringsAsFactors = F,
check.names = F
)
fdata6 = read.delim("jun10_2016_fdata6.txt",
stringsAsFactors = F,
check.names = F)
noness = filter(fdata6, fdata6$essential == 0)
noness = noness %>% arrange(strain)
ess = filter(fdata6, fdata6$essential == 1)
assign tag type to each, plot ctrl experiments to compare distributions of unassigned vs assigned tags
#
# unass = grep("tag_",rownames(xsc))
#
# rownames(xsc)[1:1800]=gsub(":uptag",":downtag",rownames(xsc)[1:1800])
wess = which(rownames(xsc) %in% ess$strain)
xsc = xsc[-wess,]
xsc = xsc[,psc$name]
opar <- par(pch=19,mfrow=c(1,1),mar=c(2.2,2,1.4,1),cex=1.2)
wnoness = which(rownames(xsc) %in% noness$strain)
xsc$assign[wnoness] = "noness"
xsc$assign[1:3600] = "unass"
xsc$tag = rownames(xsc)
mat = match(xsc$tag, fdata6$strain)
xsc$gene = fdata6$sgd_gene[mat]
wsc = which(psc$type == "ctrl")
sc = psc$name[wsc]
w = which(names(xsc) %in% c(sc,"assign","tag","gene"))
test = xsc[, w]
m <- melt(test, id.vars=c("tag","assign","gene"), variable.name = "exp")
mat = match(m$exp,psc$name)
m$pool = psc$pool[mat]
m$tag = factor(m$tag)
m$pool = factor(m$pool)
m$assign = factor(m$assign)
ggplot(m, aes(value)) + geom_histogram(aes(fill = pool), bins=50) +
facet_grid(assign~pool) + ggtitle("distribution by pool and tag type")
ggplot(m, aes(value))+geom_freqpoly(aes(colour=pool), bins=50)+facet_grid(assign~.)+theme_bw()+ ggtitle("distribution by pool")
ggplot(m, aes(value)) + geom_histogram(aes(fill = assign), bins=50) +
facet_wrap(~exp,ncol=3) + ggtitle("distribution by pool and tag type ctrl chips")+theme_bw()
batch correct and normalize first – all tags together
pcs = filter(psc,pool == "caudy")
phs = filter(psc,pool == "hom")
pos = filter(psc,pool == "oliver")
xcs = xsc[,pcs$name]
xhs = xsc[,phs$name]
xos = xsc[,pos$name]
xcs = medShift.updn(as.matrix(xcs))
xhs = medShift.updn(as.matrix(xhs))
xos = medShift.updn(as.matrix(xos))
bcs = ComBat(xcs,model.matrix(~cond,pcs),batch = pcs$pcr)
## Found 3 batches
## Adjusting for 12 covariate(s) or covariate level(s)
## Standardizing Data across genes
## Fitting L/S model and finding priors
## Finding parametric adjustments
## Adjusting the Data
bhs = ComBat(xhs,model.matrix(~cond,phs),batch = phs$pcr)
## Found 3 batches
## Adjusting for 9 covariate(s) or covariate level(s)
## Standardizing Data across genes
## Fitting L/S model and finding priors
## Finding parametric adjustments
## Adjusting the Data
bos = ComBat(xos,model.matrix(~cond,pos),batch = pos$pcr)
## Found 3 batches
## Adjusting for 12 covariate(s) or covariate level(s)
## Standardizing Data across genes
## Fitting L/S model and finding priors
## Finding parametric adjustments
## Adjusting the Data
plot before and after batch correction to see if it helps align distributiosn
density
dendograms
mydendy(xhs,phs$pcr,main="prebatch hom correct color = pcr date")
## 'dendrogram' with 2 branches and 30 members total, at height 120.2446
mydendy(bhs,phs$pcr, main="post batch hom correct color = pcr date")
## 'dendrogram' with 2 branches and 30 members total, at height 74.33607
mydensity(xhs,phs$pcr,main="pre batch hom correct density, color = pcr date")
mydensity(bhs,phs$pcr, main="post batch hom correct density, color = pcr date")
reformat long-2-wide
print('unass included')
## [1] "unass included"
bsc = cbind(bcs,bhs,bos)
bsc = as.data.frame(bsc,stringsAsFactors = F)
w = which(names(xsc) %in% c("assign","tag","gene"))
bsc = cbind(bsc,xsc[,w])
msc <- melt(bsc, id.vars=c("tag","assign","gene"), variable.name = "exp")
mat = match(msc$exp,psc$name)
msc$pool = psc$pool[mat]
msc$cond = psc$cond[mat]
msc$type = psc$type[mat]
estimate of the unassigned means and sd – pretty equal for all pools
ua.mean <- aggregate(value ~ pool, data=msc, FUN=mean, subset=assign=='unass')
cmean = filter(ua.mean,pool == "caudy")
hmean = filter(ua.mean,pool == "hom")
omean = filter(ua.mean,pool == "oliver")
ua.var <- aggregate(value ~ pool, data=msc, FUN=sd, subset=assign=='unass')
cvar = filter(ua.var,pool == "caudy")
hvar = filter(ua.var,pool == "hom")
ovar = filter(ua.var,pool == "oliver")
print(ua.mean)
## pool value
## 1 caudy 5.405844
## 2 hom 5.217357
## 3 oliver 5.315958
print(ua.var)
## pool value
## 1 caudy 0.6958178
## 2 hom 0.4251037
## 3 oliver 0.7314023
re batch correct without the unassigned tags
bcs = medShift.updn(as.matrix(xcs[noness$strain,pcs$name]))
bhs = medShift.updn(as.matrix(xhs[noness$strain,phs$name]))
bos = medShift.updn(as.matrix(xos[noness$strain,pos$name]))
bcs = ComBat(bcs[noness$strain,pcs$name],model.matrix(~cond,pcs),batch = pcs$pcr)
## Found 3 batches
## Adjusting for 12 covariate(s) or covariate level(s)
## Standardizing Data across genes
## Fitting L/S model and finding priors
## Finding parametric adjustments
## Adjusting the Data
bhs = ComBat(bhs[noness$strain,phs$name],model.matrix(~cond,phs),batch = phs$pcr)
## Found 3 batches
## Adjusting for 9 covariate(s) or covariate level(s)
## Standardizing Data across genes
## Fitting L/S model and finding priors
## Finding parametric adjustments
## Adjusting the Data
bos = ComBat(bos[noness$strain,pos$name],model.matrix(~cond,pos),batch = pos$pcr)
## Found 3 batches
## Adjusting for 12 covariate(s) or covariate level(s)
## Standardizing Data across genes
## Fitting L/S model and finding priors
## Finding parametric adjustments
## Adjusting the Data
print('unass removed')
## [1] "unass removed"
bsc = cbind(bcs,bhs,bos)
bsc = as.data.frame(bsc,stringsAsFactors = F)
bsc$tag = rownames(bsc)
table(bsc$tag == noness$strain)
##
## TRUE
## 10216
bsc$gene = noness$sgd_gene
msc <- melt(bsc, id.vars=c("tag","gene"), variable.name = "exp")
mat = match(msc$exp,psc$name)
msc$pool = psc$pool[mat]
msc$cond = psc$cond[mat]
msc$type = psc$type[mat]
Plot histogram of assigned and unassigned for all experiments. This will generate a tall lattice of plots, so only save to pdf, but do not show in notebook.
It is pretty clear than around June 2 there is a leftward shift in the assigned tags and then a shift back to the right around July 15.
ggnote – i batch corrected this round –
I think we can fit a single gaussian to the unassigned tag distribution. Then fit a mixture of two Gaussians with one of the two fixed at the unassigned tag distribution and the other unconstrained for the assigned tags. Then we can use the posterior distribution over the cluster assignment variables to give an odds-ratio of the tag assignement. Then we can draw a threshold on the odds-ratio.
Fit a 2 component Gaussian mixture model to each experiment’s NONESS assigned tags. Store the posterior probability of the tag assignment to the “good” or “not unassigned” distribution" for each experiment.
noting the difference in number of iterations needed to converge each time this is run – us this something to worry about? takes more iterations without batch correction
caudy = 183 – 282 no batch correction hom = 78 131 oliver = 312 242
batch corrected
print('batch corrected mixEM')
## [1] "batch corrected mixEM"
set.seed(5)
mvn.cau <- apply(bcs[noness$strain,pcs$name], 2, normalmixEM2comp, lambda = 0.5, mu = c(cmean[1,2], 2*cmean[1,2]), sigsqrd = c(cvar[1,2], 5*cvar[1,2]), eps = 1e-08, maxit = 10000, verb = F)
## number of iterations= 152
## number of iterations= 175
## number of iterations= 191
## number of iterations= 175
## number of iterations= 160
## number of iterations= 235
## number of iterations= 202
## number of iterations= 185
## number of iterations= 211
## number of iterations= 169
## number of iterations= 203
## number of iterations= 223
## number of iterations= 281
## number of iterations= 190
## number of iterations= 191
## number of iterations= 199
## number of iterations= 222
## number of iterations= 159
## number of iterations= 187
## number of iterations= 191
## number of iterations= 218
## number of iterations= 192
## number of iterations= 217
## number of iterations= 179
## number of iterations= 182
## number of iterations= 211
## number of iterations= 166
## number of iterations= 180
## number of iterations= 174
## number of iterations= 201
## number of iterations= 168
## number of iterations= 199
## number of iterations= 242
## number of iterations= 183
set.seed(5)
mvn.hom <- apply(bhs[noness$strain,phs$name], 2, normalmixEM2comp, lambda = 0.5, mu = c(hmean[1,2], 2*hmean[1,2]), sigsqrd = c(hvar[1,2], 5*hvar[1,2]), eps = 1e-08, maxit = 10000, verb = F)
## number of iterations= 156
## number of iterations= 148
## number of iterations= 166
## number of iterations= 149
## number of iterations= 145
## number of iterations= 178
## number of iterations= 137
## number of iterations= 178
## number of iterations= 180
## number of iterations= 152
## number of iterations= 156
## number of iterations= 122
## number of iterations= 172
## number of iterations= 180
## number of iterations= 167
## number of iterations= 123
## number of iterations= 188
## number of iterations= 171
## number of iterations= 123
## number of iterations= 147
## number of iterations= 187
## number of iterations= 211
## number of iterations= 258
## number of iterations= 218
## number of iterations= 190
## number of iterations= 170
## number of iterations= 125
## number of iterations= 244
## number of iterations= 274
## number of iterations= 78
set.seed(5)
mvn.oliv <- apply(bos[noness$strain,pos$name], 2, normalmixEM2comp, lambda = 0.5, mu = c(omean[1,2], 2*omean[1,2]), sigsqrd = c(ovar[1,2], 5*ovar[1,2]), eps = 1e-08, maxit = 10000, verb = F)
## number of iterations= 451
## number of iterations= 497
## number of iterations= 240
## number of iterations= 410
## number of iterations= 252
## number of iterations= 284
## number of iterations= 362
## number of iterations= 286
## number of iterations= 280
## number of iterations= 409
## number of iterations= 294
## number of iterations= 269
## number of iterations= 251
## number of iterations= 269
## number of iterations= 339
## number of iterations= 552
## number of iterations= 313
## number of iterations= 329
## number of iterations= 276
## number of iterations= 353
## number of iterations= 271
## number of iterations= 362
## number of iterations= 492
## number of iterations= 364
## number of iterations= 333
## number of iterations= 335
## number of iterations= 351
## number of iterations= 345
## number of iterations= 180
## number of iterations= 248
## number of iterations= 211
## number of iterations= 263
## number of iterations= 238
## number of iterations= 327
## number of iterations= 537
## number of iterations= 252
## number of iterations= 244
## number of iterations= 349
## number of iterations= 312
cpost <- bcs[noness$strain,pcs$name]
for (i in 1:length(names(mvn.cau))) {
cpost[,i] <- mvn.cau[[i]]$posterior[,2]
}
hpost <- bhs[noness$strain,phs$name]
for (i in 1:length(names(mvn.hom))) {
hpost[,i] <- mvn.hom[[i]]$posterior[,2]
}
opost <- bos[noness$strain,pos$name]
for (i in 1:length(names(mvn.oliv))) {
opost[,i] <- mvn.oliv[[i]]$posterior[,2]
}
post = cbind(cpost,hpost,opost)
post = as.data.frame(post,stringsAsFactors = F)
post$tag = rownames(post)
rm(cpost)
rm(hpost)
rm(opost)
reformat dataframe wide to long for plotting
most<- melt(post, id.vars=c("tag"), variable.name = "exp",value.name = "post")
msc2 <- merge(msc, most, by=c("tag","exp"))
# call bkgnd tags by post.p > 0.
msc2$sig = rep(NA,nrow(msc2))
wbkgnd = which(msc2$post > 0.5)
msc2$sig[wbkgnd]=1
msc2$sig[-wbkgnd]=0
mat = match(msc2$exp,psc$name)
msc2$type = psc$type[mat]
msc2$cond = psc$cond[mat]
grab the fits from the models based on unassigned tags instead: erica we can justify using the lower bounds (mean - 3*sd) from the ctrls for our bg threshold cutoff values – they are higher than before
hom = 6.1 caudy = 6.5 oliver = 6.3
note lower variance and lower means for the hom pool in general
fit.hom = lapply(mvn.hom,with,c(mu,sigma,lambda))
hfit = matrix(unlist(fit.hom),ncol=6,byrow=T)
rownames(hfit)=names(fit.hom)
colnames(hfit)=c("bkg_mean","ass_mean","bkg_sd","ass_sd","bkg_lambda","ass_lambda")
fit.cau = lapply(mvn.cau,with,c(mu,sigma,lambda))
cfit = matrix(unlist(fit.cau),ncol=6,byrow=T)
rownames(cfit)=names(fit.cau)
colnames(cfit)=c("bkg_mean","ass_mean","bkg_sd","ass_sd","bkg_lambda","ass_lambda")
fit.oliv = lapply(mvn.oliv,with,c(mu,sigma,lambda))
ofit = matrix(unlist(fit.oliv),ncol=6,byrow=T)
rownames(ofit)=names(fit.oliv)
colnames(ofit)=c("bkg_mean","ass_mean","bkg_sd","ass_sd","bkg_lambda","ass_lambda")
opar <- par(pch=19,mfrow=c(1,1),mar=c(2.2,2,1.4,1),cex=1.2)
plot(ofit[,1],ylim=c(5,12),col=1,main = "comparing pools mean good vs bad tags",cex.main=0.9,,type="b")
points(ofit[,2],ylim=c(5,12),col=4,type="b")
points(hfit[,1],ylim=c(5,12),col=2,type="b")
points(hfit[,2],ylim=c(5,12),col=5,type="b")
points(cfit[,1],ylim=c(5,12),col=3,type="b")
points(cfit[,2],ylim=c(5,12),col=6,type="b")
legend("topright", inset=.05, title="pool",
c("oliver.bg","hom.bg","caudy.bg","oliver","hom","caudy"), fill=1:6, cex=0.7)
hfit = apply(hfit,2,mean)
cfit = apply(cfit,2,mean)
ofit = apply(ofit,2,mean)
hom_bgthresh = hfit[2] - 3*hfit[4]
caudy_bgthresh = cfit[2] - 3*cfit[4]
oliv_bgthresh = ofit[2] - 3*ofit[4]
hom_bgthresh = hfit[1]
caudy_bgthresh =cfit[1]
oliv_bgthresh = ofit[1]
print(noquote("stats for unass/assigned hom,caudy and oliver respectively = "))
## [1] stats for unass/assigned hom,caudy and oliver respectively =
print(hfit)
## bkg_mean ass_mean bkg_sd ass_sd bkg_lambda ass_lambda
## 6.1019831 9.5738050 0.9242400 0.8667852 0.2446167 0.7553833
print(cfit)
## bkg_mean ass_mean bkg_sd ass_sd bkg_lambda ass_lambda
## 6.5650372 9.8201733 1.0970971 0.7369216 0.4035549 0.5964451
print(ofit)
## bkg_mean ass_mean bkg_sd ass_sd bkg_lambda ass_lambda
## 6.3821403 9.7263986 1.0899718 0.7657322 0.3400108 0.6599892
print(noquote("hom,caudy and oliver respectively: +/- 3 stdev from mean assigned tags = "))
## [1] hom,caudy and oliver respectively: +/- 3 stdev from mean assigned tags =
print(c(hom_bgthresh, hfit[2] + 3*hfit[4] ))
## bkg_mean ass_mean
## 6.101983 12.174161
print(c(caudy_bgthresh, cfit[2] + 3*cfit[4]))
## bkg_mean ass_mean
## 6.565037 12.030938
print(c(oliv_bgthresh, ofit[2] + 3*ofit[4]))
## bkg_mean ass_mean
## 6.38214 12.02360
check out differences in fitted gaussian parameters between pools
25% ~40% ~30% of all 10216 tags called background for hom caudy and oliver repectively
print(noquote("hom,caudy and oliver percent background vs present tags"))
## [1] hom,caudy and oliver percent background vs present tags
print(c(round(hfit[5]*100,2) ,round(hfit[6]*100,2) ))
## bkg_lambda ass_lambda
## 24.46 75.54
print(c(round(cfit[5]*100,2) ,round(cfit[6]*100,2) ))
## bkg_lambda ass_lambda
## 40.36 59.64
print(c(round(ofit[5]*100,2) ,round(ofit[6]*100,2) ))
## bkg_lambda ass_lambda
## 34 66
plot the data with the fitted curves
opar <- par(pch=19,mfrow=c(3,2),mar=c(2.2,0.5,1.4,0.5),cex=1.2)
print('hom_sc_24Sep14')
## [1] "hom_sc_24Sep14"
plot(mvn.hom$hom_sc_24Sep14,density = T)
print('oliver_sc_02Jun15')
## [1] "oliver_sc_02Jun15"
plot(mvn.oliv$oliver_sc_02Jun15,density = T)
print('caudy_sc_16Jun14')
## [1] "caudy_sc_16Jun14"
plot(mvn.cau$caudy_sc_16Jun14,density = T)
plot the assigned tag distribution with the posterior likelihood? of it being a “good” tag. (only ctrl exps shown)
w7 = which(psc$pcr %in% c("2015-07-15"))
w = which(psc$cond[w7] %in% c("arg","trp","sc") )
exps = psc$name[w7][w]
test = msc2 %>% filter(exp %in% exps) %>% droplevels()
ggplot(test) +
facet_wrap(pool ~ exp,ncol=3) +
geom_histogram(aes(x=value,fill=pool),bins=100) + facet_wrap(~cond,nrow=3)
ggplot(test) +
facet_wrap(pool ~ exp,ncol=3) +
geom_density(aes(x=value,col=pool)) + facet_wrap(pool~cond,nrow=3) +
geom_point(aes(x=value, y=post,col=pool))
mctrl = msc2 %>% filter(type=="ctrl") %>% droplevels()
caudy = msc2 %>% filter(pool== "caudy",type=="ctrl") %>% droplevels()
oliv = msc2 %>% filter(pool== "oliver",type=="ctrl") %>% droplevels()
hom = msc2 %>% filter(pool== "hom",type=="ctrl") %>% droplevels()
o = ggplot(oliv) +
geom_histogram(aes(x=value,y = ..ncount..,fill=exp), binwidth=0.1,position="dodge") +
geom_point(aes(x=value, y=post))+ theme_bw() + facet_grid(exp~.) + ggtitle("oliver ctrls")+
theme(legend.position="none")
o
c = ggplot(caudy) +
geom_histogram(aes(x=value,y = ..ncount..,fill=exp), binwidth=0.1,position="dodge") +
geom_point(aes(x=value, y=post))+ theme_bw() + facet_grid(exp~.) + ggtitle("caudy ctrls")+
theme(legend.position="none")
c
h = ggplot(hom) +
geom_histogram(aes(x=value,y = ..ncount..,fill=exp), binwidth=0.1,position="dodge") +
geom_point(aes(x=value, y=post))+ theme_bw() + facet_grid(exp~.) + ggtitle("hom ctrls") +
theme(legend.position="none")
h
ggplot(mctrl) +
geom_histogram(aes(x=value,y = ..ncount..,fill=pool), binwidth=0.1,position="dodge") +
geom_point(aes(x=value, y=post))+ theme_bw() + facet_wrap(pool~exp,nrow=3) + ggtitle("hom ctrls") +
theme(legend.position="none")
from here split out the pools and draw the Venn for tags w/ posterior > 0.5 in ~ 50% of experiments
i hate that i have to make an arbritrary cutoff after all this work! couldn’t we use pnorm to get the probabilty based on the estimated parameters of the gaussian?
actrl= acast(msc2, tag~pool+cond, value.var="sig",sum)
actrl = as.data.frame(actrl,stringsAsFactors = F)
gc = grep('caudy',colnames(actrl))
csum = apply(actrl[,gc],1,sum)
gh = grep('hom',colnames(actrl))
hsum = apply(actrl[,gh],1,sum)
go = grep('oliv',colnames(actrl))
osum = apply(actrl[,go],1,sum)
otab = table(osum)
htab = table(hsum)
ctab = table(csum)
opar <- par(pch=19,mfrow=c(3,2),mar=c(2.2,0.5,1.4,0.5),cex=1)
barplot(htab,col=2, main='count exps above threshold')
barplot(cumsum(prop.table(htab)),col=2, main='hom cumulative sum')
barplot(otab,col=1, main='oliver')
barplot(cumsum(prop.table(otab)),col=1, main='oliver cumulative sum')
barplot(ctab,col=3, main='caudy')
barplot(cumsum(prop.table(ctab)),col=3, main='caudy cumulative sum')
wc = which(csum > nrow(pcs)/2)
wo = which(osum > nrow(pos)/2)
wh = which(hsum > nrow(phs)/2)
#wc = which(actrl[,1]> 0.5)
#wh = which(actrl[,2]> 0.5)
#wo = which(actrl[,3]> 0.5)
ctag = rownames(actrl)[wc]
htag = rownames(actrl)[wh]
otag = rownames(actrl)[wo]
cmix=xcs[ctag,pcs$name]
hmix=xhs[htag,phs$name]
omix=xos[otag,pos$name]
hom_bgthresh = hfit[1]
caudy_bgthresh= cfit[1]
oliv_bgthresh = ofit[1]
bhix = myBeststrain(hmix,bgThresh = hom_bgthresh)
## [1] min value of ctrl medians = 7.46767155284592
## [1] 4470 30
bcix = myBeststrain(cmix,bgThresh = caudy_bgthresh)
## [1] min value of ctrl medians = 7.59040103547903
## [1] 3981 34
boix = myBeststrain(omix,bgThresh = oliv_bgthresh)
## [1] min value of ctrl medians = 7.81958360782921
## [1] 4380 39
from barplots its pretty clear that the significance call is all or none, though different for each pool there are lots of ways to call presence or absence given we have likelihood? odds? across many experiments,
need to check consistency between replicates of absence/presence call
compute batch corrected tag presence using our standard approach for later comparison 1. median normalize by experiment 2. throw out tags that are not about the background threshold 3. pick best tag by the choosing the one with lowest sd/mean (if robust = F) for ctrl chips
wos = which(pos$type == "ctrl")
wcs = which(pcs$type == "ctrl")
whs = which(phs$type == "ctrl")
cbest = myBeststrain(xcs[noness$strain,],bgThresh = caudy_bgthresh)
## [1] min value of ctrl medians = 6.56544558591269
## [1] 4606 34
hbest = myBeststrain(xhs[noness$strain,],bgThresh = hom_bgthresh)
## [1] min value of ctrl medians = 6.10519630177698
## [1] 4679 30
obest = myBeststrain(xos[noness$strain,],bgThresh = oliv_bgthresh)
## [1] min value of ctrl medians = 6.3844752063782
## [1] 4780 39
bghs = computeBgThresh(as.matrix(xsc[1:3600,phs$name]))
bgcs = computeBgThresh(as.matrix(xsc[1:3600,pcs$name]))
bgos = computeBgThresh(as.matrix(xsc[1:3600,pos$name]))
hbest = rmtags(hbest,whs,bghs)
## [1] 4679 30
## [1] 4679 30
## [1] 5.51681
## [1] 5.217231
cbest = rmtags(cbest,wcs,bgcs)
## [1] 4606 34
## [1] 4602 34
## [1] 5.717324
## [1] 5.193772
obest = rmtags(obest,wos,bgos)
## [1] 4780 39
## [1] 4777 39
## [1] 5.926253
## [1] 5.232661
bhix = rmtags(bhix,whs,bghs)
## [1] 4470 30
## [1] 4470 30
## [1] 6.806934
## [1] 5.217231
bcix= rmtags(bcix,wcs,bgcs)
## [1] 3981 34
## [1] 3981 34
## [1] 7.356527
## [1] 5.193772
boix = rmtags(boix,wos,bgos)
## [1] 4380 39
## [1] 4380 39
## [1] 7.269101
## [1] 5.232661
cbat = ComBat(cbest ,model.matrix(~cond,pcs),batch = pcs$pcr)
## Found 3 batches
## Adjusting for 12 covariate(s) or covariate level(s)
## Standardizing Data across genes
## Fitting L/S model and finding priors
## Finding parametric adjustments
## Adjusting the Data
hbat = ComBat(hbest ,model.matrix(~cond,phs),batch = phs$pcr)
## Found 3 batches
## Adjusting for 9 covariate(s) or covariate level(s)
## Standardizing Data across genes
## Fitting L/S model and finding priors
## Finding parametric adjustments
## Adjusting the Data
obat= ComBat(obest ,model.matrix(~cond,pos),batch = pos$pcr)
## Found 3 batches
## Adjusting for 12 covariate(s) or covariate level(s)
## Standardizing Data across genes
## Fitting L/S model and finding priors
## Finding parametric adjustments
## Adjusting the Data
bxxcs = ComBat(bcix ,model.matrix(~cond,pcs),batch = pcs$pcr)
## Found 3 batches
## Adjusting for 12 covariate(s) or covariate level(s)
## Standardizing Data across genes
## Fitting L/S model and finding priors
## Finding parametric adjustments
## Adjusting the Data
bxxhs = ComBat(bhix ,model.matrix(~cond,phs),batch = phs$pcr)
## Found 3 batches
## Adjusting for 9 covariate(s) or covariate level(s)
## Standardizing Data across genes
## Fitting L/S model and finding priors
## Finding parametric adjustments
## Adjusting the Data
bxxos= ComBat(boix,model.matrix(~cond,pos),batch = pos$pcr)
## Found 3 batches
## Adjusting for 12 covariate(s) or covariate level(s)
## Standardizing Data across genes
## Fitting L/S model and finding priors
## Finding parametric adjustments
## Adjusting the Data
compare the GENE lists in standard vs mixEM
hsd = setdiff(rownames(hbat),rownames(bxxhs))
osd = setdiff(rownames(obat),rownames(bxxos))
csd = setdiff(rownames(cbat),rownames(bxxcs))
hsd1 = setdiff(rownames(bxxhs),rownames(hbat))
osd1 = setdiff(rownames(bxxos),rownames(obat))
csd1 = setdiff(rownames(bxxcs),rownames(cbat))
print('# strains rejected by EM, caudy hom oliver')
## [1] "# strains rejected by EM, caudy hom oliver"
print(length(csd))
## [1] 963
print(length(hsd))
## [1] 368
print(length(osd))
## [1] 710
print('# strains rejected by std')
## [1] "# strains rejected by std"
print(length(csd1))
## [1] 342
print(length(hsd1))
## [1] 159
print(length(osd1))
## [1] 313
r compare distributions of rejected strains rejected by mixEM vs standard preprocessing
blue lines are strains called as background with mixEM but NOT in standard
orange lines are strains called as background with standard but NOT in mixEM
opar <- par(pch=19,mfrow=c(1,3),mar=c(2.2,2,1.4,1),cex=1.2)
plot(density(xhs[hsd1,whs[1]]),col=1,lwd=2,xlim=c(4,12),ylim=c(0,0.6),main='hom')
lines(density(xhs[hsd1,whs[2]]),col=1,lwd=2)
lines(density(xhs[hsd1,whs[3]]),col=1,lwd=2)
lines(density(xhs[hsd,whs[3]]),col=2,lwd=2)
lines(density(xhs[hsd,whs[2]]),col=2,lwd=2)
lines(density(xhs[hsd,whs[1]]),col=2,lwd=2)
legend("topright", inset=.05, title="method",
c("rejected std","rejected EM"), fill=1:2, cex=0.7)
plot(density(xos[osd1,wos[1]]),col=1,lwd=2,xlim=c(4,12),ylim=c(0,0.6),main='oliver')
lines(density(xos[osd1,wos[2]]),col=1,lwd=2)
lines(density(xos[osd1,wos[3]]),col=1,lwd=2)
lines(density(xos[osd,wos[3]]),col=2,lwd=2)
lines(density(xos[osd,wos[2]]),col=2,lwd=2)
lines(density(xos[osd,wos[1]]),col=2,lwd=2)
plot(density(xcs[csd1,wcs[1]]),col=1,lwd=2,xlim=c(4,12),ylim=c(0,0.6),main ='caudy')
lines(density(xcs[csd1,wcs[2]]),col=1,lwd=2)
lines(density(xcs[csd1,wcs[3]]),col=1,lwd=2)
lines(density(xcs[csd,wcs[2]]),col=2)
lines(density(xcs[csd,wcs[3]]),col=2,lwd=2)
lines(density(xcs[csd,wcs[1]]),col=2,lwd=2)
obat = getgene(obat)
cbat = getgene(cbat)
bxxos = getgene(bxxos)
bxxcs = getgene(bxxcs)
bxxhs = getgene(bxxhs)
hbat = getgene(hbat)
cbat = myduprow(cbat)
## [1] 4602 34
## [1] 4546 34
hbat = myduprow(hbat)
## [1] 4679 30
## [1] 4640 30
obat = myduprow(obat)
## [1] 4777 39
## [1] 4669 39
bxxcs = myduprow(bxxcs)
## [1] 3981 34
## [1] 3970 34
bxxhs = myduprow(bxxhs)
## [1] 4470 30
## [1] 4455 30
bxxos = myduprow(bxxos)
## [1] 4380 39
## [1] 4298 39
i hate venns but here goes: make a list of strains in each pool tags absent in all strains were removed
venn universe is the list of tags present in at least one pool
pie charts! think its easier to visualize –
compare this to our standard call of gene presence:
top venn is genes by mixEM, bottom is by standard
cboth= rownames(bxxcs)
hboth= rownames(bxxhs)
oboth= rownames(bxxos)
df = data.frame(gene = unique(noness$sgd_gene),stringsAsFactors = F)
df$mixcau = rep(NA,nrow(df))
df$mixhom = rep(NA,nrow(df))
df$mixoliv = rep(NA,nrow(df))
df$standcau = rep(NA,nrow(df))
df$standhom = rep(NA,nrow(df))
df$standoliv = rep(NA,nrow(df))
w = which(df$gene %in% cboth)
df$mixcau[w]= 1
df$mixcau[-w]= 0
w = which(df$gene %in% hboth)
df$mixhom[w]= 1
df$mixhom[-w]= 0
w =which(df$gene %in% oboth)
df$mixoliv[w]= 1
df$mixoliv[-w]= 0
w = which(df$gene %in% rownames(cbat))
df$standcau [w]= 1
df$standcau [-w]= 0
w = which(df$gene %in% rownames(hbat))
df$standhom[w]= 1
df$standhom [-w]= 0
w = which(df$gene %in% rownames(obat))
df$standoliv[w]= 1
df$standoliv [-w]= 0
df$ssum = df$standcau+df$standoliv+df$standhom
df$xsum = df$mixcau+df$mixoliv+df$mixhom
ws = which(df$ssum == 0)
xdf = df[,c(1:4,9)]
sdf = df[,c(1,5:8)]
ws = which(sdf$ssum == 0)
print(length(ws))
## [1] 110
sdf = sdf[-ws,]
wx = which(xdf$xsum == 0)
print(length(wx))
## [1] 215
xdf = xdf[-wx,]
mix = melt(xdf[,1:4],variable.name = "pool",value.name="call")
## Using gene as id variables
stand = melt(sdf[,1:4],variable.name = "pool",value.name="call")
## Using gene as id variables
piemix = pie_chart1(mix[,2:3], main = "call", condition = "pool")
piestand = pie_chart1(stand[,2:3], main = "call", condition = "pool")
name = c("caudy","hom","oliver")
opar <- par(pch=19,mfrow=c(2,1),mar=c(2.2,0.5,1.4,0.5),cex=1.2)
multiplot(piemix, piestand, layout=matrix(c(1,2), nrow=2, byrow=TRUE))
venn strains present by mix EM
t1 = ggvenn(cboth,hboth,oboth,category = name)
present by standard
t2 = ggvenn(rownames(cbat),rownames(hbat),rownames(obat))
venn strains missing by mix EM
univ = unique(c(cboth,hboth,oboth))
xcbg = setdiff(univ,cboth)
xhbg = setdiff(univ,hboth)
xobg = setdiff(univ,oboth)
t1 = ggvenn(xcbg,xhbg,xobg,category = name)
venn strains missing by standard
suniv = unique(c(rownames(obat),rownames(cbat),rownames(hbat)))
cbg = setdiff(suniv,rownames(cbat))
hbg = setdiff(suniv,rownames(hbat))
obg = setdiff(suniv,rownames(obat))
t1 = ggvenn(cbg,hbg,obg,category = name)
proportion venn genes missing by EM vs STD
opar <- par(pch=19,mfrow=c(1,2),mar=c(2.2,0.5,1.4,0.5),cex=1.2)
library(venneuler)
## Loading required package: rJava
mix0 = filter(mix,mix$call == 0)
v0 <- venneuler(mix0[,c(1,2)])
v0$colors=c(0.1,0.5,0.75)
cau = paste('mixcaud',length(xcbg))
hom = paste('mixhom',length(xhbg))
oliv = paste('mixoliv',length(xobg))
v0$labels = c(cau,hom,oliv)
plot(v0)
stand0 = filter(stand,stand$call == 0)
cau = paste('stdcaud',length(cbg))
hom = paste('stdhom',length(hbg))
oliv = paste('stdoliv',length(obg))
v0 <- venneuler(stand0[,c(1,2)])
v0$colors=c(0.1,0.5,0.75)
v0$labels = c(cau,hom,oliv)
plot(v0)
check fitness profiles by preprocessing method mixEM seems to work well in getting rid of flaky tags
whs = which(phs$type == "ctrl")
wcs = which(pcs$type == "ctrl")
wos = which(pos$type == "ctrl")
lobat = getLR(obat,wos)
lbxxos = getLR(bxxos,wos)
lcbat = getLR(cbat,wcs)
lbxxcs = getLR(bxxcs,wcs)
lhbat = getLR(hbat,whs)
lbxxhs = getLR(bxxhs,whs)
whs = which(phs$type == "ctrl")
wcs = which(pcs$type == "ctrl")
wos = which(pos$type == "ctrl")
lphs = phs[-whs,]
lpos = pos[-wos,]
lpcs = pcs[-wcs,]
wgly = which(lphs$cond %in% c("gln","gly",'ser','thr'))
lphs = lphs[-wgly,]
wgly = which(lpcs$cond %in% c("gln","gly",'ser','thr'))
lpcs = lpcs[-wgly,]
wgly = which(lpos$cond %in% c("gln","gly",'ser','thr'))
lpos = lpos[-wgly,]
lphs = lphs %>% arrange(cond,pcr)
lpcs = lpcs %>% arrange(cond,pcr)
lpos = lpos %>% arrange(cond,pcr)
lcbat = lcbat[,lpcs$name]
lbxxcs = lbxxcs[,lpcs$name]
lhbat = lhbat[,lphs$name]
lbxxhs = lbxxhs[,lphs$name]
lobat = lobat[,lpos$name]
lbxxos = lbxxos[,lpos$name]
opar <- par(pch=19,mfrow=c(1,2),mar=c(2.2,2,1.4,1),cex=1.2)
hsub = jul15 = grep("Jul15",colnames(lhbat))
csub = grep("Jul15",colnames(lcbat))
osub = jul15 = grep("Jul15",colnames(lobat))
colnames(lbxxos) = paste("EM.",colnames(lbxxos))
colnames(lbxxcs) = paste("EM.",colnames(lbxxcs))
colnames(lbxxhs) = paste("EM.",colnames(lbxxhs))
compare fitness profiles EM to standard
opar <- par(pch=19,mfrow=c(1,2),mar=c(2.2,2,1.4,1),cex=1.2)
for (i in hsub) psig(lhbat,lbxxhs,i)
for (i in osub) psig(lobat,lbxxos,i)
##check out this nice caudy LEU profile! corey reminded me of LEU2 in SGA strain contruction
for (i in csub) psig(lcbat,lbxxcs,i)
compare on same scales
harg = grep('arg',colnames(lhbat))
carg = grep('arg',colnames(lcbat))
oarg = grep('arg',colnames(lobat))
opar <- par(pch=19,mfrow=c(1,2),mar=c(2.2,2,1.4,1),cex=1.2)
for (i in harg) pscale1(lhbat,i,lbxxhs,i)
for (i in carg) pscale1(lcbat,i,lbxxcs,i)
for (i in oarg) pscale1(lobat,i,lbxxos,i)
harg = grep('lys',colnames(lhbat))
carg = grep('lys',colnames(lcbat))
oarg = grep('lys',colnames(lobat))
opar <- par(pch=19,mfrow=c(1,2),mar=c(2.2,2,1.4,1),cex=1.2)
for (i in harg) pscale1(lhbat,i,lbxxhs,i)
for (i in carg) pscale1(lcbat,i,lbxxcs,i)
for (i in oarg) pscale1(lobat,i,lbxxos,i)
compare differences by strain
hsd = setdiff(rownames(hbat),rownames(bxxhs))
hsd = sort(unique(hsd))
print('hom reject EM')
## [1] "hom reject EM"
print(hsd)
## [1] "AAH1" "ACB1" "ADE5,7" "ADE8" "ADK1"
## [6] "AIM26" "AMA1" "APL6" "ARG8" "ARO1"
## [11] "ARO7" "ARP5" "ARV1" "ASP1" "ATP12"
## [16] "ATP15" "BCS1" "BDS1" "BTS1" "BUB3"
## [21] "BUD32" "CAX4" "CCR4" "CHK1" "CNM67"
## [26] "COA2" "COQ6" "COR1" "COX10" "COX20"
## [31] "CPS1" "CTF4" "CYB2" "DAL82" "DEG1"
## [36] "DIA4" "DID4" "DSC2" "ECM22" "ECM31"
## [41] "EIS1" "ERG6" "EXO5" "FEN2" "FLX1"
## [46] "FMP52" "FYV6" "HAP2" "HBN1" "HSL1"
## [51] "ICE2" "IDH1" "IMD4" "IMG2" "IOC4"
## [56] "IPK1" "KAP122" "KAR3" "KRE28" "KRE6"
## [61] "LSO1" "LTV1" "MDS3" "MET13" "MHO1"
## [66] "MNN10" "MOT2" "MRH4" "MSS1" "NOT5"
## [71] "NUP84" "OPI9" "PDR5" "PET112" "PHO85"
## [76] "PIN2" "POG1" "PRO1" "PRO2" "PRS2"
## [81] "PSR1" "QCR9" "RBK1" "RLM1" "RNR4"
## [86] "RPB4" "RPL1B" "RPL31A" "RPS16B" "RPS23B"
## [91] "RPS25B" "RPS6A" "RRF1" "RRG8" "RRT14"
## [96] "RSC2" "RSM24" "RSM26" "RTF1" "SAF1"
## [101] "SAM3" "SAN1" "SIR3" "SKO1" "SNF12"
## [106] "SNF2" "SOM1" "SPC1" "SPT7" "SRC1"
## [111] "SSA2" "SSB2" "STE11" "STE3" "SUS1"
## [116] "SUV3" "SYM1" "TAF14" "THO2" "THP1"
## [121] "TIM8" "TKL2" "TRK1" "UBP15" "UGO1"
## [126] "UIP4" "VAM6" "VBA5" "VMA16" "VMA21"
## [131] "VMA22" "VMA7" "VPS15" "VPS16" "VPS33"
## [136] "VPS53" "YAL016C-B" "YBR096W" "YBR221W-A" "YCL021W-A"
## [141] "YDL183C" "YDL206W" "YDR524C-B" "YER076C" "YGL088W"
## [146] "YGL185C" "YGL218W" "YGR035W-A" "YGR121W-A" "YGR161W-C"
## [151] "YGR174W-A" "YGR204C-A" "YGR219W" "YHL015W-A" "YHR022C-A"
## [156] "YHR086W-A" "YIR018C-A" "YJL136W-A" "YKL018C-A" "YKL106C-A"
## [161] "YLL006W-A" "YLR361C-A" "YLR419W" "YML007C-A" "YML009W-B"
## [166] "YML079W" "YMR013W-A" "YMR141C" "YMR230W-A" "YNG2"
## [171] "YNL028W" "YNL097C-B" "YNL184C" "YOL097W-A" "YOL118C"
## [176] "YOL159C-A" "YOL164W-A" "YOR161C-C" "YOR316C-A" "YOR331C"
## [181] "YPL152W-A" "YPR099C" "YPR159C-A" "YTA12" "ZAP1"
osd = setdiff(rownames(lobat),rownames(lbxxos))
print('oliv reject EM')
## [1] "oliv reject EM"
osd = sort(osd)
print(osd)
## [1] "AAH1" "ADE1" "ADE3" "ADE4"
## [5] "ADE5,7" "ADE6" "ADK1" "AGP2"
## [9] "AIM44" "ALD5" "ALG9" "AMA1"
## [13] "ANP1" "APL6" "APQ12" "ARC18"
## [17] "ARG80" "ARL3" "ARP8" "ARV1"
## [21] "ASC1" "ASF1" "ASK10" "ASP1"
## [25] "ATG11" "ATG4" "ATP12" "ATP15"
## [29] "AVT5" "BAR1" "BEM4" "BNA7"
## [33] "BOR1" "BTS1" "BUD14" "BUD20"
## [37] "BUD23" "CAX4" "CBS2" "CCE1"
## [41] "CHA1" "CHC1" "CKA1" "CKB1"
## [45] "CLA4" "CNE1" "CNM67" "COQ8"
## [49] "CRP1" "CST6" "CTF8" "CTK3"
## [53] "CTR9" "CYC3" "DAK1" "DAL82"
## [57] "DCI1" "DEF1" "DEG1" "DFG16"
## [61] "DHH1" "DON1" "DRS2" "DUG3"
## [65] "DUN1" "ECM22" "ECM31" "EDE1"
## [69] "EIS1" "ELO2" "ELO3" "ELP2"
## [73] "EMC6" "END3" "ENT3" "ERG24"
## [77] "ERG3" "ERJ5" "ERP2" "EST3"
## [81] "ETR1" "EXO5" "FMP52" "FMT1"
## [85] "FPR1" "FRA1" "FUN14" "FUN30"
## [89] "FYV10" "FYV5" "FYV6" "GCN1"
## [93] "GCN4" "GLE2" "GLO3" "GRX8"
## [97] "HAP3" "HCR1" "HFI1" "HFM1"
## [101] "HIR1" "HMO1" "HMT1" "HOM3"
## [105] "HSM3" "HXT17" "IDH2" "IMD4"
## [109] "INM2" "IOC4" "IRC18" "IST3"
## [113] "IZH1" "JJJ3" "KAP122" "KTI11"
## [117] "KTR1" "KTR6" "LAC1" "LGE1"
## [121] "LHS1" "LOA1" "LOC1" "LSM1"
## [125] "LSM6" "LTV1" "LYS20" "MAP1"
## [129] "MDJ1" "MDL2" "MDM10" "MED1"
## [133] "MET10" "MET13" "MET22" "MET7"
## [137] "MFM1" "MGR2" "MGR3" "MGT1"
## [141] "MIP6" "MIX17" "MMM1" "MMS22"
## [145] "MNN10" "MNN2" "MPO1" "MRPL1"
## [149] "MRPL20" "MRPL4" "MRPS5" "MRT4"
## [153] "MSA1" "MSB4" "MSD1" "MSR1"
## [157] "MSS1" "MVB12" "NCR1" "NDE1"
## [161] "NDI1" "NGG1" "NHX1" "NPL3"
## [165] "NSR1" "NUC1" "NUP60" "OM14"
## [169] "OPI8" "OSW7" "OTU2" "PDH1"
## [173] "PEP12" "PEX12" "PEX13" "PEX21"
## [177] "PEX28" "PHO89" "PIM1" "PMP3"
## [181] "PMR1" "PMT1" "POP2" "POS5"
## [185] "PPN1" "PPT2" "PPZ1" "PRM2"
## [189] "PRM9" "PRO2" "PRS3" "PRY2"
## [193] "PSR1" "PUF4" "PUF6" "PXA2"
## [197] "RAD6" "RAD9" "RAM1" "RCN1"
## [201] "RCO1" "RCR1" "RCR2" "REI1"
## [205] "REV7" "RGC1" "RHB1" "RIM4"
## [209] "ROM1" "RPB4" "RPL12B" "RPL21A"
## [213] "RPL22A" "RPL27B" "RPL2B" "RPL36B"
## [217] "RPL37A" "RPL40A" "RPL42B" "RPL6A"
## [221] "RPL7B" "RPS10A" "RPS11A" "RPS17A"
## [225] "RPS18A" "RPS19B" "RPS21A" "RPS29B"
## [229] "RPS7A" "RPS9B" "RRG8" "RSA1"
## [233] "RSM26" "RTT109" "SAP1" "SAP4"
## [237] "SAS2" "SDH1" "SEM1" "SIC1"
## [241] "SIN4" "SIR4" "SIT4" "SKG1"
## [245] "SLS1" "SMF2" "SMK1" "SNF6"
## [249] "SNG1" "SNT309" "SPC72" "SPF1"
## [253] "SPL2" "SPO23" "SPT10" "SPT4"
## [257] "SRB5" "SSA2" "SSE1" "SSF1"
## [261] "SSH1" "SSQ1" "SSZ1" "STR2_paralog"
## [265] "SUS1" "SUV3" "SWC5" "SWI5"
## [269] "SWI6" "SWR1" "TAT2" "TGS1"
## [273] "THI2" "TIF1" "TKL2" "TMA10"
## [277] "TOD6" "TOF2" "TOM1" "TOP1"
## [281] "TRM9" "TSA2" "TSC3" "TSR3"
## [285] "TVP15" "UBP15" "UBP3" "UBP5"
## [289] "UFO1" "UME6" "UMP1" "URC2"
## [293] "URM1" "UTP30" "UTR4" "VBA1"
## [297] "VID27" "VMA1" "VMA10" "VMA11"
## [301] "VMA16" "VMA2" "VMA21" "VMA5"
## [305] "VMA7" "VMA9" "VPS15" "VPS28"
## [309] "VPS3" "VPS52" "XRS2" "YAL066W"
## [313] "YBL059W" "YBL071C" "YBR096W" "YBR178W"
## [317] "YBR225W" "YCL001W-B" "YCR101C" "YCR102C"
## [321] "YDJ1" "YDL068W" "YDL129W" "YDL183C"
## [325] "YDR199W" "YDR222W" "YDR401W" "YDR433W"
## [329] "YDR514C" "YEL045C" "YER010C" "YER046W-A"
## [333] "YFL012W" "YFL013W-A" "YFR018C" "YGK3"
## [337] "YGL072C" "YGL108C" "YGL188C-A" "YGR015C"
## [341] "YGR064W" "YGR117C" "YGR160W" "YHC3"
## [345] "YHK8" "YHR050W-A" "YIL025C" "YJL077W-B"
## [349] "YJR018W" "YJR154W" "YKL136W" "YKR051W"
## [353] "YLR269C" "YML009W-B" "YMR031W-A" "YMR242W-A"
## [357] "YNG2" "YNL140C" "YNL171C" "YOL079W"
## [361] "YOL118C" "YOR072W" "YOR139C" "YOR199W"
## [365] "YOR318C" "YOR331C" "YPL260W" "YPR084W"
## [369] "YPR123C" "YRR1" "ZRT2"
csd = setdiff(rownames(cbat),rownames(bxxcs))
csd = sort(csd)
print('caudy reject EM')
## [1] "caudy reject EM"
print(csd)
## [1] "1-Oct" "AAH1" "AAT2" "ACO2" "ADE2"
## [6] "ADE8" "ADH3" "ADK1" "ADO1" "AEP1"
## [11] "AEP2" "AFT1" "AIM10" "AIM22" "AIM44"
## [16] "ALO1" "ANP1" "APL6" "ARG1" "ARG2"
## [21] "ARG4" "ARG5,6" "ARG7" "ARL3" "ARO4"
## [26] "ARO7" "ARP8" "ARV1" "ARX1" "ASC1"
## [31] "ASF1" "ASK10" "ATG11" "ATG14" "ATG20"
## [36] "ATP1" "ATP17" "ATP22" "ATP3" "ATP5"
## [41] "AVL9" "AVT5" "BAS1" "BBC1" "BCS1"
## [46] "BFR1" "BNA1" "BOR1" "BRE1" "BTS1"
## [51] "BUB3" "BUD14" "BUD16" "BUD19" "BUD20"
## [56] "BUD21" "BUD27" "CBC2" "CBP2" "CCE1"
## [61] "CCM1" "CCR4" "CCS1" "CDA1" "CDA2"
## [66] "CDC10" "CHA1" "CHD1" "CHM7" "CHO2"
## [71] "CIR2" "CKB1" "CKB2" "CKI1" "CLA4"
## [76] "CLC1" "CNE1" "CNM67" "COA3" "COG1"
## [81] "COQ9" "COR1" "COX10" "COX11" "COX18"
## [86] "COX19" "COX7" "CPA1_uorf" "CPA2" "CRP1"
## [91] "CTF8" "CTK2" "CTK3" "CTR1" "CTR9"
## [96] "CUS2" "CYT1" "DAL82" "DCI1" "DDC1"
## [101] "DFG16" "DIA2" "DIA4" "DID4" "DOA4"
## [106] "DOC1" "DRS2" "DSS1" "DST1" "DUG3"
## [111] "DYN3" "ECM27" "ECM31" "ECM33" "ECM38"
## [116] "EFG1" "EFT2" "EIS1" "ELM1" "ELO2"
## [121] "ELO3" "ELP2" "ELP3" "EMC6" "END3"
## [126] "ENT3" "ERJ5" "ERP2" "EST1" "EST2"
## [131] "EST3" "ETR1" "EXO1" "FAB1" "FAR11"
## [136] "FAR7" "FEN2" "FMP48" "FMP52" "FMT1"
## [141] "FRA1" "FUN30" "FYV5" "FZO1" "GAL80"
## [146] "GAS1" "GCN1" "GCN5" "GCV3" "GEA2"
## [151] "GEP3" "GGC1" "GLE2" "GLO3" "GLR1"
## [156] "GLY1" "GPM2" "GRX8" "GSF2" "GTF1"
## [161] "GTR2" "HAC1" "HBN1" "HEF3" "HER2"
## [166] "HFM1" "HIR1" "HIS1" "HIS2" "HIS4"
## [171] "HIS5" "HIS7" "HIT1" "HMI1" "HMO1"
## [176] "HMT1" "HOC1" "HOM2" "HOM3" "HOM6"
## [181] "HOP2" "HPM1" "HPR1" "HRQ1" "HSL1"
## [186] "HSM3" "HTD2" "HUL4" "HUR1" "HXT17"
## [191] "ICE2" "ICS2" "IES1" "IKI1" "IMP2'"
## [196] "INM2" "INO4" "IOC4" "IRC18" "ISA2"
## [201] "IST3" "JIP4" "JJJ3" "JNM1" "KCS1"
## [206] "KGD1" "KGD2" "KIP1" "KTI11" "KTR1"
## [211] "KTR6" "LAT1" "LCL2" "LEA1" "LHS1"
## [216] "LIP2" "LMO1" "LPD1" "LRG1" "LSC2"
## [221] "LSM1" "LSM6" "LSO1" "LTV1" "LYP1"
## [226] "LYS12" "LYS5" "LYS9" "MAF1" "MAM33"
## [231] "MCM21" "MCT1" "MEF1" "MEF2" "MET1"
## [236] "MET18" "MET31" "MET8" "MFA1" "MFM1"
## [241] "MGM101" "MGR2" "MGR3" "MGT1" "MID1"
## [246] "MIP1" "MIP6" "MMM1" "MMS22" "MMS4"
## [251] "MNN2" "MRH4" "MRP1" "MRP20" "MRPL1"
## [256] "MRPL11" "MRPL13" "MRPL15" "MRPL16" "MRPL17"
## [261] "MRPL22" "MRPL27" "MRPL28" "MRPL35" "MRPL37"
## [266] "MRPL51" "MRPL9" "MRPS16" "MRPS17" "MRPS28"
## [271] "MRPS35" "MRPS9" "MRX14" "MSB4" "MSF1"
## [276] "MSH1" "MSM1" "MSR1" "MSS1" "MSS116"
## [281] "MST1" "MSW1" "MTG2" "MUM2" "MVB12"
## [286] "NAT3" "NCS6" "NDI1" "NEW1" "NGG1"
## [291] "NOT3" "NUC1" "NUP133" "NUP60" "NUP84"
## [296] "OM14" "OPI3" "OSW7" "OXA1" "PAN5"
## [301] "PAP2" "PBP4" "PBY1" "PDE2" "PDH1"
## [306] "PET111" "PET123" "PET130" "PET309" "PET54"
## [311] "PEX12" "PEX13" "PEX19" "PEX21" "PEX28"
## [316] "PFD1" "PFF1" "PHO89" "PKR1" "PLM2"
## [321] "PMR1" "PMS1" "PMT1" "POP2" "POR1"
## [326] "PPA2" "PPT2" "PRM9" "PRP18" "PRS3"
## [331] "PSD1" "PSR1" "PXA2" "QCR8" "QDR2"
## [336] "RAD9" "RAI1" "RCF2" "RCN1" "RCR1"
## [341] "RCR2" "REI1" "REV7" "REX2" "RFU1"
## [346] "RGC1" "RHB1" "RHO5" "RIM1" "RMD6"
## [351] "RNR4" "ROM1" "ROM2" "RPB4" "RPB9"
## [356] "RPL11B" "RPL13A" "RPL16A" "RPL16B" "RPL19A"
## [361] "RPL21A" "RPL22A" "RPL24A" "RPL27A" "RPL2B"
## [366] "RPL31A" "RPL34B" "RPL36B" "RPL42B" "RPL43A"
## [371] "RPL7B" "RPO41" "RPP2B" "RPS0A" "RPS10A"
## [376] "RPS11A" "RPS17A" "RPS21B" "RPS23B" "RPS24B"
## [381] "RPS29B" "RPS7A" "RPS8A" "RPS9B" "RRT8"
## [386] "RSA1" "RSM19" "RSM22" "RSM26" "RSM27"
## [391] "RTF1" "RTS1" "RTT101" "RTT109" "SAC1"
## [396] "SAM37" "SAP1" "SAP4" "SCO1" "SCS22"
## [401] "SDH1" "SDH4" "SDS3" "SEA4" "SEO1"
## [406] "SER2" "SHS1" "SIC1" "SIR3" "SLM5"
## [411] "SLM6" "SLT2" "SLX9" "SMF2" "SMK1"
## [416] "SNA3" "SNF1" "SNF3" "SNF4" "SNF7"
## [421] "SNG1" "SOH1" "SPL2" "SPO23" "SPO77"
## [426] "SPT3" "SPT7" "SRB2" "SRB5" "SRN2"
## [431] "SRO7" "SSA2" "SSE1" "SSH1" "SST2"
## [436] "STB5" "STE11" "STE5" "SUS1" "SVF1"
## [441] "SWC5" "SWF1" "SWH1" "SWI5" "SYG1"
## [446] "TAT2" "TCO89" "TEF4" "TFB5" "THI2"
## [451] "THP3" "THR1" "THR4" "TIF1" "TKL2"
## [456] "TOF2" "TPN1" "TRM9" "TRP1" "TRP2"
## [461] "TRX3" "TSA2" "TSC3" "TSR3" "UBC4"
## [466] "UBP3" "UME6" "URA1" "URA7" "URC2"
## [471] "URM1" "UTR1" "VAM3" "VBA1" "VBA2"
## [476] "VID22" "VID27" "VMA11" "VMA2" "VMA21"
## [481] "VMA9" "VPH2" "VPS1" "VPS24" "VPS28"
## [486] "VPS29" "VPS41" "VPS52" "VPS53" "VPS69"
## [491] "VPS70" "VPS73" "WHI3" "XRS2" "YAK1"
## [496] "YAL066W" "YAP1" "YBL065W" "YBL071C" "YBR096W"
## [501] "YBR178W" "YBR225W" "YCL001W-B" "YCL002C" "YCL007C"
## [506] "YCL021W-A" "YCR101C" "YCR102C" "YDJ1" "YDL041W"
## [511] "YDL118W" "YDL172C" "YDL211C" "YDR048C" "YDR090C"
## [516] "YDR114C" "YDR222W" "YDR442W" "YDR455C" "YDR514C"
## [521] "YEF1" "YEL045C" "YER010C" "YET3" "YFL012W"
## [526] "YFL013W-A" "YFR045W" "YGK3" "YGL041C" "YGL042C"
## [531] "YGL088W" "YGL108C" "YGL188C-A" "YGL218W" "YGR015C"
## [536] "YGR064W" "YGR117C" "YGR160W" "YHK8" "YHR050W-A"
## [541] "YIL025C" "YJL022W" "YJL043W" "YJL055W" "YJL077W-B"
## [546] "YJR011C" "YJR107W" "YJR154W" "YKL033W-A" "YKL136W"
## [551] "YKR051W" "YLR202C" "YLR297W" "YLR358C" "YLR407W"
## [556] "YML012C-A" "YML079W" "YML094C-A" "YMR294W-A" "YNL146W"
## [561] "YNL203C" "YNL226W" "YNR068C" "YOL118C" "YOR139C"
## [566] "YOR199W" "YOR283W" "YOR376W" "YPL062W" "YPL260W"
## [571] "YPR014C" "YPR084W" "YPT32" "YTP1" "ZRT2"
## [576] "ZUO1"
hsd1 = setdiff(rownames(bxxhs),rownames(hbat))
if(length(hsd1)==0) print('no hom genes rejected by standard')
## [1] "no hom genes rejected by standard"
if(length(hsd1) > 0) print(hsd1)
osd1 = setdiff(rownames(bxxos),rownames(obat))
if(length(osd1)==0) print('no oliver genes rejected by standard')
## [1] "no oliver genes rejected by standard"
if(length(osd1) > 0) print(osd1)
csd1 = setdiff(rownames(bxxcs),rownames(cbat))
if(length(csd1)==0) print('no caudy genes rejected by standard')
## [1] "no caudy genes rejected by standard"
if(length(osd1) > 0) print(csd1)
mix prefix = presence determined by fitting two gaussians stand prefix = presence determined by standard protocol of tag presences
xtab = table(mix$call,mix$pool)
stab = table(stand$call,stand$pool)
pxtab = round(xtab/nrow(xdf),2)
pstab = round(stab/nrow(sdf),2)
print(xtab)
##
## mixcau mixhom mixoliv
## 0 814 329 486
## 1 3970 4455 4298
print(stab)
##
## standcau standhom standoliv
## 0 343 249 220
## 1 4546 4640 4669
print(pxtab)
##
## mixcau mixhom mixoliv
## 0 0.17 0.07 0.10
## 1 0.83 0.93 0.90
print(pstab)
##
## standcau standhom standoliv
## 0 0.07 0.05 0.04
## 1 0.93 0.95 0.96
plot median vs mad of tags rejected ONLY by mixEM and not by our standard bkrnd thresholds: