filename = 080116_bkgrnd_edit_batchYPD.Rmd
hi pat! finally decided on a criteria – posterior > 0.5 in at least half of experiments, works better than just using the controls – update, works less well in the YPD drug experiments here, had to use a more stringent threshold
here’s our goal
Purpose Develop a procedure for identifying poorly performing tags; specific goal being to distinguish strains NOT in the pool for three YKO derived collection:
auxotrophic YKO (hom)
prototrophic collection made by SGA/back-crossing (caudy)
prototrophic collection made by CEN plasmid complementation of the auxotrophies
this is really a preprocessing step for now.
uber important as this is a major point of this manuscript
erica this sight might me as useful for you as it was for me to understand what mixEM is doing http://rstudio-pubs-static.s3.amazonaws.com/1001_3177e85f5e4840be840c84452780db52.html
pat, my main issue as always is where to draw the cutoff?
for now i just enforced the rule that all 3 control experiments had to have a posterior > 0.05 i know why i used 0.05 – thought they were pvalues and wanted to be 95% confident they were background, but its too stringent changed to > 0.5 in ~ 1/2 of experiments
i am sure there are more robust ways of calling/refining it. maybe even on a per tag basis rather than a per experiment basis? (eventually). also wasn’t sure if the batch correction was kosher…
batch correction definitely helps for SC, else the distributions all see to require different values based on plots
we did lose a few precious genes (i hope not too many) and probably need to tweak it, but overall it worked well
primarily we lost ADE genes from oliver – but if you look at these genes they are really flaky and close to background
steps
get YPD data for all pools, xypd is the data matrix, pypd describes experiments
xypd = read.delim(
"jul24_2016_allpools_YPD.txt",header = T,stringsAsFactors = F,check.names = F)
pypd = read.delim(
"jul24_2016_allpools_pYPD.txt",
header = T,
stringsAsFactors = F,
check.names = F
)
fdata6 = read.delim("jun10_2016_fdata6.txt",
stringsAsFactors = F,
check.names = F)
noness = filter(fdata6, fdata6$essential == 0)
noness = noness %>% arrange(strain)
ess = filter(fdata6, fdata6$essential == 1)
pypd = pypd %>% arrange(pool,type,cond,pcr)
xypd = xypd[,pypd$name]
assign tag type to each, plot ctrl experiments to compare distributions of unassigned vs assigned tags
removed unassigned in this run and used essentials instead
#
# unass = grep("tag_",rownames(xypd))
#
# rownames(xypd)[1:1800]=gsub(":uptag",":downtag",rownames(xypd)[1:1800])
wnoness = which(rownames(xypd) %in% noness$strain)
xypd$assign[wnoness] = "noness"
xypd$assign[1:3600] = "unass"
xypd$tag = rownames(xypd)
mat = match(xypd$tag, fdata6$strain)
xypd$gene = fdata6$sgd_gene[mat]
pcs = filter(pypd,pool == "caudy")
phs = filter(pypd,pool == "hom")
pos = filter(pypd,pool == "oliver")
whs = which(phs$type == "ctrl")
wcs = which(pcs$type == "ctrl")
wos = which(pos$type == "ctrl")
cs = sample(pcs$name[wcs],3)
hs = sample(phs$name[whs],3)
os = sample(pos$name[wos],3)
ypd = c(cs,hs,os)
w = which(names(xypd) %in% c(ypd,"assign","tag","gene"))
test = xypd[, w]
m <- melt(test, id.vars=c("tag","assign","gene"), variable.name = "exp")
mat = match(m$exp,pypd$name)
m$pool = pypd$pool[mat]
m$tag = factor(m$tag)
m$pool = factor(m$pool)
m$assign = factor(m$assign)
ggplot(m, aes(value)) + geom_histogram(aes(fill = pool), bins=50) +
facet_grid(assign~pool) + ggtitle("distribution by pool and tag type")
ggplot(m, aes(value))+geom_freqpoly(aes(colour=pool), bins=50)+facet_grid(assign~.)+theme_bw()+ ggtitle("distribution by pool")
ggplot(m, aes(value)) + geom_histogram(aes(fill = assign), bins=50) +
facet_wrap(~exp,ncol=3) + ggtitle("distribution by pool and tag type ctrl chips")+theme_bw()
batch correct and normalize first – all tags together
plot before and after batch correction to see if it helps align distributiosn
density
dendograms
mydendy(xhs,phs$pcr,main="prebatch hom correct color = pcr date")
## 'dendrogram' with 2 branches and 26 members total, at height 151.3245
mydendy(bhs,phs$pcr, main="post batch hom correct color = pcr date")
## 'dendrogram' with 2 branches and 26 members total, at height 75.7054
mydendy(xhs,phs$cond,main="prebatch hom correct color = condition")
## 'dendrogram' with 2 branches and 26 members total, at height 151.3245
mydendy(bhs,phs$cond, main="post batch hom correct color = condition")
## 'dendrogram' with 2 branches and 26 members total, at height 75.7054
mydensity(xhs,phs$pcr,main="pre batch hom correct density, color = pcr date")
mydensity(bhs,phs$pcr, main="post batch hom correct density, color = pcr date")
reformat long-2-wide
print('unass included')
## [1] "unass included"
bsc = cbind(bcs,bhs,bos)
bsc = as.data.frame(bsc,stringsAsFactors = F)
w = which(names(xypd) %in% c("assign","tag","gene"))
bsc = cbind(bsc,xypd[,w])
msc <- melt(bsc, id.vars=c("tag","assign","gene"), variable.name = "exp")
mat = match(msc$exp,pypd$name)
msc$pool = pypd$pool[mat]
msc$cond = pypd$cond[mat]
msc$type = pypd$type[mat]
estimate of the unassigned means and sd – pretty equal for all pools
ua.mean <- aggregate(value ~ pool, data=msc, FUN=mean, subset=assign=='unass')
cmean = filter(ua.mean,pool == "caudy")
hmean = filter(ua.mean,pool == "hom")
omean = filter(ua.mean,pool == "oliver")
ua.var <- aggregate(value ~ pool, data=msc, FUN=sd, subset=assign=='unass')
cvar = filter(ua.var,pool == "caudy")
hvar = filter(ua.var,pool == "hom")
ovar = filter(ua.var,pool == "oliver")
print(ua.mean)
## pool value
## 1 caudy 5.251039
## 2 hom 5.268618
## 3 oliver 5.330938
print(ua.var)
## pool value
## 1 caudy 0.6937826
## 2 hom 0.4193457
## 3 oliver 0.7380817
re batch correct without the unassigned tags
bcs = medShift.updn(as.matrix(xcs[noness$strain,pcs$name]))
bhs = medShift.updn(as.matrix(xhs[noness$strain,phs$name]))
bos = medShift.updn(as.matrix(xos[noness$strain,pos$name]))
bcs = ComBat(bcs[noness$strain,pcs$name],model.matrix(~cond,pcs),batch = pcs$pcr)
## Found 6 batches
## Adjusting for 5 covariate(s) or covariate level(s)
## Standardizing Data across genes
## Fitting L/S model and finding priors
## Finding parametric adjustments
## Adjusting the Data
bhs = ComBat(bhs[noness$strain,phs$name],model.matrix(~cond,phs),batch = phs$pcr)
## Found 5 batches
## Adjusting for 5 covariate(s) or covariate level(s)
## Standardizing Data across genes
## Fitting L/S model and finding priors
## Finding parametric adjustments
## Adjusting the Data
bos = ComBat(bos[noness$strain,pos$name],model.matrix(~cond,pos),batch = pos$pcr)
## Found 6 batches
## Adjusting for 5 covariate(s) or covariate level(s)
## Standardizing Data across genes
## Fitting L/S model and finding priors
## Finding parametric adjustments
## Adjusting the Data
print('unass removed')
## [1] "unass removed"
bsc = cbind(bcs,bhs,bos)
bsc = as.data.frame(bsc,stringsAsFactors = F)
bsc$tag = rownames(bsc)
table(bsc$tag == noness$strain)
##
## TRUE
## 10216
bsc$gene = noness$sgd_gene
msc <- melt(bsc, id.vars=c("tag","gene"), variable.name = "exp")
mat = match(msc$exp,pypd$name)
msc$pool = pypd$pool[mat]
msc$cond = pypd$cond[mat]
msc$type = pypd$type[mat]
Plot histogram of assigned and unassigned for all experiments. This will generate a tall lattice of plots, so only save to pdf, but do not show in notebook.
It is pretty clear than around June 2 there is a leftward shift in the assigned tags and then a shift back to the right around July 15.
ggnote – i batch corrected this round –
I think we can fit a single gaussian to the unassigned tag distribution. Then fit a mixture of two Gaussians with one of the two fixed at the unassigned tag distribution and the other unconstrained for the assigned tags. Then we can use the posterior distribution over the cluster assignment variables to give an odds-ratio of the tag assignement. Then we can draw a threshold on the odds-ratio.
ggnote: drew a threshold based on 0.05
Fit a 2 component Gaussian mixture model to each experiment’s NONESS assigned tags. Store the posterior probability of the tag assignment to the “good” or “not unassigned” distribution" for each experiment.
note the difference in number of iterations needed to converge each time this is run – us this something to worry about? takes more iterations without batch correction
SC no batch YPD
caudy = 183 – 282 187 hom = 78 131 127 oliver = 312 242 778!!!
reformat dataframe wide to long for plotting
most<- melt(post, id.vars=c("tag"), variable.name = "exp",value.name = "post")
msc2 <- merge(msc, most, by=c("tag","exp"))
# call bkgnd tags by posterior > 0.5
msc2$sig = rep(NA,nrow(msc2))
wbkgnd = which(msc2$post > 0.5)
msc2$sig[wbkgnd]=1
msc2$sig[-wbkgnd]=0
mat = match(msc2$exp,pypd$name)
msc2$type = pypd$type[mat]
msc2$cond = pypd$cond[mat]
grab the fits from the models based on unassigned tags instead: erica we can justify background thresholds using these
note lower variance and lower means for the hom pool in general
fit.hom = lapply(mvn.hom,with,c(mu,sigma,lambda))
hfit = matrix(unlist(fit.hom),ncol=6,byrow=T)
rownames(hfit)=names(fit.hom)
colnames(hfit)=c("bkg_mean","ass_mean","bkg_sd","ass_sd","bkg_lambda","ass_lambda")
fit.cau = lapply(mvn.cau,with,c(mu,sigma,lambda))
cfit = matrix(unlist(fit.cau),ncol=6,byrow=T)
rownames(cfit)=names(fit.cau)
colnames(cfit)=c("bkg_mean","ass_mean","bkg_sd","ass_sd","bkg_lambda","ass_lambda")
fit.oliv = lapply(mvn.oliv,with,c(mu,sigma,lambda))
ofit = matrix(unlist(fit.oliv),ncol=6,byrow=T)
rownames(ofit)=names(fit.oliv)
colnames(ofit)=c("bkg_mean","ass_mean","bkg_sd","ass_sd","bkg_lambda","ass_lambda")
opar <- par(pch=19,mfrow=c(1,1),mar=c(2.2,2,1.4,1),cex=1.2)
plot(ofit[,1],ylim=c(5,12),col=1,main = "comparing pools mean good vs bad tags",cex.main=0.9,,type="b")
points(ofit[,2],ylim=c(5,12),col=4,type="b")
points(hfit[,1],ylim=c(5,12),col=2,type="b")
points(hfit[,2],ylim=c(5,12),col=5,type="b")
points(cfit[,1],ylim=c(5,12),col=3,type="b")
points(cfit[,2],ylim=c(5,12),col=6,type="b")
legend("topright", inset=.05, title="pool",
c("oliver.bg","hom.bg","caudy.bg","oliver","hom","caudy"), fill=1:6, cex=0.7)
hfit = apply(hfit,2,mean)
cfit = apply(cfit,2,mean)
ofit = apply(ofit,2,mean)
hom_bgthresh = hfit[1]
caudy_bgthresh =cfit[1]
oliv_bgthresh = ofit[1]
print(noquote("stats for unass/assigned hom,caudy and oliver respectively = "))
## [1] stats for unass/assigned hom,caudy and oliver respectively =
print(hfit)
## bkg_mean ass_mean bkg_sd ass_sd bkg_lambda ass_lambda
## 5.7271166 9.1996259 0.5743220 1.0161595 0.2035815 0.7964185
print(cfit)
## bkg_mean ass_mean bkg_sd ass_sd bkg_lambda ass_lambda
## 5.8748559 9.5280973 0.7413370 0.9805742 0.3217994 0.6782006
print(ofit)
## bkg_mean ass_mean bkg_sd ass_sd bkg_lambda ass_lambda
## 5.6978940 9.3414196 0.6196868 1.0285872 0.2413985 0.7586015
not sure how to justify the threshold here here based on fitted parameters or by the plots below
any way to get a probability
print(noquote("hom,caudy and oliver respectively: +/- 3 stdev from mean assigned tags = "))
## [1] hom,caudy and oliver respectively: +/- 3 stdev from mean assigned tags =
print(c(hfit[2] - 3*hfit[4], hfit[2] + 3*hfit[4] ))
## ass_mean ass_mean
## 6.151147 12.248105
print(c(cfit[2] - 3*cfit[4], cfit[2] + 3*cfit[4]))
## ass_mean ass_mean
## 6.586375 12.469820
print(c(ofit[2] - 3*ofit[4], ofit[2] + 3*ofit[4]))
## ass_mean ass_mean
## 6.255658 12.427181
check out differences in fitted gaussian parameters between pools
the ratio of background (lambda) to assigned is
print(noquote("hom,caudy and oliver percent background vs present tags"))
## [1] hom,caudy and oliver percent background vs present tags
print(c(round(hfit[5]*100,2) ,round(hfit[6]*100,2) ))
## bkg_lambda ass_lambda
## 20.36 79.64
print(c(round(cfit[5]*100,2) ,round(cfit[6]*100,2) ))
## bkg_lambda ass_lambda
## 32.18 67.82
print(c(round(ofit[5]*100,2) ,round(ofit[6]*100,2) ))
## bkg_lambda ass_lambda
## 24.14 75.86
opar <- par(pch=19,mfrow=c(3,2),mar=c(2.2,0.5,1.4,0.5),cex=1.2)
print(paste(names(mvn.hom)[1]))
## [1] "hom_ypd_15Jan15"
plot(mvn.hom[[1]],density = T)
print(paste(names(mvn.cau)[1]))
## [1] "caudy_ypd_15Jan15"
plot(mvn.cau[[1]],density = T)
print(paste(names(mvn.oliv)[1]))
## [1] "oliver_ypd_15Jan15"
plot(mvn.oliv[[1]],density = T)
direct comparison of intensity values scaled histograms
library(scales)
diff = ggplot(msc2) +
geom_histogram(aes(x=value,y = ..ncount..,fill=pool), binwidth=0.25,position="dodge")
diff = diff + scale_y_continuous(labels = percent_format())
diff =diff + theme(panel.grid = element_blank(),
panel.border = element_rect(colour = "black", fill=NA, size=0.5))
diff2=diff + theme(panel.grid = element_blank(),
panel.background = element_blank(),
panel.border = element_rect(colour = "black", fill=NA, size=0.5))
diff2 + facet_grid(sig~.)+ theme(legend.position=c(0.7, 0.7)+ ggtitle("scaled distrubutions in two pools"))
plot the assigned tag distribution with the posterior probability of it being a “good” tag. (only ctrl exps shown) its tough to see where the cutoff shoud be here for caudy especially
mat = match(msc2$exp,pypd$name)
msc2$pcr = pypd$pcr[mat]
w7 = which(pypd$pcr %in% c("2015-01-15"))
whs = which(phs$type == "ctrl")
wcs = which(pcs$type == "ctrl")
wos = which(pos$type == "ctrl")
cs = sample(pcs$name[wcs],3)
hs = sample(phs$name[whs],3)
os = sample(pos$name[wos],3)
exps = c(cs,hs,os)
test = msc2 %>% filter(exp %in% exps) %>% droplevels()
ggplot(test) +
facet_wrap(pool ~ exp,ncol=3) +
geom_density(aes(x=value,col=pool)) +
geom_point(aes(x=value, y=post,col=pool))
cs = sample(pcs$name[wcs],3)
hs = sample(phs$name[whs],3)
os = sample(pos$name[wos],3)
mctrl = msc2 %>% filter(type=="ctrl",exp %in% c(cs,hs,os)) %>% droplevels()
caudy = msc2 %>% filter(pool== "caudy",type=="ctrl",exp %in% cs) %>% droplevels()
oliv = msc2 %>% filter(pool== "oliver",type=="ctrl",exp %in% os) %>% droplevels()
hom = msc2 %>% filter(pool== "hom",type=="ctrl",exp %in% hs) %>% droplevels()
o = ggplot(oliv) +
geom_histogram(aes(x=value,y = ..ncount..,fill=exp), binwidth=0.1,position="dodge") +
geom_point(aes(x=value, y=post))+ theme_bw() + facet_grid(exp~.) + ggtitle("oliver ctrls")+
theme(legend.position="none")
o
c = ggplot(caudy) +
geom_histogram(aes(x=value,y = ..ncount..,fill=exp), binwidth=0.1,position="dodge") +
geom_point(aes(x=value, y=post))+ theme_bw() + facet_grid(exp~.) + ggtitle("caudy ctrls")+
theme(legend.position="none")
c
h = ggplot(hom) +
geom_histogram(aes(x=value,y = ..ncount..,fill=exp), binwidth=0.1,position="dodge") +
geom_point(aes(x=value, y=post))+ theme_bw() + facet_grid(exp~.) + ggtitle("hom ctrls") +
theme(legend.position="none")
h
ggplot(mctrl) +
geom_histogram(aes(x=value,y = ..ncount..,fill=pool), binwidth=0.1,position="dodge") +
geom_point(aes(x=value, y=post))+ theme_bw() + facet_wrap(pool~exp,nrow=3) + ggtitle("sample of ctrls") +
theme(legend.position="none")
from here split out the pools and draw the Venn for tags w/ posterior > 0.5 in ~ 50% of experiments
because they ypd experiments are so noisy and we have more controls, only use controls
i hate that i have to make an arbritrary cutoff after all this work! couldn’t we use pnorm to get the probabilty based on the estimated parameters of the gaussian?
its really different for YPD compared to SC
increased my arbitraryness by requiring ALL controls > 0.5
mctrl = msc2 %>% filter(type=="ctrl") %>% droplevels()
actrl= acast(mctrl, tag~pool+exp, value.var="sig",sum)
actrl = as.data.frame(actrl,stringsAsFactors = F)
gc = grep('caudy',colnames(actrl))
csum = apply(actrl[,gc],1,sum)
gh = grep('hom',colnames(actrl))
hsum = apply(actrl[,gh],1,sum)
go = grep('oliv',colnames(actrl))
osum = apply(actrl[,go],1,sum)
otab = table(osum)
htab = table(hsum)
ctab = table(csum)
opar <- par(pch=19,mfrow=c(3,2),mar=c(2.2,0.5,1.4,0.5),cex=1)
barplot(htab,col=2, main='count of exps that pass threshold hom')
barplot(cumsum(prop.table(htab)),col=2, main='hom cumulative sum')
barplot(otab,col=1, main='oliver')
barplot(cumsum(prop.table(otab)),col=1, main='oliver cumulative sum')
barplot(ctab,col=3, main='caudy')
barplot(cumsum(prop.table(ctab)),col=3, main='caudy cumulative sum')
wc = which(csum == length(wcs))
wo = which(osum == length(wos))
wh = which(hsum == length(whs))
# wc = which(csum > nrow(pcs[wcs,])/2)
# wo = which(osum > nrow(pos[wos,])/2)
# wh = which(hsum > nrow(phs[whs,])/2)
#
#wc = which(actrl[,1]> 0.5)
#wh = which(actrl[,2]> 0.5)
#wo = which(actrl[,3]> 0.5)
ctag = rownames(actrl)[wc]
htag = rownames(actrl)[wh]
otag = rownames(actrl)[wo]
cmix=xcs[ctag,pcs$name]
hmix=xhs[htag,phs$name]
omix=xos[otag,pos$name]
hom_bgthresh = hfit[1]
caudy_bgthresh= cfit[1]
oliv_bgthresh = ofit[1]
bhix = myBeststrain(hmix,bgThresh = hom_bgthresh)
## [1] min value of ctrl medians = 6.2917072054257
## [1] 4391 26
bcix = myBeststrain(cmix,bgThresh = caudy_bgthresh)
## [1] min value of ctrl medians = 8.00201061527352
## [1] 3884 32
boix = myBeststrain(omix,bgThresh = oliv_bgthresh)
## [1] min value of ctrl medians = 7.45872049969686
## [1] 4374 29
from barplots its pretty clear that the significance call is all or none, though different for each pool there are lots of ways to call presence or absence given we have likelihood? odds? across many experiments,
need to check consistency between replicates of absence/presence call
compute batch corrected tag presence using our standard approach for later comparison 1. median normalize by experiment 2. throw out tags that are not about the background threshold 3. pick best tag by the choosing the one with lowest sd/mean (if robust = F) for ctrl chips
check the STRAINS rejected by EM vs those rejected by standard preprocessing
hsd = setdiff(rownames(hbat),rownames(bxxhs))
osd = setdiff(rownames(obat),rownames(bxxos))
csd = setdiff(rownames(cbat),rownames(bxxcs))
hsd1 = setdiff(rownames(bxxhs),rownames(hbat))
osd1 = setdiff(rownames(bxxos),rownames(obat))
csd1 = setdiff(rownames(bxxcs),rownames(cbat))
print(' strains rejected by EM, caudy hom oliver')
## [1] " strains rejected by EM, caudy hom oliver"
print(length(csd))
## [1] 1013
print(length(hsd))
## [1] 519
print(length(osd))
## [1] 729
print('# strains rejected by std')
## [1] "# strains rejected by std"
print(length(csd1))
## [1] 311
print(length(hsd1))
## [1] 232
print(length(osd1))
## [1] 268
r compare distributions of rejected strains by mixEM and std
orange = reject by standard preprocessing, blue = rejected by mixEM
opar <- par(pch=19,mfrow=c(1,3),mar=c(2.2,2,1.4,1),cex=1.2)
plot(density(xhs[hsd1,whs[1]]),col=1,lwd=2,xlim=c(4,12),ylim=c(0,0.6),main='hom')
lines(density(xhs[hsd1,whs[2]]),col=1,lwd=2)
lines(density(xhs[hsd1,whs[3]]),col=1,lwd=2)
lines(density(xhs[hsd,whs[3]]),col=2,lwd=2)
lines(density(xhs[hsd,whs[2]]),col=2,lwd=2)
lines(density(xhs[hsd,whs[1]]),col=2,lwd=2)
legend("topright", inset=.05, title="method",
c("rejected std","rejected EM"), fill=1:2, cex=0.7)
plot(density(xos[osd1,wos[1]]),col=1,lwd=2,xlim=c(4,12),ylim=c(0,0.6),main='oliver')
lines(density(xos[osd1,wos[2]]),col=1,lwd=2)
lines(density(xos[osd1,wos[3]]),col=1,lwd=2)
lines(density(xos[osd,wos[3]]),col=2,lwd=2)
lines(density(xos[osd,wos[2]]),col=2,lwd=2)
lines(density(xos[osd,wos[1]]),col=2,lwd=2)
plot(density(xcs[csd1,wcs[1]]),col=1,lwd=2,xlim=c(4,12),ylim=c(0,0.6),main ='caudy')
lines(density(xcs[csd1,wcs[2]]),col=1,lwd=2)
lines(density(xcs[csd1,wcs[3]]),col=1,lwd=2)
lines(density(xcs[csd,wcs[2]]),col=2)
lines(density(xcs[csd,wcs[3]]),col=2,lwd=2)
lines(density(xcs[csd,wcs[1]]),col=2,lwd=2)
i hate venns but here goes: make a list of strains in each pool tags absent in all strains were removed
pie charts! think its easier to visualize
compare this to our standard call of gene presence:
top venn is genes by mixEM, bottom is by standard
cboth= rownames(bxxcs)
hboth= rownames(bxxhs)
oboth= rownames(bxxos)
df = data.frame(gene = unique(noness$sgd_gene),stringsAsFactors = F)
df$mixcau = rep(NA,nrow(df))
df$mixhom = rep(NA,nrow(df))
df$mixoliv = rep(NA,nrow(df))
df$standcau = rep(NA,nrow(df))
df$standhom = rep(NA,nrow(df))
df$standoliv = rep(NA,nrow(df))
w = which(df$gene %in% cboth)
df$mixcau[w]= 1
df$mixcau[-w]= 0
w = which(df$gene %in% hboth)
df$mixhom[w]= 1
df$mixhom[-w]= 0
w =which(df$gene %in% oboth)
df$mixoliv[w]= 1
df$mixoliv[-w]= 0
w = which(df$gene %in% rownames(cbat))
df$standcau [w]= 1
df$standcau [-w]= 0
w = which(df$gene %in% rownames(hbat))
df$standhom[w]= 1
df$standhom [-w]= 0
w = which(df$gene %in% rownames(obat))
df$standoliv[w]= 1
df$standoliv [-w]= 0
df$ssum = df$standcau+df$standoliv+df$standhom
df$xsum = df$mixcau+df$mixoliv+df$mixhom
ws = which(df$ssum == 0)
xdf = df[,c(1:4,9)]
sdf = df[,c(1,5:8)]
ws = which(sdf$ssum == 0)
print(length(ws))
## [1] 107
sdf = sdf[-ws,]
wx = which(xdf$xsum == 0)
print(length(wx))
## [1] 290
xdf = xdf[-wx,]
mix = melt(xdf[,1:4],variable.name = "pool",value.name="call")
## Using gene as id variables
stand = melt(sdf[,1:4],variable.name = "pool",value.name="call")
## Using gene as id variables
piemix = pie_chart1(mix[,2:3], main = "call", condition = "pool")
piestand = pie_chart1(stand[,2:3], main = "call", condition = "pool")
name = c("caudy","hom","oliver")
piecharts each pool vs universe of all strains present in at least one pool
opar <- par(pch=19,mfrow=c(2,1),mar=c(2.2,0.5,1.4,0.5),cex=1.2)
multiplot(piemix, piestand, layout=matrix(c(1,2), nrow=2, byrow=TRUE))
venn strains present by mix EM
t1 = ggvenn(cboth,hboth,oboth,category = name)
present by standard
t2 = ggvenn(rownames(cbat),rownames(hbat),rownames(obat))
venn strains missing by mix EM
univ = unique(c(cboth,hboth,oboth))
xcbg = setdiff(univ,cboth)
xhbg = setdiff(univ,hboth)
xobg = setdiff(univ,oboth)
t1 = ggvenn(xcbg,xhbg,xobg,category = name)
venn strains missing by standard
suniv = unique(c(rownames(obat),rownames(cbat),rownames(hbat)))
cbg = setdiff(suniv,rownames(cbat))
hbg = setdiff(suniv,rownames(hbat))
obg = setdiff(suniv,rownames(obat))
t1 = ggvenn(cbg,hbg,obg,category = name)
proportion venn genes missing by EM vs STD
opar <- par(pch=19,mfrow=c(1,2),mar=c(2.2,0.5,1.4,0.5),cex=1.2)
library(venneuler)
## Loading required package: rJava
mix0 = filter(mix,mix$call == 0)
v0 <- venneuler(mix0[,c(1,2)])
v0$colors=c(0.1,0.5,0.75)
cau = paste('mixcaud',length(xcbg))
hom = paste('mixhom',length(xhbg))
oliv = paste('mixoliv',length(xobg))
v0$labels = c(cau,hom,oliv)
plot(v0)
stand0 = filter(stand,stand$call == 0)
cau = paste('stdcaud',length(cbg))
hom = paste('stdhom',length(hbg))
oliv = paste('stdoliv',length(obg))
v0 <- venneuler(stand0[,c(1,2)])
v0$colors=c(0.1,0.5,0.75)
v0$labels = c(cau,hom,oliv)
plot(v0)
check fitness profiles by preprocessing method mixEM seems to work well in getting rid of flaky tags
whs = which(phs$type == "ctrl")
wcs = which(pcs$type == "ctrl")
wos = which(pos$type == "ctrl")
cbat = cbat[,pcs$name]
bxxcs = bxxcs[,pcs$name]
hbat = hbat[,phs$name]
bxxhs = bxxhs[,phs$name]
obat =obat[,pos$name]
bxxos = bxxos[,pos$name]
lobat = getLR(obat,wos)
lbxxos = getLR(bxxos,wos)
lcbat = getLR(cbat,wcs)
lbxxcs = getLR(bxxcs,wcs)
lhbat = getLR(hbat,whs)
lbxxhs = getLR(bxxhs,whs)
whs = which(phs$type == "ctrl")
wcs = which(pcs$type == "ctrl")
wos = which(pos$type == "ctrl")
lphs = phs[-whs,]
lpos = pos[-wos,]
lpcs = pcs[-wcs,]
lphs = lphs %>% arrange(cond,pcr)
lpcs = lpcs %>% arrange(cond,pcr)
lpos = lpos %>% arrange(cond,pcr)
lcbat = lcbat[,lpcs$name]
lbxxcs = lbxxcs[,lpcs$name]
lhbat = lhbat[,lphs$name]
lbxxhs = lbxxhs[,lphs$name]
lobat = lobat[,lpos$name]
lbxxos = lbxxos[,lpos$name]
opar <- par(pch=19,mfrow=c(1,2),mar=c(2.2,2,1.4,1),cex=1.2)
hsub = jul15 = grep("Jul15",colnames(lhbat))
csub = grep("Jul15",colnames(lcbat))
osub = jul15 = grep("Jul15",colnames(lobat))
colnames(lbxxos) = paste("EM.",colnames(lbxxos))
colnames(lbxxcs) = paste("EM.",colnames(lbxxcs))
colnames(lbxxhs) = paste("EM.",colnames(lbxxhs))
threshold wasn’t harsh enough
compare fitness profiles EM to standard
opar <- par(pch=19,mfrow=c(1,2),mar=c(2.2,2,1.4,1),cex=1.2)
for (i in hsub) psig(lhbat,lbxxhs,i)
for (i in osub) psig(lobat,lbxxos,i)
for (i in csub) psig(lcbat,lbxxcs,i)
compare on same scales
harg = grep('arg',colnames(lhbat))
carg = grep('arg',colnames(lcbat))
oarg = grep('arg',colnames(lobat))
opar <- par(pch=19,mfrow=c(1,2),mar=c(2.2,2,1.4,1),cex=1.2)
for (i in harg) pscale1(lhbat,i,lbxxhs,i)
for (i in carg) pscale1(lcbat,i,lbxxcs,i)
for (i in oarg) pscale1(lobat,i,lbxxos,i)
harg = grep('lys',colnames(lhbat))
carg = grep('lys',colnames(lcbat))
oarg = grep('lys',colnames(lobat))
opar <- par(pch=19,mfrow=c(1,2),mar=c(2.2,2,1.4,1),cex=1.2)
for (i in harg) pscale1(lhbat,i,lbxxhs,i)
for (i in carg) pscale1(lcbat,i,lbxxcs,i)
for (i in oarg) pscale1(lobat,i,lbxxos,i)
compare differences by GENES rejected by EM hom
hsd = setdiff(rownames(hbat),rownames(bxxhs))
hsd = sort(unique(hsd))
print('hom reject EM')
## [1] "hom reject EM"
print(hsd)
## [1] "ADE5,7" "ADE8" "ADK1" "AIM26" "AKR1"
## [6] "ALD5" "AMA1" "APL6" "APQ12" "APQ13"
## [11] "ARG8" "ARG82" "ARO1" "ARO7" "ARV1"
## [16] "ATP12" "ATP15" "AVL9" "BCS1" "BDS1"
## [21] "BEM1" "BEM2" "BOR1" "BUB3" "BUD19"
## [26] "BUD23" "BUD32" "BUR2" "CAX4" "CBS2"
## [31] "CCR4" "CCS1" "CLA4" "CLN3" "CNM67"
## [36] "COA1" "COA2" "COA3" "COG1" "COX10"
## [41] "COX14" "COX20" "CPR7" "CPS1" "CTF4"
## [46] "CTF8" "CTR1" "CYB2" "CYS4" "DEG1"
## [51] "DGR1" "DIA2" "DIA4" "DID4" "EAF1"
## [56] "ECM31" "EIS1" "ELP3" "EOS1" "ERG6"
## [61] "ERI1" "EXO5" "FLX1" "FMP52" "FUI1"
## [66] "FUN12" "FYV10" "FYV4" "FYV6" "GCR2"
## [71] "GLE2" "GND1" "GRR1" "HAP2" "HBN1"
## [76] "HFI1" "HHF2" "HIR1" "HMO1" "HNT3"
## [81] "HSL1" "HSL7" "HTZ1" "IDH1" "IDH2"
## [86] "IES2" "IMD4" "IMG2" "INM2" "IRC15"
## [91] "IWR1" "KAP122" "KAR3" "KCS1" "KRE28"
## [96] "KRE6" "KTR5" "LSO1" "LTV1" "MAP1"
## [101] "MDS3" "MMS22" "MNN10" "MON2" "MOT2"
## [106] "MPT5" "MRP10" "MRPS5" "MRT4" "MSS1"
## [111] "MSS18" "NAT3" "NDI1" "NGG1" "NOT5"
## [116] "NSR1" "NUP84" "OPI9" "PAF1" "PAU5"
## [121] "PEP3" "PET112" "PFK2" "PHO2" "PHO85"
## [126] "PIF1" "POG1" "PPZ1" "PRO1" "PRS1"
## [131] "PRS2" "PSR1" "RAD50" "REF2" "RLM1"
## [136] "RNR4" "RPA12" "RPB4" "RPL14B" "RPL1B"
## [141] "RPL20A" "RPL21A" "RPL27A" "RPL31A" "RPL39"
## [146] "RPS10A" "RPS16B" "RPS21B" "RPS23B" "RPS25B"
## [151] "RPS7A" "RPS9B" "RRF1" "RRG8" "RSM24"
## [156] "RSM26" "RTF1" "RTT109" "SAM3" "SAN1"
## [161] "SFP1" "SGO1" "SIN4" "SIT4" "SKO1"
## [166] "SLM6" "SLO1" "SLS1" "SNF12" "SNF2"
## [171] "SNF4" "SNG1" "SNT309" "SOD2" "SOM1"
## [176] "SPC1" "SRC1" "SSE1" "SSH1" "SUS1"
## [181] "SUV3" "SYM1" "TAF14" "THO2" "THP1"
## [186] "TIM8" "TKL2" "TRK1" "TRP4" "UGO1"
## [191] "UIP4" "UTP30" "VAC8" "VAM6" "VBA5"
## [196] "VMA1" "VMA11" "VMA16" "VMA21" "VMA22"
## [201] "VMA3" "VMA4" "VMA6" "VMA7" "VPH2"
## [206] "VPS1" "VPS15" "VPS33" "VPS53" "VPS65"
## [211] "YAF9" "YAL016C-B" "YAL044W-A" "YAR035C-A" "YBL065W"
## [216] "YBL100W-C" "YBR085C-A" "YBR096W" "YDL159W-A" "YDL183C"
## [221] "YDL206W" "YDR034W-B" "YDR442W" "YDR524C-B" "YEL045C"
## [226] "YER076C" "YGL088W" "YGL101W" "YGL218W" "YGR016W"
## [231] "YGR121W-A" "YGR160W" "YGR161W-C" "YGR204C-A" "YGR219W"
## [236] "YHR022C-A" "YIL134C-A" "YJL136W-A" "YKL018C-A" "YKL106C-A"
## [241] "YKL118W" "YLL006W-A" "YLR361C-A" "YML007C-A" "YML009W-B"
## [246] "YML079W" "YMR013W-A" "YMR182W-A" "YNL097C-B" "YNL184C"
## [251] "YNL277W-A" "YNR073C" "YOL097W-A" "YOL118C" "YOL164W-A"
## [256] "YOR139C" "YOR161C-C" "YOR293C-A" "YOR316C-A" "YOR331C"
## [261] "YOR376W-A" "YPR099C" "YPR159C-A"
oliver
osd = setdiff(rownames(lobat),rownames(lbxxos))
print('oliv reject EM')
## [1] "oliv reject EM"
osd = sort(osd)
print(osd)
## [1] "ADA2" "ADE1" "ADE3" "ADE6" "ADE8"
## [6] "ADK1" "AGP2" "ALF1" "AMA1" "ANP1"
## [11] "APL6" "APQ12" "ARC18" "ARF1" "ARG80"
## [16] "ARO2" "ARP6" "ARP8" "ARV1" "ARX1"
## [21] "ASC1" "ASF1" "ASK10" "ATP1" "ATP12"
## [26] "ATP15" "AVT5" "BAR1" "BAS1" "BDF1"
## [31] "BEM3" "BNA7" "BOR1" "BUB1" "BUD14"
## [36] "BUD20" "BUD23" "BUD25" "BUD30" "BUD32"
## [41] "CAX4" "CBS2" "CCE1" "CCS1" "CKB1"
## [46] "CLA4" "CLD1" "CNE1" "CNM67" "COA6"
## [51] "COQ3" "COQ8" "CPR7" "CRP1" "CSE2"
## [56] "CST6" "CTF8" "CTK2" "CTR1" "CTR9"
## [61] "CYB2" "DBP3" "DBP7" "DCI1" "DEF1"
## [66] "DEG1" "DFG16" "DHH1" "DIA2" "DON1"
## [71] "DUN1" "ECM21" "ECM31" "EDE1" "EFM2"
## [76] "EIS1" "ELM1" "ELP2" "ELP3" "END3"
## [81] "EOS1" "ERG24" "ERG3" "ERJ5" "ERP2"
## [86] "EST2" "EST3" "ETR1" "EXO5" "FEN2"
## [91] "FMP52" "FMT1" "FRA1" "FUN12" "FUN14"
## [96] "FUN30" "FYV10" "FYV6" "GAS1" "GCN1"
## [101] "GCN4" "GCN5" "GCR2" "GEP3" "GLE2"
## [106] "GLO3" "GRX5" "GRX8" "GTR2" "HAP3"
## [111] "HCR1" "HFI1" "HFM1" "HHY1" "HIR1"
## [116] "HMO1" "HNT3" "HOM3" "HSM3" "HST1"
## [121] "HTL1" "HTZ1" "HXT17" "IDH2" "IES2"
## [126] "IMD4" "IMG1" "INM2" "IOC4" "IRC18"
## [131] "IST3" "ISU1" "IZH1" "KAP120" "KAP122"
## [136] "KCS1" "KGD2" "KIN4" "KIN82" "KTI11"
## [141] "KTR1" "LAC1" "LGE1" "LOA1" "LOC1"
## [146] "LRG1" "LRP1" "LSM1" "LSM6" "LTV1"
## [151] "LYS20" "MAF1" "MAM33" "MDE1" "MDJ1"
## [156] "MDM10" "MED1" "MET10" "MET22" "MFA1"
## [161] "MFM1" "MGR2" "MGR3" "MIP6" "MIX17"
## [166] "MMS22" "MNN10" "MNN2" "MNN9" "MPO1"
## [171] "MRPL1" "MRPL20" "MRPL23" "MRPL3" "MRPL4"
## [176] "MRPL49" "MRPS5" "MRPS9" "MRT4" "MSA1"
## [181] "MSB4" "MSD1" "MSR1" "MSS1" "MSY1"
## [186] "MVB12" "NCR1" "NDI1" "NEW1" "NGG1"
## [191] "NHX1" "NKP2" "NPL3" "NPL6" "NSR1"
## [196] "NUP60" "NUP84" "OM14" "OSW7" "OTU2"
## [201] "PAP2" "PAU5" "PDH1" "PEP12" "PEX12"
## [206] "PEX13" "PHO89" "PMR1" "POP2" "PPT2"
## [211] "PPZ1" "PRM2" "PRM9" "PRS3" "PRY2"
## [216] "PSR1" "PUB1" "PUF4" "PXA2" "RAD27"
## [221] "RAD52" "RAD6" "RAM1" "RCR1" "RCR2"
## [226] "REF2" "REI1" "REV7" "RFU1" "RGC1"
## [231] "RHB1" "RHO4" "RHO5" "RIB1" "RIM1"
## [236] "RIM4" "RNR4" "ROM1" "RPB4" "RPB9"
## [241] "RPL11B" "RPL12B" "RPL21A" "RPL22A" "RPL27A"
## [246] "RPL27B" "RPL2B" "RPL36B" "RPL37A" "RPL40A"
## [251] "RPL42B" "RPL43A" "RPL6A" "RPL7B" "RPP2B"
## [256] "RPS10A" "RPS11B" "RPS17A" "RPS18A" "RPS18B"
## [261] "RPS19B" "RPS21A" "RPS23B" "RPS24A" "RPS25B"
## [266] "RPS29B" "RPS7A" "RPS9B" "RRG1" "RRG8"
## [271] "RRP8" "RSA1" "RSM23" "RSM26" "RSM7"
## [276] "RTF1" "RTG3" "RTT109" "SAF1" "SAP1"
## [281] "SAP4" "SDH1" "SEO1" "SHM2" "SIC1"
## [286] "SIN4" "SIR4" "SIT4" "SKG1" "SLS1"
## [291] "SMF2" "SMK1" "SNF2" "SNF4" "SNF6"
## [296] "SNG1" "SNT309" "SOD2" "SPC72" "SPF1"
## [301] "SPL2" "SPO23" "SPT10" "SPT4" "SRB5"
## [306] "SSE1" "SSH1" "SSQ1" "SST2" "SSZ1"
## [311] "SUS1" "SUV3" "SWC5" "SWH1" "SWI5"
## [316] "SWI6" "SWR1" "TAF14" "TAL1" "TAT2"
## [321] "TDA3" "TGS1" "THI2" "TIF1" "TKL2"
## [326] "TMA10" "TOD6" "TOF2" "TOP1" "TPN1"
## [331] "TRM9" "TRP4" "TRX3" "TSA2" "TSR3"
## [336] "TVP18" "UBP3" "UBP5" "UFO1" "UMP1"
## [341] "URC2" "URM1" "UTP30" "UTR4" "VBA1"
## [346] "VBA5" "VID22" "VID27" "VMA1" "VMA10"
## [351] "VMA11" "VMA16" "VMA2" "VMA21" "VMA5"
## [356] "VMA6" "VMA7" "VMA9" "VPH2" "VPS15"
## [361] "VPS28" "VPS52" "VPS60" "VPS71" "XRS2"
## [366] "YAF9" "YAL066W" "YAP1" "YBL059W" "YBL065W"
## [371] "YBL094C" "YBR096W" "YBR196C-A" "YCL001W-B" "YCP4"
## [376] "YCR025C" "YCR101C" "YCR102C" "YDJ1" "YDL068W"
## [381] "YDL129W" "YDL183C" "YDL211C" "YDR034C-A" "YDR433W"
## [386] "YEL045C" "YER010C" "YFL013W-A" "YFR018C" "YGL108C"
## [391] "YGL118C" "YGL188C-A" "YGR064W" "YGR117C" "YGR160W"
## [396] "YHC3" "YHK8" "YIL025C" "YIL029C" "YJL077W-B"
## [401] "YJR018W" "YJR154W" "YKE2" "YKL118W" "YKL136W"
## [406] "YKR051W" "YLR269C" "YLR283W" "YLR358C" "YML009W-B"
## [411] "YML012C-A" "YML079W" "YMR031W-A" "YMR242W-A" "YNG2"
## [416] "YNL140C" "YNL171C" "YNR073C" "YOL079W" "YOR139C"
## [421] "YOR199W" "YOR331C" "YOR376W" "YPL260W" "YPR123C"
## [426] "YPT32" "YRR1" "YTP1" "ZAP1" "ZRT2"
caudy
csd = setdiff(rownames(cbat),rownames(bxxcs))
csd = sort(csd)
print('caudy reject EM')
## [1] "caudy reject EM"
print(csd)
## [1] "AAH1" "ACO2" "ADE6" "ADE8" "ADK1"
## [6] "ADO1" "AFT1" "AIF1" "AIM10" "AIM44"
## [11] "ALF1" "ALO1" "ANP1" "APL6" "APQ12"
## [16] "ARC1" "ARC18" "ARF1" "ARG1" "ARG2"
## [21] "ARG3" "ARG4" "ARG5,6" "ARG80" "ARL1"
## [26] "ARL3" "ARO7" "ARP6" "ARP8" "ARV1"
## [31] "ARX1" "ASC1" "ASF1" "ASK10" "ASP1"
## [36] "ATG11" "ATG14" "ATG23" "ATG4" "ATP17"
## [41] "AVL9" "AVT1" "AVT5" "BAS1" "BBC1"
## [46] "BCK1" "BEM1" "BEM2" "BFR1" "BNA7"
## [51] "BOR1" "BRE1" "BUB1" "BUB3" "BUD14"
## [56] "BUD16" "BUD19" "BUD20" "BUD23" "BUD25"
## [61] "BUD28" "BUD30" "BUD32" "CAT5" "CBP2"
## [66] "CCE1" "CCM1" "CCR4" "CCS1" "CDA1"
## [71] "CDA2" "CDC10" "CHD1" "CIK1" "CIR2"
## [76] "CKB1" "CKI1" "CLA4" "CNE1" "CNM67"
## [81] "COA3" "COQ9" "COR1" "COX10" "COX11"
## [86] "COX18" "COX7" "COX9" "CPA2" "CPR7"
## [91] "CRP1" "CSM1" "CST6" "CTF4" "CTF8"
## [96] "CTK2" "CTK3" "CTR1" "CTR9" "CYT1"
## [101] "DAK1" "DAL82" "DCI1" "DDC1" "DEP1"
## [106] "DFG16" "DIA2" "DIA4" "DIG1" "DOA4"
## [111] "DOC1" "DON1" "DRS2" "DSS1" "DST1"
## [116] "DYN3" "EAF3" "EAP1" "ECM27" "ECM31"
## [121] "ECM33" "ECM38" "EFG1" "EFT2" "EIS1"
## [126] "ELM1" "ELO2" "ELP2" "ELP3" "ELP4"
## [131] "EMC10" "END3" "ENT3" "ERF2" "ERG24"
## [136] "ERJ5" "ERP2" "EST1" "EST2" "EST3"
## [141] "ETR1" "FAB1" "FAR11" "FAR7" "FEN2"
## [146] "FMP52" "FMT1" "FRA1" "FUN14" "FUN30"
## [151] "FUS3" "FYV5" "FYV7" "FZO1" "GAL80"
## [156] "GCG1" "GCN1" "GCN5" "GCR2" "GCV3"
## [161] "GEP3" "GET3" "GIS4" "GLE2" "GLR1"
## [166] "GPM2" "GRX8" "GSF2" "GTF1" "GTS1"
## [171] "GUP1" "HAC1" "HBN1" "HEF3" "HER2"
## [176] "HFM1" "HIR1" "HIS1" "HIS4" "HIS5"
## [181] "HIT1" "HMO1" "HMT1" "HOC1" "HOM2"
## [186] "HOM3" "HOM6" "HOP2" "HSL1" "HSL7"
## [191] "HSM3" "HST1" "HTZ1" "HUL4" "HUR1"
## [196] "HXT17" "ICS2" "IDH2" "IES1" "IES2"
## [201] "IKI1" "IKI3" "IMD4" "IMP1" "IMP2"
## [206] "IMP2'" "INH1" "INM2" "INO2" "INO4"
## [211] "IOC4" "IPK1" "IRC18" "ISA2" "IST3"
## [216] "IWR1" "JJJ3" "JNM1" "KAP120" "KCS1"
## [221] "KEX2" "KGD2" "KIN4" "KIN82" "KIP1"
## [226] "KRE28" "KTI11" "KTR1" "LAC1" "LAS21"
## [231] "LAT1" "LCL2" "LIP2" "LMO1" "LNP1"
## [236] "LOC1" "LRG1" "LSC2" "LSM1" "LSM6"
## [241] "LSO1" "LTV1" "LYP1" "LYS12" "LYS14"
## [246] "LYS4" "LYS5" "LYS9" "MAC1" "MAF1"
## [251] "MCT1" "MDL2" "MDM34" "MEF1" "MEF2"
## [256] "MET18" "MET22" "MET3" "MET6" "MFA1"
## [261] "MFM1" "MGM101" "MGR2" "MGR3" "MGT1"
## [266] "MHF2" "MID1" "MIP1" "MIP6" "MIX14"
## [271] "MIX17" "MMM1" "MMS1" "MMS22" "MMS4"
## [276] "MNN2" "MPC54" "MPO1" "MRH4" "MRP1"
## [281] "MRPL1" "MRPL11" "MRPL15" "MRPL16" "MRPL17"
## [286] "MRPL35" "MRPL37" "MRPL40" "MRPL51" "MRPL7"
## [291] "MRPL8" "MRPL9" "MRPS16" "MRPS9" "MRX14"
## [296] "MSB4" "MSE1" "MSH1" "MSL1" "MSM1"
## [301] "MSS1" "MSS116" "MSS51" "MST1" "MSW1"
## [306] "MSY1" "MTG2" "MUM2" "MVB12" "NCS6"
## [311] "NDI1" "NGG1" "NMA1" "NPL6" "NPT1"
## [316] "NSR1" "NUP170" "NUP60" "NUP84" "OM14"
## [321] "OPI11" "OPI8" "OSW7" "OTU2" "PAN5"
## [326] "PAU5" "PBP4" "PCP1" "PDE2" "PDH1"
## [331] "PDR3" "PET100" "PET123" "PET130" "PET309"
## [336] "PET54" "PEX12" "PEX13" "PEX21" "PFF1"
## [341] "PHO89" "PIN2" "PLM2" "PMP3" "PMR1"
## [346] "PMS1" "PMT1" "PMT2" "POP2" "POR1"
## [351] "PPA2" "PPN1" "PPT2" "PPZ1" "PRM9"
## [356] "PRP18" "PRS3" "PRY2" "PSR1" "PTK2"
## [361] "PXA2" "QCR2" "QCR8" "QDR2" "RAD27"
## [366] "RAD54" "RAD57" "RAD6" "RAI1" "RBK1"
## [371] "RCO1" "RCR1" "RCR2" "REI1" "REV7"
## [376] "REX2" "RGC1" "RHB1" "RHO5" "RIM4"
## [381] "RMD6" "RNR3" "ROM1" "RPB4" "RPB9"
## [386] "RPL11B" "RPL13A" "RPL14A" "RPL16A" "RPL19A"
## [391] "RPL21A" "RPL22A" "RPL23A" "RPL27A" "RPL2B"
## [396] "RPL31A" "RPL34B" "RPL36B" "RPL39" "RPL42B"
## [401] "RPL43A" "RPL7B" "RPO41" "RPP2B" "RPS0B"
## [406] "RPS10A" "RPS11A" "RPS11B" "RPS17A" "RPS18A"
## [411] "RPS18B" "RPS19B" "RPS21A" "RPS21B" "RPS23A"
## [416] "RPS23B" "RPS24B" "RPS25B" "RPS27B" "RPS28B"
## [421] "RPS29B" "RPS7A" "RPS8A" "RPS9B" "RRG9"
## [426] "RRT8" "RSA1" "RSC2" "RSM19" "RSM22"
## [431] "RSM24" "RSM25" "RSM26" "RSM27" "RTF1"
## [436] "RTG3" "RTS1" "RTT101" "RTT109" "SAC1"
## [441] "SAF1" "SAM37" "SAP1" "SAP4" "SBA1"
## [446] "SCS22" "SDH1" "SDS23" "SDS3" "SEA4"
## [451] "SEO1" "SER2" "SHS1" "SIC1" "SIN3"
## [456] "SIN4" "SIP3" "SIR3" "SLM6" "SLT2"
## [461] "SLX9" "SMF2" "SMK1" "SNA3" "SNF1"
## [466] "SNF2" "SNF3" "SNF4" "SNF5" "SNF7"
## [471] "SNG1" "SOD2" "SOH1" "SPF1" "SPL2"
## [476] "SPO23" "SPO77" "SPT2" "SPT21" "SPT3"
## [481] "SPT4" "SRB2" "SRN2" "SRO7" "SSA2"
## [486] "SSD1" "SSE1" "SSH1" "SSN8" "SST2"
## [491] "STE11" "STE24" "STE5" "STE50" "STE7"
## [496] "SUS1" "SWC5" "SWD1" "SWH1" "SWI5"
## [501] "SYG1" "TAT2" "TCO89" "TDA3" "TEF4"
## [506] "TFB5" "TGS1" "THI2" "THR1" "THR4"
## [511] "TIF1" "TKL2" "TMA10" "TOD6" "TOF2"
## [516] "TPN1" "TRM12" "TRM7" "TRM9" "TRP2"
## [521] "TRP3_dub" "TRX3" "TSA2" "TSR3" "UBC12"
## [526] "UBC4" "UBP5" "UME6" "URA1" "URA7"
## [531] "URC2" "URM1" "UTP30" "UTR1" "VBA1"
## [536] "VBA2" "VBA5" "VID22" "VID27" "VMA11"
## [541] "VMA21" "VPS1" "VPS41" "VPS52" "VPS53"
## [546] "VPS61" "VPS69" "VPS70" "VPS71" "VPS73"
## [551] "XRS2" "YAF9" "YAK1" "YAL042C-A" "YAL066W"
## [556] "YAP1" "YBL059W" "YBL065W" "YBL071C-B" "YBR096W"
## [561] "YBR178W" "YBR225W" "YCK1" "YCL001W-B" "YCL002C"
## [566] "YCL007C" "YCL021W-A" "YCR101C" "YCR102C" "YDJ1"
## [571] "YDL041W" "YDL118W" "YDL129W" "YDL183C" "YDL211C"
## [576] "YDL241W" "YDR048C" "YDR149C" "YDR417C" "YDR442W"
## [581] "YDR455C" "YDR514C" "YEF1" "YER010C" "YER046W-A"
## [586] "YER156C" "YFL013W-A" "YFR045W" "YGK3" "YGL042C"
## [591] "YGL072C" "YGL088W" "YGL108C" "YGL118C" "YGL177W"
## [596] "YGL188C-A" "YGL218W" "YGR064W" "YGR117C" "YGR160W"
## [601] "YHC3" "YHK8" "YHR050W-A" "YIA6" "YIL025C"
## [606] "YIL029C" "YJL043W" "YJL055W" "YJL077W-B" "YJR154W"
## [611] "YKE2" "YKL033W-A" "YKL136W" "YKL169C" "YKR051W"
## [616] "YLR184W" "YLR264C-A" "YLR269C" "YLR283W" "YLR297W"
## [621] "YLR311C" "YLR358C" "YLR434C" "YML009W-B" "YML012C-A"
## [626] "YML079W" "YML094C-A" "YMR001C-A" "YMR114C" "YMR194C-A"
## [631] "YMR242W-A" "YNG2" "YNL120C" "YNL140C" "YNL146W"
## [636] "YNL198C" "YNL226W" "YNR068C" "YOL038C-A" "YOL079W"
## [641] "YOL118C" "YOL166W-A" "YOR139C" "YOR199W" "YOR283W"
## [646] "YOR376W" "YPL062W" "YPL113C" "YPL260W" "YPR092W"
## [651] "YPR098C" "YPT32" "YRR1" "YTP1" "ZAP1"
## [656] "ZRT2" "ZUO1"
compare differences by GENES rejected by STD hom
hsd1 = setdiff(rownames(bxxhs),rownames(hbat))
if(length(hsd1)==0) print('no hom genes rejected by standard')
## [1] "no hom genes rejected by standard"
if(length(hsd1) > 0) print(hsd1)
osd1 = setdiff(rownames(bxxos),rownames(obat))
if(length(osd1)==0) print('no oliver genes rejected by standard')
if(length(osd1) > 0) print(osd1)
## [1] "PIM1"
csd1 = setdiff(rownames(bxxcs),rownames(cbat))
if(length(csd1)==0) print('no caudy genes rejected by standard')
## [1] "no caudy genes rejected by standard"
if(length(osd1) > 0) print(csd1)
## character(0)
summary tables
xtab = table(mix$call,mix$pool)
stab = table(stand$call,stand$pool)
pxtab = round(xtab/nrow(xdf),2)
pstab = round(stab/nrow(sdf),2)
print(xtab)
##
## mixcau mixhom mixoliv
## 0 836 333 419
## 1 3873 4376 4290
print(stab)
##
## standcau standhom standoliv
## 0 362 253 173
## 1 4530 4639 4719
print(pxtab)
##
## mixcau mixhom mixoliv
## 0 0.18 0.07 0.09
## 1 0.82 0.93 0.91
print(pstab)
##
## standcau standhom standoliv
## 0 0.07 0.05 0.04
## 1 0.93 0.95 0.96
plot median vs mad of tags rejected ONLY by mixEM and not by our standard bkrnd thresholds:
pmad1 <- function(mat,...) {
med <- apply(mat,1,median)
mad <- apply(mat,1,mad)
q1 <- quantile(mad,0.25)
q3 <- quantile(mad,0.75)
IQR <- q3-q1
I1 <- q1 - 1.5*IQR
I2 <- q3 + 1.5*IQR
plot(med,mad,...)
wlow <- which(mad < I1)
wup <- which(mad > I2)
points(med[wup],mad[wup],col="red")
text(med,mad,names(mad),pos=4,cex = 0.8)
wup
}
opar <- par(pch=19,mfrow=c(1,1),mar=c(2.2,2,1.4,1),cex=1.2)
hmad = pmad1(hbat[hsd,whs],main = "hom mixEM reject",cex.main=0.7)
omad= pmad1(obat[osd,wos],main = "oliver mixEM reject",cex.main=0.7)
cmad = pmad1(cbat[csd,wcs],main = "caudy mixEM reject",cex.main=0.7)