Abstract
To evaluate antibody quality,we propose the novel Antibody Efficiency Estimation from Immunoprecipitation data (AEEIP) model. It contains a maximum a posterior (MAP) estimate the efficiency of antibody. It is able to calculate the proportion of mixed Input sample in IP sample. It also try to change the hyper parameter of beat prior to get the better result.Please don’t hesitate to contact Haozhe.Wang17@student.xjtlu.edu.cn if you have any questions.# install.packages("devtools")
# library("devtools")
# install_github("whz991026/AEEIP")
library(AEEIP)
Load the other packages:
This section is getting the simulate data with 0.1 mix rate and replicates is 3
# the mix rate is 0.1
set.seed(1)
data <-simulateData(T=10000, hyper_a_1=20,hyper_b_1 =0.2, hyper_a_2=50,hyper_b_2 =0.05,
pure_rate=0.99,mix_rate=0.1,mode=2,replicates=3)
corrected the bias for the replicates 1
set.seed(1)
# firstly we set the initial tau is near to 0 and let the size of beta to be large enough
# to see the plot
res <- AEEIP_test(r_t_pure=data[[3]][[1]],r_t_mix=data[[4]][[1]],beta=sum(data[[4]][[1]])*1e1,alpha_a=NA,
alpha_b=NA,tau_a_vector=0,l=1,prop_vector=seq(0.01,0.99,by=0.01),
max_iteration =1e6,stop_stepsize=1e-7)
res1 <- AEEIP_test_final(res,r_t_pure=data[[3]][[1]],r_t_mix=data[[4]][[1]],beta=sum(data[[4]][[1]])*1e1,alpha_a=NA, alpha_b=NA,tau_a=0,l=1,prop_vector=seq(0.01,0.99,by=0.01),
max_iteration =1e6,stop_stepsize=1e-7)
## [1] "the antibody bias is around 0.108844542801534"
res_replicate1 <- res1[[1]]
data.frame_1 <- res1[[2]]
data.frame_2 <- res1[[3]]
plot <- ggplot() + geom_point(data=data.frame_2,aes(x=tau_a_no_bayes,y=diff))+
geom_hline(aes(yintercept=0), colour="#BB0000", linetype="dashed")
plot
res_replicate1$tau_a_bayes
## [1] 0.1088445
set.seed(1)
# firstly we set the initial tau is near to 0 and let the size of beta to be large enough,
# to see the plot
res <- AEEIP_test(r_t_pure=data[[3]][[2]],r_t_mix=data[[4]][[2]],beta=sum(data[[4]][[2]])*1e1,alpha_a=NA,
alpha_b=NA,tau_a_vector=0,l=1,prop_vector=seq(0.01,0.99,by=0.01),
max_iteration =1e6,stop_stepsize=1e-7)
res1 <- AEEIP_test_final(res,r_t_pure=data[[3]][[2]],r_t_mix=data[[4]][[2]],beta=sum(data[[4]][[2]])*1e1,alpha_a=NA, alpha_b=NA,tau_a=0,l=1,prop_vector=seq(0.01,0.99,by=0.01),
max_iteration =1e6,stop_stepsize=1e-7)
## [1] "the antibody bias is around 0.108819650014971"
res_replicate2 <- res1[[1]]
data.frame_1 <- res1[[2]]
data.frame_2 <- res1[[3]]
plot <- ggplot() + geom_point(data=data.frame_2,aes(x=tau_a_no_bayes,y=diff))+
geom_hline(aes(yintercept=0), colour="#BB0000", linetype="dashed")
plot
res_replicate2$tau_a_bayes
## [1] 0.1088197
set.seed(1)
# firstly we set the initial tau is near to 0 and let the size of beta to be large enough,
# to see the plot
res <- AEEIP_test(r_t_pure=data[[3]][[3]],r_t_mix=data[[4]][[3]],beta=sum(data[[4]][[3]])*1e1,alpha_a=NA,
alpha_b=NA,tau_a_vector=0,l=1,prop_vector=seq(0.01,0.99,by=0.01),
max_iteration =1e6,stop_stepsize=1e-7)
res1 <- AEEIP_test_final(res,r_t_pure=data[[3]][[3]],r_t_mix=data[[4]][[3]],beta=sum(data[[4]][[3]])*1e1,alpha_a=NA, alpha_b=NA,tau_a=0,l=1,prop_vector=seq(0.01,0.99,by=0.01),
max_iteration =1e6,stop_stepsize=1e-7)
## [1] "the antibody bias is around 0.1088061616759"
res_replicate3 <- res1[[1]]
data.frame_1 <- res1[[2]]
data.frame_2 <- res1[[3]]
plot <- ggplot() + geom_point(data=data.frame_2,aes(x=tau_a_no_bayes,y=diff))+
geom_hline(aes(yintercept=0), colour="#BB0000", linetype="dashed")
plot
res_replicate3$tau_a_bayes
## [1] 0.1088062
use DESeq2 to analysis the real data, to see the differential between the input and IP.
counts <- cbind(as.data.frame(data[[1]]),as.data.frame(data[[2]]))
design = data.frame(
Trt = c(rep("input", dim(as.data.frame(data[[1]]))[2]),
rep("IP", dim(as.data.frame(data[[1]]))[2])))
model = ~ Trt
#
dds <- DESeq2::DESeqDataSetFromMatrix(countData = counts,
colData = design,
design = model)
## converting counts to integer mode
data_size <- as.matrix(counts)
# first log
log_data <- log(data_size)
log_data [is.infinite(log_data)] <- NA
log_mean <- rowMeans(log_data)
log_s <- log_data-log_mean
# then exp
s_size <- exp(apply(log_s,2,function(x)median(x,na.rm=TRUE)))
names(s_size) <- NULL
dds$sizeFactor <- s_size
# DESeq2::sizeFactors(dds) = s_size
dds <- DESeq2::DESeq(dds, test = "Wald")
## using pre-existing size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
## function: y = a/x + b, and a local regression fit was automatically substituted.
## specify fitType='local' or 'mean' to avoid this message next time.
## final dispersion estimates
## fitting model and testing
res_DESeq2_real = results(dds)#, name = "Trt_IP_vs_input"
res_DESeq2_real_01_1 <- res_DESeq2_real
use DESeq2 to analysis the mix data, to see the differential between the input and IP.
counts <- cbind(as.data.frame(data[[3]]),as.data.frame(data[[4]]))
design = data.frame(
Trt = c(rep("input", dim(as.data.frame(data[[3]]))[2]),
rep("IP", dim(as.data.frame(data[[3]]))[2])))
model = ~ Trt
#
dds <- DESeqDataSetFromMatrix(countData = counts,
colData = design,
design = model)
## converting counts to integer mode
data_size <- as.matrix(counts)
# first log
log_data <- log(data_size)
log_data [is.infinite(log_data)] <- NA
log_mean <- rowMeans(log_data)
log_s <- log_data-log_mean
# then exp
s_size <- exp(apply(log_s,2,function(x)median(x,na.rm=TRUE)))
names(s_size) <- NULL
sizeFactors(dds) = s_size
dds <- DESeq(dds, test = "Wald")
## using pre-existing size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
## function: y = a/x + b, and a local regression fit was automatically substituted.
## specify fitType='local' or 'mean' to avoid this message next time.
## final dispersion estimates
## fitting model and testing
res_DESeq2_mix = DESeq2::results(dds)
res_DESeq2_mix_01_1 <- res_DESeq2_mix
input <- as.data.frame(data[[3]])
IP <- as.data.frame(data[[4]])
res_DESeq2_mix_corrected <- DESeq2_AEEIP(list(res_replicate1,res_replicate2,res_replicate3),input,IP)
## converting counts to integer mode
## using pre-existing size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
# we see the difference is small for the corrected mix data
# to see the i=relation between the three results
index_real <- which(res_DESeq2_real$pvalue<=5e-2)
threshold1 <- res_DESeq2_mix_corrected$pvalue[order(res_DESeq2_mix_corrected$pvalue)[length(index_real)]]
threshold2 <- res_DESeq2_mix$pvalue[order(res_DESeq2_mix$pvalue)[length(index_real)]]
index_mix_corrected <- which(res_DESeq2_mix_corrected$pvalue<=threshold1)
index_mix <- which(res_DESeq2_mix$pvalue<=threshold2)
index_list <- list(real=index_real,corrected=index_mix_corrected,
mix=index_mix)
ggVennDiagram(index_list,category.names = c("real","corrected","mix"),
label = "count",
label_color = "black",
label_alpha = 0,
edge_lty = "dashed",
edge_size = 1) +
scale_fill_gradient(low="white",high = "#b9292b",name = "gene count")
# we can see the intersection between real data and corrected mix data is better than the mix data.