AEEIP_vignette

Abstract

To evaluate antibody quality,we propose the novel Antibody Efficiency Estimation from Immunoprecipitation data (AEEIP) model. It contains a maximum a posterior (MAP) estimate the efficiency of antibody. It is able to calculate the proportion of mixed Input sample in IP sample. It also try to change the hyper parameter of beat prior to get the better result.Please don’t hesitate to contact Haozhe.Wang17@student.xjtlu.edu.cn if you have any questions. Table of content

0.1

simulate data

This section is getting the simulate data with 0.1 mix rate and replicates is 3

# the mix rate is 0.1
set.seed(1)
data <-simulateData(T=10000, hyper_a_1=20,hyper_b_1 =0.2, hyper_a_2=50,hyper_b_2 =0.05,
pure_rate=0.99,mix_rate=0.1,mode=2,replicates=3)

corrected the bias

replicates 1

corrected the bias for the replicates 1

set.seed(1)
# firstly we set the initial tau is near to 0 and let the size of beta to be large enough
# to see the plot
res <- AEEIP_test(r_t_pure=data[[3]][[1]],r_t_mix=data[[4]][[1]],beta=sum(data[[4]][[1]])*1e1,alpha_a=NA,
alpha_b=NA,tau_a_vector=0,l=1,prop_vector=seq(0.01,0.99,by=0.01),
max_iteration =1e6,stop_stepsize=1e-7)

res1 <- AEEIP_test_final(res,r_t_pure=data[[3]][[1]],r_t_mix=data[[4]][[1]],beta=sum(data[[4]][[1]])*1e1,alpha_a=NA,                      alpha_b=NA,tau_a=0,l=1,prop_vector=seq(0.01,0.99,by=0.01),
                     max_iteration =1e6,stop_stepsize=1e-7)

## [1] "the antibody bias is around 0.108844542801534"

res_replicate1 <- res1[[1]]
data.frame_1 <- res1[[2]]
data.frame_2 <- res1[[3]]


plot <- ggplot() + geom_point(data=data.frame_2,aes(x=tau_a_no_bayes,y=diff))+
     geom_hline(aes(yintercept=0), colour="#BB0000", linetype="dashed")

plot

res_replicate1$tau_a_bayes

## [1] 0.1088445

replicates 2

set.seed(1)
# firstly we set the initial tau is near to 0 and let the size of beta to be large enough,
# to see the plot
res <- AEEIP_test(r_t_pure=data[[3]][[2]],r_t_mix=data[[4]][[2]],beta=sum(data[[4]][[2]])*1e1,alpha_a=NA,
alpha_b=NA,tau_a_vector=0,l=1,prop_vector=seq(0.01,0.99,by=0.01),
max_iteration =1e6,stop_stepsize=1e-7)


res1 <- AEEIP_test_final(res,r_t_pure=data[[3]][[2]],r_t_mix=data[[4]][[2]],beta=sum(data[[4]][[2]])*1e1,alpha_a=NA,                      alpha_b=NA,tau_a=0,l=1,prop_vector=seq(0.01,0.99,by=0.01),
                     max_iteration =1e6,stop_stepsize=1e-7)

## [1] "the antibody bias is around 0.108819650014971"

res_replicate2 <- res1[[1]]
data.frame_1 <- res1[[2]]
data.frame_2 <- res1[[3]]


plot <- ggplot() + geom_point(data=data.frame_2,aes(x=tau_a_no_bayes,y=diff))+
     geom_hline(aes(yintercept=0), colour="#BB0000", linetype="dashed")

plot

res_replicate2$tau_a_bayes

## [1] 0.1088197

replicates 3

set.seed(1)
# firstly we set the initial tau is near to 0 and let the size of beta to be large enough,
# to see the plot
res <- AEEIP_test(r_t_pure=data[[3]][[3]],r_t_mix=data[[4]][[3]],beta=sum(data[[4]][[3]])*1e1,alpha_a=NA,
alpha_b=NA,tau_a_vector=0,l=1,prop_vector=seq(0.01,0.99,by=0.01),
max_iteration =1e6,stop_stepsize=1e-7)

res1 <- AEEIP_test_final(res,r_t_pure=data[[3]][[3]],r_t_mix=data[[4]][[3]],beta=sum(data[[4]][[3]])*1e1,alpha_a=NA,                      alpha_b=NA,tau_a=0,l=1,prop_vector=seq(0.01,0.99,by=0.01),
                     max_iteration =1e6,stop_stepsize=1e-7)

## [1] "the antibody bias is around 0.1088061616759"

res_replicate3 <- res1[[1]]
data.frame_1 <- res1[[2]]
data.frame_2 <- res1[[3]]

plot <- ggplot() + geom_point(data=data.frame_2,aes(x=tau_a_no_bayes,y=diff))+
     geom_hline(aes(yintercept=0), colour="#BB0000", linetype="dashed")

plot

res_replicate3$tau_a_bayes

## [1] 0.1088062

use the non_mix real data to run the DESeq2

use DESeq2 to analysis the real data, to see the differential between the input and IP.

    counts <- cbind(as.data.frame(data[[1]]),as.data.frame(data[[2]]))
    design = data.frame(
                    Trt = c(rep("input", dim(as.data.frame(data[[1]]))[2]),
                            rep("IP", dim(as.data.frame(data[[1]]))[2])))
    model = ~  Trt 
    # 
    dds <- DESeq2::DESeqDataSetFromMatrix(countData = counts,
                                colData = design,
                                design = model)

## converting counts to integer mode

    data_size <- as.matrix(counts)

    # first log
    log_data <- log(data_size)
    log_data [is.infinite(log_data)] <- NA
    log_mean <- rowMeans(log_data)
    log_s <- log_data-log_mean

    # then exp
    s_size <- exp(apply(log_s,2,function(x)median(x,na.rm=TRUE)))
    names(s_size) <- NULL
    dds$sizeFactor <- s_size
    # DESeq2::sizeFactors(dds) = s_size

    dds <- DESeq2::DESeq(dds, test = "Wald")

## using pre-existing size factors

## estimating dispersions

## gene-wise dispersion estimates

## mean-dispersion relationship

## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.

## final dispersion estimates

## fitting model and testing

    res_DESeq2_real = results(dds)#, name = "Trt_IP_vs_input"
    
    
    res_DESeq2_real_01_1 <- res_DESeq2_real

use the mix data to run the DESeq2

use DESeq2 to analysis the mix data, to see the differential between the input and IP.

    counts <- cbind(as.data.frame(data[[3]]),as.data.frame(data[[4]]))
    design = data.frame(
                    Trt = c(rep("input", dim(as.data.frame(data[[3]]))[2]),
                            rep("IP", dim(as.data.frame(data[[3]]))[2])))
    model = ~  Trt 
    # 
    dds <- DESeqDataSetFromMatrix(countData = counts,
                                colData = design,
                                design = model)

## converting counts to integer mode

    data_size <- as.matrix(counts)
  
  
    # first log
    log_data <- log(data_size)
    log_data [is.infinite(log_data)] <- NA
    log_mean <- rowMeans(log_data)
    log_s <- log_data-log_mean
    
    # then exp
    s_size <- exp(apply(log_s,2,function(x)median(x,na.rm=TRUE)))
    names(s_size) <- NULL
    sizeFactors(dds) = s_size
    
    dds <- DESeq(dds, test = "Wald")

## using pre-existing size factors

## estimating dispersions

## gene-wise dispersion estimates

## mean-dispersion relationship

## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.

## final dispersion estimates

## fitting model and testing

    res_DESeq2_mix = DESeq2::results(dds)
    
    res_DESeq2_mix_01_1 <- res_DESeq2_mix

use the corrected mix data to run the DESeq2

input <- as.data.frame(data[[3]])
IP <- as.data.frame(data[[4]])

res_DESeq2_mix_corrected <- DESeq2_AEEIP(list(res_replicate1,res_replicate2,res_replicate3),input,IP)

## converting counts to integer mode

## using pre-existing size factors

## estimating dispersions

## gene-wise dispersion estimates

## mean-dispersion relationship

## final dispersion estimates

## fitting model and testing

evaluate

# we see the difference is small for the corrected mix data
 

# to see the i=relation between the three results
index_real <- which(res_DESeq2_real$pvalue<=5e-2)
threshold1 <- res_DESeq2_mix_corrected$pvalue[order(res_DESeq2_mix_corrected$pvalue)[length(index_real)]]
threshold2 <- res_DESeq2_mix$pvalue[order(res_DESeq2_mix$pvalue)[length(index_real)]]
index_mix_corrected <- which(res_DESeq2_mix_corrected$pvalue<=threshold1)
index_mix <- which(res_DESeq2_mix$pvalue<=threshold2)
 
index_list <- list(real=index_real,corrected=index_mix_corrected,
                   mix=index_mix)
 

ggVennDiagram(index_list,category.names = c("real","corrected","mix"),
              label = "count", 
              label_color = "black",
              label_alpha = 0,
              edge_lty = "dashed", 
              edge_size = 1) +
  scale_fill_gradient(low="white",high = "#b9292b",name = "gene count")

# we can see the intersection between real data and corrected mix data is better than the mix data.