Self imputation

Use self progeny to impute, but not phase, mom. Base on old mctwos code, so uses haplotypes even though here we ignore phasing.

Functions

Create random haplotype

ran.hap=function(numloci,p){sapply(1:numloci,function(x) rbinom(1,1,p[x]))}

Add error to diploid

Error model assumes errors have equal likelihood of being called either alternative genotype.

add_error<-function(diploid,hom.error,het.error){
    hets_with_error=sample(which(diploid==1),round(het.error*length(which(diploid==1))))
    hom0_with_error=sample(which(diploid==0),round(hom.error*length(which(diploid==0))))
    hom2_with_error=sample(which(diploid==2),round(hom.error*length(which(diploid==2))))
    diploid=replace(diploid,hets_with_error,sample(c(0,2),length(hets_with_error),replace=T)  )
    diploid=replace(diploid,hom0_with_error,sample(c(1,2),length(hom0_with_error),replace=T)  )
    diploid=replace(diploid,hom2_with_error,sample(c(1,0),length(hom2_with_error),replace=T)  )
    return(diploid)
}

Make a kid

Returns list of true [[1]] and observed [[2]] kid

kid=function(mom,dad,het.error,hom.error){
    k1=mom[[rbinom(1,1,.5)+1]]
    k2=dad[[rbinom(1,1,.5)+1]]
    true_kid=k1+k2
    obs_kid=add_error(true_kid,hom.error,het.error)
    return(list(true_kid,obs_kid))
}

Return HW probs

hw_probs<-function(x){ return(c(x^2,2*x*(1-x),(1-x)^2))}

Infer mom’s genotype

We have obs. mom and obs. (selfed) kids. We want to know \(P(G|\theta)\), and \(P(G|\theta) \propto P(\theta|G) \times (G)\), where \(\theta\) is observed data. This consists of observed genotypes (\(G'\)) of both mom and kids. So: \(P(G|\theta)\propto \left( \prod\limits_{i=1}^{k}{P(G'_k|G)} \right) \times P(G'_{mom}|G) \times P(G)\) This function is to impute mom’s genotype from a progeny array of k kids at a single locus.

# inferred_mom=1 -> 00, 2->01, 3->11
infer_mom<-function(obs_mom,locus,progeny,p){
  mom_probs=as.numeric()
  for(inferred_mom in 1:3){
    #P(G'|G)
    pgg=gen_error_mat[inferred_mom,obs_mom[locus]+1] #+1 because obs_mom is 0,1, or 2
    #P(G)
    pg=hw_probs(p[locus])[inferred_mom]
    #P(kids|G) sum of logs instead of product
    pkg=sum(sapply(1:length(progeny), function(z) log(sum(probs[[inferred_mom]][,progeny[[z]][[2]][locus]+1]))))
    mom_probs[inferred_mom]=pkg+log(pgg)+log(pg)
  }
  return(which(mom_probs==max(mom_probs))-1)
}

Params and probabilities

Parameters

size.array=20 # size of progeny array
het.error=0.7 # het->hom error
hom.error=0.002 # hom->other error
numloci=1000

Set up the neutral SFS

x=1:99/100 #0.01 bins of freq.
freq=1/x
sfs=as.numeric(); 
for(i in 1:99){sfs=c(sfs,rep(x[i],100*freq[i]))}

Probabilities

# row 1 is true_gen 00, row2 is true_gen 01, row 3 is true_gen 11
# cols are obs. genotype (00,01,11)
# This is where Mendel comes in (see matrix below)
gen_error_mat<-matrix(c(1-hom.error,hom.error/2,hom.error/2,het.error/2,1-het.error,het.error/2,hom.error/2,hom.error/2,1-hom.error),byrow=T,nrow=3,ncol=3)

probs<-vector("list",3)
probs[[1]]<-gen_error_mat*matrix(c(1, 0, 0), nrow = 3,byrow=F,ncol=3)
probs[[2]]<-gen_error_mat*matrix(c(1/4, 1/2, 1/4), nrow = 3,byrow=F,ncol=3)
probs[[3]]<-gen_error_mat*matrix(c(0, 0, 1), nrow = 3,byrow=F,ncol=3)

Simulate and test

Make mom

Makes mom with error.

p=sample(sfs,numloci) #get freqs for all loci
a1=ran.hap(numloci,p) #make haplotypes
a2=ran.hap(numloci,p)

true_mom=list(a1,a2) #phased 
obs_mom=add_error(a1+a2,hom.error,het.error) #convert to diploid genotype

Make selfed progeny array

progeny<-vector("list",size.array)
progeny<-lapply(1:size.array, function(a) kid(true_mom,true_mom,het.error,hom.error))

Infer stuff

Do all loci, test mom

estimated_mom=sapply(1:numloci, function(a) infer_mom(obs_mom,a,progeny,p) )
sum(estimated_mom==true_mom[[1]]+true_mom[[2]])

## [1] 1000

Try lots

Here we run 100 sims. All use same sfs to save a wee bit of time. Returns mean number of loci correct.

sims=100
size.array=5 # size of progeny array
het.error=0.7 # het->hom error
hom.error=0.002 # hom->other error
numloci=1000 
errors.correct=FALSE # can assume we know error rates or not
freqs.correct=FALSE # can assume we know freqs or not

x=1:99/100 #0.01 bins of freq.
freq=1/x
sfs=as.numeric(); 
for(i in 1:99){sfs=c(sfs,rep(x[i],100*freq[i]))}
gen_error_mat<-matrix(c(1-hom.error,hom.error/2,hom.error/2,het.error/2,1-het.error,het.error/2,hom.error/2,hom.error/2,1-hom.error),byrow=T,nrow=3,ncol=3)
probs[[1]]<-gen_error_mat*matrix(c(1, 0, 0), nrow = 3,byrow=F,ncol=3)
probs[[2]]<-gen_error_mat*matrix(c(1/4, 1/2, 1/4), nrow = 3,byrow=F,ncol=3)
probs[[3]]<-gen_error_mat*matrix(c(0, 0, 1), nrow = 3,byrow=F,ncol=3)
p=sample(sfs,numloci) #get freqs for all loci

sim=as.numeric()
for(mysim in 1:sims){
  
if(errors.correct==FALSE){
  het.error=runif(1)/1.67+0.4 # random 0.4-1
  hom.error=10^-(runif(1)*4+1) # random
}
if(freqs.correct==FALSE){
    og.p=p
    p=sapply(1:numloci, function(a) sum(rbinom(140,1,p[a]))/140) 
    p[which(p==0)]=1/70;
}
  
  a1=ran.hap(numloci,p) #make haplotypes
  a2=ran.hap(numloci,p)
  true_mom=list(a1,a2) #phased (not necessary here, but a holdover)
  obs_mom=add_error(a1+a2,hom.error,het.error) #convert to diploid genotype
  progeny<-vector("list",size.array)
  progeny<-lapply(1:size.array, function(a) kid(true_mom,true_mom,het.error,hom.error))
  
  estimated_mom=sapply(1:numloci, function(a) infer_mom(obs_mom,a,progeny,p) ) 
  sim[mysim]=sum(estimated_mom==true_mom[[1]]+true_mom[[2]])
 # print((p-og.p)/og.p)[which(estimated_mom!=(true_mom[[1]]+true_mom[[2]]))]
}
mean(sim)

## [1] 858.13