Use self progeny to impute, but not phase, mom. Base on old mctwos code, so uses haplotypes even though here we ignore phasing.
ran.hap=function(numloci,p){sapply(1:numloci,function(x) rbinom(1,1,p[x]))}
Error model assumes errors have equal likelihood of being called either alternative genotype.
add_error<-function(diploid,hom.error,het.error){
hets_with_error=sample(which(diploid==1),round(het.error*length(which(diploid==1))))
hom0_with_error=sample(which(diploid==0),round(hom.error*length(which(diploid==0))))
hom2_with_error=sample(which(diploid==2),round(hom.error*length(which(diploid==2))))
diploid=replace(diploid,hets_with_error,sample(c(0,2),length(hets_with_error),replace=T) )
diploid=replace(diploid,hom0_with_error,sample(c(1,2),length(hom0_with_error),replace=T) )
diploid=replace(diploid,hom2_with_error,sample(c(1,0),length(hom2_with_error),replace=T) )
return(diploid)
}
Returns list of true [[1]] and observed [[2]] kid
kid=function(mom,dad,het.error,hom.error){
k1=mom[[rbinom(1,1,.5)+1]]
k2=dad[[rbinom(1,1,.5)+1]]
true_kid=k1+k2
obs_kid=add_error(true_kid,hom.error,het.error)
return(list(true_kid,obs_kid))
}
hw_probs<-function(x){ return(c(x^2,2*x*(1-x),(1-x)^2))}
We have obs. mom and obs. (selfed) kids. We want to know \(P(G|\theta)\), and \(P(G|\theta) \propto P(\theta|G) \times (G)\), where \(\theta\) is observed data. This consists of observed genotypes (\(G'\)) of both mom and kids. So: \(P(G|\theta)\propto \left( \prod\limits_{i=1}^{k}{P(G'_k|G)} \right) \times P(G'_{mom}|G) \times P(G)\) This function is to impute mom’s genotype from a progeny array of k kids at a single locus.
# inferred_mom=1 -> 00, 2->01, 3->11
infer_mom<-function(obs_mom,locus,progeny,p){
mom_probs=as.numeric()
for(inferred_mom in 1:3){
#P(G'|G)
pgg=gen_error_mat[inferred_mom,obs_mom[locus]+1] #+1 because obs_mom is 0,1, or 2
#P(G)
pg=hw_probs(p[locus])[inferred_mom]
#P(kids|G) sum of logs instead of product
pkg=sum(sapply(1:length(progeny), function(z) log(sum(probs[[inferred_mom]][,progeny[[z]][[2]][locus]+1]))))
mom_probs[inferred_mom]=pkg+log(pgg)+log(pg)
}
return(which(mom_probs==max(mom_probs))-1)
}
size.array=20 # size of progeny array
het.error=0.7 # het->hom error
hom.error=0.002 # hom->other error
numloci=1000
x=1:99/100 #0.01 bins of freq.
freq=1/x
sfs=as.numeric();
for(i in 1:99){sfs=c(sfs,rep(x[i],100*freq[i]))}
# row 1 is true_gen 00, row2 is true_gen 01, row 3 is true_gen 11
# cols are obs. genotype (00,01,11)
# This is where Mendel comes in (see matrix below)
gen_error_mat<-matrix(c(1-hom.error,hom.error/2,hom.error/2,het.error/2,1-het.error,het.error/2,hom.error/2,hom.error/2,1-hom.error),byrow=T,nrow=3,ncol=3)
probs<-vector("list",3)
probs[[1]]<-gen_error_mat*matrix(c(1, 0, 0), nrow = 3,byrow=F,ncol=3)
probs[[2]]<-gen_error_mat*matrix(c(1/4, 1/2, 1/4), nrow = 3,byrow=F,ncol=3)
probs[[3]]<-gen_error_mat*matrix(c(0, 0, 1), nrow = 3,byrow=F,ncol=3)
Makes mom with error.
p=sample(sfs,numloci) #get freqs for all loci
a1=ran.hap(numloci,p) #make haplotypes
a2=ran.hap(numloci,p)
true_mom=list(a1,a2) #phased
obs_mom=add_error(a1+a2,hom.error,het.error) #convert to diploid genotype
progeny<-vector("list",size.array)
progeny<-lapply(1:size.array, function(a) kid(true_mom,true_mom,het.error,hom.error))
Do all loci, test mom
estimated_mom=sapply(1:numloci, function(a) infer_mom(obs_mom,a,progeny,p) )
sum(estimated_mom==true_mom[[1]]+true_mom[[2]])
## [1] 1000
Here we run 100 sims. All use same sfs to save a wee bit of time. Returns mean number of loci correct.
sims=100
size.array=5 # size of progeny array
het.error=0.7 # het->hom error
hom.error=0.002 # hom->other error
numloci=1000
errors.correct=FALSE # can assume we know error rates or not
freqs.correct=FALSE # can assume we know freqs or not
x=1:99/100 #0.01 bins of freq.
freq=1/x
sfs=as.numeric();
for(i in 1:99){sfs=c(sfs,rep(x[i],100*freq[i]))}
gen_error_mat<-matrix(c(1-hom.error,hom.error/2,hom.error/2,het.error/2,1-het.error,het.error/2,hom.error/2,hom.error/2,1-hom.error),byrow=T,nrow=3,ncol=3)
probs[[1]]<-gen_error_mat*matrix(c(1, 0, 0), nrow = 3,byrow=F,ncol=3)
probs[[2]]<-gen_error_mat*matrix(c(1/4, 1/2, 1/4), nrow = 3,byrow=F,ncol=3)
probs[[3]]<-gen_error_mat*matrix(c(0, 0, 1), nrow = 3,byrow=F,ncol=3)
p=sample(sfs,numloci) #get freqs for all loci
sim=as.numeric()
for(mysim in 1:sims){
if(errors.correct==FALSE){
het.error=runif(1)/1.67+0.4 # random 0.4-1
hom.error=10^-(runif(1)*4+1) # random
}
if(freqs.correct==FALSE){
og.p=p
p=sapply(1:numloci, function(a) sum(rbinom(140,1,p[a]))/140)
p[which(p==0)]=1/70;
}
a1=ran.hap(numloci,p) #make haplotypes
a2=ran.hap(numloci,p)
true_mom=list(a1,a2) #phased (not necessary here, but a holdover)
obs_mom=add_error(a1+a2,hom.error,het.error) #convert to diploid genotype
progeny<-vector("list",size.array)
progeny<-lapply(1:size.array, function(a) kid(true_mom,true_mom,het.error,hom.error))
estimated_mom=sapply(1:numloci, function(a) infer_mom(obs_mom,a,progeny,p) )
sim[mysim]=sum(estimated_mom==true_mom[[1]]+true_mom[[2]])
# print((p-og.p)/og.p)[which(estimated_mom!=(true_mom[[1]]+true_mom[[2]]))]
}
mean(sim)
## [1] 858.13