Set values. Read depth is per genome, so for a depth of \(x\), \(2x\) reads are generated for the two-locus model. Set sfs
to TRUE to generate interval\(^2\) allele frequency pairs drawn from a neutral SFS for both loci. If sfs
is FALSE, it will generate all pairwise combinations of allele frequencies evenly spaced between 0 and 1
in steps of 1/interval for both loci.
individuals <- 200
depth=5
sfs=TRUE
interval=10
Generate range of allele frequencies for data.
if(sfs){
  # draw interval^2 frequencies for each locus from a neutral SFS (prob ~ 1/i)
  p1=sample(rep(1:99/100,times=c(rmultinom(n=1,size=interval^2,prob=c(1/1:99)))),interval^2,replace=FALSE)
  p2=sample(rep(1:99/100,times=c(rmultinom(n=1,size=interval^2,prob=c(1/1:99)))),interval^2,replace=FALSE) # sample shuffles order
}else{
  # all pairwise combinations of interval+1 evenly spaced frequencies in [0,1]
  p1=rep(0:interval/interval,interval+1)
  p2=c(sapply(0:interval,function(x) rep(x/interval,interval+1)))
}
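As a quick sanity check (a minimal sketch, assuming the chunk above has been run with sfs set to TRUE), the sampled frequencies should be heavily skewed toward rare alleles, as expected under a neutral SFS:
hist(p1, breaks = 20, xlab = "p1", main = "Sampled locus-1 allele frequencies")  # most mass near 0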
Randomly generate genotype observations for each subgenome; genotypes follow a multinomial draw with Hardy-Weinberg probabilities \(a^2\), \((1-a)^2\), and \(2a(1-a)\) given the above allele frequencies \(a\).
genolist_1 <- lapply(p1, function(a) rmultinom(n = individuals, size = 1, prob = c(a^2, (1-a)^2, 2*a*(1-a))))
genolist_2 <- lapply(p2, function(a) rmultinom(n = individuals, size = 1, prob = c(a^2, (1-a)^2, 2*a*(1-a))))
Turn genotypes into the per-individual frequency of the minor allele across both loci (minor-allele dosage out of 4 total alleles). The resulting matrix has one row per sim, with columns representing individuals.
genofreqs <- t(sapply(1:length(p2), function(b) ((genolist_1[[b]][1,]*2+genolist_1[[b]][3,]*1)/2 + (genolist_2[[b]][1,]*2+genolist_2[[b]][3,]*1)/2 )/2 ))
Generate read counts that follow a binomial distribution with probability given by genofreqs (reads are pooled across both subgenomes). The resulting matrix has one row per sim, with columns representing individuals.
readcounts <- apply(genofreqs, c(1, 2), function(c) rbinom(n=1,size=depth*2,prob=c))
Function to calculate the log-likelihood of a given value of \(p_1\) when we know \(p_2\). The function requires the matrix of read counts, the total number of reads per individual, and a vector of known allele frequencies \(p_2\) at locus 2.
# Log-likelihood of a guessed locus-1 frequency i, given known locus-2 frequencies p2.
# For each individual, the read count is a binomial mixture over the five possible
# combined genotype dosages (expected read frequencies 0, .25, .5, .75, 1), weighted
# by the Hardy-Weinberg genotype probabilities at the two loci.
# Returns one summed log-likelihood per sim (per row of readcounts).
L<-function(i,p2,readcounts,totreads){
  allprobs<-colSums(
    t(
      apply(readcounts,2,function(c)
        log(dbinom(c,totreads,0)*(1-i)^2*(1-p2)^2 +
            dbinom(c,totreads,.25)*((1-i)^2*2*p2*(1-p2) + (1-p2)^2*2*i*(1-i)) +
            dbinom(c,totreads,.5)*(4*i*(1-i)*p2*(1-p2) + i^2*(1-p2)^2 + p2^2*(1-i)^2) +
            dbinom(c,totreads,.75)*(i^2*2*p2*(1-p2) + p2^2*2*i*(1-i)) +
            dbinom(c,totreads,1)*(i^2*p2^2))
      )
    )
  )
  return(allprobs)
}
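Written out, the quantity inside log() for one individual with read count \(r\) out of \(n\) total reads is the mixture the code above computes: a sum over the two-locus genotype combinations, each weighted by its Hardy-Weinberg probability, with the expected read frequency given by the combined minor-allele dosage out of 4:
\[
P(r \mid p_1, p_2) \;=\; \sum_{g_1=0}^{2} \sum_{g_2=0}^{2} \binom{2}{g_1} p_1^{g_1} (1-p_1)^{2-g_1} \, \binom{2}{g_2} p_2^{g_2} (1-p_2)^{2-g_2} \, \mathrm{Binom}\!\left(r \,\middle|\, n, \tfrac{g_1+g_2}{4}\right)
\]
with \(p_1 = i\) in the code and \((g_1+g_2)/4\) taking the values 0, 0.25, 0.5, 0.75, 1.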
Generate allele frequencies to test.
pguess <- seq(0, 1, by = 0.01)
Apply the function. The resulting matrix has one row per sim; the columns give the summed log-likelihood of each value of pguess across all individuals of the sim.
probs<- sapply(pguess, function(i) L(i,p2,readcounts,depth*2))
Find the max in each row and turn it into a value of pguess. Plot against the known values.
best_p=apply(probs,1, function(x) (which(x==max(x))-1)/100)
plot(p1~best_p)
Return \(R^2\) and correlation coefficient.
summary(lm(p1~best_p))
##
## Call:
## lm(formula = p1 ~ best_p)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.053423 -0.013172 -0.003148 0.006856 0.086825
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.003144 0.003108 1.012 0.314
## best_p 1.000310 0.010710 93.402 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.02424 on 98 degrees of freedom
## Multiple R-squared: 0.9889, Adjusted R-squared: 0.9888
## F-statistic: 8724 on 1 and 98 DF, p-value: < 2.2e-16
cor(p1,best_p)
## [1] 0.9944301
Now suppose \(p_2\) is known only with some error. Plot to see how bad the error is.
p2guess=p2+rnorm(length(p2))/10
p2guess[p2guess<0]=0.0
p2guess[p2guess>1]=1
plot(p2guess~p2)
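To put a rough number on the error (assuming the chunk above), check the correlation between the noisy and true values directly:
cor(p2, p2guess)  # correlation between noisy and true locus-2 frequencies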
Apply the function. The resulting matrix has one row per sim; the columns give the summed log-likelihood of each value of pguess across all individuals of the sim.
probs<- sapply(pguess, function(i) L(i,p2guess,readcounts,depth*2))
Find the max in each row and turn it into a value of pguess. Plot against the known values.
best_p=apply(probs,1, function(x) (which(x==max(x))-1)/100)
plot(p1~best_p)
Return \(R^2\) and correlation coefficient.
summary(lm(p1~best_p))
##
## Call:
## lm(formula = p1 ~ best_p)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.18446 -0.03204 0.00615 0.02661 0.14866
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0007776 0.0074584 0.104 0.917
## best_p 1.0055869 0.0261151 38.506 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.05726 on 98 degrees of freedom
## Multiple R-squared: 0.938, Adjusted R-squared: 0.9374
## F-statistic: 1483 on 1 and 98 DF, p-value: < 2.2e-16
cor(p1,best_p)
## [1] 0.9685054
New likelihood function where \(p_2\) is no longer known but is guessed jointly with \(p_1\). Apply it to the vector of read counts for one sim.
# Joint log-likelihood over a grid of guessed frequencies for BOTH loci.
# For one sim's vector of read counts, returns a length(pguess) x length(pguess)
# matrix: rows index the guess for one frequency (j), columns the other (i).
L2<-function(pguess,readcountvec,totreads){
  return(sapply(pguess, function(i) sapply(pguess, function(j)
    sum(log(dbinom(readcountvec,totreads,0)*(1-i)^2*(1-j)^2 +
            dbinom(readcountvec,totreads,.25)*((1-i)^2*2*j*(1-j) + (1-j)^2*2*i*(1-i)) +
            dbinom(readcountvec,totreads,.5)*(4*i*(1-i)*j*(1-j) + i^2*(1-j)^2 + j^2*(1-i)^2) +
            dbinom(readcountvec,totreads,.75)*(i^2*2*j*(1-j) + j^2*2*i*(1-i)) +
            dbinom(readcountvec,totreads,1)*(i^2*j^2))))))
}
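A quick usage sketch (assuming the objects from the chunks above; the coarse grid pg is hypothetical, chosen only for illustration):
pg <- seq(0.1, 0.9, by = 0.1)             # hypothetical coarse grid, illustration only
ll <- L2(pg, readcounts[1, ], depth * 2)  # joint log-likelihood surface for sim 1
dim(ll)                                   # length(pg) x length(pg)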
Put a prior based on the neutral SFS on both \(p_1\) and \(p_2\).
pguess<-1:99/100
prior<-1/1:99/sum(1/1:99)
Apply the function. The result is a list with one matrix per sim; rows and columns index the guessed values of the two allele frequencies (log-likelihood plus log-prior when sfs is TRUE).
if(sfs){
probs<- lapply(1:length(p2), function(x) t(t(L2(pguess,readcounts[x,],depth*2)+log(prior))+log(prior)))
} else{
probs<- lapply(1:length(p2), function(x) L2(pguess,readcounts[x,],depth*2))
}
Function to find the row/column where the max of a matrix is. If multiple entries are equally maximal, choose one at random.
matxMax <- function(mtx)
{
mx<-which(mtx == max(mtx), arr.ind = TRUE)
if(nrow(mx)>1){mx=mx[sample(1:nrow(mx),1),]}
return( mx )
}
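A tiny toy illustration (hypothetical matrix, not part of the analysis): with tied maxima, matxMax returns one of the tied positions at random.
m <- matrix(c(1, 5, 5, 2), nrow = 2)  # max of 5 appears at (2,1) and (1,2)
matxMax(m)                            # one of the two tied positions, chosen at random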
Find the max in each matrix and turn it into values of pguess.
best_p=lapply(probs, function(x) (matxMax(x)-1)/100)
p1_est<-unname(unlist(lapply(best_p, function(p) return(p[1]))))
p2_est<-unname(unlist(lapply(best_p, function(p) return(p[2]))))
Plot estimated against true allele frequencies for both loci.
plot(p1_est~p1)
plot(p2_est~p2)
summary(lm(p1~p1_est))
##
## Call:
## lm(formula = p1 ~ p1_est)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.46814 -0.08313 -0.05405 0.06776 0.67713
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.09313 0.02399 3.882 0.000188 ***
## p1_est 0.48736 0.07583 6.427 4.73e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1929 on 98 degrees of freedom
## Multiple R-squared: 0.2965, Adjusted R-squared: 0.2893
## F-statistic: 41.3 on 1 and 98 DF, p-value: 4.734e-09
cor(p1,p1_est)
## [1] 0.5445238
summary(lm(p2~p2_est))
##
## Call:
## lm(formula = p2 ~ p2_est)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.44261 -0.09890 -0.05362 0.06289 0.66679
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.11321 0.02514 4.504 1.84e-05 ***
## p2_est 0.51382 0.09088 5.654 1.55e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2012 on 98 degrees of freedom
## Multiple R-squared: 0.2459, Adjusted R-squared: 0.2382
## F-statistic: 31.96 on 1 and 98 DF, p-value: 1.55e-07
cor(p2,p2_est)
## [1] 0.4959186
Why are these so bad? The likelihood surface has a huge flat ridge where lots of possible combinations are equally likely. Pick a sim, for example; the black dot marks the real values:
p1[49]
## [1] 0.01
p2[49]
## [1] 0.2
image(probs[[49]])
points(p1[49],p2[49],pch=19)
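One way to see where the ridge comes from (a quick check, assuming the objects above): the joint likelihood in L2 is symmetric in the two guessed frequencies, and the SFS prior was added to both dimensions, so swapping the \(p_1\) and \(p_2\) guesses leaves the surface essentially unchanged; the read counts alone cannot tell the two loci apart.
max(abs(probs[[49]] - t(probs[[49]])))  # ~0 up to floating-point error: surface is symmetric in p1 and p2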