Set values. Read depth is per genome, so for a depth of \(x\), \(2x\) reads are generated for the two-locus model. Set sfs
to TRUE to generate interval\(^2\) allele frequency pairs drawn from a neutral SFS for both loci. If sfs
is FALSE, it will generate all pairwise combinations of allele frequencies evenly spaced between 0 and 1
in steps of 1/interval for both loci.
individuals <- 200
depth=5
sfs=TRUE
interval=10
Generate range of allele frequencies for data.
if(sfs){
  # draw interval^2 frequencies for each locus from a neutral SFS (prob ~ 1/i)
  p1=sample(rep(1:99/100,times=c(rmultinom(n=1,size=interval^2,prob=c(1/1:99)))),interval^2,replace=FALSE)
  p2=sample(rep(1:99/100,times=c(rmultinom(n=1,size=interval^2,prob=c(1/1:99)))),interval^2,replace=FALSE) # sample shuffles order
}else{
  # all pairwise combinations of interval+1 evenly spaced frequencies in [0,1]
  p1=rep(0:interval/interval,interval+1)
  p2=c(sapply(0:interval,function(x) rep(x/interval,interval+1)))
}
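As a quick sanity check (a minimal sketch, assuming the chunk above has been run with sfs set to TRUE), the sampled frequencies should be heavily skewed toward rare alleles, as expected under a neutral SFS:
hist(p1, breaks = 20, xlab = "p1", main = "Sampled locus-1 allele frequencies")  # most mass near 0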
Randomly generate genotype observations for each subgenome; genotypes follow a multinomial draw with Hardy-Weinberg probabilities \(a^2\), \((1-a)^2\), and \(2a(1-a)\) given the above allele frequencies \(a\).
genolist_1 <- lapply(p1, function(a) rmultinom(n = individuals, size = 1, prob = c(a^2, (1-a)^2, 2*a*(1-a))))
genolist_2 <- lapply(p2, function(a) rmultinom(n = individuals, size = 1, prob = c(a^2, (1-a)^2, 2*a*(1-a))))
Turn genotypes into the per-individual frequency of the minor allele across both loci (minor-allele dosage out of 4 total alleles). The resulting matrix has one row per sim, with columns representing individuals.
genofreqs <- t(sapply(1:length(p2), function(b) ((genolist_1[[b]][1,]*2+genolist_1[[b]][3,]*1)/2 + (genolist_2[[b]][1,]*2+genolist_2[[b]][3,]*1)/2 )/2 ))
Generate read counts that follow a binomial distribution with probability given by genofreqs (reads are pooled across both subgenomes). The resulting matrix has one row per sim, with columns representing individuals.
readcounts <- apply(genofreqs, c(1, 2), function(c) rbinom(n=1,size=depth*2,prob=c))
Function to calculate the log-likelihood of a given value of \(p_1\) when we know \(p_2\). The function requires the matrix of read counts, the total number of reads per individual, and a vector of known allele frequencies \(p_2\) at locus 2.
# Log-likelihood of a guessed locus-1 frequency i, given known locus-2 frequencies p2.
# For each individual, the read count is a binomial mixture over the five possible
# combined genotype dosages (expected read frequencies 0, .25, .5, .75, 1), weighted
# by the Hardy-Weinberg genotype probabilities at the two loci.
# Returns one summed log-likelihood per sim (per row of readcounts).
L<-function(i,p2,readcounts,totreads){
  allprobs<-colSums(
    t(
      apply(readcounts,2,function(c)
        log(dbinom(c,totreads,0)*(1-i)^2*(1-p2)^2 +
            dbinom(c,totreads,.25)*((1-i)^2*2*p2*(1-p2) + (1-p2)^2*2*i*(1-i)) +
            dbinom(c,totreads,.5)*(4*i*(1-i)*p2*(1-p2) + i^2*(1-p2)^2 + p2^2*(1-i)^2) +
            dbinom(c,totreads,.75)*(i^2*2*p2*(1-p2) + p2^2*2*i*(1-i)) +
            dbinom(c,totreads,1)*(i^2*p2^2))
      )
    )
  )
  return(allprobs)
}
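Written out, the quantity inside log() for one individual with read count \(r\) out of \(n\) total reads is the mixture the code above computes: a sum over the two-locus genotype combinations, each weighted by its Hardy-Weinberg probability, with the expected read frequency given by the combined minor-allele dosage out of 4:
\[
P(r \mid p_1, p_2) \;=\; \sum_{g_1=0}^{2} \sum_{g_2=0}^{2} \binom{2}{g_1} p_1^{g_1} (1-p_1)^{2-g_1} \, \binom{2}{g_2} p_2^{g_2} (1-p_2)^{2-g_2} \, \mathrm{Binom}\!\left(r \,\middle|\, n, \tfrac{g_1+g_2}{4}\right)
\]
with \(p_1 = i\) in the code and \((g_1+g_2)/4\) taking the values 0, 0.25, 0.5, 0.75, 1.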
Generate allele frequencies to test.
pguess <- seq(0, 1, by = 0.01)
Apply the function. The resulting matrix has one row per sim; the columns give the summed log-likelihood of each value of pguess across all individuals of the sim.
probs<- sapply(pguess, function(i) L(i,p2,readcounts,depth*2))
Find the max in each row and turn it into a value of pguess. Plot against the known values.
best_p=apply(probs,1, function(x) (which(x==max(x))-1)/100)
plot(p1~best_p)
Return \(R^2\) and correlation coefficient.
summary(lm(p1~best_p))
##
## Call:
## lm(formula = p1 ~ best_p)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.053423 -0.013172 -0.003148 0.006856 0.086825
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.003144 0.003108 1.012 0.314
## best_p 1.000310 0.010710 93.402 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.02424 on 98 degrees of freedom
## Multiple R-squared: 0.9889, Adjusted R-squared: 0.9888
## F-statistic: 8724 on 1 and 98 DF, p-value: < 2.2e-16
cor(p1,best_p)
## [1] 0.9944301
Now suppose \(p_2\) is known only with some error. Plot to see how bad the error is.
p2guess=p2+rnorm(length(p2))/10
p2guess[p2guess<0]=0.0
p2guess[p2guess>1]=1
plot(p2guess~p2)
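To put a rough number on the error (assuming the chunk above), check the correlation between the noisy and true values directly:
cor(p2, p2guess)  # correlation between noisy and true locus-2 frequencies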
Apply the function. The resulting matrix has one row per sim; the columns give the summed log-likelihood of each value of pguess across all individuals of the sim.
probs<- sapply(pguess, function(i) L(i,p2guess,readcounts,depth*2))
Find the max in each row and turn it into a value of pguess. Plot against the known values.
best_p=apply(probs,1, function(x) (which(x==max(x))-1)/100)
plot(p1~best_p)
Return \(R^2\) and correlation coefficient.
summary(lm(p1~best_p))
##
## Call:
## lm(formula = p1 ~ best_p)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.18446 -0.03204 0.00615 0.02661 0.14866
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0007776 0.0074584 0.104 0.917
## best_p 1.0055869 0.0261151 38.506 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.05726 on 98 degrees of freedom
## Multiple R-squared: 0.938, Adjusted R-squared: 0.9374
## F-statistic: 1483 on 1 and 98 DF, p-value: < 2.2e-16
cor(p1,best_p)
## [1] 0.9685054
New likelihood function where \(p_2\) is no longer known but is guessed jointly with \(p_1\). Apply it to the vector of read counts for one sim.
# Joint log-likelihood over a grid of guessed frequencies for BOTH loci.
# For one sim's vector of read counts, returns a length(pguess) x length(pguess)
# matrix: rows index the guess for one frequency (j), columns the other (i).
L2<-function(pguess,readcountvec,totreads){
  return(sapply(pguess, function(i) sapply(pguess, function(j)
    sum(log(dbinom(readcountvec,totreads,0)*(1-i)^2*(1-j)^2 +
            dbinom(readcountvec,totreads,.25)*((1-i)^2*2*j*(1-j) + (1-j)^2*2*i*(1-i)) +
            dbinom(readcountvec,totreads,.5)*(4*i*(1-i)*j*(1-j) + i^2*(1-j)^2 + j^2*(1-i)^2) +
            dbinom(readcountvec,totreads,.75)*(i^2*2*j*(1-j) + j^2*2*i*(1-i)) +
            dbinom(readcountvec,totreads,1)*(i^2*j^2))))))
}
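A quick usage sketch (assuming the objects from the chunks above; the coarse grid pg is hypothetical, chosen only for illustration):
pg <- seq(0.1, 0.9, by = 0.1)             # hypothetical coarse grid, illustration only
ll <- L2(pg, readcounts[1, ], depth * 2)  # joint log-likelihood surface for sim 1
dim(ll)                                   # length(pg) x length(pg)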
Put a prior based on the neutral SFS on both \(p_1\) and \(p_2\).
pguess<-1:99/100
prior<-1/1:99/sum(1/1:99)
Apply the function. The result is a list with one matrix per sim; rows and columns index the guessed values of the two allele frequencies (log-likelihood plus log-prior when sfs is TRUE).
if(sfs){
probs<- lapply(1:length(p2), function(x) t(t(L2(pguess,readcounts[x,],depth*2)+log(prior))+log(prior)))
} else{
probs<- lapply(1:length(p2), function(x) L2(pguess,readcounts[x,],depth*2))
}
Function to find the row/column where the max of a matrix is. If multiple entries are equally maximal, choose one at random.
matxMax <- function(mtx)
{
mx<-which(mtx == max(mtx), arr.ind = TRUE)
if(nrow(mx)>1){mx=mx[sample(1:nrow(mx),1),]}
return( mx )
}
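A tiny toy illustration (hypothetical matrix, not part of the analysis): with tied maxima, matxMax returns one of the tied positions at random.
m <- matrix(c(1, 5, 5, 2), nrow = 2)  # max of 5 appears at (2,1) and (1,2)
matxMax(m)                            # one of the two tied positions, chosen at random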
Find the max in each matrix and turn it into values of pguess.
best_p=lapply(probs, function(x) (matxMax(x)-1)/100)
p1_est<-unname(unlist(lapply(best_p, function(p) return(p[1]))))
p2_est<-unname(unlist(lapply(best_p, function(p) return(p[2]))))
Plot estimated against true allele frequencies for both loci.
plot(p1_est~p1)
plot(p2_est~p2)
summary(lm(p1~p1_est))
##
## Call:
## lm(formula = p1 ~ p1_est)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.46814 -0.08313 -0.05405 0.06776 0.67713
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.09313 0.02399 3.882 0.000188 ***
## p1_est 0.48736 0.07583 6.427 4.73e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1929 on 98 degrees of freedom
## Multiple R-squared: 0.2965, Adjusted R-squared: 0.2893
## F-statistic: 41.3 on 1 and 98 DF, p-value: 4.734e-09
cor(p1,p1_est)
## [1] 0.5445238
summary(lm(p2~p2_est))
##
## Call:
## lm(formula = p2 ~ p2_est)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.44261 -0.09890 -0.05362 0.06289 0.66679
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.11321 0.02514 4.504 1.84e-05 ***
## p2_est 0.51382 0.09088 5.654 1.55e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2012 on 98 degrees of freedom
## Multiple R-squared: 0.2459, Adjusted R-squared: 0.2382
## F-statistic: 31.96 on 1 and 98 DF, p-value: 1.55e-07
cor(p2,p2_est)
## [1] 0.4959186
Why are these so bad? The likelihood surface has a huge flat ridge where lots of possible combinations are equally likely. Pick a sim, for example; the black dot marks the real values:
p1[49]
## [1] 0.01
p2[49]
## [1] 0.2
image(probs[[49]])
points(p1[49],p2[49],pch=19)
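One way to see where the ridge comes from (a quick check, assuming the objects above): the joint likelihood in L2 is symmetric in the two guessed frequencies, and the SFS prior was added to both dimensions, so swapping the \(p_1\) and \(p_2\) guesses leaves the surface essentially unchanged; the read counts alone cannot tell the two loci apart.
max(abs(probs[[49]] - t(probs[[49]])))  # ~0 up to floating-point error: surface is symmetric in p1 and p2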