library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✔ ggplot2 3.4.2 ✔ purrr 1.0.2
## ✔ tibble 3.2.1 ✔ dplyr 1.1.2
## ✔ tidyr 1.3.0 ✔ stringr 1.5.0
## ✔ readr 2.1.4 ✔ forcats 1.0.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
Normal derived SFS is that the expected number of SNPs \(\xi\) with a derived allele count of \(k\) is \(\mathbb{E}[\xi_k]=\theta/k\), which for \(\theta=5\) in a sample of \(n=18\) looks like:
k=1:17
f=k/18
theta=5
plot(theta/k ~ k, ylab="expected number",xlab="derived allele count",type="b")
If instead say we have a deletion at derived count of \(l\), then mutations can only occur on the sequence not deleted, which is at a frequency \(n-l\). The original frequency of a mutation is \(k\), but it is now observed at a frequency \(i\) out of \(n-l\) haplotypes. The conditional SFS is thus a modified sum of the “strictly disjoint” and “enclosing” mutations from Ferretti et.al Figure 1 and Eq. 13. “Complementary” mutations would also be observed, but would appear as fixed SNPs and not included in an SFS of polymorphic sites. That gives us this expectation: \[\mathbb{E}[\xi_{i|l}]=\begin{cases} % \theta\big(\frac{1}{i}-l\frac{\beta_n(i)-\beta_n(i+1)+\beta_n(l)-\beta_n(l+1)}{2}\big) \textrm{ for strictly disjoint mutations where } i=k \textrm{ for } k<(n-l) \\ % \theta\,l\big(\frac{\beta_n(l)-\beta_n(l+1)}{2}\big) \textrm{ for enclosing mutations where } i=k-l \textrm{ for } k>l\\ \end{cases}\]
where
\[a_n=\begin{cases} 0 \textrm{ for } x=1 \\ \sum_{x=1}^{n-1}\frac{1}{x} \textrm{ for } x>1 \end{cases} \\ \\ \textrm{ and } \\ \beta_n(x)=\frac{2n}{(n-x+1)(n-x)}(a_{n+1}-a_x)-\frac{2}{n-x} \]
an<-function(x){if(x==1){return(0)}else{return(sum(1/(1:(x-1))))}}
bn<-function(x,n){(2*n/((n-x+1)*(n-x)))*(an(n+1)-an(x))-2/(n-x)}
sfs_del<-function(theta,n,l){
sfs<-as.numeric()
for(k in 1:(n-l-1)){
sfs[k]=theta*((1/k)-l*(bn(k,n)-bn(k+1,n)+bn(l,n)-bn(l+1,n))/2) #disjoint
}
for(k in (l+1):(n-1)){
sfs[k-l]=sfs[k-l]+theta*l*((bn(l,n)-bn(l+1,n))/2) # enclosing
}
return(sfs)
}
For 25 NAM genomes, facetted by deletion frequency
dels=data.frame()
n=25
theta=5
#For d=0
y=theta/(1:(n-1))
y=y/sum(y)
x=(1:(n-1))/(24)
dd=rep(0,n-1)
df<-data.frame(x,y,dd)
dels<-rbind(dels,df)
#For d>0
for(d in 1:(n-2)){
y=sfs_del(theta=theta,n=n,l=d)
y=y/sum(y)
x=(1:(n-d-1))/(n-d)
dd=rep(d,n-d-1)
df<-data.frame(x,y,dd)
dels<-rbind(dels,df)
}
ggplot(dels,aes(y=y,x=x))+
geom_point()+
theme_minimal()+
xlab("SNP frequency")+
ylab("relative probability")+
facet_wrap(~dd,scales="free_y")