library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✔ ggplot2 3.4.2     ✔ purrr   1.0.2
## ✔ tibble  3.2.1     ✔ dplyr   1.1.2
## ✔ tidyr   1.3.0     ✔ stringr 1.5.0
## ✔ readr   2.1.4     ✔ forcats 1.0.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

Make up a “true” population SFS for 100,000 TEs

Ne=10^5
te<-rbeta(100000,0.25,2)
hist(te,xlab="TE freqeuncy",main="",ylab="# of TEs")

Get TE ages (using eq. 13 \[t=4N_e\frac{-xlog(x)}{1-x}\] from Kimura & Ohta 1973 Genetics)

te_age<--4*Ne*te*log(te)/(1-te)
plot(te_age[order(te)]~te[order(te)],type="l",xlab="TE freq",ylab="TE age (generations)")

Now SFS for same number of SNPs. Note how SNPs have fewer rare variants.

snp<-rbeta(100000,.6,2)
hist(te,col=rgb(1,0,0,.25),main="",xlab="variant frequency",ylab="number")
hist(snp,col=rgb(0,0,1,.25),add=T)
legend("topright",legend=c("TE","SNP"),fill=c(rgb(1,0,0,.25),rgb(0,0,1,.25)))

Get age for SNPs, same way. Doing this means that both TEs and SNPs are neutral, just different SFS because of different mutation rates. Age shows similar difference as SFS.

snp_age<--4*Ne*snp*log(snp)/(1-snp)
hist(te_age,col=rgb(1,0,0,.25),main="",xlab="variant age",ylab="number")
hist(snp_age,col=rgb(0,0,1,.25),add=T)
legend("topright",legend=c("TE","SNP"),fill=c(rgb(1,0,0,.25),rgb(0,0,1,.25)))

Now let’s turn these “population values” into values for a sample of 25 genomes. First binomial sampling of “true” allele frequency, then filter to keep polymorphic sites only. Down to ~50K SNPs from 100K original.

tes<-data.frame(te,te_age)
tes$nam<-rbinom(n=length(te),size=25,prob=te)
tes<-filter(tes,nam/25<1,nam/25>0)

snps<-data.frame(snp,snp_age)
snps$nam<-rbinom(n=length(snp),size=25,prob=snp)
snps<-filter(snps,nam/25<1,nam/25>0)

Compare new SFS. More TE singletons still.

plot(hist(tes$nam,plot=FALSE)$density,pch=19,col=rgb(1,0,0,.5),main="",xlab="variant frequency",ylab="density")
points(hist(snps$nam,plot=FALSE)$density,pch=19,col=rgb(0,0,1,0.5))
legend("topright",legend=c("SNP","TE"),fill=c(rgb(0,0,1,.25),rgb(1,0,0,.25)))

Compare new ages:

hist(snps$snp_age,col=rgb(0,0,1,.25),main="",xlab="variant age",ylab="number")
hist(tes$te_age,col=rgb(1,0,0,.25),add=T)
legend("topright",legend=c("SNP","TE"),fill=c(rgb(0,0,1,.25),rgb(1,0,0,.25)))

Now let’s make deciles

age_breaks=quantile(tes$te_age,0:10/10)
age_breaks[1]=age_breaks[1]-1
tes$decile=cut(tes$te_age,breaks=age_breaks)
tes$decile=as.factor(tes$decile)
snps$decile=cut(snps$snp_age,breaks=age_breaks)
snps<-filter(snps,is.na(decile)==FALSE)

Calculate \[\Delta_{TE-SNP}\] and plot.

p_tes<-group_by(tes,decile) %>% summarize(p=mean(nam/25),te_age=median(te_age))
p_snps<-group_by(snps,decile) %>% summarize(p=mean(nam/25),snp_age=median(snp_age))
delta=tibble(delta=p_tes$p-p_snps$p,age=p_tes$te_age)
plot(delta$delta~as.factor(round(delta$age,-2)),xlab="age decile (years)",ylim=c(-0.1,0.1))