R Notebook

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(mvtnorm)

Parameters

numloci=50000
numpops=10
numdudes=10 # must be multiple of two since we will make half of each population polyploid
diploid=2
polyploid=4
fst=0.10
maf=0.05

Generate grand ancestral allele frequency from neutral SFS

grand_means <- rep(1:99/100,times=c(rmultinom(n=1,size=numloci,prob=c(1/1:99))))

Make pops using multivariate normal under star phylogeny. Fix allele freqs to be bound by [0,1] and filter on maf

popfreqs<-sapply(grand_means,function(p) 
  mvtnorm::rmvnorm(1,rep(p,numpops),p*(1-p)*diag(numpops)*fst)
)

#fix because MVN can go below 0 and above 1
popfreqs[popfreqs<0]=0 
popfreqs[popfreqs>1]=1

# filter for MAF! reset number of loci
newfreqs<-popfreqs[,which(colMeans(popfreqs)>maf)] 
numloci=dim(newfreqs)[2]

Generate some individuals

dudes<-list()
for(i in 1:numpops){
  for(j in 1:numdudes){
    if(j<(numdudes/2+1)){ ploidy=diploid } else{ ploidy=polyploid }
      dudes[[numdudes*(i-1)+j]]=rbinom(numloci,ploidy,newfreqs[i,])/ploidy # allele freq IN EACH INDIVIDUAL
  }
}

Check how well allele frequencies from tetraploid individuals matches diploids

#allele freqs from tetraploids
from_dudes<-apply(matrix(unlist(dudes[1:(numdudes/2)]),byrow=T,nrow=numdudes/2),2,mean)

#allele freqs from diploids
from_otherdudes<-apply(matrix(unlist(dudes[(numdudes/2+1):numdudes]),byrow=T,nrow=(numdudes/2)),2,mean)

#from data
from_data<-round(newfreqs[1,],3)

#compare
comp<-data.frame(from_dudes,from_otherdudes)
ggplot(comp,aes(x=from_dudes,y=from_otherdudes))+geom_point(alpha=0.01)+xlab("diploid frequency estimated from genotypes")+ylab("polyploid frequency estimated from genotypes")+geom_smooth(method="lm")+theme_minimal()+geom_abline(intercept=0,slope=1,lty=2)

## `geom_smooth()` using formula = 'y ~ x'

Sample one sequencing read per individual

single_read_dudes=list()
for(i in 1:numpops){
  for(j in 1:numdudes){
    single_read_dudes[[numdudes*(i-1)+j]]=rbinom(numloci,1,dudes[[numdudes*(i-1)+j]]) 
  }
}

PCA function

do_pca<-function(mat,kind,ploid=999){
  pca <- prcomp(mat,scale. = FALSE,center=TRUE)
  pc <- pca$x
  
  poppc<-data.frame(sort(rep(1:numpops,numdudes)), pc[,1],pc[,2])
  colnames(poppc)=c("pop","pc1","pc2")
  poppc=mutate(poppc,ploidy=rep(c(rep(diploid,(numdudes/2)),rep(polyploid,numdudes/2)),numpops)) %>% filter(ploidy<ploid+1)
  
  ggplot(poppc,aes(x=pc1,y=pc2,shape=as.factor(ploidy),color=as.factor(pop)))+
    geom_point()+
    ggtitle(kind)+
    theme_minimal()
}

PCA on real individual genotypes

real_mat <- matrix(unlist(dudes),byrow=T,nrow=numdudes*numpops) # dudes as rows, loci as columns 
do_pca(real_mat,"real")

PCA on single read estimates

single_mat<-matrix(unlist(single_read_dudes),byrow=T,nrow=numdudes*numpops) # dudes as rows, loci as columns
do_pca(single_mat,"single reads")