library(tidyverse)

Parameters

numloci=50000
numpops=10
numdudes=5
ploidy=4
fst=0.1
maf=0.05

Generate grand ancestral allele frequency from neutral SFS

grand_means <- rep(1:99/100,times=c(rmultinom(n=1,size=numloci,prob=c(1/1:99)))) 

Make pops using multivariate normal under star phylogeny. Fix allele freqs to be bound by [0,1] and filter on maf

popfreqs<-sapply(grand_means,function(p) 
  mvtnorm::rmvnorm(1,rep(p,numpops),p*(1-p)*diag(numpops)*fst)
)

#fix because MVN can go below 0 and above 1
popfreqs[popfreqs<0]=0 
popfreqs[popfreqs>1]=1

# filter for MAF! reset number of loci
newfreqs<-popfreqs[,which(colMeans(popfreqs)>maf)] 
numloci=dim(newfreqs)[2] 

Generate some individuals

dudes<-list()
for(i in 1:numpops){
  for(j in 1:numdudes){
    dudes[[numdudes*(i-1)+j]]=rbinom(numloci,ploidy,newfreqs[i,])/ploidy # allele freq IN EACH INDIVIDUAL
  }
}

Check how well allele frequencies from individuals mathces reality

from_dudes<-apply(matrix(unlist(dudes[1:numdudes]),byrow=T,nrow=numdudes),2,mean)
from_data<-round(newfreqs[1,],3)
qplot(from_dudes,from_data)+theme_minimal()

Sample one sequencing read per individual

single_read_dudes=list()
for(i in 1:numpops){
  for(j in 1:numdudes){
    single_read_dudes[[numdudes*(i-1)+j]]=rbinom(numloci,1,dudes[[numdudes*(i-1)+j]]) 
  }
}

PCA function

do_pca<-function(mat,kind){
  pca <- prcomp(mat,scale. = FALSE,center=TRUE)
  pc <- pca$x
  
  poppc<-data.frame(sort(rep(1:numpops,numdudes)), pc[,1],pc[,2])
  colnames(poppc)=c("pop","pc1","pc2")
  
  ggplot(poppc,aes(x=pc1,y=pc2,color=as.factor(pop)))+
    geom_point()+
    ggtitle(kind)+
    theme_minimal()
}

PCA on real individual genotypes

real_mat <- matrix(unlist(dudes),byrow=T,nrow=numdudes*numpops) # dudes as rows, loci as columns 
do_pca(real_mat,"real")

PCA on single read estimates

single_mat<-matrix(unlist(single_read_dudes),byrow=T,nrow=numdudes*numpops) # dudes as rows, loci as columns
do_pca(single_mat,"single reads")