library(tidyverse)
Parameters
numloci=50000
numpops=10
numdudes=5
ploidy=4
fst=0.1
maf=0.05
Generate grand ancestral allele frequency from neutral SFS
grand_means <- rep(1:99/100,times=c(rmultinom(n=1,size=numloci,prob=c(1/1:99))))
Make pops using multivariate normal under star phylogeny. Fix allele
freqs to be bound by [0,1] and filter on maf
popfreqs<-sapply(grand_means,function(p)
mvtnorm::rmvnorm(1,rep(p,numpops),p*(1-p)*diag(numpops)*fst)
)
#fix because MVN can go below 0 and above 1
popfreqs[popfreqs<0]=0
popfreqs[popfreqs>1]=1
# filter for MAF! reset number of loci
newfreqs<-popfreqs[,which(colMeans(popfreqs)>maf)]
numloci=dim(newfreqs)[2]
Generate some individuals
dudes<-list()
for(i in 1:numpops){
for(j in 1:numdudes){
dudes[[numdudes*(i-1)+j]]=rbinom(numloci,ploidy,newfreqs[i,])/ploidy # allele freq IN EACH INDIVIDUAL
}
}
Check how well allele frequencies from individuals mathces reality
from_dudes<-apply(matrix(unlist(dudes[1:numdudes]),byrow=T,nrow=numdudes),2,mean)
from_data<-round(newfreqs[1,],3)
qplot(from_dudes,from_data)+theme_minimal()
Sample one sequencing read per individual
single_read_dudes=list()
for(i in 1:numpops){
for(j in 1:numdudes){
single_read_dudes[[numdudes*(i-1)+j]]=rbinom(numloci,1,dudes[[numdudes*(i-1)+j]])
}
}
PCA function
do_pca<-function(mat,kind){
pca <- prcomp(mat,scale. = FALSE,center=TRUE)
pc <- pca$x
poppc<-data.frame(sort(rep(1:numpops,numdudes)), pc[,1],pc[,2])
colnames(poppc)=c("pop","pc1","pc2")
ggplot(poppc,aes(x=pc1,y=pc2,color=as.factor(pop)))+
geom_point()+
ggtitle(kind)+
theme_minimal()
}
PCA on real individual genotypes
real_mat <- matrix(unlist(dudes),byrow=T,nrow=numdudes*numpops) # dudes as rows, loci as columns
do_pca(real_mat,"real")
PCA on single read estimates
single_mat<-matrix(unlist(single_read_dudes),byrow=T,nrow=numdudes*numpops) # dudes as rows, loci as columns
do_pca(single_mat,"single reads")