functions stuff

library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.3.2
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Warning: package 'ggplot2' was built under R version 3.3.2
## Warning: package 'tibble' was built under R version 3.3.2
## Warning: package 'tidyr' was built under R version 3.3.2
## Warning: package 'readr' was built under R version 3.3.2
## Warning: package 'purrr' was built under R version 3.3.2
## Warning: package 'dplyr' was built under R version 3.3.2
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag():    dplyr, stats
library(cowplot)
## 
## Attaching package: 'cowplot'
## The following object is masked from 'package:ggplot2':
## 
##     ggsave
draw<-function() {
  
  #random allele freqs from a neutral model
  p_del=as.numeric(0)
  while(sum(p_del)==0 || sum(p_del)>1){
    p_del=as.numeric(0)
    p_del <- rep(1:100/100,times=c(rmultinom(n=1,size=round(1+rexp(1,.9)),prob=c(1/1:100))))
  }
  
  return(p_del)
}

do sims

nsims=10000

#make deletion and nondeltion allele frequencies
deletion_freqs <- (replicate(nsims, draw()))
non_del_allele = lapply(deletion_freqs,function(x) 1-sum(x))

all_sizes=as.numeric()
sims=list()
for(i in 1:nsims){
  alleles=c(deletion_freqs[[i]],non_del_allele[[i]])
  #random generate the deletion size
  #sizes=c(runif(length(alleles)-1),0) #uniform
  sizes=c(rbeta(length(alleles)-1,shape1=0.5,shape2=1),0) #beta
  #sizes=c(rep(0.5,length(alleles)-1),0) # with this and >.5 = hom, >0 = het should be in perfect HWE

  all_sizes=c(all_sizes,sizes)
  genos=as.numeric()
  geno_freqs=as.numeric()
  for(j in 1:(length(alleles))){
    for(k in 1:(length(alleles))){
      geno_freqs[length(geno_freqs)+1]=alleles[j]*alleles[k]
      if(sizes[j]+sizes[k]>0.6){ 
        genos[length(genos)+1]=2 #deletion homozgyote
      }
      else if(sizes[j]+sizes[k]>.01){
        genos[length(genos)+1]=1 #het
      }
      else{
        genos[length(genos)+1]=0 #normal homozgyote
      }
    }
  }
  estimated_del_freq=sum(geno_freqs*genos)/2
  hom_del<-sum(subset(geno_freqs,genos==2))
  hom_norm<-sum(subset(geno_freqs,genos==0))
  het<-sum(subset(geno_freqs,genos==1))
  sims[[i]]=(c(estimated_del_freq,hom_del,het,hom_norm,length(alleles)-1))
}

deldata=data.frame(matrix(unlist(sims), ncol=5, nrow=nsims, byrow=TRUE),1-unlist(non_del_allele))
colnames(deldata)=c("pA", "pAA","pAB","pBB", "num_del","real_freq")

deletion freq vs. estimated freq

ggplot(deldata,aes(x=real_freq,y=pA)) +
  geom_point(alpha=0.25) 

cor(deldata$pA,deldata$real_freq)
## [1] 0.8464122

histogram of deletion sizes

hist(all_sizes)

histogram of total deletion freq.

#sfs of total deletion freq
hist(1-unlist(non_del_allele))

deletions

#how many deletion alleles per locus?
table(unlist(lapply(deletion_freqs,function(x) length(x))))
## 
##    1    2    3    4    5    6    7    8 
## 3929 3946 1415  495  154   43   11    7

deletions vs. freq

# check number of deletions per allele frequency
ggplot(deldata,aes(y=pA,x=num_del,group=num_del))+geom_boxplot()

#make HW plot
  dat<-filter(deldata, pA > 0 & pA<1) %>%
  mutate(bin=cut(pA,0:100/100)) %>% 
  rename(AA=pAA,Aa=pAB,aa=pBB,p=pA) %>% 
  group_by(bin) %>% 
  summarize(mp=mean(p),AA=mean(AA),Aa=mean(Aa),aa=mean(aa)) %>% 
  gather(geno,frequency,-mp,-bin) %>%
  mutate(efreq=if_else(geno=="AA",mp^2,if_else(geno=="Aa",2*mp*(1-mp),(1-mp)^2)), ss=(frequency-efreq)^2) 
## Warning: package 'bindrcpp' was built under R version 3.3.2
  main<-ggplot(dat,aes(x=mp,y=frequency,color=geno)) + 
  geom_point() +
  geom_line(aes(x=mp,y=efreq,color=geno)) +
  theme(panel.grid.major = element_line(colour="gray", size=0.1)) 
  
  xbar <- axis_canvas(main, axis = "x") +
  geom_col(data=dat,aes(x = mp,y=ss)) + ylab("ss")

# create the combined plot
combined_plot <- insert_xaxis_grob(main, xbar, position = "bottom")
ggdraw(combined_plot)