Want to test for correlation between between pop diversity \(\pi_B=p_i*(1-p_j)+(1-p_i)*p_j\) and recombination rate.

Boring stuff.

setwd("~/projects/ecl298/")
library(data.table)
library(dplyr)
## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:data.table':
## 
##     between, last
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(magrittr)
library(ggplot2)

Recombination rate data from Sackton paper. These seem to already be interpolated at markers every 0.05 cM or so. For now only take chr 1, get cM/Mb for each 1Mb window.

load("raw_genetic_maps.RData") #gets osat object = O. Sativa genetic map
rec<-select(osat,chr,cm,mb) %>% arrange(chr,mb,cm) %>% mutate(win=floor(mb)) %>% group_by(chr,win) %>% summarise(start_mb=min(mb),end_mb=max(mb),start_cm=min(cm),end_cm=max(cm),count=n()) %>% mutate(rec=(end_cm-start_cm)/(end_mb-start_mb)) %>% filter(count>1) %>% select(chr,win,rec) 

Load SNP frequency data from vcftools (downloaded from Farm). Using old bams, no filtering etc. DON’T TRUST THESE AS FINAL RESULTS. Files are wonky, and I’m lazy, so perl to the rescue (line formatting for easier reading, but it’s still ugly perl)

for i in {1..12}; 
do perl -slae '
  open FILE, "<rice.$i.stats.txt.frq"; 
  while(<FILE>){  
    next if $_=~m/CHROM/; 
    chomp; 
    @data=split(/\t/,$_); 
    next if $data[2]>2; 
    @p=split(":",$data[4]);   
    @q=split(":",$data[5]); 
    print "$data[0]\t$data[1]\t$p[1]\t$q[1]"; 
    }' 
    -- -i=$i  > rice.$i.freq; 
done

Then concatenated into single file for each taxa.

#sym<-fread("sym.freq")
sym<-fread("sym.freq")
## 
Read 88.1% of 8112369 rows
Read 8112369 rows and 4 (of 4) columns from 0.126 GB file in 00:00:03
setnames(sym,c("CHR","POS","Ps","Qs"))
rice<-fread("rice.freq")
## 
Read 52.6% of 8112369 rows
Read 8112369 rows and 4 (of 4) columns from 0.183 GB file in 00:00:03
setnames(rice,c("chr","pos","Pr","Qr"))
allo<-fread("allo.freq")
## 
Read 77.9% of 8112369 rows
Read 8112369 rows and 4 (of 4) columns from 0.126 GB file in 00:00:03
setnames(allo,c("CHR","POS","Pa","Qa"))

Make a data.table of \(\pi_B\) per Mb window (for comparison)

allo_pi_b <- cbind(rice,allo) %>% mutate(PIB=Pr*Qa+Pa*Qr,mb=POS/1E6,win=floor(mb)) %>% group_by(chr,win) %>% summarise(div=sum(PIB),startmb=min(mb),endmb=max(mb),count=n()) %>% mutate(div_perbp=div/((endmb-startmb)*1E6))
sym_pi_b<- cbind(rice,sym) %>% mutate(PIB=Pr*Qs+Ps*Qr,mb=POS/1E6,win=floor(mb)) %>% group_by(chr,win) %>% summarise(div=sum(PIB),startmb=min(mb),endmb=max(mb),count=n()) %>% mutate(div_perbp=div/((endmb-startmb)*1E6))

Combine genetic and divergence data per bp, plot…

#allopatric
adat<-merge(rec,allo_pi_b,by=c("chr","win"))
ggplot(adat)+geom_point(aes(x=rec,y=div_perbp))+geom_smooth(aes(x=rec,y=div_perbp))+xlab("cM per Mb")+ylab(expression(paste("weighted ",pi[B],sep="")))+theme_bw()+ggtitle("Allopatric")
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.

cor.test(adat$rec,adat$div_perbp,method="spearman")
## 
##  Spearman's rank correlation rho
## 
## data:  adat$rec and adat$div_perbp
## S = 6074920, p-value = 0.1002
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##         rho 
## -0.09176304
#sympatric
sdat<-merge(rec,sym_pi_b,by=c("chr","win"))
ggplot(sdat)+geom_point(aes(x=rec,y=div_perbp))+geom_smooth(aes(x=rec,y=div_perbp))+xlab("cM per Mb")+ylab(expression(paste("weighted ",pi[B],sep="")))+ggtitle("Sympatric")+theme_bw()
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.

cor.test(sdat$rec,sdat$div_perbp,method="spearman")
## 
##  Spearman's rank correlation rho
## 
## data:  sdat$rec and sdat$div_perbp
## S = 8621588, p-value = 0.02536
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##        rho 
## -0.1180431