The Basics

library stuff

library(data.table)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:data.table':
## 
##     between, last

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(magrittr)
library(ggplot2)

## Warning: package 'ggplot2' was built under R version 3.2.3

options(scipen=999)
setwd("~/projects/b_project/")

genetic map

#download v2 and format to change to v3
if(file.exists("NAM_phasedImputed_1cM_AllZeaGBSv2.3_allChrs.zip")==FALSE){
  system("curl -O http://mirrors.iplantcollaborative.org/download/iplant/home/shared/panzea/genotypes/GBS/v23/NAM_phasedImputed_1cM_AllZeaGBSv
  2.3_allChrs.zip")
  system("unzip NAM_phasedImputed_1cM_AllZeaGBSv2.3_allChrs.zip")
}
map<-fread("NAM_phasedImputed_1cM_AllZeaGBSv2.3_allChrs.hmp.txt",header=T) %>% select(3:5)
# tweak the above file slightly, write to disk, move to AGPv3 at http://plants.ensembl.org/tools.html
ensembl<-fread("converted_gbs_map.gff.txt") %>% select(4) 
newmap<- mutate(map,chr=chrom,bp=ensembl) %>% select(c(chr,bp,cm)) %>% filter(complete.cases(.)==TRUE)

make interpolation functions

chr_lengths<-scan()
301433382

## [1] 301433382

237893627

## [1] 237893627

232227970

## [1] 232227970

242029974

## [1] 242029974

217928451

## [1] 217928451

169381756

## [1] 169381756

176810253

## [1] 176810253

175347686

## [1] 175347686

157021084

## [1] 157021084

149627545

## [1] 149627545

interpolate<-list()
for(i in 1:10){
  bp=c(0,chr_lengths[i])
  chr=c(i,i)
  cm=c(min(subset(newmap$cm,newmap$chr==i)),max(subset(newmap$cm,newmap$chr==i)))
  addon<-data.frame(chr,bp,cm)
  temp<-filter(newmap,chr==i)  
  temp<-rbind(addon,temp)
  temp=temp[order(temp$bp),]
  interpolate[[i]] = approxfun(temp$bp,temp$cm)  
}

bed files of position, nucleotide diversity (FARM ~/projects/bigd/angsbigd/results)

# bkn<-fread("BKN_diversity.txt",header=T) %>% 
#   filter(nSites>=10) %>% mutate(pi=tP/nSites,start=WinCenter,end=WinCenter) %>% 
#   select(Chr,start,end,pi)
# 
# ##don't do this here, as bedtools can only work with integer positions
# #interpolate fast like a madman
#  bkntemps<-list()
#  for(i in 1:10){
#  bkntemps[[i]]<-filter(bkn,Chr==i) %>% 
#    mutate(start_cm=round(interpolate[[i]](start)*1000000),end_cm=round(interpolate[[i]](end)*1000000)) %>% 
#  }
#  bkn_new=do.call("rbind",bkntemps)
# write.table(bkn_new,file="bkn.bed",quote=F,row.name=F,col.name=F,sep="\t")

til<-fread("TIL_diversity.txt",header=T) %>% 
  filter(nSites>=100) %>% mutate(pi=tP/nSites,start=WinCenter-50,end=WinCenter+50) %>% 
  select(Chr,start,end,pi)

# /100 gives you Morgans. multiplying by 1E6 gets you integeres. adding 9E6 prevents negatives
tiltemps<-list()
for(i in 1:10){
  tiltemps[[i]]<-filter(til,Chr==i) %>% 
    mutate(start_cm=round(interpolate[[i]](start)*1E7+9E7+1),end_cm=round(interpolate[[i]](end)*1E7+9E7+1)) %>% mutate(start=ifelse(start_cm>end_cm,end_cm,start_cm),end=ifelse(start_cm>end_cm,start_cm,end_cm)) %>%
    select(Chr,start,end,pi) %>% arrange(Chr,start)
}
til_new=do.call("rbind",tiltemps)

write.table(til_new,file="til.bed",quote=F,row.name=F,col.name=F,sep="\t")

only the genes – we’ll do distance from transcript edge for now (FARM ~/projects/bigd/angsbigd/results)

gff<-fread("Zea_mays.AGPv3.26.gff3") %>% 
  setnames(c("seqname","source","feature","start","end","score","strand","frame","attribute")) 

bob=subset(gff,gff$seqname %in% 1:10 & gff$feature=="gene")
gff<-bob

gfftemps<-list()
 for(i in 1:10){
 gfftemps[[i]]<-filter(gff,seqname==i) %>% 
   mutate(start=round(interpolate[[i]](start)*1E7+9E7 +1),end=round(interpolate[[i]](end)*1E7+9E7+1))
 }
 gff_new=do.call("rbind",gfftemps)
 gff_new<-mutate(gff_new,end=ifelse(end>start,end,start+1),seqname=as.numeric(seqname)) %>% arrange(seqname,start) 
 
write.table(gff_new,file="genes_v3.gff",quote=F,row.name=F,col.name=F,sep="\t")

get distance (FARM ~/projects/bigd/angsbigd/results)

``` #uses -D ref so upstream numbers are negative and downstream positive

system("bedtools closest -D ref -a til.bed -b genes_v3.gff > til_closest.txt")
#bedtools closest -D ref -a bkn.bed -b genes_v3.gff > bkn_closest.txt

format for R. should be chr, start, pi, distance (FARM ~/projects/bigd/angsbigd/results)

#cut -f 1,2,4,14 bkn_closest.txt > bkn_distance.txt
system("cut -f 1,2,4,14 til_closest.txt > til_distance.txt")

load distances

#bkn_bp<-fread("Desktop/bkn_closest.txt") %>%
#  setnames(c("chr","start","pi","distance"))
#bkn_new=do.call("rbind",bkntemps)

#tils
til_data<-fread("til_distance.txt") %>%
  setnames(c("chr","start","pi","distance")) %>%
  mutate(dist=abs(distance/1E7)/100) # 1M to turn back to cM, 100 to turn into M
# tiltemps<-list()
# for(i in 1:10){
# tiltemps[[i]]<-filter(til_bp,chr==i) %>% 
#   mutate(dist=abs( interpolate[[i]](start+distance)- interpolate[[i]](start))) %>% 
#   select(chr,pi,dist)
# }
# til_new=do.call("rbind",tiltemps)

get avg. number NS sites in gene

grep CDS Zea_mays.AGPv3.26.gff3| grep -v ^Pt | grep -v ^Mt |  cut -f 4,5,9 | cut -d ":" -f 1,2 | cut -d ";" -f 1 | grep P01 | awk '{bob=($2-$1)} { print bob, $3}' | perl -ne 'while(<>){ $_=~m/(\d+)\s+\S+CDS\:(.*P01)/; $length=$1; $gene=$2; $count{$gene}++; $length{$gene}+=$length; } foreach(keys(%count)){ print "$_\t", $length{$_}*.75, "\n"}' | awk '{ sum += $2 } END { if (NR>0 ) print sum / NR }'

->791.651 length, mean 4.388924 exons

average length of gene

grep gene Zea_mays.AGPv3.26.gff3| grep -v ^Pt | grep -v ^Mt |  cut -f 4,5,9 | cut -d ":" -f 1,2 | cut -d ";" -f 1 | grep T01 | awk '{sum+=($2-$1)} END { if (NR>0 ) print sum / NR }'

->3992.31

So gene structure is 250(1000)250(1000)250(1000)250

And average distance in cM across gene is 0.004506899

exon=1:200
syn=1:66*3
exon=exon[exon %in% syn ==FALSE]
exon2=exon+1200
exon3=exon2+1200
exon4=exon3+1200
gene=c(exon,exon2,exon3,exon4)*(0.004506899/4000)/100 #0.004506899 cM across 4000bp gene in M

Fit B

#pi_t_neutral=0.0127 # from top of spline curve, not anywhere meaningul/useful
pi_t_neutral=0.9*mean(subset(til_data$pi,til_data$dist>(10^(-4))))
til_data2<-subset(til_data,til_data$distance<1E-5)
til_spline=smooth.spline(til_data2$dist,til_data2$pi/pi_t_neutral)
plot(til_spline$y~til_spline$x,type="l",xlim=c(0,5E-6),ylim=c(0.5,1.1),lwd=3); abline(h=1,lty=2)

#sub_data<-subset(til_data,til_data$dist<3E-4)
#lm( (sub_data$pi/pi_t_neutral) ~ poly(sub_data$dist, 3, raw=TRUE))

#observed data for fit
obs_r<-subset(til_spline$x,til_spline$x<1E-5)
obs_b=subset(til_spline$y,til_spline$x<1E-5)

# parameters: mutation rate and starting value for t, heterozygous selection coefficeint
mu=3E-8

#B function
B<-function(s,r,mu,gene){
  exp(-1*sum( (mu*s)/(s+(r+gene)*(1-s))^2   ))
}  

Bvector<-function(s,r,mu,gene){
  sapply(1:length(r),function(J) exp(-1*sum((mu*s)/(s+(r[J]+gene)*(1-s))^2 )))
}

ss<-function(a,b){ sum((a-b)^2)}

#grid of possible values
bob=expand.grid(c(1,2,5),10^(-2:-10))
sgrid=bob$Var1*bob$Var2

sue=expand.grid(c(1,2,5),10^(-7:-10))
mgrid=sue$Var1*sue$Var2

sums_squares=matrix(nrow=length(sgrid),ncol=length(mgrid))
for(i in 1:length(sgrid)){  
  for(j in 1:length(mgrid)){
    test=Bvector(sgrid[i],obs_r,mgrid[j],gene) 
    sums_squares[i,j]=ss(test,obs_b); 
  }
  print(i/length(sgrid))
}

## [1] 0.03703704
## [1] 0.07407407
## [1] 0.1111111
## [1] 0.1481481
## [1] 0.1851852
## [1] 0.2222222
## [1] 0.2592593
## [1] 0.2962963
## [1] 0.3333333
## [1] 0.3703704
## [1] 0.4074074
## [1] 0.4444444
## [1] 0.4814815
## [1] 0.5185185
## [1] 0.5555556
## [1] 0.5925926
## [1] 0.6296296
## [1] 0.6666667
## [1] 0.7037037
## [1] 0.7407407
## [1] 0.7777778
## [1] 0.8148148
## [1] 0.8518519
## [1] 0.8888889
## [1] 0.9259259
## [1] 0.962963
## [1] 1

#plot(sums_squares~sgrid,log="xy",xlim=c(1E-3,1E-6))
# 1E-5
options(scipen=1)
paste("s is", sgrid[which(sums_squares==min(sums_squares),arr.ind=TRUE)[1]])

## [1] "s is 1e-07"

paste("mu is", mgrid[which(sums_squares==min(sums_squares),arr.ind=TRUE)[2]])

## [1] "mu is 5e-09"

expb=Bvector(sgrid[which(sums_squares==min(sums_squares),arr.ind=TRUE)[1]],obs_r,mgrid[which(sums_squares==min(sums_squares),arr.ind=TRUE)[2]],gene)
plot(til_spline$y~til_spline$x,type="l",xlim=c(0,5E-6),ylim=c(0.8,1.05),lwd=3); abline(h=1,lty=2)
lines(expb~obs_r,col="red",type="l")

Background Selection

The Basics

library stuff

genetic map

make interpolation functions

bed files of position, nucleotide diversity (FARM ~/projects/bigd/angsbigd/results)

only the genes – we’ll do distance from transcript edge for now (FARM ~/projects/bigd/angsbigd/results)

get distance (FARM ~/projects/bigd/angsbigd/results)

format for R. should be chr, start, pi, distance (FARM ~/projects/bigd/angsbigd/results)

load distances

get avg. number NS sites in gene

average length of gene

So gene structure is 250(1000)250(1000)250(1000)250

Fit B