library(data.table)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(magrittr)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.2.3
options(scipen=999)
setwd("~/projects/b_project/")
#download v2 and format to change to v3
if(file.exists("NAM_phasedImputed_1cM_AllZeaGBSv2.3_allChrs.zip")==FALSE){
system("curl -O http://mirrors.iplantcollaborative.org/download/iplant/home/shared/panzea/genotypes/GBS/v23/NAM_phasedImputed_1cM_AllZeaGBSv
2.3_allChrs.zip")
system("unzip NAM_phasedImputed_1cM_AllZeaGBSv2.3_allChrs.zip")
}
map<-fread("NAM_phasedImputed_1cM_AllZeaGBSv2.3_allChrs.hmp.txt",header=T) %>% select(3:5)
# tweak the above file slightly, write to disk, move to AGPv3 at http://plants.ensembl.org/tools.html
ensembl<-fread("converted_gbs_map.gff.txt") %>% select(4)
newmap<- mutate(map,chr=chrom,bp=ensembl) %>% select(c(chr,bp,cm)) %>% filter(complete.cases(.)==TRUE)
chr_lengths<-scan()
301433382
## [1] 301433382
237893627
## [1] 237893627
232227970
## [1] 232227970
242029974
## [1] 242029974
217928451
## [1] 217928451
169381756
## [1] 169381756
176810253
## [1] 176810253
175347686
## [1] 175347686
157021084
## [1] 157021084
149627545
## [1] 149627545
interpolate<-list()
for(i in 1:10){
bp=c(0,chr_lengths[i])
chr=c(i,i)
cm=c(min(subset(newmap$cm,newmap$chr==i)),max(subset(newmap$cm,newmap$chr==i)))
addon<-data.frame(chr,bp,cm)
temp<-filter(newmap,chr==i)
temp<-rbind(addon,temp)
temp=temp[order(temp$bp),]
interpolate[[i]] = approxfun(temp$bp,temp$cm)
}
# bkn<-fread("BKN_diversity.txt",header=T) %>%
# filter(nSites>=10) %>% mutate(pi=tP/nSites,start=WinCenter,end=WinCenter) %>%
# select(Chr,start,end,pi)
#
# ##don't do this here, as bedtools can only work with integer positions
# #interpolate fast like a madman
# bkntemps<-list()
# for(i in 1:10){
# bkntemps[[i]]<-filter(bkn,Chr==i) %>%
# mutate(start_cm=round(interpolate[[i]](start)*1000000),end_cm=round(interpolate[[i]](end)*1000000)) %>%
# }
# bkn_new=do.call("rbind",bkntemps)
# write.table(bkn_new,file="bkn.bed",quote=F,row.name=F,col.name=F,sep="\t")
til<-fread("TIL_diversity.txt",header=T) %>%
filter(nSites>=100) %>% mutate(pi=tP/nSites,start=WinCenter-50,end=WinCenter+50) %>%
select(Chr,start,end,pi)
# /100 gives you Morgans. multiplying by 1E6 gets you integeres. adding 9E6 prevents negatives
tiltemps<-list()
for(i in 1:10){
tiltemps[[i]]<-filter(til,Chr==i) %>%
mutate(start_cm=round(interpolate[[i]](start)*1E7+9E7+1),end_cm=round(interpolate[[i]](end)*1E7+9E7+1)) %>% mutate(start=ifelse(start_cm>end_cm,end_cm,start_cm),end=ifelse(start_cm>end_cm,start_cm,end_cm)) %>%
select(Chr,start,end,pi) %>% arrange(Chr,start)
}
til_new=do.call("rbind",tiltemps)
write.table(til_new,file="til.bed",quote=F,row.name=F,col.name=F,sep="\t")
gff<-fread("Zea_mays.AGPv3.26.gff3") %>%
setnames(c("seqname","source","feature","start","end","score","strand","frame","attribute"))
bob=subset(gff,gff$seqname %in% 1:10 & gff$feature=="gene")
gff<-bob
gfftemps<-list()
for(i in 1:10){
gfftemps[[i]]<-filter(gff,seqname==i) %>%
mutate(start=round(interpolate[[i]](start)*1E7+9E7 +1),end=round(interpolate[[i]](end)*1E7+9E7+1))
}
gff_new=do.call("rbind",gfftemps)
gff_new<-mutate(gff_new,end=ifelse(end>start,end,start+1),seqname=as.numeric(seqname)) %>% arrange(seqname,start)
write.table(gff_new,file="genes_v3.gff",quote=F,row.name=F,col.name=F,sep="\t")
``` #uses -D ref so upstream numbers are negative and downstream positive
system("bedtools closest -D ref -a til.bed -b genes_v3.gff > til_closest.txt")
#bedtools closest -D ref -a bkn.bed -b genes_v3.gff > bkn_closest.txt
#cut -f 1,2,4,14 bkn_closest.txt > bkn_distance.txt
system("cut -f 1,2,4,14 til_closest.txt > til_distance.txt")
#bkn_bp<-fread("Desktop/bkn_closest.txt") %>%
# setnames(c("chr","start","pi","distance"))
#bkn_new=do.call("rbind",bkntemps)
#tils
til_data<-fread("til_distance.txt") %>%
setnames(c("chr","start","pi","distance")) %>%
mutate(dist=abs(distance/1E7)/100) # 1M to turn back to cM, 100 to turn into M
# tiltemps<-list()
# for(i in 1:10){
# tiltemps[[i]]<-filter(til_bp,chr==i) %>%
# mutate(dist=abs( interpolate[[i]](start+distance)- interpolate[[i]](start))) %>%
# select(chr,pi,dist)
# }
# til_new=do.call("rbind",tiltemps)
grep CDS Zea_mays.AGPv3.26.gff3| grep -v ^Pt | grep -v ^Mt | cut -f 4,5,9 | cut -d ":" -f 1,2 | cut -d ";" -f 1 | grep P01 | awk '{bob=($2-$1)} { print bob, $3}' | perl -ne 'while(<>){ $_=~m/(\d+)\s+\S+CDS\:(.*P01)/; $length=$1; $gene=$2; $count{$gene}++; $length{$gene}+=$length; } foreach(keys(%count)){ print "$_\t", $length{$_}*.75, "\n"}' | awk '{ sum += $2 } END { if (NR>0 ) print sum / NR }'
->791.651 length, mean 4.388924 exons
grep gene Zea_mays.AGPv3.26.gff3| grep -v ^Pt | grep -v ^Mt | cut -f 4,5,9 | cut -d ":" -f 1,2 | cut -d ";" -f 1 | grep T01 | awk '{sum+=($2-$1)} END { if (NR>0 ) print sum / NR }'
->3992.31
And average distance in cM across gene is 0.004506899
exon=1:200
syn=1:66*3
exon=exon[exon %in% syn ==FALSE]
exon2=exon+1200
exon3=exon2+1200
exon4=exon3+1200
gene=c(exon,exon2,exon3,exon4)*(0.004506899/4000)/100 #0.004506899 cM across 4000bp gene in M
#pi_t_neutral=0.0127 # from top of spline curve, not anywhere meaningul/useful
pi_t_neutral=0.9*mean(subset(til_data$pi,til_data$dist>(10^(-4))))
til_data2<-subset(til_data,til_data$distance<1E-5)
til_spline=smooth.spline(til_data2$dist,til_data2$pi/pi_t_neutral)
plot(til_spline$y~til_spline$x,type="l",xlim=c(0,5E-6),ylim=c(0.5,1.1),lwd=3); abline(h=1,lty=2)
#sub_data<-subset(til_data,til_data$dist<3E-4)
#lm( (sub_data$pi/pi_t_neutral) ~ poly(sub_data$dist, 3, raw=TRUE))
#observed data for fit
obs_r<-subset(til_spline$x,til_spline$x<1E-5)
obs_b=subset(til_spline$y,til_spline$x<1E-5)
# parameters: mutation rate and starting value for t, heterozygous selection coefficeint
mu=3E-8
#B function
B<-function(s,r,mu,gene){
exp(-1*sum( (mu*s)/(s+(r+gene)*(1-s))^2 ))
}
Bvector<-function(s,r,mu,gene){
sapply(1:length(r),function(J) exp(-1*sum((mu*s)/(s+(r[J]+gene)*(1-s))^2 )))
}
ss<-function(a,b){ sum((a-b)^2)}
#grid of possible values
bob=expand.grid(c(1,2,5),10^(-2:-10))
sgrid=bob$Var1*bob$Var2
sue=expand.grid(c(1,2,5),10^(-7:-10))
mgrid=sue$Var1*sue$Var2
sums_squares=matrix(nrow=length(sgrid),ncol=length(mgrid))
for(i in 1:length(sgrid)){
for(j in 1:length(mgrid)){
test=Bvector(sgrid[i],obs_r,mgrid[j],gene)
sums_squares[i,j]=ss(test,obs_b);
}
print(i/length(sgrid))
}
## [1] 0.03703704
## [1] 0.07407407
## [1] 0.1111111
## [1] 0.1481481
## [1] 0.1851852
## [1] 0.2222222
## [1] 0.2592593
## [1] 0.2962963
## [1] 0.3333333
## [1] 0.3703704
## [1] 0.4074074
## [1] 0.4444444
## [1] 0.4814815
## [1] 0.5185185
## [1] 0.5555556
## [1] 0.5925926
## [1] 0.6296296
## [1] 0.6666667
## [1] 0.7037037
## [1] 0.7407407
## [1] 0.7777778
## [1] 0.8148148
## [1] 0.8518519
## [1] 0.8888889
## [1] 0.9259259
## [1] 0.962963
## [1] 1
#plot(sums_squares~sgrid,log="xy",xlim=c(1E-3,1E-6))
# 1E-5
options(scipen=1)
paste("s is", sgrid[which(sums_squares==min(sums_squares),arr.ind=TRUE)[1]])
## [1] "s is 1e-07"
paste("mu is", mgrid[which(sums_squares==min(sums_squares),arr.ind=TRUE)[2]])
## [1] "mu is 5e-09"
expb=Bvector(sgrid[which(sums_squares==min(sums_squares),arr.ind=TRUE)[1]],obs_r,mgrid[which(sums_squares==min(sums_squares),arr.ind=TRUE)[2]],gene)
plot(til_spline$y~til_spline$x,type="l",xlim=c(0,5E-6),ylim=c(0.8,1.05),lwd=3); abline(h=1,lty=2)
lines(expb~obs_r,col="red",type="l")