Lock and load

library(cowplot)
library(dplyr)
library(data.table)
library(tidyr)
library(rtracklayer)
cbPalette <- c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

Get data

  1. read file
  2. add column for taxa
  3. ditch chr,index columns
  4. merge
  5. calculate nucleotide diversity per bp., megabases
teo<-fread("~/Desktop/ecl243_teo.thetas.graph.me") %>% mutate(taxa="teo") %>% select(3:15)
mz<-fread("~/Desktop/ecl243.thetas.graph.me") %>% mutate(taxa="mz") %>% select(3:15)
dat<-rbind(teo,mz) %>% mutate(pi=tP/nSites,Mb=WinCenter/1E6) 

Plot maize nucleotide diversity

filter(dat,taxa=="mz") %>% 
  ggplot(aes(y=pi,x=Mb))+
    geom_point(color=cbPalette[1])+
    geom_smooth(method="loess",span=0.1,color=cbPalette[1])+
    ggtitle("maize nucleotide diversity")+
    xlab("Mb") + 
    ylab(expression(hat(theta[pi])))+
    theme(axis.text=element_text(size=14), plot.title=element_text(size=18),axis.title=element_text(size=18,face="bold")) 

Filter for wonky outliers in windwos with few bp

filter(dat,taxa=="mz",nSites>1000) %>% 
  ggplot(aes(y=pi,x=Mb))+
  geom_point(color=cbPalette[1])+
  geom_smooth(method="loess",span=0.1,color=cbPalette[1])+
    ggtitle("maize nucleotide diversity ( > 1Kb sequence per window)")+
    xlab("Mb") + 
    ylab(expression(hat(theta[pi])))+
    theme(axis.text=element_text(size=14), plot.title=element_text(size=18),axis.title=element_text(size=18,face="bold")) 

Now look at Tajima’s D

filter(dat,taxa=="mz",nSites>1000) %>% 
  ggplot(aes(y=Tajima,x=Mb))+
    geom_point(color=cbPalette[3])+
    geom_smooth(method="loess",span=0.1,color=cbPalette[3])+
    ggtitle("maize Tajima's D")+
    xlab("Mb") + 
    ylab("Tajima D")+
    theme(axis.text=element_text(size=14), plot.title=element_text(size=18),axis.title=element_text(size=18,face="bold")) 

Add teosinte diversity

filter(dat,nSites>1000) %>% 
  ggplot(aes(y=pi,x=Mb,color=taxa))+
    geom_point()+
    geom_smooth(method="loess",span=0.1)+
    ggtitle("compared nucleotide diversity")+
    xlab("Mb") + 
    ylab(expression(hat(theta[pi])))+
    scale_color_manual(values=cbPalette)+
    theme(axis.text=element_text(size=14), plot.title=element_text(size=18),axis.title=element_text(size=18,face="bold"),legend.position=c(0.8, 0.9)) 

And D

filter(dat,nSites>1000) %>% 
  ggplot(aes(y=Tajima,x=Mb,color=taxa))+
    geom_point()+
    geom_smooth(method="loess",span=0.1)+
    ggtitle("comparison Tajima's D")+
    xlab("Mb") + 
    ylab("Tajima D")+
    scale_color_manual(values=cbPalette[3:4])+
    theme(axis.text=element_text(size=14), plot.title=element_text(size=18),axis.title=element_text(size=18,face="bold"),legend.position=c(0.8, 0.9)) 

Relationships: Calculate \(\hat\theta_W\) and comparisons

#pi vs watterson
filter(dat,nSites>1000,taxa=="mz") %>% 
  mutate(watterson=tW/nSites) %>%
  ggplot(aes(y=watterson,x=pi))+
    geom_point()+
    ggtitle("maize nucleotide diversity")+
    ylab(expression(hat(theta[W])))+scale_y_log10()+
    xlab(expression(hat(theta[pi])))+scale_x_log10()+
    geom_smooth(method="lm")+
    theme(axis.text=element_text(size=14), plot.title=element_text(size=18),axis.title=element_text(size=18,face="bold"),legend.position=c(0.8, 0.9)) 

Identify potential “sweeps”

#do some machinations, calculate ratio of pi values
  merge(mz,teo,by="WinCenter",suffixes=c(".mz",".teo")) %>% 
  mutate(rat=(tP.mz/nSites.mz)/(tP.teo/nSites.teo),Mb=WinCenter/1E6) %>%
  filter(nSites.mz>1000,nSites.teo>1000) %>% 
  ggplot(aes(y=rat,x=Mb))+
    geom_point()+
    ggtitle("here be sweeps?")+
    ylab(expression(paste(theta[pi],"maize / ",theta[pi],"teosinte")))+
    xlab("Mb")+
    geom_smooth(method="loess",span=0.05)+
    theme(axis.text=element_text(size=14), plot.title=element_text(size=18),axis.title=element_text(size=18,face="bold"),legend.position=c(0.8, 0.9)) 

Get a gff

chr10_genes<-import.gff(con="~/Desktop/Zea_mays.AGPv3.30.chromosome.10.gff3",format="gff3") %>% 
  as("data.frame") %>% 
  mutate(start=start/1E6,end=end/1E6) %>%
  filter(start<25,start>15,type=="gene")
chr10_exons<-import.gff(con="~/Desktop/Zea_mays.AGPv3.30.chromosome.10.gff3",format="gff3") %>% 
  as("data.frame") %>% 
  mutate(start=start/1E6,end=end/1E6) %>%
  filter(start<25,start>15,type=="exon")

Plot with genes, zoomed in to our region

  merge(mz,teo,by="WinCenter",suffixes=c(".mz",".teo")) %>% 
  mutate(rat=(tP.mz/nSites.mz)/(tP.teo/nSites.teo),Mb=WinCenter/1E6) %>%
  filter(nSites.mz>1000,nSites.teo>1000,Mb>17.5,Mb<18) %>% 
  ggplot(aes(y=rat,x=Mb))+
    geom_point()+
    ggtitle("here be sweeps?")+
    ylab(expression(paste(theta[pi],"maize / ",theta[pi],"teosinte")))+
    xlab("Mb")+
    scale_y_log10()+xlim(c(17.5,18))+
    geom_segment(aes(x = start, y = 0.08, xend = end, yend = 0.08),data=chr10_genes,colour=rgb(0,0,0,0.25),size=5)+
    geom_segment(aes(x = start, y = 0.08, xend = end, yend = 0.08),data=chr10_exons,colour=rgb(0,0,0,1),size=10)+
    theme(axis.text=element_text(size=14), plot.title=element_text(size=18),axis.title=element_text(size=18,face="bold"),legend.position=c(0.8, 0.9)) 

Which genes?

 candidates<-merge(mz,teo,by="WinCenter",suffixes=c(".mz",".teo")) %>% 
  mutate(rat=(tP.mz/nSites.mz)/(tP.teo/nSites.teo),Mb=WinCenter/1E6) %>%
  filter(nSites.mz>1000,nSites.teo>1000,rat<quantile(rat,0.05)) 

Let’s look at one:

mygene=filter(chr10_genes,start<17.8,start>17.7)$gene_id
print(paste('http://www.maizegdb.org/gene_center/gene/', mygene,sep=""))
## [1] "http://www.maizegdb.org/gene_center/gene/GRMZM2G137510"