Load in some stuff we’ll need

library(data.table)
library(ggplot2)
library(dplyr)
library(reshape2)

First, we call SNPs. Here’s the slurm script I used:

#!/bin/bash -l 
#SBATCH -D /home/jri/projects/ecl298
#SBATCH -J mpileup
#SBATCH -o /home/jri/out-%j.txt
#SBATCH -e /home/jri/error-%j.txt
#SBATCH --array=1-12

chr=$SLURM_ARRAY_TASK_ID

bamdir="/group/jrigrp5/ECL298/students/"
samtools mpileup -r $chr -uf $bamdir/Oryza_indica.ASM465v1.24.dna.genome.fa.gz $(ls $bamdir/*bam) | bcftools view - > all_og."$chr".try.raw.vcf

Then I estimated \(F_{ST}\) using vcftools in this slrum script:

#!/bin/bash -l
#SBATCH -D /home/jri/projects/ecl298
#SBATCH -J weir_fst
#SBATCH -o /home/jri/out-%j.txt
#SBATCH -e /home/jri/error-%j.txt
#SBATCH --array=1-12

chr=$SLURM_ARRAY_TASK_ID

vcftools --vcf all_og."$chr".renamed.vcf --weir-fst-pop allopatric --weir-fst-pop sympatric --fst-window-size 1000 --fst-window-step 500 --out allo_sym."$chr".fst.txt
vcftools --vcf all_og."$chr".renamed.vcf --weir-fst-pop allopatric --weir-fst-pop rice --fst-window-size 1000 --fst-window-step 500 --out allo_rice."$chr".fst.txt
vcftools --vcf all_og."$chr".renamed.vcf --weir-fst-pop sympatric  --weir-fst-pop rice --fst-window-size 1000 --fst-window-step 500 --out sym_rice."$chr".fst.txt

Note that both of these scripts take advantage of array jobs on slurm to launch 12 jobs, one for each chromosome. I then concatenated the chromosomes together to make individual \(F_{ST}\) files for each comparison.

Calculate genome-wide Fst

Here I open the \(F_{ST}\) files, combine.

#setwd
setwd("~/projects/ecl298/")

#read in 3 files of data
sr<-fread("sym_rice.fst",header=T)
ar<-fread("allo_rice.fst",header=T)
as<-fread("allo_sym.fst",header=T)

#change col names
sr<-mutate(sr,TYPE=factor(rep("SR",length(sr$CHROM))))
ar<-mutate(ar,TYPE=factor(rep("AR",length(ar$CHROM))))
as<-mutate(as,TYPE=factor(rep("AS",length(as$CHROM))))

#merge to make one big file
all<-rbind(sr,ar,as)

# put the chromosomes in the good order: 1-12
all$CHROM <- factor(all$CHROM,levels=c(1:12))

#remove negative values
all$WEIGHTED_FST[which(all$WEIGHTED_FST<0)]=0 #Weir's $F_ST$ can give values <0 so we reset to 0

Plot \(F_{ST}\) (on one chromosome for show, so as not to kill my computer):

#fix distance to be in Mb
all$BIN_START=all$BIN_START/1E6
all$BIN_END=all$BIN_END/1E6

#plot fst for each chromosome separately
#for(i in 12){
i=12
fst<-ggplot(subset(all,as.numeric(all$CHROM)==i)) + 
geom_smooth(aes(x=BIN_START,y=WEIGHTED_FST,color=TYPE),method=loess, span=0.01) + 
#facet_wrap(~ CHROM,ncol=2,scales="free_x") + # seperate plots for each chromosome, stretch smaller chromosomes
ggtitle("O. sativa (R), Allopatric (A) and Sympatric (S) O. glumaepatula") +
xlab(paste("Mb Chromosome ",i,sep="") ) + 
ylab(expression(F[ST])) + 
theme_bw() 

# save the plot to .png file
#png(paste("~/Desktop/fst.",i,".png",sep=""),750,500)
print(fst)

#dev.off()
#}

Figures for all 12 chromosomes look like:

Find outliers

Make wide version for easy outlier finding. Find outliers. Here I am arbitrarily identifying as an outlier anything in the bottom 10% of the empirical distribution of sympatric-rice \(F_{ST}\) and in the top 10% of the allopatric-sympatric \(F_{ST}\). Lots of other cutoffs would work fine too.

#go from long table to wide
all.wide<-dcast(all,CHROM+BIN_START+BIN_END~TYPE,value.var="WEIGHTED_FST")

#add column marking outliers using some criteria. here bottom 10% sympatric-rice AND top 10% sympatric-allopatric
all.wide<-mutate(all.wide,outlier=(SR<=quantile(SR,0.1,na.rm=T) & AS > quantile(AS,0.9,na.rm=T)))

Here we simply select those windows, write to file.

#get outliers, write to bedfile
outs<-subset(all.wide,all.wide$outlier==T)
write.table(file="outliers.bed",cbind(outs$CHROM,outs$BIN_START,outs$BIN_END),sep="\t",quote=F,row.names=F,col.names=F)

Now we use bedtools and a sorted gff of the Oryza genome to find the gene names and write those to a file. All the cutting and grepping isn’t necessary, but I’m too lazy to do things by hand.

#pipe lets me send output of unix commandline to an R object!
gene_pipe=pipe("~/src/bedtools2/bin/bedtools intersect -a ~/outliers.bed -b Oryza_sativa.IRGSP-1.0.25.sorted.gff3 -wb | cut -f 6,12 | grep ^gene | cut -f 2 | cut -d ';' -f 1 | cut -d ':' -f 2 | sort -n | uniq")
gene_list=scan(gene_pipe,what="character")
write.table(file="outlier_loci.txt",gene_list,quote=F,row.names=F,col.names=F)
head(gene_list)

## character(0)

fsts_rice

Calculate genome-wide Fst

Find outliers