#Identifying candidate outlier SNPs from PLINK FST data on mLynCan4_v1.p_filtered_reduced.vcf.gz We’ll be analyzing the fst_mLynCan4v1p_filtered_reduced.fst which has been downloaded locally from /project/uma_lisa_komoroske/Tanya/scripts/outliers/ folder on our project space.

This script is adopted from Giorgia Auteri’s excellent work here: https://github.com/ECOtlama/LittleBrownBats_WNSSurvivorsVsNonsurvivors And will be cited in any resulting publications

Let’s begin.

  1. Read in Data:
knitr::opts_chunk$set(echo = TRUE)

fdat <- read.table("/Users/tanyalama/Box Sync/project_canada_lynx_wgs/fst_mLynCan4v1p_filtered_reduced.fst",
                   header = TRUE, 
                   sep = "\t") #changed header to TRUE, may turn back?
#There are some FST values that are listed as NaN, those may need to be removed.

#colnames(fdat) <- c('LocusID', 'Pop1', 'Pop2', 'CHROM',    'POS', 'Column',    'OverallPi',    'AMOVAFst', 'FishersP', 'OddsRatio',    'CILow',    'CIHigh.All',   'LOD',  'CorrectedAMOVAFst', 'SmoothedAMOVAFst',    'SmoothedAMOVAFst P-value', 'WindowSNPCount') #I'm going to skip this command, because I think the column names are pretty straight forward. Because our data came from PLINK rather than STACKS, we have fewer columns, but the colun we are most interested is named FST
  1. Quality checks
#Check the dimensions of the data. The n rows should be equal to the number of SNPs in your input dataset. Here, we have 53265 rows. I know that 10,662 SNPs are listed as NaN under the FST column
dim(fdat)
## [1] 53265     5
#Most of our SNPs are in unique positions. This makes sense because we pruned clusters of SNPs >3 within 10bp of any given region
length(unique(fdat$POS)) 
## [1] 53258
  1. Plot the distribution of FST values
#I was able to get the ablines to plot but am not sure what they are exactly supposed to mean...
p<-hist(fdat$FST, breaks=1000, xlim=c(0.01, 1), main="distribution of FST values mLynCan4_v1.p SNPset")

#plot(p, xlim=c(0.01, 1)) #in command line
#abline(v=0.00, col = "red") #in command line
#abline(v=0.05, col = "blue") #in command line
  1. Drop NAN observations for FST
nomissing_fdat <- na.omit(fdat)
#this appears to have worked, let's review what was removed by running the same quality checks as above
dim(nomissing_fdat)
## [1] 42603     5
length(unique(nomissing_fdat$POS)) 
## [1] 42597
  1. Calculate FST stats
mFst <- mean(nomissing_fdat$FST);mFst  #mean fst -- this will only run when NaN observations are dropped
## [1] 0.0647362
sdFst <- sd(nomissing_fdat$FST); sdFst #one standard deviation of Fst
## [1] 0.2245454
cutoff3 <- mFst + (sdFst * 3); cutoff3 #the cutoff for three SDs from the mean this looks more like 9
## [1] 0.7383724
summary(nomissing_fdat) #overall summary
##      CHR                 SNP         POS                NMISS      
##  Length:42603       Min.   :0   Min.   :      842   Min.   :13.00  
##  Class :character   1st Qu.:0   1st Qu.: 29954607   1st Qu.:34.00  
##  Mode  :character   Median :0   Median : 63763784   Median :35.00  
##                     Mean   :0   Mean   : 74315691   Mean   :34.63  
##                     3rd Qu.:0   3rd Qu.:109932573   3rd Qu.:36.00  
##                     Max.   :0   Max.   :239965464   Max.   :37.00  
##       FST          
##  Min.   :-0.53991  
##  1st Qu.:-0.06426  
##  Median :-0.03323  
##  Mean   : 0.06474  
##  3rd Qu.: 0.08758  
##  Max.   : 1.00000
  1. Plot FST values
library(ggplot2) 
# Generate a unique list of chromosome names
scaffolds <- unique(nomissing_fdat$CHR)

chrom.cols <- c(rep(c("#F95045" , "#3CE0FF"), floor(length(scaffolds)/2))) # create alternating colors for scaffolds

cutoff <- cutoff3 #rename cutoff

#Making highlighter lines
#selLoc <- (fdat$LocusID[fdat$AMOVAFst >= cutoff])/1000000
#highlights <- data.frame(LocusID=selLoc, AMOVAFst=c(rep(0.7, length(selLoc))))

selLoc <- (nomissing_fdat$POS[nomissing_fdat$FST >= cutoff])/1000000 #divided by 1M because we are convering the POS in bp to Mbp for easier plotting #1,169 candidates
highlights <- data.frame(POS=selLoc, FST=c(rep(0.7, length(selLoc)))) #why FST = 0.7? that doesn't make sense

ggplot(nomissing_fdat, aes(x=POS/1000000, y=(FST), colour=CHR)) +
  geom_point(shape = 16, size = 0.2)+
  scale_y_continuous(limits=c(0,1.1))+
  #scale_color_manual(values=chrom.cols)+
  xlab("Alignment position (Mbp)")+
  ylab(bquote(paste(italic('F'['ST']*' ')))) +
  geom_hline(yintercept = c(cutoff), linetype = "dashed", size = 0.5, color = "gray35") +
  theme(legend.position="none", text = element_text(size=12), panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
        panel.background = element_blank(), axis.line = element_line(colour = "black"))
## Warning: Removed 24529 rows containing missing values (geom_point).

#################################################################################################
#Write output file with selected loci

selectedLoci <- na.omit(nomissing_fdat[nomissing_fdat$FST >= cutoff,])

ggplot(selectedLoci, aes(x=POS/1000000, y=(FST), color=as.factor(selectedLoci$CHR))) +
  geom_point(shape = 16, size = 0.6)+
  scale_y_continuous(limits=c(0,1.1))+
  #scale_color_manual(values=chrom.cols)+
  xlab("Alignment position (Mbp)")+
  ylab(bquote(paste(italic('F'['ST']*' ')))) +
  geom_hline(yintercept = c(cutoff), linetype = "dashed", size = 0.5, color = "gray35") +
  theme(legend.position="none", text = element_text(size=12), panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
        panel.background = element_blank(), axis.line = element_line(colour = "black"))

write.csv(selectedLoci, "SelectedLoci.csv")