#Identifying candidate outlier SNPs from PLINK FST data on mLynCan4_v1.p_filtered_reduced.vcf.gz We’ll be analyzing the fst_mLynCan4v1p_filtered_reduced.fst which has been downloaded locally from /project/uma_lisa_komoroske/Tanya/scripts/outliers/ folder on our project space.
This script is adopted from Giorgia Auteri’s excellent work here: https://github.com/ECOtlama/LittleBrownBats_WNSSurvivorsVsNonsurvivors And will be cited in any resulting publications
Let’s begin.
knitr::opts_chunk$set(echo = TRUE)
fdat <- read.table("/Users/tanyalama/Box Sync/project_canada_lynx_wgs/fst_mLynCan4v1p_filtered_reduced.fst",
header = TRUE,
sep = "\t") #changed header to TRUE, may turn back?
#There are some FST values that are listed as NaN, those may need to be removed.
#colnames(fdat) <- c('LocusID', 'Pop1', 'Pop2', 'CHROM', 'POS', 'Column', 'OverallPi', 'AMOVAFst', 'FishersP', 'OddsRatio', 'CILow', 'CIHigh.All', 'LOD', 'CorrectedAMOVAFst', 'SmoothedAMOVAFst', 'SmoothedAMOVAFst P-value', 'WindowSNPCount') #I'm going to skip this command, because I think the column names are pretty straight forward. Because our data came from PLINK rather than STACKS, we have fewer columns, but the colun we are most interested is named FST
#Check the dimensions of the data. The n rows should be equal to the number of SNPs in your input dataset. Here, we have 53265 rows. I know that 10,662 SNPs are listed as NaN under the FST column
dim(fdat)
## [1] 53265 5
#Most of our SNPs are in unique positions. This makes sense because we pruned clusters of SNPs >3 within 10bp of any given region
length(unique(fdat$POS))
## [1] 53258
#I was able to get the ablines to plot but am not sure what they are exactly supposed to mean...
p<-hist(fdat$FST, breaks=1000, xlim=c(0.01, 1), main="distribution of FST values mLynCan4_v1.p SNPset")
#plot(p, xlim=c(0.01, 1)) #in command line
#abline(v=0.00, col = "red") #in command line
#abline(v=0.05, col = "blue") #in command line
nomissing_fdat <- na.omit(fdat)
#this appears to have worked, let's review what was removed by running the same quality checks as above
dim(nomissing_fdat)
## [1] 42603 5
length(unique(nomissing_fdat$POS))
## [1] 42597
mFst <- mean(nomissing_fdat$FST);mFst #mean fst -- this will only run when NaN observations are dropped
## [1] 0.0647362
sdFst <- sd(nomissing_fdat$FST); sdFst #one standard deviation of Fst
## [1] 0.2245454
cutoff3 <- mFst + (sdFst * 3); cutoff3 #the cutoff for three SDs from the mean this looks more like 9
## [1] 0.7383724
summary(nomissing_fdat) #overall summary
## CHR SNP POS NMISS
## Length:42603 Min. :0 Min. : 842 Min. :13.00
## Class :character 1st Qu.:0 1st Qu.: 29954607 1st Qu.:34.00
## Mode :character Median :0 Median : 63763784 Median :35.00
## Mean :0 Mean : 74315691 Mean :34.63
## 3rd Qu.:0 3rd Qu.:109932573 3rd Qu.:36.00
## Max. :0 Max. :239965464 Max. :37.00
## FST
## Min. :-0.53991
## 1st Qu.:-0.06426
## Median :-0.03323
## Mean : 0.06474
## 3rd Qu.: 0.08758
## Max. : 1.00000
library(ggplot2)
# Generate a unique list of chromosome names
scaffolds <- unique(nomissing_fdat$CHR)
chrom.cols <- c(rep(c("#F95045" , "#3CE0FF"), floor(length(scaffolds)/2))) # create alternating colors for scaffolds
cutoff <- cutoff3 #rename cutoff
#Making highlighter lines
#selLoc <- (fdat$LocusID[fdat$AMOVAFst >= cutoff])/1000000
#highlights <- data.frame(LocusID=selLoc, AMOVAFst=c(rep(0.7, length(selLoc))))
selLoc <- (nomissing_fdat$POS[nomissing_fdat$FST >= cutoff])/1000000 #divided by 1M because we are convering the POS in bp to Mbp for easier plotting #1,169 candidates
highlights <- data.frame(POS=selLoc, FST=c(rep(0.7, length(selLoc)))) #why FST = 0.7? that doesn't make sense
ggplot(nomissing_fdat, aes(x=POS/1000000, y=(FST), colour=CHR)) +
geom_point(shape = 16, size = 0.2)+
scale_y_continuous(limits=c(0,1.1))+
#scale_color_manual(values=chrom.cols)+
xlab("Alignment position (Mbp)")+
ylab(bquote(paste(italic('F'['ST']*' ')))) +
geom_hline(yintercept = c(cutoff), linetype = "dashed", size = 0.5, color = "gray35") +
theme(legend.position="none", text = element_text(size=12), panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_line(colour = "black"))
## Warning: Removed 24529 rows containing missing values (geom_point).
#################################################################################################
#Write output file with selected loci
selectedLoci <- na.omit(nomissing_fdat[nomissing_fdat$FST >= cutoff,])
ggplot(selectedLoci, aes(x=POS/1000000, y=(FST), color=as.factor(selectedLoci$CHR))) +
geom_point(shape = 16, size = 0.6)+
scale_y_continuous(limits=c(0,1.1))+
#scale_color_manual(values=chrom.cols)+
xlab("Alignment position (Mbp)")+
ylab(bquote(paste(italic('F'['ST']*' ')))) +
geom_hline(yintercept = c(cutoff), linetype = "dashed", size = 0.5, color = "gray35") +
theme(legend.position="none", text = element_text(size=12), panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_line(colour = "black"))
write.csv(selectedLoci, "SelectedLoci.csv")