We’ll be analyzing the fst_circ_therm_genelist_6SV.fst which has been downloaded locally from /project/uma_lisa_komoroske/Tanya/scripts/outliers/ folder on our project space.
This dataset includes 1650 SNPs within a 54-gene list related to circadian rhythm and thermo-tolerance in lynx. This gene set was pre-identified using primary literature
This script is adopted from Giorgia Auteri’s excellent work here: https://github.com/ECOtlama/LittleBrownBats_WNSSurvivorsVsNonsurvivors And will be cited in any resulting publications
Let’s begin.
knitr::opts_chunk$set(echo = TRUE)
fdat <- read.table("/Users/tanyalama/Box/project_canada_lynx_wgs/fst_circ_therm_genelist_6SV.fst",
header = TRUE,
sep = "\t") #changed header to TRUE, may turn back?
#There are some FST values that are listed as NaN, those may need to be removed.
#colnames(fdat) <- c('LocusID', 'Pop1', 'Pop2', 'CHROM', 'POS', 'Column', 'OverallPi', 'AMOVAFst', 'FishersP', 'OddsRatio', 'CILow', 'CIHigh.All', 'LOD', 'CorrectedAMOVAFst', 'SmoothedAMOVAFst', 'SmoothedAMOVAFst P-value', 'WindowSNPCount') #I'm going to skip this command, because I think the column names are pretty straight forward. Because our data came from PLINK rather than STACKS, we have fewer columns, but the colun we are most interested is named FST
#Check the dimensions of the data. The n rows should be equal to the number of SNPs in your input dataset. Here, we have 53265 rows. I know that 10,662 SNPs are listed as NaN under the FST column
dim(fdat)
## [1] 3394 5
#Most of our SNPs are in unique positions. This makes sense because we pruned clusters of SNPs >3 within 10bp of any given region
length(unique(fdat$POS))
## [1] 3391
#I was able to get the ablines to plot but am not sure what they are exactly supposed to mean...
p<-hist(fdat$FST, breaks=1000, xlim=c(0.01, 1), main="distribution of FST values circ_therm_genelist_6SV SNPset")
#plot(p, xlim=c(0.01, 1)) #in command line
#abline(v=0.00, col = "red") #in command line
#abline(v=0.05, col = "blue") #in command line
nomissing_fdat <- na.omit(fdat)
#this appears to have worked, let's review what was removed by running the same quality checks as above
dim(nomissing_fdat)
## [1] 1650 5
length(unique(nomissing_fdat$POS))
## [1] 1650
mFst <- mean(nomissing_fdat$FST);mFst #mean fst -- this will only run when NaN observations are dropped
## [1] 0.07099431
sdFst <- sd(nomissing_fdat$FST); sdFst #one standard deviation of Fst
## [1] 0.3585792
cutoff2 <- mFst + (sdFst * 2); cutoff2 #the cutoff for two SDs from the mean. This had be to changed from 3 sd's because that was >1 (limit of FST). Why is our sd so high?
## [1] 0.7881527
summary(nomissing_fdat) #overall summary
## CHR SNP POS NMISS
## NC_044308.1:356 .:1650 Min. : 445287 Min. : 4.000
## NC_044318.1:317 1st Qu.: 19224754 1st Qu.: 8.000
## NC_044306.1:169 Median : 43966496 Median : 9.000
## NC_044316.1:130 Mean : 57905452 Mean : 8.738
## NC_044309.1:122 3rd Qu.: 82658222 3rd Qu.: 9.000
## NC_044310.1:114 Max. :218406254 Max. :10.000
## (Other) :442
## FST
## Min. :-1.50000
## 1st Qu.:-0.16245
## Median : 0.00000
## Mean : 0.07099
## 3rd Qu.: 0.24848
## Max. : 1.00000
##
library(ggplot2)
# Generate a unique list of chromosome names
scaffolds <- unique(nomissing_fdat$CHR)
chrom.cols <- c(rep(c("#F95045" , "#3CE0FF"), floor(length(scaffolds)/2))) # create alternating colors for scaffolds
cutoff <- cutoff2 #rename cutoff
#Making highlighter lines
#selLoc <- (fdat$LocusID[fdat$AMOVAFst >= cutoff])/1000000
#highlights <- data.frame(LocusID=selLoc, AMOVAFst=c(rep(0.7, length(selLoc))))
selLoc <- (nomissing_fdat$POS[nomissing_fdat$FST >= cutoff])/1000000 #divided by 1M because we are convering the POS in bp to Mbp for easier plotting #1,169 candidates
highlights <- data.frame(POS=selLoc, FST=c(rep(0.7, length(selLoc)))) #why FST = 0.7? that doesn't make sense
This is a plot of the distribution of SNPs based on Fst and Het, color=chromosome, SNPs above the dotted line are outliers
ggplot(nomissing_fdat, aes(x=POS/1000000, y=(FST), colour=CHR)) +
geom_point(shape = 16, size = 0.8)+
scale_y_continuous(limits=c(0,1.1))+
#scale_color_manual(values=chrom.cols)+
xlab("Alignment position (Mbp)")+
ylab(bquote(paste(italic('F'['ST']*' ')))) +
geom_hline(yintercept = c(cutoff), linetype = "dashed", size = 0.5, color = "gray35") +
theme(legend.position="none", text = element_text(size=12), panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_line(colour = "black"))
## Warning: Removed 825 rows containing missing values (geom_point).
selectedLoci <- na.omit(nomissing_fdat[nomissing_fdat$FST >= cutoff,])
ggplot(selectedLoci, aes(x=POS/1000000, y=(FST), color=as.factor(selectedLoci$CHR))) +
geom_point(shape = 16, size = 1)+
scale_y_continuous(limits=c(0,1.1))+
#scale_color_manual(values=chrom.cols)+
xlab("Alignment position (Mbp)")+
ylab(bquote(paste(italic('F'['ST']*' ')))) +
geom_hline(yintercept = c(cutoff), linetype = "dashed", size = 0.5, color = "gray35") +
theme(legend.position="none", text = element_text(size=12), panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_line(colour = "black"))
write.csv(selectedLoci, "/Users/tanyalama/Box/project_canada_lynx_wgs/SelectedLoci_circ_therm_genelist_6SV.csv")