OutFLANK performance on Cichlid Data

2017-09-14

Packages and Data

#devtools::install_github("privefl/bigsnpr")
library(OutFLANK)  # outflank package
## Loading required package: qvalue
library(bigsnpr)   # package for LD pruning
## Loading required package: bigstatsr
library(bigstatsr) # package for LD pruning 

cic_rds<-readRDS("cichlid_data/cichlid.rds") # Note setwd and change path accordingly
cic_met<-read.csv("cichlid_data/cichlid_sample_metadata.csv") # Note setwd and change path accordingly

cic_rds$chromosome_int<-as.integer(as.factor(cic_rds$chromosome))
SNPdata <- t(cic_rds$G)
locinames <- paste(as.integer(cic_rds$fishing_pressure), cic_rds$position, sep="_")
locinames_update <- data.frame(a = cic_rds$chromosome_int, b = cic_rds$position)

OutFLANK analysis with all SNPs

FstDataFrame <- MakeDiploidFSTMat(SNPdata,locinames,cic_met$fishing_pressure)
## Calculating FSTs, may take a few minutes...
## [1] "10000 done of 54869"
## [1] "20000 done of 54869"
## [1] "30000 done of 54869"
## [1] "40000 done of 54869"
## [1] "50000 done of 54869"
FstDataFrame$PlotPos <- 1:nrow(FstDataFrame)
FstDataFrame$Chrom<- cic_rds$chromosome_int

## FST Correlation
plot(FstDataFrame$FST, FstDataFrame$FSTNoCorr, 
     xlim=c(-0.01,0.3), ylim=c(-0.01,0.3),
     pch=20)
abline(0,1)

k <- 3 ## Number of pops 
out_ini <- OutFLANK(FstDataFrame, NumberOfSamples=k) ## Run outflank on FST dataframe

# Plot results to compare chi-squared distribution vs. actual FST distribution
OutFLANKResultsPlotter(out_ini, withOutliers = TRUE,
                       NoCorr = TRUE, Hmin = 0.1, binwidth = 0.001, Zoom =
                         FALSE, RightZoomFraction = 0.05, titletext = NULL)

## Poor fit, particularly on right tail
OutFLANKResultsPlotter(out_ini, withOutliers = TRUE,
                       NoCorr = TRUE, Hmin = 0.1, binwidth = 0.001, Zoom =
                         TRUE, RightZoomFraction = 0.15, titletext = NULL)

# Histogram of P-values weird
hist(out_ini$results$pvaluesRightTail,breaks = 20)

#### LD Pruning ####
G<-add_code256(big_copy(t(cic_rds$G),type="raw"),code=bigsnpr:::CODE_012)
## Warning: Assignment will down cast from double to raw.
## Hint: To remove this warning, use options(bigstatsr.typecast.warning = FALSE).
newpc<-snp_autoSVD(G=G,infos.chr =cic_rds$chromosome_int,infos.pos = cic_rds$position)
## Phase of clumping at r2 > 0.2.. keep 10811 SNPs.
## 
## Iteration 1:
## Computing SVD..
## 10 long-range LD regions were detected..
## 
## Iteration 2:
## Computing SVD..
## 
## Converged!
plot(newpc) # 3 k groups

which_pruned <- attr(newpc, which="subset") # Indexes of remaining SNPS after pruning


#### Evaluating OutFLANK with pruned data ####
out_trim <- OutFLANK(FstDataFrame[which_pruned,], NumberOfSamples=3)
head(out_trim$results)
##    LocusName         He           FST            T1         T2   FSTNoCorr
## 1     1_3190 0.02631111 -0.0106245867 -1.402192e-04 0.01319762 0.009626817
## 2     1_3203 0.08897778  0.0036302230  1.627347e-04 0.04482773 0.022694379
## 5     1_3311 0.02631111  0.0163152439  2.172200e-04 0.01331393 0.035663338
## 9     1_3340 0.03920000  0.0027356447  5.402162e-05 0.01974731 0.022401055
## 13    1_3376 0.15831111  0.0411934201  3.325202e-03 0.08072168 0.058098202
## 15    1_3424 0.24080000  0.0006484586  7.864950e-05 0.12128685 0.021967405
##        T1NoCorr   T2NoCorr meanAlleleFreq PlotPos Chrom indexOrder GoodH
## 1  0.0001270551 0.01319804     0.98666667       1     1          1  lowH
## 2  0.0010173685 0.04482910     0.04666667       2     1          2  lowH
## 5  0.0004748338 0.01331434     0.98666667       5     1          3  lowH
## 9  0.0004423745 0.01974793     0.98000000       9     1          4  lowH
## 13 0.0046899113 0.08072386     0.91333333      13     1          5 goodH
## 15 0.0026644483 0.12129099     0.86000000      15     1          6 goodH
##      qvalues   pvalues pvaluesRightTail OutlierFlag
## 1  0.9999584 0.4400478        0.7799761       FALSE
## 2  0.9999584 0.9743826        0.5128087       FALSE
## 5  0.9999584 0.6580625        0.3290313       FALSE
## 9  0.9999584 0.9642785        0.5178607       FALSE
## 13 0.9999584 0.2966123        0.1483061       FALSE
## 15 0.9999584 0.9491832        0.5254084       FALSE
OutFLANKResultsPlotter(out_trim, withOutliers = TRUE,
                       NoCorr = TRUE, Hmin = 0.1, binwidth = 0.001, Zoom =
                         FALSE, RightZoomFraction = 0.15, titletext = NULL)

OutFLANKResultsPlotter(out_trim, withOutliers = TRUE,
                       NoCorr = TRUE, Hmin = 0.1, binwidth = 0.001, Zoom =
                         TRUE, RightZoomFraction = 0.10, titletext = NULL)

# Decent distribution fit, no trimming needed. 
hist(out_trim$results$pvaluesRightTail,breaks = 20)

  # still a conservative histogram, which is typical of outflank

top_candidates <- out_trim$results$qvalues<0.01 
# outliers identified conservatively based on qvalue less than 0.01

Plotting

# Plot FST for all SNPs (pre-pruning)
plot(FstDataFrame$PlotPos, FstDataFrame$FST,
     col=grey((FstDataFrame$Chrom%%2+1)/3),
     pch=20, main = "Pre-pruning (with post-pruned outliers)")
points(out_trim$results$PlotPos[top_candidates],  out_trim$results$FST[top_candidates], 
       pch=21, cex=2, col=2)

# Simple manhattan plot with only indexed SNPs 
# Note this is does not represent positions on the contigs, simply a way of roughly visuallizing
# relative position of outlier SNPs at each of the three contigs
plot(out_trim$results$PlotPos, out_trim$results$FST,
     col=grey(((locinames_update[which_pruned,]$a)%%2+1)/3),
     pch=20, main="Post-pruning")
# Circle outlier SNPs 
points(out_trim$results$PlotPos[top_candidates],  out_trim$results$FST[top_candidates], 
       pch=21, cex=2, col=2)