Evaluate all on sim1a

 # install.packages("devtools")
  library(devtools)

## Warning: package 'devtools' was built under R version 3.4.1

#  source("http://bioconductor.org/biocLite.R")
#  biocLite("qvalue")
#  install_github("whitlock/OutFLANK")
  library(OutFLANK)

## Loading required package: qvalue

  if(!("adegenet" %in% installed.packages())){install.packages("adegenet")}

  library(vcfR)

## 
##    *****       ***   vcfR   ***       *****
##    This is vcfR 1.5.0 
##      browseVignettes('vcfR') # Documentation
##      citation('vcfR') # Citation
##    *****       *****      *****       *****

  library(adegenet)

## Loading required package: ade4

## 
##    /// adegenet 2.0.1 is loaded ////////////
## 
##    > overview: '?adegenet'
##    > tutorials/doc/questions: 'adegenetWeb()' 
##    > bug reports/feature requests: adegenetIssues()

  library(pcadapt)
  library(lfmm)
  library(LEA)

### Read in edited VCF file and convert to outFLANK format 

  ind <- read.table("outputIndAll.txt", header=TRUE)
  muts <- read.table("OutputMuts_Detail.txt", header=TRUE)
  sim <- system("grep recomb outputSim.txt", TRUE)
  
  vcf <- read.vcfR("vcf_sim1a_contest.vcf.gz")

## 
Meta line 13 read in.
## All meta lines processed.
## Character matrix gt created.
## Character matrix gt rows: 5940
## Character matrix gt cols: 1009
## skip: 0
## nrows: 5940
## row_num: 0
## 
## 
Processed variant 1000
Processed variant 2000
Processed variant 3000
Processed variant 4000
Processed variant 5000
Processed variant: 5940
## All variants processed

  recom <- matrix(unlist(strsplit(sim, " ")), ncol=2)
  colnames(recom) <- recom[1,]
  recom <- recom[-1,]
  recom <- apply(recom, 2, as.numeric)

  muts

##    position    selCoef originGen type   freq   pa2 prop count
## 1      3200  0.0989896      3856   m2 0.2335 0.002 0.02  TRUE
## 2     10589 -0.1296740         4   m2 1.0000 0.000 0.00 FALSE
## 3     17593 -0.0991819       238   m2 0.1970 0.002 0.02  TRUE
## 4     18862 -1.2885800      3997   m2 0.0005 0.001 0.01  TRUE
## 5     19341  0.2009850      3994   m2 0.0005 0.000 0.00 FALSE
## 6     21117 -0.7836390      4000   m2 0.0005 0.000 0.00 FALSE
## 7     21929 -0.3388470      2119   m2 0.2415 0.021 0.21  TRUE
## 8     26152 -0.1577500      3997   m2 0.0015 0.000 0.00 FALSE
## 9     27702  0.1191710      3004   m2 0.5325 0.004 0.04  TRUE
## 10    30094  0.0959116      3815   m2 0.0145 0.000 0.00 FALSE
## 11    30729 -0.7890840      3997   m2 0.0015 0.001 0.01  TRUE
## 12    34180  0.0188577      2050   m2 0.4735 0.000 0.00 FALSE
## 13    36194  0.1990460      4000   m2 0.0005 0.000 0.00 FALSE
## 14    37385  0.1180380      2991   m2 0.0140 0.000 0.00 FALSE
## 15    81503 -1.0921700      4000   m2 0.0005 0.001 0.01  TRUE
## 16    81696  0.0594199       712   m2 0.7535 0.001 0.01  TRUE
## 17    81730  0.5315880         3   m2 0.3000 0.059 0.59  TRUE
## 18    85118 -0.1463480       360   m2 0.3095 0.005 0.05  TRUE
## 19    85955 -0.0500430        27   m2 1.0000 0.000 0.00 FALSE
## 20    87082  0.6965490      3993   m2 0.0005 0.000 0.00 FALSE
## 21    88848 -0.5934370      4000   m2 0.0005 0.000 0.00 FALSE
## 22    90699 -0.2599380      3999   m2 0.0010 0.000 0.00 FALSE
## 23    96270 -0.0208146      3677   m2 0.1755 0.000 0.00 FALSE
## 24    97530  0.5923690      4000   m2 0.0005 0.000 0.00 FALSE
## 25    99045 -0.7824100      3995   m2 0.0015 0.001 0.01  TRUE
## 26   100803  0.1659520      3739   m2 0.0870 0.002 0.02  TRUE
## 27   101009  0.3074200      4000   m2 0.0005 0.000 0.00 FALSE
## 28   104069 -0.0453107      3849   m2 0.0050 0.000 0.00 FALSE
## 29   106773  0.2700430      3994   m2 0.0020 0.000 0.00 FALSE
## 30   109888 -0.4721010      3998   m2 0.0005 0.000 0.00 FALSE
## 31   112932 -0.1638910      4000   m2 0.0005 0.000 0.00 FALSE
## 32   116833 -0.4205310      3995   m2 0.0005 0.000 0.00 FALSE
## 33   117820  0.5029660      3986   m2 0.0005 0.000 0.00 FALSE
## 34   186395  0.0340453      2889   m4 1.0000 0.000 0.00  TRUE
## 35   186895  0.0490189      3996   m4 0.0010 0.000 0.00 FALSE
## 36   187265  0.0049572      3940   m4 0.0355 0.000 0.00 FALSE

  muts_count <- muts[muts$count>0,]
  muts_count

##    position    selCoef originGen type   freq   pa2 prop count
## 1      3200  0.0989896      3856   m2 0.2335 0.002 0.02  TRUE
## 3     17593 -0.0991819       238   m2 0.1970 0.002 0.02  TRUE
## 4     18862 -1.2885800      3997   m2 0.0005 0.001 0.01  TRUE
## 7     21929 -0.3388470      2119   m2 0.2415 0.021 0.21  TRUE
## 9     27702  0.1191710      3004   m2 0.5325 0.004 0.04  TRUE
## 11    30729 -0.7890840      3997   m2 0.0015 0.001 0.01  TRUE
## 15    81503 -1.0921700      4000   m2 0.0005 0.001 0.01  TRUE
## 16    81696  0.0594199       712   m2 0.7535 0.001 0.01  TRUE
## 17    81730  0.5315880         3   m2 0.3000 0.059 0.59  TRUE
## 18    85118 -0.1463480       360   m2 0.3095 0.005 0.05  TRUE
## 25    99045 -0.7824100      3995   m2 0.0015 0.001 0.01  TRUE
## 26   100803  0.1659520      3739   m2 0.0870 0.002 0.02  TRUE
## 34   186395  0.0340453      2889   m4 1.0000 0.000 0.00  TRUE

  # Get indexes of causal loci
  all_loci <- as.numeric(vcf@fix[,"POS"])
  
  all_loci_sorted <- sort(all_loci)
  index <- which(all_loci_sorted %in% muts_count$position) #should line up
  all_loci_sorted[index]

##  [1]   3200  17593  18862  21929  27702  30729  81503  81696  81730  85118
## [11]  99045 100803

  muts_count$position

##  [1]   3200  17593  18862  21929  27702  30729  81503  81696  81730  85118
## [11]  99045 100803 186395

  muts_count$index <- c(index,NA) # add NA for the m4 mutation
  
  ymax <- 10^5
  add_layers <- function(){
    #QTL 1
    polygon(c(0,40000, 40000, 0), c(-ymax,-ymax, ymax, ymax), col = adjustcolor("blue", 0.1), border = rgb(0,0,0,0))
    #QTL 2
    polygon(c(80000,120000, 120000,80000), c(-ymax,-ymax, ymax, ymax), col = adjustcolor("blue", 0.1), border = rgb(0,0,0,0))
    
    abline(v = muts_count$position[muts_count$type=="m2"], col="black", lwd=muts_count$prop*20)
    
    # low recom w/ background selection
    polygon(c(54001, 64020, 64020, 54001), c(-ymax,-ymax, ymax, ymax), col = adjustcolor("red", 0.2), border = rgb(0,0,0,0))
    
    # low recom only
    polygon(c(220424, 230443, 230443, 220424), c(-ymax,-ymax, ymax, ymax), col = adjustcolor("red", 0.1), border = rgb(0,0,0,0))    
    
    # Selective Sweep
    polygon(c(185941, 189210, 189210, 185941), c(-ymax,-ymax, ymax, ymax), col = adjustcolor("purple", 0.1), border = rgb(0,0,0,0))   
    abline(v = muts_count$position[muts_count$type=="m4"], col="purple", lwd=2)
  }
  
    plot(x=c(0,240000), y=c(0,0),ylim=c(0,1))
    add_layers()

Conversion script from VCF to Outflank/LFMM/PCAdapt format

NOTE: This has not been thouroughly tested.

### Load the vcfR package 
require(vcfR)

obj.vcfR <- read.vcfR("vcf_sim1a_contest.vcf.gz")

## 
Meta line 13 read in.
## All meta lines processed.
## Character matrix gt created.
## Character matrix gt rows: 5940
## Character matrix gt cols: 1009
## skip: 0
## nrows: 5940
## row_num: 0
## 
## 
Processed variant 1000
Processed variant 2000
Processed variant 3000
Processed variant 4000
Processed variant 5000
Processed variant: 5940
## All variants processed

# NB: Creates a vcfR object (stored in RAM) which size is twice as big as the original vcf file. So when dealing with large data, make sure you have enough RAM space available before proceeding.

geno <- extract.gt(obj.vcfR) # Character matrix containing the genotypes
position <- getPOS(obj.vcfR) # Positions in bp
chromosome <- getCHROM(obj.vcfR) # Chromosome information

G <- matrix(NA, nrow = nrow(geno), ncol = ncol(geno))

G[geno %in% c("0/0", "0|0")] <- 0
G[geno  %in% c("0/1", "1/0", "1|0", "0|1")] <- 1
G[geno %in% c("1/1", "1|1")] <- 2

rds <- list(G = G, position = position, chromosome = chromosome)
saveRDS(rds, "sim1a.rds") 

# NB: About RDS 
# - Pros:
# Can store objects of different class
# Binary format so RDS files are very light
# - Cons: 
# Not human readable
# Not "any other language" readable, can be used only with R

OutFLANK

  training<-readRDS("inputs/sim1a.rds")
  dim(training$G) #1000 individuals in columns

## [1] 5940 1000

  gen_table <- t(training$G)
  locusPos <- all_loci_sorted
 
  dipmat <- MakeDiploidFSTMat(gen_table, locusPos, training$pop)

## Calculating FSTs, may take a few minutes...

  head(dipmat)

##   LocusName        He          FST            T1           T2  FSTNoCorr
## 1        21 0.0324555  0.011320095  1.838491e-04 0.0162409509 0.03013683
## 2       109 0.0009995 -0.006412466 -3.205650e-06 0.0004999091 0.01340827
## 3       133 0.0019980  0.007739770  7.737595e-06 0.0009997190 0.02700356
## 4       231 0.0591395  0.009521405  2.817581e-04 0.0295920765 0.02812038
## 5       290 0.0314880  0.010204201  1.607806e-04 0.0157563125 0.02907417
## 6       338 0.0958995  0.027598488  1.325005e-03 0.0480100581 0.04507415
##       T1NoCorr     T2NoCorr meanAlleleFreq
## 1 4.894857e-04 0.0162421105         0.9835
## 2 6.703422e-06 0.0004999467         0.9995
## 3 2.699795e-05 0.0009997921         0.9990
## 4 8.321991e-04 0.0295941651         0.9695
## 5 4.581346e-04 0.0157574408         0.9840
## 6 2.164156e-03 0.0480132420         0.9495

  hist(dipmat$FST, breaks=100)

  hist(dipmat$He[dipmat$He>0.1])

  plot(dipmat$FST, dipmat$FSTNoCorr)

  hist(dipmat$FSTNoCorr-dipmat$FST, breaks=100)

  plot(dipmat$He, dipmat$FST, pch=20, col="grey")

  hist(dipmat$FSTNoCorr, breaks=seq(0,0.3, by=0.001))

  hist(dipmat$FSTNoCorr[dipmat$He>0.05], breaks=seq(0,0.3, by=0.001))

  hist(dipmat$FSTNoCorr[dipmat$He>0.1], breaks=seq(0,0.3, by=0.001))

  quantile(dipmat$FSTNoCor, probs = c(0.99, 0.95, 0.9, 0.85), na.rm=TRUE)

##        99%        95%        90%        85% 
## 0.08912094 0.05720492 0.04848406 0.04403053

  quantile(dipmat$FSTNoCor, probs = c(0.05, 0.1, 0.15, 0.2, 0.3, 0.4), na.rm=TRUE)

##         5%        10%        15%        20%        30%        40% 
## 0.01340827 0.01558282 0.01775073 0.02008759 0.02419584 0.02786597

  outlier = OutFLANK(dipmat,NumberOfSamples = 39)

    OutFLANKResultsPlotter(outlier, withOutliers = TRUE,
NoCorr = TRUE, Hmin = 0.1, binwidth = 0.001, Zoom =
FALSE, RightZoomFraction = 0.05, titletext = NULL)

  outlier = OutFLANK(dipmat,NumberOfSamples = 39, 
                     RightTrimFraction = 0.05, LeftTrimFraction = 0.40,
                     qthreshold = 0.05, Hmin = 0.1)
    # increasing the Right Trim Fraction doesn't help
    # in this case we need to increase the left trim fraction because of the large number of rare alleles in the data (even though these are excluded in the algorithm, the are included in the calculation of the trim points)
  OutFLANKResultsPlotter(outlier, withOutliers = TRUE,
NoCorr = TRUE, Hmin = 0.1, binwidth = 0.001, Zoom =
FALSE, RightZoomFraction = 0.05, titletext = NULL)

  hist(outlier$results$pvaluesRightTail)

  hist(outlier$results$qvalues)

  str(outlier)

## List of 6
##  $ FSTbar               : num 0.0174
##  $ FSTNoCorrbar         : num 0.0363
##  $ dfInferred           : num 32.4
##  $ numberLowFstOutliers : int 0
##  $ numberHighFstOutliers: int 142
##  $ results              :'data.frame':   5940 obs. of  15 variables:
##   ..$ LocusName       : num [1:5940] 21 109 133 231 290 338 364 400 424 513 ...
##   ..$ He              : num [1:5940] 0.032455 0.000999 0.001998 0.059139 0.031488 ...
##   ..$ FST             : num [1:5940] 0.01132 -0.00641 0.00774 0.00952 0.0102 ...
##   ..$ T1              : num [1:5940] 1.84e-04 -3.21e-06 7.74e-06 2.82e-04 1.61e-04 ...
##   ..$ T2              : num [1:5940] 0.0162 0.0005 0.001 0.0296 0.0158 ...
##   ..$ FSTNoCorr       : num [1:5940] 0.0301 0.0134 0.027 0.0281 0.0291 ...
##   ..$ T1NoCorr        : num [1:5940] 4.89e-04 6.70e-06 2.70e-05 8.32e-04 4.58e-04 ...
##   ..$ T2NoCorr        : num [1:5940] 0.0162 0.0005 0.001 0.0296 0.0158 ...
##   ..$ meanAlleleFreq  : num [1:5940] 0.984 1 0.999 0.97 0.984 ...
##   ..$ indexOrder      : int [1:5940] 1 2 3 4 5 6 7 8 9 10 ...
##   ..$ GoodH           : Factor w/ 2 levels "goodH","lowH": 2 2 2 2 2 2 1 1 2 2 ...
##   ..$ qvalues         : num [1:5940] 1 1 1 1 1 ...
##   ..$ pvalues         : num [1:5940] 0.519782 0.000805 0.293219 0.36687 0.436286 ...
##   ..$ pvaluesRightTail: num [1:5940] 0.74 1 0.853 0.817 0.782 ...
##   ..$ OutlierFlag     : logi [1:5940] FALSE FALSE FALSE FALSE FALSE FALSE ...

  sum(outlier$results$pvaluesRightTail < 0.001, na.rm=TRUE)

## [1] 137

  sum(outlier$results$qvalues<0.01, na.rm=TRUE)

## [1] 101

OutFLANK eval

  head(outlier$results)

##   LocusName        He          FST            T1           T2  FSTNoCorr
## 1        21 0.0324555  0.011320095  1.838491e-04 0.0162409509 0.03013683
## 2       109 0.0009995 -0.006412466 -3.205650e-06 0.0004999091 0.01340827
## 3       133 0.0019980  0.007739770  7.737595e-06 0.0009997190 0.02700356
## 4       231 0.0591395  0.009521405  2.817581e-04 0.0295920765 0.02812038
## 5       290 0.0314880  0.010204201  1.607806e-04 0.0157563125 0.02907417
## 6       338 0.0958995  0.027598488  1.325005e-03 0.0480100581 0.04507415
##       T1NoCorr     T2NoCorr meanAlleleFreq indexOrder GoodH   qvalues
## 1 4.894857e-04 0.0162421105         0.9835          1  lowH 0.9999488
## 2 6.703422e-06 0.0004999467         0.9995          2  lowH 0.9999488
## 3 2.699795e-05 0.0009997921         0.9990          3  lowH 0.9999488
## 4 8.321991e-04 0.0295941651         0.9695          4  lowH 0.9999488
## 5 4.581346e-04 0.0157574408         0.9840          5  lowH 0.9999488
## 6 2.164156e-03 0.0480132420         0.9495          6  lowH 0.9999488
##        pvalues pvaluesRightTail OutlierFlag
## 1 0.5197820696        0.7401090       FALSE
## 2 0.0008050592        0.9995975       FALSE
## 3 0.2932188708        0.8533906       FALSE
## 4 0.3668703477        0.8165648       FALSE
## 5 0.4362856247        0.7818572       FALSE
## 6 0.3258701167        0.1629351       FALSE

  topm2_vcf <- rep(FALSE, nrow(outlier$results))
  topm2_vcf[muts_count$index[muts_count$type=="m2"]] <- TRUE
  plot(dipmat$He, dipmat$FST, pch=20, col="grey", main="outflank")
    points(dipmat$He[topm2_vcf], dipmat$FST[topm2_vcf], pch=21, col="blue")

    #points(POS[topm2_vcf], abs(phen_cor)[topm2_vcf], col="blue")
  
  
    plot(outlier$results$LocusName[outlier$results$He>0.1], outlier$results$FST[outlier$results$He>0.1], col="grey", pch=20, main="outflank")
  add_layers()
  points(outlier$results$LocusName[outlier$results$He>0.1], outlier$results$FST[outlier$results$He>0.1], col="grey", pch=20)
    points(outlier$results$LocusName[outlier$results$He>0.1 & outlier$results$OutlierFlag==TRUE], outlier$results$FST[outlier$results$He>0.1 & outlier$results$OutlierFlag==TRUE], col="red")

    plot(outlier$results$LocusName[outlier$results$He>0.1], outlier$results$FST[outlier$results$He>0.1], col="grey", pch=20, xlim=c(0,40000), main="outflank")
  add_layers()
  points(outlier$results$LocusName[outlier$results$He>0.1], outlier$results$FST[outlier$results$He>0.1], col="grey", pch=20)
    points(outlier$results$LocusName[outlier$results$He>0.1 & outlier$results$OutlierFlag==TRUE], outlier$results$FST[outlier$results$He>0.1 & outlier$results$OutlierFlag==TRUE], col="red")

       plot(outlier$results$LocusName[outlier$results$He>0.1], outlier$results$FST[outlier$results$He>0.1], col="grey", pch=20, xlim=c(80000,120000), main="outflank")
  add_layers()
  points(outlier$results$LocusName[outlier$results$He>0.1], outlier$results$FST[outlier$results$He>0.1], col="grey", pch=20)
    points(outlier$results$LocusName[outlier$results$He>0.1 & outlier$results$OutlierFlag==TRUE], outlier$results$FST[outlier$results$He>0.1 & outlier$results$OutlierFlag==TRUE], col="red")

PCADAPT

aux<-pcadapt(training$G,K=30)

## Number of SNPs: 5940
## Number of individuals: 1000

genotype_file <- aux
plot(aux,option="screeplot")

res<-pcadapt(training$G,K=4)

## Number of SNPs: 5940
## Number of individuals: 1000

x <- res
summary(x)

##                 Length Class  Mode   
## scores           4000  -none- numeric
## singular.values     4  -none- numeric
## zscores         23760  -none- numeric
## loadings        23760  -none- numeric
## maf              5940  -none- numeric
## missing          5940  -none- numeric
## stat             5940  -none- numeric
## gif                 1  -none- numeric
## chi2.stat        5940  -none- numeric
## pvalues          5940  -none- numeric

plot(x, option = "qqplot", threshold = 0.05, main="pcadapt")

#plot(x, option = "stat.distribution")

# Default output from PCAdapt
  par(mfrow=c(1,1))
  plot(locusPos, x$chi2.stat, col="grey", pch=20, main="pcadapt without LD pruning")
  add_layers()

# Calibrate p-values
  hist(res$pvalues,breaks=50)

  require(qvalue)
  plot(qvalue(res$pvalues))

    #choose # significant tests based on where expected false positives are 0 (bottom right)
  outliers<- order(res$pvalues)[1:200]  
  
# Plot without accounting for LD
  plot(locusPos, x$chi2.stat, col="grey", pch=20, main="pcadapt without LD pruning")
    add_layers()
    points(locusPos, x$chi2.stat, col="grey", pch=20)
    points(locusPos[outliers], x$chi2.stat[outliers], col="red", pch=20)

  plot(locusPos, -log10(x$pvalues), col="grey", pch=20, main="pcadapt without LD pruning")
    add_layers()
    points(locusPos, -log10(x$pvalues), col="grey", pch=20)
    points(locusPos[outliers], -log10(x$pvalues[outliers]), col="red", pch=20)

# Account for LD  ####  
  devtools::install_github("privefl/bigsnpr")

## Skipping install of 'bigsnpr' from a github remote, the SHA1 (ccb2e930) has not changed since last install.
##   Use `force = TRUE` to force installation

  library(bigsnpr)

## Loading required package: bigstatsr

  library(bigstatsr)
  G<-add_code256(big_copy(t(training$G),type="raw"),code=bigsnpr:::CODE_012)

## Warning: Assignment will down cast from double to raw.
## Hint: To remove this warning, use options(bigstatsr.typecast.warning = FALSE).

    #puts it in the raw format and stores likelihood genotype probability
  
  newpc<-snp_autoSVD(G=G,infos.chr =training$chromosome,infos.pos = training$position)

## Phase of clumping at r2 > 0.2.. keep 3352 SNPs.
## 
## Iteration 1:
## Computing SVD..
## 
## Converged!

    # this is doing SNP pruning - removing correlated SNPs
    # take snps with highest MAF and correlate snps around it
    # Snps with R^2 > 0.2 are removed
    # the subset is the indexes of the remaining SNPs
  str(newpc)

## List of 7
##  $ d     : num [1:10] 149 123 121 120 119 ...
##  $ u     : num [1:1000, 1:10] 0.037841 -0.000481 0.042954 -0.007822 -0.009347 ...
##  $ v     : num [1:3352, 1:10] 0.006 0.00255 0.00963 0.01262 -0.00197 ...
##  $ niter : int 8
##  $ nops  : int 154
##  $ center: num [1:3352] 0.033 0.001 0.002 0.032 0.12 0.018 0.001 0.548 0.38 0.003 ...
##  $ scale : num [1:3352] 0.1802 0.0316 0.0447 0.1774 0.3359 ...
##  - attr(*, "class")= chr "big_SVD"
##  - attr(*, "subset")= int [1:3352] 1 2 3 5 8 9 10 11 12 14 ...
##  - attr(*, "lrldr")='data.frame':    0 obs. of  3 variables:
##   ..$ Chr  : int(0) 
##   ..$ Start: int(0) 
##   ..$ Stop : int(0)

  plot(newpc)

  which_pruned <- attr(newpc, which="subset")
  length(which_pruned)

## [1] 3352

  gwas<-snp_gc(snp_pcadapt(G, U.row = newpc$u[,1]))
    # snp_pcadapt doing pcadapt but on G format
    # snp_gc is genomic inflation factor will set gif to 1
  
  str(gwas)

## Classes 'mhtest' and 'data.frame':   5940 obs. of  3 variables:
##  $ estim  : num  0.00506 0.01204 0.03221 -0.01587 0.01079 ...
##  $ std.err: num  0.0056 0.03167 0.02238 0.00415 0.00568 ...
##  $ score  : num  0.903 0.38 1.439 -3.822 1.901 ...
##  - attr(*, "transfo")=function (x)  
##  - attr(*, "predict")=function (xtr)

  pval<-predict(gwas,log10=F)
  hist(pval,breaks=50)

  plot(qvalue(pval))

  outliers<-order(pval)[1:50]  
  
  #Manhattan
  CHR<-training$chromosome
  #snp_manhattan(gwas , CHR, unlist(tapply(CHR, CHR, seq_along)), 
  #            dist.sep.chrs = 100,ind.highlight = ou) 
  
  #Eval on simulations
  plot(locusPos, -log10(pval), col="grey", pch=20, main="pcadapt with LD pruning")
    add_layers()
    points(locusPos, -log10(pval), col="grey", pch=20)
    points(locusPos[outliers], -log10(pval)[outliers], col="red", pch=20)

  plot(locusPos, -log10(pval), col="grey", pch=20, xlim=c(0,40000), main="pcadapt with LD pruning")
    add_layers()
    points(locusPos, -log10(pval), col="grey", pch=20)
    points(locusPos[outliers], -log10(pval)[outliers], col="red", pch=20)

    plot(locusPos, -log10(pval), col="grey", pch=20, xlim=c(80000,120000), main="pcadapt with LD pruning")
    add_layers()
    points(locusPos, -log10(pval), col="grey", pch=20)
    points(locusPos[outliers], -log10(pval)[outliers], col="red", pch=20)

HapFLK

Command line argument:

plink --vcf 
plink --bfile sim1a --maf 0.05 --out sim1a.commonf --make-bed
plink --bfile sim1a.commonf --indep-pairwise 50 5 0.5 --out sim1a.ldpruned
plink --bfile sim1a.commonf --extract sim1a.ldpruned.pruned.in --out sim1a.ldpruned --make-bed
admixture sim1a.ldpruned.bed 3

First, convert VCF to BED file. When Plink converts VCF you lose the loci names and positions, so you have to add that.

Next filter for MAF.

Next LD filter every 50 snps window, compute R^2 between all SNPs and choose SNPs so maximum R^2 between them is 0.5, then move 5 SNPs to next window For more information: Plink LD pruning

Make a new bed file with the pruned loci.

Run admixture to get population IDs for hapFLK.

Next: input q matrix into R and assign individuals to population Add population ID to Fam file from Pink

qmat=read.table('sim1a.ldpruned.3.Q')                     
K=dim(qmat)[2]                                                               
pop=apply(qmat,1,which.max)                                                  
fam=read.table('sim1a.commonf.fam')                                     
## assign individuals to "populations" based on the Q matrix                 
fam$V1=pop                                                                   
write.table(fam,col.names=F,row.names=F,file='sim1a.commonf.fam',quote=F)

Then run HapFLK with fam, bed, and bin file, and ‘qpop’ is name of outfile for the run

hapflk --bfile sim1a.commonf -p qpop

hapf <- read.table("results/hapflk/sim1.hapflk", header=TRUE)
plot(hapf$pos, hapf$hapflk, col="grey", pch=20, main="hapFLK")
  add_layers()

FLK tree

Admixture

FLK tree

HapFLK with pruning

flkLD <- read.table("results/hapflk/sim1a.pruned.flk", header=TRUE)
head(flkLD)

##   rs chr  pos     pzero         flk    pvalue
## 1  6   1  338 0.9456250 1.598368960 0.4496956
## 2  7   1  364 0.9443287 0.012716378 0.9936620
## 3  8   1  400 0.9402262 0.002463121 0.9987692
## 4 11   1  515 0.7340671 0.816289316 0.6648827
## 5 12   1  528 0.7977183 0.748574173 0.6877794
## 6 19   1 1006 0.9317668 1.149843996 0.5627488

plot(flkLD$pos, flkLD$flk, col="grey", pch=20, main="FLK with pruning")
  add_layers()
  points(flkLD$pos, flkLD$flk, col="grey", pch=20, main="FLK with pruning")

plot(flkLD$pos, -log10(flkLD$pvalue), col="grey", pch=20, main="FLK with pruning")
  add_layers()
  points(flkLD$pos, -log10(flkLD$pvalue), col="grey", pch=20, main="FLK with pruning")

SelEstim

Underlying model is probably not good for landscape data, IBD. Depend through prior on locus-specific coefficient of seleciton, as well as genome-wide estimate of selection, which is sensitive to underlying model. Here, we see big values of lambda parameters, indicating departures from the underlying model. If neutral variation and IBD and complex structure, then you will also inflate the lambda parameter.

Input conversion not provided, but broadly same format as BayPass except first two lines that indicate #pops and #loci.

Command line argument:

./src/selestim -file /home/rvitalis/work/SSMPG-2017/data/sim1a.selestim -outputs sim1a-1/ -npilot 25 -lpilot 500 -burnin 50000 -length 100000 -thin 25 -threads 16

# 
library(fields)         # extrapolation spatiale

## Loading required package: spam

## Loading required package: grid

## Spam version 1.4-0 (2016-08-29) is loaded.
## Type 'help( Spam)' or 'demo( spam)' for a short introduction 
## and overview of this package.
## Help for individual functions is also obtained by adding the
## suffix '.spam' to the function name, e.g. 'help( chol.spam)'.

## 
## Attaching package: 'spam'

## The following objects are masked from 'package:base':
## 
##     backsolve, forwardsolve

## Loading required package: maps

library(maps)           # fonds de carte


# sourcing the SelEstim R functions #####

source('results/selestim/selestim/R/SelEstim.R')

## 
## Attaching package: 'gplots'

## The following object is masked from 'package:stats':
## 
##     lowess

delta <- read.table('results/selestim/analyses/sim1a-1/summary_delta.out',header = TRUE)
sigma <- read.table('results/selestim/analyses/sim1a-1/summary_sigma.out',header = TRUE)
kappa <- read.table('results/selestim/analyses/sim1a-1/summary_kappa.out',header = TRUE)

head(delta)

##   locus     mean      std      KLD
## 1     1 34.88909 31.42724 0.042387
## 2     2 30.60093 25.77727 0.097974
## 3     3 32.11019 26.99234 0.084146
## 4     4 36.96424 26.44977 0.127024
## 5     5 41.49536 30.49501 0.097431
## 6     6 75.78097 29.61704 0.736941

head(sigma)

##   locus deme     mean      std mean_all_1 std_all_1 mean_all_2 std_all_2
## 1     1    1 41.22727 60.96236   53.91734  71.06870   21.14763  30.77412
## 2     1    2 33.12967 51.13975   26.56959  36.51002   39.11276  60.90761
## 3     1    3 48.29376 71.73789   64.85277  82.27341   18.30601  28.49640
## 4     1    4 41.48338 64.64849   54.02337  74.96806   23.34451  39.22857
## 5     1    5 32.95772 48.45403   24.34895  32.43587   39.74236  57.13592
## 6     1    6 30.80820 45.28096   29.28071  39.92539   32.31746  49.96666

head(kappa)

##   locus  deme_1  deme_2  deme_3  deme_4  deme_5  deme_6  deme_7  deme_8
## 1     1 0.38725 0.52300 0.35575 0.40875 0.55925 0.50300 0.52200 0.52400
## 2     2 0.50150 0.50325 0.50550 0.51150 0.49600 0.50575 0.51200 0.49650
## 3     3 0.50450 0.50750 0.49125 0.51125 0.49200 0.51500 0.50775 0.51375
## 4     4 0.56200 0.55800 0.56200 0.56175 0.38975 0.61025 0.23150 0.48825
## 5     5 0.55525 0.49175 0.56825 0.51650 0.50250 0.55050 0.60300 0.50825
## 6     6 0.60200 0.66600 0.55550 0.59500 0.23650 0.74575 0.03375 0.49800
##    deme_9 deme_10 deme_11 deme_12 deme_13 deme_14 deme_15 deme_16 deme_17
## 1 0.53875 0.53825 0.57525 0.50500 0.51550 0.53600 0.45925 0.50650 0.48275
## 2 0.51575 0.50375 0.50875 0.50875 0.50825 0.50175 0.50800 0.49750 0.51925
## 3 0.51125 0.49650 0.50875 0.51150 0.49325 0.50825 0.50525 0.49975 0.50700
## 4 0.53250 0.39275 0.50400 0.56450 0.37775 0.44075 0.49925 0.44400 0.53275
## 5 0.55275 0.53050 0.50900 0.35575 0.50500 0.52125 0.38150 0.51125 0.52925
## 6 0.50975 0.39350 0.53825 0.40700 0.34050 0.35575 0.56425 0.42925 0.58325
##   deme_18 deme_19 deme_20 deme_21 deme_22 deme_23 deme_24 deme_25 deme_26
## 1 0.46475 0.51000 0.55050 0.43725 0.55100 0.51950 0.52075 0.53975 0.55425
## 2 0.49925 0.51000 0.51350 0.50300 0.52775 0.50325 0.50750 0.50550 0.49375
## 3 0.46450 0.49900 0.50025 0.50300 0.51650 0.50900 0.50725 0.51600 0.50475
## 4 0.54150 0.52450 0.48875 0.54525 0.52125 0.48825 0.46500 0.48400 0.52500
## 5 0.52975 0.51775 0.55200 0.52700 0.56475 0.51700 0.47650 0.46425 0.51725
## 6 0.61125 0.58075 0.31000 0.62225 0.59200 0.52650 0.17000 0.55025 0.58350
##   deme_27 deme_28 deme_29 deme_30 deme_31 deme_32 deme_33 deme_34 deme_35
## 1 0.33500  0.5365 0.53300 0.49150 0.52775 0.49875 0.50675 0.52075 0.44225
## 2 0.50825  0.5135 0.50700 0.50200 0.49825 0.46325 0.49325 0.49875 0.50550
## 3 0.51250  0.4490 0.51050 0.50275 0.50925 0.50125 0.51900 0.50425 0.50225
## 4 0.55875  0.5205 0.36725 0.49750 0.54125 0.42325 0.52750 0.53575 0.55475
## 5 0.31600  0.4345 0.47825 0.51000 0.50825 0.42050 0.48775 0.51975 0.53550
## 6 0.71600  0.5055 0.04475 0.53475 0.60475 0.39975 0.54950 0.59775 0.67000
##   deme_36 deme_37 deme_38 deme_39
## 1 0.52575 0.51000 0.52875 0.52800
## 2 0.50750 0.49175 0.49925 0.50225
## 3 0.49225 0.49500 0.50325 0.51325
## 4 0.47625 0.53600 0.47375 0.49750
## 5 0.42075 0.52825 0.55475 0.47425
## 6 0.38900 0.55075 0.53175 0.53750

########################################################
# plotting the locus-specific coefficient of selection ######
########################################################


plot.delta(file = 'results/selestim/analyses/sim1a-1/summary_delta.out')

plot(locusPos, delta$mean, pch=20, col="grey", main="Selestim")
  add_layers()
  points(locusPos, delta$mean, pch=20, col="grey")

plot(locusPos, delta$mean, pch=20, col="grey", xlim=c(0,40000), main="Selestim")
  add_layers()
  points(locusPos, delta$mean, pch=20, col="grey")

plot(locusPos, delta$mean, pch=20, col="grey", xlim=c(80000,120000), main="Selestim")
  add_layers()
  points(locusPos, delta$mean, pch=20, col="grey")

####################
# plotting the KLD ######
####################

plot.kld(file = 'results/selestim/analyses/sim1a-1/summary_delta.out',calibration_file = 'results/selestim/analyses/sim1a-1/calibration/summary_delta.out',limit = 0.01)

plot(locusPos, delta$KLD, pch=20, col="grey", main="Selestim")
  add_layers()
  points(locusPos, delta$KLD, pch=20, col="grey")

plot(locusPos, delta$KLD, pch=20, col="grey", xlim=c(0,40000), main="Selestim")
  add_layers()
  points(locusPos, delta$KLD, pch=20, col="grey")

plot(locusPos, delta$KLD, pch=20, col="grey", xlim=c(80000,120000), main="Selestim")
  add_layers()
  points(locusPos, delta$KLD, pch=20, col="grey")

########################
# Getting the top SNPs ######
########################

top.SNP.1 <- which(delta$KLD[1:1000] == max(delta$KLD[1:1000]))
top.SNP.2 <- which(delta$KLD == max(delta$KLD))

#############################################################################################
# Computing for each top SNP a composite variable that depend on both kappa_ij and sigma_ij ######
#############################################################################################

rslt.kappa.1 <- 2 * sigma$mean[sigma$locus == top.SNP.1] * (kappa[kappa$locus == top.SNP.1,-1] - 0.5)
rslt.kappa.2 <- 2 * sigma$mean[sigma$locus == top.SNP.2] * (kappa[kappa$locus == top.SNP.2,-1] - 0.5)

############################################
# Getting information from the populations ######
############################################

pos <- read.table('results/selestim/data/sim1a/outputIndAll_pop_sim1a.txt')
pop.x <- vector(length = 39)
pop.y <- vector(length = 39)
phn <- vector(length = 39)

for (i in 1:39) {
    pop.x[i] <- mean(pos$x[which(pos$pop == i)])
    pop.y[i] <- mean(pos$y[which(pos$pop == i)])
    phn[i] <- mean(pos$phenotype1[which(pos$pop == i)])
}

################################################
# extrapolation map for the average phenotypes ######
################################################

    grid = list(x = seq(0,1,0.01),y = seq(0,1,0.01))        # coordonnées du fond de carte
    fit <- Krig(cbind(pop.x,pop.y),as.numeric(phn),theta = 100,m = 1)

## Warning: 
## Grid searches over lambda (nugget and sill variances) with  minima at the endpoints: 
##   (REML) Restricted maximum likelihood 
##    minimum at  right endpoint  lambda  =  7.406845e-05 (eff. df= 37.05001 
## )

      # plot mean phenotype over space
    out <- predictSurface(fit,grid = grid,extrap = TRUE)

## [1] 10201     2

    plot.surface(out,type = "I",xlab = "Longitude",ylab = "Latitude",main = "Phenotype",cex.lab = 1.125,cex.main = 2)
    points(cbind(pop.x,pop.y),pch = 4,cex = 0.75,col="red",lwd = 2.5)

###########################################################################################
# extrapolation map for the population-specific coefficients of selection at the top SNPs ######
###########################################################################################

    grid = list(x = seq(0,1,0.01),y = seq(0,1,0.01))        # coordonnées du fond de carte
    fit <- Krig(cbind(pop.x,pop.y),as.numeric(rslt.kappa.1),theta = 100,m = 1)

## Warning: 
## Grid searches over lambda (nugget and sill variances) with  minima at the endpoints: 
##   (REML) Restricted maximum likelihood 
##    minimum at  right endpoint  lambda  =  7.406845e-05 (eff. df= 37.05001 
## )

    out <- predictSurface(fit,grid = grid,extrap = TRUE)

## [1] 10201     2

    plot.surface(out,type = "I",xlab = "Longitude",ylab = "Latitude",main = expression(2~italic(sigma)[ij]~(italic(kappa)[ij]~-~0.5)~~(TOP~SNP~1)),cex.lab = 1.125,cex.main = 2)
    points(cbind(pop.x,pop.y),pch = 4,cex = 0.75,col="red",lwd = 2.5)

    grid = list(x = seq(0,1,0.01),y = seq(0,1,0.01))        # coordonnées du fond de carte
    fit <- Krig(cbind(pop.x,pop.y),as.numeric(rslt.kappa.2),theta = 100,m = 1)
    out <- predictSurface(fit,grid = grid,extrap = TRUE)

## [1] 10201     2

    plot.surface(out,type = "I",xlab = "Longitude",ylab = "Latitude",main = expression(2~italic(sigma)[ij]~(italic(kappa)[ij]~-~0.5)~~(TOP~SNP~2)),cex.lab = 1.125,cex.main = 2)
    points(cbind(pop.x,pop.y),pch = 4,cex = 0.75,col="red",lwd = 2.5)

This is really cool, because the effect of the derived allele from SNP 1 on the phentoype is negative, but the effect of the derived allele from SNP 2 on the phenotype is positive. The maps are capturing this information (although in the opposite direction I would have expected).

LFMM

The only way to estimate the confounding factors at the same time as the phenotype-genotype relationship (for P-values) is to do the Genotype ~ Phenotype model. As long as your interestin in p-values this is OK, but not OK for polygenic risk scores or SNP effect on the phenotype. (In linear regression, the effect of y~x is not the same as the effect as x~y). It ma Wang et al. mathematical proof to show this is an optimal method to estimate P-values without confounding effects of population structure.

In a P~G model, you use the data to estimate the structure first and then the add the mixed effects to the model, to estimate the SNP effect on the phenotype.

Recommendation (up for discussion):

Use LFMM model to hone-in on loci underlying the phenotype, but then use phenotype~genotype to get effect size.
(lm coeff y~x) = cor(x,y)^2/(lm coeff x~y) Issue will applying to following output because (1) scaling and (2) not sure how to get R^2.

LFMM RIDGE REGRESSION Genotype ~ Phenotype

Olivier says this is the best of the following 3 approaches because it is an exact solution and not an approximate solution. Also proved it is optimal because global minimum of least square problem.

sim1 <- training
# extract scaled genotypes
scaled.genotype <- scale(as.matrix(t(sim1$G)))
  #scaled.genotype <- as.matrix(t(sim1$G))
# extract scaled phenotypes
x <- scale(as.matrix(sim1$phenotype1))
  # centering is important to remove mean
  # x <- scale(as.matrix(sim1$phenotype1), center=TRUE, scale=FALSE)
  # x <- as.matrix(sim1$phenotype1)
  # to do mean and not SD. this might make it possible to get effect sizes
pc <- prcomp(scaled.genotype)
plot(pc$sdev[1:20]^2)
points(5,pc$sdev[5]^2, type = "h", lwd = 3, col = "blue")

# ridge regression
lfmm.ridge <- lfmm::lfmm_ridge(Y = scaled.genotype, X = x, K = 5, lambda = 1e-4)
#The lfmm.ridge object contains estimates for the latent variables and for the effect sizes. Here, the estimates are used for computing calibrated significance values and for testing associations between the response matrix Y and the explanatory variable x. It can be done as follows:

#Fun stuff to compare effects to SNP effects
  #plot(muts_count$selCoef, lfmm.ridge$B[muts_count$index])
  #snp_cor_phen <- apply(scaled.genotype[,muts_count$index],2, function(yo)(cor(yo, x)))
  #new_est <- snp_cor_phen^2/lfmm.ridge$B[muts_count$index]
  #plot(muts_count$selCoef, new_est, xlim=c(-2,2), ylim=c(-2,2))
  #abline(1,1)

lfmm.test <- lfmm::lfmm_test(Y = scaled.genotype, X = x, lfmm = lfmm.ridge, calibrate = "gif")
p.values <- lfmm.test$calibrated.pvalue
lfmm.test$gif

## [1] 4.731696

hist(p.values, col = "lightgreen", main="LFMM ridge")

qval <- qvalue::qvalue(p.values)
plot(qval)

#The plot suggests that setting fdr.level = 0.025 warrant few false positives.

qval <- qvalue::qvalue(p.values, fdr.level = 0.025)
candidates <- which(qval$significant)


plot(locusPos, -log10(p.values), cex = .5, pch = 19, col = "grey", main="LFMM ridge")
  add_layers()
  points(locusPos, -log10(p.values), cex = .5, pch = 19, col = "grey")
  points(locusPos[candidates], -log10(p.values[candidates]), cex = .5, pch = 19, col = "orange")

plot(locusPos, -log10(p.values), cex = .5, pch = 19, col = "grey", xlim=c(0,40000), main="LFMM ridge")
  add_layers()
  points(locusPos, -log10(p.values), cex = .5, pch = 19, col = "grey")
  points(locusPos[candidates], -log10(p.values[candidates]), cex = .5, pch = 19, col = "orange")

plot(locusPos, -log10(p.values), cex = .5, pch = 19, col = "grey", xlim=c(80000,120000), main="LFMM ridge")
  add_layers()
  points(locusPos, -log10(p.values), cex = .5, pch = 19, col = "grey")
  points(locusPos[candidates], -log10(p.values[candidates]), cex = .5, pch = 19, col = "orange")

LFMM RIDGE REGRESSION Genotype ~ Phenotype with LD pruning and MAF cutoff

### LD pruning was done in PCADAPT segment
  head(which_pruned)

## [1] 1 2 3 5 8 9

  genotype <- as.matrix(t(sim1$G))

## MAF 
  spectrum <- apply(genotype, 2, sum)
  #We first display a site frequency spectrum.
  plot(table(spectrum)[1:100])

  #We decide to filter out loci with MAF lower than 50 and define a new matrix with less genotypes: genotype.filtered
  
# now remove LD and MAF 
  index.filtered <- unique(which(spectrum > 50), which_pruned)
  genotype.filtered <- genotype[,index.filtered]
  
  scaled.genotype <- scale(genotype.filtered)
  # extract scaled phenotypes
  x <- scale(as.matrix(sim1$phenotype1))
  
  pc <- prcomp(scaled.genotype)
  plot(pc$sdev[1:20]^2)
  points(2,pc$sdev[2]^2, type = "h", lwd = 3, col = "blue")

# ridge regression
lfmm.ridge <- lfmm::lfmm_ridge(Y = scaled.genotype, X = x, K = 2, lambda = 1e-4)
#The lfmm.ridge object contains estimates for the latent variables and for the effect sizes. Here, the estimates are used for computing calibrated significance values and for testing associations between the response matrix Y and the explanatory variable x. It can be done as follows:

#Fun stuff to compare effects to SNP effects
  #plot(muts_count$selCoef, lfmm.ridge$B[muts_count$index])
  #snp_cor_phen <- apply(scaled.genotype[,muts_count$index],2, function(yo)(cor(yo, x)))
  #new_est <- snp_cor_phen^2/lfmm.ridge$B[muts_count$index]
  #plot(muts_count$selCoef, new_est, xlim=c(-2,2), ylim=c(-2,2))
  #abline(1,1)

lfmm.test <- lfmm::lfmm_test(Y = scaled.genotype, X = x, lfmm = lfmm.ridge, calibrate = "gif")
p.values <- lfmm.test$calibrated.pvalue
lfmm.test$gif

## [1] 8.521286

hist(p.values, col = "lightgreen", main="LFMM ridge")

qval <- qvalue::qvalue(p.values)
plot(qval)

#The plot suggests that setting fdr.level = 0.025 warrant few false positives.

qval <- qvalue::qvalue(p.values, fdr.level = 0.025)
candidates <- which(qval$significant)

par(mfrow=c(1,1))
plot(locusPos[index.filtered], -log10(p.values), cex = .5, pch = 19, col = "grey", main="LFMM ridge")
  add_layers()
  points(locusPos[index.filtered], -log10(p.values), cex = .5, pch = 19, col = "grey")
  points(locusPos[index.filtered][candidates], -log10(p.values[candidates]), cex = .5, pch = 19, col = "orange")

plot(locusPos[index.filtered], -log10(p.values), cex = .5, pch = 19, col = "grey", main="LFMM ridge", xli=c(0,40000))
  add_layers()
  points(locusPos[index.filtered], -log10(p.values), cex = .5, pch = 19, col = "grey")
  points(locusPos[index.filtered][candidates], -log10(p.values[candidates]), cex = .5, pch = 19, col = "orange")

LFMM LASSO Genotype ~ Phenotype

#LFMM parameters can alternatively be estimated by solving regularized least-squares mimimization, with lasso penalty as follows.
scaled.genotype <- scale(as.matrix(t(sim1$G)))
lfmm.lasso <- lfmm::lfmm_lasso(Y = scaled.genotype, X = x, K = 5, nozero.prop = 0.02)

## It = 1/100, err2 = 0.999000000011033

## It = 2/100, err2 = 0.987380312304857

## It = 3/100, err2 = 0.98738592511468

## === lambda = 0.215367927378323, no zero B proportion = 0.0175084175084175

## It = 1/100, err2 = 0.987386477109867

## It = 2/100, err2 = 0.987374213059095

## It = 3/100, err2 = 0.987373079604109

## === lambda = 0.205579122693436, no zero B proportion = 0.0198653198653199

## It = 1/100, err2 = 0.987372960154821

## It = 2/100, err2 = 0.987323431042971

## It = 3/100, err2 = 0.987318656299654

## === lambda = 0.196235234288913, no zero B proportion = 0.0234006734006734

 #The lfmm.lasso object contains new estimates for the latent variables and for the effect sizes. Note that for lasso, we didn't set the value of a regularization parameter. Instead, we set the proportion of non-null effects (here 2 percent).

lfmm.test <- lfmm::lfmm_test(Y = scaled.genotype, X = x, lfmm = lfmm.lasso, calibrate = "gif")
p.values <- lfmm.test$calibrated.pvalue
lfmm.test$gif

## [1] 4.279243

hist(p.values, col = "lightblue")

qval <- qvalue::qvalue(p.values)
plot(qval)

qval <- qvalue::qvalue(p.values, fdr.level = 0.025)
candidates <- which(qval$significant)

plot(locusPos, -log10(p.values), cex = .5, pch = 19, col = "grey", main="LFMM lasso")
  add_layers()
  points(locusPos, -log10(p.values), cex = .5, pch = 19, col = "grey")
  points(locusPos[candidates], -log10(p.values[candidates]), cex = .5, pch = 19, col = "orange")

plot(locusPos, -log10(p.values), cex = .5, pch = 19, col = "grey", xlim=c(0,40000), main="LFMM lasso")
  add_layers()
  points(locusPos, -log10(p.values), cex = .5, pch = 19, col = "grey")
  points(locusPos[candidates], -log10(p.values[candidates]), cex = .5, pch = 19, col = "orange")

plot(locusPos, -log10(p.values), cex = .5, pch = 19, col = "grey", xlim=c(80000,120000), main="LFMM lasso")
  add_layers()
  points(locusPos, -log10(p.values), cex = .5, pch = 19, col = "grey")
  points(locusPos[candidates], -log10(p.values[candidates]), cex = .5, pch = 19, col = "orange")

LFMM LASSO WITH REMOVAL OF MAF Genotype ~ Phenotype

GWAS data analysis usually follows a number of preprocessing steps, including filtering the data (QC and MAF thesholding), LD pruning, imputation of missing genotypes. Here we show the effect of MAF filtering (LD pruning could be achieved by using the R package SNPRelate).

genotype <- as.matrix(t(sim1$G))
spectrum <- apply(genotype, 2, sum)

#We first display a site frequency spectrum.

plot(table(spectrum)[1:100])

#We decide to filter out loci with MAF lower than 50 (five percent) and define a new matrix with less genotypes: genotype.filtered

genotype.filtered <- genotype[,spectrum > 50]
index.filtered <- which(spectrum > 50)
dim(genotype.filtered)

## [1] 1000 2497

lfmm.lasso.filter <- lfmm::lfmm_lasso(Y = scale(genotype.filtered), X = x, K = 5, nozero.prop = 0.02)

## It = 1/100, err2 = 0.999000000000672

## It = 2/100, err2 = 0.961858901548095

## It = 3/100, err2 = 0.961900958340571

## It = 4/100, err2 = 0.961907396198033

## It = 5/100, err2 = 0.961908492116953

## === lambda = 0.237222773366769, no zero B proportion = 0.026832198638366

lfmm.test <- lfmm::lfmm_test(Y = scale(genotype.filtered), X = x, lfmm = lfmm.lasso.filter, calibrate = "gif")

p.values <- lfmm.test$calibrated.pvalue

lfmm.test$gif

## [1] 3.333462

hist(p.values, col = "lightblue")

qval <- qvalue::qvalue(p.values)
plot(qval)

qval <- qvalue::qvalue(p.values, fdr.level = 0.025)
candidates <- which(qval$significant)
plot(-log10(p.values), cex = .5, pch = 19, col = "grey")

plot(locusPos[index.filtered], -log10(p.values), cex = .5, pch = 19, col = "grey", main="LFMM lasso MAF > 0.05")
  add_layers()
  points(locusPos[index.filtered], -log10(p.values), cex = .5, pch = 19, col = "grey")
  points(locusPos[index.filtered][candidates], -log10(p.values[candidates]), cex = .5, pch = 19, col = "orange")

plot(locusPos[index.filtered], -log10(p.values), cex = .5, pch = 19, col = "grey", xlim=c(0,40000), main="LFMM lasso MAF > 0.05")
  add_layers()
  points(locusPos[index.filtered], -log10(p.values), cex = .5, pch = 19, col = "grey")
  points(locusPos[index.filtered][candidates], -log10(p.values[candidates]), cex = .5, pch = 19, col = "orange")

plot(locusPos[index.filtered], -log10(p.values), cex = .5, pch = 19, col = "grey", xlim=c(80000,120000), main="LFMM lasso MAF > 0.05")
  add_layers()
  points(locusPos[index.filtered], -log10(p.values), cex = .5, pch = 19, col = "grey")
  points(locusPos[index.filtered][candidates], -log10(p.values[candidates]), cex = .5, pch = 19, col = "orange")

Filtering by MAF loses some of signals at the the rarer, small effect alleles, but also augments the signal at the low recombination with background selection.

LFMM with LEA Genotype ~ Phenotype

LFMM parameters can be estimated by using a hierarchical Bayesian model with sparse priors, as implemented in LEA.

# See instructions on github, skipped this step

BayPass

Conversion script from VCF to BayPass not provided (not robust to different kinds of VCF files). A robust conversion script would be welcome!

Code for running Sim1a and Sim2a:

i_baypass -npop 39 -gfile sim1a.genobaypass -efile sim1a.popscalecov -nthreads 6 -pilotlength 500 -burnin 2500 -outprefix anaS1_covis > anaS1_covis.log
i_baypass -npop 39 -gfile sim1a.genobaypass -efile sim1a.popscalecov -omegafile anaS1_covis_mat_omega.out -auxmodel -nthreads 6 -pilotlength 500 -burnin 2500 -outprefix anaS1_covaux > anaS1_covaux.log

i_baypass -npop 39 -gfile sim2a.genobaypass -efile sim2a.popscalecov -nthreads 6 -pilotlength 500 -burnin 2500 -outprefix anaS2_covis > anaS2_covis.log
i_baypass -npop 39 -gfile sim2a.genobaypass -efile sim2a.popscalecov -omegafile anaS2_covis_mat_omega.out -auxmodel -nthreads 6 -pilotlength 500 -burnin 2500 -outprefix anaS2_covaux > anaS2_covaux.log

i_baypass -npop 39 -gfile sim1a.genobaypass -efile sim1a.popscalecov -omegafile anaS1_covis_mat_omega.out -auxmodel -isingbeta 1. -nthreads 6 -pilotlength 500 -burnin 2500 -outprefix anaS1_covaux_isb1 > anaS1_covaux_isb1.log
i_baypass -npop 39 -gfile sim2a.genobaypass -efile sim2a.popscalecov -omegafile anaS2_covis_mat_omega.out -auxmodel -isingbeta 1. -nthreads 6 -pilotlength 500 -burnin 2500 -outprefix anaS2_covaux_isb1 > anaS2_covaux_isb1.log

source("results/baypass_rehh/standardize.xtx.R")
source("results/baypass_rehh/baypass_utils.R")

SIMU = 1 #1=training data set ; 2=validation data set
folder <- "results/baypass_rehh/"

det.snp=read.table(paste0(folder,"sim",SIMU,"a.detsnp"))
head(det.snp)

##   V1  V2 V3   V4
## 1  1  21  A TRUE
## 2  1 109  A TRUE
## 3  1 133  A TRUE
## 4  1 231  A TRUE
## 5  1 290  A TRUE
## 6  1 338  A TRUE

covariate=read.table(paste0(folder,"sim",SIMU,"a.popscalecov"))
head(covariate)

##           V1          V2         V3          V4        V5        V6
## 1 0.05803792  0.46902333 -0.9106860 -1.41580522 -1.437323 0.8860763
## 2 1.63474855 -1.39796336 -0.5892786  1.59239065 -1.391031 0.7589330
## 3 1.74271171 -0.05465763  0.4477096  0.02005608 -1.389433 0.7591777
## 4 1.35172812  1.27883394  0.1543175 -0.30974391 -1.616234 0.5872833
##           V7         V8         V9         V10       V11        V12
## 1 -1.4625347  1.2708972  0.5263237  0.88056841 -1.450059 -1.4382660
## 2 -0.7209904  1.6059511 -0.5427793 -0.36543051  1.215242  0.8158230
## 3 -1.9235362  0.4245828  1.1625310 -0.87860559 -1.041913 -0.7187590
## 4 -1.6369566 -1.2436665  0.9497098 -0.08608794 -0.503407 -0.7334598
##          V13        V14         V15         V16        V17        V18
## 1 -0.9763150  1.3888542 -0.01467199 -1.02980492 -0.1677179 -0.2729604
## 2 -1.4805418  1.0838990 -1.55839942  0.38008760 -0.8459146  0.7884319
## 3 -0.8257588 -0.1844147  0.37777999  0.07310678  0.9762889  1.3042229
## 4 -1.0179113 -0.8492701  1.77144900 -0.39486647  1.5375060  1.5794965
##           V19       V20        V21       V22        V23        V24
## 1  1.06910319 0.3309479  0.6058540  1.571471  0.9850457  0.8397252
## 2  0.09973125 0.5376317  0.1571538  0.196349 -0.7720743 -1.2113596
## 3 -1.55401094 1.4208936 -0.6839335 -1.005647  0.3060744  1.1580900
## 4 -1.13117941 1.1852670 -0.2672595 -1.328703  0.6904243  0.6832743
##          V25        V26        V27        V28        V29        V30
## 1  1.1910022  1.4922890 -0.6853850 -0.6895273 -1.3448089 -0.4329465
## 2 -1.5603223 -0.9456296  0.6134519 -0.1983667  0.2784985 -1.5406461
## 3 -0.5009385 -1.5350416  1.2754478  0.8769679 -1.4058968 -0.1361705
## 4 -0.9479869 -1.0785366  0.7475910  0.5086590 -1.4886872  0.2284050
##          V31       V32        V33        V34        V35         V36
## 1 -0.5755136 0.8112454  1.0996562  1.5377877 -0.4195035 -1.01295004
## 2 -1.0196346 1.2317262  0.5392415 -0.2616589  0.1583276 -0.09274187
## 3  0.7211368 0.9314482 -0.7053409 -1.3714621  0.4654611 -0.77886988
## 4  0.7192420 0.2261967 -0.3507818 -1.2198046  0.6106576 -0.65963837
##           V37         V38        V39
## 1 -0.91205250 -0.01911443 -0.3459619
## 2  1.34097044  0.19054307  1.2756312
## 3  0.17357996  0.98594818  1.0911735
## 4  0.04371451  0.99220997  1.0182154

# Omega and XtX estimation ####
om=as.matrix(read.table(paste0(folder,"anaS",SIMU,"_covis_mat_omega.out")))

xtx=read.table(paste0(folder,"anaS",SIMU,"_covis_summary_pi_xtx.out"),h=T)$M_XtX
head(xtx)

## [1] 38.43280 39.26080 39.75763 37.85158 38.53820 38.63391

xtx.std=standardize.xtx(xtx,npop=39)

#Bayes Factor (Importance Sampling estimate) for association with phenotype ####
  res.bfis=read.table(paste0(folder,"anaS",SIMU,"_covis_summary_betai_reg.out"),h=T)
  head(res.bfis)

##   COVARIABLE MRK   M_Pearson SD_Pearson    BF.dB.     Beta_is SD_Beta_is
## 1          1   1 -0.09913029  0.1319906 -12.17544 -0.00207851 0.00273331
## 2          1   2  0.04857973  0.1540008 -17.28124  0.00039619 0.00164867
## 3          1   3 -0.03988024  0.1467607 -17.30215 -0.00070579 0.00191350
## 4          1   4 -0.01157744  0.1290019 -17.18330 -0.00002905 0.00228726
## 5          1   5 -0.13067654  0.1302722 -15.25326 -0.00197577 0.00221203
## 6          1   6 -0.12179184  0.1152672 -12.70376 -0.00403038 0.00387398
##        eBPis
## 1 0.34970009
## 2 0.09146511
## 3 0.14737244
## 4 0.00442397
## 5 0.42974365
## 6 0.52554020

  bfis=res.bfis$BF.dB.[res.bfis$COVARIABLE==3]

#Bayes Factor (AUX model estimate) for association with phenotype ####
  res.aux=read.table(paste0(folder,"anaS",SIMU,"_covaux_summary_betai.out"),h=T)
  head(res.aux)

##   COVARIABLE MRK M_Beta SD_Beta M_Delta    BF.dB.
## 1          1   1      0       0       0 -13.05178
## 2          1   2      0       0       0 -13.05178
## 3          1   3      0       0       0 -13.05178
## 4          1   4      0       0       0 -13.05178
## 5          1   5      0       0       0 -13.05178
## 6          1   6      0       0       0 -13.05178

  bfmc=res.aux$BF.dB.[res.aux$COVARIABLE==3]
  
  #PIP (association with phenotype) obtained under the ising  model (spatial dependency of SNP in the regression on phenotype) ####
  res.ising=read.table(paste0(folder,"anaS",SIMU,"_covaux_isb1_summary_betai.out"),h=T)
  head(res.ising)

##   COVARIABLE MRK M_Beta SD_Beta M_Delta    BF.dB.
## 1          1   1      0       0       0 -13.05178
## 2          1   2      0       0       0 -13.05178
## 3          1   3      0       0       0 -13.05178
## 4          1   4      0       0       0 -13.05178
## 5          1   5      0       0       0 -13.05178
## 6          1   6      0       0       0 -13.05178

  pip.is=res.ising$M_Delta[res.ising$COVARIABLE==3]

  
plot(locusPos, xtx,col= "grey",pch=20, main="XtX BayPass")
  add_layers()
  points(locusPos, xtx,col= "grey", main="XtX", pch=20)

  plot(locusPos, xtx,col= "grey",main="XtX", pch=20, xlim=c(0,40000))
  add_layers()
  points(locusPos, xtx,col= "grey", main="XtX", pch=20)

  plot(locusPos, xtx,col= "grey",main="XtX", pch=20, xlim=c(80000,120000))
  add_layers()
  points(locusPos, xtx,col= "grey", main="XtX", pch=20)

plot(locusPos, bfis,col="grey",pch=20, main="BFis (pheno1) BayPass")
  add_layers()
  points(locusPos, bfis,col= "grey", main="BFis (pheno1)", pch=20)

  plot(locusPos, bfis,col= "grey",main="BFis (pheno1)", pch=20, xlim=c(0,40000))
  add_layers()
  points(locusPos, bfis,col= "grey", main="BFis (pheno1)", pch=20)

  plot(locusPos, bfis,col= "grey",main="BFis (pheno1)", pch=20, xlim=c(80000,120000))
  add_layers()
  points(locusPos, bfis,col= "grey", main="BFis (pheno1)", pch=20)

plot(locusPos, bfmc,col="grey",pch=20, main="BFmc (pheno1) BayPass")
  add_layers()
  points(locusPos, bfmc,col= "grey", main="BFmc (pheno1)", pch=20)

  plot(locusPos, bfmc,col= "grey",main="BFmc (pheno1)", pch=20, xlim=c(0,40000))
  add_layers()
  points(locusPos, bfmc,col= "grey", main="BFmc (pheno1)", pch=20)

  plot(locusPos, bfmc,col= "grey",main="BFmc (pheno1)", pch=20, xlim=c(80000,120000))
  add_layers()
  points(locusPos, bfmc,col= "grey", main="BFmc (pheno1)", pch=20)

plot(locusPos, sqrt(pip.is),col="grey",main="Ising Model PIP (pheno1) BayPass", pch=20)
  add_layers()
  points(locusPos, sqrt(pip.is),col= "grey", main="Ising Model PIP (pheno1)", pch=20)

  plot(locusPos, sqrt(pip.is),col= "grey",main="Ising Model PIP (pheno1)", pch=20, xlim=c(0,40000))
  add_layers()
  points(locusPos, sqrt(pip.is),col= "grey", main="Ising Model PIP (pheno1)", pch=20)

  plot(locusPos, sqrt(pip.is),col= "grey",main="Ising Model PIP (pheno1)  BayPass", pch=20, xlim=c(80000,120000))
  add_layers()
  points(locusPos, sqrt(pip.is),col= "grey", main="Ising Model PIP (pheno1)", pch=20)

####
# Why BayPass Fails:  ####

#### 1) Mean population phenotype/environment is highly correlated with major axis of structuring of genetic diversity
#SVD decomposistion of Omega (similar to principle components of the covariance matrix)
om.svd=svd(om)
#various correlation
tmp.data=as.data.frame(cbind(om.svd$u[,1:2],t(covariate[3:4,])))
colnames(tmp.data)=c("Omega PC1","Omega PC2","Pheno 1","Environment")
plot(tmp.data)

##Correlation between PC's and covariable is not that clear for the second simulation

#### 2) Continuous population: averaging phenotype within population lead to lose a lot of information


###A solution: use GCTA to exploit within pop variability ####
####

res.gcta=read.table(paste0(folder,"sim",SIMU,"a.mlma"),h=T)
lpval.gcta=-1*log10(res.gcta$p)
cex=rep(0.25,length(lpval.gcta))
cex[lpval.gcta>6]=1

head(det.snp)

##   V1  V2 V3   V4
## 1  1  21  A TRUE
## 2  1 109  A TRUE
## 3  1 133  A TRUE
## 4  1 231  A TRUE
## 5  1 290  A TRUE
## 6  1 338  A TRUE

plot(locusPos, xtx,col="grey",pch=16,cex=cex, main = "XTX with GCTA outliers large dots")
  add_layers()
  points(locusPos, xtx,col="grey",pch=16,cex=cex)
  points(locusPos[cex==1], xtx[cex==1],col="blue",pch=21)

plot(locusPos, lpval.gcta,col="grey",pch=16,cex=cex, main="GCTA log P")
  add_layers()
  points(locusPos, lpval.gcta,col="grey",pch=16,cex=cex)
  points(locusPos[cex==1], lpval.gcta[cex==1],col="blue",pch=21)

plot(locusPos, lpval.gcta,col="grey",pch=16,cex=cex, xlim=c(0,40000), main="GCTA log P")
  add_layers()
  points(locusPos, lpval.gcta,col="grey",pch=16,cex=cex)
  points(locusPos[cex==1], lpval.gcta[cex==1],col="blue",pch=21)

plot(locusPos, lpval.gcta,col="grey",pch=16,cex=cex, xlim=c(80000,120000), main="GCTA log P")
  add_layers()
  points(locusPos, lpval.gcta,col="grey",pch=16,cex=cex)
  points(locusPos[cex==1], lpval.gcta[cex==1],col="blue",pch=21)

SweeD

Likelihood is the score, alpha is inversely related to teh size of the region used to calculate the score.

$ SweeD -input ../../inputs/sim1a.vcf -grid 100 -name sim1a

out <- read.table("results/sweed/SweeD_Report_sim1a_edited.txt", header=TRUE)
head(out)

##    Position Likelihood      Alpha
## 1   21.0000 0.11374640 19.3926500
## 2  424.1212 0.07445243  9.6379690
## 3  827.2424 0.76083240  0.1096088
## 4 1230.3636 0.00000000  3.0000010
## 5 1633.4849 0.49518130  0.3392800
## 6 2036.6061 0.39430570  0.1856157

plot(out$Position, out$Likelihood, pch=20, col="grey", main="SweeD")
   add_layers()
   points(out$Position, out$Likelihood, pch=20, col="grey")

  # use simulations to determine a good likelihood cutoff and demographic models the score is over 50 or 60

REHH: iHS

The iHS is a within-population measure of LD.

Conversion script from VCF to REHH not provided.

###################
##For each chromosome in turn: 
## i)  Read haplotype data
## ii) Compute iHH_A, iHH_D, iES_Tang and iES_Sabeti
###################
library(rehh)

## Loading required package: rehh.data

cnt=0
for(i in 1:6){
 cnt=cnt+1
 tmp.hapfile=paste0(folder,"sim",SIMU,"a.chr",i,".thap")
 
 tmp.hap=data2haplohh(hap_file=tmp.hapfile, map_file=paste0(folder,"sim",SIMU,"a.rehhmap.inp"), chr.name=i,haplotype.in.columns=TRUE)
 
 tmp.scan=scan_hh(tmp.hap,threads=4)
 
 if(cnt==1){wgscan=tmp.scan}else{wgscan=rbind(wgscan,tmp.scan)}
}

## Warning in data2haplohh(hap_file = tmp.hapfile, map_file = paste0(folder, :
## Some SNPs map to the same position

## Map file seems OK: 976  SNPs declared for chromosome 1 
## Haplotype are in columns with no header
## Discard Haplotype with less than  100 % of genotyped SNPs
## No haplotype discarded
## Discard SNPs genotyped on less than  100 % of haplotypes
## No SNP discarded
## Data consists of 2000 haplotypes and 976 SNPs

## Warning in data2haplohh(hap_file = tmp.hapfile, map_file = paste0(folder, :
## Some SNPs map to the same position

## Map file seems OK: 1029  SNPs declared for chromosome 2 
## Haplotype are in columns with no header
## Discard Haplotype with less than  100 % of genotyped SNPs
## No haplotype discarded
## Discard SNPs genotyped on less than  100 % of haplotypes
## No SNP discarded
## Data consists of 2000 haplotypes and 1029 SNPs

## Warning in data2haplohh(hap_file = tmp.hapfile, map_file = paste0(folder, :
## Some SNPs map to the same position

## Map file seems OK: 986  SNPs declared for chromosome 3 
## Haplotype are in columns with no header
## Discard Haplotype with less than  100 % of genotyped SNPs
## No haplotype discarded
## Discard SNPs genotyped on less than  100 % of haplotypes
## No SNP discarded
## Data consists of 2000 haplotypes and 986 SNPs

## Warning in data2haplohh(hap_file = tmp.hapfile, map_file = paste0(folder, :
## Some SNPs map to the same position

## Map file seems OK: 1033  SNPs declared for chromosome 4 
## Haplotype are in columns with no header
## Discard Haplotype with less than  100 % of genotyped SNPs
## No haplotype discarded
## Discard SNPs genotyped on less than  100 % of haplotypes
## No SNP discarded
## Data consists of 2000 haplotypes and 1033 SNPs

## Warning in data2haplohh(hap_file = tmp.hapfile, map_file = paste0(folder, :
## Some SNPs map to the same position

## Map file seems OK: 961  SNPs declared for chromosome 5 
## Haplotype are in columns with no header
## Discard Haplotype with less than  100 % of genotyped SNPs
## No haplotype discarded
## Discard SNPs genotyped on less than  100 % of haplotypes
## No SNP discarded
## Data consists of 2000 haplotypes and 961 SNPs

## Warning in data2haplohh(hap_file = tmp.hapfile, map_file = paste0(folder, :
## Some SNPs map to the same position

## Map file seems OK: 955  SNPs declared for chromosome 6 
## Haplotype are in columns with no header
## Discard Haplotype with less than  100 % of genotyped SNPs
## No haplotype discarded
## Discard SNPs genotyped on less than  100 % of haplotypes
## No SNP discarded
## Data consists of 2000 haplotypes and 955 SNPs

dim(wgscan)

## [1] 5940    7

ihs=ihh2ihs(wgscan,minmaf=0.05,freqbin=0.05)
  head(ihs$iHS,25)

##      CHR POSITION iHS -log10(p-value)
## rs6    1      338  NA              NA
## rs7    1      364  NA              NA
## rs8    1      400  NA              NA
## rs11   1      515  NA              NA
## rs12   1      528  NA              NA
## rs19   1     1006  NA              NA
## rs24   1     1191  NA              NA
## rs28   1     1365  NA              NA
## rs31   1     1498  NA              NA
## rs32   1     1513  NA              NA
## rs33   1     1550  NA              NA
## rs35   1     1600  NA              NA
## rs36   1     1612  NA              NA
## rs46   1     1925  NA              NA
## rs47   1     1976  NA              NA
## rs54   1     2392  NA              NA
## rs55   1     2427  NA              NA
## rs57   1     2449  NA              NA
## rs60   1     2496  NA              NA
## rs61   1     2511  NA              NA
## rs62   1     2542  NA              NA
## rs68   1     2879  NA              NA
## rs69   1     2882  NA              NA
## rs70   1     2909  NA              NA
## rs72   1     2934  NA              NA

  tail(ihs$iHS)

##        CHR POSITION iHS -log10(p-value)
## rs5919   6   239078  NA              NA
## rs5926   6   239398  NA              NA
## rs5927   6   239509  NA              NA
## rs5928   6   239538  NA              NA
## rs5932   6   239634  NA              NA
## rs5934   6   239706  NA              NA

  ihs$frequency.class

##            Number of SNPs mean of the log(iHHA/iHHD) ratio
## 0.05 - 0.1             32                       1.09660855
## 0.1 - 0.15             32                       0.79394704
## 0.15 - 0.2             51                       0.73844344
## 0.2 - 0.25             42                       0.49214859
## 0.25 - 0.3             35                       0.15316435
## 0.3 - 0.35             52                       0.15483401
## 0.35 - 0.4             44                       0.05861124
## 0.4 - 0.45             63                       0.01024730
## 0.45 - 0.5             62                       0.02172340
## 0.5 - 0.55             57                      -0.18316069
## 0.55 - 0.6             92                      -0.23828980
## 0.6 - 0.65             94                      -0.30992872
## 0.65 - 0.7             97                      -0.34170415
## 0.7 - 0.75            109                      -0.46591618
## 0.75 - 0.8            172                      -0.73141440
## 0.8 - 0.85            175                      -0.83584247
## 0.85 - 0.9            309                      -1.11365588
## 0.9 - 0.95            421                      -1.42897780
##            sd of the log(iHHA/iHHD) ratio
## 0.05 - 0.1                      0.5714631
## 0.1 - 0.15                      0.4663963
## 0.15 - 0.2                      0.5489434
## 0.2 - 0.25                      0.4246825
## 0.25 - 0.3                      0.3561133
## 0.3 - 0.35                      0.3309306
## 0.35 - 0.4                      0.4147639
## 0.4 - 0.45                      0.4461021
## 0.45 - 0.5                      0.3967031
## 0.5 - 0.55                      0.3237119
## 0.55 - 0.6                      0.4104828
## 0.6 - 0.65                      0.3957538
## 0.65 - 0.7                      0.3907953
## 0.7 - 0.75                      0.4567719
## 0.75 - 0.8                      0.4068721
## 0.8 - 0.85                      0.4522183
## 0.85 - 0.9                      0.5461895
## 0.9 - 0.95                      0.4923380

  #distribplot(ihs$iHS$iHS)

  #ihsplot(ihs)
  ihs$iHS[which(ihs$iHS[,4]>2),]

##        CHR POSITION       iHS -log10(p-value)
## rs185    1     8230 -3.182865        2.836166
## rs1449   2    58208 -2.771271        2.253071
## rs1457   2    58509 -2.827615        2.328863
## rs3566   4   141742 -3.412286        3.190976
## rs3568   4   141759  2.823829        2.323730
## rs3569   4   141759  2.823829        2.323730
## rs3870   4   153368 -2.661302        2.108803
## rs4166   5   166008  3.046233        2.635021
## rs5359   6   215729  3.214988        2.884557
## rs5617   6   226485 -2.787467        2.274726
## rs5713   6   230127 -3.254299        2.944345

  plot(ihs$iHS$POSITION,ihs$iHS$`-log10(p-value)`, col="grey", pch=20, main="REHH iHS")
    add_layers()
    points(ihs$iHS$POSITION,ihs$iHS$`-log10(p-value)`, col="grey", pch=20)

  plot(ihs$iHS$POSITION,ihs$iHS$iHS, col="grey", pch=20, main="REHH iHS")
    add_layers()
    points(ihs$iHS$POSITION,ihs$iHS$iHS, col="grey", pch=20)

###pop based on phenotype: ####
    # pop2=individuals with pheno1>1 and pop3=individuals with pheno1<-1

cnt=0
for(i in 1:6){
 cnt=cnt+1
 tmp.hapfile=paste0(folder,"sim",SIMU,"a.chr",i,".pop2.thap")
 tmp.hap=data2haplohh(hap_file=tmp.hapfile, map_file=paste0(folder,"sim",SIMU,"a.rehhmap.inp"), chr.name=i,haplotype.in.columns=TRUE)
 tmp.scan.pop1=scan_hh(tmp.hap,threads=4)
 tmp.hapfile=paste0(folder,"sim",SIMU,"a.chr",i,".pop3.thap")
 tmp.hap=data2haplohh(hap_file=tmp.hapfile, map_file=paste0(folder,"sim",SIMU,"a.rehhmap.inp"), chr.name=i,haplotype.in.columns=TRUE)
 tmp.scan.pop2=scan_hh(tmp.hap,threads=4)
 if(cnt==1){
  wgscan.pop1=tmp.scan.pop1
  wgscan.pop2=tmp.scan.pop2  
  }else{
  wgscan.pop1=rbind(wgscan.pop1,tmp.scan.pop1)
  wgscan.pop2=rbind(wgscan.pop2,tmp.scan.pop2)    
  }
}

## Warning in data2haplohh(hap_file = tmp.hapfile, map_file = paste0(folder, :
## Some SNPs map to the same position

## Map file seems OK: 976  SNPs declared for chromosome 1 
## Haplotype are in columns with no header
## Discard Haplotype with less than  100 % of genotyped SNPs
## No haplotype discarded
## Discard SNPs genotyped on less than  100 % of haplotypes
## No SNP discarded
## Data consists of 170 haplotypes and 976 SNPs

## Warning in data2haplohh(hap_file = tmp.hapfile, map_file = paste0(folder, :
## Some SNPs map to the same position

## Map file seems OK: 976  SNPs declared for chromosome 1 
## Haplotype are in columns with no header
## Discard Haplotype with less than  100 % of genotyped SNPs
## No haplotype discarded
## Discard SNPs genotyped on less than  100 % of haplotypes
## No SNP discarded
## Data consists of 154 haplotypes and 976 SNPs

## Warning in data2haplohh(hap_file = tmp.hapfile, map_file = paste0(folder, :
## Some SNPs map to the same position

## Map file seems OK: 1029  SNPs declared for chromosome 2 
## Haplotype are in columns with no header
## Discard Haplotype with less than  100 % of genotyped SNPs
## No haplotype discarded
## Discard SNPs genotyped on less than  100 % of haplotypes
## No SNP discarded
## Data consists of 170 haplotypes and 1029 SNPs

## Warning in data2haplohh(hap_file = tmp.hapfile, map_file = paste0(folder, :
## Some SNPs map to the same position

## Map file seems OK: 1029  SNPs declared for chromosome 2 
## Haplotype are in columns with no header
## Discard Haplotype with less than  100 % of genotyped SNPs
## No haplotype discarded
## Discard SNPs genotyped on less than  100 % of haplotypes
## No SNP discarded
## Data consists of 154 haplotypes and 1029 SNPs

## Warning in data2haplohh(hap_file = tmp.hapfile, map_file = paste0(folder, :
## Some SNPs map to the same position

## Map file seems OK: 986  SNPs declared for chromosome 3 
## Haplotype are in columns with no header
## Discard Haplotype with less than  100 % of genotyped SNPs
## No haplotype discarded
## Discard SNPs genotyped on less than  100 % of haplotypes
## No SNP discarded
## Data consists of 170 haplotypes and 986 SNPs

## Warning in data2haplohh(hap_file = tmp.hapfile, map_file = paste0(folder, :
## Some SNPs map to the same position

## Map file seems OK: 986  SNPs declared for chromosome 3 
## Haplotype are in columns with no header
## Discard Haplotype with less than  100 % of genotyped SNPs
## No haplotype discarded
## Discard SNPs genotyped on less than  100 % of haplotypes
## No SNP discarded
## Data consists of 154 haplotypes and 986 SNPs

## Warning in data2haplohh(hap_file = tmp.hapfile, map_file = paste0(folder, :
## Some SNPs map to the same position

## Map file seems OK: 1033  SNPs declared for chromosome 4 
## Haplotype are in columns with no header
## Discard Haplotype with less than  100 % of genotyped SNPs
## No haplotype discarded
## Discard SNPs genotyped on less than  100 % of haplotypes
## No SNP discarded
## Data consists of 170 haplotypes and 1033 SNPs

## Warning in data2haplohh(hap_file = tmp.hapfile, map_file = paste0(folder, :
## Some SNPs map to the same position

## Map file seems OK: 1033  SNPs declared for chromosome 4 
## Haplotype are in columns with no header
## Discard Haplotype with less than  100 % of genotyped SNPs
## No haplotype discarded
## Discard SNPs genotyped on less than  100 % of haplotypes
## No SNP discarded
## Data consists of 154 haplotypes and 1033 SNPs

## Warning in data2haplohh(hap_file = tmp.hapfile, map_file = paste0(folder, :
## Some SNPs map to the same position

## Map file seems OK: 961  SNPs declared for chromosome 5 
## Haplotype are in columns with no header
## Discard Haplotype with less than  100 % of genotyped SNPs
## No haplotype discarded
## Discard SNPs genotyped on less than  100 % of haplotypes
## No SNP discarded
## Data consists of 170 haplotypes and 961 SNPs

## Warning in data2haplohh(hap_file = tmp.hapfile, map_file = paste0(folder, :
## Some SNPs map to the same position

## Map file seems OK: 961  SNPs declared for chromosome 5 
## Haplotype are in columns with no header
## Discard Haplotype with less than  100 % of genotyped SNPs
## No haplotype discarded
## Discard SNPs genotyped on less than  100 % of haplotypes
## No SNP discarded
## Data consists of 154 haplotypes and 961 SNPs

## Warning in data2haplohh(hap_file = tmp.hapfile, map_file = paste0(folder, :
## Some SNPs map to the same position

## Map file seems OK: 955  SNPs declared for chromosome 6 
## Haplotype are in columns with no header
## Discard Haplotype with less than  100 % of genotyped SNPs
## No haplotype discarded
## Discard SNPs genotyped on less than  100 % of haplotypes
## No SNP discarded
## Data consists of 170 haplotypes and 955 SNPs

## Warning in data2haplohh(hap_file = tmp.hapfile, map_file = paste0(folder, :
## Some SNPs map to the same position

## Map file seems OK: 955  SNPs declared for chromosome 6 
## Haplotype are in columns with no header
## Discard Haplotype with less than  100 % of genotyped SNPs
## No haplotype discarded
## Discard SNPs genotyped on less than  100 % of haplotypes
## No SNP discarded
## Data consists of 154 haplotypes and 955 SNPs

xpehh=ies2xpehh(wgscan.pop1,wgscan.pop2)

#xpehhplot(xpehh)
xpehh[which(xpehh[,4]>2.5),]

##        CHR POSITION     XPEHH -log10(p-value) [bilateral]
## rs924    1    38051  4.846620                    5.901070
## rs926    1    38093  3.388071                    3.152513
## rs1630   2    64920 -3.061946                    2.657769
## rs2044   3    81415 -3.013151                    2.587456
## rs3163   4   126453  3.064205                    2.661047
## rs4259   5   170425  3.381116                    3.141511
## rs4260   5   170440  3.292190                    3.002568
## rs4261   5   170460  3.835330                    3.901717
## rs4262   5   170493  3.505331                    3.340997
## rs4264   5   170563  3.198220                    2.859244
## rs5453   6   219803  3.096292                    2.707839
## rs5456   6   219919  3.425592                    3.212212
## rs5457   6   219933  3.705832                    3.676340
## rs5458   6   219965  3.572687                    3.451811
## rs5459   6   219997  3.560128                    3.431007
## rs5462   6   220209  3.162945                    2.806370
## rs5477   6   220630  3.796370                    3.833184
## rs5478   6   220661  3.797793                    3.835676
## rs5479   6   220694  3.726996                    3.712701
## rs5603   6   225881  3.084691                    2.690873
## rs5606   6   225958  3.001226                    2.570417
## rs5617   6   226485  3.455438                    3.260112
## rs5717   6   230275  3.326834                    3.056314
## rs5720   6   230312  2.975288                    2.533557

str(xpehh)

## 'data.frame':    5940 obs. of  4 variables:
##  $ CHR                        : chr  "1" "1" "1" "1" ...
##  $ POSITION                   : num  21 109 133 231 290 338 364 400 424 513 ...
##  $ XPEHH                      : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ -log10(p-value) [bilateral]: num  NA NA NA NA NA NA NA NA NA NA ...

plot(xpehh$POSITION,xpehh$XPEHH, col="grey", pch=20, main="REHH xpehh low vs. high pheno")
    add_layers()
    points(xpehh$POSITION,xpehh$XPEHH, col="grey", pch=20)

    # positive score is
    # negative score is

plot(xpehh$POSITION,xpehh$`-log10(p-value)`, col="grey", pch=20, main="REHH xpehh low vs. high pheno")
    add_layers()
    points(xpehh$POSITION,xpehh$`-log10(p-value)`, col="grey", pch=20)

plot(xpehh$POSITION,xpehh$`-log10(p-value)`, col="grey", pch=20, xlim=c(0,40000), main="REHH xpehh low vs. high pheno")
    add_layers()
    points(xpehh$POSITION,xpehh$`-log10(p-value)`, col="grey", pch=20)

plot(xpehh$POSITION,xpehh$`-log10(p-value)`, col="grey", pch=20, xlim=c(40000, 79500), main="REHH xpehh low vs. high pheno")
    add_layers()
    points(xpehh$POSITION,xpehh$`-log10(p-value)`, col="grey", pch=20)

plot(xpehh$POSITION,xpehh$`-log10(p-value)`, col="grey", pch=20, xlim=c(80000, 120000), main="REHH xpehh low vs. high pheno")
    add_layers()
    points(xpehh$POSITION,xpehh$`-log10(p-value)`, col="grey", pch=20)

plot(xpehh$POSITION,xpehh$`-log10(p-value)`, col="grey", pch=20, xlim=c(200000, 240000), main="REHH xpehh low vs. high pheno")
    add_layers()
    points(xpehh$POSITION,xpehh$`-log10(p-value)`, col="grey", pch=20)

plot(xpehh$POSITION,xpehh$`-log10(p-value)`, col="grey", pch=20, xlim=c(160000, 200000), main="REHH xpehh low vs. high pheno")
    add_layers()
    points(xpehh$POSITION,xpehh$`-log10(p-value)`, col="grey", pch=20)

Note: Should have used map position based on cM (e.g, recombination) instead of SNP index or map position for REHH and XPEHH. This would compress the signals at the low recombination regions.

Omega Plus

OmegaPlus-M -threads 4 -input sim1a.vcf -grid 1000 -minwin 2000 -maxwin
 200000 -name SIM1A -seed 3993 -noSeparator

om <- read.table("results/omegaplus/OmegaPlus_Report.SIM1A", header=TRUE)
head(om)

##        POS SCORE
## 1  21.0000     0
## 2  60.9489     0
## 3 100.8979     0
## 4 140.8468     0
## 5 180.7958     0
## 6 220.7447     0

plot(om$POS, om$SCORE, main="Omega Plus")
  add_layers()

Raisd

RAiSD -I sim1a.vcf -n sim1A -s -t -f

-I input

-n name

-s separate files

-t don’t put delimiters

-f force overwrite

ras <- read.table("results/raisd/RAiSD_Report.sim1A.ALL", header=TRUE)
head(ras)

##   POS Variability SFS   LD Composite
## 1 267   0.0012320 0.2 0.40 9.857e-05
## 2 312   0.0010170 0.2 0.40 8.134e-05
## 3 330   0.0009892 0.1 0.40 3.957e-05
## 4 429   0.0009917 0.1 0.45 4.463e-05
## 5 488   0.0009942 0.1 0.32 3.182e-05
## 6 542   0.0010220 0.1 0.32 3.270e-05

plot(ras$POS, ras$Variability, pch=20, col="grey", main = "RAiSD")
  # higher the variability score, the lower the SNP variation
  add_layers()
  points(ras$POS, ras$Variability, pch=20, col="grey")

plot(ras$POS, ras$SFS, pch=20, col="grey")

  # this only counts high freq derived on left and right, don't pay attention

plot(ras$POS, ras$LD, pch=20, col="grey" , main = "RAiSD")
  # low LD score means that there are no unique pattern on left or on right
  # high LD score means that there are unique patterns on left and right side
  add_layers()
  points(ras$POS, ras$LD, pch=20, col="grey")

plot(ras$POS, ras$Composite, pch=20, col="grey", main = "RAiSD")
  # low LD score means that there are no unique pattern on left or on right
  # high LD score means that there are unique patterns on left and right side
  add_layers()
  points(ras$POS, ras$Composite, pch=20, col="grey")