# install.packages("devtools")
library(devtools)
## Warning: package 'devtools' was built under R version 3.4.1
# source("http://bioconductor.org/biocLite.R")
# biocLite("qvalue")
# install_github("whitlock/OutFLANK")
library(OutFLANK)
## Loading required package: qvalue
if(!("adegenet" %in% installed.packages())){install.packages("adegenet")}
library(vcfR)
##
## ***** *** vcfR *** *****
## This is vcfR 1.5.0
## browseVignettes('vcfR') # Documentation
## citation('vcfR') # Citation
## ***** ***** ***** *****
library(adegenet)
## Loading required package: ade4
##
## /// adegenet 2.0.1 is loaded ////////////
##
## > overview: '?adegenet'
## > tutorials/doc/questions: 'adegenetWeb()'
## > bug reports/feature requests: adegenetIssues()
library(pcadapt)
library(lfmm)
library(LEA)
### Read in edited VCF file and convert to outFLANK format
ind <- read.table("outputIndAll.txt", header=TRUE)
muts <- read.table("OutputMuts_Detail.txt", header=TRUE)
sim <- system("grep recomb outputSim.txt", TRUE)
vcf <- read.vcfR("vcf_sim1a_contest.vcf.gz")
##
Meta line 13 read in.
## All meta lines processed.
## Character matrix gt created.
## Character matrix gt rows: 5940
## Character matrix gt cols: 1009
## skip: 0
## nrows: 5940
## row_num: 0
##
##
Processed variant 1000
Processed variant 2000
Processed variant 3000
Processed variant 4000
Processed variant 5000
Processed variant: 5940
## All variants processed
recom <- matrix(unlist(strsplit(sim, " ")), ncol=2)
colnames(recom) <- recom[1,]
recom <- recom[-1,]
recom <- apply(recom, 2, as.numeric)
muts
## position selCoef originGen type freq pa2 prop count
## 1 3200 0.0989896 3856 m2 0.2335 0.002 0.02 TRUE
## 2 10589 -0.1296740 4 m2 1.0000 0.000 0.00 FALSE
## 3 17593 -0.0991819 238 m2 0.1970 0.002 0.02 TRUE
## 4 18862 -1.2885800 3997 m2 0.0005 0.001 0.01 TRUE
## 5 19341 0.2009850 3994 m2 0.0005 0.000 0.00 FALSE
## 6 21117 -0.7836390 4000 m2 0.0005 0.000 0.00 FALSE
## 7 21929 -0.3388470 2119 m2 0.2415 0.021 0.21 TRUE
## 8 26152 -0.1577500 3997 m2 0.0015 0.000 0.00 FALSE
## 9 27702 0.1191710 3004 m2 0.5325 0.004 0.04 TRUE
## 10 30094 0.0959116 3815 m2 0.0145 0.000 0.00 FALSE
## 11 30729 -0.7890840 3997 m2 0.0015 0.001 0.01 TRUE
## 12 34180 0.0188577 2050 m2 0.4735 0.000 0.00 FALSE
## 13 36194 0.1990460 4000 m2 0.0005 0.000 0.00 FALSE
## 14 37385 0.1180380 2991 m2 0.0140 0.000 0.00 FALSE
## 15 81503 -1.0921700 4000 m2 0.0005 0.001 0.01 TRUE
## 16 81696 0.0594199 712 m2 0.7535 0.001 0.01 TRUE
## 17 81730 0.5315880 3 m2 0.3000 0.059 0.59 TRUE
## 18 85118 -0.1463480 360 m2 0.3095 0.005 0.05 TRUE
## 19 85955 -0.0500430 27 m2 1.0000 0.000 0.00 FALSE
## 20 87082 0.6965490 3993 m2 0.0005 0.000 0.00 FALSE
## 21 88848 -0.5934370 4000 m2 0.0005 0.000 0.00 FALSE
## 22 90699 -0.2599380 3999 m2 0.0010 0.000 0.00 FALSE
## 23 96270 -0.0208146 3677 m2 0.1755 0.000 0.00 FALSE
## 24 97530 0.5923690 4000 m2 0.0005 0.000 0.00 FALSE
## 25 99045 -0.7824100 3995 m2 0.0015 0.001 0.01 TRUE
## 26 100803 0.1659520 3739 m2 0.0870 0.002 0.02 TRUE
## 27 101009 0.3074200 4000 m2 0.0005 0.000 0.00 FALSE
## 28 104069 -0.0453107 3849 m2 0.0050 0.000 0.00 FALSE
## 29 106773 0.2700430 3994 m2 0.0020 0.000 0.00 FALSE
## 30 109888 -0.4721010 3998 m2 0.0005 0.000 0.00 FALSE
## 31 112932 -0.1638910 4000 m2 0.0005 0.000 0.00 FALSE
## 32 116833 -0.4205310 3995 m2 0.0005 0.000 0.00 FALSE
## 33 117820 0.5029660 3986 m2 0.0005 0.000 0.00 FALSE
## 34 186395 0.0340453 2889 m4 1.0000 0.000 0.00 TRUE
## 35 186895 0.0490189 3996 m4 0.0010 0.000 0.00 FALSE
## 36 187265 0.0049572 3940 m4 0.0355 0.000 0.00 FALSE
muts_count <- muts[muts$count>0,]
muts_count
## position selCoef originGen type freq pa2 prop count
## 1 3200 0.0989896 3856 m2 0.2335 0.002 0.02 TRUE
## 3 17593 -0.0991819 238 m2 0.1970 0.002 0.02 TRUE
## 4 18862 -1.2885800 3997 m2 0.0005 0.001 0.01 TRUE
## 7 21929 -0.3388470 2119 m2 0.2415 0.021 0.21 TRUE
## 9 27702 0.1191710 3004 m2 0.5325 0.004 0.04 TRUE
## 11 30729 -0.7890840 3997 m2 0.0015 0.001 0.01 TRUE
## 15 81503 -1.0921700 4000 m2 0.0005 0.001 0.01 TRUE
## 16 81696 0.0594199 712 m2 0.7535 0.001 0.01 TRUE
## 17 81730 0.5315880 3 m2 0.3000 0.059 0.59 TRUE
## 18 85118 -0.1463480 360 m2 0.3095 0.005 0.05 TRUE
## 25 99045 -0.7824100 3995 m2 0.0015 0.001 0.01 TRUE
## 26 100803 0.1659520 3739 m2 0.0870 0.002 0.02 TRUE
## 34 186395 0.0340453 2889 m4 1.0000 0.000 0.00 TRUE
# Get indexes of causal loci
all_loci <- as.numeric(vcf@fix[,"POS"])
all_loci_sorted <- sort(all_loci)
index <- which(all_loci_sorted %in% muts_count$position) #should line up
all_loci_sorted[index]
## [1] 3200 17593 18862 21929 27702 30729 81503 81696 81730 85118
## [11] 99045 100803
muts_count$position
## [1] 3200 17593 18862 21929 27702 30729 81503 81696 81730 85118
## [11] 99045 100803 186395
muts_count$index <- c(index,NA) # add NA for the m4 mutation
ymax <- 10^5
add_layers <- function(){
#QTL 1
polygon(c(0,40000, 40000, 0), c(-ymax,-ymax, ymax, ymax), col = adjustcolor("blue", 0.1), border = rgb(0,0,0,0))
#QTL 2
polygon(c(80000,120000, 120000,80000), c(-ymax,-ymax, ymax, ymax), col = adjustcolor("blue", 0.1), border = rgb(0,0,0,0))
abline(v = muts_count$position[muts_count$type=="m2"], col="black", lwd=muts_count$prop*20)
# low recom w/ background selection
polygon(c(54001, 64020, 64020, 54001), c(-ymax,-ymax, ymax, ymax), col = adjustcolor("red", 0.2), border = rgb(0,0,0,0))
# low recom only
polygon(c(220424, 230443, 230443, 220424), c(-ymax,-ymax, ymax, ymax), col = adjustcolor("red", 0.1), border = rgb(0,0,0,0))
# Selective Sweep
polygon(c(185941, 189210, 189210, 185941), c(-ymax,-ymax, ymax, ymax), col = adjustcolor("purple", 0.1), border = rgb(0,0,0,0))
abline(v = muts_count$position[muts_count$type=="m4"], col="purple", lwd=2)
}
plot(x=c(0,240000), y=c(0,0),ylim=c(0,1))
add_layers()
NOTE: This has not been thouroughly tested.
### Load the vcfR package
require(vcfR)
obj.vcfR <- read.vcfR("vcf_sim1a_contest.vcf.gz")
##
Meta line 13 read in.
## All meta lines processed.
## Character matrix gt created.
## Character matrix gt rows: 5940
## Character matrix gt cols: 1009
## skip: 0
## nrows: 5940
## row_num: 0
##
##
Processed variant 1000
Processed variant 2000
Processed variant 3000
Processed variant 4000
Processed variant 5000
Processed variant: 5940
## All variants processed
# NB: Creates a vcfR object (stored in RAM) which size is twice as big as the original vcf file. So when dealing with large data, make sure you have enough RAM space available before proceeding.
geno <- extract.gt(obj.vcfR) # Character matrix containing the genotypes
position <- getPOS(obj.vcfR) # Positions in bp
chromosome <- getCHROM(obj.vcfR) # Chromosome information
G <- matrix(NA, nrow = nrow(geno), ncol = ncol(geno))
G[geno %in% c("0/0", "0|0")] <- 0
G[geno %in% c("0/1", "1/0", "1|0", "0|1")] <- 1
G[geno %in% c("1/1", "1|1")] <- 2
rds <- list(G = G, position = position, chromosome = chromosome)
saveRDS(rds, "sim1a.rds")
# NB: About RDS
# - Pros:
# Can store objects of different class
# Binary format so RDS files are very light
# - Cons:
# Not human readable
# Not "any other language" readable, can be used only with R
training<-readRDS("inputs/sim1a.rds")
dim(training$G) #1000 individuals in columns
## [1] 5940 1000
gen_table <- t(training$G)
locusPos <- all_loci_sorted
dipmat <- MakeDiploidFSTMat(gen_table, locusPos, training$pop)
## Calculating FSTs, may take a few minutes...
head(dipmat)
## LocusName He FST T1 T2 FSTNoCorr
## 1 21 0.0324555 0.011320095 1.838491e-04 0.0162409509 0.03013683
## 2 109 0.0009995 -0.006412466 -3.205650e-06 0.0004999091 0.01340827
## 3 133 0.0019980 0.007739770 7.737595e-06 0.0009997190 0.02700356
## 4 231 0.0591395 0.009521405 2.817581e-04 0.0295920765 0.02812038
## 5 290 0.0314880 0.010204201 1.607806e-04 0.0157563125 0.02907417
## 6 338 0.0958995 0.027598488 1.325005e-03 0.0480100581 0.04507415
## T1NoCorr T2NoCorr meanAlleleFreq
## 1 4.894857e-04 0.0162421105 0.9835
## 2 6.703422e-06 0.0004999467 0.9995
## 3 2.699795e-05 0.0009997921 0.9990
## 4 8.321991e-04 0.0295941651 0.9695
## 5 4.581346e-04 0.0157574408 0.9840
## 6 2.164156e-03 0.0480132420 0.9495
hist(dipmat$FST, breaks=100)
hist(dipmat$He[dipmat$He>0.1])
plot(dipmat$FST, dipmat$FSTNoCorr)
hist(dipmat$FSTNoCorr-dipmat$FST, breaks=100)
plot(dipmat$He, dipmat$FST, pch=20, col="grey")
hist(dipmat$FSTNoCorr, breaks=seq(0,0.3, by=0.001))
hist(dipmat$FSTNoCorr[dipmat$He>0.05], breaks=seq(0,0.3, by=0.001))
hist(dipmat$FSTNoCorr[dipmat$He>0.1], breaks=seq(0,0.3, by=0.001))
quantile(dipmat$FSTNoCor, probs = c(0.99, 0.95, 0.9, 0.85), na.rm=TRUE)
## 99% 95% 90% 85%
## 0.08912094 0.05720492 0.04848406 0.04403053
quantile(dipmat$FSTNoCor, probs = c(0.05, 0.1, 0.15, 0.2, 0.3, 0.4), na.rm=TRUE)
## 5% 10% 15% 20% 30% 40%
## 0.01340827 0.01558282 0.01775073 0.02008759 0.02419584 0.02786597
outlier = OutFLANK(dipmat,NumberOfSamples = 39)
OutFLANKResultsPlotter(outlier, withOutliers = TRUE,
NoCorr = TRUE, Hmin = 0.1, binwidth = 0.001, Zoom =
FALSE, RightZoomFraction = 0.05, titletext = NULL)
outlier = OutFLANK(dipmat,NumberOfSamples = 39,
RightTrimFraction = 0.05, LeftTrimFraction = 0.40,
qthreshold = 0.05, Hmin = 0.1)
# increasing the Right Trim Fraction doesn't help
# in this case we need to increase the left trim fraction because of the large number of rare alleles in the data (even though these are excluded in the algorithm, the are included in the calculation of the trim points)
OutFLANKResultsPlotter(outlier, withOutliers = TRUE,
NoCorr = TRUE, Hmin = 0.1, binwidth = 0.001, Zoom =
FALSE, RightZoomFraction = 0.05, titletext = NULL)
hist(outlier$results$pvaluesRightTail)
hist(outlier$results$qvalues)
str(outlier)
## List of 6
## $ FSTbar : num 0.0174
## $ FSTNoCorrbar : num 0.0363
## $ dfInferred : num 32.4
## $ numberLowFstOutliers : int 0
## $ numberHighFstOutliers: int 142
## $ results :'data.frame': 5940 obs. of 15 variables:
## ..$ LocusName : num [1:5940] 21 109 133 231 290 338 364 400 424 513 ...
## ..$ He : num [1:5940] 0.032455 0.000999 0.001998 0.059139 0.031488 ...
## ..$ FST : num [1:5940] 0.01132 -0.00641 0.00774 0.00952 0.0102 ...
## ..$ T1 : num [1:5940] 1.84e-04 -3.21e-06 7.74e-06 2.82e-04 1.61e-04 ...
## ..$ T2 : num [1:5940] 0.0162 0.0005 0.001 0.0296 0.0158 ...
## ..$ FSTNoCorr : num [1:5940] 0.0301 0.0134 0.027 0.0281 0.0291 ...
## ..$ T1NoCorr : num [1:5940] 4.89e-04 6.70e-06 2.70e-05 8.32e-04 4.58e-04 ...
## ..$ T2NoCorr : num [1:5940] 0.0162 0.0005 0.001 0.0296 0.0158 ...
## ..$ meanAlleleFreq : num [1:5940] 0.984 1 0.999 0.97 0.984 ...
## ..$ indexOrder : int [1:5940] 1 2 3 4 5 6 7 8 9 10 ...
## ..$ GoodH : Factor w/ 2 levels "goodH","lowH": 2 2 2 2 2 2 1 1 2 2 ...
## ..$ qvalues : num [1:5940] 1 1 1 1 1 ...
## ..$ pvalues : num [1:5940] 0.519782 0.000805 0.293219 0.36687 0.436286 ...
## ..$ pvaluesRightTail: num [1:5940] 0.74 1 0.853 0.817 0.782 ...
## ..$ OutlierFlag : logi [1:5940] FALSE FALSE FALSE FALSE FALSE FALSE ...
sum(outlier$results$pvaluesRightTail < 0.001, na.rm=TRUE)
## [1] 137
sum(outlier$results$qvalues<0.01, na.rm=TRUE)
## [1] 101
head(outlier$results)
## LocusName He FST T1 T2 FSTNoCorr
## 1 21 0.0324555 0.011320095 1.838491e-04 0.0162409509 0.03013683
## 2 109 0.0009995 -0.006412466 -3.205650e-06 0.0004999091 0.01340827
## 3 133 0.0019980 0.007739770 7.737595e-06 0.0009997190 0.02700356
## 4 231 0.0591395 0.009521405 2.817581e-04 0.0295920765 0.02812038
## 5 290 0.0314880 0.010204201 1.607806e-04 0.0157563125 0.02907417
## 6 338 0.0958995 0.027598488 1.325005e-03 0.0480100581 0.04507415
## T1NoCorr T2NoCorr meanAlleleFreq indexOrder GoodH qvalues
## 1 4.894857e-04 0.0162421105 0.9835 1 lowH 0.9999488
## 2 6.703422e-06 0.0004999467 0.9995 2 lowH 0.9999488
## 3 2.699795e-05 0.0009997921 0.9990 3 lowH 0.9999488
## 4 8.321991e-04 0.0295941651 0.9695 4 lowH 0.9999488
## 5 4.581346e-04 0.0157574408 0.9840 5 lowH 0.9999488
## 6 2.164156e-03 0.0480132420 0.9495 6 lowH 0.9999488
## pvalues pvaluesRightTail OutlierFlag
## 1 0.5197820696 0.7401090 FALSE
## 2 0.0008050592 0.9995975 FALSE
## 3 0.2932188708 0.8533906 FALSE
## 4 0.3668703477 0.8165648 FALSE
## 5 0.4362856247 0.7818572 FALSE
## 6 0.3258701167 0.1629351 FALSE
topm2_vcf <- rep(FALSE, nrow(outlier$results))
topm2_vcf[muts_count$index[muts_count$type=="m2"]] <- TRUE
plot(dipmat$He, dipmat$FST, pch=20, col="grey", main="outflank")
points(dipmat$He[topm2_vcf], dipmat$FST[topm2_vcf], pch=21, col="blue")
#points(POS[topm2_vcf], abs(phen_cor)[topm2_vcf], col="blue")
plot(outlier$results$LocusName[outlier$results$He>0.1], outlier$results$FST[outlier$results$He>0.1], col="grey", pch=20, main="outflank")
add_layers()
points(outlier$results$LocusName[outlier$results$He>0.1], outlier$results$FST[outlier$results$He>0.1], col="grey", pch=20)
points(outlier$results$LocusName[outlier$results$He>0.1 & outlier$results$OutlierFlag==TRUE], outlier$results$FST[outlier$results$He>0.1 & outlier$results$OutlierFlag==TRUE], col="red")
plot(outlier$results$LocusName[outlier$results$He>0.1], outlier$results$FST[outlier$results$He>0.1], col="grey", pch=20, xlim=c(0,40000), main="outflank")
add_layers()
points(outlier$results$LocusName[outlier$results$He>0.1], outlier$results$FST[outlier$results$He>0.1], col="grey", pch=20)
points(outlier$results$LocusName[outlier$results$He>0.1 & outlier$results$OutlierFlag==TRUE], outlier$results$FST[outlier$results$He>0.1 & outlier$results$OutlierFlag==TRUE], col="red")
plot(outlier$results$LocusName[outlier$results$He>0.1], outlier$results$FST[outlier$results$He>0.1], col="grey", pch=20, xlim=c(80000,120000), main="outflank")
add_layers()
points(outlier$results$LocusName[outlier$results$He>0.1], outlier$results$FST[outlier$results$He>0.1], col="grey", pch=20)
points(outlier$results$LocusName[outlier$results$He>0.1 & outlier$results$OutlierFlag==TRUE], outlier$results$FST[outlier$results$He>0.1 & outlier$results$OutlierFlag==TRUE], col="red")
aux<-pcadapt(training$G,K=30)
## Number of SNPs: 5940
## Number of individuals: 1000
genotype_file <- aux
plot(aux,option="screeplot")
res<-pcadapt(training$G,K=4)
## Number of SNPs: 5940
## Number of individuals: 1000
x <- res
summary(x)
## Length Class Mode
## scores 4000 -none- numeric
## singular.values 4 -none- numeric
## zscores 23760 -none- numeric
## loadings 23760 -none- numeric
## maf 5940 -none- numeric
## missing 5940 -none- numeric
## stat 5940 -none- numeric
## gif 1 -none- numeric
## chi2.stat 5940 -none- numeric
## pvalues 5940 -none- numeric
plot(x, option = "qqplot", threshold = 0.05, main="pcadapt")
#plot(x, option = "stat.distribution")
# Default output from PCAdapt
par(mfrow=c(1,1))
plot(locusPos, x$chi2.stat, col="grey", pch=20, main="pcadapt without LD pruning")
add_layers()
# Calibrate p-values
hist(res$pvalues,breaks=50)
require(qvalue)
plot(qvalue(res$pvalues))
#choose # significant tests based on where expected false positives are 0 (bottom right)
outliers<- order(res$pvalues)[1:200]
# Plot without accounting for LD
plot(locusPos, x$chi2.stat, col="grey", pch=20, main="pcadapt without LD pruning")
add_layers()
points(locusPos, x$chi2.stat, col="grey", pch=20)
points(locusPos[outliers], x$chi2.stat[outliers], col="red", pch=20)
plot(locusPos, -log10(x$pvalues), col="grey", pch=20, main="pcadapt without LD pruning")
add_layers()
points(locusPos, -log10(x$pvalues), col="grey", pch=20)
points(locusPos[outliers], -log10(x$pvalues[outliers]), col="red", pch=20)
# Account for LD ####
devtools::install_github("privefl/bigsnpr")
## Skipping install of 'bigsnpr' from a github remote, the SHA1 (ccb2e930) has not changed since last install.
## Use `force = TRUE` to force installation
library(bigsnpr)
## Loading required package: bigstatsr
library(bigstatsr)
G<-add_code256(big_copy(t(training$G),type="raw"),code=bigsnpr:::CODE_012)
## Warning: Assignment will down cast from double to raw.
## Hint: To remove this warning, use options(bigstatsr.typecast.warning = FALSE).
#puts it in the raw format and stores likelihood genotype probability
newpc<-snp_autoSVD(G=G,infos.chr =training$chromosome,infos.pos = training$position)
## Phase of clumping at r2 > 0.2.. keep 3352 SNPs.
##
## Iteration 1:
## Computing SVD..
##
## Converged!
# this is doing SNP pruning - removing correlated SNPs
# take snps with highest MAF and correlate snps around it
# Snps with R^2 > 0.2 are removed
# the subset is the indexes of the remaining SNPs
str(newpc)
## List of 7
## $ d : num [1:10] 149 123 121 120 119 ...
## $ u : num [1:1000, 1:10] 0.037841 -0.000481 0.042954 -0.007822 -0.009347 ...
## $ v : num [1:3352, 1:10] 0.006 0.00255 0.00963 0.01262 -0.00197 ...
## $ niter : int 8
## $ nops : int 154
## $ center: num [1:3352] 0.033 0.001 0.002 0.032 0.12 0.018 0.001 0.548 0.38 0.003 ...
## $ scale : num [1:3352] 0.1802 0.0316 0.0447 0.1774 0.3359 ...
## - attr(*, "class")= chr "big_SVD"
## - attr(*, "subset")= int [1:3352] 1 2 3 5 8 9 10 11 12 14 ...
## - attr(*, "lrldr")='data.frame': 0 obs. of 3 variables:
## ..$ Chr : int(0)
## ..$ Start: int(0)
## ..$ Stop : int(0)
plot(newpc)
which_pruned <- attr(newpc, which="subset")
length(which_pruned)
## [1] 3352
gwas<-snp_gc(snp_pcadapt(G, U.row = newpc$u[,1]))
# snp_pcadapt doing pcadapt but on G format
# snp_gc is genomic inflation factor will set gif to 1
str(gwas)
## Classes 'mhtest' and 'data.frame': 5940 obs. of 3 variables:
## $ estim : num 0.00506 0.01204 0.03221 -0.01587 0.01079 ...
## $ std.err: num 0.0056 0.03167 0.02238 0.00415 0.00568 ...
## $ score : num 0.903 0.38 1.439 -3.822 1.901 ...
## - attr(*, "transfo")=function (x)
## - attr(*, "predict")=function (xtr)
pval<-predict(gwas,log10=F)
hist(pval,breaks=50)
plot(qvalue(pval))
outliers<-order(pval)[1:50]
#Manhattan
CHR<-training$chromosome
#snp_manhattan(gwas , CHR, unlist(tapply(CHR, CHR, seq_along)),
# dist.sep.chrs = 100,ind.highlight = ou)
#Eval on simulations
plot(locusPos, -log10(pval), col="grey", pch=20, main="pcadapt with LD pruning")
add_layers()
points(locusPos, -log10(pval), col="grey", pch=20)
points(locusPos[outliers], -log10(pval)[outliers], col="red", pch=20)
plot(locusPos, -log10(pval), col="grey", pch=20, xlim=c(0,40000), main="pcadapt with LD pruning")
add_layers()
points(locusPos, -log10(pval), col="grey", pch=20)
points(locusPos[outliers], -log10(pval)[outliers], col="red", pch=20)
plot(locusPos, -log10(pval), col="grey", pch=20, xlim=c(80000,120000), main="pcadapt with LD pruning")
add_layers()
points(locusPos, -log10(pval), col="grey", pch=20)
points(locusPos[outliers], -log10(pval)[outliers], col="red", pch=20)
Command line argument:
plink --vcf
plink --bfile sim1a --maf 0.05 --out sim1a.commonf --make-bed
plink --bfile sim1a.commonf --indep-pairwise 50 5 0.5 --out sim1a.ldpruned
plink --bfile sim1a.commonf --extract sim1a.ldpruned.pruned.in --out sim1a.ldpruned --make-bed
admixture sim1a.ldpruned.bed 3
First, convert VCF to BED file. When Plink converts VCF you lose the loci names and positions, so you have to add that.
Next filter for MAF.
Next LD filter every 50 snps window, compute R^2 between all SNPs and choose SNPs so maximum R^2 between them is 0.5, then move 5 SNPs to next window For more information: Plink LD pruning
Make a new bed file with the pruned loci.
Run admixture to get population IDs for hapFLK.
Next: input q matrix into R and assign individuals to population Add population ID to Fam file from Pink
qmat=read.table('sim1a.ldpruned.3.Q')
K=dim(qmat)[2]
pop=apply(qmat,1,which.max)
fam=read.table('sim1a.commonf.fam')
## assign individuals to "populations" based on the Q matrix
fam$V1=pop
write.table(fam,col.names=F,row.names=F,file='sim1a.commonf.fam',quote=F)
Then run HapFLK with fam, bed, and bin file, and ‘qpop’ is name of outfile for the run
hapflk --bfile sim1a.commonf -p qpop
hapf <- read.table("results/hapflk/sim1.hapflk", header=TRUE)
plot(hapf$pos, hapf$hapflk, col="grey", pch=20, main="hapFLK")
add_layers()
FLK tree
Admixture
FLK tree
FLK tree
flkLD <- read.table("results/hapflk/sim1a.pruned.flk", header=TRUE)
head(flkLD)
## rs chr pos pzero flk pvalue
## 1 6 1 338 0.9456250 1.598368960 0.4496956
## 2 7 1 364 0.9443287 0.012716378 0.9936620
## 3 8 1 400 0.9402262 0.002463121 0.9987692
## 4 11 1 515 0.7340671 0.816289316 0.6648827
## 5 12 1 528 0.7977183 0.748574173 0.6877794
## 6 19 1 1006 0.9317668 1.149843996 0.5627488
plot(flkLD$pos, flkLD$flk, col="grey", pch=20, main="FLK with pruning")
add_layers()
points(flkLD$pos, flkLD$flk, col="grey", pch=20, main="FLK with pruning")
plot(flkLD$pos, -log10(flkLD$pvalue), col="grey", pch=20, main="FLK with pruning")
add_layers()
points(flkLD$pos, -log10(flkLD$pvalue), col="grey", pch=20, main="FLK with pruning")
Underlying model is probably not good for landscape data, IBD. Depend through prior on locus-specific coefficient of seleciton, as well as genome-wide estimate of selection, which is sensitive to underlying model. Here, we see big values of lambda parameters, indicating departures from the underlying model. If neutral variation and IBD and complex structure, then you will also inflate the lambda parameter.
Input conversion not provided, but broadly same format as BayPass except first two lines that indicate #pops and #loci.
Command line argument:
./src/selestim -file /home/rvitalis/work/SSMPG-2017/data/sim1a.selestim -outputs sim1a-1/ -npilot 25 -lpilot 500 -burnin 50000 -length 100000 -thin 25 -threads 16
#
library(fields) # extrapolation spatiale
## Loading required package: spam
## Loading required package: grid
## Spam version 1.4-0 (2016-08-29) is loaded.
## Type 'help( Spam)' or 'demo( spam)' for a short introduction
## and overview of this package.
## Help for individual functions is also obtained by adding the
## suffix '.spam' to the function name, e.g. 'help( chol.spam)'.
##
## Attaching package: 'spam'
## The following objects are masked from 'package:base':
##
## backsolve, forwardsolve
## Loading required package: maps
library(maps) # fonds de carte
# sourcing the SelEstim R functions #####
source('results/selestim/selestim/R/SelEstim.R')
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
delta <- read.table('results/selestim/analyses/sim1a-1/summary_delta.out',header = TRUE)
sigma <- read.table('results/selestim/analyses/sim1a-1/summary_sigma.out',header = TRUE)
kappa <- read.table('results/selestim/analyses/sim1a-1/summary_kappa.out',header = TRUE)
head(delta)
## locus mean std KLD
## 1 1 34.88909 31.42724 0.042387
## 2 2 30.60093 25.77727 0.097974
## 3 3 32.11019 26.99234 0.084146
## 4 4 36.96424 26.44977 0.127024
## 5 5 41.49536 30.49501 0.097431
## 6 6 75.78097 29.61704 0.736941
head(sigma)
## locus deme mean std mean_all_1 std_all_1 mean_all_2 std_all_2
## 1 1 1 41.22727 60.96236 53.91734 71.06870 21.14763 30.77412
## 2 1 2 33.12967 51.13975 26.56959 36.51002 39.11276 60.90761
## 3 1 3 48.29376 71.73789 64.85277 82.27341 18.30601 28.49640
## 4 1 4 41.48338 64.64849 54.02337 74.96806 23.34451 39.22857
## 5 1 5 32.95772 48.45403 24.34895 32.43587 39.74236 57.13592
## 6 1 6 30.80820 45.28096 29.28071 39.92539 32.31746 49.96666
head(kappa)
## locus deme_1 deme_2 deme_3 deme_4 deme_5 deme_6 deme_7 deme_8
## 1 1 0.38725 0.52300 0.35575 0.40875 0.55925 0.50300 0.52200 0.52400
## 2 2 0.50150 0.50325 0.50550 0.51150 0.49600 0.50575 0.51200 0.49650
## 3 3 0.50450 0.50750 0.49125 0.51125 0.49200 0.51500 0.50775 0.51375
## 4 4 0.56200 0.55800 0.56200 0.56175 0.38975 0.61025 0.23150 0.48825
## 5 5 0.55525 0.49175 0.56825 0.51650 0.50250 0.55050 0.60300 0.50825
## 6 6 0.60200 0.66600 0.55550 0.59500 0.23650 0.74575 0.03375 0.49800
## deme_9 deme_10 deme_11 deme_12 deme_13 deme_14 deme_15 deme_16 deme_17
## 1 0.53875 0.53825 0.57525 0.50500 0.51550 0.53600 0.45925 0.50650 0.48275
## 2 0.51575 0.50375 0.50875 0.50875 0.50825 0.50175 0.50800 0.49750 0.51925
## 3 0.51125 0.49650 0.50875 0.51150 0.49325 0.50825 0.50525 0.49975 0.50700
## 4 0.53250 0.39275 0.50400 0.56450 0.37775 0.44075 0.49925 0.44400 0.53275
## 5 0.55275 0.53050 0.50900 0.35575 0.50500 0.52125 0.38150 0.51125 0.52925
## 6 0.50975 0.39350 0.53825 0.40700 0.34050 0.35575 0.56425 0.42925 0.58325
## deme_18 deme_19 deme_20 deme_21 deme_22 deme_23 deme_24 deme_25 deme_26
## 1 0.46475 0.51000 0.55050 0.43725 0.55100 0.51950 0.52075 0.53975 0.55425
## 2 0.49925 0.51000 0.51350 0.50300 0.52775 0.50325 0.50750 0.50550 0.49375
## 3 0.46450 0.49900 0.50025 0.50300 0.51650 0.50900 0.50725 0.51600 0.50475
## 4 0.54150 0.52450 0.48875 0.54525 0.52125 0.48825 0.46500 0.48400 0.52500
## 5 0.52975 0.51775 0.55200 0.52700 0.56475 0.51700 0.47650 0.46425 0.51725
## 6 0.61125 0.58075 0.31000 0.62225 0.59200 0.52650 0.17000 0.55025 0.58350
## deme_27 deme_28 deme_29 deme_30 deme_31 deme_32 deme_33 deme_34 deme_35
## 1 0.33500 0.5365 0.53300 0.49150 0.52775 0.49875 0.50675 0.52075 0.44225
## 2 0.50825 0.5135 0.50700 0.50200 0.49825 0.46325 0.49325 0.49875 0.50550
## 3 0.51250 0.4490 0.51050 0.50275 0.50925 0.50125 0.51900 0.50425 0.50225
## 4 0.55875 0.5205 0.36725 0.49750 0.54125 0.42325 0.52750 0.53575 0.55475
## 5 0.31600 0.4345 0.47825 0.51000 0.50825 0.42050 0.48775 0.51975 0.53550
## 6 0.71600 0.5055 0.04475 0.53475 0.60475 0.39975 0.54950 0.59775 0.67000
## deme_36 deme_37 deme_38 deme_39
## 1 0.52575 0.51000 0.52875 0.52800
## 2 0.50750 0.49175 0.49925 0.50225
## 3 0.49225 0.49500 0.50325 0.51325
## 4 0.47625 0.53600 0.47375 0.49750
## 5 0.42075 0.52825 0.55475 0.47425
## 6 0.38900 0.55075 0.53175 0.53750
########################################################
# plotting the locus-specific coefficient of selection ######
########################################################
plot.delta(file = 'results/selestim/analyses/sim1a-1/summary_delta.out')
plot(locusPos, delta$mean, pch=20, col="grey", main="Selestim")
add_layers()
points(locusPos, delta$mean, pch=20, col="grey")
plot(locusPos, delta$mean, pch=20, col="grey", xlim=c(0,40000), main="Selestim")
add_layers()
points(locusPos, delta$mean, pch=20, col="grey")
plot(locusPos, delta$mean, pch=20, col="grey", xlim=c(80000,120000), main="Selestim")
add_layers()
points(locusPos, delta$mean, pch=20, col="grey")
####################
# plotting the KLD ######
####################
plot.kld(file = 'results/selestim/analyses/sim1a-1/summary_delta.out',calibration_file = 'results/selestim/analyses/sim1a-1/calibration/summary_delta.out',limit = 0.01)
plot(locusPos, delta$KLD, pch=20, col="grey", main="Selestim")
add_layers()
points(locusPos, delta$KLD, pch=20, col="grey")
plot(locusPos, delta$KLD, pch=20, col="grey", xlim=c(0,40000), main="Selestim")
add_layers()
points(locusPos, delta$KLD, pch=20, col="grey")
plot(locusPos, delta$KLD, pch=20, col="grey", xlim=c(80000,120000), main="Selestim")
add_layers()
points(locusPos, delta$KLD, pch=20, col="grey")
########################
# Getting the top SNPs ######
########################
top.SNP.1 <- which(delta$KLD[1:1000] == max(delta$KLD[1:1000]))
top.SNP.2 <- which(delta$KLD == max(delta$KLD))
#############################################################################################
# Computing for each top SNP a composite variable that depend on both kappa_ij and sigma_ij ######
#############################################################################################
rslt.kappa.1 <- 2 * sigma$mean[sigma$locus == top.SNP.1] * (kappa[kappa$locus == top.SNP.1,-1] - 0.5)
rslt.kappa.2 <- 2 * sigma$mean[sigma$locus == top.SNP.2] * (kappa[kappa$locus == top.SNP.2,-1] - 0.5)
############################################
# Getting information from the populations ######
############################################
pos <- read.table('results/selestim/data/sim1a/outputIndAll_pop_sim1a.txt')
pop.x <- vector(length = 39)
pop.y <- vector(length = 39)
phn <- vector(length = 39)
for (i in 1:39) {
pop.x[i] <- mean(pos$x[which(pos$pop == i)])
pop.y[i] <- mean(pos$y[which(pos$pop == i)])
phn[i] <- mean(pos$phenotype1[which(pos$pop == i)])
}
################################################
# extrapolation map for the average phenotypes ######
################################################
grid = list(x = seq(0,1,0.01),y = seq(0,1,0.01)) # coordonnées du fond de carte
fit <- Krig(cbind(pop.x,pop.y),as.numeric(phn),theta = 100,m = 1)
## Warning:
## Grid searches over lambda (nugget and sill variances) with minima at the endpoints:
## (REML) Restricted maximum likelihood
## minimum at right endpoint lambda = 7.406845e-05 (eff. df= 37.05001
## )
# plot mean phenotype over space
out <- predictSurface(fit,grid = grid,extrap = TRUE)
## [1] 10201 2
plot.surface(out,type = "I",xlab = "Longitude",ylab = "Latitude",main = "Phenotype",cex.lab = 1.125,cex.main = 2)
points(cbind(pop.x,pop.y),pch = 4,cex = 0.75,col="red",lwd = 2.5)
###########################################################################################
# extrapolation map for the population-specific coefficients of selection at the top SNPs ######
###########################################################################################
grid = list(x = seq(0,1,0.01),y = seq(0,1,0.01)) # coordonnées du fond de carte
fit <- Krig(cbind(pop.x,pop.y),as.numeric(rslt.kappa.1),theta = 100,m = 1)
## Warning:
## Grid searches over lambda (nugget and sill variances) with minima at the endpoints:
## (REML) Restricted maximum likelihood
## minimum at right endpoint lambda = 7.406845e-05 (eff. df= 37.05001
## )
out <- predictSurface(fit,grid = grid,extrap = TRUE)
## [1] 10201 2
plot.surface(out,type = "I",xlab = "Longitude",ylab = "Latitude",main = expression(2~italic(sigma)[ij]~(italic(kappa)[ij]~-~0.5)~~(TOP~SNP~1)),cex.lab = 1.125,cex.main = 2)
points(cbind(pop.x,pop.y),pch = 4,cex = 0.75,col="red",lwd = 2.5)
grid = list(x = seq(0,1,0.01),y = seq(0,1,0.01)) # coordonnées du fond de carte
fit <- Krig(cbind(pop.x,pop.y),as.numeric(rslt.kappa.2),theta = 100,m = 1)
out <- predictSurface(fit,grid = grid,extrap = TRUE)
## [1] 10201 2
plot.surface(out,type = "I",xlab = "Longitude",ylab = "Latitude",main = expression(2~italic(sigma)[ij]~(italic(kappa)[ij]~-~0.5)~~(TOP~SNP~2)),cex.lab = 1.125,cex.main = 2)
points(cbind(pop.x,pop.y),pch = 4,cex = 0.75,col="red",lwd = 2.5)
This is really cool, because the effect of the derived allele from SNP 1 on the phentoype is negative, but the effect of the derived allele from SNP 2 on the phenotype is positive. The maps are capturing this information (although in the opposite direction I would have expected).
The only way to estimate the confounding factors at the same time as the phenotype-genotype relationship (for P-values) is to do the Genotype ~ Phenotype model. As long as your interestin in p-values this is OK, but not OK for polygenic risk scores or SNP effect on the phenotype. (In linear regression, the effect of y~x is not the same as the effect as x~y). It ma Wang et al. mathematical proof to show this is an optimal method to estimate P-values without confounding effects of population structure.
In a P~G model, you use the data to estimate the structure first and then the add the mixed effects to the model, to estimate the SNP effect on the phenotype.
Recommendation (up for discussion):
Use LFMM model to hone-in on loci underlying the phenotype, but then use phenotype~genotype to get effect size.
(lm coeff y~x) = cor(x,y)^2/(lm coeff x~y) Issue will applying to following output because (1) scaling and (2) not sure how to get R^2.
Olivier says this is the best of the following 3 approaches because it is an exact solution and not an approximate solution. Also proved it is optimal because global minimum of least square problem.
sim1 <- training
# extract scaled genotypes
scaled.genotype <- scale(as.matrix(t(sim1$G)))
#scaled.genotype <- as.matrix(t(sim1$G))
# extract scaled phenotypes
x <- scale(as.matrix(sim1$phenotype1))
# centering is important to remove mean
# x <- scale(as.matrix(sim1$phenotype1), center=TRUE, scale=FALSE)
# x <- as.matrix(sim1$phenotype1)
# to do mean and not SD. this might make it possible to get effect sizes
pc <- prcomp(scaled.genotype)
plot(pc$sdev[1:20]^2)
points(5,pc$sdev[5]^2, type = "h", lwd = 3, col = "blue")
# ridge regression
lfmm.ridge <- lfmm::lfmm_ridge(Y = scaled.genotype, X = x, K = 5, lambda = 1e-4)
#The lfmm.ridge object contains estimates for the latent variables and for the effect sizes. Here, the estimates are used for computing calibrated significance values and for testing associations between the response matrix Y and the explanatory variable x. It can be done as follows:
#Fun stuff to compare effects to SNP effects
#plot(muts_count$selCoef, lfmm.ridge$B[muts_count$index])
#snp_cor_phen <- apply(scaled.genotype[,muts_count$index],2, function(yo)(cor(yo, x)))
#new_est <- snp_cor_phen^2/lfmm.ridge$B[muts_count$index]
#plot(muts_count$selCoef, new_est, xlim=c(-2,2), ylim=c(-2,2))
#abline(1,1)
lfmm.test <- lfmm::lfmm_test(Y = scaled.genotype, X = x, lfmm = lfmm.ridge, calibrate = "gif")
p.values <- lfmm.test$calibrated.pvalue
lfmm.test$gif
## [1] 4.731696
hist(p.values, col = "lightgreen", main="LFMM ridge")
qval <- qvalue::qvalue(p.values)
plot(qval)
#The plot suggests that setting fdr.level = 0.025 warrant few false positives.
qval <- qvalue::qvalue(p.values, fdr.level = 0.025)
candidates <- which(qval$significant)
plot(locusPos, -log10(p.values), cex = .5, pch = 19, col = "grey", main="LFMM ridge")
add_layers()
points(locusPos, -log10(p.values), cex = .5, pch = 19, col = "grey")
points(locusPos[candidates], -log10(p.values[candidates]), cex = .5, pch = 19, col = "orange")
plot(locusPos, -log10(p.values), cex = .5, pch = 19, col = "grey", xlim=c(0,40000), main="LFMM ridge")
add_layers()
points(locusPos, -log10(p.values), cex = .5, pch = 19, col = "grey")
points(locusPos[candidates], -log10(p.values[candidates]), cex = .5, pch = 19, col = "orange")
plot(locusPos, -log10(p.values), cex = .5, pch = 19, col = "grey", xlim=c(80000,120000), main="LFMM ridge")
add_layers()
points(locusPos, -log10(p.values), cex = .5, pch = 19, col = "grey")
points(locusPos[candidates], -log10(p.values[candidates]), cex = .5, pch = 19, col = "orange")
### LD pruning was done in PCADAPT segment
head(which_pruned)
## [1] 1 2 3 5 8 9
genotype <- as.matrix(t(sim1$G))
## MAF
spectrum <- apply(genotype, 2, sum)
#We first display a site frequency spectrum.
plot(table(spectrum)[1:100])
#We decide to filter out loci with MAF lower than 50 and define a new matrix with less genotypes: genotype.filtered
# now remove LD and MAF
index.filtered <- unique(which(spectrum > 50), which_pruned)
genotype.filtered <- genotype[,index.filtered]
scaled.genotype <- scale(genotype.filtered)
# extract scaled phenotypes
x <- scale(as.matrix(sim1$phenotype1))
pc <- prcomp(scaled.genotype)
plot(pc$sdev[1:20]^2)
points(2,pc$sdev[2]^2, type = "h", lwd = 3, col = "blue")
# ridge regression
lfmm.ridge <- lfmm::lfmm_ridge(Y = scaled.genotype, X = x, K = 2, lambda = 1e-4)
#The lfmm.ridge object contains estimates for the latent variables and for the effect sizes. Here, the estimates are used for computing calibrated significance values and for testing associations between the response matrix Y and the explanatory variable x. It can be done as follows:
#Fun stuff to compare effects to SNP effects
#plot(muts_count$selCoef, lfmm.ridge$B[muts_count$index])
#snp_cor_phen <- apply(scaled.genotype[,muts_count$index],2, function(yo)(cor(yo, x)))
#new_est <- snp_cor_phen^2/lfmm.ridge$B[muts_count$index]
#plot(muts_count$selCoef, new_est, xlim=c(-2,2), ylim=c(-2,2))
#abline(1,1)
lfmm.test <- lfmm::lfmm_test(Y = scaled.genotype, X = x, lfmm = lfmm.ridge, calibrate = "gif")
p.values <- lfmm.test$calibrated.pvalue
lfmm.test$gif
## [1] 8.521286
hist(p.values, col = "lightgreen", main="LFMM ridge")
qval <- qvalue::qvalue(p.values)
plot(qval)
#The plot suggests that setting fdr.level = 0.025 warrant few false positives.
qval <- qvalue::qvalue(p.values, fdr.level = 0.025)
candidates <- which(qval$significant)
par(mfrow=c(1,1))
plot(locusPos[index.filtered], -log10(p.values), cex = .5, pch = 19, col = "grey", main="LFMM ridge")
add_layers()
points(locusPos[index.filtered], -log10(p.values), cex = .5, pch = 19, col = "grey")
points(locusPos[index.filtered][candidates], -log10(p.values[candidates]), cex = .5, pch = 19, col = "orange")
plot(locusPos[index.filtered], -log10(p.values), cex = .5, pch = 19, col = "grey", main="LFMM ridge", xli=c(0,40000))
add_layers()
points(locusPos[index.filtered], -log10(p.values), cex = .5, pch = 19, col = "grey")
points(locusPos[index.filtered][candidates], -log10(p.values[candidates]), cex = .5, pch = 19, col = "orange")
#LFMM parameters can alternatively be estimated by solving regularized least-squares mimimization, with lasso penalty as follows.
scaled.genotype <- scale(as.matrix(t(sim1$G)))
lfmm.lasso <- lfmm::lfmm_lasso(Y = scaled.genotype, X = x, K = 5, nozero.prop = 0.02)
## It = 1/100, err2 = 0.999000000011033
## It = 2/100, err2 = 0.987380312304857
## It = 3/100, err2 = 0.98738592511468
## === lambda = 0.215367927378323, no zero B proportion = 0.0175084175084175
## It = 1/100, err2 = 0.987386477109867
## It = 2/100, err2 = 0.987374213059095
## It = 3/100, err2 = 0.987373079604109
## === lambda = 0.205579122693436, no zero B proportion = 0.0198653198653199
## It = 1/100, err2 = 0.987372960154821
## It = 2/100, err2 = 0.987323431042971
## It = 3/100, err2 = 0.987318656299654
## === lambda = 0.196235234288913, no zero B proportion = 0.0234006734006734
#The lfmm.lasso object contains new estimates for the latent variables and for the effect sizes. Note that for lasso, we didn't set the value of a regularization parameter. Instead, we set the proportion of non-null effects (here 2 percent).
lfmm.test <- lfmm::lfmm_test(Y = scaled.genotype, X = x, lfmm = lfmm.lasso, calibrate = "gif")
p.values <- lfmm.test$calibrated.pvalue
lfmm.test$gif
## [1] 4.279243
hist(p.values, col = "lightblue")
qval <- qvalue::qvalue(p.values)
plot(qval)
qval <- qvalue::qvalue(p.values, fdr.level = 0.025)
candidates <- which(qval$significant)
plot(locusPos, -log10(p.values), cex = .5, pch = 19, col = "grey", main="LFMM lasso")
add_layers()
points(locusPos, -log10(p.values), cex = .5, pch = 19, col = "grey")
points(locusPos[candidates], -log10(p.values[candidates]), cex = .5, pch = 19, col = "orange")
plot(locusPos, -log10(p.values), cex = .5, pch = 19, col = "grey", xlim=c(0,40000), main="LFMM lasso")
add_layers()
points(locusPos, -log10(p.values), cex = .5, pch = 19, col = "grey")
points(locusPos[candidates], -log10(p.values[candidates]), cex = .5, pch = 19, col = "orange")
plot(locusPos, -log10(p.values), cex = .5, pch = 19, col = "grey", xlim=c(80000,120000), main="LFMM lasso")
add_layers()
points(locusPos, -log10(p.values), cex = .5, pch = 19, col = "grey")
points(locusPos[candidates], -log10(p.values[candidates]), cex = .5, pch = 19, col = "orange")
GWAS data analysis usually follows a number of preprocessing steps, including filtering the data (QC and MAF thesholding), LD pruning, imputation of missing genotypes. Here we show the effect of MAF filtering (LD pruning could be achieved by using the R package SNPRelate).
genotype <- as.matrix(t(sim1$G))
spectrum <- apply(genotype, 2, sum)
#We first display a site frequency spectrum.
plot(table(spectrum)[1:100])
#We decide to filter out loci with MAF lower than 50 (five percent) and define a new matrix with less genotypes: genotype.filtered
genotype.filtered <- genotype[,spectrum > 50]
index.filtered <- which(spectrum > 50)
dim(genotype.filtered)
## [1] 1000 2497
lfmm.lasso.filter <- lfmm::lfmm_lasso(Y = scale(genotype.filtered), X = x, K = 5, nozero.prop = 0.02)
## It = 1/100, err2 = 0.999000000000672
## It = 2/100, err2 = 0.961858901548095
## It = 3/100, err2 = 0.961900958340571
## It = 4/100, err2 = 0.961907396198033
## It = 5/100, err2 = 0.961908492116953
## === lambda = 0.237222773366769, no zero B proportion = 0.026832198638366
lfmm.test <- lfmm::lfmm_test(Y = scale(genotype.filtered), X = x, lfmm = lfmm.lasso.filter, calibrate = "gif")
p.values <- lfmm.test$calibrated.pvalue
lfmm.test$gif
## [1] 3.333462
hist(p.values, col = "lightblue")
qval <- qvalue::qvalue(p.values)
plot(qval)
qval <- qvalue::qvalue(p.values, fdr.level = 0.025)
candidates <- which(qval$significant)
plot(-log10(p.values), cex = .5, pch = 19, col = "grey")
plot(locusPos[index.filtered], -log10(p.values), cex = .5, pch = 19, col = "grey", main="LFMM lasso MAF > 0.05")
add_layers()
points(locusPos[index.filtered], -log10(p.values), cex = .5, pch = 19, col = "grey")
points(locusPos[index.filtered][candidates], -log10(p.values[candidates]), cex = .5, pch = 19, col = "orange")
plot(locusPos[index.filtered], -log10(p.values), cex = .5, pch = 19, col = "grey", xlim=c(0,40000), main="LFMM lasso MAF > 0.05")
add_layers()
points(locusPos[index.filtered], -log10(p.values), cex = .5, pch = 19, col = "grey")
points(locusPos[index.filtered][candidates], -log10(p.values[candidates]), cex = .5, pch = 19, col = "orange")
plot(locusPos[index.filtered], -log10(p.values), cex = .5, pch = 19, col = "grey", xlim=c(80000,120000), main="LFMM lasso MAF > 0.05")
add_layers()
points(locusPos[index.filtered], -log10(p.values), cex = .5, pch = 19, col = "grey")
points(locusPos[index.filtered][candidates], -log10(p.values[candidates]), cex = .5, pch = 19, col = "orange")
Filtering by MAF loses some of signals at the the rarer, small effect alleles, but also augments the signal at the low recombination with background selection.
LFMM parameters can be estimated by using a hierarchical Bayesian model with sparse priors, as implemented in LEA.
# See instructions on github, skipped this step
Conversion script from VCF to BayPass not provided (not robust to different kinds of VCF files). A robust conversion script would be welcome!
Code for running Sim1a and Sim2a:
i_baypass -npop 39 -gfile sim1a.genobaypass -efile sim1a.popscalecov -nthreads 6 -pilotlength 500 -burnin 2500 -outprefix anaS1_covis > anaS1_covis.log
i_baypass -npop 39 -gfile sim1a.genobaypass -efile sim1a.popscalecov -omegafile anaS1_covis_mat_omega.out -auxmodel -nthreads 6 -pilotlength 500 -burnin 2500 -outprefix anaS1_covaux > anaS1_covaux.log
i_baypass -npop 39 -gfile sim2a.genobaypass -efile sim2a.popscalecov -nthreads 6 -pilotlength 500 -burnin 2500 -outprefix anaS2_covis > anaS2_covis.log
i_baypass -npop 39 -gfile sim2a.genobaypass -efile sim2a.popscalecov -omegafile anaS2_covis_mat_omega.out -auxmodel -nthreads 6 -pilotlength 500 -burnin 2500 -outprefix anaS2_covaux > anaS2_covaux.log
i_baypass -npop 39 -gfile sim1a.genobaypass -efile sim1a.popscalecov -omegafile anaS1_covis_mat_omega.out -auxmodel -isingbeta 1. -nthreads 6 -pilotlength 500 -burnin 2500 -outprefix anaS1_covaux_isb1 > anaS1_covaux_isb1.log
i_baypass -npop 39 -gfile sim2a.genobaypass -efile sim2a.popscalecov -omegafile anaS2_covis_mat_omega.out -auxmodel -isingbeta 1. -nthreads 6 -pilotlength 500 -burnin 2500 -outprefix anaS2_covaux_isb1 > anaS2_covaux_isb1.log
source("results/baypass_rehh/standardize.xtx.R")
source("results/baypass_rehh/baypass_utils.R")
SIMU = 1 #1=training data set ; 2=validation data set
folder <- "results/baypass_rehh/"
det.snp=read.table(paste0(folder,"sim",SIMU,"a.detsnp"))
head(det.snp)
## V1 V2 V3 V4
## 1 1 21 A TRUE
## 2 1 109 A TRUE
## 3 1 133 A TRUE
## 4 1 231 A TRUE
## 5 1 290 A TRUE
## 6 1 338 A TRUE
covariate=read.table(paste0(folder,"sim",SIMU,"a.popscalecov"))
head(covariate)
## V1 V2 V3 V4 V5 V6
## 1 0.05803792 0.46902333 -0.9106860 -1.41580522 -1.437323 0.8860763
## 2 1.63474855 -1.39796336 -0.5892786 1.59239065 -1.391031 0.7589330
## 3 1.74271171 -0.05465763 0.4477096 0.02005608 -1.389433 0.7591777
## 4 1.35172812 1.27883394 0.1543175 -0.30974391 -1.616234 0.5872833
## V7 V8 V9 V10 V11 V12
## 1 -1.4625347 1.2708972 0.5263237 0.88056841 -1.450059 -1.4382660
## 2 -0.7209904 1.6059511 -0.5427793 -0.36543051 1.215242 0.8158230
## 3 -1.9235362 0.4245828 1.1625310 -0.87860559 -1.041913 -0.7187590
## 4 -1.6369566 -1.2436665 0.9497098 -0.08608794 -0.503407 -0.7334598
## V13 V14 V15 V16 V17 V18
## 1 -0.9763150 1.3888542 -0.01467199 -1.02980492 -0.1677179 -0.2729604
## 2 -1.4805418 1.0838990 -1.55839942 0.38008760 -0.8459146 0.7884319
## 3 -0.8257588 -0.1844147 0.37777999 0.07310678 0.9762889 1.3042229
## 4 -1.0179113 -0.8492701 1.77144900 -0.39486647 1.5375060 1.5794965
## V19 V20 V21 V22 V23 V24
## 1 1.06910319 0.3309479 0.6058540 1.571471 0.9850457 0.8397252
## 2 0.09973125 0.5376317 0.1571538 0.196349 -0.7720743 -1.2113596
## 3 -1.55401094 1.4208936 -0.6839335 -1.005647 0.3060744 1.1580900
## 4 -1.13117941 1.1852670 -0.2672595 -1.328703 0.6904243 0.6832743
## V25 V26 V27 V28 V29 V30
## 1 1.1910022 1.4922890 -0.6853850 -0.6895273 -1.3448089 -0.4329465
## 2 -1.5603223 -0.9456296 0.6134519 -0.1983667 0.2784985 -1.5406461
## 3 -0.5009385 -1.5350416 1.2754478 0.8769679 -1.4058968 -0.1361705
## 4 -0.9479869 -1.0785366 0.7475910 0.5086590 -1.4886872 0.2284050
## V31 V32 V33 V34 V35 V36
## 1 -0.5755136 0.8112454 1.0996562 1.5377877 -0.4195035 -1.01295004
## 2 -1.0196346 1.2317262 0.5392415 -0.2616589 0.1583276 -0.09274187
## 3 0.7211368 0.9314482 -0.7053409 -1.3714621 0.4654611 -0.77886988
## 4 0.7192420 0.2261967 -0.3507818 -1.2198046 0.6106576 -0.65963837
## V37 V38 V39
## 1 -0.91205250 -0.01911443 -0.3459619
## 2 1.34097044 0.19054307 1.2756312
## 3 0.17357996 0.98594818 1.0911735
## 4 0.04371451 0.99220997 1.0182154
# Omega and XtX estimation ####
om=as.matrix(read.table(paste0(folder,"anaS",SIMU,"_covis_mat_omega.out")))
xtx=read.table(paste0(folder,"anaS",SIMU,"_covis_summary_pi_xtx.out"),h=T)$M_XtX
head(xtx)
## [1] 38.43280 39.26080 39.75763 37.85158 38.53820 38.63391
xtx.std=standardize.xtx(xtx,npop=39)
#Bayes Factor (Importance Sampling estimate) for association with phenotype ####
res.bfis=read.table(paste0(folder,"anaS",SIMU,"_covis_summary_betai_reg.out"),h=T)
head(res.bfis)
## COVARIABLE MRK M_Pearson SD_Pearson BF.dB. Beta_is SD_Beta_is
## 1 1 1 -0.09913029 0.1319906 -12.17544 -0.00207851 0.00273331
## 2 1 2 0.04857973 0.1540008 -17.28124 0.00039619 0.00164867
## 3 1 3 -0.03988024 0.1467607 -17.30215 -0.00070579 0.00191350
## 4 1 4 -0.01157744 0.1290019 -17.18330 -0.00002905 0.00228726
## 5 1 5 -0.13067654 0.1302722 -15.25326 -0.00197577 0.00221203
## 6 1 6 -0.12179184 0.1152672 -12.70376 -0.00403038 0.00387398
## eBPis
## 1 0.34970009
## 2 0.09146511
## 3 0.14737244
## 4 0.00442397
## 5 0.42974365
## 6 0.52554020
bfis=res.bfis$BF.dB.[res.bfis$COVARIABLE==3]
#Bayes Factor (AUX model estimate) for association with phenotype ####
res.aux=read.table(paste0(folder,"anaS",SIMU,"_covaux_summary_betai.out"),h=T)
head(res.aux)
## COVARIABLE MRK M_Beta SD_Beta M_Delta BF.dB.
## 1 1 1 0 0 0 -13.05178
## 2 1 2 0 0 0 -13.05178
## 3 1 3 0 0 0 -13.05178
## 4 1 4 0 0 0 -13.05178
## 5 1 5 0 0 0 -13.05178
## 6 1 6 0 0 0 -13.05178
bfmc=res.aux$BF.dB.[res.aux$COVARIABLE==3]
#PIP (association with phenotype) obtained under the ising model (spatial dependency of SNP in the regression on phenotype) ####
res.ising=read.table(paste0(folder,"anaS",SIMU,"_covaux_isb1_summary_betai.out"),h=T)
head(res.ising)
## COVARIABLE MRK M_Beta SD_Beta M_Delta BF.dB.
## 1 1 1 0 0 0 -13.05178
## 2 1 2 0 0 0 -13.05178
## 3 1 3 0 0 0 -13.05178
## 4 1 4 0 0 0 -13.05178
## 5 1 5 0 0 0 -13.05178
## 6 1 6 0 0 0 -13.05178
pip.is=res.ising$M_Delta[res.ising$COVARIABLE==3]
plot(locusPos, xtx,col= "grey",pch=20, main="XtX BayPass")
add_layers()
points(locusPos, xtx,col= "grey", main="XtX", pch=20)
plot(locusPos, xtx,col= "grey",main="XtX", pch=20, xlim=c(0,40000))
add_layers()
points(locusPos, xtx,col= "grey", main="XtX", pch=20)
plot(locusPos, xtx,col= "grey",main="XtX", pch=20, xlim=c(80000,120000))
add_layers()
points(locusPos, xtx,col= "grey", main="XtX", pch=20)
plot(locusPos, bfis,col="grey",pch=20, main="BFis (pheno1) BayPass")
add_layers()
points(locusPos, bfis,col= "grey", main="BFis (pheno1)", pch=20)
plot(locusPos, bfis,col= "grey",main="BFis (pheno1)", pch=20, xlim=c(0,40000))
add_layers()
points(locusPos, bfis,col= "grey", main="BFis (pheno1)", pch=20)
plot(locusPos, bfis,col= "grey",main="BFis (pheno1)", pch=20, xlim=c(80000,120000))
add_layers()
points(locusPos, bfis,col= "grey", main="BFis (pheno1)", pch=20)
plot(locusPos, bfmc,col="grey",pch=20, main="BFmc (pheno1) BayPass")
add_layers()
points(locusPos, bfmc,col= "grey", main="BFmc (pheno1)", pch=20)
plot(locusPos, bfmc,col= "grey",main="BFmc (pheno1)", pch=20, xlim=c(0,40000))
add_layers()
points(locusPos, bfmc,col= "grey", main="BFmc (pheno1)", pch=20)
plot(locusPos, bfmc,col= "grey",main="BFmc (pheno1)", pch=20, xlim=c(80000,120000))
add_layers()
points(locusPos, bfmc,col= "grey", main="BFmc (pheno1)", pch=20)
plot(locusPos, sqrt(pip.is),col="grey",main="Ising Model PIP (pheno1) BayPass", pch=20)
add_layers()
points(locusPos, sqrt(pip.is),col= "grey", main="Ising Model PIP (pheno1)", pch=20)
plot(locusPos, sqrt(pip.is),col= "grey",main="Ising Model PIP (pheno1)", pch=20, xlim=c(0,40000))
add_layers()
points(locusPos, sqrt(pip.is),col= "grey", main="Ising Model PIP (pheno1)", pch=20)
plot(locusPos, sqrt(pip.is),col= "grey",main="Ising Model PIP (pheno1) BayPass", pch=20, xlim=c(80000,120000))
add_layers()
points(locusPos, sqrt(pip.is),col= "grey", main="Ising Model PIP (pheno1)", pch=20)
####
# Why BayPass Fails: ####
#### 1) Mean population phenotype/environment is highly correlated with major axis of structuring of genetic diversity
#SVD decomposistion of Omega (similar to principle components of the covariance matrix)
om.svd=svd(om)
#various correlation
tmp.data=as.data.frame(cbind(om.svd$u[,1:2],t(covariate[3:4,])))
colnames(tmp.data)=c("Omega PC1","Omega PC2","Pheno 1","Environment")
plot(tmp.data)
##Correlation between PC's and covariable is not that clear for the second simulation
#### 2) Continuous population: averaging phenotype within population lead to lose a lot of information
###A solution: use GCTA to exploit within pop variability ####
####
res.gcta=read.table(paste0(folder,"sim",SIMU,"a.mlma"),h=T)
lpval.gcta=-1*log10(res.gcta$p)
cex=rep(0.25,length(lpval.gcta))
cex[lpval.gcta>6]=1
head(det.snp)
## V1 V2 V3 V4
## 1 1 21 A TRUE
## 2 1 109 A TRUE
## 3 1 133 A TRUE
## 4 1 231 A TRUE
## 5 1 290 A TRUE
## 6 1 338 A TRUE
plot(locusPos, xtx,col="grey",pch=16,cex=cex, main = "XTX with GCTA outliers large dots")
add_layers()
points(locusPos, xtx,col="grey",pch=16,cex=cex)
points(locusPos[cex==1], xtx[cex==1],col="blue",pch=21)
plot(locusPos, lpval.gcta,col="grey",pch=16,cex=cex, main="GCTA log P")
add_layers()
points(locusPos, lpval.gcta,col="grey",pch=16,cex=cex)
points(locusPos[cex==1], lpval.gcta[cex==1],col="blue",pch=21)
plot(locusPos, lpval.gcta,col="grey",pch=16,cex=cex, xlim=c(0,40000), main="GCTA log P")
add_layers()
points(locusPos, lpval.gcta,col="grey",pch=16,cex=cex)
points(locusPos[cex==1], lpval.gcta[cex==1],col="blue",pch=21)
plot(locusPos, lpval.gcta,col="grey",pch=16,cex=cex, xlim=c(80000,120000), main="GCTA log P")
add_layers()
points(locusPos, lpval.gcta,col="grey",pch=16,cex=cex)
points(locusPos[cex==1], lpval.gcta[cex==1],col="blue",pch=21)
Likelihood is the score, alpha is inversely related to teh size of the region used to calculate the score.
$ SweeD -input ../../inputs/sim1a.vcf -grid 100 -name sim1a
out <- read.table("results/sweed/SweeD_Report_sim1a_edited.txt", header=TRUE)
head(out)
## Position Likelihood Alpha
## 1 21.0000 0.11374640 19.3926500
## 2 424.1212 0.07445243 9.6379690
## 3 827.2424 0.76083240 0.1096088
## 4 1230.3636 0.00000000 3.0000010
## 5 1633.4849 0.49518130 0.3392800
## 6 2036.6061 0.39430570 0.1856157
plot(out$Position, out$Likelihood, pch=20, col="grey", main="SweeD")
add_layers()
points(out$Position, out$Likelihood, pch=20, col="grey")
# use simulations to determine a good likelihood cutoff and demographic models the score is over 50 or 60
The iHS is a within-population measure of LD.
Conversion script from VCF to REHH not provided.
###################
##For each chromosome in turn:
## i) Read haplotype data
## ii) Compute iHH_A, iHH_D, iES_Tang and iES_Sabeti
###################
library(rehh)
## Loading required package: rehh.data
cnt=0
for(i in 1:6){
cnt=cnt+1
tmp.hapfile=paste0(folder,"sim",SIMU,"a.chr",i,".thap")
tmp.hap=data2haplohh(hap_file=tmp.hapfile, map_file=paste0(folder,"sim",SIMU,"a.rehhmap.inp"), chr.name=i,haplotype.in.columns=TRUE)
tmp.scan=scan_hh(tmp.hap,threads=4)
if(cnt==1){wgscan=tmp.scan}else{wgscan=rbind(wgscan,tmp.scan)}
}
## Warning in data2haplohh(hap_file = tmp.hapfile, map_file = paste0(folder, :
## Some SNPs map to the same position
## Map file seems OK: 976 SNPs declared for chromosome 1
## Haplotype are in columns with no header
## Discard Haplotype with less than 100 % of genotyped SNPs
## No haplotype discarded
## Discard SNPs genotyped on less than 100 % of haplotypes
## No SNP discarded
## Data consists of 2000 haplotypes and 976 SNPs
## Warning in data2haplohh(hap_file = tmp.hapfile, map_file = paste0(folder, :
## Some SNPs map to the same position
## Map file seems OK: 1029 SNPs declared for chromosome 2
## Haplotype are in columns with no header
## Discard Haplotype with less than 100 % of genotyped SNPs
## No haplotype discarded
## Discard SNPs genotyped on less than 100 % of haplotypes
## No SNP discarded
## Data consists of 2000 haplotypes and 1029 SNPs
## Warning in data2haplohh(hap_file = tmp.hapfile, map_file = paste0(folder, :
## Some SNPs map to the same position
## Map file seems OK: 986 SNPs declared for chromosome 3
## Haplotype are in columns with no header
## Discard Haplotype with less than 100 % of genotyped SNPs
## No haplotype discarded
## Discard SNPs genotyped on less than 100 % of haplotypes
## No SNP discarded
## Data consists of 2000 haplotypes and 986 SNPs
## Warning in data2haplohh(hap_file = tmp.hapfile, map_file = paste0(folder, :
## Some SNPs map to the same position
## Map file seems OK: 1033 SNPs declared for chromosome 4
## Haplotype are in columns with no header
## Discard Haplotype with less than 100 % of genotyped SNPs
## No haplotype discarded
## Discard SNPs genotyped on less than 100 % of haplotypes
## No SNP discarded
## Data consists of 2000 haplotypes and 1033 SNPs
## Warning in data2haplohh(hap_file = tmp.hapfile, map_file = paste0(folder, :
## Some SNPs map to the same position
## Map file seems OK: 961 SNPs declared for chromosome 5
## Haplotype are in columns with no header
## Discard Haplotype with less than 100 % of genotyped SNPs
## No haplotype discarded
## Discard SNPs genotyped on less than 100 % of haplotypes
## No SNP discarded
## Data consists of 2000 haplotypes and 961 SNPs
## Warning in data2haplohh(hap_file = tmp.hapfile, map_file = paste0(folder, :
## Some SNPs map to the same position
## Map file seems OK: 955 SNPs declared for chromosome 6
## Haplotype are in columns with no header
## Discard Haplotype with less than 100 % of genotyped SNPs
## No haplotype discarded
## Discard SNPs genotyped on less than 100 % of haplotypes
## No SNP discarded
## Data consists of 2000 haplotypes and 955 SNPs
dim(wgscan)
## [1] 5940 7
ihs=ihh2ihs(wgscan,minmaf=0.05,freqbin=0.05)
head(ihs$iHS,25)
## CHR POSITION iHS -log10(p-value)
## rs6 1 338 NA NA
## rs7 1 364 NA NA
## rs8 1 400 NA NA
## rs11 1 515 NA NA
## rs12 1 528 NA NA
## rs19 1 1006 NA NA
## rs24 1 1191 NA NA
## rs28 1 1365 NA NA
## rs31 1 1498 NA NA
## rs32 1 1513 NA NA
## rs33 1 1550 NA NA
## rs35 1 1600 NA NA
## rs36 1 1612 NA NA
## rs46 1 1925 NA NA
## rs47 1 1976 NA NA
## rs54 1 2392 NA NA
## rs55 1 2427 NA NA
## rs57 1 2449 NA NA
## rs60 1 2496 NA NA
## rs61 1 2511 NA NA
## rs62 1 2542 NA NA
## rs68 1 2879 NA NA
## rs69 1 2882 NA NA
## rs70 1 2909 NA NA
## rs72 1 2934 NA NA
tail(ihs$iHS)
## CHR POSITION iHS -log10(p-value)
## rs5919 6 239078 NA NA
## rs5926 6 239398 NA NA
## rs5927 6 239509 NA NA
## rs5928 6 239538 NA NA
## rs5932 6 239634 NA NA
## rs5934 6 239706 NA NA
ihs$frequency.class
## Number of SNPs mean of the log(iHHA/iHHD) ratio
## 0.05 - 0.1 32 1.09660855
## 0.1 - 0.15 32 0.79394704
## 0.15 - 0.2 51 0.73844344
## 0.2 - 0.25 42 0.49214859
## 0.25 - 0.3 35 0.15316435
## 0.3 - 0.35 52 0.15483401
## 0.35 - 0.4 44 0.05861124
## 0.4 - 0.45 63 0.01024730
## 0.45 - 0.5 62 0.02172340
## 0.5 - 0.55 57 -0.18316069
## 0.55 - 0.6 92 -0.23828980
## 0.6 - 0.65 94 -0.30992872
## 0.65 - 0.7 97 -0.34170415
## 0.7 - 0.75 109 -0.46591618
## 0.75 - 0.8 172 -0.73141440
## 0.8 - 0.85 175 -0.83584247
## 0.85 - 0.9 309 -1.11365588
## 0.9 - 0.95 421 -1.42897780
## sd of the log(iHHA/iHHD) ratio
## 0.05 - 0.1 0.5714631
## 0.1 - 0.15 0.4663963
## 0.15 - 0.2 0.5489434
## 0.2 - 0.25 0.4246825
## 0.25 - 0.3 0.3561133
## 0.3 - 0.35 0.3309306
## 0.35 - 0.4 0.4147639
## 0.4 - 0.45 0.4461021
## 0.45 - 0.5 0.3967031
## 0.5 - 0.55 0.3237119
## 0.55 - 0.6 0.4104828
## 0.6 - 0.65 0.3957538
## 0.65 - 0.7 0.3907953
## 0.7 - 0.75 0.4567719
## 0.75 - 0.8 0.4068721
## 0.8 - 0.85 0.4522183
## 0.85 - 0.9 0.5461895
## 0.9 - 0.95 0.4923380
#distribplot(ihs$iHS$iHS)
#ihsplot(ihs)
ihs$iHS[which(ihs$iHS[,4]>2),]
## CHR POSITION iHS -log10(p-value)
## rs185 1 8230 -3.182865 2.836166
## rs1449 2 58208 -2.771271 2.253071
## rs1457 2 58509 -2.827615 2.328863
## rs3566 4 141742 -3.412286 3.190976
## rs3568 4 141759 2.823829 2.323730
## rs3569 4 141759 2.823829 2.323730
## rs3870 4 153368 -2.661302 2.108803
## rs4166 5 166008 3.046233 2.635021
## rs5359 6 215729 3.214988 2.884557
## rs5617 6 226485 -2.787467 2.274726
## rs5713 6 230127 -3.254299 2.944345
plot(ihs$iHS$POSITION,ihs$iHS$`-log10(p-value)`, col="grey", pch=20, main="REHH iHS")
add_layers()
points(ihs$iHS$POSITION,ihs$iHS$`-log10(p-value)`, col="grey", pch=20)
plot(ihs$iHS$POSITION,ihs$iHS$iHS, col="grey", pch=20, main="REHH iHS")
add_layers()
points(ihs$iHS$POSITION,ihs$iHS$iHS, col="grey", pch=20)
###pop based on phenotype: ####
# pop2=individuals with pheno1>1 and pop3=individuals with pheno1<-1
cnt=0
for(i in 1:6){
cnt=cnt+1
tmp.hapfile=paste0(folder,"sim",SIMU,"a.chr",i,".pop2.thap")
tmp.hap=data2haplohh(hap_file=tmp.hapfile, map_file=paste0(folder,"sim",SIMU,"a.rehhmap.inp"), chr.name=i,haplotype.in.columns=TRUE)
tmp.scan.pop1=scan_hh(tmp.hap,threads=4)
tmp.hapfile=paste0(folder,"sim",SIMU,"a.chr",i,".pop3.thap")
tmp.hap=data2haplohh(hap_file=tmp.hapfile, map_file=paste0(folder,"sim",SIMU,"a.rehhmap.inp"), chr.name=i,haplotype.in.columns=TRUE)
tmp.scan.pop2=scan_hh(tmp.hap,threads=4)
if(cnt==1){
wgscan.pop1=tmp.scan.pop1
wgscan.pop2=tmp.scan.pop2
}else{
wgscan.pop1=rbind(wgscan.pop1,tmp.scan.pop1)
wgscan.pop2=rbind(wgscan.pop2,tmp.scan.pop2)
}
}
## Warning in data2haplohh(hap_file = tmp.hapfile, map_file = paste0(folder, :
## Some SNPs map to the same position
## Map file seems OK: 976 SNPs declared for chromosome 1
## Haplotype are in columns with no header
## Discard Haplotype with less than 100 % of genotyped SNPs
## No haplotype discarded
## Discard SNPs genotyped on less than 100 % of haplotypes
## No SNP discarded
## Data consists of 170 haplotypes and 976 SNPs
## Warning in data2haplohh(hap_file = tmp.hapfile, map_file = paste0(folder, :
## Some SNPs map to the same position
## Map file seems OK: 976 SNPs declared for chromosome 1
## Haplotype are in columns with no header
## Discard Haplotype with less than 100 % of genotyped SNPs
## No haplotype discarded
## Discard SNPs genotyped on less than 100 % of haplotypes
## No SNP discarded
## Data consists of 154 haplotypes and 976 SNPs
## Warning in data2haplohh(hap_file = tmp.hapfile, map_file = paste0(folder, :
## Some SNPs map to the same position
## Map file seems OK: 1029 SNPs declared for chromosome 2
## Haplotype are in columns with no header
## Discard Haplotype with less than 100 % of genotyped SNPs
## No haplotype discarded
## Discard SNPs genotyped on less than 100 % of haplotypes
## No SNP discarded
## Data consists of 170 haplotypes and 1029 SNPs
## Warning in data2haplohh(hap_file = tmp.hapfile, map_file = paste0(folder, :
## Some SNPs map to the same position
## Map file seems OK: 1029 SNPs declared for chromosome 2
## Haplotype are in columns with no header
## Discard Haplotype with less than 100 % of genotyped SNPs
## No haplotype discarded
## Discard SNPs genotyped on less than 100 % of haplotypes
## No SNP discarded
## Data consists of 154 haplotypes and 1029 SNPs
## Warning in data2haplohh(hap_file = tmp.hapfile, map_file = paste0(folder, :
## Some SNPs map to the same position
## Map file seems OK: 986 SNPs declared for chromosome 3
## Haplotype are in columns with no header
## Discard Haplotype with less than 100 % of genotyped SNPs
## No haplotype discarded
## Discard SNPs genotyped on less than 100 % of haplotypes
## No SNP discarded
## Data consists of 170 haplotypes and 986 SNPs
## Warning in data2haplohh(hap_file = tmp.hapfile, map_file = paste0(folder, :
## Some SNPs map to the same position
## Map file seems OK: 986 SNPs declared for chromosome 3
## Haplotype are in columns with no header
## Discard Haplotype with less than 100 % of genotyped SNPs
## No haplotype discarded
## Discard SNPs genotyped on less than 100 % of haplotypes
## No SNP discarded
## Data consists of 154 haplotypes and 986 SNPs
## Warning in data2haplohh(hap_file = tmp.hapfile, map_file = paste0(folder, :
## Some SNPs map to the same position
## Map file seems OK: 1033 SNPs declared for chromosome 4
## Haplotype are in columns with no header
## Discard Haplotype with less than 100 % of genotyped SNPs
## No haplotype discarded
## Discard SNPs genotyped on less than 100 % of haplotypes
## No SNP discarded
## Data consists of 170 haplotypes and 1033 SNPs
## Warning in data2haplohh(hap_file = tmp.hapfile, map_file = paste0(folder, :
## Some SNPs map to the same position
## Map file seems OK: 1033 SNPs declared for chromosome 4
## Haplotype are in columns with no header
## Discard Haplotype with less than 100 % of genotyped SNPs
## No haplotype discarded
## Discard SNPs genotyped on less than 100 % of haplotypes
## No SNP discarded
## Data consists of 154 haplotypes and 1033 SNPs
## Warning in data2haplohh(hap_file = tmp.hapfile, map_file = paste0(folder, :
## Some SNPs map to the same position
## Map file seems OK: 961 SNPs declared for chromosome 5
## Haplotype are in columns with no header
## Discard Haplotype with less than 100 % of genotyped SNPs
## No haplotype discarded
## Discard SNPs genotyped on less than 100 % of haplotypes
## No SNP discarded
## Data consists of 170 haplotypes and 961 SNPs
## Warning in data2haplohh(hap_file = tmp.hapfile, map_file = paste0(folder, :
## Some SNPs map to the same position
## Map file seems OK: 961 SNPs declared for chromosome 5
## Haplotype are in columns with no header
## Discard Haplotype with less than 100 % of genotyped SNPs
## No haplotype discarded
## Discard SNPs genotyped on less than 100 % of haplotypes
## No SNP discarded
## Data consists of 154 haplotypes and 961 SNPs
## Warning in data2haplohh(hap_file = tmp.hapfile, map_file = paste0(folder, :
## Some SNPs map to the same position
## Map file seems OK: 955 SNPs declared for chromosome 6
## Haplotype are in columns with no header
## Discard Haplotype with less than 100 % of genotyped SNPs
## No haplotype discarded
## Discard SNPs genotyped on less than 100 % of haplotypes
## No SNP discarded
## Data consists of 170 haplotypes and 955 SNPs
## Warning in data2haplohh(hap_file = tmp.hapfile, map_file = paste0(folder, :
## Some SNPs map to the same position
## Map file seems OK: 955 SNPs declared for chromosome 6
## Haplotype are in columns with no header
## Discard Haplotype with less than 100 % of genotyped SNPs
## No haplotype discarded
## Discard SNPs genotyped on less than 100 % of haplotypes
## No SNP discarded
## Data consists of 154 haplotypes and 955 SNPs
xpehh=ies2xpehh(wgscan.pop1,wgscan.pop2)
#xpehhplot(xpehh)
xpehh[which(xpehh[,4]>2.5),]
## CHR POSITION XPEHH -log10(p-value) [bilateral]
## rs924 1 38051 4.846620 5.901070
## rs926 1 38093 3.388071 3.152513
## rs1630 2 64920 -3.061946 2.657769
## rs2044 3 81415 -3.013151 2.587456
## rs3163 4 126453 3.064205 2.661047
## rs4259 5 170425 3.381116 3.141511
## rs4260 5 170440 3.292190 3.002568
## rs4261 5 170460 3.835330 3.901717
## rs4262 5 170493 3.505331 3.340997
## rs4264 5 170563 3.198220 2.859244
## rs5453 6 219803 3.096292 2.707839
## rs5456 6 219919 3.425592 3.212212
## rs5457 6 219933 3.705832 3.676340
## rs5458 6 219965 3.572687 3.451811
## rs5459 6 219997 3.560128 3.431007
## rs5462 6 220209 3.162945 2.806370
## rs5477 6 220630 3.796370 3.833184
## rs5478 6 220661 3.797793 3.835676
## rs5479 6 220694 3.726996 3.712701
## rs5603 6 225881 3.084691 2.690873
## rs5606 6 225958 3.001226 2.570417
## rs5617 6 226485 3.455438 3.260112
## rs5717 6 230275 3.326834 3.056314
## rs5720 6 230312 2.975288 2.533557
str(xpehh)
## 'data.frame': 5940 obs. of 4 variables:
## $ CHR : chr "1" "1" "1" "1" ...
## $ POSITION : num 21 109 133 231 290 338 364 400 424 513 ...
## $ XPEHH : num NA NA NA NA NA NA NA NA NA NA ...
## $ -log10(p-value) [bilateral]: num NA NA NA NA NA NA NA NA NA NA ...
plot(xpehh$POSITION,xpehh$XPEHH, col="grey", pch=20, main="REHH xpehh low vs. high pheno")
add_layers()
points(xpehh$POSITION,xpehh$XPEHH, col="grey", pch=20)
# positive score is
# negative score is
plot(xpehh$POSITION,xpehh$`-log10(p-value)`, col="grey", pch=20, main="REHH xpehh low vs. high pheno")
add_layers()
points(xpehh$POSITION,xpehh$`-log10(p-value)`, col="grey", pch=20)
plot(xpehh$POSITION,xpehh$`-log10(p-value)`, col="grey", pch=20, xlim=c(0,40000), main="REHH xpehh low vs. high pheno")
add_layers()
points(xpehh$POSITION,xpehh$`-log10(p-value)`, col="grey", pch=20)
plot(xpehh$POSITION,xpehh$`-log10(p-value)`, col="grey", pch=20, xlim=c(40000, 79500), main="REHH xpehh low vs. high pheno")
add_layers()
points(xpehh$POSITION,xpehh$`-log10(p-value)`, col="grey", pch=20)
plot(xpehh$POSITION,xpehh$`-log10(p-value)`, col="grey", pch=20, xlim=c(80000, 120000), main="REHH xpehh low vs. high pheno")
add_layers()
points(xpehh$POSITION,xpehh$`-log10(p-value)`, col="grey", pch=20)
plot(xpehh$POSITION,xpehh$`-log10(p-value)`, col="grey", pch=20, xlim=c(200000, 240000), main="REHH xpehh low vs. high pheno")
add_layers()
points(xpehh$POSITION,xpehh$`-log10(p-value)`, col="grey", pch=20)
plot(xpehh$POSITION,xpehh$`-log10(p-value)`, col="grey", pch=20, xlim=c(160000, 200000), main="REHH xpehh low vs. high pheno")
add_layers()
points(xpehh$POSITION,xpehh$`-log10(p-value)`, col="grey", pch=20)
Note: Should have used map position based on cM (e.g, recombination) instead of SNP index or map position for REHH and XPEHH. This would compress the signals at the low recombination regions.
OmegaPlus-M -threads 4 -input sim1a.vcf -grid 1000 -minwin 2000 -maxwin
200000 -name SIM1A -seed 3993 -noSeparator
om <- read.table("results/omegaplus/OmegaPlus_Report.SIM1A", header=TRUE)
head(om)
## POS SCORE
## 1 21.0000 0
## 2 60.9489 0
## 3 100.8979 0
## 4 140.8468 0
## 5 180.7958 0
## 6 220.7447 0
plot(om$POS, om$SCORE, main="Omega Plus")
add_layers()
RAiSD -I sim1a.vcf -n sim1A -s -t -f
-I input
-n name
-s separate files
-t don’t put delimiters
-f force overwrite
ras <- read.table("results/raisd/RAiSD_Report.sim1A.ALL", header=TRUE)
head(ras)
## POS Variability SFS LD Composite
## 1 267 0.0012320 0.2 0.40 9.857e-05
## 2 312 0.0010170 0.2 0.40 8.134e-05
## 3 330 0.0009892 0.1 0.40 3.957e-05
## 4 429 0.0009917 0.1 0.45 4.463e-05
## 5 488 0.0009942 0.1 0.32 3.182e-05
## 6 542 0.0010220 0.1 0.32 3.270e-05
plot(ras$POS, ras$Variability, pch=20, col="grey", main = "RAiSD")
# higher the variability score, the lower the SNP variation
add_layers()
points(ras$POS, ras$Variability, pch=20, col="grey")
plot(ras$POS, ras$SFS, pch=20, col="grey")
# this only counts high freq derived on left and right, don't pay attention
plot(ras$POS, ras$LD, pch=20, col="grey" , main = "RAiSD")
# low LD score means that there are no unique pattern on left or on right
# high LD score means that there are unique patterns on left and right side
add_layers()
points(ras$POS, ras$LD, pch=20, col="grey")
plot(ras$POS, ras$Composite, pch=20, col="grey", main = "RAiSD")
# low LD score means that there are no unique pattern on left or on right
# high LD score means that there are unique patterns on left and right side
add_layers()
points(ras$POS, ras$Composite, pch=20, col="grey")