knitr::opts_chunk$set(warning = FALSE, message = FALSE)
library(vcfR)
library(adegenet)
library(adegraphics)
library(pegas)
library(StAMPP)
library(lattice)
library(gplots)
library(ape)
library(ggmap)
library(LEA)
#The new packages we'll be using for these analyses (mainly gdsfmt and SNPRelate) need to be installed from Bioconductor Lite
#source("https://bioconductor.org/biocLite.R", echo=FALSE, verbose = FALSE)
#biocLite()
#biocLite(c("GenomicFeatures", "AnnotationDbi"))
#biocLite("SNPRelate")
library("devtools")
library(gdsfmt)
library(SNPRelate)
library(calibrate)
Preparing data
#gdsfmt package provides the genome data structure (GDS) file format for array-oriented bioinformatic data, which is a container for annotation and SNP genotypes.
#Clear cache if necessary using
#closeAllConnections()
#snpgdsClose(genofile) #in case you need to close or re-write a gds file
#closefn.gds(test1.gds)
#Load up our vcf file (SNP data for 24 red squirrels)
vcf.fn <- "/Users/tanyalama/TMorelli_UMassachusetts_RedSquirrel_20170525-01609/VCF_files/UMA13_aligned_genotypes_stringent.vcf"
#Reformat VCF --> SNP GDS (genomic data structure) Format
snpgdsVCF2GDS(vcf.fn, "test1.gds", method="biallelic.only")
## VCF Format ==> SNP GDS Format
## Method: exacting biallelic SNPs
## Number of samples: 24
## Parsing "/Users/tanyalama/TMorelli_UMassachusetts_RedSquirrel_20170525-01609/VCF_files/UMA13_aligned_genotypes_stringent.vcf" ...
## import 73055 variants.
## + genotype { Bit2 24x73055, 428.1K } *
## Optimize the access efficiency ...
## Clean up the fragments of GDS file:
## open the file 'test1.gds' (890.7K)
## # of fragments: 53
## save to 'test1.gds.tmp'
## rename 'test1.gds.tmp' (890.3K, reduced: 396B)
## # of fragments: 20
#Summary
snpgdsSummary("test1.gds")
## The file name: /Users/tanyalama/Box Sync/Squirrel/test1.gds
## The total number of samples: 24
## The total number of SNPs: 73055
## SNP genotypes are stored in SNP-major mode (Sample X SNP).
genofile <- snpgdsOpen("/Users/tanyalama/Box Sync/Squirrel/test1.gds", readonly= FALSE)
#Add population information to your gds genofile
add.gdsn(genofile, "pop_code", c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)) #all identified to 1 population for starters
#get sample id information
sample.id <- read.gdsn(index.gdsn(genofile, "sample.id"))
Data Analysis
#gdsfmt and SNPRelate accelerate two key computations in GWAS (genome wide association studies): principal component analysis (PCA) and relatedness analysis using identity-by-descent (IBD) measures.
LD-based SNP pruning
#It is suggested to use a pruned set of SNPs which are in approximate linkage equilibrium with each other to avoid the strong influence of SNP clusters in principal component analysis and relatedness analysis.
set.seed(1) #should be 1000, reduced for rmarkdown
#Try different LD thresholds for sensitivity analysis. #Need to include autosome.only = FALSE clause to run.
snpset <- snpgdsLDpruning(genofile, autosome.only = FALSE,remove.monosnp = TRUE, ld.threshold=0.1, verbose = FALSE) #38,384/72,744 markers are selected in total with .1 ld.
# Create an object to hold our ~38k selected SNPs
snpset.id <- unlist(snpset)
Principal Component Analysis (PCA)
#The functions in SNPRelate for PCA include calculating the genetic covariance matrix from genotypes, computing the correlation coefficients between sample loadings and genotypes for each SNP, calculating SNP eigenvectors (loadings), and estimating the sample loadings of a new dataset from specified SNP eigenvectors.
# Run PCA using the ldpruned set of 38k SNPs (snpset.id) instead of the full set.
#get sample id information
genofile
## File: /Users/tanyalama/Box Sync/Squirrel/test1.gds (890.4K)
## + [ ] *
## |--+ sample.id { Str8 24 ZIP_ra(53.0%), 104B }
## |--+ snp.id { Int32 73055 ZIP_ra(34.6%), 98.7K }
## |--+ snp.rs.id { Str8 73055 ZIP_ra(0.14%), 111B }
## |--+ snp.position { Int32 73055 ZIP_ra(27.5%), 78.5K }
## |--+ snp.chromosome { Str8 73055 ZIP_ra(5.30%), 64.2K }
## |--+ snp.allele { Str8 73055 ZIP_ra(14.2%), 40.6K }
## |--+ genotype { Bit2 24x73055, 428.1K } *
## |--+ snp.annot [ ]
## | |--+ qual { Float32 73055 ZIP_ra(62.1%), 177.2K }
## | \--+ filter { Str8 73055 ZIP_ra(0.23%), 831B }
## \--+ pop_code { Float64 24, 192B }
sample.id <- read.gdsn(index.gdsn(genofile, "sample.id")) #run
pca <- snpgdsPCA(genofile, sample.id = NULL, snp.id=snpset.id, autosome.only = FALSE, num.thread=2, verbose= TRUE)
## Principal Component Analysis (PCA) on genotypes:
## Excluding 0 SNP (monomorphic: TRUE, MAF: NaN, missing rate: NaN)
## Working space: 24 samples, 38,330 SNPs
## using 2 (CPU) cores
## PCA: the sum of all selected genotypes (0,1,2) = 1571094
## CPU capabilities: Double-Precision SSE2
## Tue May 22 09:28:18 2018 (internal increment: 20308)
##
[..................................................] 0%, ETC: ---
[==================================================] 100%, completed in 0s
## Tue May 22 09:28:18 2018 Begin (eigenvalues and eigenvectors)
## Tue May 22 09:28:18 2018 Done.
#The following code shows how to calculate the percent of variation that is accounted for by the top principal components. It is clear to see the first two eigenvectors hold the largest percentage of variance among the population, although the total variance accounted for is still less the one-quarter of the total.
# variance proportion (%)
pc.percent <- pca$varprop*100
head(round(pc.percent, 2))
## [1] 6.19 5.88 5.37 5.19 5.17 5.01
#get population information
pop_code<- read.gdsn(index.gdsn(genofile, "pop_code"))
head(cbind(sample.id, pop_code))
## sample.id pop_code
## [1,] "UMA2_1" "1"
## [2,] "UMA35_1" "1"
## [3,] "RS32_1" "1"
## [4,] "UMA20_1" "1"
## [5,] "UMA39_1" "1"
## [6,] "UMA19_1" "1"
# make a data.frame
tab <- data.frame(sample.id = pca$sample.id,
pop = factor(pop_code)[match(pca$sample.id,sample.id)],
EV1 = pca$eigenvect[,1], # the first eigenvector
EV2 = pca$eigenvect[,2], # the second eigenvector
stringsAsFactors = FALSE)
tab #view the dataframe
## sample.id pop EV1 EV2
## 1 UMA2_1 1 -0.08130485 -0.085313106
## 2 UMA35_1 1 -0.07604746 -0.060878776
## 3 RS32_1 1 -0.04712566 -0.044342310
## 4 UMA20_1 1 -0.04969118 -0.074992965
## 5 UMA39_1 1 -0.08435959 -0.091507024
## 6 UMA19_1 1 -0.07667316 -0.088175748
## 7 RS104_1 1 -0.05436017 -0.160406748
## 8 UMA13_1 1 -0.01958050 -0.041402255
## 9 UMA5_1 1 -0.04561859 -0.081156379
## 10 UMA3_1 1 0.50935746 0.056932332
## 11 UMA26_1 1 -0.06098301 -0.042492740
## 12 RS33_1 1 -0.06825055 -0.107447959
## 13 UMA23_1 1 -0.02993273 -0.098066003
## 14 UMA40_1 1 -0.15829487 0.649175886
## 15 UMA8_1 1 -0.08938251 -0.078794825
## 16 UMA6_1 1 -0.06217682 -0.043349772
## 17 UMA9_1 1 -0.09063064 -0.001605688
## 18 UMA38_1 1 -0.15935724 0.650866654
## 19 UMA17_1 1 -0.10960971 -0.143523501
## 20 UMA41_1 1 -0.07838629 -0.056645251
## 21 UMA4_1 1 0.54485216 0.067936610
## 22 UMA37_1 1 -0.07459125 -0.085325566
## 23 UMA21_1 1 0.54562556 0.068026659
## 24 UMA42_1 1 -0.08347838 -0.107511527
plot(tab$EV2, tab$EV1, xlab="eigenvector 2", ylab="eigenvector 1") #three distinct groups
Relatedness Analysis: Estimating IBD Using PLINK method of moments (MoM)
#For relatedness analysis, identity-by-descent (IBD) estimation in SNPRelate can be done by either the method of moments (MoM) (Purcell et al., 2007) or maximum likelihood estimation (MLE) (Milligan, 2003; Choi et al., 2009). For both of these methods it is preffered to use a LD pruned SNP set.
sample.id <- read.gdsn(index.gdsn(genofile, "sample.id"))
# Estimate IBD coefficients
ibd <- snpgdsIBDMoM(genofile, sample.id, snp.id=snpset.id, autosome.only = FALSE, remove.monosnp= TRUE, kinship = TRUE, maf=0.05, missing.rate=0.05, num.thread=2, verbose= TRUE)
## IBD analysis (PLINK method of moment) on genotypes:
## Excluding 22,916 SNPs (monomorphic: TRUE, MAF: 0.05, missing rate: 0.05)
## Working space: 24 samples, 15,414 SNPs
## using 2 (CPU) cores
## PLINK IBD: the sum of all selected genotypes (0,1,2) = 544130
## Tue May 22 09:28:19 2018 (internal increment: 65536)
##
[..................................................] 0%, ETC: ---
[==================================================] 100%, completed in 0s
## Tue May 22 09:28:19 2018 Done.
#maf (minor allele frequency) threshold is the frequency at which the second most common allele occurs in a given population.
# Make a dataframe
ibd.coeff <- snpgdsIBDSelection(ibd)
head(ibd.coeff)
## ID1 ID2 k0 k1 kinship
## 1 UMA2_1 UMA35_1 1 0 0
## 2 UMA2_1 RS32_1 1 0 0
## 3 UMA2_1 UMA20_1 1 0 0
## 4 UMA2_1 UMA39_1 1 0 0
## 5 UMA2_1 UMA19_1 1 0 0
## 6 UMA2_1 RS104_1 1 0 0
Estimating IBD: Maximum Likelihood Estimation
#Identical by descent (IBD) describes a matching segment of DNA shared by two or more individuals that has been inherited from a common ancestor without any intervening recombination. Alleles that are IBD have been interested by a recent common ancestor (and indicate recent gene flow).
# Estimate IBD coefficients
set.seed(1) #should be 1000
ibd <- snpgdsIBDMLE(genofile, sample.id = NULL, snp.id=snpset.id,autosome.only = FALSE, remove.monosnp= TRUE, maf=0.05, missing.rate=0.05, kinship = TRUE, num.thread=2, verbose = TRUE)
## Identity-By-Descent analysis (MLE) on genotypes:
## Excluding 22,916 SNPs (monomorphic: TRUE, MAF: 0.05, missing rate: 0.05)
## Working space: 24 samples, 15,414 SNPs
## using 2 (CPU) cores
## MLE IBD: the sum of all selected genotypes (0,1,2) = 544130
## MLE IBD: Tue May 22 09:28:19 2018 0%
# Make a dataframe
ibd.coeff <- snpgdsIBDSelection(ibd)