We display 23 individual estimates of admixture (K= 1:6) by overlaying pie-charts on a geographic map of the White Mountain National Forest.

r = getOption("repos")
r["CRAN"] = "http://cran.us.r-project.org"
options(repos = r)
library(vcfR)
library(adegenet)
library(adegraphics)
library(pegas)
library(StAMPP)
library(lattice)
library(gplots)
library(ape)
library(ggmap) 
library(LEA)
library(maps)
library(mapdata)
library(maptools)
library(mapproj)
library(mapplots)
library(dismo)
library(plotrix)
library(dplyr)
library(rgdal)
library(ggplot2)
library(raster)
library(TESS)
library(ggforce)
install.packages('scatterpie')
## 
## The downloaded binary packages are in
##  /var/folders/d3/4f6lh5bn3k3_2xw9p_jz7xgh0000gn/T//RtmpDZB4Zk/downloaded_packages
library(scatterpie)
install.packages("LEA_1.4.0_tar.gz", repos = NULL, type ="source")

Running code for sNMF using our revised VCF

We had to drop UMA4_1 from the vcf because it was a duplicate of UMA21_1. This was accomplished using vcftools. The code for downloading and executing this via the UNIX command line can be found on my github. This step cannot be accomplished in R.

#Read in the new revised vcf and proceed
vcf<- read.vcfR("/Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf", verbose = FALSE)
head(vcf)
## [1] "***** Object of class 'vcfR' *****"
## [1] "***** Meta section *****"
## [1] "##fileformat=VCFv4.2"
## [1] "##fileDate=2018-02-13T00:07:39"
## [1] "##source=VCF_popgen.pl"
## [1] "##reference=file:UMA13_denovo_clusters_relaxed.fasta"
## [1] "##contig=N/A"
## [1] "##phasing=none"
## [1] "First 6 rows."
## [1] 
## [1] "***** Fixed section *****"
##      CHROM              POS  ID REF ALT QUAL    FILTER
## [1,] "RAD_kmer_0000017" "89" NA "T" "C" "3139"  "PASS"
## [2,] "RAD_kmer_0000025" "27" NA "C" "T" "11314" "PASS"
## [3,] "RAD_kmer_0000025" "53" NA "C" "G" "1218"  "PASS"
## [4,] "RAD_kmer_0000025" "59" NA "G" "A" "3244"  "PASS"
## [5,] "RAD_kmer_0000025" "83" NA "C" "T" "5395"  "PASS"
## [6,] "RAD_kmer_0000026" "14" NA "T" "G" "3430"  "PASS"
## [1] 
## [1] "***** Genotype section *****"
##      FORMAT        UMA2_1              UMA35_1             
## [1,] "GT:DP:GQ:AD" "0/0:346:99:346,0"  "0/1:242:99:89,153" 
## [2,] "GT:DP:GQ:AD" "0/1:158:99:100,58" "0/1:235:99:123,112"
## [3,] "GT:DP:GQ:AD" "0/0:160:99:160,0"  "0/0:237:99:236,0"  
## [4,] "GT:DP:GQ:AD" "0/0:159:99:159,0"  "0/0:237:99:237,0"  
## [5,] "GT:DP:GQ:AD" "0/0:157:99:156,0"  "0/0:237:99:237,0"  
## [6,] "GT:DP:GQ:AD" "1/1:114:57:5,109"  "0/0:57:86:57,0"    
##      RS32_1             UMA20_1             UMA39_1           
## [1,] "0/0:208:99:207,1" "0/0:208:99:208,0"  "0/0:175:99:174,0"
## [2,] "0/0:92:99:92,0"   "0/0:160:99:160,0"  "0/1:194:99:97,97"
## [3,] "0/0:92:99:92,0"   "0/0:158:99:158,0"  "0/0:195:99:192,0"
## [4,] "0/0:91:99:91,0"   "0/0:158:99:158,0"  "0/0:195:99:194,1"
## [5,] "0/0:92:99:92,0"   "0/1:157:99:47,110" "0/0:195:99:195,0"
## [6,] "1/1:50:75:0,50"   "1/1:106:99:0,106"  "1/1:38:57:0,38"  
## [1] "First 6 columns only."
## [1] 
## [1] "Unique GT formats:"
## [1] "GT:DP:GQ:AD"
## [1]
dp<- extract.gt(vcf, element = 'DP', as.numeric=TRUE) 
# UMA4_1 has been successfully removed!

# identify your input file (use the new revised vcf)
input.file <- "/Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf"

# vcf2geno function. The output.file should be same as the input file ending in .geno. force = TRUE if you want to overwrite an existing file under that name. 
vcf2geno(input.file, output.file = "/Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.geno", force = TRUE)
## 
##  - number of detected individuals:   23
##  - number of detected loci:      73055
## 
## For SNP info, please check /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.vcfsnp.
## 
## 1371 line(s) were removed because these are not SNPs.
## Please, check /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.removed file, for more informations.
## [1] "/Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.geno"

Data for scatterpie K=2

# get the cross-entropy value for each run at K = ?
ce <- cross.entropy(obj.snmf, K= 2) #we only did 1 run here for expediency. More realistically we would perform many runs and select the best one

# select the run with the lowest cross-entropy value for K = ?
best <- which.min(ce) # the best run is #1 at k=2

###at the end of the run, the qmatrix object contains the matrix of ancestry coefficients for each individual and for K = ? clusters. The Q-matrix has 23 rows and 3 columns, and it is traditionally displayed using a barplot representation. For this representation, we just use the barplot function of R.

qmatrixK2 = Q(obj.snmf, K = 2, run = best)

barplot(t(qmatrixK2),col=c("red","orange", "yellow", "green", "blue", "violet"),border = NA, space = .2,xlab = "Individuals", ylab = "Admixture coefficients", main = "Ancestry matrix", horiz = FALSE, names.arg = c("UMA2","UMA35","RS32","UMA20","UMA39","UMA19","RS104","UMA13","UMA5", "UMA3", "UMA26", "RS33","UMA23", "UMA40", "UMA8", "UMA6", "UMA9", "UMA38", "UMA17", "UMA41", "UMA37", "UMA21", "UMA42"),cex.names=0.65, las = 2)

Plotting admixture coeffients for K= 2

Data for scatterpie K=3

# get the cross-entropy value for each run at K = ?
ce <- cross.entropy(obj.snmf, K= 3) #we only did 1 run here for expediency. More realistically we would perform many runs and select the best one

# select the run with the lowest cross-entropy value for K = ?
best <- which.min(ce) # the best run is #1 at k=2

###at the end of the run, the qmatrix object contains the matrix of ancestry coefficients for each individual and for K = ? clusters. The Q-matrix has 23 rows and 3 columns, and it is traditionally displayed using a barplot representation. For this representation, we just use the barplot function of R.

qmatrixK3 = Q(obj.snmf, K = 3, run = best)

barplot(t(qmatrixK3),col=c("red","orange", "yellow", "green", "blue", "violet"),border = NA, space = .2,xlab = "Individuals", ylab = "Admixture coefficients", main = "Ancestry matrix", horiz = FALSE, names.arg = c("UMA2","UMA35","RS32","UMA20","UMA39","UMA19","RS104","UMA13","UMA5", "UMA3", "UMA26", "RS33","UMA23", "UMA40", "UMA8", "UMA6", "UMA9", "UMA38", "UMA17", "UMA41", "UMA37", "UMA21", "UMA42"),cex.names=0.65, las = 2)

Plotting admixture coeffients for K=3

Data for scatterpie K=4

# get the cross-entropy value for each run at K = ?
ce <- cross.entropy(obj.snmf, K= 4) #we only did 1 run here for expediency. More realistically we would perform many runs and select the best one

# select the run with the lowest cross-entropy value for K = ?
best <- which.min(ce) # the best run is #1 at k=2

###at the end of the run, the qmatrix object contains the matrix of ancestry coefficients for each individual and for K = ? clusters. The Q-matrix has 23 rows and 3 columns, and it is traditionally displayed using a barplot representation. For this representation, we just use the barplot function of R.

qmatrixK4 = Q(obj.snmf, K = 4, run = best)

barplot(t(qmatrixK4),col=c("red","orange", "yellow", "green", "blue", "violet"),border = NA, space = .2,xlab = "Individuals", ylab = "Admixture coefficients", main = "Ancestry matrix", horiz = FALSE, names.arg = c("UMA2","UMA35","RS32","UMA20","UMA39","UMA19","RS104","UMA13","UMA5", "UMA3", "UMA26", "RS33","UMA23", "UMA40", "UMA8", "UMA6", "UMA9", "UMA38", "UMA17", "UMA41", "UMA37", "UMA21", "UMA42"),cex.names=0.65, las = 2)

Plotting admixture coeffients for K=4

Data for scatterpie K=5

# get the cross-entropy value for each run at K = ?
ce <- cross.entropy(obj.snmf, K= 5) #we only did 1 run here for expediency. More realistically we would perform many runs and select the best one

# select the run with the lowest cross-entropy value for K = ?
best <- which.min(ce) # the best run is #1 at k=2

# at the end of the run, the qmatrix object contains the matrix of ancestry coefficients for each individual and for K = ? clusters. The Q-matrix has 23 rows and 3 columns, and it is traditionally displayed using a barplot representation. For this representation, we just use the barplot function of R.

qmatrixK5 = Q(obj.snmf, K = 5, run = best)

barplot(t(qmatrixK5),col=c("red","orange", "yellow", "green", "blue", "violet"),border = NA, space = .2,xlab = "Individuals", ylab = "Admixture coefficients", main = "Ancestry matrix", horiz = FALSE, names.arg = c("UMA2","UMA35","RS32","UMA20","UMA39","UMA19","RS104","UMA13","UMA5", "UMA3", "UMA26", "RS33","UMA23", "UMA40", "UMA8", "UMA6", "UMA9", "UMA38", "UMA17", "UMA41", "UMA37", "UMA21", "UMA42"),cex.names=0.65, las = 2)

Plotting admixture coeffients for K=5

Data for scatterpie K=6

# get the cross-entropy value for each run at K = ?
ce <- cross.entropy(obj.snmf, K= 6) #we only did 1 run here for expediency. More realistically we would perform many runs and select the best one

# select the run with the lowest cross-entropy value for K = ?
best <- which.min(ce) # the best run is #1 at k=2

###at the end of the run, the qmatrix object contains the matrix of ancestry coefficients for each individual and for K = ? clusters. The Q-matrix has 23 rows and 3 columns, and it is traditionally displayed using a barplot representation. For this representation, we just use the barplot function of R.

qmatrixK6 = Q(obj.snmf, K = 6, run = best)

barplot(t(qmatrixK6),col=c("red","orange", "yellow", "green", "blue", "violet"),border = NA, space = .2,xlab = "Individuals", ylab = "Admixture coefficients", main = "Ancestry matrix", horiz = FALSE, names.arg = c("UMA2","UMA35","RS32","UMA20","UMA39","UMA19","RS104","UMA13","UMA5", "UMA3", "UMA26", "RS33","UMA23", "UMA40", "UMA8", "UMA6", "UMA9", "UMA38", "UMA17", "UMA41", "UMA37", "UMA21", "UMA42"),cex.names=0.65, las = 2)

Plotting admixture coeffients for K=6