library(vcfR)
library(adegenet)
library(adegraphics)
library(pegas)
library(StAMPP)
library(lattice)
library(gplots)
library(ape)
library(ggmap) 
library(LEA)
library(ggplot2)
#install.packages(c("fields","RColorBrewer","mapplots"))
#source("http://bioconductor.org/biocLite.R")
#biocLite("LEA")

#vcf<- read.vcfR("/Users/tanyalama/TMorelli_UMassachusetts_RedSquirrel_20170525-01609/VCF_files/UMA13_aligned_genotypes_stringent.vcf", verbose = FALSE)
#head(vcf)

#Then plot important statistics summed over entire VCF
#chrom <- create.chromR(name='RAD_data', vcf=vcf) 
#plot(chrom) #plot the data

#Then extract the allele depths per each sample (DP field of VCF) and plot distribution of allele depths of all sites per each sample. NB: You may inspect and visualize other fields of VCF, e.g. allele depth (AD) or genotype quality (GQ)
# read depth distribution per individual 
#dp<- extract.gt(vcf, element = 'DP', as.numeric=TRUE)
#pdf("DP_RAD_data.pdf", width = 10, height=3) # boxplot 

New VCF revised

We had to drop UMA4_1 from the vcf because it was a duplicate of UMA21_1. This was accomplished using vcftools. The code for downloading and executing this via the UNIX command line can be found on my github. This step cannot be accomplished in R.

#Read in the new revised vcf and proceed
vcf<- read.vcfR("/Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf", verbose = FALSE)
head(vcf)
## [1] "***** Object of class 'vcfR' *****"
## [1] "***** Meta section *****"
## [1] "##fileformat=VCFv4.2"
## [1] "##fileDate=2018-02-13T00:07:39"
## [1] "##source=VCF_popgen.pl"
## [1] "##reference=file:UMA13_denovo_clusters_relaxed.fasta"
## [1] "##contig=N/A"
## [1] "##phasing=none"
## [1] "First 6 rows."
## [1] 
## [1] "***** Fixed section *****"
##      CHROM              POS  ID REF ALT QUAL    FILTER
## [1,] "RAD_kmer_0000017" "89" NA "T" "C" "3139"  "PASS"
## [2,] "RAD_kmer_0000025" "27" NA "C" "T" "11314" "PASS"
## [3,] "RAD_kmer_0000025" "53" NA "C" "G" "1218"  "PASS"
## [4,] "RAD_kmer_0000025" "59" NA "G" "A" "3244"  "PASS"
## [5,] "RAD_kmer_0000025" "83" NA "C" "T" "5395"  "PASS"
## [6,] "RAD_kmer_0000026" "14" NA "T" "G" "3430"  "PASS"
## [1] 
## [1] "***** Genotype section *****"
##      FORMAT        UMA2_1              UMA35_1             
## [1,] "GT:DP:GQ:AD" "0/0:346:99:346,0"  "0/1:242:99:89,153" 
## [2,] "GT:DP:GQ:AD" "0/1:158:99:100,58" "0/1:235:99:123,112"
## [3,] "GT:DP:GQ:AD" "0/0:160:99:160,0"  "0/0:237:99:236,0"  
## [4,] "GT:DP:GQ:AD" "0/0:159:99:159,0"  "0/0:237:99:237,0"  
## [5,] "GT:DP:GQ:AD" "0/0:157:99:156,0"  "0/0:237:99:237,0"  
## [6,] "GT:DP:GQ:AD" "1/1:114:57:5,109"  "0/0:57:86:57,0"    
##      RS32_1             UMA20_1             UMA39_1           
## [1,] "0/0:208:99:207,1" "0/0:208:99:208,0"  "0/0:175:99:174,0"
## [2,] "0/0:92:99:92,0"   "0/0:160:99:160,0"  "0/1:194:99:97,97"
## [3,] "0/0:92:99:92,0"   "0/0:158:99:158,0"  "0/0:195:99:192,0"
## [4,] "0/0:91:99:91,0"   "0/0:158:99:158,0"  "0/0:195:99:194,1"
## [5,] "0/0:92:99:92,0"   "0/1:157:99:47,110" "0/0:195:99:195,0"
## [6,] "1/1:50:75:0,50"   "1/1:106:99:0,106"  "1/1:38:57:0,38"  
## [1] "First 6 columns only."
## [1] 
## [1] "Unique GT formats:"
## [1] "GT:DP:GQ:AD"
## [1]
dp<- extract.gt(vcf, element = 'DP', as.numeric=TRUE)
#UMA4_1 has been successfully removed!

#Plot Read Depth (DP) per individual 
boxplot(dp, las=3, col=c("#C0C0C0", "#808080"), ylab="Read Depth (DP)",las=2, cex=0.4, cex.axis=0.5)

#zoom to smaller values 
boxplot(dp, las=3, col=c("#C0C0C0", "#808080"), ylab="Read Depth (DP)",
las=2, cex=0.4, cex.axis=0.5, ylim=c(0,50))
abline(h=8, col="red") 

sNMF: a program to estimate ancestry coefficients

#sNMF is a fast and efficient method for estimating individual ancestry coefficients based on sparse non-negative matrix factorization algorithms. In [1], the performances of sNMF were compared to the likelihood algorithm implemented in the computer program ADMIXTURE. Without loss of accuracy, sNMF computed estimates of ancestry coefficients with run-times approximately 10 to 30 times shorter than those of ADMIXTURE.

#Install all the necessary packages below:
  #install.packages(c("fields","RColorBrewer","mapplots")) above
  #source("http://bioconductor.org/biocLite.R")
  #biocLite("LEA") above
  #require(LEA) already installed

#Commands to load functions and import files in STRUCTURE format
  #source("http://membres-timc.imag.fr/Olivier.Francois/Conversion.R")
  #source("http://membres-timc.imag.fr/Olivier.Francois/POPSutilities.R")

#Importing input files
#Our files need to be convertable to .geno or .lmff format used by LEA. The LEA package has a couple of very helpful functions to convert vcf files to these formats in R. If you can't use these functions, you may need to convert your vcf to STRUCTURE format outside of R using PGDSpider (downloaded at ). 

#We found that using the vcf2geno or vcf2lmff functions were much easier than converting our vcf dataset to STRUCTURE format and then using the struct2geno function.

#identify your input file (use the new revised vcf)
input.file <- "/Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf"

#vcf2geno function. The output.file should be same as the input file ending in .geno. force = TRUE if you want to overwrite an existing file under that name. 
vcf2geno(input.file, output.file = "/Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.geno", force = TRUE)
## 
##  - number of detected individuals:   23
##  - number of detected loci:      73055
## 
## For SNP info, please check /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.vcfsnp.
## 
## 1371 line(s) were removed because these are not SNPs.
## Please, check /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.removed file, for more informations.
## [1] "/Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.geno"
#vcf2lfmm(input.file, output.file = "/Users/tanyalama/TMorelli_UMassachusetts_RedSquirrel_20170525-01609/VCF_files/UMA13_aligned_genotypes_stringent.lfmm", force = TRUE)

Now that we have the correct input file formats, we can run a population structure analysis that assumes K = ? clusters. This can be done by using the snmf function of the LEA package. The input.file here should end in .geno

#Trying K clusters 1-10, and running 10 iterations per K
obj.snmf = snmf("/Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.geno", K = 1:10, project = "new", repetitions = 1, tolerance = 0.00001, entropy=TRUE, ploidy = 2)
## The project is saved into :
##  sed.vcf.recode.vcf.snmfProject 
## 
## To load the project, use:
##  project = load.snmfProject("sed.vcf.recode.vcf.snmfProject")
## 
## To remove the project, use:
##  remove.snmfProject("sed.vcf.recode.vcf.snmfProject")
## 
## [1] 1499776853
## [1] "*************************************"
## [1] "*          create.dataset            *"
## [1] "*************************************"
## summary of the options:
## 
##         -n (number of individuals)                 23
##         -L (number of loci)                        73055
##         -s (seed random init)                      1499776853
##         -r (percentage of masked data)             0.05
##         -x (genotype file in .geno format)         /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.geno
##         -o (output file in .geno format)           /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/masked/stringent_revised.vcf.recode.vcf_I.geno
## 
##  Write genotype file with masked data, /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/masked/stringent_revised.vcf.recode.vcf_I.geno:       OK.
## 
## [1] "*************************************"
## [1] "* sNMF K = 1  repetition 1      *"
## [1] "*************************************"
## summary of the options:
## 
##         -n (number of individuals)             23
##         -L (number of loci)                    73055
##         -K (number of ancestral pops)          1
##         -x (input file)                        /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/masked/stringent_revised.vcf.recode.vcf_I.geno
##         -q (individual admixture file)         /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/K1/run1/stringent_revised.vcf.recode.vcf_r1.1.Q
##         -g (ancestral frequencies file)        /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/K1/run1/stringent_revised.vcf.recode.vcf_r1.1.G
##         -i (number max of iterations)          200
##         -a (regularization parameter)          10
##         -s (seed random init)                  140618729047893
##         -e (tolerance error)                   1E-05
##         -p (number of processes)               1
##         - diploid
## 
## Read genotype file /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/masked/stringent_revised.vcf.recode.vcf_I.geno:       OK.
## 
## 
## Main algorithm:
## 
## Least-square error: 425152.795618
## Write individual ancestry coefficient file /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/K1/run1/stringent_revised.vcf.recode.vcf_r1.1.Q:      OK.
## Write ancestral allele frequency coefficient file /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/K1/run1/stringent_revised.vcf.recode.vcf_r1.1.G:   OK.
## 
## [1] "*************************************"
## [1] "*    cross-entropy estimation       *"
## [1] "*************************************"
## summary of the options:
## 
##         -n (number of individuals)         23
##         -L (number of loci)                73055
##         -K (number of ancestral pops)      1
##         -x (genotype file)                 /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.geno
##         -q (individual admixture)          /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/K1/run1/stringent_revised.vcf.recode.vcf_r1.1.Q
##         -g (ancestral frequencies)         /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/K1/run1/stringent_revised.vcf.recode.vcf_r1.1.G
##         -i (with masked genotypes)         /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/masked/stringent_revised.vcf.recode.vcf_I.geno
##         - diploid
## 
## Cross-Entropy (all data):     0.438142
## Cross-Entropy (masked data):  0.631116
## The project is saved into :
##  sed.vcf.recode.vcf.snmfProject 
## 
## To load the project, use:
##  project = load.snmfProject("sed.vcf.recode.vcf.snmfProject")
## 
## To remove the project, use:
##  remove.snmfProject("sed.vcf.recode.vcf.snmfProject")
## 
## [1] "*************************************"
## [1] "* sNMF K = 2  repetition 1      *"
## [1] "*************************************"
## summary of the options:
## 
##         -n (number of individuals)             23
##         -L (number of loci)                    73055
##         -K (number of ancestral pops)          2
##         -x (input file)                        /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/masked/stringent_revised.vcf.recode.vcf_I.geno
##         -q (individual admixture file)         /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/K2/run1/stringent_revised.vcf.recode.vcf_r1.2.Q
##         -g (ancestral frequencies file)        /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/K2/run1/stringent_revised.vcf.recode.vcf_r1.2.G
##         -i (number max of iterations)          200
##         -a (regularization parameter)          10
##         -s (seed random init)                  1499776853
##         -e (tolerance error)                   1E-05
##         -p (number of processes)               1
##         - diploid
## 
## Read genotype file /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/masked/stringent_revised.vcf.recode.vcf_I.geno:       OK.
## 
## 
## Main algorithm:
##  [                                                                           ]
##  [=====================]
## Number of iterations: 57
## 
## Least-square error: 389836.269337
## Write individual ancestry coefficient file /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/K2/run1/stringent_revised.vcf.recode.vcf_r1.2.Q:      OK.
## Write ancestral allele frequency coefficient file /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/K2/run1/stringent_revised.vcf.recode.vcf_r1.2.G:   OK.
## 
## [1] "*************************************"
## [1] "*    cross-entropy estimation       *"
## [1] "*************************************"
## summary of the options:
## 
##         -n (number of individuals)         23
##         -L (number of loci)                73055
##         -K (number of ancestral pops)      2
##         -x (genotype file)                 /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.geno
##         -q (individual admixture)          /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/K2/run1/stringent_revised.vcf.recode.vcf_r1.2.Q
##         -g (ancestral frequencies)         /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/K2/run1/stringent_revised.vcf.recode.vcf_r1.2.G
##         -i (with masked genotypes)         /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/masked/stringent_revised.vcf.recode.vcf_I.geno
##         - diploid
## 
## Cross-Entropy (all data):     0.399964
## Cross-Entropy (masked data):  0.6167
## The project is saved into :
##  sed.vcf.recode.vcf.snmfProject 
## 
## To load the project, use:
##  project = load.snmfProject("sed.vcf.recode.vcf.snmfProject")
## 
## To remove the project, use:
##  remove.snmfProject("sed.vcf.recode.vcf.snmfProject")
## 
## [1] "*************************************"
## [1] "* sNMF K = 3  repetition 1      *"
## [1] "*************************************"
## summary of the options:
## 
##         -n (number of individuals)             23
##         -L (number of loci)                    73055
##         -K (number of ancestral pops)          3
##         -x (input file)                        /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/masked/stringent_revised.vcf.recode.vcf_I.geno
##         -q (individual admixture file)         /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/K3/run1/stringent_revised.vcf.recode.vcf_r1.3.Q
##         -g (ancestral frequencies file)        /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/K3/run1/stringent_revised.vcf.recode.vcf_r1.3.G
##         -i (number max of iterations)          200
##         -a (regularization parameter)          10
##         -s (seed random init)                  734139723477731157
##         -e (tolerance error)                   1E-05
##         -p (number of processes)               1
##         - diploid
## 
## Read genotype file /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/masked/stringent_revised.vcf.recode.vcf_I.geno:       OK.
## 
## 
## Main algorithm:
##  [                                                                           ]
##  [=====================]
## Number of iterations: 56
## 
## Least-square error: 364309.903677
## Write individual ancestry coefficient file /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/K3/run1/stringent_revised.vcf.recode.vcf_r1.3.Q:      OK.
## Write ancestral allele frequency coefficient file /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/K3/run1/stringent_revised.vcf.recode.vcf_r1.3.G:   OK.
## 
## [1] "*************************************"
## [1] "*    cross-entropy estimation       *"
## [1] "*************************************"
## summary of the options:
## 
##         -n (number of individuals)         23
##         -L (number of loci)                73055
##         -K (number of ancestral pops)      3
##         -x (genotype file)                 /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.geno
##         -q (individual admixture)          /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/K3/run1/stringent_revised.vcf.recode.vcf_r1.3.Q
##         -g (ancestral frequencies)         /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/K3/run1/stringent_revised.vcf.recode.vcf_r1.3.G
##         -i (with masked genotypes)         /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/masked/stringent_revised.vcf.recode.vcf_I.geno
##         - diploid
## 
## Cross-Entropy (all data):     0.370011
## Cross-Entropy (masked data):  0.644527
## The project is saved into :
##  sed.vcf.recode.vcf.snmfProject 
## 
## To load the project, use:
##  project = load.snmfProject("sed.vcf.recode.vcf.snmfProject")
## 
## To remove the project, use:
##  remove.snmfProject("sed.vcf.recode.vcf.snmfProject")
## 
## [1] "*************************************"
## [1] "* sNMF K = 4  repetition 1      *"
## [1] "*************************************"
## summary of the options:
## 
##         -n (number of individuals)             23
##         -L (number of loci)                    73055
##         -K (number of ancestral pops)          4
##         -x (input file)                        /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/masked/stringent_revised.vcf.recode.vcf_I.geno
##         -q (individual admixture file)         /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/K4/run1/stringent_revised.vcf.recode.vcf_r1.4.Q
##         -g (ancestral frequencies file)        /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/K4/run1/stringent_revised.vcf.recode.vcf_r1.4.G
##         -i (number max of iterations)          200
##         -a (regularization parameter)          10
##         -s (seed random init)                  734139723477731157
##         -e (tolerance error)                   1E-05
##         -p (number of processes)               1
##         - diploid
## 
## Read genotype file /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/masked/stringent_revised.vcf.recode.vcf_I.geno:       OK.
## 
## 
## Main algorithm:
##  [                                                                           ]
##  [=================]
## Number of iterations: 45
## 
## Least-square error: 338771.990965
## Write individual ancestry coefficient file /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/K4/run1/stringent_revised.vcf.recode.vcf_r1.4.Q:      OK.
## Write ancestral allele frequency coefficient file /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/K4/run1/stringent_revised.vcf.recode.vcf_r1.4.G:   OK.
## 
## [1] "*************************************"
## [1] "*    cross-entropy estimation       *"
## [1] "*************************************"
## summary of the options:
## 
##         -n (number of individuals)         23
##         -L (number of loci)                73055
##         -K (number of ancestral pops)      4
##         -x (genotype file)                 /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.geno
##         -q (individual admixture)          /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/K4/run1/stringent_revised.vcf.recode.vcf_r1.4.Q
##         -g (ancestral frequencies)         /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/K4/run1/stringent_revised.vcf.recode.vcf_r1.4.G
##         -i (with masked genotypes)         /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/masked/stringent_revised.vcf.recode.vcf_I.geno
##         - diploid
## 
## Cross-Entropy (all data):     0.339153
## Cross-Entropy (masked data):  0.682428
## The project is saved into :
##  sed.vcf.recode.vcf.snmfProject 
## 
## To load the project, use:
##  project = load.snmfProject("sed.vcf.recode.vcf.snmfProject")
## 
## To remove the project, use:
##  remove.snmfProject("sed.vcf.recode.vcf.snmfProject")
## 
## [1] "*************************************"
## [1] "* sNMF K = 5  repetition 1      *"
## [1] "*************************************"
## summary of the options:
## 
##         -n (number of individuals)             23
##         -L (number of loci)                    73055
##         -K (number of ancestral pops)          5
##         -x (input file)                        /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/masked/stringent_revised.vcf.recode.vcf_I.geno
##         -q (individual admixture file)         /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/K5/run1/stringent_revised.vcf.recode.vcf_r1.5.Q
##         -g (ancestral frequencies file)        /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/K5/run1/stringent_revised.vcf.recode.vcf_r1.5.G
##         -i (number max of iterations)          200
##         -a (regularization parameter)          10
##         -s (seed random init)                  1499776853
##         -e (tolerance error)                   1E-05
##         -p (number of processes)               1
##         - diploid
## 
## Read genotype file /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/masked/stringent_revised.vcf.recode.vcf_I.geno:       OK.
## 
## 
## Main algorithm:
##  [                                                                           ]
##  [========================================]
## Number of iterations: 108
## 
## Least-square error: 317770.948172
## Write individual ancestry coefficient file /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/K5/run1/stringent_revised.vcf.recode.vcf_r1.5.Q:      OK.
## Write ancestral allele frequency coefficient file /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/K5/run1/stringent_revised.vcf.recode.vcf_r1.5.G:   OK.
## 
## [1] "*************************************"
## [1] "*    cross-entropy estimation       *"
## [1] "*************************************"
## summary of the options:
## 
##         -n (number of individuals)         23
##         -L (number of loci)                73055
##         -K (number of ancestral pops)      5
##         -x (genotype file)                 /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.geno
##         -q (individual admixture)          /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/K5/run1/stringent_revised.vcf.recode.vcf_r1.5.Q
##         -g (ancestral frequencies)         /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/K5/run1/stringent_revised.vcf.recode.vcf_r1.5.G
##         -i (with masked genotypes)         /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/masked/stringent_revised.vcf.recode.vcf_I.geno
##         - diploid
## 
## Cross-Entropy (all data):     0.305377
## Cross-Entropy (masked data):  0.744707
## The project is saved into :
##  sed.vcf.recode.vcf.snmfProject 
## 
## To load the project, use:
##  project = load.snmfProject("sed.vcf.recode.vcf.snmfProject")
## 
## To remove the project, use:
##  remove.snmfProject("sed.vcf.recode.vcf.snmfProject")
## 
## [1] "*************************************"
## [1] "* sNMF K = 6  repetition 1      *"
## [1] "*************************************"
## summary of the options:
## 
##         -n (number of individuals)             23
##         -L (number of loci)                    73055
##         -K (number of ancestral pops)          6
##         -x (input file)                        /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/masked/stringent_revised.vcf.recode.vcf_I.geno
##         -q (individual admixture file)         /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/K6/run1/stringent_revised.vcf.recode.vcf_r1.6.Q
##         -g (ancestral frequencies file)        /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/K6/run1/stringent_revised.vcf.recode.vcf_r1.6.G
##         -i (number max of iterations)          200
##         -a (regularization parameter)          10
##         -s (seed random init)                  4120283530098165589
##         -e (tolerance error)                   1E-05
##         -p (number of processes)               1
##         - diploid
## 
## Read genotype file /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/masked/stringent_revised.vcf.recode.vcf_I.geno:       OK.
## 
## 
## Main algorithm:
##  [                                                                           ]
##  [===========================================================================]
## Number of iterations: 200
## 
## Least-square error: 297797.135814
## Write individual ancestry coefficient file /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/K6/run1/stringent_revised.vcf.recode.vcf_r1.6.Q:      OK.
## Write ancestral allele frequency coefficient file /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/K6/run1/stringent_revised.vcf.recode.vcf_r1.6.G:   OK.
## 
## [1] "*************************************"
## [1] "*    cross-entropy estimation       *"
## [1] "*************************************"
## summary of the options:
## 
##         -n (number of individuals)         23
##         -L (number of loci)                73055
##         -K (number of ancestral pops)      6
##         -x (genotype file)                 /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.geno
##         -q (individual admixture)          /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/K6/run1/stringent_revised.vcf.recode.vcf_r1.6.Q
##         -g (ancestral frequencies)         /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/K6/run1/stringent_revised.vcf.recode.vcf_r1.6.G
##         -i (with masked genotypes)         /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/masked/stringent_revised.vcf.recode.vcf_I.geno
##         - diploid
## 
## Cross-Entropy (all data):     0.277674
## Cross-Entropy (masked data):  0.791478
## The project is saved into :
##  sed.vcf.recode.vcf.snmfProject 
## 
## To load the project, use:
##  project = load.snmfProject("sed.vcf.recode.vcf.snmfProject")
## 
## To remove the project, use:
##  remove.snmfProject("sed.vcf.recode.vcf.snmfProject")
## 
## [1] "*************************************"
## [1] "* sNMF K = 7  repetition 1      *"
## [1] "*************************************"
## summary of the options:
## 
##         -n (number of individuals)             23
##         -L (number of loci)                    73055
##         -K (number of ancestral pops)          7
##         -x (input file)                        /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/masked/stringent_revised.vcf.recode.vcf_I.geno
##         -q (individual admixture file)         /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/K7/run1/stringent_revised.vcf.recode.vcf_r1.7.Q
##         -g (ancestral frequencies file)        /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/K7/run1/stringent_revised.vcf.recode.vcf_r1.7.G
##         -i (number max of iterations)          200
##         -a (regularization parameter)          10
##         -s (seed random init)                  734140831579293525
##         -e (tolerance error)                   1E-05
##         -p (number of processes)               1
##         - diploid
## 
## Read genotype file /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/masked/stringent_revised.vcf.recode.vcf_I.geno:       OK.
## 
## 
## Main algorithm:
##  [                                                                           ]
##  [==================]
## Number of iterations: 47
## 
## Least-square error: 279053.810384
## Write individual ancestry coefficient file /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/K7/run1/stringent_revised.vcf.recode.vcf_r1.7.Q:      OK.
## Write ancestral allele frequency coefficient file /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/K7/run1/stringent_revised.vcf.recode.vcf_r1.7.G:   OK.
## 
## [1] "*************************************"
## [1] "*    cross-entropy estimation       *"
## [1] "*************************************"
## summary of the options:
## 
##         -n (number of individuals)         23
##         -L (number of loci)                73055
##         -K (number of ancestral pops)      7
##         -x (genotype file)                 /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.geno
##         -q (individual admixture)          /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/K7/run1/stringent_revised.vcf.recode.vcf_r1.7.Q
##         -g (ancestral frequencies)         /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/K7/run1/stringent_revised.vcf.recode.vcf_r1.7.G
##         -i (with masked genotypes)         /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/masked/stringent_revised.vcf.recode.vcf_I.geno
##         - diploid
## 
## Cross-Entropy (all data):     0.255454
## Cross-Entropy (masked data):  0.847322
## The project is saved into :
##  sed.vcf.recode.vcf.snmfProject 
## 
## To load the project, use:
##  project = load.snmfProject("sed.vcf.recode.vcf.snmfProject")
## 
## To remove the project, use:
##  remove.snmfProject("sed.vcf.recode.vcf.snmfProject")
## 
## [1] "*************************************"
## [1] "* sNMF K = 8  repetition 1      *"
## [1] "*************************************"
## summary of the options:
## 
##         -n (number of individuals)             23
##         -L (number of loci)                    73055
##         -K (number of ancestral pops)          8
##         -x (input file)                        /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/masked/stringent_revised.vcf.recode.vcf_I.geno
##         -q (individual admixture file)         /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/K8/run1/stringent_revised.vcf.recode.vcf_r1.8.Q
##         -g (ancestral frequencies file)        /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/K8/run1/stringent_revised.vcf.recode.vcf_r1.8.G
##         -i (number max of iterations)          200
##         -a (regularization parameter)          10
##         -s (seed random init)                  1499776853
##         -e (tolerance error)                   1E-05
##         -p (number of processes)               1
##         - diploid
## 
## Read genotype file /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/masked/stringent_revised.vcf.recode.vcf_I.geno:       OK.
## 
## 
## Main algorithm:
##  [                                                                           ]
##  [====================================================================]
## Number of iterations: 181
## 
## Least-square error: 259509.913215
## Write individual ancestry coefficient file /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/K8/run1/stringent_revised.vcf.recode.vcf_r1.8.Q:      OK.
## Write ancestral allele frequency coefficient file /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/K8/run1/stringent_revised.vcf.recode.vcf_r1.8.G:   OK.
## 
## [1] "*************************************"
## [1] "*    cross-entropy estimation       *"
## [1] "*************************************"
## summary of the options:
## 
##         -n (number of individuals)         23
##         -L (number of loci)                73055
##         -K (number of ancestral pops)      8
##         -x (genotype file)                 /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.geno
##         -q (individual admixture)          /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/K8/run1/stringent_revised.vcf.recode.vcf_r1.8.Q
##         -g (ancestral frequencies)         /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/K8/run1/stringent_revised.vcf.recode.vcf_r1.8.G
##         -i (with masked genotypes)         /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/masked/stringent_revised.vcf.recode.vcf_I.geno
##         - diploid
## 
## Cross-Entropy (all data):     0.231616
## Cross-Entropy (masked data):  0.901703
## The project is saved into :
##  sed.vcf.recode.vcf.snmfProject 
## 
## To load the project, use:
##  project = load.snmfProject("sed.vcf.recode.vcf.snmfProject")
## 
## To remove the project, use:
##  remove.snmfProject("sed.vcf.recode.vcf.snmfProject")
## 
## [1] "*************************************"
## [1] "* sNMF K = 9  repetition 1      *"
## [1] "*************************************"
## summary of the options:
## 
##         -n (number of individuals)             23
##         -L (number of loci)                    73055
##         -K (number of ancestral pops)          9
##         -x (input file)                        /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/masked/stringent_revised.vcf.recode.vcf_I.geno
##         -q (individual admixture file)         /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/K9/run1/stringent_revised.vcf.recode.vcf_r1.9.Q
##         -g (ancestral frequencies file)        /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/K9/run1/stringent_revised.vcf.recode.vcf_r1.9.G
##         -i (number max of iterations)          200
##         -a (regularization parameter)          10
##         -s (seed random init)                  3467825796803250005
##         -e (tolerance error)                   1E-05
##         -p (number of processes)               1
##         - diploid
## 
## Read genotype file /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/masked/stringent_revised.vcf.recode.vcf_I.geno:       OK.
## 
## 
## Main algorithm:
##  [                                                                           ]
##  [=========================================================================]
## Number of iterations: 194
## 
## Least-square error: 239499.709436
## Write individual ancestry coefficient file /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/K9/run1/stringent_revised.vcf.recode.vcf_r1.9.Q:      OK.
## Write ancestral allele frequency coefficient file /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/K9/run1/stringent_revised.vcf.recode.vcf_r1.9.G:   OK.
## 
## [1] "*************************************"
## [1] "*    cross-entropy estimation       *"
## [1] "*************************************"
## summary of the options:
## 
##         -n (number of individuals)         23
##         -L (number of loci)                73055
##         -K (number of ancestral pops)      9
##         -x (genotype file)                 /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.geno
##         -q (individual admixture)          /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/K9/run1/stringent_revised.vcf.recode.vcf_r1.9.Q
##         -g (ancestral frequencies)         /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/K9/run1/stringent_revised.vcf.recode.vcf_r1.9.G
##         -i (with masked genotypes)         /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/masked/stringent_revised.vcf.recode.vcf_I.geno
##         - diploid
## 
## Cross-Entropy (all data):     0.21423
## Cross-Entropy (masked data):  0.943955
## The project is saved into :
##  sed.vcf.recode.vcf.snmfProject 
## 
## To load the project, use:
##  project = load.snmfProject("sed.vcf.recode.vcf.snmfProject")
## 
## To remove the project, use:
##  remove.snmfProject("sed.vcf.recode.vcf.snmfProject")
## 
## [1] "*************************************"
## [1] "* sNMF K = 10  repetition 1      *"
## [1] "*************************************"
## summary of the options:
## 
##         -n (number of individuals)             23
##         -L (number of loci)                    73055
##         -K (number of ancestral pops)          10
##         -x (input file)                        /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/masked/stringent_revised.vcf.recode.vcf_I.geno
##         -q (individual admixture file)         /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/K10/run1/stringent_revised.vcf.recode.vcf_r1.10.Q
##         -g (ancestral frequencies file)        /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/K10/run1/stringent_revised.vcf.recode.vcf_r1.10.G
##         -i (number max of iterations)          200
##         -a (regularization parameter)          10
##         -s (seed random init)                  140618729047893
##         -e (tolerance error)                   1E-05
##         -p (number of processes)               1
##         - diploid
## 
## Read genotype file /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/masked/stringent_revised.vcf.recode.vcf_I.geno:       OK.
## 
## 
## Main algorithm:
##  [                                                                           ]
##  [====================================================]
## Number of iterations: 140
## 
## Least-square error: 219577.031644
## Write individual ancestry coefficient file /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/K10/run1/stringent_revised.vcf.recode.vcf_r1.10.Q:        OK.
## Write ancestral allele frequency coefficient file /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/K10/run1/stringent_revised.vcf.recode.vcf_r1.10.G: OK.
## 
## [1] "*************************************"
## [1] "*    cross-entropy estimation       *"
## [1] "*************************************"
## summary of the options:
## 
##         -n (number of individuals)         23
##         -L (number of loci)                73055
##         -K (number of ancestral pops)      10
##         -x (genotype file)                 /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.geno
##         -q (individual admixture)          /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/K10/run1/stringent_revised.vcf.recode.vcf_r1.10.Q
##         -g (ancestral frequencies)         /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/K10/run1/stringent_revised.vcf.recode.vcf_r1.10.G
##         -i (with masked genotypes)         /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.snmf/masked/stringent_revised.vcf.recode.vcf_I.geno
##         - diploid
## 
## Cross-Entropy (all data):     0.194504
## Cross-Entropy (masked data):  1.00686
## The project is saved into :
##  sed.vcf.recode.vcf.snmfProject 
## 
## To load the project, use:
##  project = load.snmfProject("sed.vcf.recode.vcf.snmfProject")
## 
## To remove the project, use:
##  remove.snmfProject("sed.vcf.recode.vcf.snmfProject")

Examine the results

# plot cross-entropy criterion of all runs of the project
#This plot helps us identify the number of clusters to identify as the most likely for our population (looks like k=3)
######## using the revised vcf (no duplicates) the new results show k=2 instead of k=3 *************
plot(obj.snmf, cex = 1.2, col = "lightblue", pch = 19)

# get the cross-entropy value for each run at K = ?
ce <- cross.entropy(obj.snmf, K= 2) #we only did 5 runs. More realistically we would perform many runs and select the best one
ce
##           K = 2
## run 1 0.6166998
# select the run with the lowest cross-entropy value for K = ?
best <- which.min(ce) # the best run is #1 at k=3
best
## [1] 1
#At the end of the run, the qmatrix object contains the matrix of ancestry coefficients for each individual and for K = 3 clusters. The Q-matrix has 24 rows and 3 columns, and it is traditionally displayed using a barplot representation. For this representation, we just use the barplot function of R (Figure 1).
#qmatrix = Q(obj.snmf, K = 3, run = best) #best option

qmatrix = Q(obj.snmf, K = 2, run = best) # requested by TL, K= 2, K=4
qmatrix
##             V1          V2
##  [1,] 0.999900 0.000100000
##  [2,] 0.979013 0.020986700
##  [3,] 0.976394 0.023605900
##  [4,] 0.991573 0.008426700
##  [5,] 0.999622 0.000377678
##  [6,] 0.999900 0.000100000
##  [7,] 0.990533 0.009466870
##  [8,] 0.982767 0.017233300
##  [9,] 0.999900 0.000100000
## [10,] 0.999900 0.000100000
## [11,] 0.977721 0.022279100
## [12,] 0.993784 0.006215670
## [13,] 0.993655 0.006345490
## [14,] 0.000100 0.999900000
## [15,] 0.993993 0.006007400
## [16,] 0.975044 0.024955800
## [17,] 0.973146 0.026854100
## [18,] 0.000100 0.999900000
## [19,] 0.996998 0.003001890
## [20,] 0.982017 0.017983100
## [21,] 0.999900 0.000100000
## [22,] 0.999900 0.000100000
## [23,] 0.999900 0.000100000
# Name the cluster assignment for each individual
cluster<- apply(qmatrix, 1, which.max) #this corresponds with the 1:24 order 
cluster
##  [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 2 1 1 1 1 1
# [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 2 1 1 1 1 1
# e.g. Individual #1 (UMA2) was assigned to cluster 1

Estimating Individual Admixture Proportions from NGS data

barplot(t(qmatrix),col=c("orange","violet"),border = NA, space = .2,xlab = "Individuals", ylab = "Admixture coefficients", main = "Ancestry matrix", horiz = FALSE, names.arg = c("UMA2","UMA35","RS32","UMA20","UMA39","UMA19","RS104","UMA13","UMA5", "UMA3", "UMA26", "RS33","UMA23", "UMA40", "UMA8", "UMA6", "UMA9", "UMA38", "UMA17", "UMA41", "UMA37", "UMA21", "UMA42"),cex.names=0.65, las = 2)

#legend v1, n= 21;  v2, n= 2

Other post-run sNMF treatments

Call the ancestral genotype frequency matrix, G, for the best run of k=3.

gmatrix = G(obj.snmf, K = 2, run = best)
head(gmatrix)
##             V1         V2
## [1,] 0.9040200 0.99862600
## [2,] 0.0959803 0.00137352
## [3,] 0.0001000 0.00010000
## [4,] 0.6622160 0.99990000
## [5,] 0.3377840 0.00010000
## [6,] 0.0001000 0.00010000
barplot(gmatrix,  border = NA, space = 0.2,  col = c("orange","violet"),  xlab = "Populations", ylab = "Ancestry proportions",  main = "Ancestry matrix") ->gp

Review sNMF run results

show(obj.snmf)
##file directory:                   K3/run1/ 
## Q output file:                    UMA13_aligned_genotypes_stringent_r1.3.Q 
## G output file:                    UMA13_aligned_genotypes_stringent_r1.3.G 
## snmfClass file:                   UMA13_aligned_genotypes_stringent_r1.3.snmfClass 
## number of ancestral populations:  3 
## run number:                       1 
## regularization parameter:         10 
## number of CPUs:                   1 
## seed:                             733639482 
## maximal number of iterations:     200 
## tolerance error:                  1e-05 
## Q input file:                      
## cross-Entropy:                    0.5789651 

Summary of the project

summary(obj.snmf)
## $repetitions
##                       K = 1 K = 2 K = 3 K = 4 K = 5 K = 6 K = 7 K = 8
## with cross-entropy        1     1     1     1     1     1     1     1
## without cross-entropy     0     0     0     0     0     0     0     0
## total                     1     1     1     1     1     1     1     1
##                       K = 9 K = 10
## with cross-entropy        1      1
## without cross-entropy     0      0
## total                     1      1
## 
## $crossEntropy
##          K = 1     K = 2    K = 3     K = 4     K = 5     K = 6     K = 7
## min  0.6311159 0.6166998 0.644527 0.6824277 0.7447072 0.7914778 0.8473218
## mean 0.6311159 0.6166998 0.644527 0.6824277 0.7447072 0.7914778 0.8473218
## max  0.6311159 0.6166998 0.644527 0.6824277 0.7447072 0.7914778 0.8473218
##          K = 8     K = 9   K = 10
## min  0.9017026 0.9439546 1.006863
## mean 0.9017026 0.9439546 1.006863
## max  0.9017026 0.9439546 1.006863

render(“/Users/tanyalama/Box Sync/Squirrel/sNMF_revised.Rmd”, output_format = , output_file = “/Users/tanyalama/Box Sync/Squirrel/sNMF_revised”)