sNMF analysis on red squirrel SNP data

library(vcfR)
library(adegenet)
library(adegraphics)
library(pegas)
library(StAMPP)
library(lattice)
library(gplots)
library(ape)
library(ggmap) 
require(LEA)

vcf<- read.vcfR("/Users/tanyalama/TMorelli_UMassachusetts_RedSquirrel_20170525-01609/VCF_files/UMA13_aligned_genotypes_stringent.vcf", verbose = FALSE)
head(vcf)

## [1] "***** Object of class 'vcfR' *****"
## [1] "***** Meta section *****"
## [1] "##fileformat=VCFv4.2"
## [1] "##fileDate=2018-02-13T00:07:39"
## [1] "##source=VCF_popgen.pl"
## [1] "##reference=file:UMA13_denovo_clusters_relaxed.fasta"
## [1] "##contig=N/A"
## [1] "##phasing=none"
## [1] "First 6 rows."
## [1] 
## [1] "***** Fixed section *****"
##      CHROM              POS          ID REF ALT QUAL    FILTER
## [1,] "RAD_kmer_0000017" "0000000089" NA "T" "C" "3139"  "PASS"
## [2,] "RAD_kmer_0000025" "0000000027" NA "C" "T" "11314" "PASS"
## [3,] "RAD_kmer_0000025" "0000000053" NA "C" "G" "1218"  "PASS"
## [4,] "RAD_kmer_0000025" "0000000059" NA "G" "A" "3244"  "PASS"
## [5,] "RAD_kmer_0000025" "0000000083" NA "C" "T" "5395"  "PASS"
## [6,] "RAD_kmer_0000026" "0000000014" NA "T" "G" "3430"  "PASS"
## [1] 
## [1] "***** Genotype section *****"
##      FORMAT        UMA2_1                UMA35_1               
## [1,] "GT:DP:GQ:AD" "0/0:346:521:346,0"   "0/1:242:1658:89,153" 
## [2,] "GT:DP:GQ:AD" "0/1:158:1083:100,58" "0/1:235:2201:123,112"
## [3,] "GT:DP:GQ:AD" "0/0:160:241:160,0"   "0/0:237:334:236,0"   
## [4,] "GT:DP:GQ:AD" "0/0:159:239:159,0"   "0/0:237:357:237,0"   
## [5,] "GT:DP:GQ:AD" "0/0:157:213:156,0"   "0/0:237:357:237,0"   
## [6,] "GT:DP:GQ:AD" "1/1:114:57:5,109"    "0/0:57:86:57,0"      
##      RS32_1              UMA20_1              UMA39_1             
## [1,] "0/0:208:290:207,1" "0/0:208:313:208,0"  "0/0:175:240:174,0" 
## [2,] "0/0:92:138:92,0"   "0/0:160:241:160,0"  "0/1:194:1915:97,97"
## [3,] "0/0:92:138:92,0"   "0/0:158:238:158,0"  "0/0:195:227:192,0" 
## [4,] "0/0:91:137:91,0"   "0/0:158:238:158,0"  "0/0:195:271:194,1" 
## [5,] "0/0:92:138:92,0"   "0/1:157:823:47,110" "0/0:195:293:195,0" 
## [6,] "1/1:50:75:0,50"    "1/1:106:160:0,106"  "1/1:38:57:0,38"    
## [1] "First 6 columns only."
## [1] 
## [1] "Unique GT formats:"
## [1] "GT:DP:GQ:AD"
## [1]

#Then plot important statistics summed over entire VCF
#chrom <- create.chromR(name='RAD_data', vcf=vcf) 
#plot(chrom) #plot the data

#Then extract the allele depths per each sample (DP field of VCF) and plot distribution of allele depths of all sites per each sample. NB: You may inspect and visualize other fields of VCF, e.g. allele depth (AD) or genotype quality (GQ)
# read depth distribution per individual 
dp<- extract.gt(vcf, element = 'DP', as.numeric=TRUE)
#pdf("DP_RAD_data.pdf", width = 10, height=3) # boxplot #where did this go?
#par(mar=c(8,4,1,1)) 

#Plot Read Depth (DP) per individual 
boxplot(dp, las=3, col=c("#C0C0C0", "#808080"), ylab="Read Depth (DP)",las=2, cex=0.4, cex.axis=0.5)

#zoom to smaller values 
boxplot(dp, las=3, col=c("#C0C0C0", "#808080"), ylab="Read Depth (DP)",
las=2, cex=0.4, cex.axis=0.5, ylim=c(0,50))
abline(h=8, col="red")

sNMF: a program to estimate ancestry coefficients

#sNMF is a fast and efficient method for estimating individual ancestry coefficients based on sparse non-negative matrix factorization algorithms. In [1], the performances of sNMF were compared to the likelihood algorithm implemented in the computer program ADMIXTURE. Without loss of accuracy, sNMF computed estimates of ancestry coefficients with run-times approximately 10 to 30 times shorter than those of ADMIXTURE.

#Install all the necessary packages below:
  #install.packages(c("fields","RColorBrewer","mapplots"))
  #source("http://bioconductor.org/biocLite.R")
  #biocLite("LEA")
  #require(LEA)

#Commands to load functions and import files in STRUCTURE format
  #source("http://membres-timc.imag.fr/Olivier.Francois/Conversion.R")
  #source("http://membres-timc.imag.fr/Olivier.Francois/POPSutilities.R")

#Importing input files
#Our files need to be convertable to .geno or .lmff format used by LEA. The LEA package has a couple of very helpful functions to convert vcf files to these formats in R. If you can't use these functions, you may need to convert your vcf to STRUCTURE format outside of R using PGDSpider (downloaded at ). 

#We found that using the vcf2geno or vcf2lmff functions were much easier than converting our vcf dataset to STRUCTURE format and then using the struct2geno function.1371 line(s) were removed because these are not SNPs.
#identify your input file
input.file <- "/Users/tanyalama/TMorelli_UMassachusetts_RedSquirrel_20170525-01609/VCF_files/UMA13_aligned_genotypes_stringent.vcf"

#vcf2geno function. The output.file should be same as the input file ending in .geno. force = TRUE if you want to overwrite an existing file under that name. 
vcf2geno(input.file, output.file = "/Users/tanyalama/TMorelli_UMassachusetts_RedSquirrel_20170525-01609/VCF_files/UMA13_aligned_genotypes_stringent.geno", force = TRUE)

## 
##  - number of detected individuals:   24
##  - number of detected loci:      73055
## 
## For SNP info, please check /Users/tanyalama/TMorelli_UMassachusetts_RedSquirrel_20170525-01609/VCF_files/UMA13_aligned_genotypes_stringent.vcfsnp.
## 
## 1371 line(s) were removed because these are not SNPs.
## Please, check /Users/tanyalama/TMorelli_UMassachusetts_RedSquirrel_20170525-01609/VCF_files/UMA13_aligned_genotypes_stringent.removed file, for more informations.

## [1] "/Users/tanyalama/TMorelli_UMassachusetts_RedSquirrel_20170525-01609/VCF_files/UMA13_aligned_genotypes_stringent.geno"

#vcf2lfmm(input.file, output.file = "/Users/tanyalama/TMorelli_UMassachusetts_RedSquirrel_20170525-01609/VCF_files/UMA13_aligned_genotypes_stringent.lfmm", force = TRUE)

Now that we have the correct input file formats, we can run a population structure analysis that assumes K = ? clusters. This can be done by using the snmf function of the LEA package. The input.file here should end in .geno

#Trying K clusters 1-10, and running 10 iterations per K
obj.snmf = snmf("/Users/tanyalama/TMorelli_UMassachusetts_RedSquirrel_20170525-01609/VCF_files/UMA13_aligned_genotypes_stringent.geno", K = 1:10, project = "new", repetitions = 1, tolerance = 0.00001, entropy=TRUE, ploidy = 2)

Examine the results

# plot cross-entropy criterion of all runs of the project
#This plot helps us identify the number of clusters to identify as the most likely for our population (looks like k=3)
plot(obj.snmf, cex = 1.2, col = "lightblue", pch = 19)

# get the cross-entropy value for each run at K = ?
ce <- cross.entropy(obj.snmf, K = 3) #we only did 1 run. More realistically we would perform many runs and select the best one

# select the run with the lowest cross-entropy value for K = ?
best <- which.min(ce) # the best run is #1 at k=3

#At the end of the run, the qmatrix object contains the matrix of ancestry coefficients for each individual and for K = 3 clusters. The Q-matrix has 24 rows and 3 columns, and it is traditionally displayed using a barplot representation. For this representation, we just use the barplot function of R (Figure 1).
qmatrix = Q(obj.snmf, K = 3, run = best)
qmatrix

##               V1          V2         V3
##  [1,] 0.00009999 0.000099990 0.99980000
##  [2,] 0.00009999 0.017684100 0.98221600
##  [3,] 0.02549310 0.020128800 0.95437800
##  [4,] 0.02419160 0.008085870 0.96772300
##  [5,] 0.00009999 0.000099990 0.99980000
##  [6,] 0.00433452 0.000099991 0.99556500
##  [7,] 0.01047820 0.008707010 0.98081500
##  [8,] 0.01340790 0.013623200 0.97296900
##  [9,] 0.01328680 0.000099991 0.98661300
## [10,] 0.68945000 0.015318700 0.29523100
## [11,] 0.02054980 0.023560600 0.95589000
## [12,] 0.01342240 0.002031030 0.98454700
## [13,] 0.02527490 0.008249450 0.96647600
## [14,] 0.00009999 0.999800000 0.00009999
## [15,] 0.01841360 0.004538520 0.97704800
## [16,] 0.00737400 0.018958900 0.97366700
## [17,] 0.01814580 0.028947800 0.95290600
## [18,] 0.00009999 0.999800000 0.00009999
## [19,] 0.01563070 0.000721297 0.98364800
## [20,] 0.01489680 0.014616000 0.97048700
## [21,] 0.99980000 0.000099990 0.00009999
## [22,] 0.01263910 0.000099991 0.98726100
## [23,] 0.99980000 0.000099990 0.00009999
## [24,] 0.00009999 0.000099990 0.99980000

# Name the cluster assignment for each individual
cluster<- apply(qmatrix, 1, which.max) #this corresponds with the 1:24 order 
# [1] 1 1 1 1 1 1 1 1 1 2 1 1 1 3 1 1 1 3 1 1 2 1 
# e.g. Individual #1 (UMA2) was assigned to cluster 1

Ancestry proportions

Estimating Individual Admixture Proportions from NGS data

barplot(t(qmatrix),col=c("orange","violet","lightgreen"),border = NA, space = .2,xlab = "Individuals", ylab = "Admixture coefficients", main = "Ancestry matrix", horiz = FALSE, names.arg = c("UMA2","UMA35","RS32","UMA20","UMA39","UMA19","RS104","UMA13","UMA5", "UMA3", "UMA26", "RS33","UMA23", "UMA40", "UMA8", "UMA6", "UMA9", "UMA38", "UMA17", "UMA41", "UMA4", "UMA37", "UMA21", "UMA42"),cex.names=0.65, las = 2)

#legend v1, n= 19;  v2, n= 3;  v3, n=2

Other post-run sNMF treatments Call the ancestral genotype frequency matrix, G, for the best run of k=3.

gmatrix = G(obj.snmf, K = 3, run = best)
head(gmatrix)

##          V1     V2       V3
## [1,] 0.9999 0.9999 0.891949
## [2,] 0.0001 0.0001 0.108051
## [3,] 0.0001 0.0001 0.000100
## [4,] 0.9999 0.9999 0.727694
## [5,] 0.0001 0.0001 0.272306
## [6,] 0.0001 0.0001 0.000100

barplot(gmatrix,  border = NA, space = 0.2,  col = c("orange","violet","lightgreen"),  xlab = "Populations", ylab = "Ancestry proportions",  main = "Ancestry matrix") ->gp

Review sNMF run results

show(obj.snmf)
##file directory:                   K3/run1/ 
## Q output file:                    UMA13_aligned_genotypes_stringent_r1.3.Q 
## G output file:                    UMA13_aligned_genotypes_stringent_r1.3.G 
## snmfClass file:                   UMA13_aligned_genotypes_stringent_r1.3.snmfClass 
## number of ancestral populations:  3 
## run number:                       1 
## regularization parameter:         10 
## number of CPUs:                   1 
## seed:                             733639482 
## maximal number of iterations:     200 
## tolerance error:                  1e-05 
## Q input file:                      
## cross-Entropy:                    0.5789651

Summary of the project

summary(obj.snmf)

## $repetitions
##                       K = 1 K = 2 K = 3 K = 4 K = 5 K = 6 K = 7 K = 8
## with cross-entropy        1     1     1     1     1     1     1     1
## without cross-entropy     0     0     0     0     0     0     0     0
## total                     1     1     1     1     1     1     1     1
##                       K = 9 K = 10
## with cross-entropy        1      1
## without cross-entropy     0      0
## total                     1      1
## 
## $crossEntropy
##          K = 1     K = 2     K = 3    K = 4     K = 5     K = 6     K = 7
## min  0.6063426 0.5879886 0.5725016 0.606929 0.6661644 0.7253022 0.7598518
## mean 0.6063426 0.5879886 0.5725016 0.606929 0.6661644 0.7253022 0.7598518
## max  0.6063426 0.5879886 0.5725016 0.606929 0.6661644 0.7253022 0.7598518
##          K = 8     K = 9    K = 10
## min  0.8290144 0.8536193 0.9081553
## mean 0.8290144 0.8536193 0.9081553
## max  0.8290144 0.8536193 0.9081553

Next: K-means clustering and discriminant analysis of principal components (DAPC)

#K‐means clustering is a fast method how to assign individuals into groups. Unlike STRUCTURE it does not make assumptions on population genetic parameters such as Hardy Weinberg equilibrium....

sNMF analysis on red squirrel SNP data

Tanya Lama

4/13/2018