Red Squirrel Admixture Scatter Pie

We display 23 individual estimates of admixture (K= 1:6) by overlaying pie-charts on a geographic map of the White Mountain National Forest.

r = getOption("repos")
r["CRAN"] = "http://cran.us.r-project.org"
options(repos = r)
library(vcfR)
library(adegenet)
library(adegraphics)
library(pegas)
library(StAMPP)
library(lattice)
library(gplots)
library(ape)
library(ggmap) 
library(LEA)
library(maps)
library(mapdata)
library(maptools)
library(mapproj)
library(mapplots)
library(dismo)
library(plotrix)
library(dplyr)
library(rgdal)
library(ggplot2)
library(raster)
library(TESS)
library(ggforce)
install.packages('scatterpie')

## 
## The downloaded binary packages are in
##  /var/folders/d3/4f6lh5bn3k3_2xw9p_jz7xgh0000gn/T//RtmpiRQNJk/downloaded_packages

library(scatterpie)
install.packages("LEA_1.4.0_tar.gz", repos = NULL, type ="source")

Running code for sNMF using our revised VCF

We had to drop UMA4_1 from the vcf because it was a duplicate of UMA21_1. This was accomplished using vcftools. The code for downloading and executing this via the UNIX command line can be found on my github. This step cannot be accomplished in R.

#Read in the new revised vcf and proceed
vcf<- read.vcfR("/Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf", verbose = FALSE)
head(vcf)

## [1] "***** Object of class 'vcfR' *****"
## [1] "***** Meta section *****"
## [1] "##fileformat=VCFv4.2"
## [1] "##fileDate=2018-02-13T00:07:39"
## [1] "##source=VCF_popgen.pl"
## [1] "##reference=file:UMA13_denovo_clusters_relaxed.fasta"
## [1] "##contig=N/A"
## [1] "##phasing=none"
## [1] "First 6 rows."
## [1] 
## [1] "***** Fixed section *****"
##      CHROM              POS  ID REF ALT QUAL    FILTER
## [1,] "RAD_kmer_0000017" "89" NA "T" "C" "3139"  "PASS"
## [2,] "RAD_kmer_0000025" "27" NA "C" "T" "11314" "PASS"
## [3,] "RAD_kmer_0000025" "53" NA "C" "G" "1218"  "PASS"
## [4,] "RAD_kmer_0000025" "59" NA "G" "A" "3244"  "PASS"
## [5,] "RAD_kmer_0000025" "83" NA "C" "T" "5395"  "PASS"
## [6,] "RAD_kmer_0000026" "14" NA "T" "G" "3430"  "PASS"
## [1] 
## [1] "***** Genotype section *****"
##      FORMAT        UMA2_1              UMA35_1             
## [1,] "GT:DP:GQ:AD" "0/0:346:99:346,0"  "0/1:242:99:89,153" 
## [2,] "GT:DP:GQ:AD" "0/1:158:99:100,58" "0/1:235:99:123,112"
## [3,] "GT:DP:GQ:AD" "0/0:160:99:160,0"  "0/0:237:99:236,0"  
## [4,] "GT:DP:GQ:AD" "0/0:159:99:159,0"  "0/0:237:99:237,0"  
## [5,] "GT:DP:GQ:AD" "0/0:157:99:156,0"  "0/0:237:99:237,0"  
## [6,] "GT:DP:GQ:AD" "1/1:114:57:5,109"  "0/0:57:86:57,0"    
##      RS32_1             UMA20_1             UMA39_1           
## [1,] "0/0:208:99:207,1" "0/0:208:99:208,0"  "0/0:175:99:174,0"
## [2,] "0/0:92:99:92,0"   "0/0:160:99:160,0"  "0/1:194:99:97,97"
## [3,] "0/0:92:99:92,0"   "0/0:158:99:158,0"  "0/0:195:99:192,0"
## [4,] "0/0:91:99:91,0"   "0/0:158:99:158,0"  "0/0:195:99:194,1"
## [5,] "0/0:92:99:92,0"   "0/1:157:99:47,110" "0/0:195:99:195,0"
## [6,] "1/1:50:75:0,50"   "1/1:106:99:0,106"  "1/1:38:57:0,38"  
## [1] "First 6 columns only."
## [1] 
## [1] "Unique GT formats:"
## [1] "GT:DP:GQ:AD"
## [1]

dp<- extract.gt(vcf, element = 'DP', as.numeric=TRUE) 
# UMA4_1 has been successfully removed!

# identify your input file (use the new revised vcf)
input.file <- "/Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf"

# vcf2geno function. The output.file should be same as the input file ending in .geno. force = TRUE if you want to overwrite an existing file under that name. 
vcf2geno(input.file, output.file = "/Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.geno", force = TRUE)

## 
##  - number of detected individuals:   23
##  - number of detected loci:      73055
## 
## For SNP info, please check /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.vcfsnp.
## 
## 1371 line(s) were removed because these are not SNPs.
## Please, check /Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.removed file, for more informations.

## [1] "/Users/tanyalama/vcftools/stringent_revised.vcf.recode.vcf.geno"

Data for scatterpie K=2

# get the cross-entropy value for each run at K = ?
ce <- cross.entropy(obj.snmf, K= 2) #we only did 1 run here for expediency. More realistically we would perform many runs and select the best one

# select the run with the lowest cross-entropy value for K = ?
best <- which.min(ce) # the best run is #1 at k=2

###at the end of the run, the qmatrix object contains the matrix of ancestry coefficients for each individual and for K = ? clusters. The Q-matrix has 23 rows and 3 columns, and it is traditionally displayed using a barplot representation. For this representation, we just use the barplot function of R.

qmatrixK2 = Q(obj.snmf, K = 2, run = best)

barplot(t(qmatrixK2),col=c("red","orange", "yellow", "green", "blue", "violet"),border = NA, space = .2,xlab = "Individuals", ylab = "Admixture coefficients", main = "Ancestry matrix", horiz = FALSE, names.arg = c("UMA2","UMA35","RS32","UMA20","UMA39","UMA19","RS104","UMA13","UMA5", "UMA3", "UMA26", "RS33","UMA23", "UMA40", "UMA8", "UMA6", "UMA9", "UMA38", "UMA17", "UMA41", "UMA37", "UMA21", "UMA42"),cex.names=0.65, las = 2)

setwd("/Users/tanyalama/Box Sync/Squirrel") #set the working directory
write.csv(qmatrixK2, "qmatrixK2.csv") #export the qmatrix to use for mapping
coords<- as.matrix(read.csv("/Users/tanyalama/Box Sync/Squirrel/squirrel_coords.csv"))
coords[,1]<- c("UMA2","UMA35","RS32","UMA20","UMA39","UMA19","RS104","UMA13","UMA5", "UMA3", "UMA26", "RS33","UMA23", "UMA40", "UMA8", "UMA6", "UMA9", "UMA38", "UMA17", "UMA41", "UMA37", "UMA21", "UMA42")
new<- as.matrix(read.csv("/Users/tanyalama/Box Sync/Squirrel/qmatrixK2.csv"))
newmatrix<- new[,2:3]
data = as.data.frame(cbind(coords, newmatrix))
data$region <- factor(1:23)
colnames(data)[4:5] = c("A", "B")

###as character numeric instead of as.numeric
data = data %>% transform(Latitude =
                          as.numeric(as.character(Latitude)),
                          Longitude =
                          as.numeric(as.character(Longitude)),
                          A=
                          as.numeric(as.character(A)),
                          B =
                          as.numeric(as.character(B)), 
                          region = 
                          as.numeric(as.character(region)))
glimpse(data)

## Observations: 23
## Variables: 6
## $ ID        <fct> UMA2, UMA35, RS32, UMA20, UMA39, UMA19, RS104, UMA13...
## $ Latitude  <dbl> 44.27211, 44.29657, 44.29654, 44.27189, 44.29617, 44...
## $ Longitude <dbl> -71.26064, -71.35224, -71.35094, -71.25702, -71.3508...
## $ A         <dbl> 0.998914, 0.980764, 0.981060, 0.989389, 0.995713, 0....
## $ B         <dbl> 0.001086210, 0.019236000, 0.018939500, 0.010611300, ...
## $ region    <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1...

Plotting admixture coeffients for K= 2

#pie chart
p <-ggplot() + 
  scale_y_continuous(limits = c(44.265, 44.31)) +
  scale_x_continuous(limits = c(-71.36, -71.25)) +
  geom_scatterpie(aes(x=Longitude, y=Latitude,
              group=region, r=.002), data=data,
              cols=LETTERS[1:2], 
              color=NA, 
              alpha = 0.75) +
  theme_classic() + 
  coord_equal()

p + geom_text(aes(x=Longitude, y=Latitude, label = ID), data = data, size = 1.5, position = position_jitter(width=.002, height=.002))

#zoom in to view the clustered areas in the upper left corner
p <-ggplot() + 
  scale_y_continuous(limits = c(44.29, 44.30)) +
  scale_x_continuous(limits = c(-71.355, -71.342)) +
  geom_scatterpie(aes(x=Longitude, y=Latitude,
              group=region, r=.00025), data=data,
              cols=LETTERS[1:2], color=NA, alpha = 0.75) + 
  theme_classic() + 
  coord_equal()

p + geom_text(aes(x=Longitude, y=Latitude, label = ID), data = data, size = 1.5, position = position_jitter(width=0.0001, height=0.00015))

Data for scatterpie K=3

# get the cross-entropy value for each run at K = ?
ce <- cross.entropy(obj.snmf, K= 3) #we only did 1 run here for expediency. More realistically we would perform many runs and select the best one

# select the run with the lowest cross-entropy value for K = ?
best <- which.min(ce) # the best run is #1 at k=2

###at the end of the run, the qmatrix object contains the matrix of ancestry coefficients for each individual and for K = ? clusters. The Q-matrix has 23 rows and 3 columns, and it is traditionally displayed using a barplot representation. For this representation, we just use the barplot function of R.

qmatrixK3 = Q(obj.snmf, K = 3, run = best)

barplot(t(qmatrixK3),col=c("red","orange", "yellow", "green", "blue", "violet"),border = NA, space = .2,xlab = "Individuals", ylab = "Admixture coefficients", main = "Ancestry matrix", horiz = FALSE, names.arg = c("UMA2","UMA35","RS32","UMA20","UMA39","UMA19","RS104","UMA13","UMA5", "UMA3", "UMA26", "RS33","UMA23", "UMA40", "UMA8", "UMA6", "UMA9", "UMA38", "UMA17", "UMA41", "UMA37", "UMA21", "UMA42"),cex.names=0.65, las = 2)

setwd("/Users/tanyalama/Box Sync/Squirrel") #set the working directory
write.csv(qmatrixK3, "qmatrixK3.csv") #export the qmatrix to use for mapping
coords<- as.matrix(read.csv("/Users/tanyalama/Box Sync/Squirrel/squirrel_coords.csv"))
coords[,1]<- c("UMA2","UMA35","RS32","UMA20","UMA39","UMA19","RS104","UMA13","UMA5", "UMA3", "UMA26", "RS33","UMA23", "UMA40", "UMA8", "UMA6", "UMA9", "UMA38", "UMA17", "UMA41", "UMA37", "UMA21", "UMA42")
new<- as.matrix(read.csv("/Users/tanyalama/Box Sync/Squirrel/qmatrixK3.csv"))
newmatrix<- new[,2:4]
data = as.data.frame(cbind(coords, newmatrix))
data$region <- factor(1:23)
colnames(data)[4:6] = c("A", "B", "C")

###as character numeric instead of as.numeric
data = data %>% transform(Latitude =
                          as.numeric(as.character(Latitude)),
                          Longitude =
                          as.numeric(as.character(Longitude)),
                          A=
                          as.numeric(as.character(A)),
                          B =
                          as.numeric(as.character(B)),
                          C =
                          as.numeric(as.character(C)), 
                          region = 
                          as.numeric(as.character(region)))
glimpse(data)

## Observations: 23
## Variables: 7
## $ ID        <fct> UMA2, UMA35, RS32, UMA20, UMA39, UMA19, RS104, UMA13...
## $ Latitude  <dbl> 44.27211, 44.29657, 44.29654, 44.27189, 44.29617, 44...
## $ Longitude <dbl> -71.26064, -71.35224, -71.35094, -71.25702, -71.3508...
## $ A         <dbl> 0.000099990, 0.000099990, 0.000099990, 0.040334600, ...
## $ B         <dbl> 0.99980000, 0.98603700, 0.98371900, 0.95263100, 0.05...
## $ C         <dbl> 0.000099990, 0.013862900, 0.016181400, 0.007034230, ...
## $ region    <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1...

Plotting admixture coeffients for K=3

#pie chart
p <-ggplot() + 
  scale_y_continuous(limits = c(44.265, 44.31)) +
  scale_x_continuous(limits = c(-71.36, -71.25)) +
  geom_scatterpie(aes(x=Longitude, y=Latitude,
              group=region, r=.002), data=data,
              cols=LETTERS[1:3], 
              color=NA, 
              alpha = 0.75) +
  theme_classic() + 
  coord_equal()

p + geom_text(aes(x=Longitude, y=Latitude, label = ID), data = data, size = 1.5, position = position_jitter(width=.002, height=.002))

#zoom in to view the clustered areas in the upper left corner
p <-ggplot() + 
  scale_y_continuous(limits = c(44.29, 44.30)) +
  scale_x_continuous(limits = c(-71.355, -71.342)) +
  geom_scatterpie(aes(x=Longitude, y=Latitude,
              group=region, r=.00025), data=data,
              cols=LETTERS[1:3], color=NA, alpha = 0.75) + 
  theme_classic() + 
  coord_equal()

p + geom_text(aes(x=Longitude, y=Latitude, label = ID), data = data, size = 1.5, position = position_jitter(width=0.0001, height=0.00015))

Data for scatterpie K=4

# get the cross-entropy value for each run at K = ?
ce <- cross.entropy(obj.snmf, K= 4) #we only did 1 run here for expediency. More realistically we would perform many runs and select the best one

# select the run with the lowest cross-entropy value for K = ?
best <- which.min(ce) # the best run is #1 at k=2

###at the end of the run, the qmatrix object contains the matrix of ancestry coefficients for each individual and for K = ? clusters. The Q-matrix has 23 rows and 3 columns, and it is traditionally displayed using a barplot representation. For this representation, we just use the barplot function of R.

qmatrixK4 = Q(obj.snmf, K = 4, run = best)

barplot(t(qmatrixK4),col=c("red","orange", "yellow", "green", "blue", "violet"),border = NA, space = .2,xlab = "Individuals", ylab = "Admixture coefficients", main = "Ancestry matrix", horiz = FALSE, names.arg = c("UMA2","UMA35","RS32","UMA20","UMA39","UMA19","RS104","UMA13","UMA5", "UMA3", "UMA26", "RS33","UMA23", "UMA40", "UMA8", "UMA6", "UMA9", "UMA38", "UMA17", "UMA41", "UMA37", "UMA21", "UMA42"),cex.names=0.65, las = 2)

setwd("/Users/tanyalama/Box Sync/Squirrel") #set the working directory
write.csv(qmatrixK4, "qmatrixK4.csv") #export the qmatrix to use for mapping
coords<- as.matrix(read.csv("/Users/tanyalama/Box Sync/Squirrel/squirrel_coords.csv"))
coords[,1]<- c("UMA2","UMA35","RS32","UMA20","UMA39","UMA19","RS104","UMA13","UMA5", "UMA3", "UMA26", "RS33","UMA23", "UMA40", "UMA8", "UMA6", "UMA9", "UMA38", "UMA17", "UMA41", "UMA37", "UMA21", "UMA42")
new<- as.matrix(read.csv("/Users/tanyalama/Box Sync/Squirrel/qmatrixK4.csv"))
newmatrix<- new[,2:5]
data = as.data.frame(cbind(coords, newmatrix))
data$region <- factor(1:23)
colnames(data)[4:7] = c("A", "B", "C", "D")

###as character numeric instead of as.numeric
data = data %>% transform(Latitude =
                          as.numeric(as.character(Latitude)),
                          Longitude =
                          as.numeric(as.character(Longitude)),
                          A=
                          as.numeric(as.character(A)),
                          B =
                          as.numeric(as.character(B)),
                          C =
                          as.numeric(as.character(C)), 
                          D =
                          as.numeric(as.character(D)),
                          region = 
                          as.numeric(as.character(region)))
glimpse(data)

## Observations: 23
## Variables: 8
## $ ID        <fct> UMA2, UMA35, RS32, UMA20, UMA39, UMA19, RS104, UMA13...
## $ Latitude  <dbl> 44.27211, 44.29657, 44.29654, 44.27189, 44.29617, 44...
## $ Longitude <dbl> -71.26064, -71.35224, -71.35094, -71.25702, -71.3508...
## $ A         <dbl> 0.991961000, 0.986073000, 0.984528000, 0.950046000, ...
## $ B         <dbl> 0.000099982, 0.013727300, 0.011416500, 0.003487200, ...
## $ C         <dbl> 0.000099982, 0.000099982, 0.003955320, 0.041933400, ...
## $ D         <dbl> 0.007839170, 0.000099982, 0.000099991, 0.004533510, ...
## $ region    <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1...

Plotting admixture coeffients for K=4

#pie chart
p <-ggplot() + 
  scale_y_continuous(limits = c(44.265, 44.31)) +
  scale_x_continuous(limits = c(-71.36, -71.25)) +
  geom_scatterpie(aes(x=Longitude, y=Latitude,
              group=region, r=.002), data=data,
              cols=LETTERS[1:4], 
              color=NA, 
              alpha = 0.75) +
  theme_classic() + 
  coord_equal()

p + geom_text(aes(x=Longitude, y=Latitude, label = ID), data = data, size = 1.5, position = position_jitter(width=.002, height=.002))

#zoom in to view the clustered areas in the upper left corner
p <-ggplot() + 
  scale_y_continuous(limits = c(44.29, 44.30)) +
  scale_x_continuous(limits = c(-71.355, -71.342)) +
  geom_scatterpie(aes(x=Longitude, y=Latitude,
              group=region, r=.00025), data=data,
              cols=LETTERS[1:4], color=NA, alpha = 0.75) + 
  theme_classic() + 
  coord_equal()

p + geom_text(aes(x=Longitude, y=Latitude, label = ID), data = data, size = 1.5, position = position_jitter(width=0.0001, height=0.00015))

#zoom in to view the clustered areas in the lower right corner

p <-ggplot() + 
  scale_y_continuous(limits = c(44.27, 44.28)) +
  scale_x_continuous(limits = c(-71.26, -71.25)) +
  geom_scatterpie(aes(x=Longitude, y=Latitude,
              group=region, r=.00025), data=data,
              cols=LETTERS[1:4], color=NA, alpha = 0.75) + 
  theme_classic() + 
  coord_equal()

p + geom_text(aes(x=Longitude, y=Latitude, label = ID), data = data, size = 1.5, position = position_jitter(width=0.0001, height=0.00015))

Data for scatterpie K=5

# get the cross-entropy value for each run at K = ?
ce <- cross.entropy(obj.snmf, K= 5) #we only did 1 run here for expediency. More realistically we would perform many runs and select the best one

# select the run with the lowest cross-entropy value for K = ?
best <- which.min(ce) # the best run is #1 at k=2

# at the end of the run, the qmatrix object contains the matrix of ancestry coefficients for each individual and for K = ? clusters. The Q-matrix has 23 rows and 3 columns, and it is traditionally displayed using a barplot representation. For this representation, we just use the barplot function of R.

qmatrixK5 = Q(obj.snmf, K = 5, run = best)

barplot(t(qmatrixK5),col=c("red","orange", "yellow", "green", "blue", "violet"),border = NA, space = .2,xlab = "Individuals", ylab = "Admixture coefficients", main = "Ancestry matrix", horiz = FALSE, names.arg = c("UMA2","UMA35","RS32","UMA20","UMA39","UMA19","RS104","UMA13","UMA5", "UMA3", "UMA26", "RS33","UMA23", "UMA40", "UMA8", "UMA6", "UMA9", "UMA38", "UMA17", "UMA41", "UMA37", "UMA21", "UMA42"),cex.names=0.65, las = 2)

setwd("/Users/tanyalama/Box Sync/Squirrel") #set the working directory
write.csv(qmatrixK5, "qmatrixK5.csv") #export the qmatrix to use for mapping
coords<- as.matrix(read.csv("/Users/tanyalama/Box Sync/Squirrel/squirrel_coords.csv"))
coords[,1]<- c("UMA2","UMA35","RS32","UMA20","UMA39","UMA19","RS104","UMA13","UMA5", "UMA3", "UMA26", "RS33","UMA23", "UMA40", "UMA8", "UMA6", "UMA9", "UMA38", "UMA17", "UMA41", "UMA37", "UMA21", "UMA42")
new<- as.matrix(read.csv("/Users/tanyalama/Box Sync/Squirrel/qmatrixK5.csv"))
newmatrix<- new[,2:6]
data = as.data.frame(cbind(coords, newmatrix))
data$region <- factor(1:23)
colnames(data)[4:8] = c("A", "B", "C", "D", "E")

# as character numeric instead of as.numeric
data = data %>% transform(Latitude =
                          as.numeric(as.character(Latitude)),
                          Longitude =
                          as.numeric(as.character(Longitude)),
                          A=
                          as.numeric(as.character(A)),
                          B =
                          as.numeric(as.character(B)),
                          C =
                          as.numeric(as.character(C)), 
                          D =
                          as.numeric(as.character(D)),
                          E =
                          as.numeric(as.character(E)),
                          region = 
                          as.numeric(as.character(region)))
glimpse(data)

## Observations: 23
## Variables: 9
## $ ID        <fct> UMA2, UMA35, RS32, UMA20, UMA39, UMA19, RS104, UMA13...
## $ Latitude  <dbl> 44.27211, 44.29657, 44.29654, 44.27189, 44.29617, 44...
## $ Longitude <dbl> -71.26064, -71.35224, -71.35094, -71.25702, -71.3508...
## $ A         <dbl> 0.617800000, 0.998393000, 0.163619000, 0.975919000, ...
## $ B         <dbl> 0.006003330, 0.000099973, 0.000099991, 0.000099973, ...
## $ C         <dbl> 0.375997000, 0.000099973, 0.796412000, 0.000099973, ...
## $ D         <dbl> 0.000099982, 0.001306730, 0.022468600, 0.000099973, ...
## $ E         <dbl> 0.000099982, 0.000099973, 0.017400100, 0.023780900, ...
## $ region    <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1...

Plotting admixture coeffients for K=5

#pie chart
p <-ggplot() + 
  scale_y_continuous(limits = c(44.265, 44.31)) +
  scale_x_continuous(limits = c(-71.36, -71.25)) +
  geom_scatterpie(aes(x=Longitude, y=Latitude,
              group=region, r=.002), data=data,
              cols=LETTERS[1:5], 
              color=NA, 
              alpha = 0.75) +
  theme_classic() + 
  coord_equal()

p + geom_text(aes(x=Longitude, y=Latitude, label = ID), data = data, size = 1.5, position = position_jitter(width=.002, height=.002))

#zoom in to view the clustered areas in the upper left corner
p <-ggplot() + 
  scale_y_continuous(limits = c(44.29, 44.30)) +
  scale_x_continuous(limits = c(-71.355, -71.342)) +
  geom_scatterpie(aes(x=Longitude, y=Latitude,
              group=region, r=.00025), data=data,
              cols=LETTERS[1:5], color=NA, alpha = 0.75) + 
  theme_classic() + 
  coord_equal()

p + geom_text(aes(x=Longitude, y=Latitude, label = ID), data = data, size = 1.5, position = position_jitter(width=0.0001, height=0.00015))

#zoom in to view the clustered areas in the lower right corner
p <-ggplot() + 
  scale_y_continuous(limits = c(44.27, 44.28)) +
  scale_x_continuous(limits = c(-71.26, -71.25)) +
  geom_scatterpie(aes(x=Longitude, y=Latitude,
              group=region, r=.00025), data=data,
              cols=LETTERS[1:5], color=NA, alpha = 0.75) + 
  theme_classic() + 
  coord_equal()

p + geom_text(aes(x=Longitude, y=Latitude, label = ID), data = data, size = 1.5, position = position_jitter(width=0.0001, height=0.00015))

Data for scatterpie K=6

# get the cross-entropy value for each run at K = ?
ce <- cross.entropy(obj.snmf, K= 6) #we only did 1 run here for expediency. More realistically we would perform many runs and select the best one

# select the run with the lowest cross-entropy value for K = ?
best <- which.min(ce) # the best run is #1 at k=2

###at the end of the run, the qmatrix object contains the matrix of ancestry coefficients for each individual and for K = ? clusters. The Q-matrix has 23 rows and 3 columns, and it is traditionally displayed using a barplot representation. For this representation, we just use the barplot function of R.

qmatrixK6 = Q(obj.snmf, K = 6, run = best)

barplot(t(qmatrixK6),col=c("red","orange", "yellow", "green", "blue", "violet"),border = NA, space = .2,xlab = "Individuals", ylab = "Admixture coefficients", main = "Ancestry matrix", horiz = FALSE, names.arg = c("UMA2","UMA35","RS32","UMA20","UMA39","UMA19","RS104","UMA13","UMA5", "UMA3", "UMA26", "RS33","UMA23", "UMA40", "UMA8", "UMA6", "UMA9", "UMA38", "UMA17", "UMA41", "UMA37", "UMA21", "UMA42"),cex.names=0.65, las = 2)

setwd("/Users/tanyalama/Box Sync/Squirrel") #set the working directory
write.csv(qmatrixK6, "qmatrixK6.csv") #export the qmatrix to use for mapping
coords<- as.matrix(read.csv("/Users/tanyalama/Box Sync/Squirrel/squirrel_coords.csv"))
coords[,1]<- c("UMA2","UMA35","RS32","UMA20","UMA39","UMA19","RS104","UMA13","UMA5", "UMA3", "UMA26", "RS33","UMA23", "UMA40", "UMA8", "UMA6", "UMA9", "UMA38", "UMA17", "UMA41", "UMA37", "UMA21", "UMA42")
new<- as.matrix(read.csv("/Users/tanyalama/Box Sync/Squirrel/qmatrixK6.csv"))
newmatrix<- new[,2:7]
data = as.data.frame(cbind(coords, newmatrix))
data$region <- factor(1:23)
colnames(data)[4:9] = c("A", "B", "C", "D", "E", "F")

###as character numeric instead of as.numeric
data = data %>% transform(Latitude =
                          as.numeric(as.character(Latitude)),
                          Longitude =
                          as.numeric(as.character(Longitude)),
                          A=
                          as.numeric(as.character(A)),
                          B =
                          as.numeric(as.character(B)),
                          C =
                          as.numeric(as.character(C)), 
                          D =
                          as.numeric(as.character(D)),
                          E =
                          as.numeric(as.character(E)),
                          F =
                          as.numeric(as.character(F)),
                          region = 
                          as.numeric(as.character(region)))
glimpse(data)

## Observations: 23
## Variables: 10
## $ ID        <fct> UMA2, UMA35, RS32, UMA20, UMA39, UMA19, RS104, UMA13...
## $ Latitude  <dbl> 44.27211, 44.29657, 44.29654, 44.27189, 44.29617, 44...
## $ Longitude <dbl> -71.26064, -71.35224, -71.35094, -71.25702, -71.3508...
## $ A         <dbl> 0.000099964, 0.011511400, 0.000099982, 0.000099964, ...
## $ B         <dbl> 0.000099964, 0.002679540, 0.005480750, 0.000099964, ...
## $ C         <dbl> 0.086260300, 0.985509000, 0.913868000, 0.979801000, ...
## $ D         <dbl> 0.000099964, 0.000099973, 0.001940720, 0.019798900, ...
## $ E         <dbl> 0.913340000, 0.000099973, 0.078510200, 0.000099964, ...
## $ F         <dbl> 0.000099964, 0.000099973, 0.000099982, 0.000099964, ...
## $ region    <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1...

Plotting admixture coeffients for K=6

#pie chart
p <-ggplot() + 
  scale_y_continuous(limits = c(44.265, 44.31)) +
  scale_x_continuous(limits = c(-71.36, -71.25)) +
  geom_scatterpie(aes(x=Longitude, y=Latitude,
              group=region, r=.002), data=data,
              cols=LETTERS[1:6], 
              color=NA, 
              alpha = 0.75) +
  theme_classic() + 
  coord_equal()

p + geom_text(aes(x=Longitude, y=Latitude, label = ID), data = data, size = 1.5, position = position_jitter(width=.002, height=.002))

#zoom in to view the clustered areas in the upper left corner
p <-ggplot() + 
  scale_y_continuous(limits = c(44.29, 44.30)) +
  scale_x_continuous(limits = c(-71.355, -71.342)) +
  geom_scatterpie(aes(x=Longitude, y=Latitude,
              group=region, r=.00025), data=data,
              cols=LETTERS[1:6], color=NA, alpha = 0.75) + 
  theme_classic() + 
  coord_equal()

p + geom_text(aes(x=Longitude, y=Latitude, label = ID), data = data, size = 1.5, position = position_jitter(width=0.0001, height=0.00015))

#zoom in to view the clustered areas in the lower right corner
p <-ggplot() + 
  scale_y_continuous(limits = c(44.27, 44.28)) +
  scale_x_continuous(limits = c(-71.26, -71.25)) +
  geom_scatterpie(aes(x=Longitude, y=Latitude,
              group=region, r=.00025), data=data,
              cols=LETTERS[1:6], color=NA, alpha = 0.75) + 
  theme_classic() + 
  coord_equal()

p + geom_text(aes(x=Longitude, y=Latitude, label = ID), data = data, size = 1.5, position = position_jitter(width=0.0001, height=0.00015))