Filtering

FILTERING STEPS IN R

1. Download librairies

library(hierfstat)
library(mapdata)
library(viridis)
library(tidyr)
library(adegenet)
library(vcfR)
library(ggplot2)
library(poppr)

Download vcf.

vcf <- vcfR::read.vcfR("00-Data/filtered_2719neutral_snps.recode.vcf")

Download pop information.

pop <- read.table("00-Data/population_map_sea_cucumber.txt", header=TRUE, sep="\t", stringsAsFactors = TRUE)

Transform vcf to genind.

genind <- vcfR2genind(vcf)

Define population map.

genind@pop <- pop$STRATA

Remove loci with too much missing data.

loci <- missingno(genind, type = "loci", cutoff = 0.05, quiet = FALSE, freq = FALSE)
loci

## /// GENIND OBJECT /////////
## 
##  // 717 individuals; 1,790 loci; 3,580 alleles; size: 10.8 Mb
## 
##  // Basic content
##    @tab:  717 x 3580 matrix of allele counts
##    @loc.n.all: number of alleles per locus (range: 2-2)
##    @loc.fac: locus factor for the 3580 columns of @tab
##    @all.names: list of allele names for each locus
##    @ploidy: ploidy of each individual  (range: 2-2)
##    @type:  codom
##    @call: .local(x = x, i = i, j = j, drop = drop)
## 
##  // Optional content
##    @pop: population of each individual (group size range: 20-34)

Remove individuals with too much missing data.

loci.individuals <- missingno(loci, type = "geno", cutoff = 0.05, quiet = FALSE, freq = FALSE)
loci.individuals

## /// GENIND OBJECT /////////
## 
##  // 688 individuals; 1,790 loci; 3,580 alleles; size: 10.4 Mb
## 
##  // Basic content
##    @tab:  688 x 3580 matrix of allele counts
##    @loc.n.all: number of alleles per locus (range: 2-2)
##    @loc.fac: locus factor for the 3580 columns of @tab
##    @all.names: list of allele names for each locus
##    @ploidy: ploidy of each individual  (range: 2-2)
##    @type:  codom
##    @call: .local(x = x, i = i, j = j, drop = drop)
## 
##  // Optional content
##    @pop: population of each individual (group size range: 18-32)

Apply a Minor Allele Frequency (MAF) criteria.

loci.individuals.maf <- informloci(loci.individuals, cutoff = 2/nInd(loci.individuals), MAF = 0.01, quiet = FALSE)
loci.individuals.maf

Check the distribution of the heterozygosity (Het). First create a hierfstat object with the function genind2hierfstat.

loci.individuals.maf.dat <- genind2hierfstat(loci.individuals.maf)

Perform basic statistics on the hierfstat object.

basicstat <- basic.stats(loci.individuals.maf.dat, diploid = TRUE, digits = 2) 
Het <- as.data.frame(basicstat$perloc)

Check the distribution of observed heterozygosity (Ho).

ggplot(Het, aes(x=Ho))+
  geom_histogram(binwidth = 0.01,color="black", fill="white")+
  theme_classic()

Check the markers with Ho < 0.45.

het_0.4 <- Het[which(Het$Ho  < 0.45), ]
dim(het_0.4)

Save the name of this loci that you want to keep (with Ho < 0.45) in a vector.

het_loci <- as.vector(rownames(het_0.4))

Remove the X that hierfstat gave to each SNP ID.

het_loci <- gsub('X', '', het_loci)

Keep only SNPs with Het < 0.45.

loci.individuals.maf.het <- loci.individuals.maf[loc=het_loci]

POPULATION STRUCTURE IN R

DPCA

1. DPCA without a priori

Use Bayesian Information Criterion on your filtered dataset. The find.cluster functions implement the clustering procedure used in Discriminant Analysis of Principal Components (DAPC). This procedure consists in running successive K-means with an increasing number of clusters (K), after transforming data using a principal component analysis (PCA). For each model, a statistical measure of goodness of fit (by default, BIC) is computed, which allows to choose the optimal k.

grp_all <- find.clusters(loci.individuals.maf.het, max.n.clust=20, n.pca=nInd(loci.individuals)/3) # select N/3

You will see some figure like below. Select the model with the lowest BIC.

BIC: Select the optimal number of K

Select K=2 as the best number of clusters.

grp_all <- find.clusters(loci.individuals.maf, n.pca=nInd(loci.individuals)/3,n.da=3, n.clust=2) #

Observe the statistics.

grp_all$Kstat

Q1.How many individuals do you observe in these groups?

Observe the size of the group.

grp_all$size

Perform the DPCA.

dapc <- dapc(loci.individuals.maf,grp_all$grp,n.pca=nInd(loci.individuals)/3,n.da=1)

Discriminant analysis eigenvalue

Give a colorblind palette.

col2 <- c("#88CCEE", "#CC6677")

Observe the results.

dpca_result <- scatter(dapc, col=col2)

Visualize your results with complot. Observe the membership probability.

compoplot(dapc,col=col2,cleg=.6, posi=list(x=0,y=1.2), lab=loci.individuals.maf.het@pop)

Extract DPCA information.

dapc$IND <- row.names(dapc$ind.coord)
dapc_info <- as.data.frame(cbind(dapc$IND, dapc$ind.coord, grp_all$grp))
colnames(dapc_info) <- c("IND","DPC1", "K")

Add site information.

dapc_info$SITE <- substr(dapc_info$IND, 1,3)

2. Create a map of population structure

Import geographic coordinates file.

sites <-read.table("00-Data/24sites_lat_long.txt",header=TRUE, dec=".",sep="\t",na.strings="NA",strip.white=TRUE)
summary(sites)

Download the map for the sea cucumber.

wH <- map_data("worldHires",  xlim=c(-140,-120), ylim=c(45,60))

Merge this information and the dapc information.

dpca_geo <- merge(dapc_info, sites, by="SITE")
dpca_geo$DPC1 <- as.numeric(as.character(dpca_geo$DPC1))

Create the map.

x_title="Longitude"
y_title="Latitude"
graph1 <- ggplot() +
  geom_polygon(data = wH, aes(x=long, y = lat, group = group), fill = "gray80", color = NA) +
  coord_fixed(xlim=c(-140,-120), ylim=c(45,60), ratio=1.2)+
  theme(panel.background = element_rect(fill = "white", colour = "black"),
        axis.title = element_blank())+
  geom_point(aes(x = LONG, y = LAT, fill=DPC1), data=dpca_geo,shape = 21, size=2)+
  xlab("Longitude") + ylab("Latitude") +
  scale_fill_gradient(low="white",high="red")+
  theme_bw()+theme(legend.position = "none",
                   panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
  labs(y=y_title)+  
  theme(axis.text.x=element_text(colour="black"))+
  theme(axis.text.y=element_text(colour="black"))
graph1

Save the map.

ggsave("Map_dpca_seacumcumber.pdf",width=10, height=10)

Q2. What kind of population structure do you observe?

3. DPCA with a priori

Find optimal alpha value. As indicated in adegenet tutorial: "The trade-off between power of discrimination and over-fitting can be measured by the a- score, which is simply the difference between the proportion of successful reassignment of the analysis (observed discrimination) and values obtained using random groups (random discrimination). It can be seen as the proportion of successful reassignment corrected for the number of retained PCs. It is implemented by a.score, which relies on repeating the DAPC analysis using randomized groups, and computing a-scores for each group, as well as the average a-score. So it’s totally normal that you will find slighly differences in the a-score calculation. It is why here I used the set.seed command.

set.seed(5); dapc_a_score <- dapc(loci.individuals.maf.het,n.pca=nInd(loci.individuals.maf.het)/3,n.da=10) 
temp_score <- optim.a.score(dapc_a_score)

For small number of clusters (K < 10), all eigenvalues can be retained since all discriminant functions can be examined without difficulty. When more clusters are analysed, it is likely that the first few dimensions will carry more information than the others and should be retained. Run the DAPC with a priori using the number of PCs retained following the alpha score.

dapc2 <-dapc(loci.individuals.maf.het,loci.individuals.maf.het@pop, n.pca=112, n.da=3)

Quickly visualize the DPCA.

scatter(dapc2, bg="white", scree.da=FALSE, legend=TRUE, solid=.4)

Save the results of the DPCA with a priori in a file.

names(dapc2)
head(dapc2$var.contr)
load_dpca2 <- as.data.frame(dapc2$var.contr)
write.table(load_dpca2, "Loadings_SNPs.txt", sep="\t", row.names=FALSE, quote=FALSE)

Analyse what percentage of genetic variance is explained by each axis.

pdf("Percent_explained_by_each_PCs.pdf")
percent= dapc2$eig/sum(dapc2$eig)*100
barplot(percent, ylab="Percent of genetic variance explained by eigenvectors", names.arg=round(percent,2))
dev.off()

Q3. What percentage of variation is explained by the first and the second axes?

4. Data visualization with ggplot2

Write the results of the DPCA and modify these results in excel in order to get a file such as “PC_results_adegenet.txt”.

dapc_prior=as.data.frame(dapc2$ind.coord)
write.table(dapc_prior, "DCPA_prior_results.txt", quote=F, sep="\t", row.names=TRUE)

Add information to the tab results of the DPCA.

dapc_prior$IND <- row.names(dapc_prior)

Add sampling site information.

dapc_prior$SITE <- substr(dapc_prior$IND, 1,3)

Add geographical information.

dpca_prior_geo <- merge(dapc_prior, sites, by="SITE")

Make a ggplot graph representing the DAPC for the first and second axes for the regions.

ggplot(dpca_prior_geo, aes(x=LD1, y=LD2, fill=LAT))+ 
  geom_point(size=2, pch=21)+
  scale_fill_gradient(low="white",high="red")+
  labs(x="DA1 (32.6%)")+
  labs(y="DA2 (5.2%)")+
  theme_classic()

Save the ggplot graph.

ggsave("DPCA_prior_seacumcumber.pdf",width=10,height=10,dpi=600,units="cm",useDingbats=F)

ADMIXTURE

Using ADMIXTURE, a custom program for SNP datasets only, we aimed to define the genetic units present in two fish species: Sebastes faciatus and S. mentella.

ADMIXTURE estimates individual ancestries by efficiently computing maximum likelihood estimates in a parametric model. To better understand this program, read the manual page Alexander 2011.

1. Run the ADMIXTURE program in your TERMINAL

** Prepare a .bed file** from your vcf file. To do so, first use VCFTOOLS http://vcftools.sourceforge.net in the terminal.

vcftools --vcf filtered_2719neutral_snps.recode.vcf --plink-tped --out filtered_2719neutral_snps.recode

The command --plink-tped will produce three files: - a .tped file with genotype information coded as binary - a .tfam file, which is the first six columns of a .ped file, i.e., Family ID, Individual ID, Paternal ID, Maternal ID, Sex and Phenotype

Then, use PLINK http://zzz.bwh.harvard.edu/plink/.

plink --tped filtered_2719neutral_snps.recode.tped --tfam filtered_2719neutral_snps.recode.tfam

The command --make-bed will produce three files: - a binary ped file (.bed) - the pedigree/phenotype information file (.fam) - an extended MAP file (*.bim) that contains information about the allele names, which would otherwise be lost in the .bed file

Run Admixture in bash. Go into the folder where your bed file is. Run admixture on your .bed file in your terminal by typing:

for K in echo $(seq 25) ; do admixture --cv=10 -B2000 -j8 
filtered_2719neutral_snps.recode.bed $K | tee log${K}.out;done

Usually the maximum number of K - to test as a first step - is selected based on the number of sampling locations that you have. Here we had 24 sampling locations (n = 24), so we first tested a K max = n + 1 = 25.

Collect the cross-validation information obtained from all the log files.

grep -h CV log*.out > cross_validation.txt
done

Extract the right order for individual id from your vcf file, using the tfam file information.

cut -f 1 filtered_2719neutral_snps.recode.tfam > id_admixture.txt
done

2. Find the optimal number of clusters

Using the R environment, first check the percent of error due to the number of genetic clusters inferred. Several methods are used for defining the optimal number of clusters.

Keep in mind to not over interpreting the number of clusters defined by this analysis and to complement the ADMIXTURE approach with a Principal Component Analysisor a Discriminant Principal Component Analysis. Indeed ADMIXTURE,which is similar to STRUCTURE, is based on model assumptions that do not always follow the biological reality of your dataset. See Lawson et al https://www.nature.com/articles/s41467-018-05257-7 for careful advice on how to interpret ADMIXTURE results.

In R, download libraries:

library(stringr)
library(ggplot2)
library(dplyr)

Download the cross-validation results you have previously created via bash command.

cv <- read.table("00-Data/cross_validation.txt")

Analyze the cross-validation results Then, add a K-cluster column indicating the number of K you test and select only two columns of interest, CV and K.

cv$K <-gsub("[\\(\\)]", "", regmatches(cv$V3, gregexpr("\\(.*?\\)", cv$V3)))
CV <- select(cv, V4,K)
CV2 <- separate(data = CV, col = K, into = c("K", "CLUSTER"), sep = "=")

Rename your two columns CV and K-cluster.

colnames(CV2) <- c("CV","K","CLUSTER")

Allow R to sort as numeric and not alphabetic.

CV2$CLUSTER <- as.numeric(as.character(CV2$CLUSTER))

Make a graph showing the cross-validation results. Then select the optimal number of clusters regarding: - the lowest cross validation error - when the cross-validation error decreases the most

graph_title="Cross-Validation plot"
x_title="K"
y_title="Cross-validation error"
ggplot(CV2, aes(x=CLUSTER,y=CV,group=1))+
  geom_point()+
  geom_line(linetype = "dashed")+
  labs(title=graph_title)+
  labs(x=x_title)+
  labs(y=y_title)+
  theme_classic()

Save the graph.

ggsave("Admixture_cross-validation.pdf",width=7,height=5,dpi=600)
dev.off()

Q4. What would be the optimal number of K that you will keep?

3. Analyse Q estimates results

In this section, the goal is to delineate how the percent of ancestry is distributed to each genetic cluster found, and this for each individual.

Assignment of clusters can sometimes be trivial even when any clear population structure patterns tend to emerge or even when the vast majority of the individuals tends to show less than 50% of ancestry to one genetic cluster (suggesting admixed populations).

One common mistake is to have an unbalanced number of samples per population. This brings out a high number of indivdiuals that tend to belong to this large population instead of other populations, see Puechmaille et al (2016).

Here we run admixture using K = 2

admixture filtered_2719neutral_snps.recode.bed 2
done

Then go in your R environment and download libraries:

library(reshape2)
library(plyr)
library(stringr)
library(ggplot2)
library(tidyverse)
library(RColorBrewer)

Read file.Q with the Q = the optimal K

admixture <- read.table("00-Data/2719snps_717ind.2.Q")

Add one column with the individuals names and the population they belong to.

admixture <- cbind(pop,admixture)

Rename columns.

colnames(admixture) <- c("IND","POP","K1","K2")

Transform the admixture object into a long format.

admixture_long <- melt(admixture,id.vars=c("IND","POP"),variable.name="ANCESTRY",value.name="PERC")
names(admixture_long)
class(admixture_long$ANCESTRY)
levels(admixture_long$ANCESTRY)

Subset only the individuals with about 50% of ancestry with one genetic cluster

admixture_50 <- subset(admixture, subset=admixture$K1>0.40&admixture$K1<0.60)
dim(admixture_50)

Make a graph with the ADMIXTURE results.

graph_title="Stacked barplot of Admixture analysis in species"
x_title="Individuals"
y_title="Ancestry"
graph_1<-ggplot(admixture_long,aes(x=POP,y=PERC,fill=ANCESTRY))
graph_1+geom_bar(stat="identity")+
scale_fill_manual(values=col2, name= "K", labels=c("K1","K2"))+ 
  labs(y=y_title)+
  labs(x=x_title)+
  theme_classic()

Save the graph.

ggsave("Sea_cumcumber.pdf",width=15,height=10,dpi=600,units="cm",useDingbats=F)

Estimate to which each genetic group each individual belongs by looking at its maximum % of ancestry.

admixture[, "max"] <- apply(admixture[, 3:4], 1, max)
head(admixture)

Check the individuals could be clearly inferred to belong to one genetic cluster.

admixture_subset <- subset(admixture, subset=admixture$max >= 0.9)

Q5. How many individuals are not well assigned (ancestry less than 0.70 to one genetic cluster)?

Create a pop map regarding the cluster found.

admixture$CLUSTER <- colnames(admixture)[apply(admixture,1,which.max)]
admixture_results <- select(admixture, IND, CLUSTER)
head(admixture_results)

Check the genetic clusters found per sampling sites in term of the number of individuals

table(admixture$CLUSTER, admixture$POP)

Save the files.

write.table(admixture_results, 'Admixture_results_K2.txt',quote=FALSE,   row.names=FALSE, sep="\t", dec=".")

4. Produce nice graphs from ADMIXTURE results

Install the library.

library(pophelper)

Check pophelperversion.

packageDescription("pophelper", fields="Version")

Import the .Q files.

slist <- readQ("00-Data/2719snps_717ind.2.Q",filetype="basic")

Import labels for ADMIXTURE runs.

labset <- read.table("00-Data/population_map_sea_cucumber.txt", header=TRUE,stringsAsFactors=F)
labset_order = labset[,2,drop=FALSE]
labset_order$STRATA <- as.character(labset_order$STRATA)

Check if labels are a character data type.

sapply(labset, is.character)
class(labset)

Create a qplot for K = 2 considering two species.

slist1 <- alignK(slist[1]) 
plotQ(slist1,  clustercol= col2,grplab = labset_order, grplabsize=3,
      showsp=FALSE,ordergrp=T,imgtype="pdf",
      showlegend=TRUE, legendpos="right", legendkeysize = 6, legendtextsize = 6,
      legendmargin=c(2,2,2,0), width=20, height=4,sortind="all")

Admixture_results_K2