# PCA demo
# Clear workspace
rm(list = ls())
# Set working directory
setwd("J:/2021_Genomics_Boot_Camp/Goat_Genomics_data")
# Change binary genotype to ped+map format
system("plink --bfile ADAPTmap_genotypeTOP_20160222_full --cow --nonfounders --allow-no-sex --recode --out ADAPTmap_TOP")
## [1] 0
# Run PLINK QC
system("plink --bfile ADAPTmap_genotypeTOP_20160222_full --cow --autosome --geno 0.1 --mind 0.1 --maf 0.05 --nonfounders --allow-no-sex --recode --out ADAPTmap_TOP")
## [1] 0
######################################
# Principal Component Analysis - PCA #
######################################
## Genetic distances between individuals
system("plink --cow --allow-no-sex --nonfounders --file ADAPTmap_TOP --distance-matrix --out dataForPCA")
## [1] 0
## Load data
dist_populations<-read.table("dataForPCA.mdist",header=F)
### Extract breed names
fam <- data.frame(famids=read.table("dataForPCA.mdist.id")[,1])
### Extract individual names
famInd <- data.frame(IID=read.table("dataForPCA.mdist.id")[,2])
## Perform PCA using the cmdscale function
# Time intensive step - takes a few minutes with the 4.5K animals
mds_populations <- cmdscale(dist_populations,eig=T,5)
## Extract the eigen vectors
eigenvec_populations <- cbind(fam,famInd,mds_populations$points)
## Proportion of variation captured by each eigen vector
eigen_percent <- round(((mds_populations$eig)/sum(mds_populations$eig))*100,2)
# Visualize PCA in tidyverse
# Load tidyverse
if (!require("tidyverse")) {
install.packages("tidyverse", dependencies = TRUE)
library(tidyverse)
}
## Loading required package: tidyverse
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2 v purrr 0.3.4
## v tibble 3.0.4 v dplyr 1.0.2
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# PCA plot
ggplot(data = eigenvec_populations) +
geom_point(mapping = aes(x = `1`, y = `2`,color = famids), show.legend = FALSE ) +
geom_hline(yintercept = 0, linetype="dotted") +
geom_vline(xintercept = 0, linetype="dotted") +
labs(title = "PCA of wordwide goat populations",
x = paste0("Principal component 1 (",eigen_percent[1]," %)"),
y = paste0("Principal component 2 (",eigen_percent[2]," %)")) +
theme_minimal()
