genotyping

Plotting PCA results from scikit-allel

we'll visualize our anlayses fromscikit-allel in R, using the extremely powerful ggplot2 library from Hadley Wickham et al. We'll start by installing this package (if you haven't already!) and loading it

#install.packages("ggplot2")
#install.packages("magrittr")
#install.packages("dplyr")
library(ggplot2)
library(magrittr)
library(dplyr)

## Warning: package 'dplyr' was built under R version 4.0.3

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Next, we'll read in the .csv file we exported from scikit-allel.

setwd("Z:/Documents/tomatech/genotyping_PCA/")
chrom_list = as.list(as.character(0:9))
  # list('LR213635.1', 'LR213631.1', 'LR213632.1', 'LR213628.1', 'LR213634.1',
  #                 'LR213627.1','LR213630.1', 'LR213636.1', 'LR213633.1', 'LR213629.1')
names(chrom_list) = chrom_list
pca_df_list = lapply(chrom_list, function(x){read.csv(paste0("pca_",x,".csv"))})

for(i in 1:length(pca_df_list)){
  colnames(pca_df_list[[i]]) <- c("PC1","PC2")
}

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

head(pca_df_list[[10]])

##          PC1        PC2
## 1 -1.8384562  -6.409675
## 2 -0.4720422  -4.945479
## 3 -0.9169236  -4.263043
## 4 -2.5951414  -6.157716
## 5 -2.4604540  -4.513585
## 6  5.9095254 -14.421318

load some sample data, and merge it with the PCA dataframe:

library(readxl)
sample_data = as.data.frame(read_excel("strain_name_id_list_tst.xlsx"))
df_w_secies = list()
for (i in names(pca_df_list)){
  df_w_secies[[i]] = cbind(pca_df_list[[i]],sample_data)
}

head(df_w_secies[[2]])

##          PC1        PC2    index   name
## 1 -24.701872  -6.323840 RSP10697 Black1
## 2 -22.097443 -10.707070 RSP11035 Black1
## 3 -22.378721 -10.849516 RSP11036 Black1
## 4   7.571184  -1.761531 RSP10603 Black1
## 5   8.574120  -3.410433 RSP11346 Black1
## 6  -7.413929   6.822155 RSP10005  Blue1

create a polygon (convex hull) to highlight putative species ID in PC space:

hull = list()
for (i in names(df_w_secies)){
  hull[[i]] = list()
  for (j in unique(df_w_secies[[i]]$name)){
    hull[[i]][[j]] <- df_w_secies[[i]][df_w_secies[[i]]$name==j,] %>%
      slice(chull(PC1, PC2))
    
  }
}

nsamples = length(unique(sample_data$name))
for (i in names(df_w_secies)){
  
  p1 <- ggplot(data=df_w_secies[[i]],aes(x=PC1,y=PC2,fill=name,color=name)) + 
  geom_point(pch=1) +
  scale_fill_manual(values = c(colorspace::rainbow_hcl(nsamples)))+
  scale_color_manual(values = c(colorspace::rainbow_hcl(nsamples)), guide=FALSE)+
  theme_classic() +
  xlab("Genotype PC1") +
  ylab("Genotype PC2") +
  geom_polygon(data = hull[[i]][[1]], alpha = 0.5) +
  geom_polygon(data = hull[[i]][[2]], alpha = 0.5) +
  geom_polygon(data = hull[[i]][[3]], alpha = 0.5) +
  geom_polygon(data = hull[[i]][[4]], alpha = 0.5) +
  geom_polygon(data = hull[[i]][[5]], alpha = 0.5) +
  geom_polygon(data = hull[[i]][[6]], alpha = 0.5) +
  geom_polygon(data = hull[[i]][[7]], alpha = 0.5) +
  #geom_polygon(data = hull[[i]][[8]], alpha = 0.5) +
  #geom_polygon(data = hull_ochr, alpha = 0.5) +
  labs(fill="Species") +
  ggtitle(paste0("Chrom ",i)) +
  theme(legend.text=element_text(size=15),
        axis.title = element_text(size=15),
        axis.text = element_text(size=15),
        legend.title=element_text(size=15))
  print(p1)
  
}