we'll visualize our anlayses fromscikit-allel in R, using the extremely powerful ggplot2 library from Hadley Wickham et al. We'll start by installing this package (if you haven't already!) and loading it
#install.packages("ggplot2")
#install.packages("magrittr")
#install.packages("dplyr")
library(ggplot2)
library(magrittr)
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.0.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
Next, we'll read in the .csv file we exported from scikit-allel.
setwd("Z:/Documents/tomatech/genotyping_PCA/")
chrom_list = as.list(as.character(0:9))
# list('LR213635.1', 'LR213631.1', 'LR213632.1', 'LR213628.1', 'LR213634.1',
# 'LR213627.1','LR213630.1', 'LR213636.1', 'LR213633.1', 'LR213629.1')
names(chrom_list) = chrom_list
pca_df_list = lapply(chrom_list, function(x){read.csv(paste0("pca_",x,".csv"))})
for(i in 1:length(pca_df_list)){
colnames(pca_df_list[[i]]) <- c("PC1","PC2")
}
Note that the echo = FALSE
parameter was added to the code chunk to prevent printing of the R code that generated the plot.
head(pca_df_list[[10]])
## PC1 PC2
## 1 -1.8384562 -6.409675
## 2 -0.4720422 -4.945479
## 3 -0.9169236 -4.263043
## 4 -2.5951414 -6.157716
## 5 -2.4604540 -4.513585
## 6 5.9095254 -14.421318
load some sample data, and merge it with the PCA dataframe:
library(readxl)
sample_data = as.data.frame(read_excel("strain_name_id_list_tst.xlsx"))
df_w_secies = list()
for (i in names(pca_df_list)){
df_w_secies[[i]] = cbind(pca_df_list[[i]],sample_data)
}
head(df_w_secies[[2]])
## PC1 PC2 index name
## 1 -24.701872 -6.323840 RSP10697 Black1
## 2 -22.097443 -10.707070 RSP11035 Black1
## 3 -22.378721 -10.849516 RSP11036 Black1
## 4 7.571184 -1.761531 RSP10603 Black1
## 5 8.574120 -3.410433 RSP11346 Black1
## 6 -7.413929 6.822155 RSP10005 Blue1
create a polygon (convex hull) to highlight putative species ID in PC space:
hull = list()
for (i in names(df_w_secies)){
hull[[i]] = list()
for (j in unique(df_w_secies[[i]]$name)){
hull[[i]][[j]] <- df_w_secies[[i]][df_w_secies[[i]]$name==j,] %>%
slice(chull(PC1, PC2))
}
}
nsamples = length(unique(sample_data$name))
for (i in names(df_w_secies)){
p1 <- ggplot(data=df_w_secies[[i]],aes(x=PC1,y=PC2,fill=name,color=name)) +
geom_point(pch=1) +
scale_fill_manual(values = c(colorspace::rainbow_hcl(nsamples)))+
scale_color_manual(values = c(colorspace::rainbow_hcl(nsamples)), guide=FALSE)+
theme_classic() +
xlab("Genotype PC1") +
ylab("Genotype PC2") +
geom_polygon(data = hull[[i]][[1]], alpha = 0.5) +
geom_polygon(data = hull[[i]][[2]], alpha = 0.5) +
geom_polygon(data = hull[[i]][[3]], alpha = 0.5) +
geom_polygon(data = hull[[i]][[4]], alpha = 0.5) +
geom_polygon(data = hull[[i]][[5]], alpha = 0.5) +
geom_polygon(data = hull[[i]][[6]], alpha = 0.5) +
geom_polygon(data = hull[[i]][[7]], alpha = 0.5) +
#geom_polygon(data = hull[[i]][[8]], alpha = 0.5) +
#geom_polygon(data = hull_ochr, alpha = 0.5) +
labs(fill="Species") +
ggtitle(paste0("Chrom ",i)) +
theme(legend.text=element_text(size=15),
axis.title = element_text(size=15),
axis.text = element_text(size=15),
legend.title=element_text(size=15))
print(p1)
}