The example is split into 2 Parts:
Part 1 must be completed first to create a file,
SNPs_cleaned.csv, that has been completely prepared for
analysis.
Now in Part 2, you will analyze the data with PCA. The steps here will be:
scale())prcomp())screeplot())vegan::scores())In the code below all code is provided. Your tasks will be to do 4 things:
Load the vcfR package with library()
library(vcfR) # KEY
##
## ***** *** vcfR *** *****
## This is vcfR 1.13.0
## browseVignettes('vcfR') # Documentation
## citation('vcfR') # Citation
## ***** ***** ***** *****
library(vegan)
## Loading required package: permute
## Loading required package: lattice
## This is vegan 2.6-4
library(ggplot2)
library(ggpubr)
Set the working directory
getwd()
## [1] "/Users/danyajung/Desktop/CB /all_loci.vcf"
list.files()
## [1] "07-mean_imputation-2.html"
## [2] "07-mean_imputation-2.Rmd"
## [3] "08-PCA_worked.html"
## [4] "08-PCA_worked.Rmd"
## [5] "09-PCA_worked_example-SNPs-part1.html"
## [6] "09-PCA_worked_example-SNPs-part1.Rmd"
## [7] "10-PCA_worked_example-SNPs-part2.html"
## [8] "10-PCA_worked_example-SNPs-part2.Rmd"
## [9] "11.21443531-21683531.ALL.chr11_GRCh38.genotypes.20170504.vcf"
## [10] "11.21443531-21683531.ALL.chr11_GRCh38.genotypes.20170504.vcf.zip"
## [11] "1540_final_project_Final_Report_template.pdf"
## [12] "1540_final_report_flowchart.pdf"
## [13] "2.136483646-136733646.ALL.chr2_GRCh38.genotypes.20170504.vcf.gz"
## [14] "all_loci-2.vcf"
## [15] "all_loci.csv"
## [16] "all_loci.numbers"
## [17] "all_loci.txt"
## [18] "all_loci.vcf.txt"
## [19] "allomtery_3_scatterplot3d (1).Rmd"
## [20] "bird_snps_remove_NAs.Rmd"
## [21] "center_function.R"
## [22] "cluster_analysis_portfolio.Rmd"
## [23] "feature_engineering_intro_2_functions-part2.Rmd"
## [24] "feature_engineering.Rmd"
## [25] "final_report_template.Rmd"
## [26] "fst_exploration_in_class.Rmd"
## [27] "portfolio_ggpubr_intro-2-2.Rmd"
## [28] "portfolio_ggpubr_intro-2.Rmd"
## [29] "portfolio_ggpubr_log_transformation-2.Rmd"
## [30] "portfolio_ggpubr_log_transformation.Rmd"
## [31] "RPubs Portfolio.Rmd"
## [32] "RPubs Portfolio.Rmd 2"
## [33] "RPubs Portfolio.Rmd.zip"
## [34] "rsconnect"
## [35] "RStudio-2022.07.2-576.dmg"
## [36] "SNPs_cleaned.csv"
## [37] "test.Rmd"
## [38] "transpose_VCF_data-2.Rmd"
## [39] "transpose_VCF_data-3.Rmd"
## [40] "true.txt"
## [41] "walsh2017morphology.csv"
## [42] "walsh2017morphology.numbers"
## [43] "walsh2017morphology.RData"
## [44] "working_directory_practice-2.Rmd"
## [45] "working_directory_practice-3.Rmd"
## [46] "working_directory_practice-4.html"
## [47] "working_directory_practice-4.Rmd"
list.files(pattern = "vcf")
## [1] "11.21443531-21683531.ALL.chr11_GRCh38.genotypes.20170504.vcf"
## [2] "11.21443531-21683531.ALL.chr11_GRCh38.genotypes.20170504.vcf.zip"
## [3] "2.136483646-136733646.ALL.chr2_GRCh38.genotypes.20170504.vcf.gz"
## [4] "all_loci-2.vcf"
## [5] "all_loci.vcf.txt"
Load the data
SNPs_cleaned <- read.csv(file = "SNPs_cleaned.csv")
warning("If this didn't work, its may be because you didn't set your working directory.")
## Warning: If this didn't work, its may be because you didn't set your working
## directory.
center the data by using scale().
SNPs_scaled <- scale(SNPs_cleaned)
run PCA analysis by using prcomp().
pca_scaled <- prcomp(SNPs_scaled)
the scree plot from the PCA by using screelot().
TODO: UPDATE PLOT WITH TITLE
screeplot(pca_scaled,
ylab = "Relative importance",
main = "give me a basic title")
TODO: INTERPRET SCREEPLOT
the amount of variation explained by the first 2 PCs.
summary_out_scaled <- summary(pca_scaled)
PCA_variation <- function(pca_summary, PCs = 2){
var_explained <- pca_summary$importance[2,1:PCs]*100
var_explained <- round(var_explained,1)
return(var_explained)
}
var_out <- PCA_variation(summary_out_scaled,PCs = 10)
N_columns <- ncol(SNPs_scaled)
barplot(var_out,
main = "Percent variation Scree plot",
ylab = "Percent variation explained")
abline(h = 1/N_columns*100, col = 2, lwd = 2)
TODO: INTERPRET VARIANCE EXPLAINED
biplot and interpret the relationship between the features.
biplot(pca_scaled)
TODO: EXPLAIN WHY THIS IS A BAD IDEA
the PCA scores for plotting (vegan::scores())
pca_scores <- vegan::scores(pca_scaled)
pop_id <- c("Nel","Nel","Nel","Nel","Nel","Nel","Nel","Nel",
"Nel", "Nel", "Nel", "Nel", "Nel", "Nel", "Nel", "Alt",
"Alt", "Alt", "Alt", "Alt", "Alt", "Alt", "Alt", "Alt",
"Alt", "Alt", "Alt", "Alt", "Alt", "Alt", "Sub", "Sub",
"Sub", "Sub", "Sub", "Sub", "Sub", "Sub", "Sub", "Sub",
"Sub", "Cau", "Cau", "Cau", "Cau", "Cau", "Cau", "Cau",
"Cau", "Cau", "Cau", "Cau", "Cau", "Div", "Div", "Div",
"Div", "Div", "Div", "Div", "Div", "Div", "Div", "Div",
"Div", "Div", "Div", "Div")
Combine the scores with the species information into a dataframe.
pca_scores2 <- data.frame(pop_id,
pca_scores)
Plot the scores, with species color-coded
TODO: UPDATE PLOT WITH TITLE TODO: UPDATE X and Y AXES WITH AMOUNT OF VARIATION EXPLAINED
ggpubr::ggscatter(data = pca_scores2,
y = "PC2",
x = "PC1",
color = "pop_id",
shape = "pop_id",
xlab = "PC1 (10% variation)",
ylab = "PC2 (10% variation)",
main = "PCA summary")
TODO: INTERPRET PLOT