Loading packages/libraries
install.packages("vcfR", repos = "https://cloud.r-project.org")
## Installing package into 'C:/Users/anina/AppData/Local/R/win-library/4.2'
## (as 'lib' is unspecified)
## package 'vcfR' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'vcfR'
## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying C:
## \Users\anina\AppData\Local\R\win-library\4.2\00LOCK\vcfR\libs\x64\vcfR.dll to C:
## \Users\anina\AppData\Local\R\win-library\4.2\vcfR\libs\x64\vcfR.dll: Permission
## denied
## Warning: restored 'vcfR'
##
## The downloaded binary packages are in
## C:\Users\anina\AppData\Local\Temp\RtmpWCMxTT\downloaded_packages
library(vcfR)
## Warning: package 'vcfR' was built under R version 4.2.2
##
## ***** *** vcfR *** *****
## This is vcfR 1.13.0
## browseVignettes('vcfR') # Documentation
## citation('vcfR') # Citation
## ***** ***** ***** *****
library(vegan)
## Warning: package 'vegan' was built under R version 4.2.2
## Loading required package: permute
## Loading required package: lattice
## This is vegan 2.6-4
library(ggpubr)
## Loading required package: ggplot2
confirm working directory/files, load vcf
getwd()
## [1] "C:/Users/anina/Documents/biosc1540/Final Submission"
list.files(pattern = "vcf")
## [1] "mySNPs.vcf.gz" "vcf_num.csv" "vcf_num_df.csv" "vcf_num_df2.csv"
## [5] "vcf_scaled.csv"
vcf <- vcfR::read.vcfR(file = "mySNPs.vcf.gz",convertNA = T)
## Scanning file to determine attributes.
## File attributes:
## meta lines: 130
## header_line: 131
## variant count: 892
## column count: 2513
##
Meta line 130 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
## Character matrix gt rows: 892
## Character matrix gt cols: 2513
## skip: 0
## nrows: 892
## row_num: 0
##
Processed variant: 892
## All variants processed
extract genotype score
vcf_num <- vcfR::extract.gt(vcf, # key
element= "GT",
IDtoRowNames = F,
as.numeric= T,
convertNA = T)
saving as csv, confirming save
write.csv(vcf_num, file= "vcf_num.csv", row.names= F)
list.files(pattern = "csv")
## [1] "1000genomes_people_info2-1-1.csv" "vcf_num.csv"
## [3] "vcf_num_df.csv" "vcf_num_df2.csv"
## [5] "vcf_scaled.csv"
transposing original vcf & converting to R dataframe
vcf_num_t <- t(vcf_num)
vcf_num_df <- data.frame(vcf_num_t)
sample<- row.names(vcf_num_df)
vcf_num_df <- data.frame(sample, vcf_num_df)
dim(vcf_num_df)
## [1] 2504 893
saving as csv again (and confirming save)
getwd()
## [1] "C:/Users/anina/Documents/biosc1540/Final Submission"
write.csv(vcf_num_df, file= "vcf_num_df.csv", row.names= F)
list.files()
## [1] "09-PCA_worked_example-SNPs-part1.Rmd"
## [2] "1000genomes_people_info2-1-1.csv"
## [3] "Final Submission.Rproj"
## [4] "final_report_template-3.docx"
## [5] "final_report_template-3.html"
## [6] "final_report_template-3.Rmd"
## [7] "Final_workflow.Rmd"
## [8] "mySNPs.vcf.gz"
## [9] "rsconnect"
## [10] "vcf_num.csv"
## [11] "vcf_num_df.csv"
## [12] "vcf_num_df2.csv"
## [13] "vcf_scaled.csv"
Merge data with population meta data Data obtained from https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/integrated_call_samples_v3.20130502.ALL.panel
pop_meta <- read.csv(file = "1000genomes_people_info2-1-1.csv")
merging metadata with SNP data
#make sure that the column "sample" is appearing as expected
names(pop_meta)
## [1] "pop" "super_pop" "sample" "sex" "lat" "lng"
names(vcf_num_df)[1:10]
## [1] "sample" "X1" "X2" "X3" "X4" "X5" "X6" "X7"
## [9] "X8" "X9"
vcf_num_df2 <- merge(pop_meta, vcf_num_df, by= "sample")
check dim/names to ensure merging occured properly
nrow(vcf_num_df) == nrow(vcf_num_df2)
## [1] TRUE
names(vcf_num_df2)[1:15]
## [1] "sample" "pop" "super_pop" "sex" "lat" "lng"
## [7] "X1" "X2" "X3" "X4" "X5" "X6"
## [13] "X7" "X8" "X9"
save & double check again
getwd()
## [1] "C:/Users/anina/Documents/biosc1540/Final Submission"
write.csv(vcf_num_df2, file= "vcf_num_df2.csv", row.names= F)
list.files()
## [1] "09-PCA_worked_example-SNPs-part1.Rmd"
## [2] "1000genomes_people_info2-1-1.csv"
## [3] "Final Submission.Rproj"
## [4] "final_report_template-3.docx"
## [5] "final_report_template-3.html"
## [6] "final_report_template-3.Rmd"
## [7] "Final_workflow.Rmd"
## [8] "mySNPs.vcf.gz"
## [9] "rsconnect"
## [10] "vcf_num.csv"
## [11] "vcf_num_df.csv"
## [12] "vcf_num_df2.csv"
## [13] "vcf_scaled.csv"
Omitting invariant features using funtion provided by Dr. Nathan Brouwer
invar_omit <- function(x){
cat("Dataframe of dim",dim(x), "processed ... \n")
sds <- apply(x, 2, sd, na.rm = TRUE)
i_var0 <- which(sds == 0)
cat(length(i_var0),"columns removed\n")
if(length(i_var0) > 0){
x <- x[, -i_var0]
}
return(x)
}
running the function- double check so that it isn’t run on columns containing character data
names(vcf_num_df2)[1:10]
## [1] "sample" "pop" "super_pop" "sex" "lat" "lng"
## [7] "X1" "X2" "X3" "X4"
#the first 6 columns have character data, so a new object will be used to store the dataframe minus the invariants without modifying the character data
vcf_noinvar <- vcf_num_df2
vcf_noinvar[, -c(1:6)] <- invar_omit(vcf_noinvar[, -c(1:6)])
## Dataframe of dim 2504 892 processed ...
## 229 columns removed
Storing the number of invariant columns removed
my_meta_N_invar_cols <- 229
finding NAs and removing them using Dr. Brouwer’s function/methodology:
find_NAs <- function(x){
NAs_TF <- is.na(x)
i_NA <- which(NAs_TF == TRUE)
N_NA <- length(i_NA)
return(i_NA)
}
# N_rows
# number of rows (individuals)
N_rows <- nrow(vcf_noinvar)
# N_NA
# vector to hold output (number of NAs)
N_NA <- rep(x = 0, times= N_rows)
# N_SNPs
# total number of columns (SNPs)
N_SNPs <- ncol(vcf_noinvar)
# the for() Loop
for(i in 1:N_rows){
# for each row, find the Location of
## NAs with bird_snps_num_t()
i_NA <- find_NAs(vcf_noinvar[i,])
# then determine how many NAs
## with Length()
N_NA_i <- length(i_NA)
# then save the output to
## our storage vector
N_NA[i] <- N_NA_i
}
instead of checking which rows have many NAs, I will now check if any rows have even one NA. 1000 genomes project has very few NAs, after all.
#2 ways of verifying that no columns have NAs:
#method 1: the length of the vector containing those column indexes where Number of NAs is not 0. i.e., how many columns have any NAs at all.
length(which(N_NA != 0))
## [1] 0
#method 2: verifying if the length of the vector containing the number of NAs per column equals the length of the vector containing the columns where number of NAs = 0. i.e., the number of columns with 0 NAs is the same as the number of total columns
(length(N_NA) == length(which(N_NA == 0)))
## [1] TRUE
both tests indicate that the data has 0 NAs, so mean imputation is not necessary. Now, the data is ready to be scaled so that it can be used in PCA. The character columns should not be included when scaling, of course.
vcf_scaled <- vcf_noinvar
vcf_scaled[,-c(1:6)] <- scale(vcf_noinvar[,-c(1:6)])
save & double check again
getwd()
## [1] "C:/Users/anina/Documents/biosc1540/Final Submission"
write.csv(vcf_scaled, file= "vcf_scaled.csv", row.names= F)
list.files(pattern = "csv")
## [1] "1000genomes_people_info2-1-1.csv" "vcf_num.csv"
## [3] "vcf_num_df.csv" "vcf_num_df2.csv"
## [5] "vcf_scaled.csv"
now that the data is scaled, the prcomp() function can be used on the (numerical columns of the) data frame.
vcf_pca <- prcomp(vcf_scaled[,-c(1:6)])
the output can then be examined using a screeplot
screeplot(vcf_pca)
to calculate the explained variation, a specialized function will be used
PCA_variation <- function(pca_summary, PCs= 2){
var_explained <- pca_summary$importance[2,1:PCs]*100
var_explained <- round(var_explained,3)
return(var_explained)
}
the function requires PCA variation data, which will be pulled from the summary
vcf_pca_summary <- summary(vcf_pca)
var_out <- PCA_variation(vcf_pca_summary, PCs= 500)
using the cutoff rule of thumb, the number of relevant PCAs can be approximated
# number of dimensions in the data
N_columns <- ncol(vcf_scaled)
# The value of the cutoff
cut_off <- (1/N_columns)*100
#Calculate the number PCs which exceed the cut off:
# which values below the cutoff
i_cut_off <- which(var_out < cut_off)
# what is first value below cutoff?
i_cut_off <- min(i_cut_off)
#Save the the first value below the cutoff
my_meta_N_meanNA_rowsPCs <- i_cut_off
extracting the amount of variation explained by the first 3 PCs, for reference:
my_meta_var_PC123 <- var_out[c(1,2,3)]
plotting the % variation
# make barplot
barplot(var_out,
main= "Percent variation(%) Scree plot",
ylab = "Percent variation(%) explained",
names.arg = 1:length(var_out))
abline(h = cut_off, col= 2, lwd = 2)
abline(v = i_cut_off)
legend("topright",
col= c(2,1),
lty = c(1,1),
legend = c( "Vertical line: cutoff",
"Horizontal line: 1st value below cut off"))
because there are so many bars, it may be more useful to view this as an
accumulation of % explained:
cumulative_variation <- cumsum(var_out)
plot(cumulative_variation, type= "l")
Plotting the PCA results:
first, getting the scores and combining them with the species info
vcf_pca_scores <- vegan::scores(vcf_pca)
vcf_pca_scores2 <- data.frame(super_pop = vcf_noinvar$super_pop,vcf_pca_scores)
examining the variation from the first 3 PCs:
my_meta_var_PC123[1]
## PC1
## 5.665
my_meta_var_PC123[2]
## PC2
## 1.724
my_meta_var_PC123[3]
## PC3
## 1.556
plotting scores, color coded by superpopulation
ggpubr::ggscatter(data = vcf_pca_scores2,
y = "PC1",
X = "PC2",
color = "super_pop",
shape = "super_pop",
main= "PCA Scatterplot",
xlab = "PC1 (1.5% of variation)",
ylab = "PC2 (1.1% of variation)")
ggpubr::ggscatter(data = vcf_pca_scores2,
y = "PC2",
X = "PC1",
color = "super_pop",
shape = "super_pop",
main= "PCA Scatterplot",
xlab = "PC1 (1.5% of variation)",
ylab = "PC2 (1.1% of variation)")