Workflow

Loading packages/libraries

install.packages("vcfR", repos = "https://cloud.r-project.org")

## Installing package into 'C:/Users/anina/AppData/Local/R/win-library/4.2'
## (as 'lib' is unspecified)

## package 'vcfR' successfully unpacked and MD5 sums checked

## Warning: cannot remove prior installation of package 'vcfR'

## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying C:
## \Users\anina\AppData\Local\R\win-library\4.2\00LOCK\vcfR\libs\x64\vcfR.dll to C:
## \Users\anina\AppData\Local\R\win-library\4.2\vcfR\libs\x64\vcfR.dll: Permission
## denied

## Warning: restored 'vcfR'

## 
## The downloaded binary packages are in
##  C:\Users\anina\AppData\Local\Temp\RtmpWCMxTT\downloaded_packages

library(vcfR)

## Warning: package 'vcfR' was built under R version 4.2.2

## 
##    *****       ***   vcfR   ***       *****
##    This is vcfR 1.13.0 
##      browseVignettes('vcfR') # Documentation
##      citation('vcfR') # Citation
##    *****       *****      *****       *****

library(vegan)

## Warning: package 'vegan' was built under R version 4.2.2

## Loading required package: permute

## Loading required package: lattice

## This is vegan 2.6-4

library(ggpubr)

## Loading required package: ggplot2

confirm working directory/files, load vcf

getwd()

## [1] "C:/Users/anina/Documents/biosc1540/Final Submission"

list.files(pattern = "vcf")

## [1] "mySNPs.vcf.gz"   "vcf_num.csv"     "vcf_num_df.csv"  "vcf_num_df2.csv"
## [5] "vcf_scaled.csv"

vcf <- vcfR::read.vcfR(file = "mySNPs.vcf.gz",convertNA = T)

## Scanning file to determine attributes.
## File attributes:
##   meta lines: 130
##   header_line: 131
##   variant count: 892
##   column count: 2513
## 
Meta line 130 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
##   Character matrix gt rows: 892
##   Character matrix gt cols: 2513
##   skip: 0
##   nrows: 892
##   row_num: 0
## 
Processed variant: 892
## All variants processed

extract genotype score

vcf_num <- vcfR::extract.gt(vcf, # key
element= "GT",
IDtoRowNames = F,
as.numeric= T,
convertNA = T)

saving as csv, confirming save

write.csv(vcf_num, file= "vcf_num.csv", row.names= F)
list.files(pattern = "csv")

## [1] "1000genomes_people_info2-1-1.csv" "vcf_num.csv"                     
## [3] "vcf_num_df.csv"                   "vcf_num_df2.csv"                 
## [5] "vcf_scaled.csv"

transposing original vcf & converting to R dataframe

vcf_num_t <- t(vcf_num)

vcf_num_df <- data.frame(vcf_num_t)

sample<- row.names(vcf_num_df)

vcf_num_df <- data.frame(sample, vcf_num_df)

dim(vcf_num_df)

## [1] 2504  893

saving as csv again (and confirming save)

getwd()

## [1] "C:/Users/anina/Documents/biosc1540/Final Submission"

write.csv(vcf_num_df, file= "vcf_num_df.csv", row.names= F)
list.files()

##  [1] "09-PCA_worked_example-SNPs-part1.Rmd"
##  [2] "1000genomes_people_info2-1-1.csv"    
##  [3] "Final Submission.Rproj"              
##  [4] "final_report_template-3.docx"        
##  [5] "final_report_template-3.html"        
##  [6] "final_report_template-3.Rmd"         
##  [7] "Final_workflow.Rmd"                  
##  [8] "mySNPs.vcf.gz"                       
##  [9] "rsconnect"                           
## [10] "vcf_num.csv"                         
## [11] "vcf_num_df.csv"                      
## [12] "vcf_num_df2.csv"                     
## [13] "vcf_scaled.csv"

Merge data with population meta data Data obtained from https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/integrated_call_samples_v3.20130502.ALL.panel

pop_meta <- read.csv(file = "1000genomes_people_info2-1-1.csv")

merging metadata with SNP data

#make sure that the column "sample" is appearing as expected
names(pop_meta)

## [1] "pop"       "super_pop" "sample"    "sex"       "lat"       "lng"

names(vcf_num_df)[1:10]

##  [1] "sample" "X1"     "X2"     "X3"     "X4"     "X5"     "X6"     "X7"    
##  [9] "X8"     "X9"

vcf_num_df2 <- merge(pop_meta, vcf_num_df, by= "sample")

check dim/names to ensure merging occured properly

nrow(vcf_num_df) == nrow(vcf_num_df2)

## [1] TRUE

names(vcf_num_df2)[1:15]

##  [1] "sample"    "pop"       "super_pop" "sex"       "lat"       "lng"      
##  [7] "X1"        "X2"        "X3"        "X4"        "X5"        "X6"       
## [13] "X7"        "X8"        "X9"

save & double check again

getwd()

## [1] "C:/Users/anina/Documents/biosc1540/Final Submission"

write.csv(vcf_num_df2, file= "vcf_num_df2.csv", row.names= F)
list.files()

##  [1] "09-PCA_worked_example-SNPs-part1.Rmd"
##  [2] "1000genomes_people_info2-1-1.csv"    
##  [3] "Final Submission.Rproj"              
##  [4] "final_report_template-3.docx"        
##  [5] "final_report_template-3.html"        
##  [6] "final_report_template-3.Rmd"         
##  [7] "Final_workflow.Rmd"                  
##  [8] "mySNPs.vcf.gz"                       
##  [9] "rsconnect"                           
## [10] "vcf_num.csv"                         
## [11] "vcf_num_df.csv"                      
## [12] "vcf_num_df2.csv"                     
## [13] "vcf_scaled.csv"

Omitting invariant features using funtion provided by Dr. Nathan Brouwer

invar_omit <- function(x){
cat("Dataframe of dim",dim(x), "processed ... \n")
sds <- apply(x, 2, sd, na.rm = TRUE)
i_var0 <- which(sds == 0)
cat(length(i_var0),"columns removed\n")
if(length(i_var0) > 0){
x <- x[, -i_var0]
}
return(x)
}

running the function- double check so that it isn’t run on columns containing character data

names(vcf_num_df2)[1:10]

##  [1] "sample"    "pop"       "super_pop" "sex"       "lat"       "lng"      
##  [7] "X1"        "X2"        "X3"        "X4"

#the first 6 columns have character data, so a new object will be used to store the dataframe minus the invariants without modifying the character data
vcf_noinvar <- vcf_num_df2
vcf_noinvar[, -c(1:6)] <- invar_omit(vcf_noinvar[, -c(1:6)])

## Dataframe of dim 2504 892 processed ... 
## 229 columns removed

Storing the number of invariant columns removed

my_meta_N_invar_cols <- 229

finding NAs and removing them using Dr. Brouwer’s function/methodology:

find_NAs <- function(x){
NAs_TF <- is.na(x)
i_NA <- which(NAs_TF == TRUE)
N_NA <- length(i_NA)
return(i_NA)
}


# N_rows
# number of rows (individuals)
N_rows <- nrow(vcf_noinvar)
# N_NA
# vector to hold output (number of NAs)
N_NA <- rep(x = 0, times= N_rows)
# N_SNPs
# total number of columns (SNPs)
N_SNPs <- ncol(vcf_noinvar)

# the for() Loop
for(i in 1:N_rows){
# for each row, find the Location of
## NAs with bird_snps_num_t()
i_NA <- find_NAs(vcf_noinvar[i,])
# then determine how many NAs
## with Length()
N_NA_i <- length(i_NA)
# then save the output to
## our storage vector
N_NA[i] <- N_NA_i
}

instead of checking which rows have many NAs, I will now check if any rows have even one NA. 1000 genomes project has very few NAs, after all.

#2 ways of verifying that no columns have NAs:

#method 1: the length of the vector containing those column indexes where Number of NAs is not 0. i.e., how many columns have any NAs at all.
length(which(N_NA != 0))

## [1] 0

#method 2: verifying if the length of the vector containing the number of NAs per column equals  the length of the vector containing the columns where number of NAs = 0. i.e., the number of columns with 0 NAs is the same as the number of total columns
(length(N_NA) == length(which(N_NA == 0)))

## [1] TRUE

both tests indicate that the data has 0 NAs, so mean imputation is not necessary. Now, the data is ready to be scaled so that it can be used in PCA. The character columns should not be included when scaling, of course.

vcf_scaled <- vcf_noinvar
vcf_scaled[,-c(1:6)] <- scale(vcf_noinvar[,-c(1:6)])

save & double check again

getwd()

## [1] "C:/Users/anina/Documents/biosc1540/Final Submission"

write.csv(vcf_scaled, file= "vcf_scaled.csv", row.names= F)
list.files(pattern = "csv")

## [1] "1000genomes_people_info2-1-1.csv" "vcf_num.csv"                     
## [3] "vcf_num_df.csv"                   "vcf_num_df2.csv"                 
## [5] "vcf_scaled.csv"

now that the data is scaled, the prcomp() function can be used on the (numerical columns of the) data frame.

vcf_pca <- prcomp(vcf_scaled[,-c(1:6)])

the output can then be examined using a screeplot

screeplot(vcf_pca)

to calculate the explained variation, a specialized function will be used

PCA_variation <- function(pca_summary, PCs= 2){
var_explained <- pca_summary$importance[2,1:PCs]*100
var_explained <- round(var_explained,3)
return(var_explained)
}

the function requires PCA variation data, which will be pulled from the summary

vcf_pca_summary <- summary(vcf_pca)
var_out <- PCA_variation(vcf_pca_summary, PCs= 500)

using the cutoff rule of thumb, the number of relevant PCAs can be approximated

# number of dimensions in the data
N_columns <- ncol(vcf_scaled)
# The value of the cutoff
cut_off <- (1/N_columns)*100

#Calculate the number PCs which exceed the cut off:
# which values below the cutoff
i_cut_off <- which(var_out < cut_off)
# what is first value below cutoff?
i_cut_off <- min(i_cut_off)
#Save the the first value below the cutoff
my_meta_N_meanNA_rowsPCs <- i_cut_off

extracting the amount of variation explained by the first 3 PCs, for reference:

my_meta_var_PC123 <- var_out[c(1,2,3)]

plotting the % variation

# make barplot
barplot(var_out,
main= "Percent variation(%) Scree plot",
ylab = "Percent variation(%) explained",
names.arg = 1:length(var_out))
abline(h = cut_off, col= 2, lwd = 2)
abline(v = i_cut_off)
legend("topright",
col= c(2,1),
lty = c(1,1),
legend = c( "Vertical line: cutoff",
"Horizontal line: 1st value below cut off"))

because there are so many bars, it may be more useful to view this as an accumulation of % explained:

cumulative_variation <- cumsum(var_out)
plot(cumulative_variation, type= "l")

Plotting the PCA results:

first, getting the scores and combining them with the species info

vcf_pca_scores <- vegan::scores(vcf_pca)
vcf_pca_scores2 <- data.frame(super_pop = vcf_noinvar$super_pop,vcf_pca_scores)

examining the variation from the first 3 PCs:

my_meta_var_PC123[1]

##   PC1 
## 5.665

my_meta_var_PC123[2]

##   PC2 
## 1.724

my_meta_var_PC123[3]

##   PC3 
## 1.556

plotting scores, color coded by superpopulation

ggpubr::ggscatter(data = vcf_pca_scores2,
y = "PC1",
X = "PC2",
color = "super_pop", 
shape = "super_pop", 
main= "PCA Scatterplot",
xlab = "PC1 (1.5% of variation)",
ylab = "PC2 (1.1% of variation)")

ggpubr::ggscatter(data = vcf_pca_scores2,
y = "PC2",
X = "PC1",
color = "super_pop", 
shape = "super_pop", 
main= "PCA Scatterplot",
xlab = "PC1 (1.5% of variation)",
ylab = "PC2 (1.1% of variation)")

Workflow

2022-12-17