R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

Load necessary R packages

library(vcfR)
## 
##    *****       ***   vcfR   ***       *****
##    This is vcfR 1.13.0 
##      browseVignettes('vcfR') # Documentation
##      citation('vcfR') # Citation
##    *****       *****      *****       *****
library(vegan)
## Loading required package: permute
## Loading required package: lattice
## This is vegan 2.6-4
library(ggplot2)
library(ggpubr)

Set and confirm working directory

setwd("/Users/mansiavunoori/Documents/FinalProject")

getwd()
## [1] "/Users/mansiavunoori/Documents/FinalProject"
list.files(pattern = "vcf") 
## [1] "mansi_snps.vcf.gz" "vcf_num_df.csv"    "vcf_num_df2"      
## [4] "vcf_num.csv"       "vcf_scaled.csv"

Load VCF data and file

my_vcf <- "mansi_snps.vcf.gz"

vcf <- vcfR::read.vcfR(my_vcf, convertNA = T)
## Scanning file to determine attributes.
## File attributes:
##   meta lines: 130
##   header_line: 131
##   variant count: 6956
##   column count: 2513
## 
Meta line 130 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
##   Character matrix gt rows: 6956
##   Character matrix gt cols: 2513
##   skip: 0
##   nrows: 6956
##   row_num: 0
## 
Processed variant 1000
Processed variant 2000
Processed variant 3000
Processed variant 4000
Processed variant 5000
Processed variant 6000
Processed variant: 6956
## All variants processed

Convert raw VCF file to genotype scores

vcf_num <- vcfR::extract.gt(vcf, element = "GT", IDtoRowNames = F, as.numeric = T, convertNA = T)

Save csv file and confirm presence of file

write.csv(vcf_num, file = "vcf_num.csv", row.names = F)
list.files()
##  [1] "1000genomes_people_info2-1.csv"     "Final Project Work Flow_Mansi.Rmd" 
##  [3] "final_report_Mansi.Rmd"             "Final-Project-Work-Flow_Mansi.html"
##  [5] "Final-Project-Work-Flow_Mansi.Rmd"  "mansi_snps.vcf.gz"                 
##  [7] "rsconnect"                          "tester"                            
##  [9] "vcf_num_df.csv"                     "vcf_num_df2"                       
## [11] "vcf_num.csv"                        "vcf_scaled.csv"

Transpose original VCF to R dataframe

vcf_num_t <- t(vcf_num)

Make into DF

vcf_num_df <- data.frame(vcf_num_t)

Get sample names

sample <- row.names(vcf_num_df)

Add samples to DF

vcf_num_df <- data.frame(sample, vcf_num_df)

Check working directory

getwd()
## [1] "/Users/mansiavunoori/Documents/FinalProject"

Save csv and check for file

write.csv(vcf_num_df, file = "vcf_num_df.csv", row.names = F)
list.files()
##  [1] "1000genomes_people_info2-1.csv"     "Final Project Work Flow_Mansi.Rmd" 
##  [3] "final_report_Mansi.Rmd"             "Final-Project-Work-Flow_Mansi.html"
##  [5] "Final-Project-Work-Flow_Mansi.Rmd"  "mansi_snps.vcf.gz"                 
##  [7] "rsconnect"                          "tester"                            
##  [9] "vcf_num_df.csv"                     "vcf_num_df2"                       
## [11] "vcf_num.csv"                        "vcf_scaled.csv"

Clean data - Merge data with population meta data

pop_meta <- read.csv(file = "1000genomes_people_info2-1.csv")

Make sure the column “sample” appears in the meta data and SNP data

names(pop_meta)
## [1] "pop"       "super_pop" "sample"    "sex"       "lat"       "lng"

Merge two sets of data - Check dimensions and names

vcf_num_df2 <- merge(pop_meta, vcf_num_df, by = "sample")
nrow(vcf_num_df2) == nrow(vcf_num_df) 
## [1] TRUE

Check names of new dataframe

names(vcf_num_df2)[1:15]
##  [1] "sample"    "pop"       "super_pop" "sex"       "lat"       "lng"      
##  [7] "X1"        "X2"        "X3"        "X4"        "X5"        "X6"       
## [13] "X7"        "X8"        "X9"

Check working directory, Save the csv, and Confirm presence of file

getwd()
## [1] "/Users/mansiavunoori/Documents/FinalProject"
write.csv(vcf_num_df2, file = "vcf_num_df2", row.names = F)
list.files()
##  [1] "1000genomes_people_info2-1.csv"     "Final Project Work Flow_Mansi.Rmd" 
##  [3] "final_report_Mansi.Rmd"             "Final-Project-Work-Flow_Mansi.html"
##  [5] "Final-Project-Work-Flow_Mansi.Rmd"  "mansi_snps.vcf.gz"                 
##  [7] "rsconnect"                          "tester"                            
##  [9] "vcf_num_df.csv"                     "vcf_num_df2"                       
## [11] "vcf_num.csv"                        "vcf_scaled.csv"

Omar invariant features

invar_omit <- function(x){
cat("Dataframe of dim", dim(x), "processed...\n") 
sds <- apply(x, 2, sd, na.rm = TRUE)
i_var0 <- which(sds == 0)

cat(length(i_var0), "columns removed\n")

if (length(i_var0) > 0) {
  x <- x[, -i_var0]
}
return(x)
}

Check which columns have character data

names(vcf_num_df2)[1:10]
##  [1] "sample"    "pop"       "super_pop" "sex"       "lat"       "lng"      
##  [7] "X1"        "X2"        "X3"        "X4"

New dataframe to store output and Run invar_omit() on numeric data

vcf_noinvar <- vcf_num_df2
vcf_noinvar[, -c(1:6)] <- invar_omit(vcf_noinvar[, -c(1:6)])
## Dataframe of dim 2504 6956 processed...
## 1780 columns removed

Create an object to store the number of invariant columns removed

my_meta_N_invar_cols <- 1780

Remove low-quality data - load find_NAs()

find_NAs <- function(x){
  NAs_TF <- is.na(x)
  i_NA <- which(NAs_TF == TRUE)
  N_NA <- length(i_NA)
  
  return(i_NA)
}

for() loop - search for NAs

N_rows <- nrow(vcf_noinvar)
N_NA <- rep(x = 0, times = N_rows)
N_SNPs <- ncol(vcf_noinvar)
cat("This may take a minute...")
## This may take a minute...
for(i in 1:N_rows){
  i_NA <- find_NAs(vcf_noinvar[i,])
  N_NA_i <- length(i_NA)
  N_NA[i] <- N_NA_i
}

Check if any row has >50% NAs

cutoff50 <- N_SNPs*0.5
percent_NA <- N_NA/N_SNPs*100
any(percent_NA>50)
## [1] FALSE

Average number of NAs per row

mean(percent_NA)
## [1] 0.002455135

Save the mean percent of NAs per row

my_meta_N_meanNA_rows <- mean(percent_NA)

Mean imputation

mean_imputation <- function(df){
  cat("This may take some time...")
  n_cols <- ncol(df)
  for(i in 1:n_cols){
    column_i <- df[,i]
    mean_i <- mean(column_i, na.rm = TRUE)
    NAs_i <- which(is.na(column_i))
    N_NAs <- length(NAs_i)
    column_i[NAs_i] <- mean_i
    df[,i] <- column_i
  }
  return(df)
}

Run function

names(vcf_noinvar)[1:10]
##  [1] "sample"    "pop"       "super_pop" "sex"       "lat"       "lng"      
##  [7] "X1"        "X2"        "X3"        "X4"
vcf_noNA <- vcf_noinvar
vcf_noNA[, -c(1:6)] <- mean_imputation(vcf_noinvar[,-c(1:6)])
## This may take some time...

Prepare and scale PCA date

vcf_scaled <- vcf_noNA
vcf_scaled[, -c(1:6)] <- scale(vcf_noNA[, -c(1:6)])

Saving prepared data

write.csv(vcf_scaled, file = "vcf_scaled.csv")

Run PCA

vcf_pca <- prcomp(vcf_scaled[, -c(1:6)])

Screeplot

screeplot(vcf_pca)

Calculate explained variation - load PCA variation function

PCA_variation <- function(pca_summary, PCs = 2){
  var_explained <- pca_summary$importance[2,1:PCs]*100
  var_explained <- round(var_explained, 3)
  return(var_explained)
}

Get summary information

vcf_pca_summary <- summary(vcf_pca)

Extract raw variation data

var_out <- PCA_variation(vcf_pca_summary, PCs = 1500)
var_out
##    PC1    PC2    PC3    PC4    PC5    PC6    PC7    PC8    PC9   PC10   PC11 
##  3.139  2.874  2.141  1.744  1.579  1.423  1.372  1.253  1.228  1.150  1.034 
##   PC12   PC13   PC14   PC15   PC16   PC17   PC18   PC19   PC20   PC21   PC22 
##  1.004  0.926  0.845  0.826  0.719  0.666  0.662  0.619  0.589  0.558  0.523 
##   PC23   PC24   PC25   PC26   PC27   PC28   PC29   PC30   PC31   PC32   PC33 
##  0.504  0.495  0.483  0.435  0.415  0.413  0.397  0.387  0.368  0.348  0.343 
##   PC34   PC35   PC36   PC37   PC38   PC39   PC40   PC41   PC42   PC43   PC44 
##  0.332  0.329  0.312  0.311  0.304  0.297  0.294  0.289  0.284  0.276  0.265 
##   PC45   PC46   PC47   PC48   PC49   PC50   PC51   PC52   PC53   PC54   PC55 
##  0.264  0.254  0.249  0.244  0.241  0.238  0.230  0.227  0.222  0.220  0.218 
##   PC56   PC57   PC58   PC59   PC60   PC61   PC62   PC63   PC64   PC65   PC66 
##  0.214  0.210  0.209  0.205  0.202  0.198  0.197  0.196  0.191  0.191  0.188 
##   PC67   PC68   PC69   PC70   PC71   PC72   PC73   PC74   PC75   PC76   PC77 
##  0.188  0.187  0.185  0.184  0.181  0.178  0.174  0.171  0.170  0.167  0.164 
##   PC78   PC79   PC80   PC81   PC82   PC83   PC84   PC85   PC86   PC87   PC88 
##  0.163  0.162  0.159  0.159  0.158  0.156  0.155  0.154  0.152  0.151  0.150 
##   PC89   PC90   PC91   PC92   PC93   PC94   PC95   PC96   PC97   PC98   PC99 
##  0.148  0.146  0.144  0.144  0.143  0.142  0.142  0.141  0.140  0.139  0.139 
##  PC100  PC101  PC102  PC103  PC104  PC105  PC106  PC107  PC108  PC109  PC110 
##  0.138  0.137  0.136  0.136  0.135  0.134  0.133  0.132  0.132  0.132  0.131 
##  PC111  PC112  PC113  PC114  PC115  PC116  PC117  PC118  PC119  PC120  PC121 
##  0.131  0.130  0.129  0.128  0.127  0.126  0.126  0.125  0.123  0.123  0.122 
##  PC122  PC123  PC124  PC125  PC126  PC127  PC128  PC129  PC130  PC131  PC132 
##  0.121  0.121  0.120  0.119  0.119  0.119  0.117  0.117  0.116  0.115  0.115 
##  PC133  PC134  PC135  PC136  PC137  PC138  PC139  PC140  PC141  PC142  PC143 
##  0.115  0.114  0.114  0.113  0.113  0.112  0.112  0.112  0.112  0.111  0.110 
##  PC144  PC145  PC146  PC147  PC148  PC149  PC150  PC151  PC152  PC153  PC154 
##  0.110  0.109  0.109  0.109  0.108  0.108  0.107  0.107  0.106  0.106  0.106 
##  PC155  PC156  PC157  PC158  PC159  PC160  PC161  PC162  PC163  PC164  PC165 
##  0.105  0.105  0.104  0.104  0.103  0.103  0.102  0.102  0.102  0.101  0.101 
##  PC166  PC167  PC168  PC169  PC170  PC171  PC172  PC173  PC174  PC175  PC176 
##  0.100  0.100  0.100  0.100  0.099  0.099  0.098  0.098  0.097  0.097  0.097 
##  PC177  PC178  PC179  PC180  PC181  PC182  PC183  PC184  PC185  PC186  PC187 
##  0.096  0.096  0.095  0.095  0.095  0.095  0.094  0.094  0.094  0.093  0.093 
##  PC188  PC189  PC190  PC191  PC192  PC193  PC194  PC195  PC196  PC197  PC198 
##  0.093  0.093  0.092  0.092  0.092  0.091  0.090  0.090  0.090  0.090  0.089 
##  PC199  PC200  PC201  PC202  PC203  PC204  PC205  PC206  PC207  PC208  PC209 
##  0.089  0.089  0.089  0.089  0.088  0.088  0.088  0.087  0.087  0.087  0.087 
##  PC210  PC211  PC212  PC213  PC214  PC215  PC216  PC217  PC218  PC219  PC220 
##  0.087  0.086  0.086  0.086  0.086  0.085  0.085  0.085  0.084  0.084  0.084 
##  PC221  PC222  PC223  PC224  PC225  PC226  PC227  PC228  PC229  PC230  PC231 
##  0.083  0.083  0.083  0.083  0.083  0.083  0.082  0.082  0.082  0.082  0.082 
##  PC232  PC233  PC234  PC235  PC236  PC237  PC238  PC239  PC240  PC241  PC242 
##  0.081  0.081  0.081  0.080  0.080  0.080  0.080  0.080  0.079  0.079  0.079 
##  PC243  PC244  PC245  PC246  PC247  PC248  PC249  PC250  PC251  PC252  PC253 
##  0.079  0.079  0.078  0.078  0.078  0.078  0.078  0.078  0.077  0.077  0.076 
##  PC254  PC255  PC256  PC257  PC258  PC259  PC260  PC261  PC262  PC263  PC264 
##  0.076  0.076  0.076  0.076  0.075  0.075  0.075  0.075  0.074  0.074  0.074 
##  PC265  PC266  PC267  PC268  PC269  PC270  PC271  PC272  PC273  PC274  PC275 
##  0.074  0.074  0.074  0.074  0.073  0.073  0.073  0.073  0.073  0.073  0.072 
##  PC276  PC277  PC278  PC279  PC280  PC281  PC282  PC283  PC284  PC285  PC286 
##  0.072  0.072  0.072  0.072  0.071  0.071  0.071  0.070  0.070  0.070  0.070 
##  PC287  PC288  PC289  PC290  PC291  PC292  PC293  PC294  PC295  PC296  PC297 
##  0.069  0.069  0.069  0.069  0.069  0.069  0.068  0.068  0.068  0.068  0.068 
##  PC298  PC299  PC300  PC301  PC302  PC303  PC304  PC305  PC306  PC307  PC308 
##  0.067  0.067  0.067  0.067  0.067  0.067  0.067  0.066  0.066  0.066  0.066 
##  PC309  PC310  PC311  PC312  PC313  PC314  PC315  PC316  PC317  PC318  PC319 
##  0.066  0.066  0.065  0.065  0.065  0.065  0.065  0.065  0.065  0.064  0.064 
##  PC320  PC321  PC322  PC323  PC324  PC325  PC326  PC327  PC328  PC329  PC330 
##  0.064  0.064  0.064  0.064  0.063  0.063  0.063  0.063  0.063  0.063  0.063 
##  PC331  PC332  PC333  PC334  PC335  PC336  PC337  PC338  PC339  PC340  PC341 
##  0.062  0.062  0.062  0.062  0.062  0.062  0.062  0.061  0.061  0.061  0.061 
##  PC342  PC343  PC344  PC345  PC346  PC347  PC348  PC349  PC350  PC351  PC352 
##  0.061  0.061  0.061  0.061  0.061  0.061  0.061  0.060  0.060  0.060  0.060 
##  PC353  PC354  PC355  PC356  PC357  PC358  PC359  PC360  PC361  PC362  PC363 
##  0.060  0.060  0.060  0.059  0.059  0.059  0.059  0.059  0.059  0.059  0.059 
##  PC364  PC365  PC366  PC367  PC368  PC369  PC370  PC371  PC372  PC373  PC374 
##  0.059  0.058  0.058  0.058  0.058  0.058  0.058  0.058  0.058  0.058  0.058 
##  PC375  PC376  PC377  PC378  PC379  PC380  PC381  PC382  PC383  PC384  PC385 
##  0.058  0.058  0.058  0.058  0.058  0.057  0.057  0.057  0.057  0.057  0.057 
##  PC386  PC387  PC388  PC389  PC390  PC391  PC392  PC393  PC394  PC395  PC396 
##  0.057  0.057  0.057  0.057  0.057  0.057  0.056  0.056  0.056  0.056  0.056 
##  PC397  PC398  PC399  PC400  PC401  PC402  PC403  PC404  PC405  PC406  PC407 
##  0.056  0.056  0.056  0.056  0.056  0.055  0.055  0.055  0.055  0.055  0.055 
##  PC408  PC409  PC410  PC411  PC412  PC413  PC414  PC415  PC416  PC417  PC418 
##  0.055  0.055  0.054  0.054  0.054  0.054  0.054  0.054  0.053  0.053  0.053 
##  PC419  PC420  PC421  PC422  PC423  PC424  PC425  PC426  PC427  PC428  PC429 
##  0.053  0.053  0.053  0.053  0.053  0.053  0.053  0.053  0.052  0.052  0.052 
##  PC430  PC431  PC432  PC433  PC434  PC435  PC436  PC437  PC438  PC439  PC440 
##  0.052  0.052  0.052  0.052  0.052  0.052  0.052  0.051  0.051  0.051  0.051 
##  PC441  PC442  PC443  PC444  PC445  PC446  PC447  PC448  PC449  PC450  PC451 
##  0.051  0.051  0.051  0.051  0.051  0.051  0.050  0.050  0.050  0.050  0.050 
##  PC452  PC453  PC454  PC455  PC456  PC457  PC458  PC459  PC460  PC461  PC462 
##  0.050  0.050  0.050  0.050  0.049  0.049  0.049  0.049  0.049  0.049  0.049 
##  PC463  PC464  PC465  PC466  PC467  PC468  PC469  PC470  PC471  PC472  PC473 
##  0.049  0.049  0.049  0.048  0.048  0.048  0.048  0.048  0.048  0.048  0.048 
##  PC474  PC475  PC476  PC477  PC478  PC479  PC480  PC481  PC482  PC483  PC484 
##  0.048  0.048  0.048  0.047  0.047  0.047  0.047  0.047  0.047  0.047  0.047 
##  PC485  PC486  PC487  PC488  PC489  PC490  PC491  PC492  PC493  PC494  PC495 
##  0.047  0.047  0.047  0.047  0.047  0.046  0.046  0.046  0.046  0.046  0.046 
##  PC496  PC497  PC498  PC499  PC500  PC501  PC502  PC503  PC504  PC505  PC506 
##  0.046  0.046  0.046  0.046  0.046  0.046  0.046  0.046  0.045  0.045  0.045 
##  PC507  PC508  PC509  PC510  PC511  PC512  PC513  PC514  PC515  PC516  PC517 
##  0.045  0.045  0.045  0.045  0.045  0.045  0.045  0.045  0.045  0.045  0.045 
##  PC518  PC519  PC520  PC521  PC522  PC523  PC524  PC525  PC526  PC527  PC528 
##  0.045  0.045  0.045  0.044  0.044  0.044  0.044  0.044  0.044  0.044  0.044 
##  PC529  PC530  PC531  PC532  PC533  PC534  PC535  PC536  PC537  PC538  PC539 
##  0.044  0.044  0.044  0.044  0.044  0.044  0.044  0.044  0.044  0.044  0.044 
##  PC540  PC541  PC542  PC543  PC544  PC545  PC546  PC547  PC548  PC549  PC550 
##  0.044  0.044  0.044  0.043  0.043  0.043  0.043  0.043  0.043  0.043  0.043 
##  PC551  PC552  PC553  PC554  PC555  PC556  PC557  PC558  PC559  PC560  PC561 
##  0.043  0.043  0.043  0.043  0.043  0.043  0.043  0.042  0.042  0.042  0.042 
##  PC562  PC563  PC564  PC565  PC566  PC567  PC568  PC569  PC570  PC571  PC572 
##  0.042  0.042  0.042  0.042  0.042  0.042  0.042  0.042  0.042  0.042  0.042 
##  PC573  PC574  PC575  PC576  PC577  PC578  PC579  PC580  PC581  PC582  PC583 
##  0.041  0.041  0.041  0.041  0.041  0.041  0.041  0.041  0.041  0.041  0.041 
##  PC584  PC585  PC586  PC587  PC588  PC589  PC590  PC591  PC592  PC593  PC594 
##  0.041  0.041  0.041  0.040  0.040  0.040  0.040  0.040  0.040  0.040  0.040 
##  PC595  PC596  PC597  PC598  PC599  PC600  PC601  PC602  PC603  PC604  PC605 
##  0.040  0.040  0.040  0.040  0.039  0.039  0.039  0.039  0.039  0.039  0.039 
##  PC606  PC607  PC608  PC609  PC610  PC611  PC612  PC613  PC614  PC615  PC616 
##  0.039  0.039  0.039  0.039  0.039  0.039  0.039  0.038  0.038  0.038  0.038 
##  PC617  PC618  PC619  PC620  PC621  PC622  PC623  PC624  PC625  PC626  PC627 
##  0.038  0.038  0.038  0.038  0.038  0.038  0.038  0.038  0.038  0.038  0.038 
##  PC628  PC629  PC630  PC631  PC632  PC633  PC634  PC635  PC636  PC637  PC638 
##  0.038  0.038  0.038  0.037  0.037  0.037  0.037  0.037  0.037  0.037  0.037 
##  PC639  PC640  PC641  PC642  PC643  PC644  PC645  PC646  PC647  PC648  PC649 
##  0.037  0.037  0.037  0.037  0.037  0.037  0.037  0.037  0.036  0.036  0.036 
##  PC650  PC651  PC652  PC653  PC654  PC655  PC656  PC657  PC658  PC659  PC660 
##  0.036  0.036  0.036  0.036  0.036  0.036  0.036  0.036  0.036  0.036  0.036 
##  PC661  PC662  PC663  PC664  PC665  PC666  PC667  PC668  PC669  PC670  PC671 
##  0.035  0.035  0.035  0.035  0.035  0.035  0.035  0.035  0.035  0.035  0.035 
##  PC672  PC673  PC674  PC675  PC676  PC677  PC678  PC679  PC680  PC681  PC682 
##  0.035  0.035  0.035  0.035  0.035  0.034  0.034  0.034  0.034  0.034  0.034 
##  PC683  PC684  PC685  PC686  PC687  PC688  PC689  PC690  PC691  PC692  PC693 
##  0.034  0.034  0.034  0.034  0.034  0.034  0.034  0.034  0.034  0.034  0.034 
##  PC694  PC695  PC696  PC697  PC698  PC699  PC700  PC701  PC702  PC703  PC704 
##  0.034  0.034  0.034  0.033  0.033  0.033  0.033  0.033  0.033  0.033  0.033 
##  PC705  PC706  PC707  PC708  PC709  PC710  PC711  PC712  PC713  PC714  PC715 
##  0.033  0.033  0.033  0.033  0.033  0.033  0.033  0.033  0.033  0.033  0.032 
##  PC716  PC717  PC718  PC719  PC720  PC721  PC722  PC723  PC724  PC725  PC726 
##  0.032  0.032  0.032  0.032  0.032  0.032  0.032  0.032  0.032  0.032  0.032 
##  PC727  PC728  PC729  PC730  PC731  PC732  PC733  PC734  PC735  PC736  PC737 
##  0.032  0.032  0.032  0.032  0.032  0.032  0.032  0.032  0.032  0.032  0.032 
##  PC738  PC739  PC740  PC741  PC742  PC743  PC744  PC745  PC746  PC747  PC748 
##  0.032  0.031  0.031  0.031  0.031  0.031  0.031  0.031  0.031  0.031  0.031 
##  PC749  PC750  PC751  PC752  PC753  PC754  PC755  PC756  PC757  PC758  PC759 
##  0.031  0.031  0.031  0.031  0.031  0.031  0.031  0.031  0.031  0.031  0.031 
##  PC760  PC761  PC762  PC763  PC764  PC765  PC766  PC767  PC768  PC769  PC770 
##  0.031  0.031  0.031  0.031  0.031  0.031  0.031  0.030  0.030  0.030  0.030 
##  PC771  PC772  PC773  PC774  PC775  PC776  PC777  PC778  PC779  PC780  PC781 
##  0.030  0.030  0.030  0.030  0.030  0.030  0.030  0.030  0.030  0.030  0.030 
##  PC782  PC783  PC784  PC785  PC786  PC787  PC788  PC789  PC790  PC791  PC792 
##  0.030  0.030  0.030  0.030  0.030  0.030  0.030  0.030  0.030  0.030  0.030 
##  PC793  PC794  PC795  PC796  PC797  PC798  PC799  PC800  PC801  PC802  PC803 
##  0.030  0.030  0.030  0.030  0.030  0.030  0.030  0.030  0.030  0.030  0.029 
##  PC804  PC805  PC806  PC807  PC808  PC809  PC810  PC811  PC812  PC813  PC814 
##  0.029  0.029  0.029  0.029  0.029  0.029  0.029  0.029  0.029  0.029  0.029 
##  PC815  PC816  PC817  PC818  PC819  PC820  PC821  PC822  PC823  PC824  PC825 
##  0.029  0.029  0.029  0.029  0.029  0.029  0.029  0.029  0.029  0.029  0.029 
##  PC826  PC827  PC828  PC829  PC830  PC831  PC832  PC833  PC834  PC835  PC836 
##  0.029  0.029  0.029  0.029  0.029  0.029  0.029  0.029  0.029  0.029  0.029 
##  PC837  PC838  PC839  PC840  PC841  PC842  PC843  PC844  PC845  PC846  PC847 
##  0.029  0.029  0.029  0.029  0.029  0.029  0.029  0.029  0.029  0.029  0.029 
##  PC848  PC849  PC850  PC851  PC852  PC853  PC854  PC855  PC856  PC857  PC858 
##  0.029  0.029  0.029  0.029  0.029  0.029  0.029  0.029  0.029  0.029  0.029 
##  PC859  PC860  PC861  PC862  PC863  PC864  PC865  PC866  PC867  PC868  PC869 
##  0.029  0.029  0.029  0.029  0.029  0.029  0.029  0.029  0.029  0.029  0.029 
##  PC870  PC871  PC872  PC873  PC874  PC875  PC876  PC877  PC878  PC879  PC880 
##  0.029  0.029  0.029  0.029  0.029  0.029  0.029  0.028  0.028  0.028  0.028 
##  PC881  PC882  PC883  PC884  PC885  PC886  PC887  PC888  PC889  PC890  PC891 
##  0.028  0.028  0.028  0.028  0.028  0.028  0.028  0.028  0.027  0.027  0.027 
##  PC892  PC893  PC894  PC895  PC896  PC897  PC898  PC899  PC900  PC901  PC902 
##  0.027  0.027  0.027  0.027  0.027  0.027  0.027  0.027  0.027  0.027  0.027 
##  PC903  PC904  PC905  PC906  PC907  PC908  PC909  PC910  PC911  PC912  PC913 
##  0.027  0.026  0.026  0.026  0.026  0.026  0.026  0.026  0.026  0.026  0.026 
##  PC914  PC915  PC916  PC917  PC918  PC919  PC920  PC921  PC922  PC923  PC924 
##  0.026  0.026  0.026  0.026  0.026  0.026  0.026  0.026  0.026  0.026  0.025 
##  PC925  PC926  PC927  PC928  PC929  PC930  PC931  PC932  PC933  PC934  PC935 
##  0.025  0.025  0.025  0.025  0.025  0.025  0.025  0.025  0.025  0.025  0.025 
##  PC936  PC937  PC938  PC939  PC940  PC941  PC942  PC943  PC944  PC945  PC946 
##  0.025  0.025  0.025  0.025  0.025  0.025  0.025  0.025  0.025  0.024  0.024 
##  PC947  PC948  PC949  PC950  PC951  PC952  PC953  PC954  PC955  PC956  PC957 
##  0.024  0.024  0.024  0.024  0.024  0.024  0.024  0.024  0.024  0.024  0.024 
##  PC958  PC959  PC960  PC961  PC962  PC963  PC964  PC965  PC966  PC967  PC968 
##  0.024  0.024  0.024  0.024  0.024  0.024  0.024  0.023  0.023  0.023  0.023 
##  PC969  PC970  PC971  PC972  PC973  PC974  PC975  PC976  PC977  PC978  PC979 
##  0.023  0.023  0.023  0.023  0.023  0.023  0.023  0.023  0.023  0.023  0.023 
##  PC980  PC981  PC982  PC983  PC984  PC985  PC986  PC987  PC988  PC989  PC990 
##  0.023  0.023  0.023  0.023  0.023  0.023  0.022  0.022  0.022  0.022  0.022 
##  PC991  PC992  PC993  PC994  PC995  PC996  PC997  PC998  PC999 PC1000 PC1001 
##  0.022  0.022  0.022  0.022  0.022  0.022  0.022  0.022  0.022  0.022  0.022 
## PC1002 PC1003 PC1004 PC1005 PC1006 PC1007 PC1008 PC1009 PC1010 PC1011 PC1012 
##  0.022  0.022  0.022  0.022  0.022  0.022  0.021  0.021  0.021  0.021  0.021 
## PC1013 PC1014 PC1015 PC1016 PC1017 PC1018 PC1019 PC1020 PC1021 PC1022 PC1023 
##  0.021  0.021  0.021  0.021  0.021  0.021  0.021  0.021  0.021  0.021  0.021 
## PC1024 PC1025 PC1026 PC1027 PC1028 PC1029 PC1030 PC1031 PC1032 PC1033 PC1034 
##  0.021  0.021  0.021  0.021  0.021  0.021  0.021  0.021  0.021  0.021  0.021 
## PC1035 PC1036 PC1037 PC1038 PC1039 PC1040 PC1041 PC1042 PC1043 PC1044 PC1045 
##  0.021  0.021  0.020  0.020  0.020  0.020  0.020  0.020  0.020  0.020  0.020 
## PC1046 PC1047 PC1048 PC1049 PC1050 PC1051 PC1052 PC1053 PC1054 PC1055 PC1056 
##  0.020  0.020  0.020  0.020  0.020  0.020  0.020  0.020  0.020  0.020  0.020 
## PC1057 PC1058 PC1059 PC1060 PC1061 PC1062 PC1063 PC1064 PC1065 PC1066 PC1067 
##  0.020  0.020  0.020  0.020  0.020  0.019  0.019  0.019  0.019  0.019  0.019 
## PC1068 PC1069 PC1070 PC1071 PC1072 PC1073 PC1074 PC1075 PC1076 PC1077 PC1078 
##  0.019  0.019  0.019  0.019  0.019  0.019  0.019  0.019  0.019  0.019  0.019 
## PC1079 PC1080 PC1081 PC1082 PC1083 PC1084 PC1085 PC1086 PC1087 PC1088 PC1089 
##  0.019  0.019  0.019  0.019  0.019  0.019  0.019  0.019  0.019  0.019  0.019 
## PC1090 PC1091 PC1092 PC1093 PC1094 PC1095 PC1096 PC1097 PC1098 PC1099 PC1100 
##  0.019  0.019  0.019  0.018  0.018  0.018  0.018  0.018  0.018  0.018  0.018 
## PC1101 PC1102 PC1103 PC1104 PC1105 PC1106 PC1107 PC1108 PC1109 PC1110 PC1111 
##  0.018  0.018  0.018  0.018  0.018  0.018  0.018  0.018  0.018  0.018  0.018 
## PC1112 PC1113 PC1114 PC1115 PC1116 PC1117 PC1118 PC1119 PC1120 PC1121 PC1122 
##  0.018  0.018  0.018  0.018  0.018  0.018  0.018  0.018  0.018  0.018  0.017 
## PC1123 PC1124 PC1125 PC1126 PC1127 PC1128 PC1129 PC1130 PC1131 PC1132 PC1133 
##  0.017  0.017  0.017  0.017  0.017  0.017  0.017  0.017  0.017  0.017  0.017 
## PC1134 PC1135 PC1136 PC1137 PC1138 PC1139 PC1140 PC1141 PC1142 PC1143 PC1144 
##  0.017  0.017  0.017  0.017  0.017  0.017  0.017  0.017  0.017  0.017  0.017 
## PC1145 PC1146 PC1147 PC1148 PC1149 PC1150 PC1151 PC1152 PC1153 PC1154 PC1155 
##  0.017  0.017  0.017  0.017  0.017  0.017  0.017  0.017  0.017  0.017  0.017 
## PC1156 PC1157 PC1158 PC1159 PC1160 PC1161 PC1162 PC1163 PC1164 PC1165 PC1166 
##  0.016  0.016  0.016  0.016  0.016  0.016  0.016  0.016  0.016  0.016  0.016 
## PC1167 PC1168 PC1169 PC1170 PC1171 PC1172 PC1173 PC1174 PC1175 PC1176 PC1177 
##  0.016  0.016  0.016  0.016  0.016  0.016  0.016  0.016  0.016  0.016  0.016 
## PC1178 PC1179 PC1180 PC1181 PC1182 PC1183 PC1184 PC1185 PC1186 PC1187 PC1188 
##  0.016  0.016  0.016  0.016  0.016  0.016  0.016  0.016  0.016  0.016  0.016 
## PC1189 PC1190 PC1191 PC1192 PC1193 PC1194 PC1195 PC1196 PC1197 PC1198 PC1199 
##  0.016  0.016  0.016  0.016  0.016  0.016  0.016  0.016  0.016  0.016  0.016 
## PC1200 PC1201 PC1202 PC1203 PC1204 PC1205 PC1206 PC1207 PC1208 PC1209 PC1210 
##  0.016  0.016  0.016  0.015  0.015  0.015  0.015  0.015  0.015  0.015  0.015 
## PC1211 PC1212 PC1213 PC1214 PC1215 PC1216 PC1217 PC1218 PC1219 PC1220 PC1221 
##  0.015  0.015  0.015  0.015  0.015  0.015  0.015  0.015  0.015  0.015  0.015 
## PC1222 PC1223 PC1224 PC1225 PC1226 PC1227 PC1228 PC1229 PC1230 PC1231 PC1232 
##  0.015  0.015  0.015  0.015  0.015  0.015  0.015  0.015  0.015  0.015  0.015 
## PC1233 PC1234 PC1235 PC1236 PC1237 PC1238 PC1239 PC1240 PC1241 PC1242 PC1243 
##  0.015  0.015  0.015  0.015  0.015  0.015  0.015  0.015  0.015  0.015  0.015 
## PC1244 PC1245 PC1246 PC1247 PC1248 PC1249 PC1250 PC1251 PC1252 PC1253 PC1254 
##  0.015  0.015  0.015  0.015  0.015  0.015  0.015  0.015  0.015  0.015  0.015 
## PC1255 PC1256 PC1257 PC1258 PC1259 PC1260 PC1261 PC1262 PC1263 PC1264 PC1265 
##  0.015  0.015  0.015  0.015  0.015  0.015  0.015  0.015  0.015  0.015  0.015 
## PC1266 PC1267 PC1268 PC1269 PC1270 PC1271 PC1272 PC1273 PC1274 PC1275 PC1276 
##  0.015  0.015  0.015  0.015  0.015  0.015  0.015  0.015  0.015  0.015  0.015 
## PC1277 PC1278 PC1279 PC1280 PC1281 PC1282 PC1283 PC1284 PC1285 PC1286 PC1287 
##  0.015  0.015  0.015  0.015  0.014  0.014  0.014  0.014  0.014  0.014  0.014 
## PC1288 PC1289 PC1290 PC1291 PC1292 PC1293 PC1294 PC1295 PC1296 PC1297 PC1298 
##  0.014  0.014  0.014  0.014  0.014  0.014  0.014  0.014  0.014  0.014  0.014 
## PC1299 PC1300 PC1301 PC1302 PC1303 PC1304 PC1305 PC1306 PC1307 PC1308 PC1309 
##  0.014  0.014  0.014  0.014  0.014  0.014  0.014  0.014  0.014  0.014  0.014 
## PC1310 PC1311 PC1312 PC1313 PC1314 PC1315 PC1316 PC1317 PC1318 PC1319 PC1320 
##  0.014  0.014  0.014  0.014  0.014  0.014  0.014  0.014  0.014  0.014  0.014 
## PC1321 PC1322 PC1323 PC1324 PC1325 PC1326 PC1327 PC1328 PC1329 PC1330 PC1331 
##  0.014  0.014  0.014  0.014  0.014  0.014  0.014  0.014  0.014  0.014  0.014 
## PC1332 PC1333 PC1334 PC1335 PC1336 PC1337 PC1338 PC1339 PC1340 PC1341 PC1342 
##  0.014  0.014  0.014  0.014  0.014  0.014  0.014  0.014  0.014  0.014  0.014 
## PC1343 PC1344 PC1345 PC1346 PC1347 PC1348 PC1349 PC1350 PC1351 PC1352 PC1353 
##  0.014  0.014  0.014  0.014  0.014  0.014  0.014  0.014  0.014  0.014  0.014 
## PC1354 PC1355 PC1356 PC1357 PC1358 PC1359 PC1360 PC1361 PC1362 PC1363 PC1364 
##  0.014  0.014  0.014  0.014  0.014  0.014  0.014  0.013  0.013  0.013  0.013 
## PC1365 PC1366 PC1367 PC1368 PC1369 PC1370 PC1371 PC1372 PC1373 PC1374 PC1375 
##  0.013  0.013  0.013  0.013  0.013  0.013  0.013  0.013  0.013  0.013  0.013 
## PC1376 PC1377 PC1378 PC1379 PC1380 PC1381 PC1382 PC1383 PC1384 PC1385 PC1386 
##  0.013  0.013  0.013  0.012  0.012  0.012  0.012  0.012  0.012  0.012  0.012 
## PC1387 PC1388 PC1389 PC1390 PC1391 PC1392 PC1393 PC1394 PC1395 PC1396 PC1397 
##  0.012  0.012  0.012  0.012  0.012  0.012  0.012  0.012  0.012  0.012  0.012 
## PC1398 PC1399 PC1400 PC1401 PC1402 PC1403 PC1404 PC1405 PC1406 PC1407 PC1408 
##  0.012  0.012  0.011  0.011  0.011  0.011  0.011  0.011  0.011  0.011  0.011 
## PC1409 PC1410 PC1411 PC1412 PC1413 PC1414 PC1415 PC1416 PC1417 PC1418 PC1419 
##  0.011  0.011  0.011  0.011  0.011  0.011  0.011  0.011  0.011  0.011  0.011 
## PC1420 PC1421 PC1422 PC1423 PC1424 PC1425 PC1426 PC1427 PC1428 PC1429 PC1430 
##  0.011  0.011  0.011  0.011  0.011  0.010  0.010  0.010  0.010  0.010  0.010 
## PC1431 PC1432 PC1433 PC1434 PC1435 PC1436 PC1437 PC1438 PC1439 PC1440 PC1441 
##  0.010  0.010  0.010  0.010  0.010  0.010  0.010  0.010  0.010  0.010  0.010 
## PC1442 PC1443 PC1444 PC1445 PC1446 PC1447 PC1448 PC1449 PC1450 PC1451 PC1452 
##  0.010  0.010  0.010  0.010  0.010  0.010  0.010  0.010  0.010  0.010  0.010 
## PC1453 PC1454 PC1455 PC1456 PC1457 PC1458 PC1459 PC1460 PC1461 PC1462 PC1463 
##  0.010  0.009  0.009  0.009  0.009  0.009  0.009  0.009  0.009  0.009  0.009 
## PC1464 PC1465 PC1466 PC1467 PC1468 PC1469 PC1470 PC1471 PC1472 PC1473 PC1474 
##  0.009  0.009  0.009  0.009  0.009  0.009  0.009  0.009  0.009  0.009  0.009 
## PC1475 PC1476 PC1477 PC1478 PC1479 PC1480 PC1481 PC1482 PC1483 PC1484 PC1485 
##  0.009  0.009  0.009  0.009  0.009  0.009  0.009  0.009  0.009  0.008  0.008 
## PC1486 PC1487 PC1488 PC1489 PC1490 PC1491 PC1492 PC1493 PC1494 PC1495 PC1496 
##  0.008  0.008  0.008  0.008  0.008  0.008  0.008  0.008  0.008  0.008  0.008 
## PC1497 PC1498 PC1499 PC1500 
##  0.008  0.008  0.008  0.008

Calculate the cut off for the rule of thumb

N_columns <- ncol(vcf_scaled)
N_columns
## [1] 6962
cut_off <- 1/N_columns*100
cut_off
## [1] 0.01436369
i_cut_off <- which(var_out < cut_off)
i_cut_off
## PC1281 PC1282 PC1283 PC1284 PC1285 PC1286 PC1287 PC1288 PC1289 PC1290 PC1291 
##   1281   1282   1283   1284   1285   1286   1287   1288   1289   1290   1291 
## PC1292 PC1293 PC1294 PC1295 PC1296 PC1297 PC1298 PC1299 PC1300 PC1301 PC1302 
##   1292   1293   1294   1295   1296   1297   1298   1299   1300   1301   1302 
## PC1303 PC1304 PC1305 PC1306 PC1307 PC1308 PC1309 PC1310 PC1311 PC1312 PC1313 
##   1303   1304   1305   1306   1307   1308   1309   1310   1311   1312   1313 
## PC1314 PC1315 PC1316 PC1317 PC1318 PC1319 PC1320 PC1321 PC1322 PC1323 PC1324 
##   1314   1315   1316   1317   1318   1319   1320   1321   1322   1323   1324 
## PC1325 PC1326 PC1327 PC1328 PC1329 PC1330 PC1331 PC1332 PC1333 PC1334 PC1335 
##   1325   1326   1327   1328   1329   1330   1331   1332   1333   1334   1335 
## PC1336 PC1337 PC1338 PC1339 PC1340 PC1341 PC1342 PC1343 PC1344 PC1345 PC1346 
##   1336   1337   1338   1339   1340   1341   1342   1343   1344   1345   1346 
## PC1347 PC1348 PC1349 PC1350 PC1351 PC1352 PC1353 PC1354 PC1355 PC1356 PC1357 
##   1347   1348   1349   1350   1351   1352   1353   1354   1355   1356   1357 
## PC1358 PC1359 PC1360 PC1361 PC1362 PC1363 PC1364 PC1365 PC1366 PC1367 PC1368 
##   1358   1359   1360   1361   1362   1363   1364   1365   1366   1367   1368 
## PC1369 PC1370 PC1371 PC1372 PC1373 PC1374 PC1375 PC1376 PC1377 PC1378 PC1379 
##   1369   1370   1371   1372   1373   1374   1375   1376   1377   1378   1379 
## PC1380 PC1381 PC1382 PC1383 PC1384 PC1385 PC1386 PC1387 PC1388 PC1389 PC1390 
##   1380   1381   1382   1383   1384   1385   1386   1387   1388   1389   1390 
## PC1391 PC1392 PC1393 PC1394 PC1395 PC1396 PC1397 PC1398 PC1399 PC1400 PC1401 
##   1391   1392   1393   1394   1395   1396   1397   1398   1399   1400   1401 
## PC1402 PC1403 PC1404 PC1405 PC1406 PC1407 PC1408 PC1409 PC1410 PC1411 PC1412 
##   1402   1403   1404   1405   1406   1407   1408   1409   1410   1411   1412 
## PC1413 PC1414 PC1415 PC1416 PC1417 PC1418 PC1419 PC1420 PC1421 PC1422 PC1423 
##   1413   1414   1415   1416   1417   1418   1419   1420   1421   1422   1423 
## PC1424 PC1425 PC1426 PC1427 PC1428 PC1429 PC1430 PC1431 PC1432 PC1433 PC1434 
##   1424   1425   1426   1427   1428   1429   1430   1431   1432   1433   1434 
## PC1435 PC1436 PC1437 PC1438 PC1439 PC1440 PC1441 PC1442 PC1443 PC1444 PC1445 
##   1435   1436   1437   1438   1439   1440   1441   1442   1443   1444   1445 
## PC1446 PC1447 PC1448 PC1449 PC1450 PC1451 PC1452 PC1453 PC1454 PC1455 PC1456 
##   1446   1447   1448   1449   1450   1451   1452   1453   1454   1455   1456 
## PC1457 PC1458 PC1459 PC1460 PC1461 PC1462 PC1463 PC1464 PC1465 PC1466 PC1467 
##   1457   1458   1459   1460   1461   1462   1463   1464   1465   1466   1467 
## PC1468 PC1469 PC1470 PC1471 PC1472 PC1473 PC1474 PC1475 PC1476 PC1477 PC1478 
##   1468   1469   1470   1471   1472   1473   1474   1475   1476   1477   1478 
## PC1479 PC1480 PC1481 PC1482 PC1483 PC1484 PC1485 PC1486 PC1487 PC1488 PC1489 
##   1479   1480   1481   1482   1483   1484   1485   1486   1487   1488   1489 
## PC1490 PC1491 PC1492 PC1493 PC1494 PC1495 PC1496 PC1497 PC1498 PC1499 PC1500 
##   1490   1491   1492   1493   1494   1495   1496   1497   1498   1499   1500
i_cut_off <- min(i_cut_off)
i_cut_off
## [1] 1281

Save the first value below the cutoff

my_meta_N_meanNA_rowsPCs <- i_cut_off
my_meta_var_PC123 <- var_out[c(1,2,3)]
my_meta_var_PC123
##   PC1   PC2   PC3 
## 3.139 2.874 2.141

Barplot

barplot(var_out, 
        main = "Percent variation (%) Scree plot",
        ylab = "Percent variation (%) explained",
        names.arg = 1:length(var_out))
        abline(h = cut_off, col = 2, lwd = 2)
        abline(v = i_cut_off)
        legend("topright",
               col = c(2,1),
               lty = c(1,1),
               legend = c("Vertical line: cutoff",
                          "Horizontal line: 1st value below cut off"))

Cumulative percentage variation

cumulative_variation <- cumsum(var_out)
plot(cumulative_variation, type = "l")

Plot PCA results - Calculate scores, Combine scores with species information, and Look at variation explained

vcf_pca_scores <- vegan::scores(vcf_pca)
vcf_pca_scores2 <- data.frame(super_pop = vcf_noNA$super_pop, vcf_pca_scores)
my_meta_var_PC123[1]
##   PC1 
## 3.139
my_meta_var_PC123[2]
##   PC2 
## 2.874
my_meta_var_PC123[3]
##   PC3 
## 2.141

Scatterplot - plot PC1 versus PC2

ggpubr::ggscatter(data = vcf_pca_scores2,
                  y = "PC2",
                  x = "PC1",
                  color = "super_pop",
                  shape = "super_pop",
                  main = "PCA Scatterplot",
                  xlab = "PC1 (3.139% of variation)",
                  ylab = "PC2 (2.874% of variation)")

Scatterplot - plot PC2 versus PC3

ggpubr::ggscatter(data = vcf_pca_scores2,
                  y = "PC3",
                  x = "PC2",
                  color = "super_pop",
                  shape = "super_pop",
                  main = "PCA Scatterplot",
                  xlab = "PC2 (2.874% of variation)",
                  ylab = "PC3 (2.141% of variation)")

Scatterplot - plot PC1 versus PC3

ggpubr::ggscatter(data = vcf_pca_scores2,
                  y = "PC3",
                  x = "PC1",
                  color = "super_pop",
                  shape = "super_pop",
                  main = "PCA Scatterplot",
                  xlab = "PC1 (3.139% of variation)",
                  ylab = "PC3 (2.141% of variation)")