Work Flow for Final Project

Introduction

This report summarizes workflow and results of an analysis of SNPs from the 1000 Genomes Project. This is only the workflow, showing what was completed to prepare the data for PCA analysis and plotting. More indepth written analysis and explanations can be found on the Final Report. This workflow provides minimal explanations to give a better understanding of what is done.

Data preparation

Obtaining and loading data

Confirm working directory and location of files

getwd()
## [1] "/Users/jooppereira/Downloads"

Locate VCF File using list.files(pattern = “vcf”) and assign VCF file an easier name

list.files( pattern = "vcf")
##  [1] "1.159051856-159301856.ALL.chr1_GRCh38.genotypes.20170504 (1).vcf.gz"
##  [2] "1.159051856-159301856.ALL.chr1_GRCh38.genotypes.20170504 (2).vcf.gz"
##  [3] "1.159051856-159301856.ALL.chr1_GRCh38.genotypes.20170504.vcf.gz"    
##  [4] "22.21299928-21539928.ALL.chr22_GRCh38.genotypes.20170504.vcf.gz"    
##  [5] "all_loci-1.vcf"                                                     
##  [6] "all_loci.vcf"                                                       
##  [7] "vcf_for_PCA.csv"                                                    
##  [8] "vcf_num_df.csv"                                                     
##  [9] "vcf_num_df2.csv"                                                    
## [10] "vcf_num.csv"
my_vcf <- "22.21299928-21539928.ALL.chr22_GRCh38.genotypes.20170504.vcf.gz"

Load VCF file

vcf <- vcfR::read.vcfR(my_vcf,
                       convertNA = T)
## Scanning file to determine attributes.
## File attributes:
##   meta lines: 130
##   header_line: 131
##   variant count: 1911
##   column count: 2513
## 
Meta line 130 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
##   Character matrix gt rows: 1911
##   Character matrix gt cols: 2513
##   skip: 0
##   nrows: 1911
##   row_num: 0
## 
Processed variant 1000
Processed variant: 1911
## All variants processed

Convert raw VCF file to genotype scores and save as a csv

vcf_num <- vcfR::extract.gt(vcf,
                            element= "GT",
                            IDtoRowNames = F,
                            as.numeric = T,
                            convertNA = T)
write.csv(vcf_num, file= "vcf_num.csv", row.names= F)

Transpose VCF, create dataframe, and add row names information into the dataframe created

vcf_num_t <- t(vcf_num)

vcf_num_df <- data.frame(vcf_num_t)

sample<- row.names(vcf_num_df)

vcf_num_df <- data.frame(sample, 
                         vcf_num_df)

Save a csv of the transposed dataframe

write.csv(vcf_num_df,
          file = "vcf_num_df.csv",
          row.names = F)
list.files(pattern = "csv")
## [1] "1000genomes_people_info2-1.csv" "SNPs_cleaned.csv"              
## [3] "vcf_for_PCA.csv"                "vcf_num_df.csv"                
## [5] "vcf_num_df2.csv"                "vcf_num.csv"                   
## [7] "walsh2017morphology.csv"

Data subsetting and meta data

Load population meta data, and check that “sample” appears in the column names for both meta AND SNP data

pop_meta <- read.csv(file = "1000genomes_people_info2-1.csv")
colnames(pop_meta)
## [1] "pop"       "super_pop" "sample"    "sex"       "lat"       "lng"
colnames(vcf_num_df [1:10])
##  [1] "sample" "X1"     "X2"     "X3"     "X4"     "X5"     "X6"     "X7"    
##  [9] "X8"     "X9"

Merge the two data sets and check the dimensions to see if the row numbers are the same before and after merging

vcf_num_df2 <- merge(pop_meta, vcf_num_df, by= "sample")
nrow(vcf_num_df) == nrow(vcf_num_df2)
## [1] TRUE

Check the names of the new dataframe created

names(vcf_num_df2 [1:10])
##  [1] "sample"    "pop"       "super_pop" "sex"       "lat"       "lng"      
##  [7] "X1"        "X2"        "X3"        "X4"

Make a csv file for vcf_num_df2

write.csv(vcf_num_df2, file = "vcf_num_df2.csv", row.names = F)

Omitting invariants Load invar_omit() function

invar_omit <- function(x){
  cat("Dataframe of dim", dim(x), "processed... \n")
  sds <- apply(x,2,sd, na.rm=T)
  i_var0 <-which(sds== 0)
  
  cat(length(i_var0), "columns removed\n")
  
  if(length(i_var0) > 0){
    x<- x[,-i_var0]
  }
  return(x)
}

Since 4 columns have character data, we DON’T want to put these columns through invar_omit(). Use negative indexing

vcf_noinvar<- vcf_num_df2
vcf_noinvar[, -c(1:4)] <- invar_omit(vcf_noinvar[, -c(1:4)])
## Dataframe of dim 2504 1913 processed... 
## 375 columns removed

Removing data with too many NAs Create function to find NAs

find_NAs <- function(x){
  NAs_TF <- is.na(x)
  i_NA <- which(NAs_TF ==T)
  N_NA <- length(i_NA)
  
  return(i_NA)
}

Use for() loop to search for NAs

N_rows <- nrow(vcf_noinvar)
N_NA <- rep (x=0, times = N_rows)
N_SNPs <- ncol(vcf_noinvar)
for(i in 1:N_rows){
  i_NA <- find_NAs(vcf_noinvar[i,])
  N_NA_i <- length(i_NA)
  N_NA[i] <- N_NA_i
}

Check if any row has less than 50% NAs

cutoff50<- N_SNPs *0.5
percent_NA <- N_NA/N_SNPs*100
any(percent_NA >50)
## [1] FALSE

Preparing for PCA Scaling data

#new copy of data
vcf_scaled <- vcf_noinvar

#scale
vcf_scaled[, -c(1:4)] <- scale(vcf_noinvar[, -c(1:4)])

Write a csv file for PCA analysis

write.csv(vcf_scaled, file= "vcf_for_PCA.csv", row.names = F)

Now were ready to carry out the PCA!

Data Analysis

Packages

The following packages were used in this analysis:

# plotting:
library(ggplot2)
library(ggpubr)

# scores() function
library(vegan)
## Loading required package: permute
## Loading required package: lattice
## This is vegan 2.6-4
# 3D scatter plot
library(scatterplot3d)

Loading data

Load the fully processed data:

vcf_scaled <- read.csv(file = "vcf_for_PCA.csv")

Check the dimensions of the data to confirm this is the correct data:

dim(vcf_scaled)
## [1] 2504 1917

Principal Components Analysis

The data are scaled and ready for analysis. Only the first 4 columns contain character data and needs to be omitted.

head(vcf_scaled[,1:10])
##    sample pop super_pop    sex      lat        lng         X1         X2
## 1 HG00096 GBR       EUR   male 1.419725 -0.1190525 -0.0600481 -0.1367733
## 2 HG00097 GBR       EUR female 1.419725 -0.1190525 -0.0600481 -0.1367733
## 3 HG00099 GBR       EUR female 1.419725 -0.1190525 -0.0600481 -0.1367733
## 4 HG00100 GBR       EUR female 1.419725 -0.1190525 -0.0600481 -0.1367733
## 5 HG00101 GBR       EUR   male 1.419725 -0.1190525 -0.0600481 -0.1367733
## 6 HG00102 GBR       EUR female 1.419725 -0.1190525 -0.0600481 -0.1367733
##            X3          X4
## 1 -0.04899962 -0.01998402
## 2 -0.04899962 -0.01998402
## 3 -0.04899962 -0.01998402
## 4 -0.04899962 -0.01998402
## 5 -0.04899962 -0.01998402
## 6 -0.04899962 -0.01998402

PCA

Principal Components Analysis was run using prcomp().

vcf_pca <- prcomp(vcf_scaled[,-c(1:4)])

Get the PCA scores, which will be plotted.:

vcf_pca_scores  <- vegan::scores(vcf_pca) 

Combine the scores with the sample information into a dataframe.

# call data.frame()
vcf_pca_scores2 <- data.frame(super_pop = vcf_scaled$super_pop, 
                              vcf_pca_scores)

# set as a factor
vcf_pca_scores2$super_pop <- factor(vcf_pca_scores2$super_pop)

PCA diagnostics

The following steps help us understand the PCA output and determine how many PCs should be plotted and/or used in further analyses such as scans for natural selection, cluster analysis, and GWAS.

Default scree plot

A default R scree plot was created with screeplot(). This plot does not provide extra information for assessing the importance of the PCs.

screeplot(vcf_pca, 
          xlab = "Principal Components")

Advanced scree plot

NEW Functions
PCA_variation() function

This function extacts information needed to make a more advanced, annotated scree plot.

# This is a NEW function
PCA_variation <- function(pca){
  
  # get summary information from PCA
  pca_summary <- summary(pca)
  
  # extract information from summary
  ## raw variance for each PC
  variance <- pca_summary$importance[1,]
  
  ## % variance explained by each PC
  var_explained <- pca_summary$importance[2,]*100
  var_explained <- round(var_explained,3)
  
  ## cumulative % variance  
  var_cumulative <- pca_summary$importance[3,]*100
  var_cumulative <- round(var_cumulative,3)
  
  # prepare output
  N.PCs <- length(var_explained)
  var_df <- data.frame(PC = 1:N.PCs,
            var_raw  = variance,
            var_percent = var_explained, 
            cumulative_percent = var_cumulative)
  
  # return output
  return(var_df)   
}
screeplot_snps() function

This functions makes a more advanced scree plot better suited for PCS on for SNPs.

# This is a NEW function
screeplot_snps <- function(var_df){
total_var <- sum(var_df$var_raw)
N <- length(var_df$var_raw)
var_cutoff <- total_var/N
var_cut_percent <- var_cutoff/total_var*100
var_cut_percent_rnd <- round(var_cut_percent,2)
i_above_cut <- which(var_df$var_percent > var_cut_percent)
i_cut <- max(i_above_cut) 
ti <- paste0("Cutoff = ",
            var_cut_percent_rnd,
            "%\n","Useful PCs = ",i_cut)
plot(var_df$var_percent,
        main =ti, type = "l",
     xlab = "PC",
     ylab = "Percent variation",
     col = 0)

segments(x0 = var_df$PC,
         x1 = var_df$PC,
         y0 = 0, 
         y1 = var_df$var_percent,
         col = 1)


segments(x0 = 0,
         x1 = N,
         y0 = var_cut_percent, 
         y1 = var_cut_percent,
         col = 2)

}
PCA_cumulative_var_plot() function

This makes a plot complementary to a scree plot. A scree plot plots the amount of variation explained by each PC. This plot plots a curve of cumulative amount of variation explained by the PCs.

# This is a NEW function
PCA_cumulative_var_plot <- function(var_df){
  plot(cumulative_percent ~ PC, 
       data = var_out,
       main = "Cumulative percent variation\n explained by PCs",
       xlab = "PC",
       ylab = "Cumulative %",
       type = "l")
  
  total_var <- sum(var_df$var_raw)
N <- length(var_df$var_raw)
var_cutoff <- total_var/N
var_cut_percent <- var_cutoff/total_var*100
var_cut_percent_rnd <- round(var_cut_percent,2)
i_above_cut <- which(var_df$var_percent > var_cut_percent)
i_cut <- max(i_above_cut) 

percent_cut_i <- which(var_out$PC == i_cut )
percent_cut <- var_out$cumulative_percent[percent_cut_i]
segments(x0 = i_cut,
         x1 = i_cut,
         y0 = 0, 
         y1 = 100,
         col = 2)

segments(x0 = -10,
         x1 = N,
         y0 = percent_cut, 
         y1 = percent_cut,
         col = 2)

}
Advanced screeplot analysis
Extract information

Extract information on the variance explained by each PC.

var_out <- PCA_variation(vcf_pca)

Look at the output of PCA_variation()

head(var_out)
##     PC  var_raw var_percent cumulative_percent
## PC1  1 7.251458       2.749              2.749
## PC2  2 6.000955       1.882              4.631
## PC3  3 5.456191       1.556              6.187
## PC4  4 4.781130       1.195              7.382
## PC5  5 4.624231       1.118              8.500
## PC6  6 4.427505       1.025              9.525
Advanced screeplot

Make the scree plot with screeplot_snps()

screeplot_snps(var_out)

Cumulative variation plot

Make cumulative variation plot with PCA_cumulative_var_plot()

PCA_cumulative_var_plot(var_out)

PCA Scatterplots

The object created above var_out indicates how much variation is explained by each of the Principal components. This information is often added to the axes of scatterplots of PCA output.

head(var_out)
##     PC  var_raw var_percent cumulative_percent
## PC1  1 7.251458       2.749              2.749
## PC2  2 6.000955       1.882              4.631
## PC3  3 5.456191       1.556              6.187
## PC4  4 4.781130       1.195              7.382
## PC5  5 4.624231       1.118              8.500
## PC6  6 4.427505       1.025              9.525

Plot PC1 versus PC2

Plot the scores, with super-population color-coded

# make color and shape = "super_pop"
ggpubr::ggscatter(data = vcf_pca_scores2,
                  y = "PC2",
                  x = "PC1",
              color = "super_pop",   # TODO
              shape = "super_pop",   # TODO
              main = "PCA Scatterplot",
         ylab = "PC2 (1.882% of variation)",
         xlab = "PC1 (2.749% of variation)")

Plot PC2 versus PC3

Plot the scores, with super population color-coded

# make color and shape = "super_pop"
ggpubr::ggscatter(data = vcf_pca_scores2,
                  y = "PC3",
                  x = "PC2",
                  color = "super_pop",   # todo
                  shape = "super_pop",   # todo
                  main = "PCA Scatterplot",
          ylab = "PC3 (1.556% of variation)",
          xlab = "PC2 (1.882% of variation)")

Plot PC1 versus PC3

Plot the scores, with super population color-coded

ggpubr::ggscatter(data = vcf_pca_scores2,
                  y = "PC3",
                  x = "PC1",
                  ellipse = T,
            color = "super_pop",   
            shape = "super_pop",   
            main = "PCA Scatterplot",
      ylab = "PC3 (1.556% of variation)",
      xlab = "PC1 (2.749% of variation)")

3D scatterplot

The first 3 principal components can be presented as a 3D scatterplot.

colors_use <- as.numeric(vcf_pca_scores2$super_pop)
scatterplot3d(x = vcf_pca_scores2$PC1,
              y = vcf_pca_scores2$PC2,
              z = vcf_pca_scores2$PC3,
  color = colors_use,
              xlab = "PC1 (2.749%)",
              ylab = "PC2 (1.882%)",
              zlab = "PC3 (1.556%)")