Final Project

Preliminaries

Load the vcfR and other packages with library().

library(vcfR)

## Warning: package 'vcfR' was built under R version 4.2.2

## 
##    *****       ***   vcfR   ***       *****
##    This is vcfR 1.13.0 
##      browseVignettes('vcfR') # Documentation
##      citation('vcfR') # Citation
##    *****       *****      *****       *****

library(vegan)

## Loading required package: permute

## Loading required package: lattice

## This is vegan 2.6-2

library(ggplot2)
library(ggpubr)
library(vegan)

Make sure that your working directory is set to the location of the vcf file.

getwd()

## [1] "C:/Users/Chana/Documents/Pitt/Fall_2022/comp bio/final_project"

list.files(pattern = "vcf")

## [1] "7.27819601-28059601.ALL.chr7_GRCh38.genotypes.20170504.vcf.gz"
## [2] "vcf_scaled.csv"

Data preparation

Load Data

Load the snp data from the vcf file into an R object.

vcf_file <- "7.27819601-28059601.ALL.chr7_GRCh38.genotypes.20170504.vcf.gz"
vcf <- vcfR::read.vcfR(vcf_file, convertNA  = TRUE)

## Scanning file to determine attributes.
## File attributes:
##   meta lines: 130
##   header_line: 131
##   variant count: 7097
##   column count: 2513
## 
Meta line 130 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
##   Character matrix gt rows: 7097
##   Character matrix gt cols: 2513
##   skip: 0
##   nrows: 7097
##   row_num: 0
## 
Processed variant 1000
Processed variant 2000
Processed variant 3000
Processed variant 4000
Processed variant 5000
Processed variant 6000
Processed variant 7000
Processed variant: 7097
## All variants processed

Extract Genotype

The vcf object contains lots of data about the snps. Extract the numeric genotype data and save it as a matrix.

vcf_num <- vcfR::extract.gt(vcf, 
           element = "GT",
           IDtoRowNames  = F,
           as.numeric = T,
           convertNA = T,
           return.alleles = F)

Transpose

Transpose the object so that the snps, which are the features, become columns.

vcf_num_t <- t(vcf_num)

Turn the vcf matrix into a dataframe so that it is easier to work with.

vcf_num_df <- data.frame(vcf_num_t)

Merge Data

Load population metadata.

population_metadata <- read.csv(file = "1000genomes_people_info.csv")
sample <- row.names(vcf_num_df)
vcf_sample_df <- data.frame(sample, vcf_num_df)

Make sure that the column “sample” appears in both the metadata and the snp data.

names(population_metadata)

## [1] "pop"       "super_pop" "sample"    "sex"       "lat"       "lng"

names(vcf_sample_df)[1:10]

##  [1] "sample" "X1"     "X2"     "X3"     "X4"     "X5"     "X6"     "X7"    
##  [9] "X8"     "X9"

vcf_num_df2 <- merge(population_metadata,
                     vcf_sample_df,
                     by = "sample")

Check that the dimensions are the same.

nrow(vcf_num_df) == nrow(vcf_num_df2)

## [1] TRUE

Check the names of the new dataframe.

names(vcf_num_df2)[1:15]

##  [1] "sample"    "pop"       "super_pop" "sex"       "lat"       "lng"      
##  [7] "X1"        "X2"        "X3"        "X4"        "X5"        "X6"       
## [13] "X7"        "X8"        "X9"

Omit Ivariant Columns

invar_omit function

invar_omit <- function(x){
  cat("Dataframe of dim",dim(x), "processed...\n")
  sds <- apply(x, 2, sd, na.rm = TRUE)
  i_var0 <- which(sds == 0)
 
  
  cat(length(i_var0),"columns removed\n")
  
  if(length(i_var0) > 0){
     x <- x[, -i_var0]
  }
  
  return(x)                      
}

vcf_no_invar <- vcf_num_df2[, -c(1:6)]

vcf_no_invar<- invar_omit(vcf_no_invar)

## Dataframe of dim 2504 7097 processed...
## 1719 columns removed

vcf_no_invar <- data.frame(vcf_num_df2[, c(1:6)],
                           vcf_no_invar)

meta_num_invar_cols <- 1719

Find NAs

A function that takes a row of data and returns a vector of indexes where that row contains NAs. This is useful for determining the number of NAs in each row.

find_NAs <- function(x){
  NAs_TF <- is.na(x)
  i_NA <- which(NAs_TF == TRUE)
  N_NA <- length(i_NA)
  return(i_NA)
}

A loop that iterates through the rows of the snps dataframe and saves the number of NAs in each row. This helps determine which rows contain >50% NAs.

# N_rows
# number of rows (individuals)
N_rows <- nrow(vcf_no_invar)

# N_NA
# vector to hold output (number of NAs)
N_NA   <- rep(x = 0, times = N_rows)

# N_SNPs
# total number of columns (SNPs)
N_SNPs <- ncol(vcf_no_invar)

# the for() loop
for(i in 1:N_rows){
  
  # for each row, find the location of
  ## NAs with snps_num_t()
  i_NA <- find_NAs(vcf_no_invar[i,]) 
  
  # then determine how many NAs
  ## with length()
  N_NA_i <- length(i_NA)
  
  # then save the output to 
  ## our storage vector
  N_NA[i] <- N_NA_i
}

Set cutoff50 to half of the number of snps, because if more than this number of snps in a row are null then the row will be deleted.

# 50% of N_SNPs
cutoff50 <- N_SNPs*0.5
percent_NA <- N_NA/N_SNPs*100
any(percent_NA > 50)

## [1] FALSE

Get the average number of NAs per row.

mean(percent_NA)

## [1] 0.0005934042

mean_NAs_per_row <- mean(percent_NA)

NA Imputation

Use mean imputation on the any NAs.

mean_imputation <- function(df){
  
  cat("This may take some time...")
  n_cols <- ncol(df)
  
  for(i in 1:n_cols){
    
    # get the current column
    column_i <- df[, i]
    
    # get the mean of the current column
    mean_i <- mean(column_i, na.rm = TRUE)
    
    # get the NAs in the current column
    NAs_i <- which(is.na(column_i))
    
    # record the number of NAs
    N_NAs <- length(NAs_i)
  
    # replace the NAs in the current column
    column_i[NAs_i] <- mean_i
    
    # replace the original column with the
    ## updated columns
    df[, i] <- column_i
    
  }
  
  return(df)
}

Run the mean imputation function on numeric columns.

names(vcf_no_invar)[1:10]

##  [1] "sample"    "pop"       "super_pop" "sex"       "lat"       "lng"      
##  [7] "X2"        "X4"        "X5"        "X6"

vcf_no_NA <- vcf_no_invar
vcf_no_NA[,-c(1:6)] <- mean_imputation(vcf_no_invar[,-c(1:6)])

## This may take some time...

Data analysis

Scale Data

Use the scale function to scale and center the data.

vcf_scaled <- vcf_no_NA
vcf_scaled[,-c(1:6)] <- scale(vcf_no_NA[,-c(1:6)])

Save the scaled data as a csv file.

getwd()

## [1] "C:/Users/Chana/Documents/Pitt/Fall_2022/comp bio/final_project"

write.csv(vcf_scaled, file = "vcf_scaled.csv", row.names = F)
list.files()

##  [1] "1000genomes_people_info.csv"                                  
##  [2] "1540_final_project_Final_Report_template.pdf"                 
##  [3] "1540_week14_PCA_SNP_workflow.pdf"                             
##  [4] "7.27819601-28059601.ALL.chr7_GRCh38.genotypes.20170504.vcf.gz"
##  [5] "final_project.Rmd"                                            
##  [6] "final_project.Rproj"                                          
##  [7] "final_report_template.docx"                                   
##  [8] "final_report_template.html"                                   
##  [9] "final_report_template.Rmd"                                    
## [10] "head(snps).png"                                               
## [11] "rsconnect"                                                    
## [12] "vcf_scaled.csv"

Run PCA

Use prcomp to run PCA on the snp data.

vcf_pca <- prcomp(vcf_scaled[,-c(1:6)])

Create Screeplot

Make a screeplot to determine which PCs make sense to analyze based on their variances.

screeplot(vcf_pca, 
          ylab  = "Relative importance",
          main = "SNP Screeplot")

PC1 contains far more variance than any of the other PCs, and would therefore be the most informative to analyze.

PC Variance Percentages

Determine the percentage of variance that the first 10 PCs describe and plot them with a barchart. Add a horizontal line which is 100 divided by the number of columns, which is what the percent variation of all the PCs would be if they were equal.

PCA_variation <- function(pca_summary, PCs = 2){
  var_explained <- pca_summary$importance[2,1:PCs]*100
  var_explained <- round(var_explained,1)
  return(var_explained)
}

vcf_pca_summary <- summary(vcf_pca)
var_out <- PCA_variation(vcf_pca_summary,PCs = 500)

N_columns <- ncol(vcf_scaled)
cut_off <- 1/N_columns * 100
i_cut_off <- which(var_out < cut_off)
i_cut_off <- min(i_cut_off)
meta_num_mean_NAs_rows_PCs <- i_cut_off
meta_var_PC123 <- var_out[c(1,2,3)]

PC1 contains most of the variance.

Plot the percentage variation.

barplot(var_out,
        main = "Percent variation (%) Scree plot",
        ylab = "Percent variation (%) explained",
        names.arg = 1:length(var_out))
        
abline(h = cut_off, col = 2, lwd = 2)
abline(v = i_cut_off)
legend("topright",
       col = c(2,1),
       lty = c(1,1),
       legend = c("Vertical line: cutoff",
                  "Horizontal line: 1st value below cut off"))

Plot the cumulative variation.

cumulative_variation <- cumsum(var_out)
plot(cumulative_variation, type = "l")

Plot PCA Results

Get the scores of each PC.

vcf_pca_scores <- vegan::scores(vcf_pca)

Combine the scores with the species info into a dataframe.

vcf_pca_scores2 <- data.frame(super_pop = vcf_no_NA$super_pop,
                              vcf_pca_scores)

Look at variation info explained by PCs.

meta_var_PC123[1]

## PC1 
## 1.8

meta_var_PC123[2]

## PC2 
## 1.5

meta_var_PC123[3]

## PC3 
## 1.3

##Plots

PC1 vs PC2

ggpubr::ggscatter(data = vcf_pca_scores2,
                  y = "PC2",
                  x = "PC1",
                  color = "super_pop",
                  shape = "super_pop",
                  main = "PCA Scatterplot",
                  xlab = "PC1 (1.8% of variation)",
                  ylab = "PC2 (1.5% of variation)")

PC2 vs PC3

ggpubr::ggscatter(data = vcf_pca_scores2,
                  y = "PC3",
                  x = "PC2",
                  color = "super_pop",
                  shape = "super_pop",
                  main = "PCA Scatterplot",
                  xlab = "PC2 (1.5% of variation)",
                  ylab = "PC3 (1.3% of variation)")

PC1 vs PC3

ggpubr::ggscatter(data = vcf_pca_scores2,
                  y = "PC3",
                  x = "PC1",
                  color = "super_pop",
                  shape = "super_pop",
                  main = "PCA Scatterplot",
                  xlab = "PC1 (1.8% of variation)",
                  ylab = "PC3 (1.3% of variation)")