Load Necessary R Packages

library(vcfR)
## Warning: package 'vcfR' was built under R version 4.2.2
## 
##    *****       ***   vcfR   ***       *****
##    This is vcfR 1.13.0 
##      browseVignettes('vcfR') # Documentation
##      citation('vcfR') # Citation
##    *****       *****      *****       *****
library(vegan)
## Loading required package: permute
## Loading required package: lattice
## This is vegan 2.6-2
library(ggplot2)
library(ggpubr)

Confirm Working Directory and Files

We should have the proper working directory set so we can access our necessary files. Let’s check this using R.

getwd()
## [1] "C:/Users/adamf/OneDrive/Documents/Pitt/Year 4/Fall2022/BIOSC1540/BIOSC1540FinalProject"
list.files()
##  [1] "1000genomes_people_info2-1.csv" "BIOSC1540FinalProject.Rproj"   
##  [3] "cleaned_data.csv"               "Funk_Data_Preparation.html"    
##  [5] "Funk_Data_Preparation.Rmd"      "Funk_Final_Report.docx"        
##  [7] "Funk_Final_Report.html"         "Funk_Final_Report.Rmd"         
##  [9] "my_snps.vcf.gz"                 "rsconnect"                     
## [11] "vcf_num.csv"

Set Up our SNP Data

We need to read in our SNP data from the VCF file that we saved so we can begin analyzing and cleaning our data.

vcf <- vcfR::read.vcfR("my_snps.vcf.gz", convertNA = TRUE)
## Scanning file to determine attributes.
## File attributes:
##   meta lines: 130
##   header_line: 131
##   variant count: 6867
##   column count: 2513
## 
Meta line 130 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
##   Character matrix gt rows: 6867
##   Character matrix gt cols: 2513
##   skip: 0
##   nrows: 6867
##   row_num: 0
## 
Processed variant 1000
Processed variant 2000
Processed variant 3000
Processed variant 4000
Processed variant 5000
Processed variant 6000
Processed variant: 6867
## All variants processed

Make sure the data is in the proper folder. Our files should say it has processed 6867 variants - is this the case?

Convert Raw VCF File to Genotype Scores

We need to convert our SNP data from categorical to numeric data. This is a form a dimensional reduction and allows us to use the data in PCA and further analyses.

vcf_num <- vcfR::extract.gt(vcf,
                           element = "GT",
                           IDtoRowNames = F,
                           as.numeric = T,
                           convertNA = T)

Transpose Original VCF Orientation to R Dataframe

Since our data is given with SNPs in rows and individuals in columns, we need to transpose the matrix and convert our data to a dataframe so we can use it for further analysis.

vcf_num_t <- t(vcf_num)

Next, we need to convert the transposed matrix to a data frame.

vcf_num_df <- data.frame(vcf_num_t)

Finally, let’s add in our row names from the data frame as a column that can be removed.

sample <- row.names(vcf_num_df)
vcf_num_df <- data.frame(sample, vcf_num_df)

Clean the Data

Let’s use our newly added sample names and merge our prepared SNP data frame with the metadata on the populations.

pop_meta <- read.csv("1000genomes_people_info2-1.csv")

Let us confirm that both data frames contain a column that is representative of the sample.

names(pop_meta)
## [1] "pop"       "super_pop" "sample"    "sex"       "lat"       "lng"
names(vcf_num_df)[1:5]
## [1] "sample" "X1"     "X2"     "X3"     "X4"

Merge the two data sets.

vcf_num_df2 <- merge(pop_meta,
                     vcf_num_df,
                     by = "sample")

Let’s check the dimensions before and after merging - the number of rows should remain unchanged.

nrow(vcf_num_df) == nrow(vcf_num_df2)
## [1] TRUE

Finally, lets check the names of our merged data frame columns.

names(vcf_num_df2)[1:10]
##  [1] "sample"    "pop"       "super_pop" "sex"       "lat"       "lng"      
##  [7] "X1"        "X2"        "X3"        "X4"

Omit Invariant Data

We now need to omit any invariant data so we don’t have issues when scaling our data for PCA later. Let’s create a nifty function to help us with this.

invar_omit <- function(x){
  cat("Dataframe of dim",dim(x), "processed...\n")
  sds <- apply(x, 2, sd, na.rm = TRUE)
  i_var0 <- which(sds == 0)
 
  
  cat(length(i_var0),"columns removed\n")
  
  if(length(i_var0) > 0){
     x <- x[, -i_var0]
  }
  
  return(x)                      
}

Now we need to make sure we run this function only on columns with numeric data.

#make a new df to store our output
vcf_noinvar <- vcf_num_df2

vcf_noinvar <- vcf_noinvar[, -c(1:6)]

vcf_noinvar <- invar_omit(vcf_noinvar)
## Dataframe of dim 2504 6867 processed...
## 1791 columns removed
vcf_noinvar <- data.frame(vcf_num_df2[, c(1:6)], vcf_noinvar)
dim(vcf_noinvar)
## [1] 2504 5082
#lets save how many invariant columns were removed
my_meta_N_invar_cols <- 1791

Remove Low Quality Data

With sequencing and SNP data, if we have a poor quality of sample to sequence, it may skew our results or make it difficult to trust the results. We will first create a function to determine where NAs occur and then further process those missing values.

find_NAs <- function(x){
  NAs_TF <- is.na(x)
  i_NA <- which(NAs_TF == TRUE)
  N_NA <- length(i_NA)
  
  cat("Results:",N_NA, "NAs present\n.")
  return(i_NA)
}

Now we can use a for loop to use our search function to find NAs present in each row of our dataframe.

# N_rows
# number of rows (individuals)
N_rows <- nrow(vcf_noinvar)

# N_NA
# vector to hold output (number of NAs)
N_NA   <- rep(x = 0, times = N_rows)

# N_SNPs
# total number of columns (SNPs)
N_SNPs <- ncol(vcf_noinvar)

# the for() loop
for(i in 1:N_rows){
  
  # for each row, find the location of
  ## NAs with vcf_noinvar()
  i_NA <- find_NAs(vcf_noinvar[i,]) 
  
  # then determine how many NAs
  ## with length()
  N_NA_i <- length(i_NA)
  
  # then save the output to 
  ## our storage vector
  N_NA[i] <- N_NA_i
}

Now we can check if any rows have >50% NAs.

cutoff50 <- N_SNPs*0.5
percent_NA <- N_NA/N_SNPs*100
any(percent_NA > 50)
## [1] FALSE

Let’s check the average number of NAs per row.

mean(percent_NA)
## [1] 0

Wow! It’s practically 0! That’s awesome!

Imputation of NAs

It appears that NAs are very rare in this data set. However, we will still perform a mean imputation scheme just in case.

Let’s load our imputation function.

mean_imputation <- function(df){
  n_cols <- ncol(df)
  
  for(i in 1:n_cols){
    #get current column number
    column_i <- df[, i]
    
    #get the mean of the column
    mean_i <- mean(column_i, na.rm = TRUE)
    
    #get indices of NAs in column
    NAs_i <- which(is.na(column_i))
    
    #report number of NAs
    N_NAs <- length(NAs_i)
    
    #replace the NAs in the column with the mean
    column_i[NAs_i] <- mean_i
    
    #replace original columns with that of the df
    df[, i] <- column_i
  }
  
  return(df)
}

We will only run this function on numeric columns (you try taking the average of a categorical variable).

vcf_noNA <- vcf_noinvar
vcf_noNA[, -c(1:6)] <- mean_imputation(vcf_noinvar[, -c(1:6)])

Prepare for PCA

Now that we have cleaned and processed our data, we can scale our data so it can be used for PCA.

vcf_scaled <- vcf_noNA

#scale
vcf_scaled[, -c(1:6)] <- scale(vcf_noNA[, -c(1:6)])

Write Data

Now that we have cleaned and prepared our data for PCA, we will save it and reload it in the final report.

write.csv(vcf_scaled, file = "cleaned_data.csv", row.names = F)