Workflow

Load Packages

install.packages("vcfR", 
                 repos = "https://cloud.r-project.org")

## 
## The downloaded binary packages are in
##  /var/folders/11/cj8mtc016vb3l823pt5_774c0000gn/T//RtmpJkWvJ3/downloaded_packages

Load the vcfR package and other necessary packages using the library() function

library(vcfR)

## 
##    *****       ***   vcfR   ***       *****
##    This is vcfR 1.13.0 
##      browseVignettes('vcfR') # Documentation
##      citation('vcfR') # Citation
##    *****       *****      *****       *****

library(vegan)

## Loading required package: permute

## Loading required package: lattice

## This is vegan 2.6-4

library(ggplot2)
library(ggpubr)

Confirm working directory and location of files

getwd()

## [1] "/Users/madisondougherty/Desktop/R Final Project"

list.files(pattern = "vcf")

## [1] "5.37812600-38052600.ALL.chr5_GRCh38.genotypes.20170504.vcf.gz"
## [2] "vcf_num_df.csv"                                               
## [3] "vcf_num_df2.csv"                                              
## [4] "vcf_num.csv"                                                  
## [5] "vcf_scaled.csv"

Set Up SNP Data

Load the vcf Data

Load the vcf data using the read.vcfR() function

vcf <- vcfR::read.vcfR(file = "5.37812600-38052600.ALL.chr5_GRCh38.genotypes.20170504.vcf.gz", convertNA = T)

## Scanning file to determine attributes.
## File attributes:
##   meta lines: 130
##   header_line: 131
##   variant count: 7384
##   column count: 2513
## 
Meta line 130 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
##   Character matrix gt rows: 7384
##   Character matrix gt cols: 2513
##   skip: 0
##   nrows: 7384
##   row_num: 0
## 
Processed variant 1000
Processed variant 2000
Processed variant 3000
Processed variant 4000
Processed variant 5000
Processed variant 6000
Processed variant 7000
Processed variant: 7384
## All variants processed

Extract Genotype Scores

Convert raw vcf file to numeric genotype scores and store in the object vcf_num

vcf_num <- vcfR::extract.gt(vcf,
           element = "GT",
           IDtoRowNames = F,
           as.numeric = T,
           convertNA = T)

Save the New Data to a .csv File

Save the .csv and confirm the presence of the file

write.csv(vcf_num, file = "vcf_num.csv", row.names = F)
list.files(pattern = "csv")

## [1] "1000genomes_people_info2.csv" "vcf_num_df.csv"              
## [3] "vcf_num_df2.csv"              "vcf_num.csv"                 
## [5] "vcf_scaled.csv"

Rotate the Data

Transpose the original VCF orientation into R data frame orientation using the t() function

vcf_num_t <- t(vcf_num)

Convert the data into the data frame vcf_num_df using the data.frame() function

vcf_num_df <- data.frame(vcf_num_t)

Get person (sample) names and store in the object sample

sample <- row.names(vcf_num_df)

Add sample info to data frame

vcf_num_df <- data.frame(sample, vcf_num_df)

Check working directory

getwd()

## [1] "/Users/madisondougherty/Desktop/R Final Project"

Save the vcf_num_df data frame as a new .csv file and confirm presence of the file

write.csv(vcf_num_df,
          file = "vcf_num_df.csv",
          row.names = F)
list.files(pattern = "csv")

## [1] "1000genomes_people_info2.csv" "vcf_num_df.csv"              
## [3] "vcf_num_df2.csv"              "vcf_num.csv"                 
## [5] "vcf_scaled.csv"

Clean Data

Merge data with population meta data

#Load population meta data
pop_meta <- read.csv(file = "1000genomes_people_info2.csv")

#Make sure the column "sample" appears in the meta data and SNP data
names(pop_meta)

## [1] "pop"       "super_pop" "sample"    "sex"       "lat"       "lng"

names(vcf_num_df)[1:10]

##  [1] "sample" "X1"     "X2"     "X3"     "X4"     "X5"     "X6"     "X7"    
##  [9] "X8"     "X9"

#Merge the two sets of data
vcf_num_df2 <- merge(pop_meta,
                     vcf_num_df,
                     by = "sample")

Check the dimensions before and after the merge

nrow(vcf_num_df) == nrow(vcf_num_df2)

## [1] TRUE

Check the names of the new data frame

names(vcf_num_df2)[1:15]

##  [1] "sample"    "pop"       "super_pop" "sex"       "lat"       "lng"      
##  [7] "X1"        "X2"        "X3"        "X4"        "X5"        "X6"       
## [13] "X7"        "X8"        "X9"

Check working directory, save the csv, and confirm the presence of the file

getwd()

## [1] "/Users/madisondougherty/Desktop/R Final Project"

write.csv(vcf_num_df2, file = "vcf_num_df2.csv", row.names = F)
list.files(pattern = "csv")

## [1] "1000genomes_people_info2.csv" "vcf_num_df.csv"              
## [3] "vcf_num_df2.csv"              "vcf_num.csv"                 
## [5] "vcf_scaled.csv"

Omit invariant features

Load invar_omit() function created by Dr. Brouwer

invar_omit <- function(x){
  cat("Dataframe of dim", dim(x), "processed...\n")
  sds <- apply(x, 2, sd, na.rm = TRUE)
  i_var0 <- which(sds == 0)
  
  cat(length(i_var0), "columns removed\n")
  
  if(length(i_var0) > 0){
    x <- x[, -i_var0]
  }
  return(x)
}

Check which columns have character data

names(vcf_num_df2)[1:10]

##  [1] "sample"    "pop"       "super_pop" "sex"       "lat"       "lng"      
##  [7] "X1"        "X2"        "X3"        "X4"

Create new data frame to store the output and run invar_omit() on the data

vcf_noinvar <- vcf_num_df2
vcf_noinvar <- vcf_noinvar[, -c(1:6)]
vcf_noinvar <- invar_omit(vcf_noinvar)

## Dataframe of dim 2504 7384 processed...
## 1847 columns removed

vcf_noinvar <- data.frame(vcf_num_df2[, c(1:6)], vcf_noinvar)
dim(vcf_noinvar)

## [1] 2504 5543

Create an object to store the number of invariant columns removed

my_meta_N_invar_cols <- 1847

Remove low-quality data

Load find_NAs() function created by Dr. Brouwer

find_NAs <- function(x){
  NAs_TF <- is.na(x)
  i_NA <- which(NAs_TF == TRUE)
  N_NA <- length(i_NA)
  
  return(i_NA)
}

Use a for() loop to search for NAs in each row

#N_rows is the number of rows (individuals)
N_rows <- nrow(vcf_noinvar)

#N_NA is a vector to hold output (number of NAs)
N_NA <- rep(x = 0, times = N_rows)

#N_SNPs is the total number of columns (SNPs)
N_SNPs <- ncol(vcf_noinvar)

#the for() loop
for(i in 1:N_rows){
  i_NA <- find_NAs(vcf_noinvar[i,]) #find the location of the NAs
  N_NA_i <- length(i_NA) #determine how many NAs
  N_NA[i] <- N_NA_i #save the output to our storage vector
}

Check if any row has >50% NAs

cutoff50 <- N_SNPs*0.5
percent_NA <- N_NA/N_SNPs*100
any(percent_NA > 50)

## [1] FALSE

Calculate the mean percent of NAs in each row

mean(percent_NA)

## [1] 0.0008285498

Save the mean percent of NAs per row

my_meta_N_meanNA_rows <- mean(percent_NA)

Imputation of NAs

Load the imputation function created by Dr. Brouwer

mean_imputation <- function(df){
  n_cols <- ncol(df)
  
  for(i in 1:n_cols){
    column_i <- df[, i] #get the current column
    mean_i <- mean(column_i, na.rm = TRUE) #get the mean of the current column
    NAs_i <- which(is.na(column_i)) #get the NAs in the current column
    N_NAs <- length(NAs_i) #report the number of NAs
    column_i[NAs_i] <- mean_i #replace the NAs in the current column
    df[, i] <- column_i #replace the original column with the updated columns
  }
  return(df)
}

Run the function on the data

names(vcf_noinvar)[1:10] #check for numeric data

##  [1] "sample"    "pop"       "super_pop" "sex"       "lat"       "lng"      
##  [7] "X1"        "X3"        "X4"        "X5"

#make a new copy of the data 
vcf_noNA <- vcf_noinvar 

#run the function and save the new data to the new object
vcf_noNA[, -c(1:6)] <- mean_imputation(vcf_noinvar[, -c(1:6)])

Scale the Data

Make a new copy of the data

vcf_scaled <- vcf_noNA

Scale the data with the scale() function and save the output to the new object

vcf_scaled[, -c(1:6)] <- scale(vcf_noNA[, -c(1:6)])

Save the cleaned data to a new .csv

Check working directory, save the data to a new .csv, and check for the presence of the file

getwd()

## [1] "/Users/madisondougherty/Desktop/R Final Project"

write.csv(vcf_scaled,
          file = "vcf_scaled.csv",
          row.names = F)
list.files(pattern = "csv")

## [1] "1000genomes_people_info2.csv" "vcf_num_df.csv"              
## [3] "vcf_num_df2.csv"              "vcf_num.csv"                 
## [5] "vcf_scaled.csv"