BIOSC 1540 Final Project Workflow

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

Prepare the R session

Load necessary R packages

library(vcfR)

## 
##    *****       ***   vcfR   ***       *****
##    This is vcfR 1.13.0 
##      browseVignettes('vcfR') # Documentation
##      citation('vcfR') # Citation
##    *****       *****      *****       *****

library(vegan)

## Loading required package: permute

## Loading required package: lattice

## This is vegan 2.6-4

library(ggplot2)
library(ggpubr)
library(scatterplot3d)

Confirm the working directory and location of files

#Set the working directory
setwd("~/Desktop/Computational Biology/Final Project/My_SNPs")

#Get the working directory
getwd()

## [1] "/Users/madelinefontana/Desktop/Computational Biology/Final Project/My_SNPs"

#List the files in working directory
list.files(pattern="vcf")

## [1] "10.15015968-15255968.ALL.chr10_GRCh38.genotypes.20170504.vcf"   
## [2] "10.15015968-15255968.ALL.chr10_GRCh38.genotypes.20170504.vcf.gz"
## [3] "vcf_num_df.csv"                                                 
## [4] "vcf_num_df2.csv"                                                
## [5] "vcf_num.csv"

Set SNP data up for R

Load the vcf data

my_vcf <- "10.15015968-15255968.ALL.chr10_GRCh38.genotypes.20170504.vcf.gz"

Load the vcf file

#read the compressed VCF named in my_vcf; TRUE is spelled out because T is an ordinary variable that can be reassigned
vcf <- vcfR::read.vcfR(my_vcf, convertNA = TRUE)

## Scanning file to determine attributes.
## File attributes:
##   meta lines: 130
##   header_line: 131
##   variant count: 8065
##   column count: 2513
## 
Meta line 130 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
##   Character matrix gt rows: 8065
##   Character matrix gt cols: 2513
##   skip: 0
##   nrows: 8065
##   row_num: 0
## 
Processed variant 1000
Processed variant 2000
Processed variant 3000
Processed variant 4000
Processed variant 5000
Processed variant 6000
Processed variant 7000
Processed variant 8000
Processed variant: 8065
## All variants processed

Convert raw VCF file to genotype scores

#Get the genotype score (allele counts)
#rows are variants and columns are samples at this point (transposed later)
#NOTE(review): as.numeric = TRUE converts each GT string to a number - confirm
#the result is the intended allele count for this file's genotype encoding
vcf_num <- vcfR::extract.gt(vcf, element = "GT",
                            IDtoRowNames = FALSE, as.numeric = TRUE,
                            convertNA = TRUE)

Save the csv

#save the genotype matrix without row names (FALSE spelled out; F can be reassigned)
write.csv(vcf_num, file = "vcf_num.csv", row.names = FALSE)

Confirm the presence of the file

list.files()

##  [1] "10.15015968-15255968.ALL.chr10_GRCh38.genotypes.20170504.vcf"   
##  [2] "10.15015968-15255968.ALL.chr10_GRCh38.genotypes.20170504.vcf.gz"
##  [3] "1000genomes_people_info2-1.csv"                                 
##  [4] "1540_final_project_Final_Report_template.pdf"                   
##  [5] "1540_final_report_flowchart.pdf"                                
##  [6] "1540_week14_PCA_SNP_workflow.pdf"                               
##  [7] "final_project_workflow.html"                                    
##  [8] "final_project_workflow.Rmd"                                     
##  [9] "final_report_template.Rmd"                                      
## [10] "gwas_pheno_env.csv"                                             
## [11] "load_VCF_data.Rmd"                                              
## [12] "My_SNPs.Rproj"                                                  
## [13] "pheno.csv"                                                      
## [14] "rsconnect"                                                      
## [15] "vcf_num_df.csv"                                                 
## [16] "vcf_num_df2.csv"                                                
## [17] "vcf_num.csv"

Transpose the original VCF orientation to R dataframe orientation

vcf_num_t <- t(vcf_num)

Make into a dataframe

vcf_num_df <- data.frame(vcf_num_t)

Get person (sample) names

sample <- row.names(vcf_num_df)

Add sample information to the dataframe

vcf_num_df <- data.frame(sample, vcf_num_df)

Check the working directory

getwd()

## [1] "/Users/madelinefontana/Desktop/Computational Biology/Final Project/My_SNPs"

Save the csv

#save the sample-by-SNP dataframe without row names (FALSE spelled out; F can be reassigned)
write.csv(vcf_num_df, file = "vcf_num_df.csv", row.names = FALSE)

Confirm the presence of the file

list.files()

##  [1] "10.15015968-15255968.ALL.chr10_GRCh38.genotypes.20170504.vcf"   
##  [2] "10.15015968-15255968.ALL.chr10_GRCh38.genotypes.20170504.vcf.gz"
##  [3] "1000genomes_people_info2-1.csv"                                 
##  [4] "1540_final_project_Final_Report_template.pdf"                   
##  [5] "1540_final_report_flowchart.pdf"                                
##  [6] "1540_week14_PCA_SNP_workflow.pdf"                               
##  [7] "final_project_workflow.html"                                    
##  [8] "final_project_workflow.Rmd"                                     
##  [9] "final_report_template.Rmd"                                      
## [10] "gwas_pheno_env.csv"                                             
## [11] "load_VCF_data.Rmd"                                              
## [12] "My_SNPs.Rproj"                                                  
## [13] "pheno.csv"                                                      
## [14] "rsconnect"                                                      
## [15] "vcf_num_df.csv"                                                 
## [16] "vcf_num_df2.csv"                                                
## [17] "vcf_num.csv"

Clean data

Merge data with population meta data

# Load population meta data
pop_meta <- read.csv(file = "1000genomes_people_info2-1.csv")

Merge meta data with SNP data

#Make sure the column "sample" appears in the meta data and SNP data
names(pop_meta)

## [1] "pop"       "super_pop" "sample"    "sex"       "lat"       "lng"

names(vcf_num_df)[1:10]

##  [1] "sample" "X1"     "X2"     "X3"     "X4"     "X5"     "X6"     "X7"    
##  [9] "X8"     "X9"

Merge the two sets of data

vcf_num_df2 <- merge(pop_meta, vcf_num_df, by = "sample")

Check the dimensions before and after the merge

nrow(vcf_num_df) == nrow(vcf_num_df2)

## [1] TRUE

Check the names of the new dataframe

names(vcf_num_df2)[1:15]

##  [1] "sample"    "pop"       "super_pop" "sex"       "lat"       "lng"      
##  [7] "X1"        "X2"        "X3"        "X4"        "X5"        "X6"       
## [13] "X7"        "X8"        "X9"

Check working directory

getwd()

## [1] "/Users/madelinefontana/Desktop/Computational Biology/Final Project/My_SNPs"

Save the csv

#save the merged metadata + SNP dataframe without row names (FALSE spelled out; F can be reassigned)
write.csv(vcf_num_df2, file = "vcf_num_df2.csv", row.names = FALSE)

Confirm presence of file

list.files()

##  [1] "10.15015968-15255968.ALL.chr10_GRCh38.genotypes.20170504.vcf"   
##  [2] "10.15015968-15255968.ALL.chr10_GRCh38.genotypes.20170504.vcf.gz"
##  [3] "1000genomes_people_info2-1.csv"                                 
##  [4] "1540_final_project_Final_Report_template.pdf"                   
##  [5] "1540_final_report_flowchart.pdf"                                
##  [6] "1540_week14_PCA_SNP_workflow.pdf"                               
##  [7] "final_project_workflow.html"                                    
##  [8] "final_project_workflow.Rmd"                                     
##  [9] "final_report_template.Rmd"                                      
## [10] "gwas_pheno_env.csv"                                             
## [11] "load_VCF_data.Rmd"                                              
## [12] "My_SNPs.Rproj"                                                  
## [13] "pheno.csv"                                                      
## [14] "rsconnect"                                                      
## [15] "vcf_num_df.csv"                                                 
## [16] "vcf_num_df2.csv"                                                
## [17] "vcf_num.csv"

Omit invariant features

#Load and run invar_omit() function 

# invar_omit(): drop invariant (zero standard deviation) columns.
#
# x       a numeric data frame or matrix, one feature per column
# returns x without the columns whose standard deviation (ignoring NAs) is 0;
#         the result keeps its two-dimensional shape even when only one
#         column is left
# Prints the input dimensions and the number of columns removed.
invar_omit <- function(x){
  cat("Datafrane of dim", dim(x), "processed...\n")

  # column-wise standard deviation, ignoring missing values
  sds <- apply(x, 2, sd, na.rm = TRUE)

  # which() skips NA standard deviations, so such columns are kept
  i_var0 <- which(sds == 0)

  cat(length(i_var0), "columns removed\n")

  if(length(i_var0) > 0){
    # drop = FALSE stops a single surviving column collapsing to a vector
    x <- x[, -i_var0, drop = FALSE]
  }
  return(x)
}

warning("add return(x_no_invar) if it is missing")

## Warning: add return(x_no_invar) if it is missing

Omit invariants

#Check which columns have character data
names(vcf_num_df2)[1:10]

##  [1] "sample"    "pop"       "super_pop" "sex"       "lat"       "lng"      
##  [7] "X1"        "X2"        "X3"        "X4"

#new dataframe to store output
#Assigning the narrower result of invar_omit() back into
#vcf_noinvar[,-c(1:6)] leaves the dataframe at its original width, so no
#column is actually dropped. Build the dataframe from the 6 metadata columns
#plus the variable SNP columns instead.

#run invar_omit() on numeric data
vcf_noinvar <- data.frame(vcf_num_df2[, 1:6],
                          invar_omit(vcf_num_df2[, -c(1:6)]))

## Datafrane of dim 2504 8065 processed...
## 1891 columns removed

Create an object to store the number of invariant columns removed

#1891 columns removed
my_meta_N_invar_cols <- 1891

Remove low-quality data

#Load find_NAs()
find_NAs <- function(x){
  NAs_TF <- is.na(x)
  i_NA <- which(NAs_TF == TRUE)
  N_NA <- length(i_NA)
  
  return(i_NA)
}

For loop to search for NAs

#N_rows - number of rows (individuals)
N_rows <- nrow(vcf_noinvar)

#N_NA - vector to hold output (number of NAs per row), initialised to 0
N_NA <- rep(x=0, times=N_rows)

#N_SNPs - total number of columns in vcf_noinvar
#NOTE(review): vcf_noinvar still holds the 6 metadata columns (sample, pop,
#super_pop, sex, lat, lng), so this is 6 more than the number of SNPs -
#confirm whether the NA percentages should be based on SNP columns only
N_SNPs <- ncol(vcf_noinvar)

cat("This may take a minute...")

## This may take a minute...

#count the NAs in every row (individual) in one vectorised step
#is.na() flags each missing cell and rowSums() totals the flags per row; this
#gives the same counts as calling find_NAs() on one row at a time, without
#the slow row-by-row dataframe indexing
#unname() keeps N_NA a plain numeric vector with one entry per row
N_NA <- unname(rowSums(is.na(vcf_noinvar)))

warning("If this did not work, you may be using the wrong name for your dataframe")

## Warning: If this did not work, you may be using the wrong name for your
## dataframe

Check if any row has >50% NAs

#cutoff50 - half the column count; not referenced again in this document
cutoff50 <- N_SNPs*0.5
#percent_NA - NAs in each row as a percentage of N_SNPs, one value per row
percent_NA <- N_NA/N_SNPs*100
#TRUE if at least one row is more than 50% NA
any(percent_NA>50)

## [1] FALSE

Average number of NAs per row and save the mean percent

mean(percent_NA)

## [1] 0

my_meta_N_meanNA_rows <- mean(percent_NA)

Imputation of NAs

#Load the imputation function

# mean_imputation(): replace every NA with the mean of its column.
#
# df      a numeric dataframe (or matrix), one feature per column
# returns df with each NA replaced by the mean of the non-missing values in
#         the same column; a column with no non-missing values receives NaN
# Prints a progress note before starting.
mean_imputation <- function(df){
  cat("This may take some time...")

  # seq_len() gives an empty sequence for zero columns, so the loop is skipped
  # instead of trying to read a column that does not exist
  for(i in seq_len(ncol(df))){
    #get the current column
    column_i <- df[,i]

    #get the mean of the current column
    mean_i <- mean(column_i, na.rm = TRUE)

    #get the NAs in the current column
    NAs_i <- which(is.na(column_i))

    #replace the NAs in the current column
    column_i[NAs_i] <- mean_i

    #replace the original column with the updated column
    df[,i] <- column_i
  }

  return(df)
}

Run the imputation function

names(vcf_noinvar)[1:10]

##  [1] "sample"    "pop"       "super_pop" "sex"       "lat"       "lng"      
##  [7] "X1"        "X2"        "X3"        "X4"

#new copy of the data
vcf_noNA <- vcf_noinvar
#impute only the SNP columns; columns 1-6 (sample, pop, super_pop, sex, lat, lng) are metadata and are left as they are
vcf_noNA[,-c(1:6)] <- mean_imputation(vcf_noinvar[,-c(1:6)])

## This may take some time...

Prepare for PCA

Scale the data

#standard scaling

#new copy of data
vcf_scaled <- vcf_noNA

#scale
#scale() with its defaults centres each SNP column on its mean and divides by its standard deviation; metadata columns 1-6 are skipped
vcf_scaled[,-c(1:6)] <- scale(vcf_noNA[,-c(1:6)])

Run the PCA

#PCA on the SNP columns only (metadata columns 1-6 excluded); the columns were already scaled in the previous step
vcf_pca <- prcomp(vcf_scaled[,-c(1:6)])

PCA diagnostics

Examine the default scree plot

#the default scree plot provides no guidance on how many PCs to retain
screeplot(vcf_pca)

Calculate explained variation

#Load PCA variation function
# PCA_variation(): percentage of variation explained by the leading PCs.
#
# pca_summary  an object with an $importance matrix whose 2nd row holds the
#              proportion of variance per PC (as produced by summary() of a
#              prcomp fit)
# PCs          how many leading PCs to report; values larger than the number
#              of PCs available are reduced to that number
# returns      numeric vector of percentages, rounded to 3 decimal places
PCA_variation <- function(pca_summary, PCs = 2){
  # never ask for more PCs than the summary holds (or fewer than none)
  n_available <- ncol(pca_summary$importance)
  n_keep <- max(0, min(PCs, n_available))

  # seq_len() avoids the 1:0 trap, which would silently return PC1
  var_explained <- pca_summary$importance[2, seq_len(n_keep)]*100
  var_explained <- round(var_explained, 3)
  return(var_explained)
}

#Extract PCA variation data and calculate percentage variation

Get summary information

vcf_pca_summary <- summary(vcf_pca)

Extract raw variation data considering >100 PCs

var_out <- PCA_variation(vcf_pca_summary, PCs = 500)

Calculate the cut off for the rule of thumb

#Note: N_columns is the dimension of the dataframe that was used in the PCA

#number of dimensions in the data
#the PCA was run on vcf_scaled[,-c(1:6)], so the 6 metadata columns are
#left out of the count here as well
N_columns <- ncol(vcf_scaled[,-c(1:6)])

#the value of the cutoff
#rule of thumb: the percentage each PC would explain if all explained equally
cut_off <- 1/N_columns*100

Calculate the Number of PCs which exceed the cut off

#which values are below the cutoff
below_cut_off <- which(var_out < cut_off)

#what is the first value below the cutoff
#min() of an empty vector warns and returns Inf, so the "none found" case is
#handled explicitly; Inf is kept as the marker for "no PC below the cutoff"
if (length(below_cut_off) > 0) {
  i_cut_off <- min(below_cut_off)
} else {
  message("None of the ", length(var_out),
          " PCs examined falls below the cutoff")
  i_cut_off <- Inf
}

## Warning in min(i_cut_off): no non-missing arguments to min; returning Inf

Save the first value below the cutoff

my_meta_N_meanNA_rowsPCs <- i_cut_off

Extract the amount of variation explained by the first 3 PCs

my_meta_var_PC123 <- var_out[c(1,2,3)]

Plot percentage variation

#make a barplot
#barplot() returns the x position of each bar's midpoint; the bars are not
#centred on 1, 2, 3, ... so the midpoints are needed to place the vertical line
bar_mids <- barplot(var_out, main = "Percent variation (%) Scree plot",
                    ylab = "Percent variation (%) explained",
                    names.arg = seq_along(var_out))

#horizontal red line: the rule-of-thumb cutoff
abline(h = cut_off, col = 2, lwd = 2)

#vertical black line: the first PC below the cutoff (skipped when there is none)
if (is.finite(i_cut_off)) {
  abline(v = bar_mids[i_cut_off])
}

#legend entries follow the colours above: red (2) is horizontal, black (1) is vertical
legend("topright", col = c(2,1), lty = c(1,1),
       legend = c("Horizontal line: cutoff",
                  "Vertical line: 1st value below cut off"))

Plot cumulative percentage variation

cumulative_variation <- cumsum(var_out)
plot(cumulative_variation, type = "l")

Plot PCA results

Calculate and get the scores

#call vegan::scores()
vcf_pca_scores <- vegan::scores(vcf_pca)

#Combine the scores with the species information into a dataframe

#call data.frame()
vcf_pca_scores2 <- data.frame(super_pop = vcf_noNA$super_pop,
                              vcf_pca_scores)

Look at the information on the variation explained by the PCs

my_meta_var_PC123[1]

##   PC1 
## 2.337

my_meta_var_PC123[2]

##   PC2 
## 1.873

my_meta_var_PC123[3]

##   PC3 
## 1.383

Plot the results

#plot PC1 versus PC2
#plot the scores with super_pop color coded

#make color and shape = "super_pop"
#axis labels take their percentages from my_meta_var_PC123 so they always
#match the PCA that was actually run
ggpubr::ggscatter(data = vcf_pca_scores2, y = "PC2", x = "PC1",
                  color = "super_pop", shape = "super_pop",
                  main = "PCA Scatterplot",
                  xlab = sprintf("PC1 (%.1f%% of variation)",
                                 my_meta_var_PC123[[1]]),
                  ylab = sprintf("PC2 (%.1f%% of variation)",
                                 my_meta_var_PC123[[2]]))

#Note how in the plot the amount of variation explained by each PC is shown in the axis labels

Plot the scores with super_pop color-coded for PC2 versus PC3

#make color and shape = "super_pop"
#axis labels take their percentages from my_meta_var_PC123 so they always
#match the PCA that was actually run
ggpubr::ggscatter(data = vcf_pca_scores2, y = "PC3", x = "PC2",
                  color = "super_pop", shape = "super_pop",
                  main = "PCA Scatterplot",
                  xlab = sprintf("PC2 (%.1f%% of variation)",
                                 my_meta_var_PC123[[2]]),
                  ylab = sprintf("PC3 (%.1f%% of variation)",
                                 my_meta_var_PC123[[3]]))

#Note how in the plot the amount of variation explained by each PC is shown in the axis labels

Plot PC1 versus PC3 with super_pop color-coded

#make color and shape = "super_pop"
#axis labels take their percentages from my_meta_var_PC123 so they always
#match the PCA that was actually run
ggpubr::ggscatter(data = vcf_pca_scores2, y = "PC3", x = "PC1",
                  color = "super_pop", shape = "super_pop",
                  main = "PCA Scatterplot",
                  xlab = sprintf("PC1 (%.1f%% of variation)",
                                 my_meta_var_PC123[[1]]),
                  ylab = sprintf("PC3 (%.1f%% of variation)",
                                 my_meta_var_PC123[[3]]))

#Note how in the plot the amount of variation explained by each PC is shown in the axis labels

3D Scatterplot

The first 3 principal components can be presented as a 3D scatterplot.

#3D scatterplot of the scores on the first three PCs
#axis labels take their percentages from my_meta_var_PC123 so they always
#match the PCA that was actually run
scatterplot3d(x = vcf_pca_scores2$PC1,
              y = vcf_pca_scores2$PC2,
              z = vcf_pca_scores2$PC3,
              xlab = sprintf("PC1 (%.1f%%)", my_meta_var_PC123[[1]]),
              ylab = sprintf("PC2 (%.1f%%)", my_meta_var_PC123[[2]]),
              zlab = sprintf("PC3 (%.1f%%)", my_meta_var_PC123[[3]]))

warning("Be sure to update the amount of variation explained by the PCs")

## Warning: Be sure to update the amount of variation explained by the PCs

BIOSC 1540 Final Project Workflow

Madeline Fontana

2022-12-13

R Markdown

Prepare the R session

Load necessary R packages

Confirm the working directory and location of files

Set SNP data up for R

Load the vcf data

Load the vcf file

Convert raw VCF file to genotype scores

Save the csv

Confirm the presence of the file

Transpose the original VCF orientation to R dataframe orientation

Make into a dataframe

Get person (sample) names

Add sample information to the dataframe

Check the working directory

Save the csv

Confirm the presence of the file

Clean data

Merge data with population meta data

Merge meta data with SNP data

Merge the two sets of data

Check the dimensions before and after the merge

Check the names of the new dataframe

Check working directory

Save the csv

Confirm presence of file

Omit invariant features

Omit invariants

Create an object to store the number of invariant columns removed

Remove low-quality data

For loop to search for NAs

Check if any row has >50% NAs

Average number of NAs per row and save the mean percent

Imputation of NAs

Run the imputation function

Prepare for PCA

Scale the data

Run the PCA

PCA diagnostics

Examine the default scree plot

Calculate explained variation

Get summary information

Extract raw variation data considering >100 PCs

Calculate the cut off for the rule of thumb

Calculate the Number of PCs which exceed the cut off

Save the first value below the cutoff

Extract the amount of variation explained by the first 3 PCs

Plot percentage variation

Plot cumulative percentage variation

Plot PCA results

Calculate and get the scores

Look at the information on the variation explained by the PCs

Plot the results

Plot the scores with super_pop color-coded for PC2 versus PC3

Plot PC1 versus PC3 with super_pop color-coded

3D Scatterplot