Load the VCF Data

# my_vcf <- "16.33690720-33930720.ALL.chr16_GRCh38.genotypes.20170504.vcf"
# vcf <- vcfR::read.vcfR(my_vcf, convertNA = T)

Set Up SNP Data

# #Convert raw VCF file to genotype scores
# vcf_num <- vcfR::extract.gt(vcf,
#                             element = "GT",
#                             IDtoRowNames = F,
#                             as.numeric = T,
#                             convertNA = T)
# 
# #Save the csv and confirm presence
# write.csv(vcf_num, file = "vcf_num.csv", row.names = F)
# list.files()
# 
# #Transpose original VCF orientation to dataframe orientation
# vcf_num_t <- t(vcf_num)

#All data prior to here processed by Dr. Brouwer

#Load transposed data
snps_num_t <- read.csv(file="snps_num_t.csv")
dim(snps_num_t)

## [1] 2504 2001

#Make into a dataframe
vcf_num_df <- data.frame(snps_num_t)
dim(vcf_num_df)

## [1] 2504 2001

#Get person (sample) names
#sample <- row.names(snps_num_t)

#Add sample info to dataframe
#vcf_num_df <- data.frame(sample, vcf_num_df)

#Check working directory, save csv, and confirm presence of file
getwd()

## [1] "/Users/neha/Desktop/Computational Biology/my_snps"

write.csv(vcf_num_df, file = "vcf_num_df.csv", row.names = F)
list.files

## function (path = ".", pattern = NULL, all.files = FALSE, full.names = FALSE, 
##     recursive = FALSE, ignore.case = FALSE, include.dirs = FALSE, 
##     no.. = FALSE) 
## .Internal(list.files(path, pattern, all.files, full.names, recursive, 
##     ignore.case, include.dirs, no..))
## <bytecode: 0x7fccd2732540>
## <environment: namespace:base>

Clean Data

Merge data with population metadata

#Load population metadata
pop_meta <- read.csv(file = "1000genomes_people_info2-1.csv")
#Merge metadata with SNP data
names(pop_meta)

## [1] "pop"       "super_pop" "sample"    "sex"       "lat"       "lng"

names(vcf_num_df)[1:10]

##  [1] "sample" "X3811"  "X5736"  "X3141"  "X1804"  "X973"   "X4260"  "X1873" 
##  [9] "X6198"  "X1082"

dim(vcf_num_df)

## [1] 2504 2001

vcf_num_df2 <- merge(pop_meta, vcf_num_df, by = "sample", all.x = TRUE)
#Check
nrow(vcf_num_df) == nrow(vcf_num_df2)

## [1] TRUE

names(vcf_num_df2)[1:15]

##  [1] "sample"    "pop"       "super_pop" "sex"       "lat"       "lng"      
##  [7] "X3811"     "X5736"     "X3141"     "X1804"     "X973"      "X4260"    
## [13] "X1873"     "X6198"     "X1082"

#Save the CSV
write.csv(vcf_num_df2, file = "vcf_num_df2.csv", row.names = F)
dim(vcf_num_df2)

## [1] 2504 2006

Omit invariant features

invar_omit <- function(x){
  cat("Dataframe of dim",dim(x), "processed...\n")
  sds <- apply(x, 2, sd, na.rm = TRUE)
  i_var0 <- which(sds == 0)
 
  
  cat(length(i_var0),"columns removed\n")
  
  if(length(i_var0) > 0){
     x <- x[, -i_var0]
  }
  
  ## add return()  with x in it
  return(x)                      
}

names(vcf_num_df2)[1:10]

##  [1] "sample"    "pop"       "super_pop" "sex"       "lat"       "lng"      
##  [7] "X3811"     "X5736"     "X3141"     "X1804"

#Omit invariants
vcf_noinvar <- vcf_num_df2[, -c(1:6)]
vcf_noinvar <- invar_omit(vcf_noinvar)

## Dataframe of dim 2504 2000 processed...
## 526 columns removed

vcf_noinvar <- data.frame(vcf_num_df2[, c("sample","pop","super_pop","sex","lat","lng")], 
                          vcf_noinvar)


my_meta_N_invar_cols <- 526

Remove low-quality data

#Load find_NAs()
find_NAs <- function(x){
  NAs_TF <- is.na(x)
  i_NA <- which(NAs_TF == TRUE)
  N_NA <- length(i-NA)
  
  return(i_NA)
}

#search for NAs

#number of rows (individuals)
N_rows <- nrow(vcf_noinvar)

#vector to hold output (# of NAs)
N_NA <- rep(x=0, times = N_rows)

#total number of columns (SNPs)
N_SNPs <- ncol(vcf_noinvar)

for (i in 1:N_rows){
  #for each row find the location of NAs
  i_NA <- find_NAs(vcf_noinvar[i,])
  
  #then determine how many NAs
  N_NA_i <- length(i_NA)
  
  #then save the output
  N_NA[i] <- N_NA_i
}

#Check if any row has >50% NAs
cutoff50 <- N_SNPs*0.5
percent_NA <- N_NA/N_SNPs*100
any(percent_NA >  50)

## [1] FALSE

mean(percent_NA)

## [1] 0

my_meta_N_meanNA_rows <- mean(percent_NA)

Imputation of NAs

#Load imputation function
mean_imputation <- function(df){
  
  n_cols <- ncol(df)
  
  for(i in 1:n_cols){
    #get current column
    column_i <- df[,i]
    #get mean of current column
    mean_i <- mean(column_i, na.rm = TRUE)
    #get NAs in current column
    NAs_i <- which (is.na(column_i))
    #report number of NAs
    N_NAs <- length(NAs_i)
    #replace the NAs in current column
    column_i[NAs_i] <- mean_i
    #replace original column with updated columns
    df[,i] <- column_i
  }
  
  return(df)
}

#Run the function on numeric columns
names(vcf_noinvar)[1:10]

##  [1] "sample"    "pop"       "super_pop" "sex"       "lat"       "lng"      
##  [7] "X3811"     "X5736"     "X4260"     "X1873"

vcf_noNA <- vcf_noinvar
dim(vcf_noinvar)

## [1] 2504 1480

vcf_noNA[,-c(1:6)] <- mean_imputation(vcf_noinvar[,-c(1:6)])

##Prepare for PCA

#Scale the Data

#new copy of data
vcf_scaled <- vcf_noNA
dim(vcf_noNA)

## [1] 2504 1480

#scale
vcf_scaled_char <- vcf_scaled[,c(1:6)]
dim(vcf_scaled_char)

## [1] 2504    6

vcf_scaled_num <- vcf_scaled[,-c(1:6)]
vcf_scaled_num <- scale(vcf_scaled_num)
vcf_scaled <- data.frame(vcf_scaled_char, vcf_scaled_num)

dim(vcf_scaled)

## [1] 2504 1480

write.csv(vcf_scaled,
          file = "vcf_scaled.csv",
          row.names=F)

##Run the PCA

vcf_pca <- prcomp(vcf_scaled[,-c(1:6)])

PCA Diagnostics

#Examine the defualt screeplot

screeplot(vcf_pca)

#Calculate Explained variation

#Load PCA variation function
PCA_variation <- function(pca_summary, PCs = 2){
  var_explained <- pca_summary$importance[2,1:PCs]*100
  var_explained <- round(var_explained,3)
  return(var_explained)
}

#Extract PCA variation data and calculate percentage variation
#get summary information
vcf_pca_summary <- summary(vcf_pca)
var_out <- PCA_variation(vcf_pca_summary, PCs = 500)

#Calculate the cut off
#number dimensions of PCA dataframe
N_columns <- ncol(vcf_scaled)
#value of the cutoff
cut_off <- 1/N_columns*100

#Calculate the number PCs which exceed the cut off
#which values below the cutoff
i_cut_off <- which(var_out < cut_off)
#what is first value below cuttoff and save
i_cut_off <- min(i_cut_off)

## Warning in min(i_cut_off): no non-missing arguments to min; returning Inf

my_meta_N_meanNA_rowPCs <- i_cut_off

#Extract the amount of variation explained by first 3 PCs
my_meta_var_PC123 <- var_out[c(1,2,3)]

#Plot percentage variation
barplot(var_out, 
        main = "Percent variation (%) Scree plot",
        ylab = "Percent variation (%) explained",
        names.arg = 1:length(var_out))
abline(h=cut_off, col=2, lwd=2)
abline(v=i_cut_off)
legend("topright",
       col=c(2,1),
       lty=c(1,1),
       legend = c("Vertical line: cutoff", "Horizontal line: 1st value below cutoff"))

#Plot cumulative percentage variation
cumulative_variation <- cumsum(var_out)
plot (cumulative_variation)

#call vegan::scores()
vcf_pca_scores <- vegan::scores(vcf_pca)

#combine scores with species information into dataframe
vcf_pca_scores2 <- data.frame(super_pop = vcf_noNA$super_pop, vcf_pca_scores)

my_meta_var_PC123[1]

##   PC1 
## 6.081

my_meta_var_PC123[2]

##   PC2 
## 1.769

my_meta_var_PC123[3]

##   PC3 
## 0.823

#Plot results

#PC1 vs PC2
ggpubr::ggscatter(data = vcf_pca_scores2,
                  y="PC2",
                  x="PC1",
                  color="super_pop",
                  shape="super_pop",
                  main="PCA Scatterplot",
                  xlab="PC1 (6.1% of variation)",
                  ylab="PC2 (1.8% of variation)")

#PC2 vs PC3
ggpubr::ggscatter(data = vcf_pca_scores2,
                  y="PC3",
                  x="PC2",
                  color="super_pop",
                  shape="super_pop",
                  main="PCA Scatterplot",
                  xlab="PC2 (1.8% of variation)",
                  ylab="PC3 (0.8% of variation)")

#PC1 vs PC3
ggpubr::ggscatter(data = vcf_pca_scores2,
                  y="PC3",
                  x="PC1",
                  color="super_pop",
                  shape="super_pop",
                  main="PCA Scatterplot",
                  xlab="PC1 (6.1% of variation)",
                  ylab="PC3 (0.8% of variation)")

Final Workflow

Load the VCF Data

Set Up SNP Data

Clean Data

Merge data with population metadata

Omit invariant features

Remove low-quality data

Imputation of NAs

PCA Diagnostics