Load R Packages
library(vcfR)
library(vegan)
library(ggplot2)
library(ggpubr)
Confirm Working Directory
getwd()
## [1] "/Users/jasonlee/Desktop/R/BIOSC1540/Final_Project"
list.files(pattern = "vcf")
## [1] "JL_chromosome15.vcf.gz"
Load VCF Data
#From Ensembl
chromosome15_VCF <- "JL_chromosome15.vcf.gz"
vcf <- vcfR::read.vcfR(chromosome15_VCF,convertNA = T)
## Scanning file to determine attributes.
## File attributes:
## meta lines: 130
## header_line: 131
## variant count: 7926
## column count: 2513
##
Meta line 130 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
## Character matrix gt rows: 7926
## Character matrix gt cols: 2513
## skip: 0
## nrows: 7926
## row_num: 0
##
Processed variant 1000
Processed variant 2000
Processed variant 3000
Processed variant 4000
Processed variant 5000
Processed variant 6000
Processed variant 7000
Processed variant: 7926
## All variants processed
Convert Raw VCF to Genotype Scores (Numerical)
vcf_num <- vcfR::extract.gt(vcf,
element = "GT",
IDtoRowNames = F,
as.numeric = T,
convertNA = T)
Data Prep + Merging
vcf_num_t = t(vcf_num) #transpose (SNP in cols)
vcf_num_df = data.frame(vcf_num_t) #data frame object
vnd_dim <- dim(vcf_num_df)
paste("Number of Samples/Individuals: ",vnd_dim[1], " Number of Variants: ", vnd_dim[2])
## [1] "Number of Samples/Individuals: 2504 Number of Variants: 7926"
sample <- row.names(vcf_num_df) #sample names
vcf_num_df <- data.frame(sample, vcf_num_df) #put the sample/individual names as the first col
pop_meta <- read.csv(file="1000genomes_people_info2-1.csv")
names(pop_meta)
## [1] "pop" "super_pop" "sample" "sex" "lat" "lng"
names(vcf_num_df)[1:5]
## [1] "sample" "X1" "X2" "X3" "X4"
vcf_num_df2 <- merge(pop_meta,vcf_num_df, by="sample")
nrow(vcf_num_df) == nrow(vcf_num_df2)
## [1] TRUE
names(vcf_num_df2)[1:10]
## [1] "sample" "pop" "super_pop" "sex" "lat" "lng"
## [7] "X1" "X2" "X3" "X4"
Removing Invariants
#Function to remove invariants
invar_omit <- function(x){
cat("Dataframe of dim",dim(x),"processed...\n")
sds <- apply(x,2,sd,na.rm = TRUE)
i_var0 <- which(sds==0)
cat(length(i_var0),"columns removed\n")
if(length(i_var0) > 0){
x<- x[, -i_var0]
}
return(x)
}
vcf_noinvar <- vcf_num_df2
vcf_firstSix <- vcf_noinvar[,c(1:6)]
before_omit <- dim(vcf_noinvar)[2]
omit_df <- invar_omit(vcf_noinvar[,-c(1:6)])
## Dataframe of dim 2504 7926 processed...
## 1940 columns removed
vcf_noinvar <- data.frame(vcf_firstSix,omit_df)
after_omit <- dim(vcf_noinvar)[2]
paste("Number cols omitted: ", before_omit - after_omit)
## [1] "Number cols omitted: 1940"
#1940 Cols removed
my_meta_N_invar_cols <- 1940
Removing Low-Quality Data (many NAs)
#Function to find NAs
find_NAs <- function(x){
NAs_TF <- is.na(x)
i_NA <- which(NAs_TF == TRUE)
N_NA <- length(i_NA)
return(i_NA)
}
N_rows <- nrow(vcf_noinvar)
N_NA <- rep(x=0, times = N_rows)
N_SNPs <- ncol(vcf_noinvar)
for(i in 1:N_rows){
i_NA <- find_NAs(vcf_noinvar[i,])
N_NA_i <- length(i_NA)
N_NA[i] <- N_NA_i
}
paste("Number of NAs: ", sum(N_NA))
## [1] "Number of NAs: 1201"
#check for rows >50% NAs
cutoff50 <- N_SNPs*0.5
percent_NA <- N_NA/N_SNPs * 100
any(percent_NA > 50)
## [1] FALSE
mean(percent_NA)
## [1] 0.008004549
my_meta_N_meanNA_rows <- mean(percent_NA)
Mean Imputation
#Function to perform mean imputation
mean_imputation <- function(df){
n_cols <- ncol(df)
for(i in 1:n_cols){
column_i <- df[, i]
mean_i <- mean(column_i, na.rm = TRUE)
NAs_i <- which(is.na(column_i))
N_NAs <- length(NAs_i)
column_i[NAs_i] <- mean_i
df[, i] <- column_i
}
return(df)
}
vcf_noNA <- vcf_noinvar
meanImpute <- mean_imputation(vcf_noinvar[,-c(1:6)])
vcf_noNA <- data.frame(vcf_firstSix, meanImpute)
paste("Remaining NAs: ", sum(is.na(vcf_noNA)))
## [1] "Remaining NAs: 0"
Preparation for PCA
vcf_scaled <- vcf_noNA
scaledVCF <- scale(vcf_noNA[,-c(1:6)])
vcf_scaled <- data.frame(vcf_firstSix,scaledVCF)
paste("Number of individuals: ", dim(vcf_scaled)[1], " Number of SNPs: ", dim(vcf_scaled)[2]-6)
## [1] "Number of individuals: 2504 Number of SNPs: 5986"
Save Cleaned Data for PCA
write.csv(vcf_scaled, file="cleanedChromosome15_Data.csv",row.names = F)