Load and/or install all packages required for analysis.
library(vcfR)
##
## ***** *** vcfR *** *****
## This is vcfR 1.13.0
## browseVignettes('vcfR') # Documentation
## citation('vcfR') # Citation
## ***** ***** ***** *****
library(vegan)
## Loading required package: permute
## Loading required package: lattice
## This is vegan 2.6-4
library(ggplot2)
library(ggpubr)
Confirm location of working directory and presence of desired vcf file.
getwd()
## [1] "/Users/grace/Desktop/Pitt/2nd/comp bio/Final Project"
list.files(pattern = "vcf")
## [1] "my_snps.vcf.gz" "vcf_data_processed.csv" "vcf_num_df_merged.csv"
## [4] "vcf_num_df.csv" "vcf_num.csv"
my_vcf <- "my_snps.vcf.gz"
Load in data from desired vcf file.
vcf <- vcfR::read.vcfR(my_vcf, convertNA = TRUE)
## Scanning file to determine attributes.
## File attributes:
## meta lines: 130
## header_line: 131
## variant count: 9786
## column count: 2513
##
Meta line 130 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
## Character matrix gt rows: 9786
## Character matrix gt cols: 2513
## skip: 0
## nrows: 9786
## row_num: 0
##
Processed variant 1000
Processed variant 2000
Processed variant 3000
Processed variant 4000
Processed variant 5000
Processed variant 6000
Processed variant 7000
Processed variant 8000
Processed variant 9000
Processed variant: 9786
## All variants processed
Extract genotypes from dataset and convert into numeric scores.
vcf_num <- vcfR::extract.gt(vcf,
element = "GT",
IDtoRowNames = F,
as.numeric = T,
convertNA = T)
Save genotypic scores as a csv file + confirm its presence in working directory.
write.csv(vcf_num, file = "vcf_num.csv", row.names = F)
list.files(pattern = "csv")
## [1] "1000genomes_people_info2-1.csv" "vcf_data_processed.csv"
## [3] "vcf_num_df_merged.csv" "vcf_num_df.csv"
## [5] "vcf_num.csv"
Transpose + prepare csv file for analysis by R.
vcf_num_t <- t(vcf_num)
vcf_num_df <- data.frame(vcf_num_t)
sample <- row.names(vcf_num_df)
vcf_num_df <- data.frame(sample, vcf_num_df)
Save transposed csv file to working directory.
getwd()
## [1] "/Users/grace/Desktop/Pitt/2nd/comp bio/Final Project"
write.csv(vcf_num_df, file = "vcf_num_df.csv", row.names = F)
list.files(pattern = "csv")
## [1] "1000genomes_people_info2-1.csv" "vcf_data_processed.csv"
## [3] "vcf_num_df_merged.csv" "vcf_num_df.csv"
## [5] "vcf_num.csv"
Load 1000 Genomes population meta data.
pop_meta <- read.csv(file = "1000genomes_people_info2-1.csv")
Merge datasets after confirming presence of “sample” column in both.
names(pop_meta)
## [1] "pop" "super_pop" "sample" "sex" "lat" "lng"
names(vcf_num_df)[1:10]
## [1] "sample" "X1" "X2" "X3" "X4" "X5" "X6" "X7"
## [9] "X8" "X9"
vcf_num_df_merged <- merge(pop_meta, vcf_num_df, by = "sample")
Check dimensions, column names, and location of merged dataframe.
nrow(vcf_num_df) == nrow(vcf_num_df_merged)
## [1] TRUE
names(vcf_num_df_merged)[1:15]
## [1] "sample" "pop" "super_pop" "sex" "lat" "lng"
## [7] "X1" "X2" "X3" "X4" "X5" "X6"
## [13] "X7" "X8" "X9"
Save merged csv file to working directory.
getwd()
## [1] "/Users/grace/Desktop/Pitt/2nd/comp bio/Final Project"
write.csv(vcf_num_df_merged, file = "vcf_num_df_merged.csv", row.names = F)
list.files(pattern = "csv")
## [1] "1000genomes_people_info2-1.csv" "vcf_data_processed.csv"
## [3] "vcf_num_df_merged.csv" "vcf_num_df.csv"
## [5] "vcf_num.csv"
Load in Dr. Brouwer’s invar_omit() function.
invar_omit <- function(x){
cat("Dataframe of dim", dim(x), "processed...\n")
sds <- apply(x, 2, sd, na.rm = TRUE)
i_var0 <- which(sds == 0)
cat(length(i_var0), "columns removed\n")
if(length(i_var0) > 0){
x <- x[,-i_var0] }
return(x) }
Run invar_omit() on non-character columns + store to a new dataframe object.
names(vcf_num_df_merged)[1:10]
## [1] "sample" "pop" "super_pop" "sex" "lat" "lng"
## [7] "X1" "X2" "X3" "X4"
vcf_noinvars <- data.frame(vcf_num_df_merged[,c(1:6)],
invar_omit(vcf_num_df_merged[,-c(1:6)]),
row.names = NULL)
## Dataframe of dim 2504 9786 processed...
## 2525 columns removed
N_invar_cols <- 2525 # store number of omitted columns to an object
Load in Dr. Brouwer’s find_NAs() function.
find_NAs <- function(x){
NAs_TF <- is.na(x)
i_NA <- which(NAs_TF ==T)
N_NA <- length(i_NA)
return(i_NA) }
Use a for-loop to locate each sample’s NAs.
N_rows <- nrow(vcf_noinvars) # save number of rows/samples
N_NA <- rep(x = 0, times = N_rows) # create an empty vector to hold the number of NAs in each sample
N_SNPs <- ncol(vcf_noinvars) # save number of columns/SNPs
for(i in 1:N_rows){ # loop through all samples
i_NA <- find_NAs(vcf_noinvars[i,]) # store locations (columns) of the sample's NAs in a vector
N_NA_i <- length(i_NA) # save sample's number of NAs
N_NA[i] <- N_NA_i } # add that number to the output vector created above
Use desired cutoff (% SNPs = NAs at which sample will be omitted) to determine if any samples should be removed.
cutoff50 <- N_SNPs * 0.5 # calculate cutoff
percent_NA <- N_NA / N_SNPs * 100 # calculate each sample's % of SNPs that are NAs
any(percent_NA > 50)
## [1] FALSE
mean(percent_NA)
## [1] 0.001329921
N_meanNAs_per_row <- mean(percent_NA) # store mean number of NAs in each sample
Load in Dr. Brouwer’s mean_imputation() function.
mean_imputation <- function(df){
n_cols <- ncol(df)
for(i in 1:n_cols){
column_i <- df[, i]
mean_i <- mean(column_i, na.rm = TRUE)
NAs_i <- which(is.na(column_i))
N_NAs <- length(NAs_i)
column_i[NAs_i] <- mean_i
df[, i] <- column_i }
return(df) }
Apply mean imputation to non-character columns.
names(vcf_noinvars)[1:10]
## [1] "sample" "pop" "super_pop" "sex" "lat" "lng"
## [7] "X2" "X3" "X4" "X6"
vcf_noNA <- vcf_noinvars
vcf_noNA[,-c(1:6)] <- mean_imputation(vcf_noinvars[,-c(1:6)])
Prepare data by centering mean on 0 and scaling by standard deviation.
vcf_scaled <- vcf_noNA
vcf_scaled[,-c(1:6)] <- scale(vcf_noNA[,-c(1:6)])
write.csv(vcf_scaled, file = "vcf_data_processed.csv", row.names = F) # WORKFLOW ENDS HERE -> ANALYSIS IN FINAL REPORT
Apply PCA + examine screeplot to determine number of needed PCs.
vcf_pca <- prcomp(vcf_scaled[,-c(1:6)])
screeplot(vcf_pca)
Load in Dr. Brouwer’s PCA_variation() function.
PCA_variation <- function(pca_summary,PCs = 2){
var_explained <- pca_summary$importance[2,1:PCs] * 100
var_explained <- round(var_explained,3)
return(var_explained) }
Extract PCA variation data + calculate the percentage of variation.
vcf_pca_summary <- summary(vcf_pca)
var_out <- PCA_variation(vcf_pca_summary, PCs = 500)
Calculate cutoff to determine needed PCs + save first value below it.
N_columns <- ncol(vcf_scaled)
cut_off <- 1 / N_columns * 100
i_cut_off <- which(var_out < cut_off)
i_cut_off <- min(i_cut_off)
## Warning in min(i_cut_off): no non-missing arguments to min; returning Inf
N_meanNA_rowsPCs <- i_cut_off
var_PC123 <- var_out[c(1,2,3)]
Plot percentage variation of PCs on a barplot with lines indicating cutoff + first value below.
barplot(var_out,
main = "Percent variation (%) Scree Plot",
ylab = "Percent variation (%) explained",
xlab = "PCs",
names.arg = 1:length(var_out))
abline(h = cut_off, col = 2, lwd = 2)
abline(v = i_cut_off)
legend("topright",
col = c(2,1),
lty = c(1,1),
legend = c("Vertical line: cutoff",
"Horizontal line: 1st value below cut off"))
Plot cumulative percentage variation of PCs.
cumulative_variation <- cumsum(var_out)
plot(cumulative_variation,
type = "l",
main = "Cumulative variation (%) Plot",
ylab = "Cumulative variation (%) explained",
xlab = "PCs")
Use vegan package to get PCA scores + check variation explained by first 2 PCs.
vcf_pca_scores <- vegan::scores(vcf_pca)
vcf_pca_scores2 <- data.frame(population = vcf_noNA$super_pop, vcf_pca_scores)
var_PC123[1]
## PC1
## 3.098
var_PC123[2]
## PC2
## 2.129
var_PC123[3]
## PC3
## 1.573
Compare PC1 & PC2 in a scatterplot color-coded by super population.
ggpubr::ggscatter(data = vcf_pca_scores2,
y = "PC2",
x = "PC1",
color = "population",
shape = "population",
main = "PCA Scatterplot",
xlab = "PC1 (3.23% of variation)",
ylab = "PC2 (2.02% of variation)")
Compare PC2 & PC3 in a scatterplot color-coded by super population.
ggpubr::ggscatter(data = vcf_pca_scores2,
y = "PC3",
x = "PC2",
color = "population",
shape = "population",
main = "PCA Scatterplot",
xlab = "PC1 (2.02% of variation)",
ylab = "PC2 (1.90% of variation)")
Compare PC1 & PC3 in a scatterplot color-coded by super population.
ggpubr::ggscatter(data = vcf_pca_scores2,
y = "PC3",
x = "PC1",
color = "population",
shape = "population",
main = "PCA Scatterplot",
xlab = "PC1 (3.23% of variation)",
ylab = "PC3 (1.90% of variation)")