Load the vcfR and other packages with library().
library(vcfR)
## Warning: package 'vcfR' was built under R version 4.2.2
##
## ***** *** vcfR *** *****
## This is vcfR 1.13.0
## browseVignettes('vcfR') # Documentation
## citation('vcfR') # Citation
## ***** ***** ***** *****
library(vegan)
## Loading required package: permute
## Loading required package: lattice
## This is vegan 2.6-2
library(ggplot2)
library(ggpubr)
library(vegan)
Make sure that your working directory is set to the location of the vcf file.
getwd()
## [1] "C:/Users/Chana/Documents/Pitt/Fall_2022/comp bio/final_project"
list.files(pattern = "vcf")
## [1] "7.27819601-28059601.ALL.chr7_GRCh38.genotypes.20170504.vcf.gz"
## [2] "vcf_scaled.csv"
Load the snp data from the vcf file into an R object.
vcf_file <- "7.27819601-28059601.ALL.chr7_GRCh38.genotypes.20170504.vcf.gz"
vcf <- vcfR::read.vcfR(vcf_file, convertNA = TRUE)
## Scanning file to determine attributes.
## File attributes:
## meta lines: 130
## header_line: 131
## variant count: 7097
## column count: 2513
##
Meta line 130 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
## Character matrix gt rows: 7097
## Character matrix gt cols: 2513
## skip: 0
## nrows: 7097
## row_num: 0
##
Processed variant 1000
Processed variant 2000
Processed variant 3000
Processed variant 4000
Processed variant 5000
Processed variant 6000
Processed variant 7000
Processed variant: 7097
## All variants processed
The vcf object contains lots of data about the snps. Extract the numeric genotype data and save it as a matrix.
vcf_num <- vcfR::extract.gt(vcf,
element = "GT",
IDtoRowNames = F,
as.numeric = T,
convertNA = T,
return.alleles = F)
Transpose the object so that the snps, which are the features, become columns.
vcf_num_t <- t(vcf_num)
Turn the vcf matrix into a dataframe so that it is easier to work with.
vcf_num_df <- data.frame(vcf_num_t)
Load population metadata.
population_metadata <- read.csv(file = "1000genomes_people_info.csv")
sample <- row.names(vcf_num_df)
vcf_sample_df <- data.frame(sample, vcf_num_df)
Make sure that the column “sample” appears in both the metadata and the snp data.
names(population_metadata)
## [1] "pop" "super_pop" "sample" "sex" "lat" "lng"
names(vcf_sample_df)[1:10]
## [1] "sample" "X1" "X2" "X3" "X4" "X5" "X6" "X7"
## [9] "X8" "X9"
vcf_num_df2 <- merge(population_metadata,
vcf_sample_df,
by = "sample")
Check that the dimensions are the same.
nrow(vcf_num_df) == nrow(vcf_num_df2)
## [1] TRUE
Check the names of the new dataframe.
names(vcf_num_df2)[1:15]
## [1] "sample" "pop" "super_pop" "sex" "lat" "lng"
## [7] "X1" "X2" "X3" "X4" "X5" "X6"
## [13] "X7" "X8" "X9"
invar_omit function
invar_omit <- function(x){
cat("Dataframe of dim",dim(x), "processed...\n")
sds <- apply(x, 2, sd, na.rm = TRUE)
i_var0 <- which(sds == 0)
cat(length(i_var0),"columns removed\n")
if(length(i_var0) > 0){
x <- x[, -i_var0]
}
return(x)
}
vcf_no_invar <- vcf_num_df2[, -c(1:6)]
vcf_no_invar<- invar_omit(vcf_no_invar)
## Dataframe of dim 2504 7097 processed...
## 1719 columns removed
vcf_no_invar <- data.frame(vcf_num_df2[, c(1:6)],
vcf_no_invar)
meta_num_invar_cols <- 1719
A function that takes a row of data and returns a vector of indexes where that row contains NAs. This is useful for determining the number of NAs in each row.
find_NAs <- function(x){
NAs_TF <- is.na(x)
i_NA <- which(NAs_TF == TRUE)
N_NA <- length(i_NA)
return(i_NA)
}
A loop that iterates through the rows of the snps dataframe and saves the number of NAs in each row. This helps determine which rows contain >50% NAs.
# N_rows
# number of rows (individuals)
N_rows <- nrow(vcf_no_invar)
# N_NA
# vector to hold output (number of NAs)
N_NA <- rep(x = 0, times = N_rows)
# N_SNPs
# total number of columns (SNPs)
N_SNPs <- ncol(vcf_no_invar)
# the for() loop
for(i in 1:N_rows){
# for each row, find the location of
## NAs with snps_num_t()
i_NA <- find_NAs(vcf_no_invar[i,])
# then determine how many NAs
## with length()
N_NA_i <- length(i_NA)
# then save the output to
## our storage vector
N_NA[i] <- N_NA_i
}
Set cutoff50 to half of the number of snps, because if more than this number of snps in a row are null then the row will be deleted.
# 50% of N_SNPs
cutoff50 <- N_SNPs*0.5
percent_NA <- N_NA/N_SNPs*100
any(percent_NA > 50)
## [1] FALSE
Get the average number of NAs per row.
mean(percent_NA)
## [1] 0.0005934042
mean_NAs_per_row <- mean(percent_NA)
Use mean imputation on the any NAs.
mean_imputation <- function(df){
cat("This may take some time...")
n_cols <- ncol(df)
for(i in 1:n_cols){
# get the current column
column_i <- df[, i]
# get the mean of the current column
mean_i <- mean(column_i, na.rm = TRUE)
# get the NAs in the current column
NAs_i <- which(is.na(column_i))
# record the number of NAs
N_NAs <- length(NAs_i)
# replace the NAs in the current column
column_i[NAs_i] <- mean_i
# replace the original column with the
## updated columns
df[, i] <- column_i
}
return(df)
}
Run the mean imputation function on numeric columns.
names(vcf_no_invar)[1:10]
## [1] "sample" "pop" "super_pop" "sex" "lat" "lng"
## [7] "X2" "X4" "X5" "X6"
vcf_no_NA <- vcf_no_invar
vcf_no_NA[,-c(1:6)] <- mean_imputation(vcf_no_invar[,-c(1:6)])
## This may take some time...
Use the scale function to scale and center the data.
vcf_scaled <- vcf_no_NA
vcf_scaled[,-c(1:6)] <- scale(vcf_no_NA[,-c(1:6)])
Save the scaled data as a csv file.
getwd()
## [1] "C:/Users/Chana/Documents/Pitt/Fall_2022/comp bio/final_project"
write.csv(vcf_scaled, file = "vcf_scaled.csv", row.names = F)
list.files()
## [1] "1000genomes_people_info.csv"
## [2] "1540_final_project_Final_Report_template.pdf"
## [3] "1540_week14_PCA_SNP_workflow.pdf"
## [4] "7.27819601-28059601.ALL.chr7_GRCh38.genotypes.20170504.vcf.gz"
## [5] "final_project.Rmd"
## [6] "final_project.Rproj"
## [7] "final_report_template.docx"
## [8] "final_report_template.html"
## [9] "final_report_template.Rmd"
## [10] "head(snps).png"
## [11] "rsconnect"
## [12] "vcf_scaled.csv"
Use prcomp to run PCA on the snp data.
vcf_pca <- prcomp(vcf_scaled[,-c(1:6)])
Make a screeplot to determine which PCs make sense to analyze based on their variances.
screeplot(vcf_pca,
ylab = "Relative importance",
main = "SNP Screeplot")
PC1 contains far more variance than any of the other PCs, and would therefore be the most informative to analyze.
Determine the percentage of variance that the first 10 PCs describe and plot them with a barchart. Add a horizontal line which is 100 divided by the number of columns, which is what the percent variation of all the PCs would be if they were equal.
PCA_variation <- function(pca_summary, PCs = 2){
var_explained <- pca_summary$importance[2,1:PCs]*100
var_explained <- round(var_explained,1)
return(var_explained)
}
vcf_pca_summary <- summary(vcf_pca)
var_out <- PCA_variation(vcf_pca_summary,PCs = 500)
N_columns <- ncol(vcf_scaled)
cut_off <- 1/N_columns * 100
i_cut_off <- which(var_out < cut_off)
i_cut_off <- min(i_cut_off)
meta_num_mean_NAs_rows_PCs <- i_cut_off
meta_var_PC123 <- var_out[c(1,2,3)]
PC1 contains most of the variance.
Plot the percentage variation.
barplot(var_out,
main = "Percent variation (%) Scree plot",
ylab = "Percent variation (%) explained",
names.arg = 1:length(var_out))
abline(h = cut_off, col = 2, lwd = 2)
abline(v = i_cut_off)
legend("topright",
col = c(2,1),
lty = c(1,1),
legend = c("Vertical line: cutoff",
"Horizontal line: 1st value below cut off"))
Plot the cumulative variation.
cumulative_variation <- cumsum(var_out)
plot(cumulative_variation, type = "l")
Get the scores of each PC.
vcf_pca_scores <- vegan::scores(vcf_pca)
Combine the scores with the species info into a dataframe.
vcf_pca_scores2 <- data.frame(super_pop = vcf_no_NA$super_pop,
vcf_pca_scores)
Look at variation info explained by PCs.
meta_var_PC123[1]
## PC1
## 1.8
meta_var_PC123[2]
## PC2
## 1.5
meta_var_PC123[3]
## PC3
## 1.3
##Plots
PC1 vs PC2
ggpubr::ggscatter(data = vcf_pca_scores2,
y = "PC2",
x = "PC1",
color = "super_pop",
shape = "super_pop",
main = "PCA Scatterplot",
xlab = "PC1 (1.8% of variation)",
ylab = "PC2 (1.5% of variation)")
PC2 vs PC3
ggpubr::ggscatter(data = vcf_pca_scores2,
y = "PC3",
x = "PC2",
color = "super_pop",
shape = "super_pop",
main = "PCA Scatterplot",
xlab = "PC2 (1.5% of variation)",
ylab = "PC3 (1.3% of variation)")
PC1 vs PC3
ggpubr::ggscatter(data = vcf_pca_scores2,
y = "PC3",
x = "PC1",
color = "super_pop",
shape = "super_pop",
main = "PCA Scatterplot",
xlab = "PC1 (1.8% of variation)",
ylab = "PC3 (1.3% of variation)")