This report summarizes the workflow of an analysis of SNPs from the 1000 Genomes Project. It covers only the workflow, showing the steps taken to prepare the data for PCA analysis and plotting. More in-depth written analysis and explanation can be found in the Final Report; this workflow provides only brief explanations of each step.
Confirm working directory and location of files
getwd()
## [1] "/Users/jooppereira/Downloads"
Locate the VCF file using list.files(pattern = “vcf”) and assign the VCF file an easier name
list.files(pattern = "vcf")
## [1] "1.159051856-159301856.ALL.chr1_GRCh38.genotypes.20170504 (1).vcf.gz"
## [2] "1.159051856-159301856.ALL.chr1_GRCh38.genotypes.20170504 (2).vcf.gz"
## [3] "1.159051856-159301856.ALL.chr1_GRCh38.genotypes.20170504.vcf.gz"
## [4] "22.21299928-21539928.ALL.chr22_GRCh38.genotypes.20170504.vcf.gz"
## [5] "all_loci-1.vcf"
## [6] "all_loci.vcf"
## [7] "vcf_for_PCA.csv"
## [8] "vcf_num_df.csv"
## [9] "vcf_num_df2.csv"
## [10] "vcf_num.csv"
my_vcf <- "22.21299928-21539928.ALL.chr22_GRCh38.genotypes.20170504.vcf.gz"
Load VCF file
vcf <- vcfR::read.vcfR(my_vcf,
                       convertNA = T)
## Scanning file to determine attributes.
## File attributes:
## meta lines: 130
## header_line: 131
## variant count: 1911
## column count: 2513
## Meta line 130 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
## Character matrix gt rows: 1911
## Character matrix gt cols: 2513
## skip: 0
## nrows: 1911
## row_num: 0
## Processed variant 1000
## Processed variant: 1911
## All variants processed
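A quick optional check (not part of the original workflow) is to print the object, which reports the number of samples, variants, and the percentage of missing data:
vcf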
Convert the raw VCF genotypes to numeric genotype scores and save them as a csv
vcf_num <- vcfR::extract.gt(vcf,
                            element = "GT",
                            IDtoRowNames = F,
                            as.numeric = T,
                            convertNA = T)
write.csv(vcf_num, file = "vcf_num.csv", row.names = F)
Transpose the genotype matrix, convert it to a dataframe, and add the row names (sample IDs) as a column of the new dataframe
vcf_num_t <- t(vcf_num)
vcf_num_df <- data.frame(vcf_num_t)
sample <- row.names(vcf_num_df)
vcf_num_df <- data.frame(sample,
                         vcf_num_df)
Save a csv of the transposed dataframe
write.csv(vcf_num_df,
          file = "vcf_num_df.csv",
          row.names = F)
list.files(pattern = "csv")
## [1] "1000genomes_people_info2-1.csv" "SNPs_cleaned.csv"
## [3] "vcf_for_PCA.csv" "vcf_num_df.csv"
## [5] "vcf_num_df2.csv" "vcf_num.csv"
## [7] "walsh2017morphology.csv"
Load the population metadata, and check that “sample” appears in the column names of both the metadata AND the SNP data
pop_meta <- read.csv(file = "1000genomes_people_info2-1.csv")
colnames(pop_meta)
## [1] "pop" "super_pop" "sample" "sex" "lat" "lng"
colnames(vcf_num_df[1:10])
## [1] "sample" "X1" "X2" "X3" "X4" "X5" "X6" "X7"
## [9] "X8" "X9"
Merge the two data sets, then check the dimensions to confirm that the number of rows is the same before and after merging
vcf_num_df2 <- merge(pop_meta, vcf_num_df, by = "sample")
nrow(vcf_num_df) == nrow(vcf_num_df2)
## [1] TRUE
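If the row counts had not matched, the samples present in one table but missing from the other could be identified; a minimal sketch of that check (not part of the original workflow; both calls should return empty vectors here):
# samples with SNP data but no metadata, and vice versa
setdiff(vcf_num_df$sample, pop_meta$sample)
setdiff(pop_meta$sample, vcf_num_df$sample)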
Check the names of the new dataframe created
names(vcf_num_df2[1:10])
## [1] "sample" "pop" "super_pop" "sex" "lat" "lng"
## [7] "X1" "X2" "X3" "X4"
Make a csv file for vcf_num_df2
write.csv(vcf_num_df2, file = "vcf_num_df2.csv", row.names = F)
This function removes invariant columns (those with a standard deviation of zero), which carry no information for the PCA:
invar_omit <- function(x){
  cat("Dataframe of dim", dim(x), "processed... \n")
  # standard deviation of each column; invariant columns have sd == 0
  sds <- apply(x, 2, sd, na.rm = T)
  i_var0 <- which(sds == 0)
  cat(length(i_var0), "columns removed\n")
  if(length(i_var0) > 0){
    x <- x[, -i_var0]
  }
  return(x)
}
Since the first 4 columns contain character data, we DON’T want to put these columns through invar_omit(). Use negative indexing to exclude them:
vcf_noinvar <- vcf_num_df2
vcf_noinvar[, -c(1:4)] <- invar_omit(vcf_noinvar[, -c(1:4)])
## Dataframe of dim 2504 1913 processed...
## 375 columns removed
This function returns the index positions of the NAs in a vector:
find_NAs <- function(x){
  NAs_TF <- is.na(x)
  i_NA <- which(NAs_TF == T)
  N_NA <- length(i_NA) # number of NAs (not returned; recounted in the loop below)
  return(i_NA)
}
Use a for() loop to count the NAs in each row (sample)
N_rows <- nrow(vcf_noinvar)
N_NA <- rep(x = 0, times = N_rows)
N_SNPs <- ncol(vcf_noinvar)
for(i in 1:N_rows){
  i_NA <- find_NAs(vcf_noinvar[i, ])
  N_NA_i <- length(i_NA)
  N_NA[i] <- N_NA_i
}
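The same per-row counts could also be obtained without an explicit loop; a minimal vectorized alternative (not part of the original workflow):
# is.na() gives a logical matrix; rowSums() counts the NAs in each row
N_NA_alt <- rowSums(is.na(vcf_noinvar))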
Check whether any row has more than 50% NAs
cutoff50 <- N_SNPs * 0.5
percent_NA <- N_NA / N_SNPs * 100
any(percent_NA > 50)
## [1] FALSE
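No sample exceeds the 50% cutoff, so none need to be removed. If any had, the cutoff50 object computed above could be used to drop them before scaling; a minimal sketch of that hypothetical filtering step:
# drop any rows (samples) with more than 50% missing genotypes
i_too_many_NA <- which(N_NA > cutoff50)
if(length(i_too_many_NA) > 0){
  vcf_noinvar <- vcf_noinvar[-i_too_many_NA, ]
}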
# new copy of the data
vcf_scaled <- vcf_noinvar
# scale the numeric columns (mean 0, sd 1); the first 4 columns are character data
vcf_scaled[, -c(1:4)] <- scale(vcf_noinvar[, -c(1:4)])
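As an optional sanity check (not part of the original workflow), the scaled columns should now have means near 0 and standard deviations near 1; any column that somehow remained invariant would show up as NaN:
summary(colMeans(vcf_scaled[, -c(1:4)], na.rm = TRUE))
summary(apply(vcf_scaled[, -c(1:4)], 2, sd, na.rm = TRUE))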
Write a csv file for PCA analysis
write.csv(vcf_scaled, file= "vcf_for_PCA.csv", row.names = F)
Now we’re ready to carry out the PCA!
The following packages were used in this analysis:
# plotting:
library(ggplot2)
library(ggpubr)
# scores() function
library(vegan)
## Loading required package: permute
## Loading required package: lattice
## This is vegan 2.6-4
# 3D scatter plot
library(scatterplot3d)
Load the fully processed data:
vcf_scaled <- read.csv(file = "vcf_for_PCA.csv")
Check the dimensions of the data to confirm this is the correct data:
dim(vcf_scaled)
## [1] 2504 1917
The data are scaled and ready for analysis. Only the first 4 columns contain character data and need to be omitted before running the PCA.
head(vcf_scaled[,1:10])
## sample pop super_pop sex lat lng X1 X2
## 1 HG00096 GBR EUR male 1.419725 -0.1190525 -0.0600481 -0.1367733
## 2 HG00097 GBR EUR female 1.419725 -0.1190525 -0.0600481 -0.1367733
## 3 HG00099 GBR EUR female 1.419725 -0.1190525 -0.0600481 -0.1367733
## 4 HG00100 GBR EUR female 1.419725 -0.1190525 -0.0600481 -0.1367733
## 5 HG00101 GBR EUR male 1.419725 -0.1190525 -0.0600481 -0.1367733
## 6 HG00102 GBR EUR female 1.419725 -0.1190525 -0.0600481 -0.1367733
## X3 X4
## 1 -0.04899962 -0.01998402
## 2 -0.04899962 -0.01998402
## 3 -0.04899962 -0.01998402
## 4 -0.04899962 -0.01998402
## 5 -0.04899962 -0.01998402
## 6 -0.04899962 -0.01998402
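prcomp() cannot handle missing values, so it is worth confirming (an optional check, not part of the original workflow) that no NAs remain in the numeric columns:
# should return 0
sum(is.na(vcf_scaled[, -c(1:4)]))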
Principal Components Analysis was run using prcomp().
vcf_pca <- prcomp(vcf_scaled[,-c(1:4)])
Get the PCA scores, which will be plotted:
vcf_pca_scores <- vegan::scores(vcf_pca)
Combine the scores with the sample information into a dataframe.
# call data.frame()
vcf_pca_scores2 <- data.frame(super_pop = vcf_scaled$super_pop,
                              vcf_pca_scores)
# set as a factor
vcf_pca_scores2$super_pop <- factor(vcf_pca_scores2$super_pop)
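The 1000 Genomes Project groups its populations into five super-populations (AFR, AMR, EAS, EUR, SAS); as an optional check, not part of the original workflow, the factor levels can be listed:
levels(vcf_pca_scores2$super_pop)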
The following steps help us understand the PCA output and determine how many PCs should be plotted and/or used in further analyses such as scans for natural selection, cluster analysis, and GWAS.
A default R scree plot was created with screeplot(). This plot does not provide extra information for assessing the importance of the PCs.
screeplot(vcf_pca,
          xlab = "Principal Components")
This function extracts the information needed to make a more advanced, annotated scree plot.
# This is a NEW function
PCA_variation <- function(pca){
  # get summary information from the PCA
  pca_summary <- summary(pca)
  # extract information from the summary
  ## standard deviation of each PC (row 1 of the importance table; stored below as var_raw)
  variance <- pca_summary$importance[1,]
  ## % variance explained by each PC
  var_explained <- pca_summary$importance[2,]*100
  var_explained <- round(var_explained, 3)
  ## cumulative % variance
  var_cumulative <- pca_summary$importance[3,]*100
  var_cumulative <- round(var_cumulative, 3)
  # prepare output
  N.PCs <- length(var_explained)
  var_df <- data.frame(PC = 1:N.PCs,
                       var_raw = variance,
                       var_percent = var_explained,
                       cumulative_percent = var_cumulative)
  # return output
  return(var_df)
}
This function makes a more advanced scree plot better suited for PCA on SNP data.
# This is a NEW function
screeplot_snps <- function(var_df){
  # cutoff: the % variation each PC would explain if all PCs were equal
  total_var <- sum(var_df$var_raw)
  N <- length(var_df$var_raw)
  var_cutoff <- total_var/N
  var_cut_percent <- var_cutoff/total_var*100
  var_cut_percent_rnd <- round(var_cut_percent, 2)
  # last PC that explains more variation than the cutoff
  i_above_cut <- which(var_df$var_percent > var_cut_percent)
  i_cut <- max(i_above_cut)
  ti <- paste0("Cutoff = ",
               var_cut_percent_rnd,
               "%\n", "Useful PCs = ", i_cut)
  # empty plot, then a vertical segment per PC and a horizontal cutoff line
  plot(var_df$var_percent,
       main = ti, type = "l",
       xlab = "PC",
       ylab = "Percent variation",
       col = 0)
  segments(x0 = var_df$PC,
           x1 = var_df$PC,
           y0 = 0,
           y1 = var_df$var_percent,
           col = 1)
  segments(x0 = 0,
           x1 = N,
           y0 = var_cut_percent,
           y1 = var_cut_percent,
           col = 2)
}
This makes a plot complementary to a scree plot. A scree plot shows the amount of variation explained by each PC; this plot shows the cumulative amount of variation explained as PCs are added.
# This is a NEW function
PCA_cumulative_var_plot <- function(var_df){
  plot(cumulative_percent ~ PC,
       data = var_df,
       main = "Cumulative percent variation\n explained by PCs",
       xlab = "PC",
       ylab = "Cumulative %",
       type = "l")
  # same cutoff calculation as in screeplot_snps()
  total_var <- sum(var_df$var_raw)
  N <- length(var_df$var_raw)
  var_cutoff <- total_var/N
  var_cut_percent <- var_cutoff/total_var*100
  var_cut_percent_rnd <- round(var_cut_percent, 2)
  i_above_cut <- which(var_df$var_percent > var_cut_percent)
  i_cut <- max(i_above_cut)
  # cumulative % explained by the last "useful" PC
  percent_cut_i <- which(var_df$PC == i_cut)
  percent_cut <- var_df$cumulative_percent[percent_cut_i]
  # mark the cutoff PC and its cumulative % on the plot
  segments(x0 = i_cut,
           x1 = i_cut,
           y0 = 0,
           y1 = 100,
           col = 2)
  segments(x0 = -10,
           x1 = N,
           y0 = percent_cut,
           y1 = percent_cut,
           col = 2)
}
Extract information on the variance explained by each PC.
var_out <- PCA_variation(vcf_pca)
Look at the output of PCA_variation()
head(var_out)
## PC var_raw var_percent cumulative_percent
## PC1 1 7.251458 2.749 2.749
## PC2 2 6.000955 1.882 4.631
## PC3 3 5.456191 1.556 6.187
## PC4 4 4.781130 1.195 7.382
## PC5 5 4.624231 1.118 8.500
## PC6 6 4.427505 1.025 9.525
Make the scree plot with screeplot_snps()
screeplot_snps(var_out)
Make the cumulative variation plot with PCA_cumulative_var_plot()
PCA_cumulative_var_plot(var_out)
The object created above, var_out, indicates how much variation is explained by each of the principal components. This information is often added to the axes of scatterplots of PCA output.
head(var_out)
## PC var_raw var_percent cumulative_percent
## PC1 1 7.251458 2.749 2.749
## PC2 2 6.000955 1.882 4.631
## PC3 3 5.456191 1.556 6.187
## PC4 4 4.781130 1.195 7.382
## PC5 5 4.624231 1.118 8.500
## PC6 6 4.427505 1.025 9.525
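Rather than hard-coding the percentages in the axis labels below, they could also be built directly from var_out. A minimal sketch, where pc_label() is a hypothetical helper rather than part of the original workflow:
# build a label such as "PC1 (2.749% of variation)" from var_out
pc_label <- function(i){
  paste0("PC", i, " (", var_out$var_percent[i], "% of variation)")
}
The hand-typed labels in the plots below give the same result.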
Plot the scores, with super-population color-coded
# color and shape mapped to "super_pop"
ggpubr::ggscatter(data = vcf_pca_scores2,
                  y = "PC2",
                  x = "PC1",
                  color = "super_pop",
                  shape = "super_pop",
                  main = "PCA Scatterplot",
                  ylab = "PC2 (1.882% of variation)",
                  xlab = "PC1 (2.749% of variation)")
Plot the scores, with super-population color-coded
# color and shape mapped to "super_pop"
ggpubr::ggscatter(data = vcf_pca_scores2,
                  y = "PC3",
                  x = "PC2",
                  color = "super_pop",
                  shape = "super_pop",
                  main = "PCA Scatterplot",
                  ylab = "PC3 (1.556% of variation)",
                  xlab = "PC2 (1.882% of variation)")
Plot the scores, with super-population color-coded and confidence ellipses added
ggpubr::ggscatter(data = vcf_pca_scores2,
                  y = "PC3",
                  x = "PC1",
                  ellipse = T,
                  color = "super_pop",
                  shape = "super_pop",
                  main = "PCA Scatterplot",
                  ylab = "PC3 (1.556% of variation)",
                  xlab = "PC1 (2.749% of variation)")
The first 3 principal components can be presented as a 3D scatterplot.
# numeric color codes derived from the super_pop factor
colors_use <- as.numeric(vcf_pca_scores2$super_pop)
scatterplot3d(x = vcf_pca_scores2$PC1,
              y = vcf_pca_scores2$PC2,
              z = vcf_pca_scores2$PC3,
              color = colors_use,
              xlab = "PC1 (2.749%)",
              ylab = "PC2 (1.882%)",
              zlab = "PC3 (1.556%)")