R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button, a document is generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

Loading the necessary R packages

library(vegan)
## Warning: package 'vegan' was built under R version 4.2.2
## Loading required package: permute
## Loading required package: lattice
## This is vegan 2.6-4
library(ggplot2)
library(ggpubr)

Confirm your working directory and the location of the input files

getwd()
## [1] "C:/Users/slmg/OneDrive/Desktop/my snps"
list.files(pattern="vcf")
## [1] "20.17246903-17486903.ALL.chr20_GRCh38.genotypes.20170504.vcf.gz"
## [2] "vcf_num.csv"                                                    
## [3] "vcf_num_df.csv"                                                 
## [4] "vcf_num_df2.csv"                                                
## [5] "vcf_scaled.csv"

Set up the SNP data

Load the VCF data

vcf<- vcfR::read.vcfR("20.17246903-17486903.ALL.chr20_GRCh38.genotypes.20170504.vcf.gz",convertNA = T)
## Scanning file to determine attributes.
## File attributes:
##   meta lines: 130
##   header_line: 131
##   variant count: 7220
##   column count: 2513
## Meta line 130 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
##   Character matrix gt rows: 7220
##   Character matrix gt cols: 2513
##   skip: 0
##   nrows: 7220
##   row_num: 0
## Processed variant 1000
## Processed variant 2000
## Processed variant 3000
## Processed variant 4000
## Processed variant 5000
## Processed variant 6000
## Processed variant 7000
## Processed variant: 7220
## All variants processed
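
As a quick check that the file loaded as expected (a small sketch, not part of the original workflow), print the vcfR object to see a brief summary of its contents:

# print a short summary of the vcfR object (variant and sample counts)
vcf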

Convert the raw VCF genotypes into numeric genotype scores (allele counts)

vcf_num<-vcfR::extract.gt(vcf,element="GT", IDtoRowNames=F,as.numeric=T,convertNA=T)
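
Before saving, it can help to peek at a corner of the genotype-score matrix (a sketch using the vcf_num object created above):

# first few SNPs (rows) for the first few samples (columns)
vcf_num[1:5, 1:5]
# overall dimensions: SNPs x samples
dim(vcf_num)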

Save the CSV and confirm it is there

write.csv(vcf_num,file="vcf_num.csv",row.names=F)
list.files()
##  [1] "1000genomes_people_info2-1.csv"                                 
##  [2] "20.17246903-17486903.ALL.chr20_GRCh38.genotypes.20170504.vcf.gz"
##  [3] "final_report_template.Rmd"                                      
##  [4] "loading SNPS.Rmd"                                               
##  [5] "vcf_num.csv"                                                    
##  [6] "vcf_num_df.csv"                                                 
##  [7] "vcf_num_df2.csv"                                                
##  [8] "vcf_scaled.csv"                                                 
##  [9] "Workflow.html"                                                  
## [10] "Workflow.Rmd"

Transpose the original VCF orientation (SNPs as rows) into the data frame orientation that R uses (samples as rows)

vcf_num_t<-t(vcf_num)
vcf_num_df<-data.frame(vcf_num_t)
sample<-row.names(vcf_num_df)
vcf_num_df<-data.frame(sample,vcf_num_df)
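
A quick sanity check on the transposition (a sketch using the objects above): the new data frame should have one row per sample and one column per SNP, plus the added sample column.

# SNPs x samples before transposing
dim(vcf_num)
# samples x (1 sample column + SNPs) after transposing
dim(vcf_num_df)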

Check the working directory, save a CSV of the new data frame, and confirm it is there

getwd()
## [1] "C:/Users/slmg/OneDrive/Desktop/my snps"
write.csv(vcf_num_df,file="vcf_num_df.csv", row.names=F)
list.files()
##  [1] "1000genomes_people_info2-1.csv"                                 
##  [2] "20.17246903-17486903.ALL.chr20_GRCh38.genotypes.20170504.vcf.gz"
##  [3] "final_report_template.Rmd"                                      
##  [4] "loading SNPS.Rmd"                                               
##  [5] "vcf_num.csv"                                                    
##  [6] "vcf_num_df.csv"                                                 
##  [7] "vcf_num_df2.csv"                                                
##  [8] "vcf_scaled.csv"                                                 
##  [9] "Workflow.html"                                                  
## [10] "Workflow.Rmd"

Clean the data by merging the population metadata with the genotype data frame

pop_meta<-read.csv(file='1000genomes_people_info2-1.csv')
names(pop_meta)
## [1] "pop"       "super_pop" "sample"    "sex"       "lat"       "lng"
names(vcf_num_df)[1:10]
##  [1] "sample" "X1"     "X2"     "X3"     "X4"     "X5"     "X6"     "X7"    
##  [9] "X8"     "X9"
vcf_num_df2<-merge(pop_meta,vcf_num_df,by="sample")
nrow(vcf_num_df)==nrow(vcf_num_df2)
## [1] TRUE
names(vcf_num_df2)[1:15] 
##  [1] "sample"    "pop"       "super_pop" "sex"       "lat"       "lng"      
##  [7] "X1"        "X2"        "X3"        "X4"        "X5"        "X6"       
## [13] "X7"        "X8"        "X9"
getwd()
## [1] "C:/Users/slmg/OneDrive/Desktop/my snps"
write.csv(vcf_num_df2,file="vcf_num_df2.csv", row.names=F)
list.files()
##  [1] "1000genomes_people_info2-1.csv"                                 
##  [2] "20.17246903-17486903.ALL.chr20_GRCh38.genotypes.20170504.vcf.gz"
##  [3] "final_report_template.Rmd"                                      
##  [4] "loading SNPS.Rmd"                                               
##  [5] "vcf_num.csv"                                                    
##  [6] "vcf_num_df.csv"                                                 
##  [7] "vcf_num_df2.csv"                                                
##  [8] "vcf_scaled.csv"                                                 
##  [9] "Workflow.html"                                                  
## [10] "Workflow.Rmd"
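
To confirm that the metadata merged cleanly (a sketch using the merged vcf_num_df2 above), tabulate the samples by super-population and check for missing labels:

# samples per super-population; there should be no NA category
table(vcf_num_df2$super_pop, useNA="ifany")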

Omit invariant features

invar_omit<-function(x)
{
  cat("Dataframe of dim", dim(x), "processed...\n")
  # standard deviation of each column; invariant SNP columns have sd == 0
  sds<-apply(x,2,sd,na.rm=TRUE)
  i_var0<-which(sds==0)
  cat(length(i_var0),"columns removed \n")
  # drop the invariant columns, if any were found
  if(length(i_var0)>0)
  {
    x<-x[,-i_var0]
  }
  return(x)
}
names(vcf_num_df2)[1:10]
##  [1] "sample"    "pop"       "super_pop" "sex"       "lat"       "lng"      
##  [7] "X1"        "X2"        "X3"        "X4"
# work on a copy and drop the six metadata columns before checking variance
vcf_noinvar<-vcf_num_df2
vcf_noinvar<-vcf_noinvar[,-c(1:6)]

vcf_noinvar<-invar_omit(vcf_noinvar)
## Dataframe of dim 2504 7220 processed...
## 1756 columns removed

# reattach the six metadata columns to the filtered SNP columns
vcf_noinvar<-data.frame(vcf_num_df2[,c(1:6)], 
                        vcf_noinvar)



# record the number of invariant columns removed (reported above)
my_meta_N_invar_cols<-1756
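
As a check on the filtering step (a sketch based on the counts reported above), the filtered data frame should have 6 metadata columns plus 7220 - 1756 = 5464 SNP columns:

# expect 2504 rows and 6 + 5464 = 5470 columns
dim(vcf_noinvar)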

Remove low-quality data (this check was commented out to speed up knitting; it reported 0 NAs)

# find_NAs<-function(x)
# {
#   NAs_TF<-is.na(x)
#   i_NA<-which(NAs_TF==TRUE)
#   N_NA<-length(i_NA)
#   return(i_NA)
# }
# 
# N_rows<-nrow(vcf_noinvar) ##rows are individuals 
# N_NA<-rep(x=0,times=N_rows)
# N_SNPs<-ncol(vcf_noinvar) ## columns are the SNPS 
# for(i in 1:N_rows)
# {
#   i_NA<-find_NAs(vcf_noinvar[i,])
#   N_NA_i<-length(i_NA)
#   N_NA[i]<-N_NA_i
# }
# N_NA_i
# 
# cutoff50<-N_SNPs*0.5
# percent_NA<-N_NA/N_SNPs*100
# any(percent_NA>50)
# mean(percent_NA)
# my_meta_N_meanNA_rows<-mean(percent_NA)
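
A faster one-line version of the same check (a sketch on the filtered data frame above) simply counts the NAs across all SNP columns:

# total number of missing genotype scores; reported as 0 for this dataset
sum(is.na(vcf_noinvar[,-c(1:6)]))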

Load mean imputation function

mean_imputation<-function(df)
{
  cat("This may take some time...")
  n_cols<-ncol(df)
  # replace the NAs in each column with that column's mean
  for(i in 1:n_cols)
  {
    column_i<-df[,i]
    mean_i<-mean(column_i,na.rm=TRUE)
    NAs_i<-which(is.na(column_i))
    N_NAs<-length(NAs_i)
    column_i[NAs_i]<-mean_i
    df[,i]<-column_i
  }
  return(df)
}
names(vcf_noinvar)[1:10]
##  [1] "sample"    "pop"       "super_pop" "sex"       "lat"       "lng"      
##  [7] "X1"        "X2"        "X5"        "X6"
vcf_noNA<-vcf_noinvar
# impute only the SNP columns; the six metadata columns are left untouched
vcf_noNA[,-c(1:6)]<-mean_imputation(vcf_noinvar[,-c(1:6)])
## This may take some time...
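
To confirm the imputation worked (a sketch using vcf_noNA from above), check that no NAs remain in the SNP columns:

# should be FALSE after mean imputation
any(is.na(vcf_noNA[,-c(1:6)]))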

Prepare for PCA

Scaling Data

# scale the SNP columns of the imputed data; the metadata columns are left untouched
vcf_scaled<-vcf_noNA
vcf_scaled[,-c(1:6)]<-scale(vcf_noNA[,-c(1:6)])
write.csv(vcf_scaled, file = "vcf_scaled.csv",
          row.names = F)
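
A quick check that the scaling behaved as expected (a sketch on the first few SNP columns): scaled columns should have a mean of about 0 and a standard deviation of about 1.

# means should be ~0 and standard deviations ~1 after scaling
round(colMeans(vcf_scaled[,-c(1:6)][,1:3]), 3)
round(apply(vcf_scaled[,-c(1:6)][,1:3], 2, sd), 3)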

Running PCA

vcf_pca<-prcomp(vcf_scaled[,-c(1:6)])

PCA Diagnosis

screeplot(vcf_pca)

# percent of variance explained by the first PCs principal components
PCA_variation<-function(pca_summary,PCs=2)
{
  var_explained<-pca_summary$importance[2,1:PCs]*100
  var_explained<-round(var_explained,3)
  return(var_explained)
}
vcf_pca_summary<-summary(vcf_pca)
var_out<-PCA_variation(vcf_pca_summary,PCs=500)
N_columns<-ncol(vcf_scaled)
cut_off<-1/N_columns*100
i_cut_off<-which(var_out<cut_off)
i_cut_off<-min(i_cut_off)
## Warning in min(i_cut_off): no non-missing arguments to min; returning Inf
my_meta_N_meanNA_rowsPCs<-i_cut_off
my_meta_var_PC123<-var_out[c(1,2,3)]
#plot percentage variation 
barplot(var_out,main="Percent variation (%) scree plot",
        ylab="Percent variation (%) explained", 
        names.arg=1:length(var_out))
abline(h=cut_off,col=2,lwd=2)
abline(v=i_cut_off)
legend("topright",col=c(2,1),lty=c(1,1),
       legend=c("Vertical line: cutoff", 
       "Horizontal line: 1st value below cutoff"))

#plot cumulative variation 
cumulative_variation<-cumsum(var_out)
plot(cumulative_variation, type="l")
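
The cumulative curve can also be queried directly, for example to find how many PCs are needed to reach half of the total variation (a sketch using cumulative_variation from above):

# first PC at which the cumulative percent variation reaches 50%
# (returns NA if 50% is not reached within the PCs kept)
which(cumulative_variation >= 50)[1]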

Plot PCA results

vcf_pca_scores<-vegan::scores(vcf_pca)
vcf_pca_scores2<-data.frame(super_pop=vcf_noNA$super_pop,vcf_pca_scores)
my_meta_var_PC123[1]
##   PC1 
## 2.439
my_meta_var_PC123[2]
##   PC2 
## 1.777
my_meta_var_PC123[3]
##   PC3 
## 1.637
# plot the results as scatter plots
# PC2 vs PC1 
ggpubr::ggscatter(data=vcf_pca_scores2, y="PC2", x="PC1", 
                  color="super_pop", shape="super_pop", 
                  main="PCA Scatterplot", xlab="PC1 (2.4% of variation)", 
                  ylab="PC2 (1.8% of variation)")

# PC3 vs PC2
ggpubr::ggscatter(data=vcf_pca_scores2, y="PC3", x="PC2", 
                  color="super_pop", shape="super_pop", main="PCA Scatterplot", 
                  xlab="PC2 (1.8% of variation)", 
                  ylab="PC3 (1.6% of variation)")

# PC3 vs PC1
ggpubr::ggscatter(data=vcf_pca_scores2, y="PC3", x="PC1", 
                  color="super_pop", shape="super_pop", main="PCA Scatterplot", 
                  xlab="PC1 (2.4% of variation)", 
                  ylab="PC3 (1.6% of variation)")