Load Needed Packages

library(vegan)

## Loading required package: permute

## Loading required package: lattice

## This is vegan 2.6-2

library(ggplot2)
library(ggpubr)

Confirm Working Directory

getwd()

## [1] "C:/Users/afhar/Desktop/cb_project"

list.files(pattern="vcf")

## [1] "3.39417505-39657505.ALL.chr3_GRCh38.genotypes.20170504.vcf.gz"
## [2] "numvcf.csv"                                                   
## [3] "vcf_df.csv"                                                   
## [4] "vcf_num.csv"                                                  
## [5] "vcf_num_df.csv"                                               
## [6] "vcf_num_df2.csv"

Set up SNP Data

Load vcf data

# Path to the compressed VCF file (relative to the working directory).
my_vcf <- "3.39417505-39657505.ALL.chr3_GRCh38.genotypes.20170504.vcf.gz"
# Read the VCF with vcfR; TRUE is spelled out because T can be reassigned.
vcf <- vcfR::read.vcfR(my_vcf, convertNA = TRUE)

## Scanning file to determine attributes.
## File attributes:
##   meta lines: 130
##   header_line: 131
##   variant count: 6889
##   column count: 2513
## 
Meta line 130 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
##   Character matrix gt rows: 6889
##   Character matrix gt cols: 2513
##   skip: 0
##   nrows: 6889
##   row_num: 0
## 
Processed variant 1000
Processed variant 2000
Processed variant 3000
Processed variant 4000
Processed variant 5000
Processed variant 6000
Processed variant: 6889
## All variants processed

Convert raw VCF File to genotype scores

Save as csv file

Confirm file

# Extract the GT (genotype) element from the VCF as a numeric matrix.
# TRUE/FALSE are spelled out because T and F can be reassigned.
vcf_num <- vcfR::extract.gt(vcf,
                            element = "GT",
                            IDtoRowNames = FALSE,
                            as.numeric = TRUE,
                            convertNA = TRUE)
# Save the numeric genotype matrix without row names.
write.csv(vcf_num, file = "vcf_num.csv", row.names = FALSE)

list.files()

##  [1] "1000genomes_people_info2-1.csv"                               
##  [2] "3.39417505-39657505.ALL.chr3_GRCh38.genotypes.20170504.vcf.gz"
##  [3] "cb_project.Rproj"                                             
##  [4] "cleaned_data.csv"                                             
##  [5] "final_report_template.Rmd"                                    
##  [6] "FinalProject.Rmd"                                             
##  [7] "Harvill-final_report.docx"                                    
##  [8] "Harvill-final_report.html"                                    
##  [9] "Harvill final_report.Rmd"                                     
## [10] "Harvill_Project_Workflow.html"                                
## [11] "Harvill_Project_Workflow.Rmd"                                 
## [12] "loaded-snp-data.html"                                         
## [13] "loaded data screenshot.png"                                   
## [14] "loaded snp data.R"                                            
## [15] "loaded snp data.Rmd"                                          
## [16] "my_snps"                                                      
## [17] "numvcf.csv"                                                   
## [18] "rsconnect"                                                    
## [19] "vcf_df.csv"                                                   
## [20] "vcf_num.csv"                                                  
## [21] "vcf_num_df.csv"                                               
## [22] "vcf_num_df2.csv"

Transpose original VCF orientation to R dataframe orientation

vcf_num_t<-t(vcf_num)

Convert to dataframe

vcf_num_df<-data.frame(vcf_num_t)

Get person (sample) names

sample<-row.names(vcf_num_df)

Add sample to dataframe

vcf_num_df<-data.frame(sample, vcf_num_df)

Confirm working directory

getwd()

## [1] "C:/Users/afhar/Desktop/cb_project"

Save the csv

# Save the sample-by-SNP data frame without row names
# (FALSE is spelled out because F can be reassigned).
write.csv(vcf_num_df, file = "vcf_num_df.csv",
          row.names = FALSE)

Confirm file

list.files(pattern="csv")

## [1] "1000genomes_people_info2-1.csv" "cleaned_data.csv"              
## [3] "numvcf.csv"                     "vcf_df.csv"                    
## [5] "vcf_num.csv"                    "vcf_num_df.csv"                
## [7] "vcf_num_df2.csv"

Clean Data

Load population meta data

pop_meta<-read.csv(file="1000genomes_people_info2-1.csv")

Make sure column “sample” appears in meta data and SNP data

names(pop_meta)

## [1] "pop"       "super_pop" "sample"    "sex"       "lat"       "lng"

names(vcf_num_df)[1:10]

##  [1] "sample" "X1"     "X2"     "X3"     "X4"     "X5"     "X6"     "X7"    
##  [9] "X8"     "X9"

Merge with SNP data

vcf_num_df2<-merge(pop_meta, vcf_num_df, by="sample")

Check to make sure dimensions are same

nrow(vcf_num_df) == nrow(vcf_num_df2)

## [1] TRUE

Check names of new dataframe

names(vcf_num_df2)[1:15]

##  [1] "sample"    "pop"       "super_pop" "sex"       "lat"       "lng"      
##  [7] "X1"        "X2"        "X3"        "X4"        "X5"        "X6"       
## [13] "X7"        "X8"        "X9"

Confirm working directory

getwd()

## [1] "C:/Users/afhar/Desktop/cb_project"

Save csv

# Save the merged metadata + SNP data frame without row names
# (FALSE is spelled out because F can be reassigned).
write.csv(vcf_num_df2, file = "vcf_num_df2.csv", row.names = FALSE)

Confirm file

list.files(pattern="csv")

## [1] "1000genomes_people_info2-1.csv" "cleaned_data.csv"              
## [3] "numvcf.csv"                     "vcf_df.csv"                    
## [5] "vcf_num.csv"                    "vcf_num_df.csv"                
## [7] "vcf_num_df2.csv"

Omit invariant features

# Remove invariant columns (standard deviation exactly 0) from x.
#
# x: a numeric data frame or matrix with features in columns.
#
# Prints the input dimensions and the number of columns removed, then
# returns x without the zero-variance columns. Columns whose standard
# deviation is NA (e.g. every value missing) are kept, because which()
# ignores NA comparisons.
invar_omit <- function(x) {
  cat("Dataframe of dim",dim(x), "processed...\n")

  # Per-column standard deviation, ignoring missing values.
  sds <- apply(x, 2, sd, na.rm = TRUE)
  i_var0 <- which(sds == 0)

  cat(length(i_var0),"columns removed\n")

  if (length(i_var0) > 0) {
    # drop = FALSE keeps the two-dimensional structure even when only a
    # single column survives; otherwise the result collapses to a vector.
    x <- x[, -i_var0, drop = FALSE]
  }

  x
}

Check which columns have character data

names(vcf_num_df2)[1:10]

##  [1] "sample"    "pop"       "super_pop" "sex"       "lat"       "lng"      
##  [7] "X1"        "X2"        "X3"        "X4"

Skip character columns with negative indexing

# Columns 1-6 hold the sample metadata (sample, pop, super_pop, sex, lat,
# lng); every remaining column is a SNP genotype score.
#
# Rebuild the data frame from the metadata plus the filtered SNP columns.
# Indexed assignment (vcf_noinvar[, -c(1:6)] <- ...) cannot drop columns
# from the target: the data frame keeps every selected column and the
# narrower replacement is stretched to fill them, so the invariant columns
# would not actually be removed.
vcf_noinvar <- data.frame(vcf_num_df2[, 1:6],
                          invar_omit(vcf_num_df2[, -c(1:6)]))

#dim(vcf_noinvar)

## Dataframe of dim 2504 6889 processed...
## 1564 columns removed

#dim(vcf_noinvar)

Create an object of number of columns removed

my_meta_N_invar_cols<-1564

Remove low quality data

# Locate missing values.
#
# x: a vector (or a single data frame row) to inspect.
#
# Returns the integer positions in x at which the value is NA
# (integer(0) when nothing is missing).
find_NAs <- function(x) {
  which(is.na(x))
}

For loop to find NAs

# Columns 1-6 are sample metadata (sample, pop, super_pop, sex, lat, lng),
# so only the remaining columns are SNPs. drop = FALSE keeps a data frame
# even if a single SNP column is left.
snp_cols <- vcf_noinvar[, -c(1:6), drop = FALSE]

# N_rows
# number of samples (rows)
N_rows <- nrow(vcf_noinvar)

# N_SNPs
# total number of SNP columns; the metadata columns are not counted so
# that the percentage of missing SNPs per sample uses the right denominator
N_SNPs <- ncol(snp_cols)

# N_NA
# number of NAs among the SNP columns of each row, computed in a single
# vectorized pass instead of a row-by-row loop; unname() keeps it a plain
# numeric vector of length N_rows
N_NA <- unname(rowSums(is.na(snp_cols)))

Check if any row has >50% NAs

cutoff50 <- N_SNPs*0.5
percent_NA <- N_NA/N_SNPs*100
any(percent_NA > 50)

## [1] FALSE

Find average number of NAs per row

my_meta_N_meanNA_rows <- mean(percent_NA)

Imputation of NAs

Mean imputation on any NAs present

# Replace every NA in each column of df with that column's mean.
#
# df: a numeric data frame or matrix with features in columns.
#
# Returns df with the same dimensions and missing values filled in. The
# column mean is computed over the non-missing values only. A column with
# no non-missing values has no defined mean, so its entries become NaN.
mean_imputation <- function(df) {
  cat("This may take some time...")

  # seq_len() yields an empty sequence for zero columns, whereas
  # 1:ncol(df) would iterate over c(1, 0) and fail.
  for (i in seq_len(ncol(df))) {
    column_i <- df[, i]
    mean_i <- mean(column_i, na.rm = TRUE)
    column_i[is.na(column_i)] <- mean_i
    df[, i] <- column_i
  }

  df
}

Run function on numeric values

names(vcf_noinvar)[1:10]

##  [1] "sample"    "pop"       "super_pop" "sex"       "lat"       "lng"      
##  [7] "X1"        "X2"        "X3"        "X4"

vcf_noNA<-vcf_noinvar
vcf_noNA[,-c(1:6)]<-mean_imputation(vcf_noinvar[,-c(1:6)])

## This may take some time...

Prepare for PCA

Scale data

Only run on SNP columns (use negative indexing to skip character data)

vcf_scaled<-vcf_noNA
vcf_scaled[,-c(1:6)]<-scale(vcf_noNA[,-c(1:6)])

Write Cleaned Data to csv file

# Save the cleaned, scaled data without row names
# (FALSE is spelled out because F can be reassigned).
write.csv(vcf_scaled, file = "cleaned_data.csv", row.names = FALSE)

Work Flow: Analysis of 1000 Genomes Data with PCA

Alison Harvill

2022-12-15

Load Needed Packages

Confirm Working Directory

Set up SNP Data

Load vcf data

Convert raw VCF File to genotype scores

Save as csv file

Confirm file

Transpose original VCF orientation to R dataframe orientation

Convert to dataframe

Get person (sample) names

Add sample to dataframe

Confirm working directory

Save the csv

Confirm file

Clean Data

Load population meta data

Merge with SNP data

Check to make sure dimensions are same

Check names of new dataframe

Confirm working directory

Save csv

Omit invariant features

Remove low quality data

For loop to find NAs

Check if any row has >50% NAs

Imputation of NAs

Prepare for PCA

Scale data

Write Cleaned Data to csv file