Load the vcfR and other packages with library()
library(vcfR)
library(vegan)
library(ggplot2)
library(ggpubr)
Make sure that the working directory is set to the location of the SNP file
setwd("~/BIOSC_1540/BIOSC1540Project")
getwd()
## [1] "/Users/joelpredix/BIOSC_1540/BIOSC1540Project"
list.files(pattern = "vcf")
## [1] "10.29000-269000.ALL.chr10_GRCh38.genotypes.20170504.vcf"
## [2] "10.29000-269000.ALL.chr10_GRCh38.genotypes.20170504.vcf.gz"
## [3] "12.14685036-14925036.ALL.chr12_GRCh38.genotypes.20170504.vcf.gz"
## [4] "2.136483646-136733646.ALL.chr2_GRCh38.genotypes.20170504.vcf.gz"
## [5] "all_loci-1.vcf"
## [6] "all_loci.vcf"
Read in the vcf file containing the SNP data from Chromosome 10 and convert missing data to NA, store data into an object called snps. Read in the csv file with info from 1000 genomes
snps <- vcfR::read.vcfR("10.29000-269000.ALL.chr10_GRCh38.genotypes.20170504.vcf.gz", convertNA =
TRUE)
## Scanning file to determine attributes.
## File attributes:
## meta lines: 130
## header_line: 131
## variant count: 6735
## column count: 2513
##
Meta line 130 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
## Character matrix gt rows: 6735
## Character matrix gt cols: 2513
## skip: 0
## nrows: 6735
## row_num: 0
##
Processed variant 1000
Processed variant 2000
Processed variant 3000
Processed variant 4000
Processed variant 5000
Processed variant 6000
Processed variant: 6735
## All variants processed
csv_genome <- read.csv(file = "1000genomes_people_info2-1.csv")
Use vcfR::extract.gt(snps) in order to make the genotypes in the data R-compatible
snps_num <- vcfR::extract.gt(snps,
element = "GT",
IDtoRowNames = F,
as.numeric = T,
convertNA = T,
return.alleles = F)
Rotate the data so the columns are now the rows and rows now the columns, this makes looking and manipulating the data easier in R
snps_num_t <- t(snps_num)
Turn the data into a data.frame to easily manipulate the data in R
snps_num_df <- data.frame(snps_num_t)
csv_genome_df <- data.frame(csv_genome)
Extract the row names in the data frame and turn this into a column. This process will help with merging the data frames
sample <- row.names(snps_num_df)
snps_num_df <- data.frame(sample, snps_num_df)
Merge the data frames created above. Doing so will help with graphing the super populations when doing PCA
snps_num_df_2 <- merge(csv_genome_df, snps_num_df, by = "sample")
Omits invariant data, since the SNP is fixed for that location. Due to problem with function, extract the first six columns and put them into a separate data frame. Then use c bind to put the categoral variables back into the final noinvar data frame
invar_omit <- function(x){
cat("Dataframe of dim",dim(x), "processed...\n")
sds <- apply(x, 2, sd, na.rm = TRUE)
i_var0 <- which(sds == 0)
cat(length(i_var0),"columns removed\n")
if(length(i_var0) > 0){
x <- x[, -i_var0]
}
return(x)
}
snps_noinvar <- snps_num_df_2
first_six <- snps_noinvar[, c(1:6)]
snps_noinvar <- invar_omit(snps_noinvar[, -c(1:6)])
## Dataframe of dim 2504 6735 processed...
## 1834 columns removed
snps_noinvar2 <- cbind(first_six, snps_noinvar)
Function that looks at NAs in a single column or vector, returns the index values
find_NAs <- function(x){
NAs_TF <- is.na(x)
i_NA <- which(NAs_TF == TRUE)
N_NA <- length(i_NA)
cat("Results:",N_NA, "NAs present\n.")
return(i_NA)
}
A for loop that goes through each row of the data and find how many NAs are in each row
Replaces NAs with the mean of that column
mean_imputation <- function(df){
N_col <- ncol(df)
for(i in 1:N_col){
column_i <- df[, i]
mean_i <- mean(column_i, na.rm = TRUE)
NAs_i <- which(is.na(column_i))
N_NAs <- length(NAs_i)
column_i[NAs_i] <- mean_i
df[, i] <- column_i
}
return(df)
}
snps_noNA <- snps_noinvar2
snps_noNA[, -c(1:6)] <- mean_imputation(snps_noinvar2[, -c(1:6)])
Use scale on the final data frame for PCA
vcf_scaled <- snps_noNA
vcf_scaled[,-c(1:6)] <- scale(vcf_scaled[,-c(1:6)])
Save the data with no NAs and invariants as a .csv file which can be loaded again later.
write.csv(vcf_scaled, file = "SNPs_cleaned.csv",
row.names = F)
Check for the presence of the file with list.files()
list.files(pattern = ".csv")
## [1] "1000genomes_people_info2-1.csv" "SNPs_cleaned.csv"
## [3] "walsh2017morphology.csv"