library(vcfR)
***** *** vcfR *** *****
This is vcfR 1.13.0
browseVignettes('vcfR') # Documentation
citation('vcfR') # Citation
***** ***** ***** *****
library(vegan)
Loading required package: permute
Loading required package: lattice
This is vegan 2.6-4
library(ggplot2)
library(ggpubr)
getwd()
[1] "/Users/james"
list.files(pattern = "vcf")
[1] "vcfR_test.vcf" "vcfR_test.vcf.gz"
##Set SNP data for R
my_vcf <- "8.21122256-21362256.ALL.chr8_GRCh38.genotypes.20170504.vcf.gz"
vcf <- vcfR::read.vcfR(my_vcf, convertNA = TRUE)
Scanning file to determine attributes.
File attributes:
meta lines: 130
header_line: 131
variant count: 8991
column count: 2513
Meta line 130 read in.
All meta lines processed.
gt matrix initialized.
Character matrix gt created.
Character matrix gt rows: 8991
Character matrix gt cols: 2513
skip: 0
nrows: 8991
row_num: 0
Processed variant 1000
Processed variant 2000
Processed variant 3000
Processed variant 4000
Processed variant 5000
Processed variant 6000
Processed variant 7000
Processed variant 8000
Processed variant: 8991
All variants processed
vcf_num <- vcfR::extract.gt(vcf,
element = "GT",
IDtoRowNames = F,
as.numeric = T,
convertNA = T)
vcf_num_t <- t(vcf_num)
## make into a dataframe
vcf_num_df <- data.frame(vcf_num_t)
## get person (sample) names
sample <- row.names(vcf_num_df)
## Add sample into dataframe
vcf_num_df <- data.frame(sample,
vcf_num_df)
pop_meta <- read.csv(file = "1000genomes_people_info2-1.csv")
names(pop_meta)
[1] "pop" "super_pop" "sample" "sex" "lat" "lng"
names(vcf_num_df)[1:10]
[1] "sample" "X1" "X2" "X3" "X4" "X5" "X6" "X7" "X8" "X9"
vcf_num_df2 <- merge(pop_meta,
vcf_num_df,
by = "sample")
write.csv(vcf_num_df2, file = "vcf_num_df2.csv", row.names = F)
## function
invar_omit <- function(x){
cat("Dataframe of dim",dim(x), "processed...\n")
sds <- apply(x, 2, sd, na.rm = TRUE)
i_var0 <- which(sds == 0)
cat(length(i_var0),"columns removed\n")
if(length(i_var0) > 0){
x <- x[, -i_var0]
}
## add return() with x in it
return(x)
}
vcf_noinvar <- vcf_num_df2
vcf_noinvar[, -c(1:6)] <- invar_omit(vcf_noinvar[, -c(1:6)])
Dataframe of dim 2504 8991 processed...
2208 columns removed
my_meta_N_invar_cols <- 2208
remove rows or SNPs with many NAs
find_NAs <- function(x){
NAs_TF <- is.na(x)
i_NA <- which(NAs_TF == TRUE)
N_NA <- length(i_NA)
return(i_NA)
}
N_rows <- nrow(vcf_noinvar)
N_NA <- rep(x = 0, times = N_rows)
N_SNPs <- ncol(vcf_noinvar)
##for loop to search for NAs
for (i in 1:N_rows){
i_NA <- find_NAs(vcf_noinvar[i,])
N_NA_i <- length(i_NA)
N_NA[i] <- N_NA_i
}
cutoff50 <- N_SNPs*0.5
percent_NA <- N_NA/N_SNPs*100
any(percent_NA > 50)
[1] FALSE
my_meta_N_invar_rows <- mean(percent_NA)
mean_imputation <- function(df){
n_cols <- ncol(df)
for(i in 1:n_cols){
column_i <- df[, i]
mean_i <- mean(column_i, na.rm = TRUE)
NAs_i <- which(is.na(column_i))
N_NAs <- length(NAs_i)
column_i[NAs_i] <- mean_i
df[, i] <- column_i
}
return(df)
}
names(vcf_noinvar)[1:10]
[1] "sample" "pop" "super_pop" "sex" "lat" "lng" "X1" "X2"
[9] "X3" "X4"
vcf_noNA <- vcf_noinvar
vcf_noNA[, -c(1:6)] <- mean_imputation(vcf_noinvar[, -c(1:6)])
write.csv(vcf_noNA, file = "Cleaned_Data.csv", row.names = F)