Install missing package
# install.packages("kgp")
Load packages
library(vcfR)
##
## ***** *** vcfR *** *****
## This is vcfR 1.13.0
## browseVignettes('vcfR') # Documentation
## citation('vcfR') # Citation
## ***** ***** ***** *****
library(kgp)
library(ggplot2)
library(ggpubr)
The package kgp3 has metadata stored in a nice format
head(kgp3)
## # A tibble: 6 × 10
## fid id pid mid sex sexf pop reg population region
## <chr> <chr> <chr> <chr> <int> <fct> <chr> <chr> <chr> <chr>
## 1 HG00096 HG00096 0 0 1 male GBR EUR British in Englan… Europe
## 2 HG00097 HG00097 0 0 2 female GBR EUR British in Englan… Europe
## 3 HG00099 HG00099 0 0 2 female GBR EUR British in Englan… Europe
## 4 HG00100 HG00100 0 0 2 female GBR EUR British in Englan… Europe
## 5 HG00101 HG00101 0 0 1 male GBR EUR British in Englan… Europe
## 6 HG00102 HG00102 0 0 2 female GBR EUR British in Englan… Europe
Download the vcf file for the lesson
vcf.file <- "2.136483646-136733646.ALL.chr2_GRCh38.genotypes.20170504.vcf.gz"
my_vcf <- vcfR::read.vcfR(vcf.file)
## Scanning file to determine attributes.
## File attributes:
## meta lines: 130
## header_line: 131
## variant count: 7896
## column count: 2513
##
Meta line 130 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
## Character matrix gt rows: 7896
## Character matrix gt cols: 2513
## skip: 0
## nrows: 7896
## row_num: 0
##
Processed variant 1000
Processed variant 2000
Processed variant 3000
Processed variant 4000
Processed variant 5000
Processed variant 6000
Processed variant 7000
Processed variant: 7896
## All variants processed
Get population metadata into a vector
myPops <- factor(kgp3$pop)
Get the data on the SNPs from the raw vcf R object (ignore the syntax)
fixed <- data.frame(my_vcf@fix)
gt <- data.frame(my_vcf@gt)
i.dup <- which(duplicated(fixed$ID) == TRUE)
my_vcf@fix <- my_vcf@fix[-i.dup,]
my_vcf@gt <- my_vcf@gt[-i.dup,]
fixed <- data.frame(my_vcf@fix)
Best known as “Fst”.
Alternate version: “Gst”
# calculate index of genetic distance (Fixation index)
my_Gst <- genetic_diff(my_vcf,
myPops,
method = "nei")
Make a dataframe of results and add to the ID codes for SNPs
# don't worry about all of the syntax
my_df <- data.frame(SNP_i = 1:nrow(my_Gst),
Gst = my_Gst$Gst,
SNP_ID = fixed[,"ID"])
Look at the output
head(my_df)
## SNP_i Gst SNP_ID
## 1 1 0.123309067 rs10169220
## 2 2 0.014619130 rs141621006
## 3 3 0.004563135 rs7425133
## 4 4 0.005683807 rs573725448
## 5 5 0.009408318 rs542309059
## 6 6 0.004851793 rs145572016
ggpubr::gghistogram(data = my_df, x = "Gst")
## Warning: Using `bins = 30` by default. Pick better value with the argument
## `bins`.
## Warning: Removed 29 rows containing non-finite values (stat_bin).
ggpubr::ggscatter(data = my_df,
y = "Gst",
x = "SNP_i",
color = "Gst",
main = "Manhattan Plot")
## Warning: Removed 29 rows containing missing values (geom_point).
Get SNP with maximum Gst
Gst_max <- max(my_df$Gst, na.rm = T)
Gst_max_i <- which.max(my_df$Gst)
fixed[Gst_max_i,1:4]
## CHROM POS ID REF
## 3942 2 136606632 rs574516 T
fixed[Gst_max_i,"ID"]
## [1] "rs574516"