Install missing package
#install.packages("kgp")
Load packages
library(vcfR)
##
## ***** *** vcfR *** *****
## This is vcfR 1.13.0
## browseVignettes('vcfR') # Documentation
## citation('vcfR') # Citation
## ***** ***** ***** *****
library(kgp)
library(ggplot2)
library(ggpubr)
The package kgp3 has metadata stored in a nice format
head(kgp3)
## # A tibble: 6 × 10
## fid id pid mid sex sexf pop reg population region
## <chr> <chr> <chr> <chr> <int> <fct> <chr> <chr> <chr> <chr>
## 1 HG00096 HG00096 0 0 1 male GBR EUR British in Englan… Europe
## 2 HG00097 HG00097 0 0 2 female GBR EUR British in Englan… Europe
## 3 HG00099 HG00099 0 0 2 female GBR EUR British in Englan… Europe
## 4 HG00100 HG00100 0 0 2 female GBR EUR British in Englan… Europe
## 5 HG00101 HG00101 0 0 1 male GBR EUR British in Englan… Europe
## 6 HG00102 HG00102 0 0 2 female GBR EUR British in Englan… Europe
Download the vcf file for the lesson
vcf.file <- "1.159051856-159301856.ALL.chr1_GRCh38.genotypes.20170504.vcf.gz"
my_vcf <- vcfR::read.vcfR(vcf.file)
## Scanning file to determine attributes.
## File attributes:
## meta lines: 130
## header_line: 131
## variant count: 6877
## column count: 2513
##
Meta line 130 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
## Character matrix gt rows: 6877
## Character matrix gt cols: 2513
## skip: 0
## nrows: 6877
## row_num: 0
##
Processed variant 1000
Processed variant 2000
Processed variant 3000
Processed variant 4000
Processed variant 5000
Processed variant 6000
Processed variant: 6877
## All variants processed
Get population metadata into a vector
myPops <- factor(kgp3$pop)
Get the data on the SNPs from the raw vcf R object (ignore the syntax)
fixed <- data.frame(my_vcf@fix)
Best known as “Fst”.
Alternate version: “Gst”
# calculate index of genetic distance (Fixation index)
my_Gst <- genetic_diff(my_vcf,
myPops,
method = "nei")
Make a dataframe of results and add to the ID codes for SNPs
# don't worry about all of the syntax
my_df <- data.frame(SNP_i = 1:nrow(my_Gst),
Gst = my_Gst$Gst,
SNP_ID = fixed[,"ID"])
Look at the output
head(my_df)
## SNP_i Gst SNP_ID
## 1 1 0.004563135 rs199561435
## 2 2 0.010048206 rs3737523
## 3 3 0.009692324 rs189523472
## 4 4 0.015376296 rs74122246
## 5 5 0.004474110 rs145194873
## 6 6 0.004851793 rs547902942
ggpubr::gghistogram(data = my_df, x = "Gst")
## Warning: Using `bins = 30` by default. Pick better value with the argument
## `bins`.
## Warning: Removed 26 rows containing non-finite values (stat_bin).
ggpubr::ggscatter(data = my_df,
y = "Gst",
x = "SNP_i",
color = "Gst",
main = "Manhattan Plot")
## Warning: Removed 26 rows containing missing values (geom_point).
Get SNP with maximum Gst
Gst_max <- max(my_df$Gst, na.rm = T)
Gst_max_i <- which.max(my_df$Gst)
fixed[Gst_max_i,1:4]
## CHROM POS ID REF
## 4280 1 159204893 rs2814778 T
fixed[Gst_max_i,"ID"]
## [1] "rs2814778"
The C allele primarily occurs in African populations while the T gene is more commonly found in those of European and Asian descent.
This SNP is within the DARC gene and is a key SNP in the resistance against Plasmodium vivax malaria.