Packages

Install missing package

#install.packages("kgp")

Load packages

library(vcfR)
## 
##    *****       ***   vcfR   ***       *****
##    This is vcfR 1.13.0 
##      browseVignettes('vcfR') # Documentation
##      citation('vcfR') # Citation
##    *****       *****      *****       *****
library(kgp)
library(ggplot2)
library(ggpubr)

The package kgp3 has metadata stored in a nice format

head(kgp3)
## # A tibble: 6 × 10
##   fid     id      pid   mid     sex sexf   pop   reg   population         region
##   <chr>   <chr>   <chr> <chr> <int> <fct>  <chr> <chr> <chr>              <chr> 
## 1 HG00096 HG00096 0     0         1 male   GBR   EUR   British in Englan… Europe
## 2 HG00097 HG00097 0     0         2 female GBR   EUR   British in Englan… Europe
## 3 HG00099 HG00099 0     0         2 female GBR   EUR   British in Englan… Europe
## 4 HG00100 HG00100 0     0         2 female GBR   EUR   British in Englan… Europe
## 5 HG00101 HG00101 0     0         1 male   GBR   EUR   British in Englan… Europe
## 6 HG00102 HG00102 0     0         2 female GBR   EUR   British in Englan… Europe

Load data

Download the vcf file for the lesson

vcf.file <- "1.159051856-159301856.ALL.chr1_GRCh38.genotypes.20170504.vcf.gz"
my_vcf <- vcfR::read.vcfR(vcf.file)
## Scanning file to determine attributes.
## File attributes:
##   meta lines: 130
##   header_line: 131
##   variant count: 6877
##   column count: 2513
## 
Meta line 130 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
##   Character matrix gt rows: 6877
##   Character matrix gt cols: 2513
##   skip: 0
##   nrows: 6877
##   row_num: 0
## 
Processed variant 1000
Processed variant 2000
Processed variant 3000
Processed variant 4000
Processed variant 5000
Processed variant 6000
Processed variant: 6877
## All variants processed

Get population metadata into a vector

myPops <- factor(kgp3$pop)

Get the data on the SNPs from the raw vcf R object (ignore the syntax)

fixed <- data.frame(my_vcf@fix)

Calculate “Fixation Index”

Best known as “Fst”.

Alternate version: “Gst”

# calculate index of genetic distance (Fixation index)
my_Gst <- genetic_diff(my_vcf,
                       myPops, 
                       method = "nei")

Make a dataframe of results and add to the ID codes for SNPs

# don't worry about all of the syntax
my_df <- data.frame(SNP_i = 1:nrow(my_Gst),
                    Gst = my_Gst$Gst,
                    SNP_ID = fixed[,"ID"])

Look at the output

head(my_df)
##   SNP_i         Gst      SNP_ID
## 1     1 0.004563135 rs199561435
## 2     2 0.010048206   rs3737523
## 3     3 0.009692324 rs189523472
## 4     4 0.015376296  rs74122246
## 5     5 0.004474110 rs145194873
## 6     6 0.004851793 rs547902942
ggpubr::gghistogram(data = my_df, x = "Gst")
## Warning: Using `bins = 30` by default. Pick better value with the argument
## `bins`.
## Warning: Removed 26 rows containing non-finite values (stat_bin).

ggpubr::ggscatter(data = my_df, 
                  y = "Gst",
                  x = "SNP_i",
                  color = "Gst",
                  main = "Manhattan Plot")
## Warning: Removed 26 rows containing missing values (geom_point).

Get SNP with maximum Gst

Gst_max   <- max(my_df$Gst, na.rm = T)
Gst_max_i <- which.max(my_df$Gst)


fixed[Gst_max_i,1:4]
##      CHROM       POS        ID REF
## 4280     1 159204893 rs2814778   T
fixed[Gst_max_i,"ID"]
## [1] "rs2814778"

The C allele primarily occurs in African populations while the T gene is more commonly found in those of European and Asian descent.

This SNP is within the DARC gene and is a key SNP in the resistance against Plasmodium vivax malaria.