Packages

Install missing package

#install.packages("kgp")

Load packages

library(vcfR)
## 
##    *****       ***   vcfR   ***       *****
##    This is vcfR 1.13.0 
##      browseVignettes('vcfR') # Documentation
##      citation('vcfR') # Citation
##    *****       *****      *****       *****
library(kgp)
library(ggplot2)
library(ggpubr)

The package kgp3 has metadata stored in a nice format

head(kgp3)
## # A tibble: 6 × 10
##   fid     id      pid   mid     sex sexf   pop   reg   population         region
##   <chr>   <chr>   <chr> <chr> <int> <fct>  <chr> <chr> <chr>              <chr> 
## 1 HG00096 HG00096 0     0         1 male   GBR   EUR   British in Englan… Europe
## 2 HG00097 HG00097 0     0         2 female GBR   EUR   British in Englan… Europe
## 3 HG00099 HG00099 0     0         2 female GBR   EUR   British in Englan… Europe
## 4 HG00100 HG00100 0     0         2 female GBR   EUR   British in Englan… Europe
## 5 HG00101 HG00101 0     0         1 male   GBR   EUR   British in Englan… Europe
## 6 HG00102 HG00102 0     0         2 female GBR   EUR   British in Englan… Europe

Load data

Download the vcf file for the lesson

vcf.file <- "1.159051856-159301856.ALL.chr1_GRCh38.genotypes.20170504.vcf.gz"
my_vcf <- vcfR::read.vcfR(vcf.file)
## Scanning file to determine attributes.
## File attributes:
##   meta lines: 130
##   header_line: 131
##   variant count: 6877
##   column count: 2513
## 
Meta line 130 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
##   Character matrix gt rows: 6877
##   Character matrix gt cols: 2513
##   skip: 0
##   nrows: 6877
##   row_num: 0
## 
Processed variant 1000
Processed variant 2000
Processed variant 3000
Processed variant 4000
Processed variant 5000
Processed variant 6000
Processed variant: 6877
## All variants processed

Get population metadata into a vector

myPops <- factor(kgp3$pop)

Get the data on the SNPs from the raw vcf R object (ignore the syntax)

fixed <- data.frame(my_vcf@fix)

Calculate “Fixation Index”

Best known as “Fst”.

Alternate version: “Gst”

# calculate index of genetic distance (Fixation index)
my_Gst <- genetic_diff(my_vcf,
                       myPops, 
                       method = "nei")

Make a dataframe of results and add to the ID codes for SNPs

# don't worry about all of the syntax
my_df <- data.frame(SNP_i = 1:nrow(my_Gst),
                    Gst = my_Gst$Gst,
                    SNP_ID = fixed[,"ID"])

Look at the output

head(my_df)
##   SNP_i         Gst      SNP_ID
## 1     1 0.004563135 rs199561435
## 2     2 0.010048206   rs3737523
## 3     3 0.009692324 rs189523472
## 4     4 0.015376296  rs74122246
## 5     5 0.004474110 rs145194873
## 6     6 0.004851793 rs547902942
ggpubr::gghistogram(data = my_df, x = "Gst")
## Warning: Using `bins = 30` by default. Pick better value with the argument
## `bins`.
## Warning: Removed 26 rows containing non-finite values (stat_bin).

ggpubr::ggscatter(data = my_df, 
                  y = "Gst",
                  x = "SNP_i",
                  color = "Gst",
                  main = "Manhattan Plot")
## Warning: Removed 26 rows containing missing values (geom_point).

Get SNP with maximum Gst

Gst_max   <- max(my_df$Gst, na.rm = T)
#Gst_max_i <- which.max(myDiff$Gst)
Gst_max_i <- which.max(my_df$Gst)

fixed[Gst_max_i,1:4]
##      CHROM       POS        ID REF
## 4280     1 159204893 rs2814778   T
fixed[Gst_max_i,"ID"]
## [1] "rs2814778"

The geographic pattern of this gene is more prevalent in Africa and a few parts of the Americas, which can be contributed to the West African Slave Trade and movement of those populations during that time.

The biological impact can be seen in the way the gene is distributed throughout populations around the world.