Install missing package
#install.packages("kgp")
Load packages
library(vcfR)
##
## ***** *** vcfR *** *****
## This is vcfR 1.13.0
## browseVignettes('vcfR') # Documentation
## citation('vcfR') # Citation
## ***** ***** ***** *****
library(kgp)
library(ggplot2)
library(ggpubr)
The package kgp3 has metadata stored in a nice format
head(kgp3)
## # A tibble: 6 × 10
## fid id pid mid sex sexf pop reg population region
## <chr> <chr> <chr> <chr> <int> <fct> <chr> <chr> <chr> <chr>
## 1 HG00096 HG00096 0 0 1 male GBR EUR British in Englan… Europe
## 2 HG00097 HG00097 0 0 2 female GBR EUR British in Englan… Europe
## 3 HG00099 HG00099 0 0 2 female GBR EUR British in Englan… Europe
## 4 HG00100 HG00100 0 0 2 female GBR EUR British in Englan… Europe
## 5 HG00101 HG00101 0 0 1 male GBR EUR British in Englan… Europe
## 6 HG00102 HG00102 0 0 2 female GBR EUR British in Englan… Europe
Download the vcf file for the lesson
vcf.file <- "1.159051856-159301856.ALL.chr1_GRCh38.genotypes.20170504.vcf.gz"
my_vcf <- vcfR::read.vcfR(vcf.file)
## Scanning file to determine attributes.
## File attributes:
## meta lines: 130
## header_line: 131
## variant count: 6877
## column count: 2513
##
Meta line 130 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
## Character matrix gt rows: 6877
## Character matrix gt cols: 2513
## skip: 0
## nrows: 6877
## row_num: 0
##
Processed variant 1000
Processed variant 2000
Processed variant 3000
Processed variant 4000
Processed variant 5000
Processed variant 6000
Processed variant: 6877
## All variants processed
Get population metadata into a vector
myPops <- factor(kgp3$pop)
Get the data on the SNPs from the raw vcf R object (ignore the syntax)
fixed <- data.frame(my_vcf@fix)
Best known as “Fst”. Alternate version: “Gst”
# calculate index of genetic distance (Fixation index)
my_Gst <- genetic_diff(my_vcf,
myPops,
method = "nei")
Make a dataframe of results and add to the ID codes for SNPs
# don't worry about all of the syntax
my_df <- data.frame(SNP_i = 1:nrow(my_Gst),
Gst = my_Gst$Gst,
SNP_ID = fixed[,"ID"])
Look at the output
head(my_df)
## SNP_i Gst SNP_ID
## 1 1 0.004563135 rs199561435
## 2 2 0.010048206 rs3737523
## 3 3 0.009692324 rs189523472
## 4 4 0.015376296 rs74122246
## 5 5 0.004474110 rs145194873
## 6 6 0.004851793 rs547902942
ggpubr::gghistogram(data = my_df, x = "Gst")
## Warning: Using `bins = 30` by default. Pick better value with the argument
## `bins`.
## Warning: Removed 26 rows containing non-finite values (stat_bin).
ggpubr::ggscatter(data = my_df,
y = "Gst",
x = "SNP_i",
color = "Gst",
main = "Manhattan Plot")
## Warning: Removed 26 rows containing missing values (geom_point).
# will see a singular SNP that stands out above the rest in terms of Gst -> a unique SNP
# high Gst (close to 1.0) = unique
# low Gst (close to 0.0) = non-unique
Get SNP with maximum Gst
Gst_max <- max(my_df$Gst, na.rm = T) # gets the highest Gst value in the dataset (the most unique)
Gst_max_i <- which.max(my_df$Gst) # gets the SNP ID of the most unique SNP (highest Gst)
fixed[Gst_max_i,1:4]
## CHROM POS ID REF
## 4280 1 159204893 rs2814778 T
fixed[Gst_max_i,"ID"]
## [1] "rs2814778"
# Geography world view of the SNP shows its uniqueness
# SNP is nearly fixed (all but a line of blue) in African populations
# everywhere else, SNP is nearly non-fixed
# this fixation is a result of malaria-induced natural selection
The allele is predominantly yellow throughout the globe, but is notably almost entirely blue in one specific region, indicating that the allele is nearly completely fixed in African populations.
We can interpret the involvement of natural selection in the allele frequency pattern exhibited by the map. The fixation of that certain allele/SNP in the African region alone suggests that it brought some evolutionary benefit to the inhabitants of that region.