Exploring population genetics of EBV genomes in BL endemic regions

Here, I would like to share some of my results exploring the single nucleotide variation analysis of EBV genomes we sequenced. All the code and data have been deposited to the repo https://github.com/yasinkaymaz/ViralGenomeAssembly Note: Directories are relative to repo master.

library(vcfR)

## 
##    *****       ***   vcfR   ***       *****
##    This is vcfR 1.8.0 
##      browseVignettes('vcfR') # Documentation
##      citation('vcfR') # Citation
##    *****       *****      *****       *****

library(dartR)

## Loading required package: adegenet

## Loading required package: ade4

## 
##    /// adegenet 2.1.1 is loaded ////////////
## 
##    > overview: '?adegenet'
##    > tutorials/doc/questions: 'adegenetWeb()' 
##    > bug reports/feature requests: adegenetIssues()

require(gridExtra)

## Loading required package: gridExtra

library(ggplot2)
library(plotly)

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

library(tidyverse)

## ── Attaching packages ─────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──

## ✔ tibble  1.4.2     ✔ purrr   0.2.5
## ✔ tidyr   0.8.1     ✔ dplyr   0.7.6
## ✔ readr   1.1.1     ✔ stringr 1.3.1
## ✔ tibble  1.4.2     ✔ forcats 0.3.0

## ── Conflicts ────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::combine() masks gridExtra::combine()
## ✖ dplyr::filter()  masks plotly::filter(), stats::filter()
## ✖ dplyr::lag()     masks stats::lag()

source(here::here("PapeRCodes/functions.R"))
#Load the data
pops <- read.delim(here::here("workspace/data/pop7.txt"),row.names = 1, header = T)
vcf.ebv <- read.vcfR(here::here("workspace/data/my_ICed_sequence.aln.Sub_for_Assoc_modified.vcf"))

## Scanning file to determine attributes.
## File attributes:
##   meta lines: 3
##   header_line: 4
##   variant count: 9488
##   column count: 107
## 
Meta line 3 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
##   Character matrix gt rows: 9488
##   Character matrix gt cols: 107
##   skip: 0
##   nrows: 9488
##   row_num: 0
## 
Processed variant 1000
Processed variant 2000
Processed variant 3000
Processed variant 4000
Processed variant 5000
Processed variant 6000
Processed variant 7000
Processed variant 8000
Processed variant 9000
Processed variant: 9488
## All variants processed

#Convert vcf file into gl format
gl <- vcfR2genlight(vcf.ebv)

## Warning in vcfR2genlight(vcf.ebv): Found 1153 loci with more than two alleles.
## Objects of class genlight only support loci with two alleles.
## 1153 loci will be omitted from the genlight object.

#Filter variants from regions that are not covered (sequenced) in at least 25% of all genomes.
gl <- gl.filter.callrate(gl, method = "ind",threshold = .25, recalc = TRUE)

## Starting gl.filter.callrate: Filtering on Call Rate
##   Removing individuals based on Call Rate, threshold t = 0.25 
##   List of individuals deleted because of low call rate: 1-3-0430-07-9_CP_genome_fixed_assembly_Fixed 1-4-0589-06-9_CP_genome_fixed_assembly_Fixed 1-6-6007-08-11_CP_genome_fixed_assembly_Fixed from populations:  
## Starting gl.filter.monomorphs: Deleting monomorphic loci
##   Deleting monomorphic loci and loci with all NA scores
## Completed gl.filter.monomorphs
## 
## Starting gl.recalc.metrics: Recalculating locus metrics
## Starting utils.recalc.avgpic: Recalculating OneRatioRef, OneRatioSnp, PICRef, PICSnp and AvgPIC
## Completed utils.recalc.avgpic
## 
## Starting utils.recalc.callrate: Recalculating CallRate
## Completed utils.recalc.callrate
## 
## Starting utils.recalc.freqhets: Recalculating frequency of heterozygotes
## Completed utils.recalc.freqhets
## 
## Starting utils.recalc.freqhomref: Recalculating frequency of homozygotes, reference allele
## Completed utils.recalc.freqhomref
## 
## Starting utils.recalc.freqhomref: Recalculating frequency of homozygotes, alternate allele
## Completed utils.recalc.freqhomsnp
## 
## Note: Locus metrics recalculated
## Completed gl.recalc.metrics
## 
## gl.filter.callrate completed

#Import genome type and patient info
gl$pop <- pops[indNames(gl),2]
type <- pops[indNames(gl),1]

#Run PCOA analysis for all variants
pc <- gl.pcoa(gl)

## Performing a PCoA, individuals as entities, SNP loci as attributes
## Ordination yielded 13 informative dimensions from 94 original dimensions
##   PCoA Axis 1 explains 43.2 % of the total variance
##   PCoA Axis 1 and 2 combined explain 50.9 % of the total variance
##   PCoA Axis 1-3 combined explain 55.7 % of the total variance

pc$loadings %>% as.tibble() %>% ggplot(aes())

df <- as.data.frame(pc$loadings)
#ggplot(df, aes(x=as.numeric(gl@position),y=Axis2, group = 1))+geom_point(size=.1)