Here, I would like to share some of my results exploring the single nucleotide variation analysis of EBV genomes we sequenced. All the code and data have been deposited to the repo https://github.com/yasinkaymaz/ViralGenomeAssembly Note: Directories are relative to repo master.
library(vcfR)
library(dartR)
require(gridExtra)
library(ggplot2) library(plotly)
library(tidyverse)
source(here::here("PapeRCodes/functions.R")) #Load the data pops <- read.delim(here::here("workspace/data/pop7.txt"),row.names = 1, header = T) vcf.ebv <- read.vcfR(here::here("workspace/data/my_ICed_sequence.aln.Sub_for_Assoc_modified.vcf"))
## Scanning file to determine attributes. ## File attributes: ## meta lines: 3 ## header_line: 4 ## variant count: 9488 ## column count: 107 ## Meta line 3 read in. ## All meta lines processed. ## gt matrix initialized. ## Character matrix gt created. ## Character matrix gt rows: 9488 ## Character matrix gt cols: 107 ## skip: 0 ## nrows: 9488 ## row_num: 0 ## Processed variant 1000 Processed variant 2000 Processed variant 3000 Processed variant 4000 Processed variant 5000 Processed variant 6000 Processed variant 7000 Processed variant 8000 Processed variant 9000 Processed variant: 9488 ## All variants processed
#Convert vcf file into gl format gl <- vcfR2genlight(vcf.ebv)
## Warning in vcfR2genlight(vcf.ebv): Found 1153 loci with more than two alleles. ## Objects of class genlight only support loci with two alleles. ## 1153 loci will be omitted from the genlight object.
#Filter variants from regions that are not covered (sequenced) in at least 25% of all genomes. gl <- gl.filter.callrate(gl, method = "ind",threshold = .25, recalc = TRUE)
## Starting gl.filter.callrate: Filtering on Call Rate ## Removing individuals based on Call Rate, threshold t = 0.25 ## List of individuals deleted because of low call rate: 1-3-0430-07-9_CP_genome_fixed_assembly_Fixed 1-4-0589-06-9_CP_genome_fixed_assembly_Fixed 1-6-6007-08-11_CP_genome_fixed_assembly_Fixed from populations: ## Starting gl.filter.monomorphs: Deleting monomorphic loci ## Deleting monomorphic loci and loci with all NA scores ## Completed gl.filter.monomorphs ## ## Starting gl.recalc.metrics: Recalculating locus metrics ## Starting utils.recalc.avgpic: Recalculating OneRatioRef, OneRatioSnp, PICRef, PICSnp and AvgPIC ## Completed utils.recalc.avgpic ## ## Starting utils.recalc.callrate: Recalculating CallRate ## Completed utils.recalc.callrate ## ## Starting utils.recalc.freqhets: Recalculating frequency of heterozygotes ## Completed utils.recalc.freqhets ## ## Starting utils.recalc.freqhomref: Recalculating frequency of homozygotes, reference allele ## Completed utils.recalc.freqhomref ## ## Starting utils.recalc.freqhomref: Recalculating frequency of homozygotes, alternate allele ## Completed utils.recalc.freqhomsnp ## ## Note: Locus metrics recalculated ## Completed gl.recalc.metrics ## ## gl.filter.callrate completed
#Import genome type and patient info gl$pop <- pops[indNames(gl),2] type <- pops[indNames(gl),1] #Run PCOA analysis for all variants pc <- gl.pcoa(gl)
## Performing a PCoA, individuals as entities, SNP loci as attributes ## Ordination yielded 13 informative dimensions from 94 original dimensions ## PCoA Axis 1 explains 43.2 % of the total variance ## PCoA Axis 1 and 2 combined explain 50.9 % of the total variance ## PCoA Axis 1-3 combined explain 55.7 % of the total variance
pc$loadings %>% as.tibble() %>% ggplot(aes())
df <- as.data.frame(pc$loadings) #ggplot(df, aes(x=as.numeric(gl@position),y=Axis2, group = 1))+geom_point(size=.1)
Here is the PCOA of 1st and 2nd axis with all variants.
# p <- gl.pcoa.plot(pc, gl, type = type, xaxis=1, yaxis=2, ellipse = F, labels = "pop", title = "for All variants")
## Plotting populations
ggplotly(p)
## Warning in normalizePath(f2): path[1]="./webshot4f959bb351c.png": No such ## file or directory
## Warning in file(con, "rb"): cannot open file './webshot4f959bb351c.png': No ## such file or directory
## Error in file(con, "rb"): cannot open the connection