Overview:

This script takes a formatted variant locus file from “4.vcf.to.ordination.aa.rmd” and attaches gene names onto only the non-synonymous variants for feeding into script 6 for allelic characterization and script 8 for the gene variation (~Ka/Ks) pipeline

Input: * aa.variant.table.csv: output from “4.vcf.to.ordination.aa.rmd” * g.morbida_CDS.gff3: simple gene start and stop descriptions

Output: * aa.variant.annotations.csv: roughly formatted non-synonymous variant to gene map * aa.loci.to.genes.csv: a clean map of non-synonymous variant loci to genes

importing and reformatting the gff3 file

library(stringr)
gff <- read.csv("../g.morbida_CDS.gff3", sep = "\t", row.names = NULL, skip = 2, header = FALSE)
gff <- gff[c(1,4,5,9)]
colnames(gff) <- c("scaffold","start","stop","gene")
gene.split <- str_split_fixed(gff$gene, ";",2)
gene.split <- str_split_fixed(gene.split[,1], "=",2)
gene.split <- str_split_fixed(gene.split[,2], ":",2)
gff$gene <- as.character(gene.split[,1])
kable(head(gff),format="markdown")
scaffold start stop gene
scaffold_0 674 1516 augustus_masked-scaffold_0-processed-gene-0.2274-mRNA-1
scaffold_0 9081 12438 maker-scaffold_0-augustus-gene-0.2665-mRNA-1
scaffold_0 12504 13645 maker-scaffold_0-augustus-gene-0.2665-mRNA-1
scaffold_0 14732 15859 augustus_masked-scaffold_0-processed-gene-0.1933-mRNA-1
scaffold_0 16219 16581 maker-scaffold_0-augustus-gene-0.2819-mRNA-1
scaffold_0 16781 17263 maker-scaffold_0-augustus-gene-0.2819-mRNA-1

Importing the variant table

This table was created by the rmd script entitield “geosmithia.vcf.to.ordination.rmd” and was written as “variant.table.csv”

the table is imported and formatted such it has independent scaffold and position fields

var.table <- read.csv("aa.variant.table.csv")
var.table <- var.table[-1]
kable(head(var.table),format="markdown")
locus GM11 GM17 GM20 GM25 GM39 GM59 GM64 GM108 GM118 GM140 GM158 GM170 GM177 GM178 GM179 GM188 GM196 GM205 GM216 GM219 GM236 GM307
scaffold_0_1000237|A/G 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
scaffold_0_1000453|C/T 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
scaffold_0_1000478|C/T 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
scaffold_0_1000576|G/C 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
scaffold_0_1000669|T/A 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
scaffold_0_1000736|T/C 0 0 0 1 1 0 1 1 1 1 1 1 0 1 1 0 1 0 1 1 0 0
library(stringr)
library(tidyr)
locus.split <- data.frame(str_split_fixed(var.table$locus, "_", 2))
locus.split2 <- data.frame(str_split_fixed(locus.split$X2, "_", 2))
locus.split3 <- data.frame(str_split_fixed(locus.split2$X2, "\\|", 2))
locus.split4 <- cbind(locus.split,locus.split2,locus.split3)
locus.split4 <- locus.split4[-c(2,4)]
locus.split <- cbind(unite(locus.split4[c(1,2)], col="locus",sep="_",remove=TRUE), locus.split4[c(3,4)])
colnames(locus.split) <- c("scaffold","location","var")
var.table.split <- cbind(var.table[1],locus.split,var.table[-1])
var.table.split$location <- as.numeric(as.character(var.table.split$location))
kable(head(var.table.split),format="markdown")
locus scaffold location var GM11 GM17 GM20 GM25 GM39 GM59 GM64 GM108 GM118 GM140 GM158 GM170 GM177 GM178 GM179 GM188 GM196 GM205 GM216 GM219 GM236 GM307
scaffold_0_1000237|A/G scaffold_0 1000237 A/G 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
scaffold_0_1000453|C/T scaffold_0 1000453 C/T 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
scaffold_0_1000478|C/T scaffold_0 1000478 C/T 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
scaffold_0_1000576|G/C scaffold_0 1000576 G/C 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
scaffold_0_1000669|T/A scaffold_0 1000669 T/A 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
scaffold_0_1000736|T/C scaffold_0 1000736 T/C 0 0 0 1 1 0 1 1 1 1 1 1 0 1 1 0 1 0 1 1 0 0

attaching gene names to variant loci

This is accomplished with two logical tests 1. for each line, if scaffold matches AND location is between start and stop, then the annotations are transferred over

This chunk is VERY SLOW!

testvar2 = var.table.split
n=nrow(testvar2)

#we now loop through the rows of the variant table and find the corresponding row in the gff
#This is a very slow funtion and could be improved using a vectorized function
#it could also be improved by making the gff an actual lookup table

#this loop finds the fimarkdown position that maps to an annotation and makes a new dataframe
for (x in c(1:n)) {
  x.sub1 <- subset(gff, scaffold == testvar2[x,"scaffold"] & start <= testvar2[x,"location"] & stop >=testvar2[x,"location"])
  if (nrow(x.sub1) >= 1) {
    testvar3 = cbind(testvar2[x,],x.sub1$gene)
    colnames(testvar3)[ncol(testvar3)] <- "gene"
    restart = x+1 #the next loop, which populates this new mapping table
    break}
  else {NULL}
  rm(x.sub1)
}

#this now goes through every line in the variant table and creates a line for each mapping in the testvar3 object
#it will create a line for each mapping in the case of multiple annotations per position
for (x in c(restart:n)) {
  x.sub1 <- subset(gff, scaffold == testvar2[x,"scaffold"] & start <= testvar2[x,"location"] & stop >=testvar2[x,"location"])
  if (nrow(x.sub1) >= 1) {
    testvar4 = cbind(testvar2[x,],x.sub1$gene)
    colnames(testvar4)[ncol(testvar3)] <- "gene"
    testvar3 <- rbind(testvar3,testvar4)}
  else {NULL}
  rm(x.sub1)
}
write.csv(testvar3,"aa.variant.annotations.csv")

How many dupicate mappings are there? i.e. variants at position that map to more than one annotation?

cat(nrow(testvar3)-nrow(testvar2))
## 118
var.annotations.aa <- cbind(testvar3$gene,testvar3[-27])
kable(var.annotations.aa[40:60,],format="markdown")
testvar3$gene locus scaffold location var GM11 GM17 GM20 GM25 GM39 GM59 GM64 GM108 GM118 GM140 GM158 GM170 GM177 GM178 GM179 GM188 GM196 GM205 GM216 GM219 GM236 GM307
40 maker-scaffold_0-augustus-gene-0.2914-mRNA-1 scaffold_0_1013345|T/C scaffold_0 1013345 T/C 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
41 maker-scaffold_0-augustus-gene-0.2914-mRNA-1 scaffold_0_1013347|C/A scaffold_0 1013347 C/A 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
42 snap_masked-scaffold_0-processed-gene-0.599-mRNA-1 scaffold_0_1014934|C/T scaffold_0 1014934 C/T 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0
43 snap_masked-scaffold_0-processed-gene-0.599-mRNA-1 scaffold_0_1014943|C/T scaffold_0 1014943 C/T 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
44 snap_masked-scaffold_0-processed-gene-0.599-mRNA-1 scaffold_0_1015017|A/G scaffold_0 1015017 A/G 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
45 snap_masked-scaffold_0-processed-gene-0.599-mRNA-1 scaffold_0_1015422|C/T scaffold_0 1015422 C/T 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
46 snap_masked-scaffold_0-processed-gene-0.599-mRNA-1 scaffold_0_1015426|G/A scaffold_0 1015426 G/A 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
47 snap_masked-scaffold_0-processed-gene-0.599-mRNA-1 scaffold_0_1015588|C/T scaffold_0 1015588 C/T 0 0 0 1 0 0 0 1 1 0 1 1 1 0 1 0 0 0 1 1 0 0
48 augustus_masked-scaffold_0-processed-gene-0.2442-mRNA-1 scaffold_0_1017769|G/A scaffold_0 1017769 G/A 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
49 augustus_masked-scaffold_0-processed-gene-0.2442-mRNA-1 scaffold_0_1018802|C/T scaffold_0 1018802 C/T 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
50 maker-scaffold_0-augustus-gene-0.2665-mRNA-1 scaffold_0_10212|A/G scaffold_0 10212 A/G 0 0 0 1 0 0 0 1 1 1 1 1 1 0 1 1 0 0 0 0 0 0
51 augustus_masked-scaffold_0-processed-gene-0.2442-mRNA-1 scaffold_0_1022965|G/A scaffold_0 1022965 G/A 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
52 augustus_masked-scaffold_0-processed-gene-0.2442-mRNA-1 scaffold_0_1024682|A/C scaffold_0 1024682 A/C 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
53 augustus_masked-scaffold_0-processed-gene-0.2442-mRNA-1 scaffold_0_1024684|T/G scaffold_0 1024684 T/G 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
54 augustus_masked-scaffold_0-processed-gene-0.2442-mRNA-1 scaffold_0_1024976|T/C scaffold_0 1024976 T/C 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
55 augustus_masked-scaffold_0-processed-gene-0.2442-mRNA-1 scaffold_0_1025195|T/C scaffold_0 1025195 T/C 0 0 0 1 0 0 0 1 1 1 1 1 0 0 1 0 0 0 1 1 0 0
56 maker-scaffold_0-augustus-gene-0.2665-mRNA-1 scaffold_0_10262|TCCGGAG/T scaffold_0 10262 TCCGGAG/T 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 1 1 0 0
57 maker-scaffold_0-augustus-gene-0.2665-mRNA-1 scaffold_0_10262|T/G scaffold_0 10262 T/G 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
58 maker-scaffold_0-augustus-gene-0.2665-mRNA-1 scaffold_0_10289|GGAGCTA/G scaffold_0 10289 GGAGCTA/G 0 0 0 1 0 0 0 1 1 1 1 1 1 0 1 1 0 0 0 0 0 0
59 maker-scaffold_0-snap-gene-0.925-mRNA-1 scaffold_0_1037504|G/C scaffold_0 1037504 G/C 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
60 maker-scaffold_0-snap-gene-0.925-mRNA-1 scaffold_0_1037510|T/C scaffold_0 1037510 T/C 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

The variant table now has gene names attached.

Next we will remove the NAs (variants in non-coding regions) and write the table

colnames(var.annotations.aa)[1] <- "gene"
var.genes.aa <- var.annotations.aa[!(var.annotations.aa$gene == "NA"),]
write.csv(var.genes.aa, "aa.loci.to.genes.csv")
kable(head(var.genes.aa),format="markdown")
gene locus scaffold location var GM11 GM17 GM20 GM25 GM39 GM59 GM64 GM108 GM118 GM140 GM158 GM170 GM177 GM178 GM179 GM188 GM196 GM205 GM216 GM219 GM236 GM307
maker-scaffold_0-augustus-gene-0.2912-mRNA-1 scaffold_0_1000237|A/G scaffold_0 1000237 A/G 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
maker-scaffold_0-augustus-gene-0.2912-mRNA-1 scaffold_0_1000453|C/T scaffold_0 1000453 C/T 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
maker-scaffold_0-augustus-gene-0.2912-mRNA-1 scaffold_0_1000478|C/T scaffold_0 1000478 C/T 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
maker-scaffold_0-augustus-gene-0.2912-mRNA-1 scaffold_0_1000576|G/C scaffold_0 1000576 G/C 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
maker-scaffold_0-augustus-gene-0.2912-mRNA-1 scaffold_0_1000669|T/A scaffold_0 1000669 T/A 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
maker-scaffold_0-augustus-gene-0.2912-mRNA-1 scaffold_0_1000736|T/C scaffold_0 1000736 T/C 0 0 0 1 1 0 1 1 1 1 1 1 0 1 1 0 1 0 1 1 0 0