final.assemblies.genes.proteins

The goal of this script is to harvest

  1. Assemblies

  2. Predicted genes

  3. Predicted proteins

  4. Annotation tables

From the current project and rename appropriately

1 Assemblies

This code chunk digs into the assembly folders, grabs the genome fastas, and renames appropriately

library(Biostrings)

genomes <- list.files("../r2_complete/")

for (x in (1:length(genomes))) {
  g <- readDNAStringSet(paste0("../r2_complete/",genomes[x],"/genome.fa"))
  name <- names(g)
  seq <- paste(g)
  genome <- data.frame(name, seq)
  writeFasta(genome,paste0("genomes/",genomes[x],".fa"))
}

2 Locus tags

We have to have a simple unique locus tag system for each of the genomes. These will be used to assign simple names to the genes and proteins.

loci <- c(
  "GLAV",
  "GMORB3",
  "GOBSC",
  "GOMNI1",
  "GOMNI2A",
  "GPAL153",
  "GPAL18A",
  "GPAL19A",
  "GPAL20A",
  "GPAL4",
  "GPAL6",
  "GSP21",
  "GSP23",
  "GSP10",
  "GSP2",
  "GSP41"
)

locus.map <- data.frame(genomes,loci)

locus.map
##          genomes    loci
## 1    G.lavendula    GLAV
## 2      G.morbida  GMORB3
## 3      G.obscura   GOBSC
## 4   G.omnicola.1  GOMNI1
## 5  G.omnicola.2a GOMNI2A
## 6  G.pallida.153 GPAL153
## 7  G.pallida.18a GPAL18A
## 8  G.pallida.19a GPAL19A
## 9  G.pallida.20a GPAL20A
## 10   G.pallida.4   GPAL4
## 11   G.pallida.6   GPAL6
## 12       G.sp.21   GSP21
## 13       G.sp.23   GSP23
## 14        G.sp10   GSP10
## 15         G.sp2    GSP2
## 16        G.sp41   GSP41

3 Genes

This chunk grabs, assigns locus tabs, renames, and saves the gene fastas (transcripts)

The locus mapping has to be preserved in case there is ever a need for the gff files

library(stringr)

for (x in (1:length(genomes))) {
  g <- readDNAStringSet(paste0("../r2_complete/",genomes[x],"/genome.all.maker.non_overlapping_ab_initio.transcripts.fasta"))
  oldname <- names(g)
  oldname <- unlist(lapply(oldname, spcSplit))
  seq <- paste(g)
  locus <- loci[x]
  number <- c(1:length(oldname))
  number <- str_pad(number, 4, pad = "0")
  genes <- data.frame(oldname, seq, locus, number)
  genes$name <- paste(genes$locus, genes$number, sep = "_")
  writeFasta(genes, paste0("genes/",genomes[x],".fna"))
  locus.map <- genes[c(1,5)]
  write.csv(locus.map, paste0("gene.to.locus.maps/",genomes[x],".locus.map.csv"))
}

4 Proteins

This is essentially the same as previous chunk

for (x in (1:length(genomes))) {
  g <- readDNAStringSet(paste0("../r2_complete/",genomes[x],"/genome.all.maker.non_overlapping_ab_initio.proteins.fasta"))
  oldname <- names(g)
  oldname <- unlist(lapply(oldname, spcSplit))
  seq <- paste(g)
  locus <- loci[x]
  number <- c(1:length(oldname))
  number <- str_pad(number, 4, pad = "0")
  genes <- data.frame(oldname, seq, locus, number)
  genes$name <- paste(genes$locus, genes$number, sep = "_")
  writeFasta(genes, paste0("proteins/",genomes[x],".faa"))
}

5 Annotations

In a previous script, I concatenated all proteins into one or two files to expedite the process. The tables are read in, separated appropriately, then appropriate annotations are added to the gene.locus lookup table. Finally, all annotations are concatenated into a single table for each genome

5.1 Creating master tables

This is only preliminary work which facilitates the downstream process

All but pfam are distilled down to only the top hit for each protein.

Note that CAZy annotations come via the EggNOG system

library(stringr)

scSplit.name <- function(x){
  strsplit(x,"\\:")[[1]][1]
}

scSplit.genome <- function(x){
  strsplit(x,":")[[1]][2]
}

#KEGG
K.all <- read.csv("../annotations/KEGG/user_ko.txt", sep = "\t",header = F, stringsAsFactors = F)
comboname <- unlist(K.all$V1)
K.all$oldname <- unlist(lapply(comboname,  scSplit.name))
K.all$genome <- unlist(lapply(comboname,  scSplit.genome))
write.csv(K.all, "master.annotation.tables/KEGG.master.csv")

#COG
COG.all <- read.csv("../annotations/COG/cog/cog-raw.txt", sep = "\t", header = F, stringsAsFactors = F)
COG.all <- COG.all[!duplicated(COG.all[,1]),]
comboname <- unlist(COG.all$V1)
COG.all$oldname <- unlist(lapply(comboname,  scSplit.name))
COG.all$genome <- unlist(lapply(comboname,  scSplit.genome))
write.csv(COG.all, "master.annotation.tables/COG.master.csv")

#pfam
P.all <- read.csv("../annotations/pfam/pfam/pfam-raw.txt", sep = "\t", header = F, stringsAsFactors = F)
comboname <- unlist(P.all$V1)
P.all$oldname <- unlist(lapply(comboname,  scSplit.name))
P.all$genome <- unlist(lapply(comboname,  scSplit.genome))
write.csv(P.all, "master.annotation.tables/pfam.master.csv")

#CAZy (via EGGNog)
E1 <- read.csv("../annotations/EGGNOG/EGGNOG1.tsv", skip = 4, sep = "\t", header=F, stringsAsFactors = F)
E2 <- read.csv("../annotations/EGGNOG/EGGNOG2.tsv", skip = 4, sep = "\t", header=F, stringsAsFactors = F)
E.all <- rbind(E1,E2)
comboname <- unlist(E.all$V1)
E.all$oldname <- unlist(lapply(comboname,  scSplit.name))
E.all$genome <- unlist(lapply(comboname,  scSplit.genome))
write.csv(E.all, "master.annotation.tables/eggnog.master.csv")
write.csv(E.all[c(24,23,16)], "master.annotation.tables/cazy.master.csv")

6 Creating genome specific annotation tables

Now for each genome we need a simple lookup table that maps locus tag to KO, KEGG description, COG, COG description, GO terms, and CAZy code

6.1 Streamlining the KEGG description table

I use this for general work, saved for later

library(dplyr)

#format the KO description table
K.desc <- read.csv("../../../KEGG/KO_Orthology_ko00001.txt", sep = "\t", header = F, stringsAsFactors = F)
colnames(K.desc) <- c("K.Cat.1", "K.Cat.2", "K.Path", "combo")
combo <- unlist(K.desc$combo)

dscSplit.KO <- function(x){
  strsplit(x,"\\ \\ ")[[1]][1]
}

dscSplit.d <- function(x){
  strsplit(x,"  ")[[1]][2]
}

K.desc$KO <- unlist(lapply(combo, dscSplit.KO))
K.desc$K.desc <- unlist(lapply(combo, dscSplit.d))

K.desc <- K.desc[-4]

write.csv(K.desc, "../../../KEGG/KO_descriptions.2020.csv")

6.2 creating genome tables

K.desc <- read.csv("../../../KEGG/KO_descriptions.2020.csv",stringsAsFactors = F)
K.desc <- K.desc[c(5,6)]
K.desc <- unique(K.desc)

for (x in c(1:length(genomes))) {
  l <- read.csv(paste0("gene.to.locus.maps/",genomes[x],".locus.map.csv"))
  l <- l[-1]
  K <- K.all[K.all$genome==genomes[x],]
  K <- K[c(3,2)]
  colnames(K) <- c("oldname","KO")
  l <- left_join(l, K, by="oldname")
  l <- left_join(l, K.desc, by = "KO")
  C <- COG.all[COG.all$genome==genomes[x],]
  C <- C[c(16,13,15)]
  colnames(C) <- c("oldname","COG","COG.desc")
  l <- left_join(l, C, by="oldname")
  E <- E.all[E.all$genome==genomes[x],]
  E <- E[c(7,22,23)]
  colnames(E) <- c("GO.terms","EggN.desc","oldname")
  l <- left_join(l, E, by="oldname")
  C <- E.all[E.all$genome==genomes[x],]
  C <- C[c(16,23)]
  colnames(C) <- c("CAZy","oldname")
  l <- left_join(l, C, by="oldname")
  l[is.na(l)] <- ""
  write.csv(l, paste0("annotations/",genomes[x],".annotations.csv"))
  }

7 Conclusions

The final protein to annotation tables can be found in two locations

DNA/AA resources