final.assemblies.genes.proteins
The goal of this script is to harvest
Assemblies
Predicted genes
Predicted proteins
Annotation tables
From the current project and rename appropriately
1 Assemblies
This code chunk digs into the assembly folders, grabs the genome fastas, and renames appropriately
library(Biostrings)
genomes <- list.files("../r2_complete/")
for (x in (1:length(genomes))) {
g <- readDNAStringSet(paste0("../r2_complete/",genomes[x],"/genome.fa"))
name <- names(g)
seq <- paste(g)
genome <- data.frame(name, seq)
writeFasta(genome,paste0("genomes/",genomes[x],".fa"))
}3 Genes
This chunk grabs, assigns locus tabs, renames, and saves the gene fastas (transcripts)
The locus mapping has to be preserved in case there is ever a need for the gff files
library(stringr)
for (x in (1:length(genomes))) {
g <- readDNAStringSet(paste0("../r2_complete/",genomes[x],"/genome.all.maker.non_overlapping_ab_initio.transcripts.fasta"))
oldname <- names(g)
oldname <- unlist(lapply(oldname, spcSplit))
seq <- paste(g)
locus <- loci[x]
number <- c(1:length(oldname))
number <- str_pad(number, 4, pad = "0")
genes <- data.frame(oldname, seq, locus, number)
genes$name <- paste(genes$locus, genes$number, sep = "_")
writeFasta(genes, paste0("genes/",genomes[x],".fna"))
locus.map <- genes[c(1,5)]
write.csv(locus.map, paste0("gene.to.locus.maps/",genomes[x],".locus.map.csv"))
}4 Proteins
This is essentially the same as previous chunk
for (x in (1:length(genomes))) {
g <- readDNAStringSet(paste0("../r2_complete/",genomes[x],"/genome.all.maker.non_overlapping_ab_initio.proteins.fasta"))
oldname <- names(g)
oldname <- unlist(lapply(oldname, spcSplit))
seq <- paste(g)
locus <- loci[x]
number <- c(1:length(oldname))
number <- str_pad(number, 4, pad = "0")
genes <- data.frame(oldname, seq, locus, number)
genes$name <- paste(genes$locus, genes$number, sep = "_")
writeFasta(genes, paste0("proteins/",genomes[x],".faa"))
}5 Annotations
In a previous script, I concatenated all proteins into one or two files to expedite the process. The tables are read in, separated appropriately, then appropriate annotations are added to the gene.locus lookup table. Finally, all annotations are concatenated into a single table for each genome
5.1 Creating master tables
This is only preliminary work which facilitates the downstream process
All but pfam are distilled down to only the top hit for each protein.
Note that CAZy annotations come via the EggNOG system
library(stringr)
scSplit.name <- function(x){
strsplit(x,"\\:")[[1]][1]
}
scSplit.genome <- function(x){
strsplit(x,":")[[1]][2]
}
#KEGG
K.all <- read.csv("../annotations/KEGG/user_ko.txt", sep = "\t",header = F, stringsAsFactors = F)
comboname <- unlist(K.all$V1)
K.all$oldname <- unlist(lapply(comboname, scSplit.name))
K.all$genome <- unlist(lapply(comboname, scSplit.genome))
write.csv(K.all, "master.annotation.tables/KEGG.master.csv")
#COG
COG.all <- read.csv("../annotations/COG/cog/cog-raw.txt", sep = "\t", header = F, stringsAsFactors = F)
COG.all <- COG.all[!duplicated(COG.all[,1]),]
comboname <- unlist(COG.all$V1)
COG.all$oldname <- unlist(lapply(comboname, scSplit.name))
COG.all$genome <- unlist(lapply(comboname, scSplit.genome))
write.csv(COG.all, "master.annotation.tables/COG.master.csv")
#pfam
P.all <- read.csv("../annotations/pfam/pfam/pfam-raw.txt", sep = "\t", header = F, stringsAsFactors = F)
comboname <- unlist(P.all$V1)
P.all$oldname <- unlist(lapply(comboname, scSplit.name))
P.all$genome <- unlist(lapply(comboname, scSplit.genome))
write.csv(P.all, "master.annotation.tables/pfam.master.csv")
#CAZy (via EGGNog)
E1 <- read.csv("../annotations/EGGNOG/EGGNOG1.tsv", skip = 4, sep = "\t", header=F, stringsAsFactors = F)
E2 <- read.csv("../annotations/EGGNOG/EGGNOG2.tsv", skip = 4, sep = "\t", header=F, stringsAsFactors = F)
E.all <- rbind(E1,E2)
comboname <- unlist(E.all$V1)
E.all$oldname <- unlist(lapply(comboname, scSplit.name))
E.all$genome <- unlist(lapply(comboname, scSplit.genome))
write.csv(E.all, "master.annotation.tables/eggnog.master.csv")
write.csv(E.all[c(24,23,16)], "master.annotation.tables/cazy.master.csv")6 Creating genome specific annotation tables
Now for each genome we need a simple lookup table that maps locus tag to KO, KEGG description, COG, COG description, GO terms, and CAZy code
6.1 Streamlining the KEGG description table
I use this for general work, saved for later
library(dplyr)
#format the KO description table
K.desc <- read.csv("../../../KEGG/KO_Orthology_ko00001.txt", sep = "\t", header = F, stringsAsFactors = F)
colnames(K.desc) <- c("K.Cat.1", "K.Cat.2", "K.Path", "combo")
combo <- unlist(K.desc$combo)
dscSplit.KO <- function(x){
strsplit(x,"\\ \\ ")[[1]][1]
}
dscSplit.d <- function(x){
strsplit(x," ")[[1]][2]
}
K.desc$KO <- unlist(lapply(combo, dscSplit.KO))
K.desc$K.desc <- unlist(lapply(combo, dscSplit.d))
K.desc <- K.desc[-4]
write.csv(K.desc, "../../../KEGG/KO_descriptions.2020.csv")6.2 creating genome tables
K.desc <- read.csv("../../../KEGG/KO_descriptions.2020.csv",stringsAsFactors = F)
K.desc <- K.desc[c(5,6)]
K.desc <- unique(K.desc)
for (x in c(1:length(genomes))) {
l <- read.csv(paste0("gene.to.locus.maps/",genomes[x],".locus.map.csv"))
l <- l[-1]
K <- K.all[K.all$genome==genomes[x],]
K <- K[c(3,2)]
colnames(K) <- c("oldname","KO")
l <- left_join(l, K, by="oldname")
l <- left_join(l, K.desc, by = "KO")
C <- COG.all[COG.all$genome==genomes[x],]
C <- C[c(16,13,15)]
colnames(C) <- c("oldname","COG","COG.desc")
l <- left_join(l, C, by="oldname")
E <- E.all[E.all$genome==genomes[x],]
E <- E[c(7,22,23)]
colnames(E) <- c("GO.terms","EggN.desc","oldname")
l <- left_join(l, E, by="oldname")
C <- E.all[E.all$genome==genomes[x],]
C <- C[c(16,23)]
colnames(C) <- c("CAZy","oldname")
l <- left_join(l, C, by="oldname")
l[is.na(l)] <- ""
write.csv(l, paste0("annotations/",genomes[x],".annotations.csv"))
}7 Conclusions
The final protein to annotation tables can be found in two locations
genome specific one-to-one mappings here: https://www.dropbox.com/sh/j4b5x3dbdpgozyw/AADmFmmidKzYAn-R2B-qAWgta?dl=0 (includes KEGG, COG, CAZy, and EggNOG)
full annotation master tables here: https://www.dropbox.com/sh/l70z298fie64ire/AABcpgkVNHHoWNOdk-Q5vHDqa?dl=0 (also includes pfam and TIGRFAM)
DNA/AA resources