Pseudomonas sp. 273 was sequenced with PacBio, and a complete, single-contig genome assembly was provided by an external company.
You will receive:
The general strategy is to:
library(Biostrings)
library(reshape2)
# Read the Prokka protein FASTA and flatten it into a plain data frame with
# one row per predicted protein.
P273.prokka.biostring <- readAAStringSet("PROKKA_12032018/PROKKA_12032018.faa")

# FASTA header, amino-acid string and sequence width for every record.
# NOTE(review): `length` and `sequence` shadow base R functions of the same
# name; the names are kept because they become the data frame's column names
# and later chunks may still reference these globals.
name <- as.character(names(P273.prokka.biostring))
sequence <- paste(P273.prokka.biostring)
length <- width(P273.prokka.biostring)
P273.prokka <- data.frame(name, length, sequence, stringsAsFactors = FALSE)

# Split each header at the first space into two fields: the text before it
# becomes `locus`, everything after it becomes `prokka_function`.
name.split <- colsplit(P273.prokka$name, " ", c("locus", "prokka_function"))

# Swap the raw header column (column 1) for the two split columns.
P273.prokka <- cbind(name.split, P273.prokka[-1])

# Emit a markdown heading followed by a preview of the first rows.
# NOTE(review): kable() is not attached by the library() calls visible here;
# presumably knitr is loaded elsewhere in the document - confirm.
cat("### Subset of the annotations")
kable(head(P273.prokka))
| locus | prokka_function | length | sequence |
|---|---|---|---|
| P273_00001 | DNase CdiA | 1899 | MNNNGTLGALGALDLTSQGIANGADSLLFSGGDMTLRAASLSNRYGDLYSQGNLSFAGQDGGRAQSLSNRSGTIEAQGDIHLDVANLENTRDVFAFEQTTTFGEFDVQCGQHCGGHDSFKRGKIDVSETIEERITQNSPAAWLTAGGNLSIDADAVENRNSTIAANGDLTINANSLLNQGNTSRTGNKVVVINVVPGDYGKIPTGQWDAMENLARAFNNKMAAGTFDQDLYDQLWAIYNGDRWAIGTPVVSWSEDGAQSAPATLQAGNRVTLNVAHNLQNGTVSEYSQAQLTGQLAGSLLGGQLGTVNLTLNKQSSDAQARGPQTVQSVTHTAADGSQQVSFIPVDYTGVPFAAVDPTAADTFRLPQGQYGMFIRSPDPQSHYLIETNPALTDLGRFLNSDYLLGKLGFDPDQAWKRLGDGAYETRLIREAIQAQTGQRFLDGLTSDYDQFQYLMDNALAAKDALQLSVGVGLSAEQVAALTHDIVWMETRVVDGQQVLVPVVYLAQTDARNLRGGSLIQGRDLNLMAGGDLTNVGTLRASEDLTATAGGSILQGGLVDAGQRVSLLAGDSIRNALAGQIRGDQVDLTALKGDIVNDRTAVTAGIGGDEYRSFLDAGASISARSELSLDAGRDITNRGSLASGGDSYLGAGRDINLQAVTDASRLRDIQQGGHHVTTTTVAQNHGSSLTAGGDLVLDAGRDLNVVGSQASAKGDLTAAAGRDINLRAVEDAASVEVRSKTSSTRTVEQTGQTRQLGAQLTAGGDLVASAGQDLNLTASTISASNEAYLYATRDVNLQAAAETDSHALSKTKRSHGLLSSSEKKTEDTSLYTTQQGSLVSADKVAIRAGQDIGVSGSDVASTNGTSLLAGRNVLIDGATETSETSHAESKKKSGVMSSGGLGFTLGSASTQATQTNHNEQTRGSTIGSVLGNVDIQAGKDLTIRGSDVVAGKDINLIGQNVDILAAQNENRSEQTYKSKTSGLTLALSGSVGSAMDSGYQTAKQAKHEDDSRLSALQGIKAGLTGVQAWQAAQQGTEGGGVSQFFGISASLGSQKSSSKQTQEQSVSQGSSLTAGNNLNILATGAGKVGQDGDIRIQGSQLKAGNDVLLAANRDITLEAAANTQKLDGKNKSSGGAVGVSVGYSADNGVGLSIFANANQGSGKEVGTGTTWTETTLDAGNQVKLVSGRDTTLKGAQVNGEQIIANVGRDLTLQSLQDSDYYDSKQKNVSAGASVAIIGTGGSASVSASQSKIDSNYKSVQEQTGLYAGKGGFQIDVGNHTQLDGSVIASTAEAEKNRLSTGTLGWSSIDNKADYKSQQQSVSLSSGSDGSGKFISNMPSGMLVAYNHGDSASGTTGSAISSGTLEIRDPASQQQDVASLSRDVEHANGSISPIFDKEKEQNRLKQVQLIAEIGTQAMDIVRTQGEIEAAEEGRKELKTQGKDNPTQKELEATVAYQNVMREYGTGSDYQRAAQAVTAALQYLAGGDIGGAIAGASAPYIAHLIKQQTGDNDTARIMAQALLGAVVAGVQGNSSVAGGIGAATGELIAANLYPGKKPEDLTENERQIVSALSSLAAGMAGGLASGDTAGAVAAAGAGKTAVDNNFLSGDQAKAFDHEMQQCTKEGDCTKVIKKYVALNDENRELLKATCSEKPWVCYGNSRDFVLTGLNSADPSRPVSSGGIENDNVRLFVQYENSLDLQYINKNTDTLYKALVFASEPENFMLMFGGLANLTNASGTSIATGAGLSMAANGGVQLATGSTGDKFDWIGFMTSGVTGGMSAGQTLTPTLQTNIGGAYISSQLNGQSSLDAMMGAMIGASLGYGAGATITSQMEKNYVNKIFGLSRNSVNALKYSEAANFPGSYLLKETPMSPIPGILGGATGSVVSESSNNAVLNGANNGK |
| P273_00002 | RNA 2’-phosphotransferase | 182 | MDTKLLNETSKFLSYVLRHEPQAIGLQLDSEGWANINALIAGAAKKGKNLDSEIIQKVVASSDKKRFSISSDGQRIRAVQGHSTPTVTLQHTEKEPPELLYHGTASRFLDSIKTQGLIPGARHYVHLSQDEQTAVEVGKRYGKPVILKIEALRMHRQGFKFFQAENGVWLADKIPANFILTE |
| P273_00003 | hypothetical protein | 70 | MMRPDAKVKAVYLYPKPVDFRKSIDGLAALVELDIKVAVFDPVLFVFLERGLLYFSYSKADHSGRIRVLK |
| P273_00004 | hypothetical protein | 93 | MATHNVVLPQPMEKSIDDLVSEGRYQNFSEVVRAGLRLLLEREAEESAKLVALRNATSSGIMQLETGRFVEIASEAQLEKYLGELGQLASSRQ |
| P273_00005 | hypothetical protein | 121 | MSDPQFRLSLDAQTDLIDILRFTQVKFGEDVRRRYQGLLRAAFVSLSAESERAGSIAREQLETGLRSLHLLYCRSEAPNGRVDRPRHVVFYRLGHDQVIEIVRILHDAMEVERHLQKVPAG |
| P273_00006 | Putative lipoprotein/NMB1164 | 225 | MKMSQGLLLGVASAALLMVGGCATESSRALPVQQVESVGKPYSGVRSPIAVGKFDNRSSYMRGIFSDGVDRLGGQAKTILITHLQQTNRFNVLDRDNMSEIQQEAAIKGQAQRLKGADYVVTGDVTEFGRKEVGDRQLFGILGRGKTQVAYAKVALNIVNISTSEVVYSTQGAGEYELSNREIIGFGGTASYDSTLNGKVLDLAMREAVNKLVNAVDSGSWNPAR |
The protein sequences were annotated against the EggNOG database, which provides links to several other databases. The EggNOG annotations are imported and joined to the annotation table:
# Import the tab-separated emapper annotation table. It is read without a
# header row, so the column names are assigned by hand below.
eggnog.full <- read.delim(
  "PROKKA_12032018.faa.emapper.annotations",
  header = FALSE,
  stringsAsFactors = FALSE
)
# NOTE(review): "Seed.Ortholo" looks like a truncated "Seed.Ortholog"; it is
# left as is because code outside this chunk may refer to it by name.
colnames(eggnog.full) <- c(
  "locus", "Seed.Ortholo", "evalue", "score", "Predicted.name",
  "GO.terms", "KEGG.KO", "BiGG.reactions", "tax.scope", "eggNOG.OGs",
  "best.OG", "COG.Cat.", "eggNOG.HMM.Desc."
)

# Keep the join key plus the four annotation columns carried forward
# (positions 1, 6, 7, 10 and 13 of the full table).
eggnog <- eggnog.full[
  c("locus", "GO.terms", "KEGG.KO", "eggNOG.OGs", "eggNOG.HMM.Desc.")
]

library(dplyr)
# Left join: every Prokka protein is kept; proteins without a matching locus
# in the eggNOG table get NA in the added columns.
P273.prokka.eggnog <- dplyr::left_join(P273.prokka, eggnog, by = "locus")
The last addition is to translate the KO terms into protein function descriptions:
library(dplyr)
# Load the KO -> name lookup table and rename its first column so that it
# matches the join key used in the annotation table.
KO.to.name <- read.csv(
  "../KO.to.name.map.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)
colnames(KO.to.name)[1] <- "KEGG.KO"

# Left join on the exact KEGG.KO value: every protein row is kept.
# NOTE(review): a KEGG.KO cell holding several KO ids would not match any
# single-id key in the lookup table - confirm the format of that column.
P273.prokka.eggnog.KO <- dplyr::left_join(
  P273.prokka.eggnog,
  KO.to.name,
  by = "KEGG.KO"
)

# Reorder the columns by position. Assuming the table was built by the two
# joins in this file, positions 1-4 are locus, prokka_function, length and
# sequence, 5-8 are GO.terms, KEGG.KO, eggNOG.OGs and eggNOG.HMM.Desc., and
# 9 is the first column contributed by KO.to.name. The result is ordered as
# locus, length, prokka_function, KEGG.KO, <KO.to.name column>,
# eggNOG.HMM.Desc., eggNOG.OGs, GO.terms, sequence.
P273.prokka.eggnog.KO <- P273.prokka.eggnog.KO[c(1, 3, 2, 6, 9, 8, 7, 5, 4)]

# write.csv() keeps its default row names, so the file starts with an
# unnamed index column.
write.csv(P273.prokka.eggnog.KO, "P273.full.annotations.csv")
# NOTE(review): the message mentions an xlsx file, but only the csv is
# written here - presumably the xlsx is produced elsewhere; confirm.
cat("# Full annotation is provided in csv and xlsx formats under the name P273.full.annotations")
This table now contains clean, sorted information from many different databases, along with the actual protein sequence.
Column names and descriptions: