Geosmithia Functional Categories
The goal of this script is to present KEGG and COG functional category analyses for the entire Geosmithia genome vs the positively selcted (PS) genes
Scroll to the bottom for the final two summary tables
- note: custom KEGG heirarchies mapping file can be downloaded here: https://www.dropbox.com/s/6egy0jqocdxsza1/KEGG_hierarchy.csv?dl=0
1 COG
COG categories the OGs into larger categories, A-Z. We get COG annotations via the WebMGA server and then feed in the COG names to a mapping pipeline using files described here: http://rpubs.com/rmurdoch/COG_ID_to_function_map
1.1 Import COG ID summaries
COG.all <- read.csv("../geosmithia_annotations/cog/cog.txt", sep="\t", header = T)
COG.all <- COG.all[c(1,3)]
colnames(COG.all)[1] <- "COG"
datatable(COG.all)COG.ps <- read.csv("../geosmithia_ps_annotations/cog/cog.txt", sep = "\t", header = T)
COG.ps <- COG.ps[c(1,3)]
colnames(COG.ps)[1] <- "COG"
datatable(COG.ps)1.2 Import the COG function mapping and description
library(dplyr)
COG.ID.func <- read.csv("../../Annotations/COG/COG_function_map.csv", header=T)
COG.func.desc <- read.csv("../../Annotations/COG/fun2003-2014.tab", sep="\t", header = T)
colnames(COG.func.desc)[1] <- "func"
COG.ID.func.map <- dplyr::left_join(COG.ID.func, COG.func.desc, by="func")
colnames(COG.ID.func.map) <- c("COG","func","COG.desc","func.desc")
datatable(COG.ID.func.map)1.3 Add categories to the genome and PS COG lists
COG.all.func <- dplyr::left_join(COG.all, COG.ID.func.map, by = "COG")
COG.ps.func <- dplyr::left_join(COG.ps, COG.ID.func.map, by = "COG")
datatable(COG.all.func)datatable(COG.ps.func)This now has all of the information I need; however, it is a bit hard to deal with 3 Q, 4 R, 1 T, 2 T, as separate columns. DPLYR might be able to handle it in a summarize.
dry dplyr::count
library(dplyr)
func.counts.all <- dplyr::count(COG.all.func, func, wt=No_orfs)
func.counts.all <- func.counts.all[-c(1,2),]
func.counts.all <- dplyr::left_join(func.counts.all, COG.func.desc, by="func")
datatable(func.counts.all)func.counts.ps <- dplyr::count(COG.ps.func, func, wt=No_orfs)
func.counts.ps <- func.counts.ps[-c(1,2),]
func.counts.ps <- dplyr::left_join(func.counts.ps, COG.func.desc, by="func")
datatable(func.counts.ps)func.counts.both <- dplyr::left_join(func.counts.all, func.counts.ps, by="func")
func.counts.both <- func.counts.both[c(1,2,4,3)]
colnames(func.counts.both) <- c("COG.cat","genome.count","PS.count","description")
datatable(func.counts.both)Ultimately, very few of the PS genes are annotated by COG.
2 KEGG
mapping KO to categories is performed using a custom mapping file (downloaded from KEGG websites… somewhere, don’t remember where it is hidden)
KEGG.map <- read.csv("../../../KEGG/KEGG_hierarchy.csv",header = T)
KEGG.map <- KEGG.map[c(4,1,2)]
KEGG.map <- unique(KEGG.map)
datatable(KEGG.map[c(1:100),])There are two levels above KO, Tier1 and Tier2
2.1 Tier1
KEGG.map.1 <- KEGG.map[c(1,2)]
KEGG.map.1 <- unique(KEGG.map.1)
KEGG.genome <- read.csv("../geosmithia_annotations/KEGG/geosmithia_KEGG", sep="\t",header=F)
colnames(KEGG.genome) <- c("gene","KO")
KEGG.ps <- read.csv("../geosmithia_ps_annotations/KEGG/user_ko.csv", header = F)
colnames(KEGG.ps) <- c("gene","KO")
KEGG.g.T1 <- dplyr::left_join(KEGG.genome, KEGG.map.1, by="KO")
datatable(KEGG.g.T1)KEGG.p.T1 <- dplyr::left_join(KEGG.ps, KEGG.map.1, by="KO")
datatable(KEGG.p.T1)2.2 Tier2
KEGG.genome <- read.csv("../geosmithia_annotations/KEGG/geosmithia_KEGG", sep="\t",header=F)
colnames(KEGG.genome) <- c("gene","KO")
KEGG.ps <- read.csv("../geosmithia_ps_annotations/KEGG/user_ko.csv", header = F)
colnames(KEGG.ps) <- c("gene","KO")
KEGG.g.T2 <- dplyr::left_join(KEGG.genome, KEGG.map, by="KO")
datatable(KEGG.g.T2)KEGG.p.T2 <- dplyr::left_join(KEGG.ps, KEGG.map, by="KO")
datatable(KEGG.p.T2)2.3 Tier2 summaries
library(dplyr)
KEGG.counts.g <- dplyr::count(KEGG.g.T2, Tier2)
datatable(KEGG.counts.g)KEGG.counts.p <- dplyr::count(KEGG.p.T2, Tier2)
datatable(KEGG.counts.p)KEGG.counts.all <- dplyr::left_join(KEGG.counts.g, KEGG.counts.p, by="Tier2")
datatable(KEGG.counts.all)3 KEGG and COG final comparisons
datatable(func.counts.both)datatable(KEGG.counts.all)4 Conclusions
For both KEGG and COG, only 20-30% of the genes were actually annotated.
Both of these approaches are quite vague, especially given how small the PS annotation sets are. Again, given the poor statistical power, I think it is dangerous to make any stark conclusions from this.
A more useful outcome would have been if the PS genes were strongly enriched in particular categories, but that simply was not the case (if 50% of the 80 PS genes were involved in carbohydrate metabolism, that would be pretty damned powerful information)