This script performs GO enrichment analysis on human genes using package GOstats, and report the result using package ReportingTools.
The input file consists of two columns: gene and its classification. The script will group the genes based on their classification, and then perform GO enrichment analysis for each group. The reference genes (or callled gene universe in GOstats) are all the genes. The gene column is Entrez gene ID.
The outdir contains the result of each group with the links to GO terms and its member genes, which is useful for exploration.
Required options:
# Install packages, if not
#source("http://www.bioconductor.org/biocLite.R")
#biocLite(c("ReportingTools", "Gostats", "GSEABase", "org.Hs.eg.db"))
# parameters
library('getopt');
spec = matrix(c(
'verbose' , 'v', 2, 'integer',
'help' , 'h', 0, 'logical',
'infile' , 'i', 1, 'character',
'gene_set_file', 'g', 2, 'character',
'outdir' , 'd', 1, 'character',
'outfile' , 'o', 2, 'character',
'go_category', 'c', 1, 'character',
'pvalue' , 'p', 2, 'double',
'sample' , 's', 2, 'integer'
), byrow=TRUE, ncol=4);
opt=getopt(spec);
if ( !is.null(opt$help) || is.null(opt$infile) || is.null(opt$outdir) || is.null(opt$go_category) ) {
cat(getopt(spec, usage=TRUE));
q(status=1);
}
infile <- opt$infile
outdir <- opt$outdir
ontology_cat <- opt$go_category
pvalue_cutoff <- ifelse(is.null(opt$pvalue), 0.05, opt$pvalue)
sample_size_cutoff <- ifelse(is.null(opt$sample), 10, opt$sample)
outfile_agg <- ifelse(is.null(opt$outfile), paste(infile, ontology_cat, sep="."), opt$outfile)
gene_profile_file <- ifelse(is.null(opt$gene_set_file), paste(infile, 'rda', sep="."), opt$gene_set_file)
if (file.exists(outfile_agg)) {
print(paste("#LOG: remove outfile", outfile_agg))
file.remove(outfile_agg)
}
# load packages
library("org.Hs.eg.db")
library("GSEABase")
library("GOstats")
library("ReportingTools")
# make gene set
if (!file.exists(gene_profile_file)) {
gene_to_profile <- read.table(infile, colClasses=c("character", "character"))
gene_set <- split(gene_to_profile, gene_to_profile$V2)
print("Gene set by profile finish")
frame = toTable(org.Hs.egGO)
goframeData = data.frame(frame$go_id, frame$Evidence, frame$gene_id)
goFrame = GOFrame(goframeData, organism="Homo sapiens")
goAllFrame=GOAllFrame(goFrame)
print("GO frame finish")
gsc <- GeneSetCollection(goAllFrame, setType=GOCollection())
print("GSEABase finish")
universe = Lkeys(org.Hs.egGO)
print("Universer load")
save(gene_to_profile, gene_set, frame, goframeData, goFrame, goAllFrame, gsc, universe, file = gene_profile_file)
} else {
load(gene_profile_file)
print("load gene profile file")
}
# GO enrichment analysis and report
go_enrich <- function(x) {
genes <- x$V1
ensembl_gene_number <- length(genes)
genes <- unique(genes[!is.na(genes)])
entrez_gene_number <- length(genes)
x.profile <- gsub(',', '', as.character(unique(x$V2)))
print(paste("#LOG: Process", x.profile))
outfile <- paste(outdir, '/', paste(x.profile, ontology_cat, sep="_"), '.html', sep="")
output <- NA
if (file.exists(outfile)) {
print('#LOG: output file has existed')
}
if (length(genes) >= sample_size_cutoff) {
print(paste("#LOG: GOstats Params", x.profile))
params <- GSEAGOHyperGParams(name="custom GSEA based annot Params", geneSetCollection=gsc, geneIds=genes, universeGeneIds = universe, ontology = ontology_cat, pvalueCutoff=pvalue_cutoff, conditional=FALSE, testDirection = "over")
print(paste("#LOG: GOstats HyperGTest", x.profile))
Over <- try(hyperGTest(params), silent=TRUE)
# head(summary(Over))
if (class(Over) == 'try-error') {
Over.msg <- geterrmessage()
print(paste('#ERR: hyperGTest fails', Over.msg))
return()
} else {
if (length(which(expectedCounts(Over) < pvalue_cutoff)) > 0) {
mapped_gene_number <- geneMappedCount(Over)
mapped_universe_number <- universeMappedCount(Over)
Over.summary <- summary(Over)
newcolnames <- c( 'Profile', colnames(Over.summary), 'Gene', 'Universe')
Over.summary.ext <- cbind(x.profile, Over.summary, mapped_gene_number, mapped_universe_number)
colnames(Over.summary.ext) <- newcolnames
print(Over.summary.ext)
write.table(Over.summary.ext, file=outfile_agg, append=TRUE, quote=FALSE, sep="\t", col.names=FALSE, row.names=FALSE)
output <- Over.summary.ext
} else {
print('#LOG: no significant GO enrichment')
}
}
# reporting tools
if (class(Over) != 'try-error' && length(which(expectedCounts(Over) < pvalue_cutoff)) > 0) {
print('#LOG: report significant GO enrichment')
if (file.exists(outfile)) {
print('#LOG: output file has existed')
#return()
} else {
goReport <- HTMLReport(shortName=paste(x.profile, ontology_cat, sep="_"), title=paste(ontology_cat, x.profile, sep=":"), reportDirectory=outdir)
is_published <- try(publish(Over, goReport, selectedIDs=genes, annotation.db="org.Hs.eg"), silent=TRUE)
if (! class(is_published) =="try-error") {
finish(goReport)
print('Report finish')
} else {
is_published.msg <- geterrmessage()
print(paste('#ERR: ReportingTools publish fails', is_published.msg))
}
}
}
} else {
print(paste('#LOG: small sample less than', sample_size_cutoff, 'genes', entrez_gene_number))
}
return(output)
}
# run
result.summary <- lapply(gene_set, go_enrich)