if (!requireNamespace("BiocManager", quietly=TRUE)) {
install.packages("BiocManager")
}
BiocManager::install("GOfuncR")
BiocManager::install("Homo.sapiens")
library(GOfuncR)
library(tidyverse)
library(ggplot2)
The geneNamesUnique.txt file is provided by Dr. Rebecca
L. Young. The significantMaps.csv file is manually created
from the differentially expressed genes produced from the DGE
Analysis
GO_input_genes <- read.delim(file = "/stor/work/Bio321G_RY_Spring2023/StudentDirectories/KyNguyen/MiniProject/GO_Analysis/geneNamesUnique.txt",
header = TRUE)
differentially_expressed_genes_mapping <- read.csv(file = "/stor/work/Bio321G_RY_Spring2023/StudentDirectories/KyNguyen/MiniProject/GO_Analysis/significantMaps.csv")
colnames(GO_input_genes)[1] <- "gene_id" # Change column name from "x" to "gene_id"
GO_input_genes <- data.frame(gene_id = unlist(strsplit(as.character(GO_input_genes$gene_id), "_"))) # Strip the "_" character of the gene_id, which create more rows
differentially_expressed_genes <- data.frame(gene_name = unlist(strsplit(as.character(differentially_expressed_genes_mapping$gene_name), "_"))) # Create new data.frame by stripping the "_" character of the gene_name, which create more rows (ignore gene_id)
differentially_expressed_genes <- pull(differentially_expressed_genes, gene_name) # Convert the list of differentially expressed genes into a vector
# Create a new column for whether the gene is the candidate gene or not
GO_input_genes <- GO_input_genes %>%
mutate(is_candidate = ifelse(tolower(gene_id) %in% tolower(differentially_expressed_genes), 1, 0)) # 1 if the differentially expressed genes is in the GO_input_genes, and vice versa
The hypergeometric test evaluates the over- or under-representation of a set of candidate genes in GO-categories, compared to a set of background genes. The input for the hypergeometric test is a dataframe with two columns
The output of go_enrich is a list of 4 elements. We’re
only focusing on the first two outputs
GO_Enrich_outputs <- go_enrich(GO_input_genes, test = "hyper")
results <- GO_Enrich_outputs$results
genes <- GO_Enrich_outputs$genes
# Subset over- and under- represented genes with p-value <= 0.05 (i.e., statistically significant genes)
overrepresented_genes <- results[results$raw_p_overrep <= 0.05,]
underrepresented_genes<- results[results$raw_p_underrep <= 0.05,]
candidate_genes <- genes[genes$is_candidate == 1,]
genes_annotation <- get_anno_categories(candidate_genes$gene_id) # Get the gene function (e.g., protein binding) and domain (e.g., biological process). The gene function is the column "name"
# Left join the over/underrepresented_genes data.frame with the genes_annotation data.frame
# You can also use the left_join() function from the dplyr library
annotated_overrepresented_genes <- merge(overrepresented_genes, genes_annotation,
by.x = "node_id", by.y = "go_id",
all.x = TRUE, all.y = FALSE)
annotated_underrepresented_genes <- merge(underrepresented_genes, genes_annotation,
by.x = "node_id", by.y = "go_id",
all.x = TRUE, all.y = FALSE)
# Grouped genes with the same function (the "name" column) and remove NA rows
annotated_overrepresented_genes <- annotated_overrepresented_genes %>%
group_by(node_id) %>%
mutate(gene = paste(gene, collapse=",")) %>%
unique %>%
na.omit
annotated_underrepresented_genes <- annotated_underrepresented_genes %>%
group_by(node_id) %>%
mutate(gene= paste(gene, collapse=",")) %>%
unique %>%
na.omit
# Remove the last two columns
annotated_overrepresented_genes[ ,c(9, 10)] <- NULL
annotated_underrepresented_genes[ ,c(9, 10)] <- NULL
# Convert to table
write.table(annotated_overrepresented_genes, "annotated_overrepresented_genes", quote = F, row.names = F, sep = "\t")
write.table(annotated_underrepresented_genes, "annotated_underrepresented_genes",quote = F, row.names = F, sep = "\t")
# Since our annotated_underrepresented_genes table is empty, we only need to work on the overrepresented table
annotated_overrepresented_genes <- annotated_overrepresented_genes %>%
mutate(tally = str_count(gene, ",") + 1) # Add a tally of the number of genes performing each function
graph_data <- annotated_overrepresented_genes %>%
dplyr::select(node_name, ontology, gene, FWER_overrep, tally)
graph_data <- graph_data[order(graph_data$FWER_overrep, decreasing = FALSE),]
graph_data$FWER_overrep <- graph_data$FWER_overrep + 0.001 # smallest value is 0 and -log(0) is undefined
graph_data <- filter(graph_data, FWER_overrep <= 0.2) # p-value less than 0.2 (i.e., 80% level of confidence)
graph_data$FWER_overrep <- -log(graph_data$FWER_overrep)/log(10) # log-transform FWER_overrep
head(graph_data, 10)
## # A tibble: 10 × 6
## # Groups: node_id [10]
## node_id node_name ontology gene FWER_overrep tally
## <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 GO:0004340 glucokinase activity molecul… HK3,… 3 4
## 2 GO:0004396 hexokinase activity molecul… HK1,… 3 3
## 3 GO:0005536 glucose binding molecul… HK2,… 3 4
## 4 GO:0008865 fructokinase activity molecul… HK2,… 3 4
## 5 GO:0019158 mannokinase activity molecul… HK1,… 3 4
## 6 GO:0046835 carbohydrate phosphorylation biologi… HK1,… 2.52 4
## 7 GO:0051156 glucose 6-phosphate metabolic p… biologi… HK1,… 2.52 4
## 8 GO:0001666 response to hypoxia biologi… HK2,… 2.22 4
## 9 GO:0006002 fructose 6-phosphate metabolic … biologi… HK1,… 2.05 3
## 10 GO:0045471 response to ethanol biologi… ND4,… 1.17 3
From the GO Analysis result, we can see that
ggplot(graph_data, aes(x = FWER_overrep, y = node_id)) +
geom_point(aes(size = tally, fill = ontology), alpha = 0.75, shape = 21) +
scale_size_continuous(limits = c(1, 15), range = c(1,17), breaks = c(1, 2, 3, 4)) +
labs(x= "Family-wise error rates", y = "Node ID", size = "Number of Genes", fill = "") +
theme(legend.key=element_blank(),
axis.text.x = element_text(colour = "black", size = 12, face = "bold", angle = 90, vjust = 0.3, hjust = 1),
axis.text.y = element_text(colour = "black", face = "bold", size = 11),
legend.text = element_text(size = 10, face ="bold", colour ="black"),
legend.title = element_text(size = 12, face = "bold"),
panel.background = element_blank(), panel.border = element_rect(colour = "black", fill = NA, size = 1.2),
legend.position = "right")