Load libraries.
library(ggplot2)
library(tidyr)
library(dplyr)
Read in and format data.
info <- read.table("MSigDb_hallmark_gene_sets.txt",header=FALSE,row.names=1,fill=TRUE,stringsAsFactors=FALSE)
rownames(info) <- unlist(lapply(strsplit(rownames(info),"HALLMARK_"),"[[",2))
Convert to list format.
info_list <- vector("list",length=nrow(info))
for(i in 1:nrow(info))
{
genes <- as.character(as.vector(info[i,]))
info_list[[i]] <- genes[which(genes != "")]
}
Get number of pathways each gene found in.
pathways_per_gene <- data.frame(table(unlist(info_list)))
table(pathways_per_gene$Freq)
##
## 1 2 3 4 5 6 7 8 9 10
## 2671 1002 422 160 75 30 13 7 1 3
Many genes are only found in one pathway. These aren’t really going to be very interesting for this analysis. Let’s remove them.
pathways_per_gene$Var1 <- as.vector(pathways_per_gene$Var1)
genes_unique_to_one_pathway <- pathways_per_gene$Var1[pathways_per_gene$Freq == 1]
for(i in 1:length(info_list))
{
info_list[[i]] <- setdiff(info_list[[i]],genes_unique_to_one_pathway)
}
Get the number of genes per pathway now that we eliminated genes unique to one pathway.
range(unlist(lapply(info_list,length)))
## [1] 14 168
hist(unlist(lapply(info_list,length)),xlab="Genes per pathway",ylab="Number of pathways",main="MSigDB hallmark gene sets,\nminus genes unique to one pathway")
All the pathways still have a reasonable number of genes after this filtering.
How many genes do we have that are in at least two pathways?
length(unique(unlist(info_list)))
## [1] 1713
We will want to create a binary matrix with 1713 rows (genes) x 50 columns (pathways). Put a 1 for each case where the gene is in the pathway.
binary_matrix <- matrix(0,nrow=length(unique(unlist(info_list))),ncol=nrow(info),dimnames=list(unique(unlist(info_list)),rownames(info)))
for(i in 1:nrow(info))
{
binary_matrix[info_list[[i]],i] <- 1
}
Cluster genes and pathways.
cluster_genes <- hclust(dist(binary_matrix))
cluster_pathways <- hclust(dist(t(binary_matrix)))
Convert binary data to long format.
binary_matrix_gathered <- data.frame(Gene = rep(rownames(binary_matrix),times=ncol(binary_matrix)),gather(data.frame(binary_matrix),key="Pathway",value="Present"),stringsAsFactors=FALSE)
binary_matrix_gathered$Present <- plyr::mapvalues(binary_matrix_gathered$Present,from=c(0,1),to=c("No","Yes"))
binary_matrix_gathered$Gene <- factor(binary_matrix_gathered$Gene,levels=cluster_genes$labels[cluster_genes$order])
binary_matrix_gathered$Pathway <- factor(binary_matrix_gathered$Pathway,levels=cluster_pathways$labels[cluster_pathways$order])
Output a heatmap.
ggplot(binary_matrix_gathered,
aes(x=Pathway,y=Gene,fill=Present)) +
geom_tile() +
scale_fill_manual(values=c("grey","red")) +
ggtitle("MSigDB hallmark gene sets, presence of genes per pathway\nMinus genes unique to one pathway") +
theme(axis.text.y = element_blank(),axis.ticks.y=element_blank(),axis.text.x = element_text(angle=90,hjust=1),legend.position="none")