Load libraries.

library(ggplot2)
library(tidyr)
library(dplyr)

Read in and format data.

info <- read.table("MSigDb_hallmark_gene_sets.txt",header=FALSE,row.names=1,fill=TRUE,stringsAsFactors=FALSE)
rownames(info) <- unlist(lapply(strsplit(rownames(info),"HALLMARK_"),"[[",2))

Convert to list format.

info_list <- vector("list",length=nrow(info))

for(i in 1:nrow(info))
{
    genes <- as.character(as.vector(info[i,]))
    info_list[[i]] <- genes[which(genes != "")]
}

Get number of pathways each gene found in.

pathways_per_gene <- data.frame(table(unlist(info_list)))

table(pathways_per_gene$Freq)
## 
##    1    2    3    4    5    6    7    8    9   10 
## 2671 1002  422  160   75   30   13    7    1    3

Many genes are only found in one pathway. These aren’t really going to be very interesting for this analysis. Let’s remove them.

pathways_per_gene$Var1 <- as.vector(pathways_per_gene$Var1)
genes_unique_to_one_pathway <- pathways_per_gene$Var1[pathways_per_gene$Freq == 1]

for(i in 1:length(info_list))
{
    info_list[[i]] <- setdiff(info_list[[i]],genes_unique_to_one_pathway)
}

Get the number of genes per pathway now that we eliminated genes unique to one pathway.

range(unlist(lapply(info_list,length)))
## [1]  14 168
hist(unlist(lapply(info_list,length)),xlab="Genes per pathway",ylab="Number of pathways",main="MSigDB hallmark gene sets,\nminus genes unique to one pathway")

All the pathways still have a reasonable number of genes after this filtering.

How many genes do we have that are in at least two pathways?

length(unique(unlist(info_list)))
## [1] 1713

We will want to create a binary matrix with 1713 rows (genes) x 50 columns (pathways). Put a 1 for each case where the gene is in the pathway.

binary_matrix <- matrix(0,nrow=length(unique(unlist(info_list))),ncol=nrow(info),dimnames=list(unique(unlist(info_list)),rownames(info)))

for(i in 1:nrow(info))
{
    binary_matrix[info_list[[i]],i] <- 1
}

Cluster genes and pathways.

cluster_genes <- hclust(dist(binary_matrix))
cluster_pathways <- hclust(dist(t(binary_matrix)))

Convert binary data to long format.

binary_matrix_gathered <- data.frame(Gene = rep(rownames(binary_matrix),times=ncol(binary_matrix)),gather(data.frame(binary_matrix),key="Pathway",value="Present"),stringsAsFactors=FALSE)

binary_matrix_gathered$Present <- plyr::mapvalues(binary_matrix_gathered$Present,from=c(0,1),to=c("No","Yes"))

binary_matrix_gathered$Gene <- factor(binary_matrix_gathered$Gene,levels=cluster_genes$labels[cluster_genes$order])
binary_matrix_gathered$Pathway <- factor(binary_matrix_gathered$Pathway,levels=cluster_pathways$labels[cluster_pathways$order])

Output a heatmap.

ggplot(binary_matrix_gathered,
    aes(x=Pathway,y=Gene,fill=Present)) +
    geom_tile() +
    scale_fill_manual(values=c("grey","red")) +
    ggtitle("MSigDB hallmark gene sets, presence of genes per pathway\nMinus genes unique to one pathway") +
    theme(axis.text.y = element_blank(),axis.ticks.y=element_blank(),axis.text.x = element_text(angle=90,hjust=1),legend.position="none")