Introduction

Sometimes it is needed to vizualise a matrix to see sparsity or compare different kind of ordering. In this post I will show you a solution with geom_raster() in ggplot2. It is similar that matlab spy() function. geom_raster() is a special case of geom_tile() where tiles are the same size.

Create a matrix

This chunk will create a 10×6 weighted matrix of x1 to x10 and y1 to y6.

A <- matrix(c(2 ,   0   ,   0   ,   0   ,   0   ,   2   ,
              0 ,   0   ,   0   ,   0   ,   2   ,   2   ,
              0 ,   0   ,   0   ,   0   ,   3   ,   2   ,
              0 ,   0   ,   0   ,   0   ,   0   ,   6   ,
              2 ,   2   ,   3   ,   5   ,   4   ,   2   ,
              6 ,   4   ,   3   ,   0   ,   0   ,   0   ,
              0 ,   0   ,   3   ,   1   ,   0   ,   0   ,
              0 ,   0   ,   0   ,   2   ,   2   ,   1   ,
              0 ,   0   ,   2   ,   3   ,   0   ,   0   ,
              0 ,   2   ,   2   ,   0   ,   0   ,   0), nrow=10, ncol=6, byrow = T)
rownames(A) <- paste("x", 1:10, sep="")
colnames(A) <- paste("y", 1:6, sep="")

Ordering rows and columns

Simple ordering by colsums and rowsums

ggplot2 works with longdata instead of wide data format like a matrix, so you need to convert it first.

library(reshape2)
longData <- melt(A)

If you want to control the orderings of rows and colums you need to specify the ordering with factors of rows and columns. Levels are vectors of rownames and colnames of original matrix.

library(dplyr)
longData$Var1 <- factor(longData$Var1, levels=names(sort(rowSums(A), decreasing = T)))
longData$Var2 <- factor(longData$Var2, levels=names(sort(colSums(A), decreasing = T)))

longData<-longData[longData$value!=0,]

library(ggplot2)
ggplot(longData, aes(x = Var2, y = Var1)) + 
  geom_raster(aes(fill=value)) + 
  scale_fill_gradient(low="grey60", high="red") +
  labs(x="", y="") +
  theme_bw()

Ordering with TSP

There is an excellent implementation of traveling sales person (TSP) ordering in seriation package. It randomly select the first row or column in process of ordering therefore every result of sorting is different.

library(seriation)
o <- seriate(A, method="BEA_TSP")
longData$Var1 <- factor(longData$Var1, levels=names(unlist(o[[1]][])))
longData$Var2 <- factor(longData$Var2, levels=names(unlist(o[[2]][])))

longData<-longData[longData$value!=0,]

ggplot(longData, aes(x = Var2, y = Var1)) + 
  geom_raster(aes(fill=value)) + 
  scale_fill_gradient(low="grey60", high="red") +
  labs(x="", y="") +
  theme_bw()

Ordering by clusters

In this example I use hierarchical clustering where distance matrix is calcualted with generalized Jaccard similarity for row vectors and column vectors.

A <- A
sim.jac <- matrix(0, nrow=nrow(A), ncol=nrow(A))
rownames(sim.jac) <- rownames(A)
colnames(sim.jac) <- rownames(A)

#weighted jaccard for rows
pairs <- t(combn(1:nrow(A), 2))

for (i in 1:nrow(pairs)){
  num <- sum(sapply(1:ncol(A), function(x)(min(A[pairs[i,1],x],A[pairs[i,2],x]))))
  den <- sum(sapply(1:ncol(A), function(x)(max(A[pairs[i,1],x],A[pairs[i,2],x]))))
  sim.jac[pairs[i,1],pairs[i,2]] <- num/den
  sim.jac[pairs[i,2],pairs[i,1]] <- num/den  
}
sim.jac[which(is.na(sim.jac))] <- 0
diag(sim.jac) <- 1
sim.jac.rows <- sim.jac

A <- t(A)
sim.jac <- matrix(0, nrow=nrow(A), ncol=nrow(A))
rownames(sim.jac) <- rownames(A)
colnames(sim.jac) <- rownames(A)

#weighted jaccard for rows
pairs <- t(combn(1:nrow(A), 2))

for (i in 1:nrow(pairs)){
  num <- sum(sapply(1:ncol(A), function(x)(min(A[pairs[i,1],x],A[pairs[i,2],x]))))
  den <- sum(sapply(1:ncol(A), function(x)(max(A[pairs[i,1],x],A[pairs[i,2],x]))))
  sim.jac[pairs[i,1],pairs[i,2]] <- num/den
  sim.jac[pairs[i,2],pairs[i,1]] <- num/den  
}
sim.jac[which(is.na(sim.jac))] <- 0
diag(sim.jac) <- 1
sim.jac.cols <- sim.jac

Q <- A
longData<-melt(Q)
longData<-longData[longData$value!=0,]
longData <- left_join(longData, cut.rows, by=c("Var2"="names"))
colnames(longData)[4] <- "Var2_clust"
longData$Var2 <- as.factor(longData$Var2)
longData <- left_join(longData, cut.cols, by=c("Var1"="names"))
colnames(longData)[5] <- "Var1_clust"

longData$Var1 <- factor(longData$Var1, levels=unique(arrange(longData, Var1_clust)[,1]))
longData$Var2 <- factor(longData$Var2, levels=unique(arrange(longData, Var2_clust)[,2]))

#3 cluster
library(ggsci)
library(scales)
mypal <- pal_simpsons("springfield", alpha = 1)(5)[c(1,2,5)]
show_col(mypal)

axis.x.colour <- (longData %>% select(Var1, Var1_clust) %>% unique %>% arrange(Var1_clust) %>% select(Var1_clust))[,1] %>% plyr::mapvalues(from=c(1:3), to=mypal)

axis.y.colour <- (longData %>% select(Var2, Var2_clust) %>% unique %>% arrange(Var2_clust) %>% select(Var2_clust))[,1] %>% plyr::mapvalues(from=c(1:3), to=mypal)

ggplot(longData, aes(x = Var1, y = Var2)) + 
  geom_raster(aes(fill=value)) + 
  scale_fill_gradient(low="grey60", high="red") +
  labs(x="", y="") +
  theme_bw() + theme(axis.text.x=element_text(size=11, face="bold"),
                     axis.text.y=element_text(size=11, face="bold"),
                     legend.text=element_text(size=8),
                     legend.title=element_text(size=8)) +
  theme(axis.text.y=element_text(colour=axis.y.colour),
        axis.text.x=element_text(colour=axis.x.colour)) +
  annotate("segment", x = 3.5, xend = 3.5, y = 0, yend = 10.5, colour = "blue", size = 0.5) +
  annotate("segment", x = 0, xend = 6.5, y = 2.5, yend = 2.5, colour = "blue", size = 0.5) +
  annotate("segment", x = 0, xend = 6.5, y = 5.5, yend = 5.5, colour = "blue", size = 0.5)

Be happyR! :)

Geom_raster() in ggplot2