Girvan-Newman Algorithm is for community detection in a network, that is, a graph. Its implementation first starts with Breadth First Search (BFS) to find minimum spanning trees in a graph. Then we can compute the edge betweenness which is the number of the shortest paths going through a given edge, as a measure of centrality of an edge in a graph. By successively removing the edges with the highest betweenness, the algorithm detects communities which are disconnected components.
To visualize an example graph, igraph package is utilized.
library(igraph)
Implementation contains the following six functions.
BFS_unweightedGraph <- function(Gmat, src){
# Dijkstra type algorithm to find shortest paths from a source node for
# unweighted graphs
# output: shDist, nPaths, prevVertexList (list)
# internal control var: visited, queue
# setup param
n = nrow(Gmat) # total num of vertexes
visited = rep(FALSE, n) # visited indicator to setup a queue (default=F)
queue = numeric() # queue for BFS
shDist = rep(Inf, n) # shortest distances from the src
nPaths = rep(0, n) # num of shortest paths from the src
prevVertexList = vector("list", length = n) # shortest paths will be stored
# at the src vertex
shDist[src] = 0
nPaths[src] = 1
visited[src] = TRUE
queue = c(queue, src)
prevVertexList[[src]] = -1 # -1 means it is the src
# a list due to possible multiplicity
# BFS type algorithm
while(length(queue) != 0){
curr = queue[1] # get a current vertex from queue
queue = queue[-1] # remove curr from queue (FIFO)
# For all connected vertexes
for (u in which(Gmat[curr, ] == 1)){
# If u is not visited, push it to the queue
if (!visited[u]){
visited[u] = TRUE
queue = c(queue, u)
}
# print(queue)
# Check if it is a better path
if (shDist[u] > shDist[curr] + 1){
shDist[u] = shDist[curr] + 1
nPaths[u] = nPaths[curr]
prevVertexList[[u]] = curr
} else if (shDist[u] == shDist[curr] + 1){
# Check if additional shortest path is found
nPaths[u] = nPaths[curr] + 1
prevVertexList[[u]] = c(prevVertexList[[u]], curr)
}
}
}
result = list(numVertexes=n,
source=src,
shDist=shDist,
nPaths=nPaths,
prevVertexList=prevVertexList)
return(result)
}
BFS_unweightedGraph_getPaths <- function(BFSout, src, dest){
# Extract the shortest paths (src -> dest) from a BFS output for a Graph and the src
n = BFSout$numVertexes
tmp = BFSout$source
if (tmp != src){
print("The source of BFSout is different from the input src. Check the sanity of input.")
return()
}
prevVertexList = BFSout$prevVertexList
numPaths = BFSout$nPaths[dest]
listPaths = vector('list', numPaths)
listPathsInd = 1
# Backtracking
# Initialize
q_branching = numeric() # queue for vertexes with multiple prevVertexes (LIFO)
q_newBranch = numeric() # additional prevVertexes at q_branching
q_pastPath = list() # path up to the last q_branching
currPath = numeric()
curr = dest # current vertex
tmp = prevVertexList[[curr]]
tmp = tmp[length(tmp):1]
crawl = tmp[length(tmp)]
crawlRight = tmp[-length(tmp)]
q_branching = c( q_branching, rep(curr, length(crawlRight)) )
q_newBranch = c(q_newBranch, crawlRight)
currPath = c(currPath, curr)
if (length(crawlRight) >= 1){
for (i in 1:length(crawlRight)){
q_pastPath = c(q_pastPath, currPath)
}
}
# Stopping rule = (crawl = -1 (at the src) and q_branching is empty)
while( (crawl != -1) || (length(q_branching)!=0) ){
#print(crawl)
curr = crawl # Move to the previous vertex
tmp = prevVertexList[[curr]]
tmp = tmp[length(tmp):1]
crawl = tmp[length(tmp)]
crawlRight = tmp[-length(tmp)]
q_branching = c( q_branching, rep(curr, length(crawlRight)) )
q_newBranch = c(q_newBranch, crawlRight)
currPath = c(currPath, curr)
if (length(crawlRight) >= 1){
for (i in 1:length(crawlRight)){
q_pastPath = c(q_pastPath, list(currPath))
}
}
# When it reached the src, report one shortest path
if (crawl == -1){
# found one shortest path
listPaths[[listPathsInd]] = currPath[length(currPath):1]
listPathsInd = listPathsInd + 1
# Further if branching vertexes remain, go to the last q_branching and remove queues.
if (length(q_branching)!=0){
curr = tail(q_branching, 1) # go to the last branching vertex
crawl = tail(q_newBranch, 1)
currPath = q_pastPath[[length(q_pastPath)]]
q_branching = q_branching[-length(q_branching)] # LIFO
q_newBranch = q_newBranch[-length(q_newBranch)]
q_pastPath = q_pastPath[-length(q_pastPath)]
}
}
}
return(listPaths)
}
getEdges <- function(Gmat){
edges = list()
i <- 1
n <- nrow(Gmat)
for (i in 1:n){
for (j in which(Gmat[i, ] == 1)){
if (j > i){
e = list(c(i, j))
edges = append(edges, e)
}
}
}
return(edges)
}
myEdgeBetweenness <- function(Gadjmat){
# Initialize var
n <- nrow(Gadjmat)
edgeList <- getEdges(Gadjmat)
nEdges <- length(edgeList)
edgeBetweenness <- numeric(nEdges)
nShPathsMat <- matrix(0, n, n)
nShPathViaAnEdge <- matrix(0, n, n)
betnnessSummand <- matrix(0, nEdges, n*(n-1)/2) # May use lots of memory for large n
BFSout <- list()
# BFS for each vertex
for (i in 1:n){
BFSout[[i]] <- BFS_unweightedGraph(Gadjmat, i)
}
# Count the number of the shortest paths between i and j
for (i in c(1:(n-1))){
for (j in c((i+1):n)){
nShPathsMat[i,j] <- BFSout[[i]]$nPaths[j]
}
}
# Compute betnnessSummand
for (i in c(1:(n-1))){
for (j in c((i+1):n)){
# Extract shPaths, which is done only once per pair
pathij <- BFS_unweightedGraph_getPaths(BFSout[[i]], i, j)
# Count shPaths going through a give edge
for (k in 1:nEdges){
e <- edgeList[[k]]
count <- 0
for (l in 1:length(pathij)){
path_l = pathij[[l]]
# Check if a path contains the edge e
for (m in 1:(length(path_l)-1)){
if (all(path_l[m:(m+1)] == e) | all(path_l[m:(m+1)] == e[2:1])){
count <- count + 1
}
}
}
summand <- count / nShPathsMat[i,j]
# Indexing upper matrix cells in a linear fashion
betnnessSummand[k, (i-1)*(2*n-i)/2 + (j-i)] <- summand
}
}
}
# Compute edge betweenness
edgeBetweenness = rowSums(betnnessSummand)
# output
n = length(edgeList)
edgeMat = matrix(NA, 2, n)
for (i in 1:n){
edgeMat[, i] = cbind(edgeList[[i]][1], edgeList[[i]][2])
}
out = list(edgeMat=edgeMat, edgeBetweenness=edgeBetweenness)
return(out)
}
isConnected <- function(Gadjmat){
n <- nrow(Gadjmat)
BFS1 <- BFS_unweightedGraph(Gadjmat, 1)
return(!any(BFS1$nPaths == 0))
}
GirvanNewman1step <- function(Gmat){
# Initialize
n <- nrow(Gmat)
# Check input matrix
if (!isConnected(Gmat)){
print("Input adj matrix is not connected, so it cannot be searched by this function.")
return()
}
edgeList <- getEdges(Gmat)
edgeBetweenness <- myEdgeBetweenness(Gmat)$edgeBetweenness
ind <- which.max(edgeBetweenness)
e <- edgeList[[ind]]
Gmat2 <- Gmat
Gmat2[e[1], e[2]] <- 0
Gmat2[e[2], e[1]] <- 0
isConnected <- isConnected(Gmat2)
out <- list(n=n, edgeList=edgeList, edgeBetweenness=edgeBetweenness,
removedEdge=e, outAdjmat=Gmat2, isConnected=isConnected)
return(out)
}
Let’s consider a toy network with 8 nodes which is an unweighted graph.
# node 1,2,3,4,5,6,7,8
Gadjmat = rbind( c(0,1,1,0,1,0,0,0),
c(1,0,0,1,0,0,0,0),
c(1,0,0,0,1,0,0,0),
c(0,1,0,0,1,1,0,0),
c(1,0,1,1,0,0,0,0),
c(0,0,0,1,0,0,1,1),
c(0,0,0,0,0,1,0,0),
c(0,0,0,0,0,1,0,0) )
The igraph library gives us nice visualization of graphs.
grp1 <- graph_from_adjacency_matrix(Gadjmat, mode="undirected", diag=FALSE)
set.seed(1); plot(grp1, vertex.size=25, vertex.label.cex=1.5)
Using the BFS, we can get the shortest paths between all pairs of nodes. Total 33 shortest paths are found.
n = nrow(Gadjmat)
BFSout = vector("list", length=n)
for (i in 1:n){
BFSout[[i]] <- BFS_unweightedGraph(Gadjmat, i)
}
allPaths = list()
for (src in 1:(n-1)){
for (dest in c((src+1):n)){
tmpout = BFS_unweightedGraph_getPaths(BFSout[[src]], src, dest)
allPaths = append(allPaths, tmpout)
}
}
allPaths
## [[1]]
## [1] 1 2
##
## [[2]]
## [1] 1 3
##
## [[3]]
## [1] 1 2 4
##
## [[4]]
## [1] 1 5 4
##
## [[5]]
## [1] 1 5
##
## [[6]]
## [1] 1 2 4 6
##
## [[7]]
## [1] 1 5 4 6
##
## [[8]]
## [1] 1 2 4 6 7
##
## [[9]]
## [1] 1 5 4 6 7
##
## [[10]]
## [1] 1 2 4 6 8
##
## [[11]]
## [1] 1 5 4 6 8
##
## [[12]]
## [1] 2 1 3
##
## [[13]]
## [1] 2 4
##
## [[14]]
## [1] 2 1 5
##
## [[15]]
## [1] 2 4 5
##
## [[16]]
## [1] 2 4 6
##
## [[17]]
## [1] 2 4 6 7
##
## [[18]]
## [1] 2 4 6 8
##
## [[19]]
## [1] 3 5 4
##
## [[20]]
## [1] 3 5
##
## [[21]]
## [1] 3 5 4 6
##
## [[22]]
## [1] 3 5 4 6 7
##
## [[23]]
## [1] 3 5 4 6 8
##
## [[24]]
## [1] 4 5
##
## [[25]]
## [1] 4 6
##
## [[26]]
## [1] 4 6 7
##
## [[27]]
## [1] 4 6 8
##
## [[28]]
## [1] 5 4 6
##
## [[29]]
## [1] 5 4 6 7
##
## [[30]]
## [1] 5 4 6 8
##
## [[31]]
## [1] 6 7
##
## [[32]]
## [1] 6 8
##
## [[33]]
## [1] 7 6 8
From the shortest paths, edge betweenesses (Wikipedia) can be obtained.
myEdgeBetweenness(Gadjmat)
## $edgeMat
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]
## [1,] 1 1 1 2 3 4 4 6 6
## [2,] 2 3 5 4 5 5 6 7 8
##
## $edgeBetweenness
## [1] 4.5 2.0 3.5 6.5 5.0 10.5 15.0 7.0 7.0
Within GirvanNewman1step(), the BFS and myEdgeBetweenness() are implemented to find and remove the edge with the highest edge betweenness. In the first running, the edge \((4, 6)\) is removed due to its highest centrality, and we find two disconnected clusters.
out <- GirvanNewman1step(Gadjmat)
out$isConnected
## [1] FALSE
grp2 <- graph_from_adjacency_matrix(out$outAdjmat, mode="undirected", diag=FALSE)
set.seed(1); plot(grp2, vertex.size=25, vertex.label.cex=1.5)