Correlation Graph, Community Detection, and Graph Manipulation

The objective here is to cluster the city in Swiss based on the given attributes using correlation network. First, we need to filter city/row with very high correlation to (at least one) other city.

This documentation can also be accessed in my Kaggle kernel, here

The corrr package works well with tidygraph and ggraph. We will use this pacakage to

library(tidygraph)

## 
## Attaching package: 'tidygraph'

## The following object is masked from 'package:stats':
## 
##     filter

library(ggraph)

## Loading required package: ggplot2

## Registered S3 methods overwritten by 'ggplot2':
##   method         from 
##   [.quosures     rlang
##   c.quosures     rlang
##   print.quosures rlang

library(corrr)
library(tidyverse)

## -- Attaching packages --------------------------- tidyverse 1.2.1 --

## v tibble  2.1.1       v purrr   0.3.2  
## v tidyr   0.8.3       v dplyr   0.8.0.1
## v readr   1.3.1       v stringr 1.4.0  
## v tibble  2.1.1       v forcats 0.4.0

## -- Conflicts ------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks tidygraph::filter(), stats::filter()
## x dplyr::lag()    masks stats::lag()

swiss.cor <- swiss %>%    # t() for matrix transpose
  t() %>% correlate() %>%    # correlate() is equivalent to cor() but                                                put NA as its diagonal entry and different class
  shave(upper = TRUE) %>%            # Shave the data frame to lower triangular matrix
  stretch(na.rm = TRUE) %>%           
  filter(r >= 0.99)

## 
## Correlation method: 'pearson'
## Missing treated using: 'pairwise.complete.obs'

swiss.cor

## # A tibble: 64 x 3
##    x            y                r
##    <chr>        <chr>        <dbl>
##  1 Courtelary   Le Locle     0.991
##  2 Delemont     Franches-Mnt 0.996
##  3 Delemont     Sarine       0.994
##  4 Franches-Mnt Sarine       0.996
##  5 Neuveville   Grandson     0.991
##  6 Neuveville   Val de Ruz   0.992
##  7 Porrentruy   Sarine       0.992
##  8 Broye        Glane        0.996
##  9 Broye        Veveyse      0.996
## 10 Broye        Monthey      0.995
## # ... with 54 more rows

Here we have constructed “the edges” dataframe for mtcars dataset.

#Create tbl_graph object
library(tidygraph)
library(ggraph)
set.seed(100)
swiss.graph <- as_tbl_graph(swiss.cor, directed = FALSE)

# There are 11 city missing, which means they have weak correlation to other city 
swiss.graph %>% activate(nodes) %>% as.data.frame() %>% nrow()

## [1] 36

nrow(swiss)

## [1] 47

#visualize the network
ggraph(swiss.graph) + 
  geom_edge_link() + 
  geom_node_point() +
  geom_node_text(
    aes(label = name), size = 3, repel = TRUE
    ) +
  theme_graph()

## Using `nicely` as default layout

First, let compare the k-means result to the network components.

set.seed(1)
swiss.clust<-kmeans(swiss,4)

We want to color the network by cluster, here we can apply join to graph object.

swiss.group <- data_frame(
  name = rownames(swiss),
  group = as.factor(swiss.clust$cluster)
)

## Warning: `data_frame()` is deprecated, use `tibble()`.
## This warning is displayed once per session.

swiss.graph <- swiss.graph %>%
  activate(nodes) %>%
  left_join(swiss.group, by = "name") %>%
  rename(label = name) %>% 
  activate(edges) %>% 
  rename(weight=r)

set.seed(100)
ggraph(swiss.graph) + 
  geom_edge_link(aes(width = weight), alpha = 0.2) + 
  scale_edge_width(range = c(0.2, 1.8)) +
  geom_node_point(aes(color = group), size = 3) +
  geom_node_text(aes(label = label), size = 3, repel = TRUE) +
  theme_graph()

## Using `nicely` as default layout

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

How about the negative correlation?

swiss.cor2 <- swiss %>%    
  t() %>% correlate() %>%    
  shave(upper = TRUE) %>%           
  stretch(na.rm = TRUE) %>%           
  filter(r < 0)

## 
## Correlation method: 'pearson'
## Missing treated using: 'pairwise.complete.obs'

swiss.cor

## # A tibble: 64 x 3
##    x            y                r
##    <chr>        <chr>        <dbl>
##  1 Courtelary   Le Locle     0.991
##  2 Delemont     Franches-Mnt 0.996
##  3 Delemont     Sarine       0.994
##  4 Franches-Mnt Sarine       0.996
##  5 Neuveville   Grandson     0.991
##  6 Neuveville   Val de Ruz   0.992
##  7 Porrentruy   Sarine       0.992
##  8 Broye        Glane        0.996
##  9 Broye        Veveyse      0.996
## 10 Broye        Monthey      0.995
## # ... with 54 more rows

set.seed(100)
swiss.graph2 <- as_tbl_graph(swiss.cor2, directed = FALSE)

swiss.graph2 <- swiss.graph2 %>%
  activate(nodes) %>%
  left_join(swiss.group, by = "name") %>%
  rename(label = name) %>% 
  activate(edges) %>% 
  rename(weight=r)

set.seed(100)
ggraph(swiss.graph2) + 
  geom_edge_link(aes(width = -weight), alpha = 0.2) + 
  scale_edge_width(range = c(0.2, 1.8)) +
  geom_node_point(aes(color = group), size = 3) +
  geom_node_text(aes(label = label), size = 3, repel = TRUE) +
  theme_graph()

## Using `nicely` as default layout

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

community detection can be applied using group_* some of them using edges and some using node as their calculation

set.seed(100)
swiss.graph %>%
  activate(nodes) %>%
   mutate(community = as.factor(group_infomap())) %>% 
  ggraph(layout = "graphopt") + 
  geom_edge_link(width = 1, colour = "lightgray") +
  geom_node_point(aes(colour = community), size = 4) +
  geom_node_text(aes(label = label), repel = TRUE)+
  theme_graph()

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

This result show us better cluster for our data. Compared to k-means, this method is more reliable since it is not just only consider the value of each node but also its pattern (in this case: correlation).

Reference:

http://www.sthda.com/english/articles/33-social-network-analysis/136-network-analysis-and-manipulation-using-r/

Correlation Graph, Community Detection, and Graph Manipulation

Steven Surya Tanujaya

July 4, 2019