The objective here is to cluster the city in Swiss based on the given attributes using correlation network. First, we need to filter city/row with very high correlation to (at least one) other city.
This documentation can also be accessed in my Kaggle kernel, here
The corrr
package works well with tidygraph
and ggraph
. We will use this pacakage to
library(tidygraph)
##
## Attaching package: 'tidygraph'
## The following object is masked from 'package:stats':
##
## filter
library(ggraph)
## Loading required package: ggplot2
## Registered S3 methods overwritten by 'ggplot2':
## method from
## [.quosures rlang
## c.quosures rlang
## print.quosures rlang
library(corrr)
library(tidyverse)
## -- Attaching packages --------------------------- tidyverse 1.2.1 --
## v tibble 2.1.1 v purrr 0.3.2
## v tidyr 0.8.3 v dplyr 0.8.0.1
## v readr 1.3.1 v stringr 1.4.0
## v tibble 2.1.1 v forcats 0.4.0
## -- Conflicts ------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks tidygraph::filter(), stats::filter()
## x dplyr::lag() masks stats::lag()
swiss.cor <- swiss %>% # t() for matrix transpose
t() %>% correlate() %>% # correlate() is equivalent to cor() but put NA as its diagonal entry and different class
shave(upper = TRUE) %>% # Shave the data frame to lower triangular matrix
stretch(na.rm = TRUE) %>%
filter(r >= 0.99)
##
## Correlation method: 'pearson'
## Missing treated using: 'pairwise.complete.obs'
swiss.cor
## # A tibble: 64 x 3
## x y r
## <chr> <chr> <dbl>
## 1 Courtelary Le Locle 0.991
## 2 Delemont Franches-Mnt 0.996
## 3 Delemont Sarine 0.994
## 4 Franches-Mnt Sarine 0.996
## 5 Neuveville Grandson 0.991
## 6 Neuveville Val de Ruz 0.992
## 7 Porrentruy Sarine 0.992
## 8 Broye Glane 0.996
## 9 Broye Veveyse 0.996
## 10 Broye Monthey 0.995
## # ... with 54 more rows
Here we have constructed “the edges” dataframe for mtcars dataset.
#Create tbl_graph object
library(tidygraph)
library(ggraph)
set.seed(100)
swiss.graph <- as_tbl_graph(swiss.cor, directed = FALSE)
# There are 11 city missing, which means they have weak correlation to other city
swiss.graph %>% activate(nodes) %>% as.data.frame() %>% nrow()
## [1] 36
nrow(swiss)
## [1] 47
#visualize the network
ggraph(swiss.graph) +
geom_edge_link() +
geom_node_point() +
geom_node_text(
aes(label = name), size = 3, repel = TRUE
) +
theme_graph()
## Using `nicely` as default layout
First, let compare the k-means result to the network components.
set.seed(1)
swiss.clust<-kmeans(swiss,4)
We want to color the network by cluster
, here we can apply join
to graph object.
swiss.group <- data_frame(
name = rownames(swiss),
group = as.factor(swiss.clust$cluster)
)
## Warning: `data_frame()` is deprecated, use `tibble()`.
## This warning is displayed once per session.
swiss.graph <- swiss.graph %>%
activate(nodes) %>%
left_join(swiss.group, by = "name") %>%
rename(label = name) %>%
activate(edges) %>%
rename(weight=r)
set.seed(100)
ggraph(swiss.graph) +
geom_edge_link(aes(width = weight), alpha = 0.2) +
scale_edge_width(range = c(0.2, 1.8)) +
geom_node_point(aes(color = group), size = 3) +
geom_node_text(aes(label = label), size = 3, repel = TRUE) +
theme_graph()
## Using `nicely` as default layout
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
How about the negative correlation?
swiss.cor2 <- swiss %>%
t() %>% correlate() %>%
shave(upper = TRUE) %>%
stretch(na.rm = TRUE) %>%
filter(r < 0)
##
## Correlation method: 'pearson'
## Missing treated using: 'pairwise.complete.obs'
swiss.cor
## # A tibble: 64 x 3
## x y r
## <chr> <chr> <dbl>
## 1 Courtelary Le Locle 0.991
## 2 Delemont Franches-Mnt 0.996
## 3 Delemont Sarine 0.994
## 4 Franches-Mnt Sarine 0.996
## 5 Neuveville Grandson 0.991
## 6 Neuveville Val de Ruz 0.992
## 7 Porrentruy Sarine 0.992
## 8 Broye Glane 0.996
## 9 Broye Veveyse 0.996
## 10 Broye Monthey 0.995
## # ... with 54 more rows
set.seed(100)
swiss.graph2 <- as_tbl_graph(swiss.cor2, directed = FALSE)
swiss.graph2 <- swiss.graph2 %>%
activate(nodes) %>%
left_join(swiss.group, by = "name") %>%
rename(label = name) %>%
activate(edges) %>%
rename(weight=r)
set.seed(100)
ggraph(swiss.graph2) +
geom_edge_link(aes(width = -weight), alpha = 0.2) +
scale_edge_width(range = c(0.2, 1.8)) +
geom_node_point(aes(color = group), size = 3) +
geom_node_text(aes(label = label), size = 3, repel = TRUE) +
theme_graph()
## Using `nicely` as default layout
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
community detection can be applied using group_*
some of them using edges and some using node as their calculation
set.seed(100)
swiss.graph %>%
activate(nodes) %>%
mutate(community = as.factor(group_infomap())) %>%
ggraph(layout = "graphopt") +
geom_edge_link(width = 1, colour = "lightgray") +
geom_node_point(aes(colour = community), size = 4) +
geom_node_text(aes(label = label), repel = TRUE)+
theme_graph()
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
This result show us better cluster for our data. Compared to k-means, this method is more reliable since it is not just only consider the value of each node but also its pattern (in this case: correlation).
Reference: