Co-occurrences

Load keywords

Read the keywords harvested from Knowledge Network, which are cached here as a TSV file for the purposes of this survey. We use the widyr library to calculate pairwise counts. First we load the required libraries (igraph, ggraph, readr, widyr, dplyr), then we load the TSV data file.

library(igraph)
library(ggraph)
library(readr)
library(widyr)
library(dplyr)
# Load the cached keyword table from the TSV file.
kw <- readr::read_tsv("keywords/all-keywords.tsv")

# Pairwise counts of `keyword` values across `pid`, sorted by count.
# upper = FALSE is passed through unchanged from the original call.
kw_pairs <- widyr::pairwise_count(
  kw,
  keyword,
  pid,
  sort = TRUE,
  upper = FALSE
)

Generate a co-occurrence network graph of keywords

Use ggraph to plot the co-occurrence network of keyword pairs whose count n is at least 250.

# Keep only keyword pairs whose count n is at least 250 and turn the
# resulting edge list into an igraph object.
gdata <- kw_pairs %>%
  dplyr::filter(n >= 250)
graph <- igraph::graph_from_data_frame(gdata)

# Fixed seed, set just before the plot is built.
set.seed(1234)

# Network plot using the "fr" layout; edge opacity and width both follow n.
ggraph(graph, layout = "fr") +
  geom_edge_link(
    aes(edge_alpha = n, edge_width = n),
    edge_colour = "cyan4"
  ) +
  geom_node_point(size = 5) +
  geom_node_text(
    aes(label = name),
    repel = TRUE,
    point.padding = unit(0.2, "lines")
  ) +
  theme_void()

Drill down

The previous graph is pretty busy. Let’s raise the n threshold to at least 1000.

# Stricter cut-off: keep only keyword pairs whose count n is at least 1000.
gdata <- kw_pairs %>%
  dplyr::filter(n >= 1000)
graph <- igraph::graph_from_data_frame(gdata)

# Fixed seed, set just before the plot is built.
set.seed(1234)

# Same network plot as before, drawn from the smaller edge list.
ggraph(graph, layout = "fr") +
  geom_edge_link(
    aes(edge_alpha = n, edge_width = n),
    edge_colour = "cyan4"
  ) +
  geom_node_point(size = 5) +
  geom_node_text(
    aes(label = name),
    repel = TRUE,
    point.padding = unit(0.2, "lines")
  ) +
  theme_void()

Theme: environment

The previous graph was good for getting a quick view of what’s happening. Let’s drill into a theme, such as co-occurrences of keywords that have “environment” in them. We’ve cached these in environment_cooccur.tsv. Let’s also relax the n threshold to 50.

# Read the cached co-occurrence table for the "environment" theme.
# This replaces the kw_pairs object computed earlier.
kw_pairs <- readr::read_tsv("keywords/environment_cooccur.tsv")

# Keep pairs whose count n is at least 50 and build the graph object.
gdata <- kw_pairs %>%
  dplyr::filter(n >= 50)
graph <- igraph::graph_from_data_frame(gdata)

# Fixed seed, set just before the plot is built.
set.seed(1234)

# Network plot using the "fr" layout; edge opacity and width both follow n.
ggraph(graph, layout = "fr") +
  geom_edge_link(
    aes(edge_alpha = n, edge_width = n),
    edge_colour = "cyan4"
  ) +
  geom_node_point(size = 5) +
  geom_node_text(
    aes(label = name),
    repel = TRUE,
    point.padding = unit(0.2, "lines")
  ) +
  theme_void()

Theme: CSIRO DAP

How about a repository-centric view?

# Load the keyword table cached in csiro-dap_kw.tsv.
kw <- readr::read_tsv("keywords/csiro-dap_kw.tsv")

# Pairwise counts of `keyword` values across `pid`, sorted by count.
kw_pairs <- widyr::pairwise_count(
  kw,
  keyword,
  pid,
  sort = TRUE,
  upper = FALSE
)

# Keep pairs whose count n is at least 50 and build the graph object.
gdata <- kw_pairs %>%
  dplyr::filter(n >= 50)
graph <- igraph::graph_from_data_frame(gdata)

# Fixed seed, set just before the plot is built.
set.seed(1234)

# Network plot using the "fr" layout; edge opacity and width both follow n.
ggraph(graph, layout = "fr") +
  geom_edge_link(
    aes(edge_alpha = n, edge_width = n),
    edge_colour = "cyan4"
  ) +
  geom_node_point(size = 5) +
  geom_node_text(
    aes(label = name),
    repel = TRUE,
    point.padding = unit(0.2, "lines")
  ) +
  theme_void()

Correlations

Correlations follow a similar methodology, except we use the pairwise_cor() function to calculate the correlations.

Keyword correlations for research data repositories

# Load the keyword table cached in researchdata_kw.tsv.
kw <- read_tsv("keywords/researchdata_kw.tsv")

# Keep only keywords that appear in at least 50 rows, then drop the grouping
# explicitly so the correlation step receives a plain, ungrouped table.
keyword_cors <- kw %>%
  group_by(keyword) %>%
  filter(n() >= 50) %>%
  ungroup() %>%
  pairwise_cor(keyword, pid, sort = TRUE, upper = FALSE)

set.seed(1234)

# Plot only keyword pairs with a correlation above 0.9; edge opacity and
# width both follow the correlation value.
keyword_cors %>%
  filter(correlation > .9) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(
    aes(edge_alpha = correlation, edge_width = correlation),
    edge_colour = "royalblue"
  ) +
  geom_node_point(size = 5) +
  geom_node_text(
    aes(label = name),
    repel = TRUE,
    point.padding = unit(0.2, "lines")
  ) +
  theme_void()

Keyword correlations for gov data repositories

# Load the keyword table cached in opengov_kw.tsv.
kw <- read_tsv("keywords/opengov_kw.tsv")

# Keep only keywords that appear in at least 50 rows, then drop the grouping
# explicitly so the correlation step receives a plain, ungrouped table.
keyword_cors <- kw %>%
  group_by(keyword) %>%
  filter(n() >= 50) %>%
  ungroup() %>%
  pairwise_cor(keyword, pid, sort = TRUE, upper = FALSE)

set.seed(1234)

# Plot only keyword pairs with a correlation above 0.9; edge opacity and
# width both follow the correlation value.
keyword_cors %>%
  filter(correlation > .9) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(
    aes(edge_alpha = correlation, edge_width = correlation),
    edge_colour = "royalblue"
  ) +
  geom_node_point(size = 5) +
  geom_node_text(
    aes(label = name),
    repel = TRUE,
    point.padding = unit(0.2, "lines")
  ) +
  theme_void()

Keyword correlations for CSIRO DAP repository

# Load the keyword table cached in csiro-dap_kw.tsv.
kw <- read_tsv("keywords/csiro-dap_kw.tsv")

# Keep only keywords that appear in at least 50 rows, then drop the grouping
# explicitly so the correlation step receives a plain, ungrouped table.
keyword_cors <- kw %>%
  group_by(keyword) %>%
  filter(n() >= 50) %>%
  ungroup() %>%
  pairwise_cor(keyword, pid, sort = TRUE, upper = FALSE)

set.seed(1234)

# This chunk uses a lower cut-off than the other correlation plots: pairs
# with a correlation above 0.5 are drawn. Edge opacity and width both follow
# the correlation value.
keyword_cors %>%
  filter(correlation > .5) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(
    aes(edge_alpha = correlation, edge_width = correlation),
    edge_colour = "royalblue"
  ) +
  geom_node_point(size = 5) +
  geom_node_text(
    aes(label = name),
    repel = TRUE,
    point.padding = unit(0.2, "lines")
  ) +
  theme_void()

Wrap up

We’ve shown how keywords and a few R libraries can be used to gain data-driven insight into the content of metadata catalogue records.

Co-occurrences and correlations of Open Data dataset keywords

Jonathan Yu

18 October 2017