Read the keywords harvested from Knowledge Network, which are cached here as a TSV file for the purposes of this survey. We use the widyr library to calculate pairwise counts. First we load the required libraries (igraph, ggraph, readr, widyr, dplyr), then we load the TSV data file.
library(igraph)
library(ggraph)
library(readr)
library(widyr)
library(dplyr)
# Load the cached keyword table, then count how often each pair of keywords
# appears under the same pid (sorted by count; upper triangle excluded).
kw <- read_tsv("keywords/all-keywords.tsv")
kw_pairs <- pairwise_count(kw, keyword, pid, sort = TRUE, upper = FALSE)
Use ggraph to plot the network of keyword co-occurrences where the count n is at least 250.
# Fix the RNG seed before plotting (presumably so the "fr" layout is
# repeatable between runs).
set.seed(1234)

# Keep only keyword pairs whose count is at least 250 and build a graph
# from the resulting edge list.
gdata <- kw_pairs %>% filter(n >= 250)
graph <- graph_from_data_frame(gdata)

# Edge opacity and width both scale with the pair count n.
ggraph(graph, layout = "fr") +
  geom_edge_link(
    aes(edge_alpha = n, edge_width = n),
    edge_colour = "cyan4"
  ) +
  geom_node_point(size = 5) +
  geom_node_text(
    aes(label = name),
    repel = TRUE,
    point.padding = unit(0.2, "lines")
  ) +
  theme_void()
The previous graph is pretty busy. Let’s raise the n threshold to at least 1000.
# Fix the RNG seed before plotting (presumably so the "fr" layout is
# repeatable between runs).
set.seed(1234)

# Stricter cut-off: keep only keyword pairs whose count is at least 1000.
gdata <- kw_pairs %>% filter(n >= 1000)
graph <- graph_from_data_frame(gdata)

# Edge opacity and width both scale with the pair count n.
ggraph(graph, layout = "fr") +
  geom_edge_link(
    aes(edge_alpha = n, edge_width = n),
    edge_colour = "cyan4"
  ) +
  geom_node_point(size = 5) +
  geom_node_text(
    aes(label = name),
    repel = TRUE,
    point.padding = unit(0.2, "lines")
  ) +
  theme_void()
The previous graph was good for getting a quick view of what’s happening. Let’s drill into a theme, such as co-occurrences of keywords that contain “environment”. We’ve cached this in environment_cooccur.tsv. Let’s also relax the n threshold to 50.
# Replace kw_pairs with the cached co-occurrence table for the
# "environment" theme.
kw_pairs <- read_tsv("keywords/environment_cooccur.tsv")

# Fix the RNG seed before plotting (presumably so the "fr" layout is
# repeatable between runs).
set.seed(1234)

# Relaxed cut-off: keep pairs whose count is at least 50.
gdata <- kw_pairs %>% filter(n >= 50)
graph <- graph_from_data_frame(gdata)

# Edge opacity and width both scale with the pair count n.
ggraph(graph, layout = "fr") +
  geom_edge_link(
    aes(edge_alpha = n, edge_width = n),
    edge_colour = "cyan4"
  ) +
  geom_node_point(size = 5) +
  geom_node_text(
    aes(label = name),
    repel = TRUE,
    point.padding = unit(0.2, "lines")
  ) +
  theme_void()
### Theme: CSIRO DAP
How about a repository-centric view?
# Load the CSIRO DAP keyword table and count how often each pair of
# keywords appears under the same pid.
kw <- read_tsv("keywords/csiro-dap_kw.tsv")
kw_pairs <- pairwise_count(kw, keyword, pid, sort = TRUE, upper = FALSE)

# Fix the RNG seed before plotting (presumably so the "fr" layout is
# repeatable between runs).
set.seed(1234)

# Keep pairs whose count is at least 50 and build the graph.
gdata <- kw_pairs %>% filter(n >= 50)
graph <- graph_from_data_frame(gdata)

# Edge opacity and width both scale with the pair count n.
ggraph(graph, layout = "fr") +
  geom_edge_link(
    aes(edge_alpha = n, edge_width = n),
    edge_colour = "cyan4"
  ) +
  geom_node_point(size = 5) +
  geom_node_text(
    aes(label = name),
    repel = TRUE,
    point.padding = unit(0.2, "lines")
  ) +
  theme_void()
Correlations follow a similar methodology, except we use the pairwise_cor() function to calculate the correlations.
# Load the researchdata keyword table.
kw <- read_tsv("keywords/researchdata_kw.tsv")

# Keep keywords that occur in at least 50 rows, then compute pairwise
# correlations between keywords across pids.
keyword_cors <- group_by(kw, keyword) %>%
  filter(n() >= 50) %>%
  pairwise_cor(keyword, pid, sort = TRUE, upper = FALSE)

# Fix the RNG seed before plotting (presumably so the "fr" layout is
# repeatable between runs).
set.seed(1234)

# Plot only pairs with correlation above 0.9; edge opacity and width both
# scale with the correlation.
keyword_cors %>%
  filter(correlation > 0.9) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(
    aes(edge_alpha = correlation, edge_width = correlation),
    edge_colour = "royalblue"
  ) +
  geom_node_point(size = 5) +
  geom_node_text(
    aes(label = name),
    repel = TRUE,
    point.padding = unit(0.2, "lines")
  ) +
  theme_void()
# Load the opengov keyword table.
kw <- read_tsv("keywords/opengov_kw.tsv")

# Keep keywords that occur in at least 50 rows, then compute pairwise
# correlations between keywords across pids.
keyword_cors <- group_by(kw, keyword) %>%
  filter(n() >= 50) %>%
  pairwise_cor(keyword, pid, sort = TRUE, upper = FALSE)

# Fix the RNG seed before plotting (presumably so the "fr" layout is
# repeatable between runs).
set.seed(1234)

# Plot only pairs with correlation above 0.9; edge opacity and width both
# scale with the correlation.
keyword_cors %>%
  filter(correlation > 0.9) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(
    aes(edge_alpha = correlation, edge_width = correlation),
    edge_colour = "royalblue"
  ) +
  geom_node_point(size = 5) +
  geom_node_text(
    aes(label = name),
    repel = TRUE,
    point.padding = unit(0.2, "lines")
  ) +
  theme_void()
# Load the CSIRO DAP keyword table.
kw <- read_tsv("keywords/csiro-dap_kw.tsv")

# Keep keywords that occur in at least 50 rows, then compute pairwise
# correlations between keywords across pids.
keyword_cors <- group_by(kw, keyword) %>%
  filter(n() >= 50) %>%
  pairwise_cor(keyword, pid, sort = TRUE, upper = FALSE)

# Fix the RNG seed before plotting (presumably so the "fr" layout is
# repeatable between runs).
set.seed(1234)

# Lower cut-off for this repository: plot pairs with correlation above 0.5;
# edge opacity and width both scale with the correlation.
keyword_cors %>%
  filter(correlation > 0.5) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(
    aes(edge_alpha = correlation, edge_width = correlation),
    edge_colour = "royalblue"
  ) +
  geom_node_point(size = 5) +
  geom_node_text(
    aes(label = name),
    repel = TRUE,
    point.padding = unit(0.2, "lines")
  ) +
  theme_void()
We’ve shown how we can use some R libraries to gain data-driven insight, via keywords, into the content of metadata catalogue records.