library(readr)
library(dplyr)
library(igraph)
library(ggraph)
library(ggforce)
# This shared file contains the number of questions that have each pair of tags.
# It counts only questions that are not deleted and have a positive score.
# Load the tag co-occurrence table. The code below relies on the columns
# Tag1, Tag2, Cooccur and Tag1Total being present in the file.
tag_pair_data <- read_csv("http://varianceexplained.org/files/tag_pairs.csv.gz")

# Edge list: keep pairs whose Fraction (Cooccur / Tag1Total) is at least 0.35,
# then reduce to one row per Tag1 (the first qualifying row in file order).
# `.keep_all = TRUE` is required: without it distinct() returns only the Tag1
# column, dropping Tag2 and Fraction, which the vertex filter below and the
# edge aesthetics in the plot both need.
relationships <- tag_pair_data %>%
  mutate(Fraction = Cooccur / Tag1Total) %>%
  filter(Fraction >= .35) %>%
  distinct(Tag1, .keep_all = TRUE)

# Vertex table: one row per tag that appears on either end of a kept edge,
# carrying Tag1Total, sorted from largest to smallest total.
# `.keep_all = TRUE` preserves Tag1Total, which arrange() and the plot's size
# aesthetic depend on.
v <- tag_pair_data %>%
  select(Tag1, Tag1Total) %>%
  distinct(Tag1, .keep_all = TRUE) %>%
  filter(Tag1 %in% relationships$Tag1 |
           Tag1 %in% relationships$Tag2) %>%
  arrange(desc(Tag1Total))
# Arrowhead specification shared by every edge: a small closed head drawn at
# the "first" end of each edge.
a <- grid::arrow(
  ends = "first",
  type = "closed",
  length = grid::unit(.08, "inches")
)

# Fix the RNG seed immediately before building the plot; presumably this is
# to make the "fr" layout repeatable between runs.
set.seed(2016)

# Build the graph from the edge list and vertex table, then draw it:
# edge transparency follows Fraction, node and label size follow Tag1Total.
ggraph(graph_from_data_frame(relationships, vertices = v), layout = "fr") +
  geom_edge_link(aes(alpha = Fraction), arrow = a) +
  geom_node_point(aes(size = Tag1Total), color = "lightblue") +
  geom_node_text(
    aes(size = Tag1Total, label = name),
    check_overlap = TRUE
  ) +
  scale_size_continuous(range = c(2, 9)) +
  ggforce::theme_no_axes() +
  theme(legend.position = "none")
