sankeyNetwork trainingThe purpose of this noteboook is to illustrate how the sankeynetwork function from the NetworkD3 package can be used to create sankey diagrams.
library(networkD3)
library(dplyr)
library(stringr)
library(tidyr)
build_sankey_data function built by Caleb Moses.build_sankey_data <- function(data, colour_data) {
#' This is a generic function for preparing datasets to be input into
#' the `networkD3::sankeyNetwork` function.
#'
#' The columns are assumed to be in the form of:
#' var_1 -> var_2 -> ... -> var_n -> value
#' where var_i is the variable for the ith layer in the sankey chart.
#'
#' Assuming this order, `build_sankey_data` returns the Nodes and Links
#' `data.frame`s for constructing the sankey chart correctly.
space_char <- ' '
data <- data %>%
mutate_if(is.character, funs(str_replace_all(., " ", space_char)))
colour_data <- colour_data %>%
mutate_if(is.character, funs(str_replace_all(., " ", space_char)))
# Get the variable name columns
# Assumes that the values are stored in the last column
gather_cols <- names(data)[1:(ncol(data) - 1)]
value_col <- names(data)[ncol(data)]
# Prepare the `node_data` by gathering `variable`/`name` combinations.
# This is important because sometimes names are duplicated between
# layers and we still need the ids to be distinct.
#
# Of course, `id`s are attached here too so we can inspect the table
# later to make sure that the concordance is correct.
node_data <- data %>%
select(one_of(gather_cols)) %>%
gather(one_of(gather_cols), key = 'variable', value = 'name') %>%
distinct() %>%
mutate(id = as.numeric(factor(paste(variable, name, sep = "//"))) - 1) %>%
arrange(id)
# Prepare `link_data` - first we create a list of dataframes of the
# form source -> target -> value. These are later concatenated to form
# the `link_data` for `networkD3::sankeyNetwork`
link_list <- lapply(1:(ncol(data) - 2), function(i) {
# The columns of `data` are split into n - 2 dataframes here.
# The matching looks like:
# names(df1) = c('var_1', 'var_2', 'value')
# names(df2) = c('var_2', 'var_3', 'value')
# ...
# names(df[n-2]) = c('var_[n-2]', 'var_[n-1]', 'value')
#
# Before outputting, the columns are renamed to 'source', 'target'
# and 'value'. This way the tables can be concatenated with `bind_rows`.
# Extract the column names for this loop
link_cols <- names(data)[c(i, i + 1, ncol(data))]
source <- link_cols[1]
target <- link_cols[2]
value <- link_cols[3]
# Select the columns
res <- data[, link_cols]
# Rename the columns
names(res) <- c('source', 'target', 'value')
# Aggregate the `value`s by `source` and `target`.
# Then attach the `variable` names for each `source` and `target`.
res <- res %>%
group_by(source, target) %>%
summarise(value = sum(value)) %>%
ungroup() %>%
mutate(source_var = link_cols[1],
target_var = link_cols[2])
return(res)
})
# Bind the tables in `link_list` together, then join with `node_data` to get
# the correct `id`s for each link. Then, reorder the columns nicely and convert
# the `id`s to integers and `link_data` to `data.frame` to pass the type checking
# in `networkD3::sankeyNetwork`.
link_data <- link_list %>%
bind_rows() %>%
inner_join(node_data, by = c("source_var" = "variable", "source" = "name")) %>%
rename(source_id = id) %>%
inner_join(node_data, by = c("target_var" = "variable", "target" = "name")) %>%
rename(target_id = id) %>%
select(source_var, target_var, source, target, source_id, target_id, value) %>%
mutate(source_id = as.integer(source_id),
target_id = as.integer(target_id)) %>%
as.data.frame()
# `node_data` must be a data.frame and the `name` column must be a factor.
node_data <- node_data %>%
mutate(name = factor(name)) %>%
left_join(colour_data, by = c('name')) %>%
as.data.frame()
build_d3_palette <- function(names, colours) {
#' convert a character vector of hex colours into a d3 scaleOrdinal palette
if (length(names) != length(colours)) {
stop("names and colours lengths do not match")
}
name_list <- jsonlite::toJSON(names)
palette_list <- jsonlite::toJSON(colours)
palette_text <- paste0('d3.scaleOrdinal()',
'.domain(', name_list, ')',
'.range(', palette_list, ");")
return(palette_text)
}
colour_scale <- build_d3_palette(node_data$name, node_data$colour)
# package `node_data` and `link_data` together before return
res <- list(nodes = node_data, links = link_data, colour_scale = JS(colour_scale))
return(res)
}
build_sankey_data and sankeyNetwork functions.#build_sankey_data function requires df input of categorical variables with final column of numeric value.
df <-
tibble::tribble(~state1, ~state2, ~value,
"Trees", "Grass", 5,
"Trees", "Wetland", 1,
"Grass", "Trees", 7,
"Grass", "Wetland", 1,
"Wetland", "Trees", 1,
"Wetland", "Grass", 2
)
df
## # A tibble: 6 x 3
## state1 state2 value
## <chr> <chr> <dbl>
## 1 Trees Grass 5
## 2 Trees Wetland 1
## 3 Grass Trees 7
## 4 Grass Wetland 1
## 5 Wetland Trees 1
## 6 Wetland Grass 2
#build_sankey_data function requires pal input of name variable and colour hex.
#colour tibble requires
#column called `name` of categorical variable values and
#column called `colour` with hex values
pal <-
tibble::tribble(~name, ~colour,
"Trees", "purple",
"Grass", "green",
"Wetland", "brown"
)
pal
## # A tibble: 3 x 2
## name colour
## <chr> <chr>
## 1 Trees purple
## 2 Grass green
## 3 Wetland brown
sankey_data <- build_sankey_data(
data=df,
colour_data=pal)
units <- "ha"
sankeyNetwork(Links=sankey_data$links, Nodes=sankey_data$nodes,
Source="source_id", Target="target_id", Value="value",
NodeID="name", colourScale=sankey_data$colour_scale,
fontSize=14, fontFamily="Arial", units=units
)