Sankey Diagrams

library(networkD3)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Flow table: one row per link, giving its start group, end group and size.
links <- data.frame(
  source = paste0("group_", c("A", "A", "B", "C", "C", "E")),
  target = paste0("group_", c("C", "D", "E", "F", "G", "H")),
  value = c(2, 3, 2, 3, 1, 3)
)

# Node table: every distinct group name that appears in the flows, sources
# first, in order of first appearance.
nodes <- data.frame(
  name = unique(c(as.character(links$source), as.character(links$target)))
)

# sankeyNetwork() is given node ids rather than names, so translate each name
# into its zero-based row position in `nodes`.
links$IDsource <- match(links$source, nodes$name) - 1
links$IDtarget <- match(links$target, nodes$name) - 1

# Build the diagram. No colourScale is passed, so the default colours are used.
p <- sankeyNetwork(
  Links = links,
  Nodes = nodes,
  Source = "IDsource",
  Target = "IDtarget",
  Value = "value",
  NodeID = "name"
)
p

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.1     ✔ readr     2.2.0
## ✔ ggplot2   4.0.2     ✔ stringr   1.6.0
## ✔ lubridate 1.9.5     ✔ tibble    3.3.1
## ✔ purrr     1.2.1     ✔ tidyr     1.3.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(viridis)

## Loading required package: viridisLite

library(patchwork)
library(circlize)

## ========================================
## circlize version 0.4.18
## CRAN page: https://cran.r-project.org/package=circlize
## Github page: https://github.com/jokergoo/circlize
## Documentation: https://jokergoo.github.io/circlize_book/book/
## 
## If you use it in published research, please cite:
## Gu, Z. circlize implements and enhances circular visualization
##   in R. Bioinformatics 2014.
## 
## This message can be suppressed by:
##   suppressPackageStartupMessages(library(circlize))
## ========================================

# Load the weighted, directed adjacency matrix from GitHub.
data <- read.table("https://raw.githubusercontent.com/holtzy/data_to_viz/master/Example_dataset/13_AdjacencyDirectedWeighted.csv", header = TRUE)
# Package
library(networkD3)

# Reshape to long format: one row per (row name, column name, value) triple,
# keeping only strictly positive flows.
# gather() is superseded by pivot_longer(); it is kept here because
# pivot_longer() returns rows in a different order, which would change the
# order in which nodes are first seen (and hence coloured) below.
data_long <- data %>%
  rownames_to_column(var = "rowname") %>%
  gather(key = "key", value = "value", -rowname) %>%
  filter(value > 0)
colnames(data_long) <- c("source", "target", "value")
# Append a space to every target name, so a name that occurs both as a source
# and as a target yields two distinct entries in `nodes`.
data_long$target <- paste0(data_long$target, " ")

# Node table: every distinct name involved in a flow, sources first.
nodes <- data.frame(
  name = unique(c(as.character(data_long$source), as.character(data_long$target)))
)

# sankeyNetwork() is given node ids rather than names: use the zero-based row
# position of each name in `nodes`.
data_long$IDsource <- match(data_long$source, nodes$name) - 1
data_long$IDtarget <- match(data_long$target, nodes$name) - 1

# Colour scale, written as the JavaScript expression handed to colourScale.
ColourScal <- 'd3.scaleOrdinal() .range(["#FDE725FF","#B4DE2CFF","#6DCD59FF","#35B779FF","#1F9E89FF","#26828EFF","#31688EFF","#3E4A89FF","#482878FF","#440154FF"])'

# Make the network.
sankeyNetwork(
  Links = data_long, Nodes = nodes,
  Source = "IDsource", Target = "IDtarget",
  Value = "value", NodeID = "name",
  sinksRight = FALSE, colourScale = ColourScal,
  nodeWidth = 40, fontSize = 13, nodePadding = 20
)

Arc Diagrams

An arc diagram is a special kind of network graph. It is constituted by nodes that represent entities and by links that show relationships between entities. In arc diagrams, nodes are displayed along a single axis and links are represented with arcs.

It can highlight clusters and bridges quite well if the node order is optimized.
It allows the label of each node to be displayed, which is often impossible in a 2D structure.

Here is a 2D vs arc example.

library(tidyverse)
library(viridis)
library(patchwork)
library(igraph)

## 
## Attaching package: 'igraph'

## The following object is masked from 'package:circlize':
## 
##     degree

## The following objects are masked from 'package:lubridate':
## 
##     %--%, union

## The following objects are masked from 'package:purrr':
## 
##     compose, simplify

## The following object is masked from 'package:tidyr':
## 
##     crossing

## The following object is masked from 'package:tibble':
## 
##     as_data_frame

## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union

## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum

## The following object is masked from 'package:base':
## 
##     union

library(ggraph)
library(colormap)

# Tiny edge list: "A" links to B, C, D and F, and "B" links to E.
links <- data.frame(
  source = c(rep("A", 4), "B"),
  target = c("B", "C", "D", "F", "E")
)

# Turn the edge list into an igraph object.
mygraph <- graph_from_data_frame(links)

# Conventional 2D node-link drawing (no layout given, so ggraph picks one).
p1 <- ggraph(mygraph) +
  geom_edge_link(
    edge_colour = "black",
    edge_alpha = 0.3,
    edge_width = 0.2
  ) +
  geom_node_point(color = "#69b3a2", size = 5) +
  geom_node_text(
    aes(label = name),
    repel = TRUE,
    size = 8,
    color = "#69b3a2"
  ) +
  theme_void() +
  theme(
    legend.position = "none",
    plot.margin = unit(c(2, 2, 2, 2), "cm")
  )

## Using "tree" as default layout

# Arc diagram: the same graph with nodes on one line and edges drawn as arcs.
p2 <- ggraph(mygraph, layout = "linear") +
  geom_edge_arc(
    edge_colour = "black",
    edge_alpha = 0.3,
    edge_width = 0.2
  ) +
  geom_node_point(color = "#69b3a2", size = 5) +
  geom_node_text(
    aes(label = name),
    repel = FALSE,
    size = 8,
    color = "#69b3a2",
    nudge_y = -0.1
  ) +
  theme_void() +
  theme(
    legend.position = "none",
    plot.margin = unit(c(2, 2, 2, 2), "cm")
  )

# Show the 2D drawing and the arc diagram side by side.
p1 + p2

Let’s look at more complicated examples: https://www.data-to-viz.com/graph/arc.html

Hierarchical Edge Bundling

Hierarchical edge bundling bundles the adjacency edges together to decrease the clutter usually observed in complex networks.
Hierarchical edge bundling reduces visual clutter

Here is an example showing the same dataset with and without bundling. The use of straight lines on the left results in a cluttered figure in which the connections are impossible to read. The use of bundling on the right makes a neat figure:

# Libraries
library(tidyverse)
library(viridis)
library(patchwork)
library(ggraph)
library(igraph)

# The flare dataset is provided in ggraph.
edges <- flare$edges
vertices <- flare$vertices %>%
  arrange(name) %>%
  mutate(name = factor(name, name))
connections <- flare$imports

# Preparation to draw labels properly.
vertices$id <- NA
# Leaves are the vertices that never appear as the `from` end of an edge.
myleaves <- which(is.na(match(vertices$name, edges$from)))
nleaves <- length(myleaves)
# seq_len() (unlike seq(1:n)) yields an empty vector when there are no leaves.
vertices$id[myleaves] <- seq_len(nleaves)
# One angle per leaf, stepping 360 / nleaves degrees per leaf from 90 degrees.
vertices$angle <- 90 - 360 * vertices$id / nleaves
# Labels whose angle is below -90 degrees get hjust = 1 and are rotated by 180
# degrees (presumably so the text stays upright on that side of the circle).
vertices$hjust <- ifelse(vertices$angle < -90, 1, 0)
vertices$angle <- ifelse(vertices$angle < -90, vertices$angle + 180, vertices$angle)

# Build a network object from this dataset.
mygraph <- graph_from_data_frame(edges, vertices = vertices)

# The connection object must refer to the ids of the leaves: use the position
# of each connection endpoint in `vertices`.
from <- match(connections$from, vertices$name)
to <- match(connections$to, vertices$name)

# Basic dendrogram: every hierarchy edge drawn as a straight, faint line.
# Edge styling uses ggraph's edge_-prefixed parameters; the earlier
# `size = 0.4` was ignored with an "unknown parameters: `edge_size`" warning.
p1 <- ggraph(mygraph, layout = "dendrogram", circular = TRUE) +
  geom_edge_link(edge_width = 0.4, edge_alpha = 0.1) +
  # Leaf labels only, pushed 1% outwards and rotated with the precomputed
  # angle / hjust columns.
  geom_node_text(
    aes(
      x = x * 1.01, y = y * 1.01,
      filter = leaf, label = shortName,
      angle = angle, hjust = hjust
    ),
    size = 1.5, alpha = 1
  ) +
  coord_fixed() +
  theme_void() +
  theme(
    legend.position = "none",
    plot.margin = unit(c(0, 0, 0, 0), "cm")
  ) +
  # Extra room around the circle for the labels.
  expand_limits(x = c(-1.2, 1.2), y = c(-1.2, 1.2))

# Same circular dendrogram layout, but drawing the `from` -> `to` connections
# as bundled curves instead of the hierarchy edges.
p2 <- ggraph(mygraph, layout = "dendrogram", circular = TRUE) +
  geom_conn_bundle(
    data = get_con(from = from, to = to),
    alpha = 0.1,
    colour = "#69b3a2"
  ) +
  geom_node_text(
    aes(
      x = x * 1.01, y = y * 1.01,
      filter = leaf, label = shortName,
      angle = angle, hjust = hjust
    ),
    size = 1.5,
    alpha = 1
  ) +
  coord_fixed() +
  theme_void() +
  theme(
    legend.position = "none",
    plot.margin = unit(rep(0, 4), "cm")
  ) +
  expand_limits(x = c(-1.2, 1.2), y = c(-1.2, 1.2))

# Unbundled (left) versus bundled (right).
p1 + p2

Some more examples: https://www.data-to-viz.com/graph/edge_bundling.html

Paletteer and Color Selection

Why color choice matters

Good color choices make figures easier to read and interpret. In general:

use qualitative palettes for categories
use sequential palettes for low-to-high numeric values
use diverging palettes when values split around a meaningful midpoint

Install and load packages

#install.packages("paletteer")
#install.packages("ggplot2")

library(ggplot2)
library(paletteer)

head(palettes_d_names) # first rows of the table of discrete palettes (package, palette, length, type)

## # A tibble: 6 × 5
##   package palette      length type       novelty
##   <chr>   <chr>         <int> <chr>      <lgl>  
## 1 amerika Dem_Ind_Rep3      3 divergent  FALSE  
## 2 amerika Dem_Ind_Rep5      5 divergent  FALSE  
## 3 amerika Dem_Ind_Rep7      7 divergent  FALSE  
## 4 amerika Democrat          3 sequential FALSE  
## 5 amerika Republican        3 sequential FALSE  
## 6 awtools a_palette         8 sequential TRUE

head(palettes_c_names) # first rows of the table of continuous palettes (package, palette, type)

## # A tibble: 6 × 3
##   package  palette               type      
##   <chr>    <chr>                 <chr>     
## 1 ggthemes Blue-Green Sequential sequential
## 2 ggthemes Blue Light            sequential
## 3 ggthemes Orange Light          sequential
## 4 ggthemes Blue                  sequential
## 5 ggthemes Orange                sequential
## 6 ggthemes Green                 sequential

head(palettes_dynamic_names) # first rows of the table of dynamic palettes, which can generate a variable number of colors

##       package    palette length       type
## 1 cartography   blue.pal     20 sequential
## 2 cartography orange.pal     20 sequential
## 3 cartography    red.pal     20 sequential
## 4 cartography  brown.pal     20 sequential
## 5 cartography  green.pal     20 sequential
## 6 cartography purple.pal     20 sequential

# Categorical example data: four groups, one value per group.
df_cat <- data.frame(
  group = LETTERS[1:4],
  value = c(12, 18, 9, 15)
)

# Continuous example data: x runs 1..10 with an increasing y.
# Note: the continuous-palette plot further down draws mtcars, not df_cont.
df_cont <- data.frame(
  x = seq_len(10),
  y = c(3, 5, 6, 8, 9, 11, 12, 15, 16, 18)
)

# Discrete palette: one bar per group, filled by group with RColorBrewer Set2.
ggplot(data = df_cat, mapping = aes(group, value, fill = group)) +
  geom_col() +
  scale_fill_paletteer_d("RColorBrewer::Set2") +
  theme_minimal()

# Continuous palette: weight vs mpg, points coloured by horsepower with plasma.
ggplot(data = mtcars, mapping = aes(wt, mpg, color = hp)) +
  geom_point(size = 3) +
  scale_color_paletteer_c("viridis::plasma") +
  theme_minimal()

# Diverging example data: six labelled changes, from negative through zero to
# positive.
df_div <- data.frame(
  x = c("a", "b", "c", "d", "e", "f"),
  change = c(-3, -1, 0, 2, 4, 6)
)

# Use a diverging palette when the data have a meaningful center, such as
# zero, average change, or control value.
#
# The fill limits are made symmetric about zero so that the middle of the
# palette lines up with "no change". With the default limits (the data range,
# -3 to 6) the middle of the palette would fall at 1.5 instead.
ggplot(df_div, aes(x = x, y = change, fill = change)) +
  geom_col() +
  scale_fill_paletteer_c(
    "scico::vik", # divergent palette
    limits = c(-1, 1) * max(abs(df_div$change))
  ) +
  theme_minimal()

Qualitative palettes

Use for:

species
treatment groups
tissue types
countries
categories with no order

Sequential palettes

Use for:

abundance
expression level
temperature
concentration
any low-to-high variable

Diverging palettes

Use for:

fold change around zero
difference from a control
positive vs negative effects

Tips for choosing colors well

Do not use too many categories at once
Make sure colors are easy to distinguish
Avoid relying on red/green alone
Use color to support the message, not distract from it
Keep palette choices consistent across related figures
Example of manual extraction from paletteer

Homework

Part 1. Color selection with paletteer

Before making your figures, choose one palette you think works well for categorical data and one that works well for ordered or numeric data. (Categorical data: use a qualitative palette. Numeric data: use a sequential palette.)

Task 1

Use paletteer to explore palettes and answer the following:

# Install paletteer only when it is missing, so that re-running or re-knitting
# this document does not reinstall the package every time.
if (!requireNamespace("paletteer", quietly = TRUE)) {
  install.packages("paletteer")
}

## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.5'
## (as 'lib' is unspecified)

library(paletteer)
library(dplyr)
library(scales)

## 
## Attaching package: 'scales'

## The following object is masked from 'package:viridis':
## 
##     viridis_pal

## The following object is masked from 'package:purrr':
## 
##     discard

## The following object is masked from 'package:readr':
## 
##     col_factor

# Preview eight colours of the qualitative Set2 palette as swatches.
show_col(paletteer_d("RColorBrewer::Set2", n = 8))

- What palette did you choose for categorical data? I chose the RColorBrewer::Set2 palette.
- What palette did you choose for numeric data? I chose the viridis::plasma palette.

# Preview eight colours sampled from the continuous plasma palette as swatches.
show_col(paletteer_c("viridis::plasma", n = 8))

- In 3 to 5 sentences, explain why those choices make sense. The RColorBrewer::Set2 palette is good for categorical data because it includes very different and distinct colors, which makes the categories easy to differentiate. It is also not straining on the eyes. The viridis::plasma palette is good for numeric data because it provides a smooth gradient of colors from light to dark.

Requirements

Include the code you used to preview or extract palettes
Include 1 to 2 sentences on why color choice matters in scientific graphics.

Color choice matters because it affects how accurately viewers interpret the data. It can also highlight important patterns.

Part 2. Bubble Maps

Create a bubble map in ggplot.

Requirements

plot at least 5 locations
use bubble size to represent a numeric value
use color intentionally with a palette you selected
include text labels for the locations
include a title
include a short caption or note explaining what size and color represent

Acceptable options

one map that includes both bubble size and labels
or two versions of the same map, one emphasizing color and one emphasizing labels

Suggested skills

geom_point()
geom_text() or geom_label()
coord_cartesian() if you want to zoom in
theme_void() or another clean theme

# Five U.S. cities with their coordinates and the numeric value that drives
# bubble size and colour in the map below.
locations <- data.frame(
  city = c("New York", "Los Angeles", "Chicago", "Houston", "Miami"),
  lon = c(-74.0060, -118.2437, -87.6298, -95.3698, -80.1918),
  lat = c(40.7128, 34.0522, 41.8781, 29.7604, 25.7617),
  value = c(850, 400, 270, 230, 470)
)

# Preview the data that is actually plotted. (`map_data` is not a data object
# at this point; printing it shows a function body, not data.)
head(locations)

##          city       lon     lat value
## 1    New York  -74.0060 40.7128   850
## 2 Los Angeles -118.2437 34.0522   400
## 3     Chicago  -87.6298 41.8781   270
## 4     Houston  -95.3698 29.7604   230
## 5       Miami  -80.1918 25.7617   470

# Bubble map: one point per city at its longitude/latitude. `value` drives
# both the bubble size and its colour; city names are drawn above the points.
ggplot(locations, aes(x = lon, y = lat)) +
  geom_point(aes(size = value, color = value), alpha = 0.6) +
  geom_text(aes(label = city), vjust = -1) +
  scale_color_paletteer_c("viridis::viridis") +
  labs(
    title = "Bubble Map of Selected U.S. Cities",
    size = "Population size",
    color = "Index population",
    caption = "Bubble size and color both represent the relative population index of each city"
  )

Part 3. Map variation with icon labels.

Make one variation of your map using icon labels.

Requirements

include at least 3 labeled locations
labels can be marker icons, point symbols, or custom styled labels
keep it readable and not overcrowded
write 2 to 4 sentences explaining how this labeling style changes the way the map feels or reads

Examples of what counts

a leaflet map with popup markers
a static map with special point shapes and labels
a map using marker icons or custom annotation

# Install leaflet only when it is missing, so that re-running or re-knitting
# this document does not reinstall the package every time.
if (!requireNamespace("leaflet", quietly = TRUE)) {
  install.packages("leaflet")
}

## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.5'
## (as 'lib' is unspecified)

library(leaflet)

## 
## Attaching package: 'leaflet'

## The following object is masked from 'package:networkD3':
## 
##     JS

# Install dplyr only when it is missing. An unconditional install here would
# reinstall a package that is already loaded earlier in this document.
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}

## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.5'
## (as 'lib' is unspecified)

library(dplyr)

# Example marker data (swap in your own locations): one row per place with a
# short name, latitude, longitude and a numeric value.
map_data <- tibble(
  location = c("one", "two", "three", "four", "five"),
  lat      = c(40.71, 29.76, 34.05, 41.87, 25.76),
  long     = c(-74.00, -95.36, -118.24, -87.62, -80.19),
  value    = c(8.8, 2.3, 3.9, 2.7, 0.5)
)

# Quick look at the table before mapping it.
head(map_data)

## # A tibble: 5 × 4
##   location   lat   long value
##   <chr>    <dbl>  <dbl> <dbl>
## 1 one       40.7  -74     8.8
## 2 two       29.8  -95.4   2.3
## 3 three     34.0 -118.    3.9
## 4 four      41.9  -87.6   2.7
## 5 five      25.8  -80.2   0.5

# Interactive map: default base tiles plus one circle marker per row of
# `map_data`. Each marker carries its `location` name as a label, so the
# points are identifiable on the map.
leaflet(data = map_data) %>%
  addTiles() %>%
  addCircleMarkers(lng = ~long, lat = ~lat, label = ~location)

Part 4. Basic Sankey Diagram

You may invent a small dataset if needed. Keep it simple.

Example ideas

students moving from major to career path
genes grouped into pathway categories
citation flow from field to topic
patients moving from diagnosis group to treatment group

Build one Sankey diagram.

Requirements

include at least 3 source categories
include at least 3 destination categories
show flow values
use color intentionally
include a short figure caption explaining what the flows mean

Keep it basic: It does not need to be interactive or highly customized. The goal is to understand the structure.

library(networkD3)

# Node table: rows 0-2 (zero-based) are the majors, rows 3-5 the destinations.
# NOTE(review): "research" and "Research" differ only by case and are treated
# as two separate nodes; confirm whether two distinct categories were intended.
nodes <- data.frame(
  name = c(
    "Marine Biology", "Biology", "Environmental science",
    "research", "Research", "field work"
  )
)

# Link table: `source` and `target` are zero-based row positions in `nodes`;
# `value` is the size of each flow.
links <- data.frame(
  source = rep(c(0, 1, 2), each = 2),
  target = c(3, 5, 3, 4, 5, 4),
  value = c(5, 10, 9, 10, 12, 7)
)

# Draw the Sankey diagram from the link and node tables, naming the columns
# that hold the link endpoints, the flow size and the node label.
sankeyNetwork(
  Links = links, Nodes = nodes,
  Source = "source", Target = "target",
  Value = "value", NodeID = "name"
)

Part 5. Hierarchical edge bundling graph

Create one hierarchical edge bundling graph. You may use an example dataset from a package, adapt class example code, or create a very small hierarchy yourself.

Build one edge bundling graph.

Requirements

include a hierarchical structure
include linked nodes
use color intentionally
include a short paragraph answering:

What does the hierarchy represent?
What do the edges represent?
Why might edge bundling be useful compared with drawing all lines directly?

The hierarchy represents the organizational structure of the data. The edges represent relationships between nodes that are not part of the hierarchy itself. Edge bundling is useful because it reduces visual clutter: if the edges are not bundled the figure is messy, but with bundling it is cleaner and easier to read.

Note: This is meant to be an introduction, not a perfect polished figure. It is okay to rely on a tutorial example and then make small changes.

library(ggraph)
library(igraph)
library(tidyverse)


# Three-level hierarchy: a root, three groups (A, B, C) and two leaves per
# group. `parent` names each node's parent and is NA for the root.
nodes <- data.frame(
  name = c("Root", "A", "B", "C", "A1", "A2", "B1", "B2", "C1", "C2"),
  parent = c(NA, rep("Root", 3), rep(c("A", "B", "C"), each = 2))
)

# Parent -> child edge list: one row per non-root node, read from the
# name/parent pairs above.
hierarchy_edges <- data.frame(
  from = nodes$parent[-1],
  to = nodes$name[-1]
)


# Build the tree from the parent -> child edge list alone.
graph <- graph_from_data_frame(d = hierarchy_edges)


# Cross-links between leaves; these are the connections that get bundled.
edges <- data.frame(
  from = c("A1", "A2", "B1", "C1", "B2"),
  to = c("B1", "C2", "C1", "A1", "A2")
)



# Hierarchical edge bundling on a circular dendrogram layout.
ggraph(graph, layout = "dendrogram", circular = TRUE) +

  # hierarchical tree structure
  geom_edge_diagonal(color = "blue") +

  # bundled edges, coloured by position along each connection.
  # after_stat(index) replaces the deprecated `..index..` notation.
  geom_conn_bundle(
    data = get_con(from = edges$from, to = edges$to),
    aes(color = after_stat(index)),
    width = 1,
    alpha = 0.6
  ) +

  # nodes (leaves only)
  geom_node_point(aes(filter = leaf), size = 3, color = "purple") +

  geom_node_text(aes(label = name, filter = leaf),
                 size = 2, angle = 90, hjust = 1) +

  # Edge colours are controlled by ggraph's edge scales; a plain ggplot2
  # colour scale would not be applied to the bundled edges.
  scale_edge_colour_viridis(option = "plasma") +

  theme_void() +
  ggtitle("Hierarchical Edge Bundling Example")

Network & Flow Diagrams

2026-04-05

Sankey Diagrams

Arc Diagrams

Hierarchical Edge Bundling

Paletteer and Color Selection

Why color choice matters

Install and load packages

Homework

Part 1. Color selection with paletteer

Part 2. Bubble Maps

Part 3. Map variation with icon labels.

Part 4. Basic Sankey Diagram

Part 5. Hierarchical edge bundling graph