library(networkD3)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Flow table: one row per connection, giving its origin, destination and size
links <- data.frame(
  source = c("group_A", "group_A", "group_B", "group_C", "group_C", "group_E"),
  target = c("group_C", "group_D", "group_E", "group_F", "group_G", "group_H"),
  value = c(2, 3, 2, 3, 1, 3)
)
# Node table: every distinct entity that appears as a source or a target
nodes <- data.frame(
  name = unique(c(as.character(links$source), as.character(links$target)))
)
# networkD3 identifies nodes by zero-based row position in `nodes`, not by name,
# so translate both ends of every link into such an index
links$IDsource <- match(links$source, nodes$name) - 1
links$IDtarget <- match(links$target, nodes$name) - 1
# Build the Sankey widget (no custom colour scale is supplied) and display it
p <- sankeyNetwork(
  Links = links,
  Nodes = nodes,
  Source = "IDsource",
  Target = "IDtarget",
  Value = "value",
  NodeID = "name"
)
p
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.1 ✔ readr 2.2.0
## ✔ ggplot2 4.0.2 ✔ stringr 1.6.0
## ✔ lubridate 1.9.5 ✔ tibble 3.3.1
## ✔ purrr 1.2.1 ✔ tidyr 1.3.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(viridis)
## Loading required package: viridisLite
library(patchwork)
library(circlize)
## ========================================
## circlize version 0.4.18
## CRAN page: https://cran.r-project.org/package=circlize
## Github page: https://github.com/jokergoo/circlize
## Documentation: https://jokergoo.github.io/circlize_book/book/
##
## If you use it in published research, please cite:
## Gu, Z. circlize implements and enhances circular visualization
## in R. Bioinformatics 2014.
##
## This message can be suppressed by:
## suppressPackageStartupMessages(library(circlize))
## ========================================
# Fetch the directed, weighted adjacency matrix from GitHub
data <- read.table(
  "https://raw.githubusercontent.com/holtzy/data_to_viz/master/Example_dataset/13_AdjacencyDirectedWeighted.csv",
  header = TRUE
)
# Package
library(networkD3)
# Long format: move the row names into a column, stack every other column into
# key/value pairs, and keep only the strictly positive flows
data_long <- rownames_to_column(data)
data_long <- gather(data_long, key = "key", value = "value", -rowname)
data_long <- filter(data_long, value > 0)
colnames(data_long) <- c("source", "target", "value")
# Appending a space to every target means a name occurring as both a source and
# a target ends up as two separate nodes
data_long$target <- paste0(data_long$target, " ")
# Node table: every distinct entity involved in a flow
nodes <- data.frame(
  name = unique(c(as.character(data_long$source), as.character(data_long$target)))
)
# networkD3 wants zero-based node indices rather than the names used in the links
data_long$IDsource <- match(data_long$source, nodes$name) - 1
data_long$IDtarget <- match(data_long$target, nodes$name) - 1
# Colour scale, written as the D3 expression handed to networkD3
ColourScal <- 'd3.scaleOrdinal() .range(["#FDE725FF","#B4DE2CFF","#6DCD59FF","#35B779FF","#1F9E89FF","#26828EFF","#31688EFF","#3E4A89FF","#482878FF","#440154FF"])'
# Draw the network
sankeyNetwork(
  Links = data_long,
  Nodes = nodes,
  Source = "IDsource",
  Target = "IDtarget",
  Value = "value",
  NodeID = "name",
  sinksRight = FALSE,
  colourScale = ColourScal,
  nodeWidth = 40,
  fontSize = 13,
  nodePadding = 20
)
An arc diagram is a special kind of network graph. It consists of nodes that represent entities and links that show the relationships between those entities. In arc diagrams, nodes are displayed along a single axis and links are represented with arcs.
Here is a 2D vs arc example.
library(tidyverse)
library(viridis)
library(patchwork)
library(igraph)
##
## Attaching package: 'igraph'
## The following object is masked from 'package:circlize':
##
## degree
## The following objects are masked from 'package:lubridate':
##
## %--%, union
## The following objects are masked from 'package:purrr':
##
## compose, simplify
## The following object is masked from 'package:tidyr':
##
## crossing
## The following object is masked from 'package:tibble':
##
## as_data_frame
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
library(ggraph)
library(colormap)
# Minimal edge list: A connects to B, C, D and F; B connects to E
links <- data.frame(
  source = c("A", "A", "A", "A", "B"),
  target = c("B", "C", "D", "F", "E")
)
# Turn the edge list into an igraph object
mygraph <- graph_from_data_frame(links)
# Conventional node-link drawing; no layout is named, so ggraph picks its default
p1 <- ggraph(mygraph) +
  geom_edge_link(edge_colour = "black", edge_alpha = 0.3, edge_width = 0.2) +
  geom_node_point(color = "#69b3a2", size = 5) +
  geom_node_text(aes(label = name), repel = TRUE, size = 8, color = "#69b3a2") +
  theme_void() +
  theme(legend.position = "none", plot.margin = unit(c(2, 2, 2, 2), "cm"))
## Using "tree" as default layout
# Arc diagram of the same graph: nodes sit on one line and links are drawn as arcs
p2 <- ggraph(mygraph, layout = "linear") +
  geom_edge_arc(edge_colour = "black", edge_alpha = 0.3, edge_width = 0.2) +
  geom_node_point(color = "#69b3a2", size = 5) +
  geom_node_text(
    aes(label = name),
    repel = FALSE,
    size = 8,
    color = "#69b3a2",
    nudge_y = -0.1
  ) +
  theme_void() +
  theme(legend.position = "none", plot.margin = unit(c(2, 2, 2, 2), "cm"))
# Combine the node-link drawing and the arc diagram into one figure
p1 + p2
Let’s look at more complicated examples: https://www.data-to-viz.com/graph/arc.html
Here is an example showing the same dataset with and without bundling. The straight lines used on the left result in a cluttered figure that makes it impossible to read the connections. The bundling used on the right produces a neat figure:
# Libraries
library(tidyverse)
library(viridis)
library(patchwork)
library(ggraph)
library(igraph)
# The flare dataset is provided in ggraph; it supplies the edges, vertices and imports used below
edges <- flare$edges
vertices <- flare$vertices %>%
  arrange(name) %>%
  mutate(name = factor(name, name))
connections <- flare$imports
# Preparation to draw labels properly: only leaves (vertices that never appear
# in edges$from) receive an id, which is then converted into a label angle
vertices$id <- NA
myleaves <- which(is.na(match(vertices$name, edges$from)))
nleaves <- length(myleaves)
# seq_len() stays empty when there are no leaves, unlike a 1:nleaves sequence
vertices$id[myleaves] <- seq_len(nleaves)
vertices$angle <- 90 - 360 * vertices$id / nleaves
# Labels whose angle falls below -90 are right-aligned and rotated by 180 degrees
# (presumably so that text on that side of the circle is not drawn upside down)
vertices$hjust <- ifelse(vertices$angle < -90, 1, 0)
vertices$angle <- ifelse(vertices$angle < -90, vertices$angle + 180, vertices$angle)
# Build a network object from this dataset
mygraph <- graph_from_data_frame(edges, vertices = vertices)
# The connections must be expressed as row positions in `vertices`
from <- match(connections$from, vertices$name)
to <- match(connections$to, vertices$name)
# Basic dendrogram: the hierarchy edges drawn as straight lines on a circular layout.
# Edge thickness is set with `edge_width`; ggraph edge geoms do not recognise `size`.
p1 <- ggraph(mygraph, layout = "dendrogram", circular = TRUE) +
  geom_edge_link(edge_width = 0.4, edge_alpha = 0.1) +
  # Label leaves only, pushed slightly outwards and rotated by the precomputed angle
  geom_node_text(
    aes(x = x * 1.01, y = y * 1.01, filter = leaf, label = shortName, angle = angle, hjust = hjust),
    size = 1.5,
    alpha = 1
  ) +
  coord_fixed() +
  theme_void() +
  theme(
    legend.position = "none",
    plot.margin = unit(c(0, 0, 0, 0), "cm")
  ) +
  # Widen the plotting area beyond the unit circle to leave room for the labels
  expand_limits(x = c(-1.2, 1.2), y = c(-1.2, 1.2))
# Same circular dendrogram layout, but drawing the connections in `from`/`to`
# as bundled curves instead of the hierarchy edges
p2 <- ggraph(mygraph, layout = "dendrogram", circular = TRUE) +
  geom_conn_bundle(
    data = get_con(from = from, to = to),
    alpha = 0.1,
    colour = "#69b3a2"
  ) +
  geom_node_text(
    aes(x = x * 1.01, y = y * 1.01, filter = leaf, label = shortName, angle = angle, hjust = hjust),
    size = 1.5,
    alpha = 1
  ) +
  coord_fixed() +
  theme_void() +
  theme(legend.position = "none", plot.margin = unit(rep(0, 4), "cm")) +
  expand_limits(x = c(-1.2, 1.2), y = c(-1.2, 1.2))
# Combine the unbundled and bundled versions into one figure
p1 + p2
Some more examples: https://www.data-to-viz.com/graph/edge_bundling.html
Good color choices make figures easier to read and interpret. In general:
# One-time setup: uncomment the two lines below to install the packages used here
#install.packages("paletteer")
#install.packages("ggplot2")
library(ggplot2)
library(paletteer)
# Browse the palette catalogues that paletteer exposes; each head() call prints
# the first six rows of a catalogue (rendered output follows each call)
head(palettes_d_names) #discrete palettes
## # A tibble: 6 × 5
## package palette length type novelty
## <chr> <chr> <int> <chr> <lgl>
## 1 amerika Dem_Ind_Rep3 3 divergent FALSE
## 2 amerika Dem_Ind_Rep5 5 divergent FALSE
## 3 amerika Dem_Ind_Rep7 7 divergent FALSE
## 4 amerika Democrat 3 sequential FALSE
## 5 amerika Republican 3 sequential FALSE
## 6 awtools a_palette 8 sequential TRUE
head(palettes_c_names) #continuous palettes
## # A tibble: 6 × 3
## package palette type
## <chr> <chr> <chr>
## 1 ggthemes Blue-Green Sequential sequential
## 2 ggthemes Blue Light sequential
## 3 ggthemes Orange Light sequential
## 4 ggthemes Blue sequential
## 5 ggthemes Orange sequential
## 6 ggthemes Green sequential
head(palettes_dynamic_names) # palettes that can generate a variable number of colors
## package palette length type
## 1 cartography blue.pal 20 sequential
## 2 cartography orange.pal 20 sequential
## 3 cartography red.pal 20 sequential
## 4 cartography brown.pal 20 sequential
## 5 cartography green.pal 20 sequential
## 6 cartography purple.pal 20 sequential
# Categorical example data: four groups, one value each
df_cat <- data.frame(
  group = LETTERS[1:4],
  value = c(12, 18, 9, 15)
)
# Continuous example data: ten x positions with an increasing y
df_cont <- data.frame(
  x = seq_len(10),
  y = c(3, 5, 6, 8, 9, 11, 12, 15, 16, 18)
)
# Bar chart of the categorical data, one fill colour per group from a discrete palette
ggplot(data = df_cat, mapping = aes(group, value, fill = group)) +
  geom_col() +
  scale_fill_paletteer_d("RColorBrewer::Set2") +
  theme_minimal()
# Scatter plot of mtcars with point colour following hp on a continuous palette
ggplot(data = mtcars, mapping = aes(wt, mpg, color = hp)) +
  geom_point(size = 3) +
  scale_color_paletteer_c("viridis::plasma") +
  theme_minimal()
# Example data with a meaningful centre: changes below and above zero
df_div <- data.frame(
  x = letters[1:6],
  change = c(-3, -1, 0, 2, 4, 6)
)
# Use a diverging palette when the data have a meaningful center, such as zero, average change, or control value.
# The limits are made symmetric around zero so the palette's midpoint colour
# lands on change == 0; with the default limits (the data range, -3 to 6) the
# midpoint would sit at 1.5 instead.
ggplot(df_div, aes(x = x, y = change, fill = change)) +
  geom_col() +
  scale_fill_paletteer_c(
    "scico::vik",
    limits = c(-1, 1) * max(abs(df_div$change))
  ) + # divergent palette
  theme_minimal()
Qualitative palettes
Use for:
Sequential palettes
Use for:
Diverging palettes
Use for:
Tips for choosing colors well
Before making your figures, choose one palette you think works well for categorical data and one that works well for ordered or numeric data.
Task 1
Use paletteer to explore palettes and answer the following:
Requirements
library(paletteer)
# Task 1 exploration: print the first six rows of each paletteer catalogue
# Discrete palettes
head(palettes_d_names)
## # A tibble: 6 × 5
## package palette length type novelty
## <chr> <chr> <int> <chr> <lgl>
## 1 amerika Dem_Ind_Rep3 3 divergent FALSE
## 2 amerika Dem_Ind_Rep5 5 divergent FALSE
## 3 amerika Dem_Ind_Rep7 7 divergent FALSE
## 4 amerika Democrat 3 sequential FALSE
## 5 amerika Republican 3 sequential FALSE
## 6 awtools a_palette 8 sequential TRUE
# Continuous palettes
head(palettes_c_names)
## # A tibble: 6 × 3
## package palette type
## <chr> <chr> <chr>
## 1 ggthemes Blue-Green Sequential sequential
## 2 ggthemes Blue Light sequential
## 3 ggthemes Orange Light sequential
## 4 ggthemes Blue sequential
## 5 ggthemes Orange sequential
## 6 ggthemes Green sequential
# Dynamic palettes
head(palettes_dynamic_names)
## package palette length type
## 1 cartography blue.pal 20 sequential
## 2 cartography orange.pal 20 sequential
## 3 cartography red.pal 20 sequential
## 4 cartography brown.pal 20 sequential
## 5 cartography green.pal 20 sequential
## 6 cartography purple.pal 20 sequential
If I had to pick the palette for categorical data, I would pick the Blue Light sequential palette because it's sequential. If I had to do numerical data, I would use red.pal because of its standout color and the fact that it has a length of 20 for my numerical values. These choices make sense because they fit well into each type of data group.
Your color choice matters because choosing colorblind-friendly options is important. If your audience may include people with colorblindness, you still want to make sure your figures are effective.

### Part 2. Bubble Maps
Create a bubble map in ggplot.
Requirements
Acceptable options
Suggested skills
library(ggplot2)
library(dplyr)
library(maps)
##
## Attaching package: 'maps'
## The following object is masked from 'package:viridis':
##
## unemp
## The following object is masked from 'package:purrr':
##
## map
# Five US cities with their coordinates and the value that drives bubble size and colour
df <- tibble(
  city = c("New York", "D.C", "Houston", "Las Vegas", "Seattle"),
  lon = c(-73.935242, -77.009056, -95.358421, -115.176468, -122.335167),
  lat = c(40.730610, 38.889805, 29.749907, 36.188110, 47.608013),
  value = c(50, 80, 60, 40, 30)
)
# State outlines used as the base layer
US <- map_data("state")
ggplot() +
  geom_polygon(data = US, aes(long, lat, group = group), fill = "gray95", color = "white") +
  geom_point(data = df, aes(lon, lat, size = value, color = value), alpha = 1) +
  # geom_text() offers nudge_x / nudge_y (there is no `nudge`); lift the labels above the points
  geom_text(data = df, aes(lon, lat, label = city), nudge_y = 1.1, size = 3) +
  # viridis option letters are upper-case; a lower-case "c" is rejected and falls back to the default
  scale_color_viridis_c(option = "C") +
  scale_size(range = c(4, 18)) +
  coord_fixed(1.3) +
  labs(
    title = "Bubble Map of Large US Cities",
    caption = "A bubble map of large US cities that are off the top of my head. The color is to help differentiate between the points and the size indicates the proximity of each city to the college. The larger the bubble the closer it is to the college"
  ) +
  theme_void()
Make one variation of your map using icon labels.
Requirements
Examples of what counts
# Install ggimage only when it is missing, so re-running the document does not reinstall it
if (!requireNamespace("ggimage", quietly = TRUE)) {
  install.packages("ggimage")
}
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.5'
## (as 'lib' is unspecified)
library(ggplot2)
library(dplyr)
library(maps)
library(ggimage)
# Same five cities as the bubble map, each paired with the URL of the icon drawn at its location
df <- tibble(
  city = c("New York", "D.C", "Houston", "Las Vegas", "Seattle"),
  lon = c(-73.935242, -77.009056, -95.358421, -115.176468, -122.335167),
  lat = c(40.730610, 38.889805, 29.749907, 36.188110, 47.608013),
  value = c(50, 80, 60, 40, 30),
  image = c(
    "https://www.freeiconspng.com/uploads/number-two-icon-25.png",
    "https://pngimg.com/uploads/number1/number1_PNG14890.png",
    "https://pngimg.com/d/number3_PNG14967.png",
    "https://pngimg.com/d/number4_PNG15014.png",
    "https://pngimg.com/d/number5_PNG15086.png"
  )
)
# State outlines used as the base layer
US <- map_data("state")
# No size scale is needed here: the icons use one fixed size and no layer maps a size aesthetic
ggplot() +
  geom_polygon(data = US, aes(long, lat, group = group), fill = "gray95", color = "white") +
  geom_image(data = df, aes(lon, lat, image = image), size = 0.07) +
  geom_text(data = df, aes(lon, lat, label = city), nudge_y = 2, size = 3) +
  coord_fixed(1.3) +
  labs(
    title = "Bubble Map of Large US Cities"
  ) +
  theme_void()
The labeling style changes the way the information is presented: instead
of using color coding, it is now numbered, allowing some people to
understand the graph easily. This numbering system also allows you to
remove the legend; however, the data is now less backed up, as we simply put
images on the coordinates of each city.

### Part 4. Basic Sankey Diagram
You may invent a small dataset if needed. Keep it simple.
Example ideas
Build one Sankey diagram.
Requirements
Keep it basic: It does not need to be interactive or highly customized. The goal is to understand the structure.
# Install ggalluvial only when it is missing, so re-running the document does not reinstall it
if (!requireNamespace("ggalluvial", quietly = TRUE)) {
  install.packages("ggalluvial")
}
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.5'
## (as 'lib' is unspecified)
library(ggplot2)
library(dplyr)
library(ggalluvial)
# Invented data: patient counts for each diagnosis / treatment pairing
df <- tibble(
  diagnosis = c(
    "cardiac", "cardiac", "cardiac",
    "respiratory", "respiratory", "respiratory",
    "neurological", "neurological", "neurological"
  ),
  treatment = c(
    "surgery", "surgery", "surgery",
    "therapy", "therapy", "therapy",
    "medication", "medication", "medication"
  ),
  patients = c(10, 20, 30, 40, 50, 60, 70, 80, 90)
)
# Two-axis alluvial (Sankey-style) plot: diagnosis on the left, treatment on the
# right, flow thickness given by the number of patients
ggplot(df, aes(axis1 = diagnosis, axis2 = treatment, y = patients)) +
  geom_alluvium(aes(fill = diagnosis), width = 1 / 10, alpha = 0.5) +
  geom_stratum(width = 1 / 8, fill = "white", color = "black") +
  geom_label(stat = "stratum", aes(label = after_stat(stratum)), size = 2) +
  # Brewer palette names are case-sensitive: "Set2", not "set2"
  scale_fill_brewer(palette = "Set2") +
  scale_x_discrete(limits = c("diagnosis", "treatment"), expand = c(.1, .1)) +
  labs(
    title = "Sankey Diagram",
    caption = "Flow in this case means the number of patients that received their proper treatment type."
  ) +
  theme_classic()
Create one hierarchical edge bundling graph. You may use an example dataset from a package, adapt class example code, or create a very small hierarchy yourself.
Build one edge bundling graph.
Requirements
Note: This is meant to be an introduction, not a perfect polished figure. It is okay to rely on a tutorial example and then make small changes.
library(tidyverse)
library(ggraph)
library(igraph)
library(viridis)
# Hierarchy edges: a root with three groups, each group holding two leaves
hierarchy <- tribble(
  ~from, ~to,
  "root", "Group_A",
  "root", "Group_B",
  "root", "Group_C",
  "Group_A", "A1",
  "Group_A", "A2",
  "Group_B", "B1",
  "Group_B", "B2",
  "Group_C", "C1",
  "Group_C", "C2"
)
# Connections to bundle, given as pairs of leaf names
links <- tribble(
  ~from, ~to,
  "A1", "B2",
  "A2", "C1",
  "B1", "C2"
)
tree <- graph_from_data_frame(hierarchy, directed = TRUE)
ggraph(tree, layout = "dendrogram", circular = TRUE) +
  geom_edge_diagonal(edge_alpha = 0.5, edge_colour = "lightgreen") +
  geom_conn_bundle(
    # get_con() needs integer vertex positions, not names, and takes no graph
    # argument; the leaf names are therefore looked up among the graph's vertices
    data = get_con(
      from = match(links$from, V(tree)$name),
      to = match(links$to, V(tree)$name)
    ),
    # `index` is computed by the stat and runs along each bundled connection
    aes(colour = after_stat(index)),
    edge_alpha = 0.7,
    edge_width = 1
  ) +
  # Edge colours are controlled by ggraph's edge scales, not by scale_color_*()
  scale_edge_colour_viridis(option = "C") +
  geom_node_point(size = 2) +
  geom_node_text(aes(label = name), repel = TRUE, size = 2) +
  theme_void()
I'm not understanding where the error is coming from, however. A hierarchy is used to show the relationship between variables using the lines (trees/branches). The edges represent the variables and how they connect with each other. Edge bundling helps clear the figure and make it more readable, as the groups with connections are sorted near each other, removing clutter. This hierarchy is simply an example of one connecting points.