suppressPackageStartupMessages( require(oetteR) )
suppressPackageStartupMessages( require(tidyverse) )

1 Introduction

Alluvial plots can be a powerful tool to visualise categorical data. They group observations that have similar values across a set of dimensions and visualise them as flows. The individual flows can be emphasised through different colouring methods.

2 Alluvial Plots

2.1 Visualize Data in a tidy data format

For this data format the f_plot_alluvial function is suitable. Also see the help documentation and the examples of that function.

2.1.1 Data


# Run mtcars through f_clean_data(); the returned list is used below for its
# `data`, `categoricals` and `numericals` elements.
data_ls = f_clean_data( mtcars )
#> [1] "Number of excluded observations: 0"

# Plot settings: the first three categorical and the first three numerical
# columns are offered to the plot, while max_variables is set to 5.
max_variables = 5
variables = c( data_ls$categoricals[1:3], data_ls$numericals[1:3] )
data_tidy = data_ls$data

knitr::kable( head(data_tidy, 10) )

mpg	cyl	disp	hp	drat	wt	qsec	vs	am	gear	carb
21.0	6	160.0	110	3.90	2.620	16.46	0	1	4	4
21.0	6	160.0	110	3.90	2.875	17.02	0	1	4	4
22.8	4	108.0	93	3.85	2.320	18.61	1	1	4	1
21.4	6	258.0	110	3.08	3.215	19.44	1	0	3	1
18.7	8	360.0	175	3.15	3.440	17.02	0	0	3	2
18.1	6	225.0	105	2.76	3.460	20.22	1	0	3	1
14.3	8	360.0	245	3.21	3.570	15.84	0	0	3	4
24.4	4	146.7	62	3.69	3.190	20.00	1	0	4	2
22.8	4	140.8	95	3.92	3.150	22.90	1	0	4	2
19.2	6	167.6	123	3.92	3.440	18.30	1	0	4	4

2.1.2 Colouring


# The four calls below are identical except for the colouring method
# selected through `fill_by`.
f_plot_alluvial(
  data = data_tidy,
  variables = variables,
  max_variables = max_variables,
  fill_by = 'first_variable'
)
#> [1] "Number of flows: 16"
#> [1] "Original Dataframe reduced to 50 %"
#> [1] "Maximum weight of a singfle flow 15.6 %"


f_plot_alluvial(
  data = data_tidy,
  variables = variables,
  max_variables = max_variables,
  fill_by = 'last_variable'
)
#> [1] "Number of flows: 16"
#> [1] "Original Dataframe reduced to 50 %"
#> [1] "Maximum weight of a singfle flow 15.6 %"


f_plot_alluvial(
  data = data_tidy,
  variables = variables,
  max_variables = max_variables,
  fill_by = 'all_flows'
)
#> [1] "Number of flows: 16"
#> [1] "Original Dataframe reduced to 50 %"
#> [1] "Maximum weight of a singfle flow 15.6 %"


f_plot_alluvial(
  data = data_tidy,
  variables = variables,
  max_variables = max_variables,
  fill_by = 'values'
)
#> [1] "Number of flows: 16"
#> [1] "Original Dataframe reduced to 50 %"
#> [1] "Maximum weight of a singfle flow 15.6 %"

2.1.3 Ordering

The order of the variables on the x axis is determined by the parameter variables. The order of any y values can be changed using the order_levels argument. Simply pass the values you want to reorder as a character vector.


# Same plot as before, with the y values '1' and '0' reordered through
# `order_levels`.
f_plot_alluvial(
  data = data_tidy,
  variables = variables,
  max_variables = max_variables,
  fill_by = 'values',
  order_levels = c('1', '0')
)
#> [1] "Number of flows: 16"
#> [1] "Original Dataframe reduced to 50 %"
#> [1] "Maximum weight of a singfle flow 15.6 %"

2.2 Visualize data in the ‘gathered’ format

2.2.1 Data

Here we have more than one row for each observation, and measurements that belong to the same group, such as the mean arrival delay, are gathered in one column, which is indexed by the quarter column (`qu`). In an alluvial plot we might want to add another independent variable for coloring, in this case `carrier`.


# Build the 'gathered' example data: keep only aircraft/route/carrier
# combinations that have flights in all 12 months, then label each of them
# per quarter as 'on_time' or 'late' based on its mean arrival delay.
monthly_flights = nycflights13::flights %>%
  # one row per month and aircraft/route/carrier combination ...
  distinct(month, tailnum, origin, dest, carrier) %>%
  # ... so that n is the number of distinct months with at least one flight
  count(tailnum, origin, dest, carrier) %>%
  filter(n == 12) %>%
  select(-n) %>%
  # explicit keys: the same four columns a natural join would pick, but
  # without the join message and robust against future column additions
  left_join(nycflights13::flights, by = c('tailnum', 'origin', 'dest', 'carrier')) %>%
  # drop flights with a missing value in any column, so mean(arr_delay)
  # below never returns NA
  drop_na() %>%
  mutate(
    # paste() is vectorised, so no row-wise pmap_chr() is needed; the combined
    # string is what the plots below use as observation id (col_id)
    tailnum = paste(tailnum, origin, dest, carrier),
    # four equal-width bins over months 1-12 (1-3, 4-6, 7-9, 10-12),
    # labelled directly instead of overwriting the levels afterwards
    qu = cut(month, 4, labels = c('Q1', 'Q2', 'Q3', 'Q4'))
  ) %>%
  group_by(tailnum, carrier, origin, dest, qu) %>%
  summarise(mean_arr_delay = mean(arr_delay)) %>%
  ungroup() %>%
  # a mean arrival delay below 10 counts as on time
  mutate(mean_arr_delay = ifelse(mean_arr_delay < 10, 'on_time', 'late'))

data_gath = monthly_flights

head(data_gath, 10) %>%
  knitr::kable()

tailnum	carrier	origin	dest	qu	mean_arr_delay
N0EGMQ LGA BNA MQ	MQ	LGA	BNA	Q1	on_time
N0EGMQ LGA BNA MQ	MQ	LGA	BNA	Q2	on_time
N0EGMQ LGA BNA MQ	MQ	LGA	BNA	Q3	on_time
N0EGMQ LGA BNA MQ	MQ	LGA	BNA	Q4	on_time
N11150 EWR MCI EV	EV	EWR	MCI	Q1	late
N11150 EWR MCI EV	EV	EWR	MCI	Q2	late
N11150 EWR MCI EV	EV	EWR	MCI	Q3	on_time
N11150 EWR MCI EV	EV	EWR	MCI	Q4	late
N12125 EWR LAX UA	UA	EWR	LAX	Q1	on_time
N12125 EWR LAX UA	UA	EWR	LAX	Q2	on_time


# Column roles reused by the f_plot_alluvial_1v1() calls that follow.
col_id = 'tailnum'
col_x = 'qu'
col_y = 'mean_arr_delay'
col_fill = 'carrier'

2.2.2 Coloring

2.2.2.1 color by independent variable `carrier`


# Colour the flows by the independent variable stored in `col_fill`.
# `col_fill` is passed by name: a bare positional argument after three named
# ones only works as long as col_fill happens to be the next unmatched formal.
f_plot_alluvial_1v1( data_gath, col_x = 'qu', col_y = 'mean_arr_delay', col_id = 'tailnum', col_fill = col_fill )
#> [1] "Number of flows: 108"
#> [1] "Original Dataframe reduced to 26.9 %"
#> [1] "Maximum weight of a singfle flow 9.2 %"

2.2.2.1.1 Move colored independent variable to the left


# Same plot with `fill_right` switched off. FALSE is spelled out because `F`
# is an ordinary variable that can be reassigned.
f_plot_alluvial_1v1( data_gath, col_x, col_y, col_id, col_fill, fill_right = FALSE )
#> [1] "Number of flows: 108"
#> [1] "Original Dataframe reduced to 26.9 %"
#> [1] "Maximum weight of a singfle flow 9.2 %"

2.2.2.2 Other coloring options


# Without col_fill: the four calls differ only in the `fill_by` method.
f_plot_alluvial_1v1(
  data_gath,
  col_x = 'qu',
  col_y = 'mean_arr_delay',
  col_id = 'tailnum',
  fill_by = 'last_variable'
)
#> [1] "Number of flows: 16"
#> [1] "Original Dataframe reduced to 4 %"
#> [1] "Maximum weight of a singfle flow 32.3 %"

f_plot_alluvial_1v1(
  data_gath,
  col_x = 'qu',
  col_y = 'mean_arr_delay',
  col_id = 'tailnum',
  fill_by = 'first_variable'
)
#> [1] "Number of flows: 16"
#> [1] "Original Dataframe reduced to 4 %"
#> [1] "Maximum weight of a singfle flow 32.3 %"

f_plot_alluvial_1v1(
  data_gath,
  col_x = 'qu',
  col_y = 'mean_arr_delay',
  col_id = 'tailnum',
  fill_by = 'all_flows'
)
#> [1] "Number of flows: 16"
#> [1] "Original Dataframe reduced to 4 %"
#> [1] "Maximum weight of a singfle flow 32.3 %"

f_plot_alluvial_1v1(
  data_gath,
  col_x = 'qu',
  col_y = 'mean_arr_delay',
  col_id = 'tailnum',
  fill_by = 'value'
)
#> [1] "Number of flows: 16"
#> [1] "Original Dataframe reduced to 4 %"
#> [1] "Maximum weight of a singfle flow 32.3 %"

2.2.3 Ordering

2.2.3.1 Y levels of dependent variable

# Reorder the levels of the dependent (y) variable via `order_levels_y`.
f_plot_alluvial_1v1(
  data_gath, col_x, col_y, col_id,
  fill_by = 'first_variable',
  order_levels_y = c('on_time', 'late')
)
#> [1] "Number of flows: 16"
#> [1] "Original Dataframe reduced to 4 %"
#> [1] "Maximum weight of a singfle flow 32.3 %"

2.2.3.2 Y levels of independent color variable `carrier`


# Carriers sorted by their number of rows in data_gath, largest first.
order_by_carrier_size = data_gath %>%
  count(carrier, sort = TRUE) %>%
  pull(carrier)

# Use that ordering for the levels of the fill variable.
f_plot_alluvial_1v1(
  data_gath, col_x, col_y, col_id, col_fill,
  order_levels_fill = order_by_carrier_size
)
#> [1] "Number of flows: 108"
#> [1] "Original Dataframe reduced to 26.9 %"
#> [1] "Maximum weight of a singfle flow 9.2 %"

2.2.3.3 X levels

# Reverse the quarters on the x axis via `order_levels_x`.
f_plot_alluvial_1v1(
  data_gath, col_x, col_y, col_id,
  fill_by = 'first_variable',
  order_levels_x = c('Q4', 'Q3', 'Q2', 'Q1')
)
#> [1] "Number of flows: 16"
#> [1] "Original Dataframe reduced to 4 %"
#> [1] "Maximum weight of a singfle flow 32.3 %"

2.3 General

2.3.1 Change the colors

Any color palette can be passed to both functions.


# Gathered data: reversed 9-colour Brewer palettes passed as
# col_vector_flow and col_vector_value.
f_plot_alluvial_1v1(
  data_gath, col_x, col_y, col_id,
  fill_by = 'last_variable',
  col_vector_flow = rev( RColorBrewer::brewer.pal(9, 'Purples') ),
  col_vector_value = rev( RColorBrewer::brewer.pal(9, 'Oranges') )
)
#> [1] "Number of flows: 16"
#> [1] "Original Dataframe reduced to 4 %"
#> [1] "Maximum weight of a singfle flow 32.3 %"


# Tidy data: the same two arguments with different palettes.
f_plot_alluvial(
  data = data_tidy,
  variables = variables,
  max_variables = max_variables,
  fill_by = 'first_variable',
  col_vector_flow = rev( RColorBrewer::brewer.pal(9, 'Reds') ),
  col_vector_value = rev( RColorBrewer::brewer.pal(9, 'Greens') )
)
#> [1] "Number of flows: 16"
#> [1] "Original Dataframe reduced to 50 %"
#> [1] "Maximum weight of a singfle flow 15.6 %"

2.3.2 Missing Data

Missing data will automatically be labeled as NA. The label can be changed, and it can be ordered as usual.

2.3.2.1 Gathered Data


# Keep only the id, x and y columns and a random 90 % of the rows. The sample
# is not seeded, so the rows removed differ between runs.
data = sample_frac( select(data_gath, tailnum, qu, mean_arr_delay), 0.9 )

# Missing values are labelled 'none', and 'none' is also passed to
# order_levels_y.
f_plot_alluvial_1v1(
  data, col_x, col_y, col_id,
  fill_by = 'last_variable',
  NA_label = 'none',
  order_levels_y = 'none'
)
#> [1] "Number of flows: 60"
#> [1] "Original Dataframe reduced to 16.6 %"
#> [1] "Maximum weight of a singfle flow 23.5 %"

2.3.2.2 Tidy Data


# Copy the tidy data and blank out cyl in the first four rows.
data = data_tidy
data[['cyl']][1:4] = NA

# Missing values are labelled 'none', and 'none' is also passed to
# order_levels.
f_plot_alluvial(
  data = data,
  variables = variables,
  max_variables = max_variables,
  fill_by = 'first_variable',
  NA_label = 'none',
  order_levels = 'none'
)
#> [1] "Number of flows: 18"
#> [1] "Original Dataframe reduced to 56.2 %"
#> [1] "Maximum weight of a singfle flow 15.6 %"

2.3.3 Connect Flows to observations in original data


# Path to the regression-model vignette HTML below the installed oetteR
# package directory.
link = file.path(
  system.file(package = 'oetteR'),
  'Rmd vignettes',
  'vignette_visualising_regression_models.html'
)

The plot objects returned by both functions have an attribute called data_key, which is an x-y table arranged like the alluvial plot, with one column containing the original ID. See the vignette on visualising regression models for an example of how this is used effectively.

2.3.3.1 tidy data


# Keep the plot object so its data_key can be inspected.
p = f_plot_alluvial(
  data = data_tidy,
  variables = variables,
  max_variables = max_variables,
  fill_by = 'first_variable'
)
#> [1] "Number of flows: 16"
#> [1] "Original Dataframe reduced to 50 %"
#> [1] "Maximum weight of a singfle flow 15.6 %"

# First ten rows of the key table.
knitr::kable( head(p$data_key, 10) )

ID	cyl	vs	am	mpg	disp	alluvial_id	n
27	4	0	1	MH	LL	1	1
21	4	1	0	MH	LL	2	1
6	6	1	0	M	M	8	1
4	6	1	0	MH	M	9	1
24	8	0	0	LL	MH	10	1
17	8	0	0	ML	HH	13	1
25	8	0	0	M	HH	15	1
8	4	1	0	MH	ML	3	2
9	4	1	0	MH	ML	3	2
3	4	1	1	MH	LL	4	2

2.3.3.2 gathered data


# Keep the plot object so its data_key can be inspected.
p = f_plot_alluvial_1v1(
  data_gath,
  col_x = 'qu',
  col_y = 'mean_arr_delay',
  col_id = 'tailnum',
  col_fill = 'carrier'
)
#> [1] "Number of flows: 108"
#> [1] "Original Dataframe reduced to 26.9 %"
#> [1] "Maximum weight of a singfle flow 9.2 %"


# First ten rows of the key table.
knitr::kable( head(p$data_key, 10) )

tailnum	carrier	Q1	Q2	Q3	Q4	n	alluvial_id
N0EGMQ LGA BNA MQ	MQ	on_time	on_time	on_time	on_time	2	82
N11150 EWR MCI EV	EV	late	late	on_time	late	4	46
N12125 EWR LAX UA	UA	on_time	on_time	on_time	on_time	5	91
N13716 EWR SNA UA	UA	on_time	on_time	on_time	on_time	5	91
N14102 EWR MCO UA	UA	on_time	on_time	on_time	on_time	5	91
N14120 EWR MCO UA	UA	late	on_time	on_time	on_time	2	86
N161UW LGA CLT US	US	on_time	on_time	late	on_time	11	100
N169UW EWR CLT US	US	on_time	late	late	on_time	7	96
N169UW LGA CLT US	US	on_time	on_time	on_time	on_time	37	102
N17122 EWR LAX UA	UA	on_time	on_time	on_time	on_time	5	91

2.3.4 The Plot objects can be manipulated like any other ggplot object

2.3.4.1 Flip


# Add coord_flip() to the plot object, then print it.
p = p + coord_flip()

p

2.3.4.2 Title


# Add a title to the plot object, then print it.
p = p + ggtitle('Look at my flip')

p

2.3.4.3 Repel Labels

Unfortunately, this does not work yet.


# NOTE(review): `%>%` hands `p` to geom_text_repel() as its first argument,
# which, judging by the error below, is expected to be an aes() mapping.
# ggplot layers are normally added with `+`; whether
# `p + ggrepel::geom_text_repel()` renders correctly for this plot has not
# been verified.
p = p %>%
  ggrepel::geom_text_repel()
#> Error: Mapping must be created by `aes()` or `aes_()`

Alluvial Plots

Bjoern Oettinghaus

2018-03-02

1 Introduction

2 Alluvial Plots

2.1 Visualize Data in a tidy data format

2.1.1 Data

2.1.2 Colouring

2.1.3 Ordering

2.2 Visualize data in the ‘gathered’ format

2.2.1 Data

2.2.2 Coloring

2.2.2.1 color by independent variable `carrier`

2.2.2.1.1 Move colored independent variable to the left

2.2.2.2 Other coloring options

2.2.3 Ordering

2.2.3.1 Y levels of dependent variable

2.2.3.2 Y levels of independent color variable `carrier`

2.2.3.3 X levels

2.3 General

2.3.1 Change the colors

2.3.2 Missing Data

2.3.2.1 Gathered Data

2.3.2.2 Tidy Data

2.3.3 Connect Flows to observations in original data

2.3.3.1 tidy data

2.3.3.2 gathered data

2.3.4 The Plot objects can be manipulated like any other ggplot object

2.3.4.1 Flip

2.3.4.2 Title

2.3.4.3 Repel Labels

ID	cyl	vs	am	mpg	disp	alluvial_id	n
27	4	0	1	MH	LL	1	1
21	4	1	0	MH	LL	2	1
6	6	1	0	M	M	8	1
4	6	1	0	MH	M	9	1
24	8	0	0	LL	MH	10	1
17	8	0	0	ML	HH	13	1
25	8	0	0	M	HH	15	1
8	4	1	0	MH	ML	3	2
9	4	1	0	MH	ML	3	2
3	4	1	1	MH	LL	4	2

ID	cyl	vs	am	mpg	disp	alluvial_id	n
27	4	0	1	MH	LL	1	1
21	4	1	0	MH	LL	2	1
6	6	1	0	M	M	8	1
4	6	1	0	MH	M	9	1
24	8	0	0	LL	MH	10	1
17	8	0	0	ML	HH	13	1
25	8	0	0	M	HH	15	1
8	4	1	0	MH	ML	3	2
9	4	1	0	MH	ML	3	2
3	4	1	1	MH	LL	4	2

Alluvial Plots

Bjoern Oettinghaus

2018-03-02

1 Introduction

2 Alluvial Plots

2.1 Visualize Data in a tidy data format

2.1.1 Data

2.1.2 Colouring

2.1.3 Ordering

2.2 Visualize data in the ‘gathered’ format

2.2.1 Data

2.2.2 Coloring

2.2.2.1 color by independent variable carrier

2.2.2.1.1 Move colored independent variable to the left

2.2.2.2 Other coloring options

2.2.3 Ordering

2.2.3.1 Y levels of dependent variable

2.2.3.2 Y levels of independent color variable carrier

2.2.3.3 X levels

2.3 General

2.3.1 Change the colors

2.3.2 Missing Data

2.3.2.1 Gathered Data

2.3.2.2 Tidy Data

2.3.3 Connect Flows to observations in original data

2.3.3.1 tidy data

2.3.3.2 gathered data

2.3.4 The Plot objects can be manipulated like any other ggplot object

2.3.4.1 Flip

2.3.4.2 Title

2.3.4.3 Repel Labels

2.2.2.1 color by independent variable `carrier`

2.2.3.2 Y levels of independent color variable `carrier`

ID	cyl	vs	am	mpg	disp	alluvial_id	n
27	4	0	1	MH	LL	1	1
21	4	1	0	MH	LL	2	1
6	6	1	0	M	M	8	1
4	6	1	0	MH	M	9	1
24	8	0	0	LL	MH	10	1
17	8	0	0	ML	HH	13	1
25	8	0	0	M	HH	15	1
8	4	1	0	MH	ML	3	2
9	4	1	0	MH	ML	3	2
3	4	1	1	MH	LL	4	2