suppressPackageStartupMessages( require(oetteR) )
suppressPackageStartupMessages( require(tidyverse) )

1 Introduction

Alluvial Plots can be a powerfull tool to visualise categorical data. It will group observations that have similar values across a set of dimensions and visualise them as flows. The individual flows can be emphasised through different colouring methods.

2 Alluvial Plots

2.1 Visualize Data in a tidy data format

For this dataformat the f_plot_alluvial function is suitable. Also see the help documentation and the examples of that function.

2.1.1 Data


data_ls = mtcars %>%
  f_clean_data()
#> [1] "Number of excluded observations: 0"

data_tidy = data_ls$data
max_variables = 5
variables = c( data_ls$categoricals[1:3], data_ls$numericals[1:3] )

head(data_tidy, 10) %>%
  knitr::kable()
mpg cyl disp hp drat wt qsec vs am gear carb
21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4

2.1.2 Colouring


f_plot_alluvial( data = data_tidy
                , variables = variables
                , max_variables = max_variables
                , fill_by = 'first_variable' )
#> [1] "Number of flows: 16"
#> [1] "Original Dataframe reduced to 50 %"
#> [1] "Maximum weight of a singfle flow 15.6 %"


f_plot_alluvial( data = data_tidy
                , variables = variables
                , max_variables = max_variables
                , fill_by = 'last_variable' )
#> [1] "Number of flows: 16"
#> [1] "Original Dataframe reduced to 50 %"
#> [1] "Maximum weight of a singfle flow 15.6 %"


f_plot_alluvial( data = data_tidy
                , variables = variables
                , max_variables = max_variables
                , fill_by = 'all_flows' )
#> [1] "Number of flows: 16"
#> [1] "Original Dataframe reduced to 50 %"
#> [1] "Maximum weight of a singfle flow 15.6 %"


f_plot_alluvial( data = data_tidy
                , variables = variables
                , max_variables = max_variables
                , fill_by = 'values' )
#> [1] "Number of flows: 16"
#> [1] "Original Dataframe reduced to 50 %"
#> [1] "Maximum weight of a singfle flow 15.6 %"

2.1.3 Ordering

The order of the variables on the x axis is determined by the parameter variables. The order of any y values can be changed using the oder_levels argument. Simply pass the values you want to reorder as a character vector.


f_plot_alluvial( data = data_tidy
                , variables = variables
                , max_variables = max_variables
                , fill_by = 'values'
                , order_levels = c('1', '0') )
#> [1] "Number of flows: 16"
#> [1] "Original Dataframe reduced to 50 %"
#> [1] "Maximum weight of a singfle flow 15.6 %"

2.2 Visualize data in the ‘gathered’ format

2.2.1 Data

Here we have more than one row for each observation and measurements that belong to the same group such as mean arrival delay is gathered in one column, which is indexed by the quarter column. In an alluvial Plot we might want to add another independent variable for coloring like in this case carrier.


monthly_flights = nycflights13::flights %>%
  group_by(month, tailnum, origin, dest, carrier) %>%
  summarise() %>%
  group_by( tailnum, origin, dest, carrier) %>%
  count() %>%
  filter( n == 12 ) %>%
  select( - n ) %>%
  left_join( nycflights13::flights ) %>%
  .[complete.cases(.), ] %>%
  ungroup() %>%
  mutate( tailnum = pmap_chr(list(tailnum, origin, dest, carrier), paste )
          , qu = cut(month, 4)) %>%
  group_by(tailnum, carrier, origin, dest, qu ) %>%
  summarise( mean_arr_delay = mean(arr_delay) ) %>%
  ungroup() %>%
  mutate( mean_arr_delay = ifelse( mean_arr_delay < 10, 'on_time', 'late' ) )

levels(monthly_flights$qu) = c('Q1', 'Q2', 'Q3', 'Q4')

data_gath = monthly_flights

head(data_gath, 10) %>%
knitr::kable()
tailnum carrier origin dest qu mean_arr_delay
N0EGMQ LGA BNA MQ MQ LGA BNA Q1 on_time
N0EGMQ LGA BNA MQ MQ LGA BNA Q2 on_time
N0EGMQ LGA BNA MQ MQ LGA BNA Q3 on_time
N0EGMQ LGA BNA MQ MQ LGA BNA Q4 on_time
N11150 EWR MCI EV EV EWR MCI Q1 late
N11150 EWR MCI EV EV EWR MCI Q2 late
N11150 EWR MCI EV EV EWR MCI Q3 on_time
N11150 EWR MCI EV EV EWR MCI Q4 late
N12125 EWR LAX UA UA EWR LAX Q1 on_time
N12125 EWR LAX UA UA EWR LAX Q2 on_time

col_x = 'qu'
col_y = 'mean_arr_delay'
col_fill = 'carrier'
col_id = 'tailnum'

2.2.2 Coloring

2.2.2.1 color by independent variable carrier


f_plot_alluvial_1v1( data_gath, col_x = 'qu', col_y = 'mean_arr_delay', col_id = 'tailnum', col_fill )
#> [1] "Number of flows: 108"
#> [1] "Original Dataframe reduced to 26.9 %"
#> [1] "Maximum weight of a singfle flow 9.2 %"

2.2.2.1.1 Move colored independent variable to the left

f_plot_alluvial_1v1( data_gath, col_x, col_y, col_id, col_fill, fill_right = F )
#> [1] "Number of flows: 108"
#> [1] "Original Dataframe reduced to 26.9 %"
#> [1] "Maximum weight of a singfle flow 9.2 %"

2.2.2.2 Other coloring options


f_plot_alluvial_1v1( data_gath, col_x = 'qu', col_y = 'mean_arr_delay', col_id = 'tailnum', fill_by = 'last_variable' )
#> [1] "Number of flows: 16"
#> [1] "Original Dataframe reduced to 4 %"
#> [1] "Maximum weight of a singfle flow 32.3 %"

f_plot_alluvial_1v1( data_gath, col_x = 'qu', col_y = 'mean_arr_delay', col_id = 'tailnum', fill_by = 'first_variable' )
#> [1] "Number of flows: 16"
#> [1] "Original Dataframe reduced to 4 %"
#> [1] "Maximum weight of a singfle flow 32.3 %"

f_plot_alluvial_1v1( data_gath, col_x = 'qu', col_y = 'mean_arr_delay', col_id = 'tailnum', fill_by = 'all_flows' )
#> [1] "Number of flows: 16"
#> [1] "Original Dataframe reduced to 4 %"
#> [1] "Maximum weight of a singfle flow 32.3 %"

f_plot_alluvial_1v1( data_gath, col_x = 'qu', col_y = 'mean_arr_delay', col_id = 'tailnum', fill_by = 'value' )
#> [1] "Number of flows: 16"
#> [1] "Original Dataframe reduced to 4 %"
#> [1] "Maximum weight of a singfle flow 32.3 %"

2.2.3 Ordering

2.2.3.1 Y levels of dependent variable

f_plot_alluvial_1v1( data_gath, col_x, col_y, col_id, fill_by = 'first_variable'
                     , order_levels_y = c('on_time', 'late') )
#> [1] "Number of flows: 16"
#> [1] "Original Dataframe reduced to 4 %"
#> [1] "Maximum weight of a singfle flow 32.3 %"

2.2.3.2 Y levels of independent color variable carrier


order_by_carrier_size = data_gath %>%
  group_by(carrier) %>%
  count() %>%
  arrange( desc(n) ) %>%
  .[['carrier']]

f_plot_alluvial_1v1( data_gath, col_x, col_y, col_id, col_fill
                     , order_levels_fill = order_by_carrier_size )
#> [1] "Number of flows: 108"
#> [1] "Original Dataframe reduced to 26.9 %"
#> [1] "Maximum weight of a singfle flow 9.2 %"

2.2.3.3 X levels

f_plot_alluvial_1v1( data_gath, col_x, col_y, col_id, fill_by = 'first_variable'
                     , order_levels_x = c('Q4', 'Q3', 'Q2', 'Q1') )
#> [1] "Number of flows: 16"
#> [1] "Original Dataframe reduced to 4 %"
#> [1] "Maximum weight of a singfle flow 32.3 %"

2.3 General

2.3.1 Change the colors

Any color palette can be passed to both functions.


f_plot_alluvial_1v1( data_gath, col_x, col_y, col_id, fill_by = 'last_variable'
                     , col_vector_flow = rev( RColorBrewer::brewer.pal(9, 'Purples') )
                     , col_vector_value = rev( RColorBrewer::brewer.pal(9, 'Oranges') ) )
#> [1] "Number of flows: 16"
#> [1] "Original Dataframe reduced to 4 %"
#> [1] "Maximum weight of a singfle flow 32.3 %"


f_plot_alluvial( data = data_tidy
                , variables = variables
                , max_variables = max_variables
                , fill_by = 'first_variable' 
                , col_vector_flow = rev( RColorBrewer::brewer.pal(9, 'Reds') )
                , col_vector_value = rev( RColorBrewer::brewer.pal(9, 'Greens') )
                )
#> [1] "Number of flows: 16"
#> [1] "Original Dataframe reduced to 50 %"
#> [1] "Maximum weight of a singfle flow 15.6 %"

2.3.2 Missing Data

Missing Data will automatically be labeled as NA. The label can be changed and it can be ordered as usual

2.3.2.1 Gathered Data


data = data_gath %>%
  select(tailnum, qu, mean_arr_delay) %>%
  sample_frac(0.9)

f_plot_alluvial_1v1( data, col_x, col_y, col_id, fill_by = 'last_variable'
                         , NA_label = 'none'
                         , order_levels_y = 'none')
#> [1] "Number of flows: 60"
#> [1] "Original Dataframe reduced to 16.6 %"
#> [1] "Maximum weight of a singfle flow 23.5 %"

2.3.2.2 Tidy Data


data = data_tidy
data$cyl[1:4] = NA

f_plot_alluvial( data = data
                     , variables = variables
                     , max_variables = max_variables
                     , fill_by = 'first_variable'
                     , NA_label = 'none'
                     , order_levels = 'none' )
#> [1] "Number of flows: 18"
#> [1] "Original Dataframe reduced to 56.2 %"
#> [1] "Maximum weight of a singfle flow 15.6 %"

2.3.3 Connect Flows to observations in original data


link = file.path( system.file(package = 'oetteR')
                  , 'Rmd vignettes'
                  , 'vignette_visualising_regression_models.html' )

The plot objects returned by both functions have an attribute called data_key which is an x-y table arranged like the alluvial plot one column containing the original ID. See vignette for visualising regression models for an example on how this is effectively used.

2.3.3.1 tidy data


p = f_plot_alluvial( data = data_tidy
                , variables = variables
                , max_variables = max_variables
                , fill_by = 'first_variable' )
#> [1] "Number of flows: 16"
#> [1] "Original Dataframe reduced to 50 %"
#> [1] "Maximum weight of a singfle flow 15.6 %"

p$data_key %>%
  head(10) %>%
  knitr::kable()
ID cyl vs am mpg disp alluvial_id n
27 4 0 1 MH LL 1 1
21 4 1 0 MH LL 2 1
6 6 1 0 M M 8 1
4 6 1 0 MH M 9 1
24 8 0 0 LL MH 10 1
17 8 0 0 ML HH 13 1
25 8 0 0 M HH 15 1
8 4 1 0 MH ML 3 2
9 4 1 0 MH ML 3 2
3 4 1 1 MH LL 4 2

2.3.3.2 gathered data


p = f_plot_alluvial_1v1( data_gath, col_x = 'qu', col_y = 'mean_arr_delay', col_id = 'tailnum', col_fill = 'carrier' )
#> [1] "Number of flows: 108"
#> [1] "Original Dataframe reduced to 26.9 %"
#> [1] "Maximum weight of a singfle flow 9.2 %"


p$data_key %>%
  head(10) %>%
  knitr::kable()
tailnum carrier Q1 Q2 Q3 Q4 n alluvial_id
N0EGMQ LGA BNA MQ MQ on_time on_time on_time on_time 2 82
N11150 EWR MCI EV EV late late on_time late 4 46
N12125 EWR LAX UA UA on_time on_time on_time on_time 5 91
N13716 EWR SNA UA UA on_time on_time on_time on_time 5 91
N14102 EWR MCO UA UA on_time on_time on_time on_time 5 91
N14120 EWR MCO UA UA late on_time on_time on_time 2 86
N161UW LGA CLT US US on_time on_time late on_time 11 100
N169UW EWR CLT US US on_time late late on_time 7 96
N169UW LGA CLT US US on_time on_time on_time on_time 37 102
N17122 EWR LAX UA UA on_time on_time on_time on_time 5 91

2.3.4 The Plot objects can be manipulated like any other ggplot object

2.3.4.1 Flip


p = p +
  coord_flip()

p

2.3.4.2 Title


p = p +
  ggtitle('Look at my flip')

p

2.3.4.3 Repel Labels

Unfortunately does not work yet


p = p %>%
  ggrepel::geom_text_repel()
#> Error: Mapping must be created by `aes()` or `aes_()`