suppressPackageStartupMessages( require(oetteR) )
suppressPackageStartupMessages( require(tidyverse) )
Alluvial Plots can be a powerfull tool to visualise categorical data. It will group observations that have similar values across a set of dimensions and visualise them as flows. The individual flows can be emphasised through different colouring methods.
For this dataformat the f_plot_alluvial function is suitable. Also see the help documentation and the examples of that function.
data_ls = mtcars %>%
f_clean_data()
#> [1] "Number of excluded observations: 0"
data_tidy = data_ls$data
max_variables = 5
variables = c( data_ls$categoricals[1:3], data_ls$numericals[1:3] )
head(data_tidy, 10) %>%
knitr::kable()
| mpg | cyl | disp | hp | drat | wt | qsec | vs | am | gear | carb |
|---|---|---|---|---|---|---|---|---|---|---|
| 21.0 | 6 | 160.0 | 110 | 3.90 | 2.620 | 16.46 | 0 | 1 | 4 | 4 |
| 21.0 | 6 | 160.0 | 110 | 3.90 | 2.875 | 17.02 | 0 | 1 | 4 | 4 |
| 22.8 | 4 | 108.0 | 93 | 3.85 | 2.320 | 18.61 | 1 | 1 | 4 | 1 |
| 21.4 | 6 | 258.0 | 110 | 3.08 | 3.215 | 19.44 | 1 | 0 | 3 | 1 |
| 18.7 | 8 | 360.0 | 175 | 3.15 | 3.440 | 17.02 | 0 | 0 | 3 | 2 |
| 18.1 | 6 | 225.0 | 105 | 2.76 | 3.460 | 20.22 | 1 | 0 | 3 | 1 |
| 14.3 | 8 | 360.0 | 245 | 3.21 | 3.570 | 15.84 | 0 | 0 | 3 | 4 |
| 24.4 | 4 | 146.7 | 62 | 3.69 | 3.190 | 20.00 | 1 | 0 | 4 | 2 |
| 22.8 | 4 | 140.8 | 95 | 3.92 | 3.150 | 22.90 | 1 | 0 | 4 | 2 |
| 19.2 | 6 | 167.6 | 123 | 3.92 | 3.440 | 18.30 | 1 | 0 | 4 | 4 |
f_plot_alluvial( data = data_tidy
, variables = variables
, max_variables = max_variables
, fill_by = 'first_variable' )
#> [1] "Number of flows: 16"
#> [1] "Original Dataframe reduced to 50 %"
#> [1] "Maximum weight of a singfle flow 15.6 %"
f_plot_alluvial( data = data_tidy
, variables = variables
, max_variables = max_variables
, fill_by = 'last_variable' )
#> [1] "Number of flows: 16"
#> [1] "Original Dataframe reduced to 50 %"
#> [1] "Maximum weight of a singfle flow 15.6 %"
f_plot_alluvial( data = data_tidy
, variables = variables
, max_variables = max_variables
, fill_by = 'all_flows' )
#> [1] "Number of flows: 16"
#> [1] "Original Dataframe reduced to 50 %"
#> [1] "Maximum weight of a singfle flow 15.6 %"
f_plot_alluvial( data = data_tidy
, variables = variables
, max_variables = max_variables
, fill_by = 'values' )
#> [1] "Number of flows: 16"
#> [1] "Original Dataframe reduced to 50 %"
#> [1] "Maximum weight of a singfle flow 15.6 %"
The order of the variables on the x axis is determined by the parameter variables. The order of any y values can be changed using the oder_levels argument. Simply pass the values you want to reorder as a character vector.
f_plot_alluvial( data = data_tidy
, variables = variables
, max_variables = max_variables
, fill_by = 'values'
, order_levels = c('1', '0') )
#> [1] "Number of flows: 16"
#> [1] "Original Dataframe reduced to 50 %"
#> [1] "Maximum weight of a singfle flow 15.6 %"
Here we have more than one row for each observation and measurements that belong to the same group such as mean arrival delay is gathered in one column, which is indexed by the quarter column. In an alluvial Plot we might want to add another independent variable for coloring like in this case carrier.
monthly_flights = nycflights13::flights %>%
group_by(month, tailnum, origin, dest, carrier) %>%
summarise() %>%
group_by( tailnum, origin, dest, carrier) %>%
count() %>%
filter( n == 12 ) %>%
select( - n ) %>%
left_join( nycflights13::flights ) %>%
.[complete.cases(.), ] %>%
ungroup() %>%
mutate( tailnum = pmap_chr(list(tailnum, origin, dest, carrier), paste )
, qu = cut(month, 4)) %>%
group_by(tailnum, carrier, origin, dest, qu ) %>%
summarise( mean_arr_delay = mean(arr_delay) ) %>%
ungroup() %>%
mutate( mean_arr_delay = ifelse( mean_arr_delay < 10, 'on_time', 'late' ) )
levels(monthly_flights$qu) = c('Q1', 'Q2', 'Q3', 'Q4')
data_gath = monthly_flights
head(data_gath, 10) %>%
knitr::kable()
| tailnum | carrier | origin | dest | qu | mean_arr_delay |
|---|---|---|---|---|---|
| N0EGMQ LGA BNA MQ | MQ | LGA | BNA | Q1 | on_time |
| N0EGMQ LGA BNA MQ | MQ | LGA | BNA | Q2 | on_time |
| N0EGMQ LGA BNA MQ | MQ | LGA | BNA | Q3 | on_time |
| N0EGMQ LGA BNA MQ | MQ | LGA | BNA | Q4 | on_time |
| N11150 EWR MCI EV | EV | EWR | MCI | Q1 | late |
| N11150 EWR MCI EV | EV | EWR | MCI | Q2 | late |
| N11150 EWR MCI EV | EV | EWR | MCI | Q3 | on_time |
| N11150 EWR MCI EV | EV | EWR | MCI | Q4 | late |
| N12125 EWR LAX UA | UA | EWR | LAX | Q1 | on_time |
| N12125 EWR LAX UA | UA | EWR | LAX | Q2 | on_time |
col_x = 'qu'
col_y = 'mean_arr_delay'
col_fill = 'carrier'
col_id = 'tailnum'
carrier
f_plot_alluvial_1v1( data_gath, col_x = 'qu', col_y = 'mean_arr_delay', col_id = 'tailnum', col_fill )
#> [1] "Number of flows: 108"
#> [1] "Original Dataframe reduced to 26.9 %"
#> [1] "Maximum weight of a singfle flow 9.2 %"
f_plot_alluvial_1v1( data_gath, col_x, col_y, col_id, col_fill, fill_right = F )
#> [1] "Number of flows: 108"
#> [1] "Original Dataframe reduced to 26.9 %"
#> [1] "Maximum weight of a singfle flow 9.2 %"
f_plot_alluvial_1v1( data_gath, col_x = 'qu', col_y = 'mean_arr_delay', col_id = 'tailnum', fill_by = 'last_variable' )
#> [1] "Number of flows: 16"
#> [1] "Original Dataframe reduced to 4 %"
#> [1] "Maximum weight of a singfle flow 32.3 %"
f_plot_alluvial_1v1( data_gath, col_x = 'qu', col_y = 'mean_arr_delay', col_id = 'tailnum', fill_by = 'first_variable' )
#> [1] "Number of flows: 16"
#> [1] "Original Dataframe reduced to 4 %"
#> [1] "Maximum weight of a singfle flow 32.3 %"
f_plot_alluvial_1v1( data_gath, col_x = 'qu', col_y = 'mean_arr_delay', col_id = 'tailnum', fill_by = 'all_flows' )
#> [1] "Number of flows: 16"
#> [1] "Original Dataframe reduced to 4 %"
#> [1] "Maximum weight of a singfle flow 32.3 %"
f_plot_alluvial_1v1( data_gath, col_x = 'qu', col_y = 'mean_arr_delay', col_id = 'tailnum', fill_by = 'value' )
#> [1] "Number of flows: 16"
#> [1] "Original Dataframe reduced to 4 %"
#> [1] "Maximum weight of a singfle flow 32.3 %"
f_plot_alluvial_1v1( data_gath, col_x, col_y, col_id, fill_by = 'first_variable'
, order_levels_y = c('on_time', 'late') )
#> [1] "Number of flows: 16"
#> [1] "Original Dataframe reduced to 4 %"
#> [1] "Maximum weight of a singfle flow 32.3 %"
carrier
order_by_carrier_size = data_gath %>%
group_by(carrier) %>%
count() %>%
arrange( desc(n) ) %>%
.[['carrier']]
f_plot_alluvial_1v1( data_gath, col_x, col_y, col_id, col_fill
, order_levels_fill = order_by_carrier_size )
#> [1] "Number of flows: 108"
#> [1] "Original Dataframe reduced to 26.9 %"
#> [1] "Maximum weight of a singfle flow 9.2 %"
f_plot_alluvial_1v1( data_gath, col_x, col_y, col_id, fill_by = 'first_variable'
, order_levels_x = c('Q4', 'Q3', 'Q2', 'Q1') )
#> [1] "Number of flows: 16"
#> [1] "Original Dataframe reduced to 4 %"
#> [1] "Maximum weight of a singfle flow 32.3 %"
Any color palette can be passed to both functions.
f_plot_alluvial_1v1( data_gath, col_x, col_y, col_id, fill_by = 'last_variable'
, col_vector_flow = rev( RColorBrewer::brewer.pal(9, 'Purples') )
, col_vector_value = rev( RColorBrewer::brewer.pal(9, 'Oranges') ) )
#> [1] "Number of flows: 16"
#> [1] "Original Dataframe reduced to 4 %"
#> [1] "Maximum weight of a singfle flow 32.3 %"
f_plot_alluvial( data = data_tidy
, variables = variables
, max_variables = max_variables
, fill_by = 'first_variable'
, col_vector_flow = rev( RColorBrewer::brewer.pal(9, 'Reds') )
, col_vector_value = rev( RColorBrewer::brewer.pal(9, 'Greens') )
)
#> [1] "Number of flows: 16"
#> [1] "Original Dataframe reduced to 50 %"
#> [1] "Maximum weight of a singfle flow 15.6 %"
Missing Data will automatically be labeled as NA. The label can be changed and it can be ordered as usual
data = data_gath %>%
select(tailnum, qu, mean_arr_delay) %>%
sample_frac(0.9)
f_plot_alluvial_1v1( data, col_x, col_y, col_id, fill_by = 'last_variable'
, NA_label = 'none'
, order_levels_y = 'none')
#> [1] "Number of flows: 60"
#> [1] "Original Dataframe reduced to 16.6 %"
#> [1] "Maximum weight of a singfle flow 23.5 %"
data = data_tidy
data$cyl[1:4] = NA
f_plot_alluvial( data = data
, variables = variables
, max_variables = max_variables
, fill_by = 'first_variable'
, NA_label = 'none'
, order_levels = 'none' )
#> [1] "Number of flows: 18"
#> [1] "Original Dataframe reduced to 56.2 %"
#> [1] "Maximum weight of a singfle flow 15.6 %"
link = file.path( system.file(package = 'oetteR')
, 'Rmd vignettes'
, 'vignette_visualising_regression_models.html' )
The plot objects returned by both functions have an attribute called data_key which is an x-y table arranged like the alluvial plot one column containing the original ID. See vignette for visualising regression models for an example on how this is effectively used.
p = f_plot_alluvial( data = data_tidy
, variables = variables
, max_variables = max_variables
, fill_by = 'first_variable' )
#> [1] "Number of flows: 16"
#> [1] "Original Dataframe reduced to 50 %"
#> [1] "Maximum weight of a singfle flow 15.6 %"
p$data_key %>%
head(10) %>%
knitr::kable()
| ID | cyl | vs | am | mpg | disp | alluvial_id | n |
|---|---|---|---|---|---|---|---|
| 27 | 4 | 0 | 1 | MH | LL | 1 | 1 |
| 21 | 4 | 1 | 0 | MH | LL | 2 | 1 |
| 6 | 6 | 1 | 0 | M | M | 8 | 1 |
| 4 | 6 | 1 | 0 | MH | M | 9 | 1 |
| 24 | 8 | 0 | 0 | LL | MH | 10 | 1 |
| 17 | 8 | 0 | 0 | ML | HH | 13 | 1 |
| 25 | 8 | 0 | 0 | M | HH | 15 | 1 |
| 8 | 4 | 1 | 0 | MH | ML | 3 | 2 |
| 9 | 4 | 1 | 0 | MH | ML | 3 | 2 |
| 3 | 4 | 1 | 1 | MH | LL | 4 | 2 |
p = f_plot_alluvial_1v1( data_gath, col_x = 'qu', col_y = 'mean_arr_delay', col_id = 'tailnum', col_fill = 'carrier' )
#> [1] "Number of flows: 108"
#> [1] "Original Dataframe reduced to 26.9 %"
#> [1] "Maximum weight of a singfle flow 9.2 %"
p$data_key %>%
head(10) %>%
knitr::kable()
| tailnum | carrier | Q1 | Q2 | Q3 | Q4 | n | alluvial_id |
|---|---|---|---|---|---|---|---|
| N0EGMQ LGA BNA MQ | MQ | on_time | on_time | on_time | on_time | 2 | 82 |
| N11150 EWR MCI EV | EV | late | late | on_time | late | 4 | 46 |
| N12125 EWR LAX UA | UA | on_time | on_time | on_time | on_time | 5 | 91 |
| N13716 EWR SNA UA | UA | on_time | on_time | on_time | on_time | 5 | 91 |
| N14102 EWR MCO UA | UA | on_time | on_time | on_time | on_time | 5 | 91 |
| N14120 EWR MCO UA | UA | late | on_time | on_time | on_time | 2 | 86 |
| N161UW LGA CLT US | US | on_time | on_time | late | on_time | 11 | 100 |
| N169UW EWR CLT US | US | on_time | late | late | on_time | 7 | 96 |
| N169UW LGA CLT US | US | on_time | on_time | on_time | on_time | 37 | 102 |
| N17122 EWR LAX UA | UA | on_time | on_time | on_time | on_time | 5 | 91 |
p = p +
coord_flip()
p
p = p +
ggtitle('Look at my flip')
p
Unfortunately does not work yet
p = p %>%
ggrepel::geom_text_repel()
#> Error: Mapping must be created by `aes()` or `aes_()`