# Chunk defaults: echo the code and centre every figure
knitr::opts_chunk$set(
  echo = TRUE,
  fig.align = "center"
)

# Load the tidyverse for use in this document
pacman::p_load(tidyverse)

# Make theme_test() the default theme, with plot titles centred at size 14
theme_set(
  theme_test() +
    theme(plot.title = element_text(hjust = 0.5, size = 14))
)
We’ll be using the penguins data set, which is in the palmerpenguins package:
# Load the palmerpenguins package, which provides the penguins data set
pacman::p_load(palmerpenguins)

# data() makes the data set appear in the global environment. Naming the
# package explicitly keeps the result from depending on search-path order:
# R >= 4.5 also ships its own `penguins` in the datasets package.
data(penguins, package = "palmerpenguins")
If we are working with a “raw” data set (unsummarized, each row
represents 1 case), we can use geom_bar() to form the bar chart.
If you want the bars to be vertical, map the categorical variable to
x. If the bars are to be horizontal, map the categorical variable to y.
# Vertical bars: species goes on the x-axis and geom_bar() counts the rows
ggplot(penguins, aes(x = species)) +
  geom_bar(
    width = 0.5,          # bar width
    color = "black",      # bar outline
    fill = "aquamarine"   # bar color
  ) +
  # multiplicative expansion of 0 at the lower end and 5% at the upper end
  scale_y_continuous(expand = expansion(mult = c(0, 0.05))) +
  # replace the default x-axis label
  labs(x = "Penguin Species")
By default, geom_bar() will display the counts on the non-assigned
axis. But what do we do if we want proportions?
It’s easiest to use summarized data and calculate the proportions
ourselves, but there is a way using geom_bar().
Just use the following aesthetics inside aes() inside geom_bar()
(won’t explain what they do, but it gets the job done!):
x = after_stat(prop) or y = after_stat(prop) (whichever one hasn’t been
assigned; older ggplot2 code writes this as ..prop..)
group = 1
Let’s have the bars be horizontal:
# Horizontal bars showing proportions instead of counts
ggplot(
  penguins,
  aes(
    # fct_infreq() reorders the species in frequency order
    y = fct_infreq(species),
    # after_stat(prop) puts proportions, not counts, on the x-axis
    x = after_stat(prop),
    # group = 1 tells geom_bar() how to calculate the proportions
    group = 1
  )
) +
  geom_bar(width = 0.5, color = "black", fill = "violet") +
  scale_x_continuous(
    labels = scales::label_percent(),   # show the x-axis as percentages
    expand = expansion(mult = c(0, 0.05))
  ) +
  labs(x = "Proportion", y = "Penguin Species")
geom_col()
If we have a data set with the data already summarised, we will need
to map the counts for species to y. However, geom_bar() needs either
x or y to be unassigned. So what would it look like if we tried to use
geom_bar() with a summarised data set?
The quickest way to make a summarized data set is to use the count()
function in the dplyr package.
# Summarise the raw data: one row per species, with the tally stored in a
# column called `count` (rather than count()'s default name, `n`)
species_summarised <- count(penguins, species, name = "count") |>
  # add each species' proportion of the total: count / sum(count)
  mutate(prop = count / sum(count))

species_summarised
Now to create a bar chart using the summarized data frame. We can
specify the groups on one axis and the counts on the other. But if we
do, we can’t use geom_bar() anymore. Instead, we use geom_col(), which
stands for “geometry column”.
# Counts chart drawn from the summarised data frame
ggplot(
  species_summarised,
  aes(
    x = fct_reorder(species, count),   # order the species by their count
    y = count
  )
) +
  geom_col(width = 0.5, color = "black", fill = "steelblue1") +
  scale_y_continuous(expand = expansion(mult = c(0, 0.05))) +
  labs(title = "Bar Chart with Geom_col")
# Proportions chart drawn from the summarised data frame
ggplot(
  species_summarised,
  aes(
    # reordering by -prop puts the tallest bar first and the shortest last
    x = fct_reorder(species, -prop),
    y = prop
  )
) +
  geom_col(width = 0.5, color = "black", fill = "orange2") +
  scale_y_continuous(
    labels = scales::label_percent(),   # show the y-axis as percentages
    expand = expansion(mult = c(0, 0.05))
  ) +
  labs(
    y = "Proportion",
    x = "Penguin Species",
    title = "Bar Chart for Proportions"
  )
Benefits of geom_col() over geom_bar()
There are some additional benefits to working with the summarized
data and geom_col() vs the raw data and geom_bar().
We can add the counts (or proportions) above the bars using
geom_text() and the label = argument inside aes().
# Proportions on the y-axis, with each bar's count printed above it
ggplot(
  species_summarised,
  aes(x = fct_reorder(species, count), y = prop)
) +
  geom_col(width = 0.5, color = "black", fill = "steelblue1") +
  # vjust is the vertical justification of the text
  geom_text(aes(label = count), vjust = -0.25) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.05))) +
  labs(
    y = "Proportion",
    x = "Penguin Species",
    title = "Bar Chart with Counts above Bars"
  )
We can swap it as well, but it is a good idea to round (or show
percentages instead of proportions!).
An easier way is to use geom_bar_text() from ggfittext.
# Proportions on the y-axis, with each bar's percentage printed inside it.
# The label is built inline from prop; no extra column is added to the data.
ggplot(
  data = species_summarised,
  mapping = aes(
    x = fct_reorder(species, count),
    y = prop
  )
) +
  geom_col(
    fill = "orange2",
    color = "black",
    width = 0.5
  ) +
  ggfittext::geom_bar_text(
    # "%.1f%%" always prints one decimal place, so every bar is labelled at
    # the same precision (e.g. "36.0%" rather than "36%")
    mapping = aes(label = sprintf("%.1f%%", prop * 100)),
    # spelled out as TRUE: T is an ordinary variable that can be reassigned
    contrast = TRUE
  ) +
  labs(
    title = "Bar Chart with Percentage in Bars",
    x = "Penguin Species",
    y = "Proportion"
  ) +
  scale_y_continuous(
    expand = c(0, 0, 0.05, 0),
    labels = scales::label_percent()   # show the y-axis as percentages
  )
Pie charts: don’t use them.
Waffle charts are somewhat new and an alternative to pie charts. You
can use geom_waffle() from the waffle package to create them.
To use geom_waffle(), you need to work with the summarized data and
specify 2 arguments:
fill = the column with the group names
values = the column with the counts
#install.packages("waffle", repos = "https://cinc.rud.is")
library(waffle)
## Warning: package 'waffle' was built under R version 4.4.3
# Waffle chart: fill takes the group column, values takes the count column
ggplot(species_summarised, aes(fill = species, values = count)) +
  geom_waffle(
    # the square root of the number of penguins gives the waffle about the
    # same number of rows and columns
    n_rows = nrow(penguins) |> sqrt() |> round()
  ) +
  coord_equal() +
  labs(fill = "Penguin Species") +
  theme_void() +
  theme(legend.position = "top")