# Chunk defaults: show the code of every chunk and center every figure
knitr::opts_chunk$set(echo = TRUE, fig.align = "center")

# Load the packages used throughout this document
pacman::p_load(tidyverse, ggfittext)

# We are making bar charts, so use theme_test() as the default theme
theme_set(theme_test())

# Adjust the default theme so plot titles are centered
theme_update(plot.title = element_text(hjust = 0.5))
# Read the Titanic data set. stringsAsFactors = TRUE converts the string
# columns to factors. TRUE is spelled out because T is an ordinary variable
# that can be reassigned, whereas TRUE is a reserved word.
t_df <- read.csv(
  "titanic.csv",
  stringsAsFactors = TRUE
)

# Check the current order of the levels of the class column
levels(t_df$class)
## [1] "Crew" "First" "Second" "Third"
Before we begin, let’s change the order of class to be first, second, third, and crew using `mutate()` and `factor()`:
# Re-level class so the levels read First, Second, Third, Crew
t_df <- mutate(
  t_df,
  class = factor(
    class,
    levels = c("First", "Second", "Third", "Crew")
  )
)
We’ve seen previously how to create bar charts using `geom_bar()`. It is fairly straightforward to create a bar chart using `geom_bar()` when working with the unsummarized data, as long as you want to display the counts on the y-axis:
# Bar chart of passenger class, drawn from the unsummarized data
t_df |>
  ggplot(aes(x = class)) +
  geom_bar(color = "black", fill = "steelblue") +
  labs(
    title = "Titanic Passengers using geom_bar()",
    x = "Passenger Class"
  )
If we wanted a stacked bar chart showing the association between passenger class and survival status:
# Stacked bar chart of survival status within each passenger class
t_df |>
  ggplot(
    # Mapping fill to status colors the bars according to survival status;
    # fct_rev() reverses the order of the status levels
    aes(x = class, fill = fct_rev(status))
  ) +
  geom_bar(color = "black") +
  # Usually red = bad and blue = good, so pick the fill colors by hand
  scale_fill_manual(
    values = c("Alive" = "steelblue", "Dead" = "darkred")
  ) +
  labs(
    title = "Titanic Passengers using geom_bar()",
    x = "Passenger Class",
    fill = "Survival \nStatus"
  )
If you want to show the conditional proportions (percentage that
survived for each class), include position = "fill"
inside
geom_bar()
:
# Bar chart of the conditional proportions of survival status by class
t_df |>
  ggplot(
    # fct_rev() reverses the position of the status levels in the graph
    aes(x = class, fill = fct_rev(status))
  ) +
  # position = "fill" displays the conditional proportions
  geom_bar(position = "fill", color = "black") +
  # Usually red = bad and blue = good, so pick the fill colors by hand
  scale_fill_manual(
    values = c("Alive" = "steelblue", "Dead" = "darkred")
  ) +
  # Show percentages on the y-axis and remove the extra space at the bottom
  scale_y_continuous(
    expand = c(0, 0, 0.05, 0),
    labels = scales::label_percent()
  ) +
  labs(
    title = "Titanic Passengers using geom_bar()",
    x = "Passenger Class",
    y = "Status Percentage by Class",
    fill = "Survival \nStatus"
  )
What if instead we wanted to show a side-by-side bar chart with the conditional percentages on the y-axis?
We can’t do that with `geom_bar()`. We need to use `geom_col()` and the summarized data instead!
Let’s start simple, by creating a bar chart for just a single variable: class.
We can calculate the counts for each passenger class and the corresponding proportions using:
group_by()
summarize(counts = n())
mutate()
You’ll need to fill in group_by()
and
mutate()
accordingly!
Remember that `n()` doesn’t have any arguments; it just counts the number of rows when used inside `summarize()`!
Create a data set that has class, counts, and the proportions of each passenger class below and save it as class_sum:
# One row per passenger class, with its count and its share of all passengers
class_sum <- t_df |>
  group_by(class) |>
  # n() counts the rows in each group, i.e. the passengers of each class
  summarize(counts = n()) |>
  # Divide each count by the overall total to get the proportions
  mutate(class_prop = counts / sum(counts))

# Display the data set:
class_sum
## # A tibble: 4 × 3
## class counts class_prop
## <fct> <int> <dbl>
## 1 First 325 0.148
## 2 Second 285 0.129
## 3 Third 706 0.321
## 4 Crew 885 0.402
Now that we have a data set of the summarized data, how do we create a bar chart from it?
The problem with `geom_bar()` is that it only wants one aesthetic, either `x` or `y`, so you can’t specify the height of the bar with `y`. So how do we get around that?
There are a couple of ways, but the simplest is to just use a different geom: `geom_col()`.
`geom_col()` wants both an `x` and a `y`, where one is assigned to a categorical variable and the other is assigned to a numeric variable. It will then create a column or bar that starts at 0 and ends at the specified height.
Create a bar chart with the proportions on the y-axis using
geom_col()
# Bar chart of the class proportions, drawn from the summarized data
class_sum |>
  ggplot(aes(x = class, y = class_prop)) +
  # geom_col() rather than geom_bar(), since we specify the bar height with y
  geom_col(color = "black", fill = "steelblue") +
  # Show whole-number percentages on the y-axis and remove the space at the
  # bottom of the graph
  scale_y_continuous(
    expand = c(0, 0, 0.05, 0),
    labels = scales::label_percent(accuracy = 1)
  ) +
  labs(
    title = "Bar Chart using geom_col",
    x = "Passenger Class",
    y = NULL
  )
We’ll use the same 3 functions, `group_by()` and `summarize()` along with `n()`, to calculate the totals for each class and status combination. You just need to include both class and status in `group_by()`. Make sure to list the variable you want to find the proportion of (status) last, and don’t include `ungroup()` anywhere.
Start by calculating the total number of passengers of each class and status type. Save it as `t_sum`. Name the column that contains the count totals `counts`.
# Count the passengers in every class/status combination.
# status is listed last in group_by() because it is the variable we want
# the proportions of.
t_sum <- t_df |>
  group_by(class, status) |>
  summarize(counts = n())
## `summarise()` has grouped output by 'class'. You can override using the
## `.groups` argument.
# Display the results below:
t_sum
## # A tibble: 8 × 3
## # Groups: class [4]
## class status counts
## <fct> <fct> <int>
## 1 First Alive 203
## 2 First Dead 122
## 3 Second Alive 118
## 4 Second Dead 167
## 5 Third Alive 178
## 6 Third Dead 528
## 7 Crew Alive 212
## 8 Crew Dead 673
So now we have the counts. How do we get the conditional proportions: the percentage of first class passengers that lived, the percentage of second class passengers that died, etc…?
\[p_{A|1st} = \frac{\textrm{Number of first class passengers that lived}}{\textrm{Number of first class passengers}} = \frac{203}{203 + 122} \approx 0.625\]
So we have the numerator of all of our fractions, but how do we get the denominator?
Notice that t_sum
is still grouped by the first variable
listed: class. You can combine mutate()
with
sum()
to calculate the totals for each passenger class
while keeping each of the individual rows. Try it out below, but don’t
save the results!
# t_sum is still grouped by class, so sum(counts) is the total for each class,
# repeated on every row of that class. The result is displayed but not saved.
mutate(t_sum, class_total = sum(counts))
## # A tibble: 8 × 4
## # Groups: class [4]
## class status counts class_total
## <fct> <fct> <int> <int>
## 1 First Alive 203 325
## 2 First Dead 122 325
## 3 Second Alive 118 285
## 4 Second Dead 167 285
## 5 Third Alive 178 706
## 6 Third Dead 528 706
## 7 Crew Alive 212 885
## 8 Crew Dead 673 885
You should see 325 in both rows with “First” listed, then the same for “Second” (285), “Third” (706), and “Crew” (885).
We can use these counts to calculate the conditional proportions! Try
calculating the conditional proportions by including
status_prop = counts/sum(counts)
inside
mutate()
. This time, save the results as t_sum
again
# Divide each count by the total of its class (t_sum is still grouped by class)
# to get the conditional proportions
t_sum <- t_sum |>
  mutate(status_prop = counts / sum(counts)) |>
  # Drop the grouping now that the proportions are calculated
  ungroup()

# Displaying the results:
t_sum
## # A tibble: 8 × 4
## class status counts status_prop
## <fct> <fct> <int> <dbl>
## 1 First Alive 203 0.625
## 2 First Dead 122 0.375
## 3 Second Alive 118 0.414
## 4 Second Dead 167 0.586
## 5 Third Alive 178 0.252
## 6 Third Dead 528 0.748
## 7 Crew Alive 212 0.240
## 8 Crew Dead 673 0.760
You can check to see if you did it correctly with the code below:
# Sanity check: add up the status proportions within each class
summarize(
  group_by(t_sum, class),
  total_prop = sum(status_prop)
)
## # A tibble: 4 × 2
## class total_prop
## <fct> <dbl>
## 1 First 1
## 2 Second 1
## 3 Third 1
## 4 Crew 1
If done correctly, the code above should return four 1s.
Using the summarized data, create the side-by-side bar chart of conditional proportions by copying and pasting the code in the geom_bar_2var code chunk and using `geom_col()` in place of `geom_bar()`, along with the appropriate changes (remember, to make a side-by-side bar chart, include `position = "dodge"` somewhere inside the geom!).
# Side-by-side bar chart of survival status by class, drawn from the
# summarized data: the bar heights come from y = status_prop
ggplot(
  data = t_sum,
  mapping = aes(
    x = class,
    y = status_prop,
    fill = status
  )
) +
  geom_col(
    color = "black",
    position = "dodge" # make a side-by-side bar chart
  ) +
  labs(
    x = "Passenger Class",
    y = "Status Percentage by Class",
    fill = "Survival \nStatus",
    # The bars are drawn with geom_col(), so the title names that geom
    title = "Titanic Passengers using geom_col()"
  ) +
  # Usually red = bad and blue = good, so pick the fill colors by hand
  scale_fill_manual(
    values = c("Alive" = "steelblue", "Dead" = "darkred")
  ) +
  # Show percentages on the y-axis and remove the extra space at the bottom.
  # label_percent() matches the labeller used by the other charts in this file.
  scale_y_continuous(
    labels = scales::label_percent(),
    expand = c(0, 0, 0.05, 0)
  )
Using what you’ve learned here along with facet_wrap()
,
divide the graph above into 2 small multiples, one for male passengers
and one for female passengers. To use the summarized data, you’ll need
to create it from the original, raw data set since t_sum
doesn’t have the sex column in it!
# Side-by-side bar chart of survival status by class, split into one panel per
# sex. The data is summarized and plotted in a single pipe chain.
t_df |>
  # List the 3 variables we want the totals of; status goes last because it is
  # the variable we want the proportions of
  group_by(sex, class, status) |>
  # Count each sex/class/status combination. .groups = "drop_last" keeps the
  # result grouped by sex and class, which the next step depends on, and makes
  # that choice explicit instead of relying on the default.
  summarize(counts = n(), .groups = "drop_last") |>
  # Conditional proportions: status count / total of its (sex, class) group
  mutate(prop = counts / sum(counts)) |>
  # The grouping has done its job, so drop it before plotting
  ungroup() |>
  # We can pipe the data set directly into ggplot()
  ggplot(
    mapping = aes(
      x = class,
      y = prop,
      fill = fct_rev(status)
    )
  ) +
  geom_col(
    color = "black",
    position = "dodge"
  ) +
  # Using facet_wrap() to create 2 small multiples, stacked in one column:
  facet_wrap(
    facets = ~ sex,
    ncol = 1
  ) +
  labs(
    x = "Passenger Class",
    y = "Status Percentage by Class",
    fill = "Survival \nStatus"
  ) +
  # Usually red = bad and blue = good, so pick the fill colors by hand
  scale_fill_manual(
    values = c("Alive" = "steelblue", "Dead" = "darkred")
  ) +
  # Show percentages on the y-axis and remove the extra space at the bottom
  scale_y_continuous(
    labels = scales::label_percent(),
    expand = c(0, 0, 0.05, 0)
  )