What they are for
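A raincloud plot combines a half-density "cloud", a box plot, and the raw data points (the "rain").
This makes it useful for showing the shape of a distribution, its summary statistics, and the individual observations all at once.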
Use a raincloud plot when you want to compare the distribution of a numeric variable across groups.
# Install if needed:
# install.packages(c("ggplot2", "ggdist", "palmerpenguins", "dplyr"))
library(ggplot2)
library(ggdist)
library(palmerpenguins)
##
## Attaching package: 'palmerpenguins'
## The following objects are masked from 'package:datasets':
##
## penguins, penguins_raw
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Keep only the grouping variable and the measurement, then drop rows with
# missing values so every layer works from the same complete data.
penguins_clean <- na.omit(select(penguins, species, body_mass_g))

# The three raincloud layers are built separately and added in the same order
# as before: density slab, box plot, jittered raw points.
cloud_layer <- stat_halfeye(
  adjust = 0.5,
  width = 0.6,
  justification = -0.2,
  # Zero-width interval and no point colour: only the density slab is meant to
  # be visible from this layer.
  .width = 0,
  point_colour = NA
)
# Outlier markers are switched off because the raw points are drawn anyway.
box_layer <- geom_boxplot(width = 0.12, outlier.shape = NA, alpha = 0.5)
rain_layer <- geom_jitter(width = 0.08, alpha = 0.5, size = 1.5)

ggplot(penguins_clean, aes(x = species, y = body_mass_g, fill = species)) +
  cloud_layer +
  box_layer +
  rain_layer +
  labs(
    title = "Raincloud Plot of Penguin Body Mass",
    x = "Species",
    y = "Body Mass (g)"
  ) +
  theme_minimal() +
  # Fill duplicates the x axis, so the legend adds nothing.
  theme(legend.position = "none")
If you have a second category you want to show, map it to `fill` and dodge the layers:
library(ggplot2)
library(ggdist)
# Complete cases only, now keeping sex as a second grouping variable.
penguins_clean <- na.omit(select(penguins, species, sex, body_mass_g))

# One dodge specification shared by the slab and the box plot so both use the
# same offset within each species.
sex_dodge <- position_dodge(width = 0.75)

ggplot(penguins_clean, aes(x = species, y = body_mass_g, fill = sex)) +
  # density slab per sex
  stat_halfeye(
    position = sex_dodge,
    adjust = 0.6,
    width = 0.55,
    justification = -0.2,
    .width = 0,
    point_colour = NA,
    alpha = 0.5
  ) +
  # box plot per sex, outlined in the matching colour
  geom_boxplot(
    aes(color = sex),
    position = sex_dodge,
    width = 0.12,
    linewidth = 0.5,
    alpha = 0.65,
    outlier.shape = NA
  ) +
  # individual observations, jittered and dodged with the same dodge width
  geom_jitter(
    aes(color = sex),
    position = position_jitterdodge(dodge.width = 0.75, jitter.width = 0.08),
    alpha = 0.25,
    size = 1.8
  ) +
  scale_fill_manual(values = c("female" = "mistyrose3", "male" = "darkseagreen3")) +
  scale_color_manual(values = c("female" = "indianred3", "male" = "seagreen4")) +
  labs(
    title = "Penguin Body Mass by Species and Sex",
    subtitle = "Raincloud plot showing distribution, summary statistics, and individual observations",
    x = "Species",
    y = "Body Mass (g)",
    fill = "Sex",
    color = "Sex"
  ) +
  theme_classic(base_size = 13) +
  theme(
    legend.position = "right",
    axis.title = element_text(face = "bold"),
    plot.title = element_text(face = "bold", hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5)
  )
library(ggplot2)
library(ggdist)
library(palmerpenguins)
library(dplyr)
# Drop rows missing either plotted variable and fix the species order used on
# the x axis.
penguins_clean <- penguins %>%
  filter(!is.na(species), !is.na(body_mass_g)) %>%
  mutate(species = factor(species, levels = c("Adelie", "Chinstrap", "Gentoo")))

ggplot(penguins_clean, aes(x = species, y = body_mass_g)) +
  # half violin (raincloud shape)
  stat_halfeye(
    adjust = 0.6,
    width = 0.6,
    .width = 0,
    justification = -0.3,
    point_colour = NA,
    fill = "#74a9cf",
    alpha = 0.7
  ) +
  # boxplot
  geom_boxplot(
    width = 0.12,
    outlier.shape = NA,
    fill = "white",
    color = "black",
    linewidth = 0.8
  ) +
  # points (aligned dots instead of jitter chaos)
  geom_dotplot(
    binaxis = "y",
    stackdir = "down",
    # Explicit bin width in grams. Without it ggplot2 falls back to 1/30 of the
    # data range and prints a "Pick better value with `binwidth`" message.
    # 120 is presumably what that default resolved to for this data
    # (2700-6300 g); re-check if the data changes.
    binwidth = 120,
    dotsize = 0.6,
    fill = "gray40",
    alpha = 0.7
  ) +
  labs(
    title = "Penguin Body Mass by Species",
    x = "Species",
    y = "Body Mass (g)"
  ) +
  theme_classic(base_size = 16) +
  theme(
    plot.title = element_text(face = "bold", hjust = 0.5),
    axis.title = element_text(face = "bold")
  )
A sina plot is similar to a jitter plot, but the points are spread based on the density of the data. That means:
This makes it a nice alternative to:
It shows both individual observations and distribution shape.
Use a sina plot when you want to show raw data points without them overlapping too much, while also giving a sense of density.
Points are not randomly scattered. Wider sections indicate a greater concentration of values.
# Install if needed:
# install.packages(c("ggplot2", "ggforce", "palmerpenguins", "dplyr"))
library(ggplot2)
library(ggforce)
library(palmerpenguins)
library(dplyr)
# Species and flipper length only, complete rows.
penguins_clean <- na.omit(select(penguins, species, flipper_length_mm))

# Titles and axis labels kept apart from the geometry for readability.
sina_labels <- labs(
  title = "Sina Plot of Penguin Flipper Length",
  x = "Species",
  y = "Flipper Length (mm)"
)

ggplot(
  penguins_clean,
  aes(x = species, y = flipper_length_mm, color = species)
) +
  geom_sina(size = 2, alpha = 0.7) +
  sina_labels +
  theme_minimal() +
  # Colour repeats the x axis, so the legend is hidden.
  theme(legend.position = "none")
A Cleveland dot plot is used to compare values across categories using dots instead of bars.
It is helpful because:
Use a Cleveland dot plot when comparing one summary value across categories.
Examples:
This example compares the average body mass of penguin species.
# Install if needed:
# install.packages(c("ggplot2", "palmerpenguins", "dplyr"))
library(ggplot2)
library(palmerpenguins)
library(dplyr)
# Mean body mass per species (missing masses ignored), sorted ascending.
mass_by_species <- summarise(
  group_by(penguins, species),
  mean_body_mass = mean(body_mass_g, na.rm = TRUE)
)
species_summary <- arrange(mass_by_species, mean_body_mass)

# One dot per species; reorder() sorts the y axis by the plotted value.
ggplot(
  species_summary,
  aes(x = mean_body_mass, y = reorder(species, mean_body_mass))
) +
  geom_point(size = 4) +
  labs(
    title = "Cleveland Dot Plot of Mean Penguin Body Mass",
    x = "Mean Body Mass (g)",
    y = "Species"
  ) +
  theme_minimal()
If you have groups…
# Mean bill length for every species/sex combination, returned ungrouped.
grouped_summary <- summarise(
  group_by(penguins, species, sex),
  mean_bill_length = mean(bill_length_mm, na.rm = TRUE),
  .groups = "drop"
)

# Dots for the sexes are dodged vertically within each species row.
bill_dodge <- position_dodge(width = 0.4)

ggplot(
  grouped_summary,
  aes(x = mean_bill_length, y = species, color = sex)
) +
  geom_point(position = bill_dodge, size = 3) +
  labs(
    title = "Grouped Cleveland Dot Plot of Mean Bill Length",
    x = "Mean Bill Length (mm)",
    y = "Species",
    color = "Sex"
  ) +
  theme_minimal()
A forest plot shows:
Use a forest plot when:
library(ggplot2)
library(dplyr)
library(palmerpenguins)
# Per-species mean body mass with a normal-approximation 95% confidence
# interval (mean +/- 1.96 standard errors).
summary_data <- penguins %>%
  group_by(species) %>%
  summarise(
    mean_mass = mean(body_mass_g, na.rm = TRUE),
    sd = sd(body_mass_g, na.rm = TRUE),
    # Count only the non-missing masses. The mean and sd above drop NAs, so
    # n() (every row in the group) would overstate the sample size and make
    # the interval too narrow.
    n = sum(!is.na(body_mass_g)),
    se = sd / sqrt(n),
    lower = mean_mass - 1.96 * se,
    upper = mean_mass + 1.96 * se
  )
# Forest plot: one point per species at its mean, with a horizontal bar from
# `lower` to `upper`. Species are ordered on the y axis by mean mass.
ggplot(summary_data, aes(x = mean_mass, y = reorder(species, mean_mass))) +
  geom_point(size = 4) +
  # geom_errorbarh() is deprecated as of ggplot2 4.0.0. The supported form is
  # geom_errorbar() with orientation = "y", where the cap size is `width`
  # (what `height` used to be).
  geom_errorbar(
    aes(xmin = lower, xmax = upper),
    orientation = "y",
    width = 0.2
  ) +
  labs(
    title = "Forest Plot of Mean Penguin Body Mass",
    x = "Mean Body Mass (g) with 95% CI",
    y = "Species"
  ) +
  theme_minimal()
If you have grouped data…
# Mean body mass and normal-approximation 95% confidence interval for every
# species/sex combination, returned ungrouped.
summary_grouped <- penguins %>%
  group_by(species, sex) %>%
  summarise(
    mean_mass = mean(body_mass_g, na.rm = TRUE),
    sd = sd(body_mass_g, na.rm = TRUE),
    # Count only the non-missing masses. The mean and sd above drop NAs, so
    # n() (every row in the group) would overstate the sample size and make
    # the interval too narrow.
    n = sum(!is.na(body_mass_g)),
    se = sd / sqrt(n),
    lower = mean_mass - 1.96 * se,
    upper = mean_mass + 1.96 * se,
    .groups = "drop"
  )
# Grouped forest plot: within each species row, the sexes are dodged
# vertically. Points and interval bars use the same dodge width so they stay
# aligned.
ggplot(summary_grouped, aes(x = mean_mass, y = species, color = sex)) +
  geom_point(position = position_dodge(width = 0.5), size = 3) +
  # geom_errorbarh() is deprecated as of ggplot2 4.0.0. The supported form is
  # geom_errorbar() with orientation = "y", where the cap size is `width`
  # (what `height` used to be).
  geom_errorbar(
    aes(xmin = lower, xmax = upper),
    position = position_dodge(width = 0.5),
    orientation = "y",
    width = 0.2
  ) +
  labs(
    title = "Forest Plot of Body Mass by Species and Sex",
    x = "Mean Body Mass (g) with 95% CI",
    y = "Species",
    color = "Sex"
  ) +
  theme_minimal()
You will be using this dataset for the homework
library(ggplot2)
library(dplyr)
library(palmerpenguins)
# Homework data: keep a row only if species, sex, and body mass are all known.
penguins_clean <- filter(
  penguins,
  !(is.na(species) | is.na(sex) | is.na(body_mass_g))
)
# Show the cleaned tibble.
penguins_clean
## # A tibble: 333 × 8
## species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
## <fct> <fct> <dbl> <dbl> <int> <int>
## 1 Adelie Torgersen 39.1 18.7 181 3750
## 2 Adelie Torgersen 39.5 17.4 186 3800
## 3 Adelie Torgersen 40.3 18 195 3250
## 4 Adelie Torgersen 36.7 19.3 193 3450
## 5 Adelie Torgersen 39.3 20.6 190 3650
## 6 Adelie Torgersen 38.9 17.8 181 3625
## 7 Adelie Torgersen 39.2 19.6 195 4675
## 8 Adelie Torgersen 41.1 17.6 182 3200
## 9 Adelie Torgersen 38.6 21.2 191 3800
## 10 Adelie Torgersen 34.6 21.1 198 4400
## # ℹ 323 more rows
## # ℹ 2 more variables: sex <fct>, year <int>
# Sina plot of body mass by species, coloured by sex.
# geom_sina() comes from ggforce, which is attached earlier in this document.
ggplot(penguins_clean, aes(x = species, y = body_mass_g, color = sex)) +
  geom_sina(alpha = 0.8, size = 3) +
  labs(
    # Title corrected (was "Penguin Penguin Body Magg").
    title = "Sina Plot of Penguin Body Mass (g)",
    x = "Species",
    y = "Body Mass (g)",
    # Capitalised legend title, matching the other grouped plots.
    color = "Sex"
  ) +
  theme_minimal()
I chose a sina plot for my data because I feel it best represents the raw data for this chart. Using a sina plot allows me to see all the points on the graph, which also allows me to clearly see the clusters and differences between the penguin sexes and species. Plus, having them side by side allows me to compare all groups easily.
# Homework raincloud: density slab, box plot, and jittered raw points per
# species, added as one list of layers in that order.
raincloud_layers <- list(
  stat_halfeye(
    adjust = 0.4,
    width = 0.5,
    justification = -0.1,
    # Zero-width interval and no point colour: only the slab is meant to show.
    .width = 0,
    point_colour = NA
  ),
  # Outlier markers are off because the raw points are drawn anyway.
  geom_boxplot(width = 0.12, alpha = 0.8, outlier.shape = NA),
  geom_jitter(width = 0.07, size = 1.3, alpha = 0.4)
)

ggplot(penguins_clean, aes(x = species, y = body_mass_g, fill = species)) +
  raincloud_layers +
  labs(
    title = "Raincloud Plot of Penguin Species & Their Body Mass",
    x = "Species",
    y = "Body Mass (g)"
  ) +
  theme_minimal() +
  theme(legend.position = "none")
The species with the highest body mass is the Gentoo; their body mass averages are significantly higher than those of the other two species. The species with the greatest variability is also the Gentoo, as their jittered points (geom_jitter) are much more spread out when compared to the others. A box plot alone would not be able to show me individual points nor the figures on the right that give frequency.

### Part 3. Create a forest plot.

Now summarize the data and visualize uncertainty.
# Summary data for the forest plot: per-species mean, sd, and count first,
# then the standard error and the 95% interval bounds derived from them.
summary_data <- penguins_clean %>%
  group_by(species) %>%
  summarise(
    mean_mass = mean(body_mass_g, na.rm = TRUE),
    sd = sd(body_mass_g, na.rm = TRUE),
    n = n()
  ) %>%
  mutate(
    se = sd / sqrt(n),
    lower = mean_mass - 1.96 * se,
    upper = mean_mass + 1.96 * se
  )
# Homework forest plot: species means with their interval bars, coloured by
# species and ordered on the y axis by mean mass.
ggplot(
  summary_data,
  aes(x = mean_mass, y = reorder(species, mean_mass), color = species)
) +
  geom_point(size = 5) +
  # geom_errorbarh() is deprecated as of ggplot2 4.0.0. The supported form is
  # geom_errorbar() with orientation = "y", where the cap size is `width`
  # (what `height` used to be).
  geom_errorbar(
    aes(xmin = lower, xmax = upper),
    orientation = "y",
    width = 0.3
  ) +
  labs(
    title = "Forest Plot of Mean Penguin Body Mass",
    x = "Mean Body Mass (g)",
    y = "Species"
  ) +
  theme_minimal() +
  # Colour repeats the y axis, so the legend is hidden.
  theme(legend.position = "none")
The group with the highest mean body mass is the Gentoo species. When comparing confidence intervals, the Chinstrap species appears to have the widest one, because its error bars are the longest of the three. The Adelie species and the Gentoo species are very different, as they sit at opposite ends of the graph. Adelie sits on the lower end at around 3700 g, while Gentoo sits high up at around 5000 g of average body mass.