What they are for
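A raincloud plot combines a half-density curve (the "cloud"), a box plot, and the raw data points (the "rain").
This makes it useful for showing the shape of the distribution, summary statistics, and individual observations in a single figure.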
Use a raincloud plot when you want to compare the distribution of a numeric variable across groups.
# Install if needed:
# install.packages(c("ggplot2", "ggdist", "palmerpenguins", "dplyr"))
library(ggplot2)
library(ggdist)
library(palmerpenguins)
##
## Attaching package: 'palmerpenguins'
## The following objects are masked from 'package:datasets':
##
## penguins, penguins_raw
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Keep only the two columns the plot needs and drop rows with missing values.
penguins_clean <- penguins %>%
  select(species, body_mass_g) %>%
  na.omit()

# Raincloud plot: half-density "cloud", box plot summary, raw-data "rain".
ggplot(penguins_clean, aes(x = species, y = body_mass_g, fill = species)) +
  # Density slab, nudged to one side of the category via `justification`.
  # `.width = 0` and `point_colour = NA` are set so that only the slab shows.
  stat_halfeye(
    adjust = 0.5,
    width = 0.6,
    justification = -0.2,
    .width = 0,
    point_colour = NA
  ) +
  # Outlier markers are turned off because every observation is already
  # drawn by the point layer below.
  geom_boxplot(
    width = 0.12,
    outlier.shape = NA,
    alpha = 0.5
  ) +
  # Jitter horizontally only. Without `height = 0`, geom_jitter() also adds
  # vertical noise, so points would no longer sit at their measured body mass.
  geom_jitter(
    width = 0.08,
    height = 0,
    alpha = 0.5,
    size = 1.5
  ) +
  labs(
    title = "Raincloud Plot of Penguin Body Mass",
    x = "Species",
    y = "Body Mass (g)"
  ) +
  theme_minimal() +
  # Species is already on the x axis, so the fill legend is redundant.
  theme(legend.position = "none")
If you have a second category you want to show, map it to `fill` and dodge the layers so the groups sit side by side:
library(ggplot2)
library(ggdist)
# Two-factor raincloud data: species, sex and body mass, complete rows only.
penguins_clean <- na.omit(select(penguins, species, sex, body_mass_g))

# Every layer is dodged by the same width (0.75) so that, within each
# species, the slab, box and points of one sex line up with each other.
ggplot(
  data = penguins_clean,
  mapping = aes(x = species, y = body_mass_g, fill = sex)
) +
  # Cloud: half-eye density slab per species/sex combination.
  stat_halfeye(
    alpha = 0.5,
    adjust = 0.6,
    width = 0.55,
    justification = -0.2,
    .width = 0,
    point_colour = NA,
    position = position_dodge(width = 0.75)
  ) +
  # Summary: narrow box plot outlined in the matching sex colour.
  geom_boxplot(
    mapping = aes(colour = sex),
    alpha = 0.65,
    linewidth = 0.5,
    width = 0.12,
    outlier.shape = NA,
    position = position_dodge(width = 0.75)
  ) +
  # Rain: individual observations, jittered within their dodged slot.
  geom_point(
    mapping = aes(colour = sex),
    alpha = 0.25,
    size = 1.8,
    position = position_jitterdodge(
      dodge.width = 0.75,
      jitter.width = 0.08
    )
  ) +
  # Paired palettes: a light fill and a darker outline/point colour per sex.
  scale_fill_manual(
    values = c("female" = "mistyrose3", "male" = "darkseagreen3")
  ) +
  scale_colour_manual(
    values = c("female" = "indianred3", "male" = "seagreen4")
  ) +
  labs(
    x = "Species",
    y = "Body Mass (g)",
    fill = "Sex",
    colour = "Sex",
    title = "Penguin Body Mass by Species and Sex",
    subtitle = "Raincloud plot showing distribution, summary statistics, and individual observations"
  ) +
  theme_classic(base_size = 13) +
  theme(
    legend.position = "right",
    axis.title = element_text(face = "bold"),
    plot.title = element_text(face = "bold", hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5)
  )
library(ggplot2)
library(ggdist)
library(palmerpenguins)
library(dplyr)
# Drop rows missing species or body mass and fix the species order on the axis.
penguins_clean <- penguins %>%
  filter(!is.na(species), !is.na(body_mass_g)) %>%
  mutate(
    species = factor(species, levels = c("Adelie", "Chinstrap", "Gentoo"))
  )

ggplot(penguins_clean, aes(x = species, y = body_mass_g)) +
  # half violin (raincloud shape)
  stat_halfeye(
    adjust = 0.6,
    width = 0.6,
    .width = 0,
    justification = -0.3,
    point_colour = NA,
    fill = "#74a9cf",
    alpha = 0.7
  ) +
  # boxplot
  geom_boxplot(
    width = 0.12,
    outlier.shape = NA,
    fill = "white",
    color = "black",
    linewidth = 0.8
  ) +
  # points (aligned dots instead of jitter chaos)
  # `binwidth` is set explicitly to the value ggplot2 would otherwise pick
  # (1/30 of the data range). The plot is unchanged, but the
  # "Pick better value with `binwidth`" message is no longer emitted.
  geom_dotplot(
    binaxis = "y",
    stackdir = "down",
    binwidth = diff(range(penguins_clean$body_mass_g)) / 30,
    dotsize = 0.6,
    fill = "gray40",
    alpha = 0.7
  ) +
  labs(
    title = "Penguin Body Mass by Species",
    x = "Species",
    y = "Body Mass (g)"
  ) +
  theme_classic(base_size = 16) +
  theme(
    plot.title = element_text(face = "bold", hjust = 0.5),
    axis.title = element_text(face = "bold")
  )
## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.
A sina plot is similar to a jitter plot, but the points are spread based on the density of the data. That means:
This makes it a nice alternative to:
It shows both individual observations and distribution shape.
Use a sina plot when you want to show raw data points without them overlapping too much, while also giving a sense of density.
Points are not randomly scattered. Wider sections indicate a greater concentration of values.
# Install if needed:
# install.packages(c("ggplot2", "ggforce", "palmerpenguins", "dplyr"))
library(ggplot2)
library(ggforce)
library(palmerpenguins)
library(dplyr)
# Sina plot data: species and flipper length, complete rows only.
penguins_clean <- na.omit(select(penguins, species, flipper_length_mm))

# One point per penguin; geom_sina() handles the horizontal spread.
ggplot(
  data = penguins_clean,
  mapping = aes(x = species, y = flipper_length_mm, colour = species)
) +
  geom_sina(size = 2, alpha = 0.7) +
  theme_minimal() +
  # Species is already on the x axis, so the colour legend is redundant.
  theme(legend.position = "none") +
  labs(
    x = "Species",
    y = "Flipper Length (mm)",
    title = "Sina Plot of Penguin Flipper Length"
  )
A Cleveland dot plot is used to compare values across categories using dots instead of bars.
It is helpful because:
Use a Cleveland dot plot when comparing one summary value across categories.
Examples:
This example compares the average body mass of penguin species.
# Install if needed:
# install.packages(c("ggplot2", "palmerpenguins", "dplyr"))
library(ggplot2)
library(palmerpenguins)
library(dplyr)
# One row per species: mean body mass (missing values ignored), sorted
# from lightest to heaviest.
species_summary <- penguins %>%
  group_by(species) %>%
  summarise(
    mean_body_mass = mean(body_mass_g, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(mean_body_mass)

# Cleveland dot plot: one dot per species, with species ordered on the
# y axis by their mean body mass.
ggplot(data = species_summary) +
  geom_point(
    mapping = aes(
      x = mean_body_mass,
      y = reorder(species, mean_body_mass)
    ),
    size = 4
  ) +
  theme_minimal() +
  labs(
    x = "Mean Body Mass (g)",
    y = "Species",
    title = "Cleveland Dot Plot of Mean Penguin Body Mass"
  )
If you have groups…
# Mean bill length for each species/sex combination.
# Rows with missing sex are removed first. Otherwise they form their own
# `NA` group, which is plotted and dodged as if it were a third sex.
grouped_summary <- penguins %>%
  filter(!is.na(sex)) %>%
  group_by(species, sex) %>%
  summarise(
    mean_bill_length = mean(bill_length_mm, na.rm = TRUE),
    .groups = "drop"
  )

# Grouped Cleveland dot plot: one dot per sex within each species, dodged
# vertically so the two dots do not overlap.
ggplot(grouped_summary, aes(x = mean_bill_length, y = species, color = sex)) +
  geom_point(size = 3, position = position_dodge(width = 0.4)) +
  labs(
    title = "Grouped Cleveland Dot Plot of Mean Bill Length",
    x = "Mean Bill Length (mm)",
    y = "Species",
    color = "Sex"
  ) +
  theme_minimal()
A forest plot shows:
Use a forest plot when:
library(ggplot2)
library(dplyr)
library(palmerpenguins)
# Per-species mean body mass with a normal-approximation 95% interval
# (mean +/- 1.96 * standard error).
summary_data <- penguins %>%
  group_by(species) %>%
  summarise(
    mean_mass = mean(body_mass_g, na.rm = TRUE),
    sd = sd(body_mass_g, na.rm = TRUE),
    # Count only the observations that entered the mean and sd. n() would
    # also count rows with a missing body mass, which shrinks the standard
    # error and makes the interval too narrow.
    n = sum(!is.na(body_mass_g)),
    se = sd / sqrt(n),
    lower = mean_mass - 1.96 * se,
    upper = mean_mass + 1.96 * se
  )

# Forest plot: point estimate plus horizontal interval per species, with
# species ordered by their mean.
ggplot(summary_data, aes(x = mean_mass, y = reorder(species, mean_mass))) +
  geom_point(size = 4) +
  # geom_errorbarh() is deprecated as of ggplot2 4.0.0; the horizontal bar
  # is drawn with geom_errorbar() and `orientation = "y"` instead, and the
  # cap size is given as `width`.
  geom_errorbar(
    aes(xmin = lower, xmax = upper),
    width = 0.2,
    orientation = "y"
  ) +
  labs(
    title = "Forest Plot of Mean Penguin Body Mass",
    x = "Mean Body Mass (g) with 95% CI",
    y = "Species"
  ) +
  theme_minimal()
## Warning: `geom_errorbarh()` was deprecated in ggplot2 4.0.0.
## ℹ Please use the `orientation` argument of `geom_errorbar()` instead.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `height` was translated to `width`.
If you have grouped data…
# Mean body mass with a normal-approximation 95% interval for each
# species/sex combination.
summary_grouped <- penguins %>%
  # Rows with missing sex would otherwise form their own `NA` group and be
  # plotted as a third category.
  filter(!is.na(sex)) %>%
  group_by(species, sex) %>%
  summarise(
    mean_mass = mean(body_mass_g, na.rm = TRUE),
    sd = sd(body_mass_g, na.rm = TRUE),
    # Count only non-missing body masses so the standard error matches the
    # data actually used for the mean and sd (n() counts missing rows too).
    n = sum(!is.na(body_mass_g)),
    se = sd / sqrt(n),
    lower = mean_mass - 1.96 * se,
    upper = mean_mass + 1.96 * se,
    .groups = "drop"
  )

# Grouped forest plot: points and intervals share the same dodge width so
# each interval stays centred on its point.
ggplot(summary_grouped, aes(x = mean_mass, y = species, color = sex)) +
  geom_point(position = position_dodge(width = 0.5), size = 3) +
  # geom_errorbarh() is deprecated as of ggplot2 4.0.0; use geom_errorbar()
  # with `orientation = "y"` and `width` for the cap size.
  geom_errorbar(
    aes(xmin = lower, xmax = upper),
    position = position_dodge(width = 0.5),
    width = 0.2,
    orientation = "y"
  ) +
  labs(
    title = "Forest Plot of Body Mass by Species and Sex",
    x = "Mean Body Mass (g) with 95% CI",
    y = "Species",
    color = "Sex"
  ) +
  theme_minimal()
## `height` was translated to `width`.
You will be using this dataset for the homework:
library(ggplot2)
library(dplyr)
library(palmerpenguins)
# Homework dataset: penguins with species, sex and body mass all present.
penguins_clean <- penguins %>%
  filter(!(is.na(species) | is.na(sex) | is.na(body_mass_g)))

# Mean bill length for every species/sex combination.
grouped_summary <- penguins_clean %>%
  group_by(species, sex) %>%
  summarise(
    mean_bill_length = mean(bill_length_mm, na.rm = TRUE),
    .groups = "drop"
  )

# Grouped Cleveland dot plot: one dot per sex within each species, dodged
# vertically so the dots do not overlap.
ggplot(
  data = grouped_summary,
  mapping = aes(x = mean_bill_length, y = species, colour = sex)
) +
  geom_point(position = position_dodge(width = 0.4), size = 3) +
  theme_minimal() +
  labs(
    x = "Mean Bill Length (mm)",
    y = "Species",
    colour = "Sex",
    title = "Grouped Cleveland Dot Plot of Mean Bill Length"
  )
library(ggplot2)
library(ggdist)
library(palmerpenguins)
library(dplyr)
# Work on a separate two-column copy instead of overwriting `penguins_clean`.
# Reassigning `penguins_clean` here would strip `sex` (and every other
# column) from the homework dataset for all later chunks.
penguins_mass <- penguins_clean %>%
  select(species, body_mass_g) %>%
  na.omit()

# Raincloud plot: half-density "cloud", box plot summary, raw-data "rain".
ggplot(penguins_mass, aes(x = species, y = body_mass_g, fill = species)) +
  # Density slab, nudged to one side of the category via `justification`.
  # `.width = 0` and `point_colour = NA` are set so that only the slab shows.
  stat_halfeye(
    adjust = 0.5,
    width = 0.6,
    justification = -0.2,
    .width = 0,
    point_colour = NA
  ) +
  # Outlier markers are turned off because every observation is already
  # drawn by the point layer below.
  geom_boxplot(
    width = 0.12,
    outlier.shape = NA,
    alpha = 0.5
  ) +
  # Jitter horizontally only. Without `height = 0`, geom_jitter() also adds
  # vertical noise, so points would no longer sit at their measured body mass.
  geom_jitter(
    width = 0.08,
    height = 0,
    alpha = 0.5,
    size = 1.5
  ) +
  labs(
    title = "Raincloud Plot of Penguin Body Mass",
    x = "Species",
    y = "Body Mass (g)"
  ) +
  theme_minimal() +
  # Species is already on the x axis, so the fill legend is redundant.
  theme(legend.position = "none")
# Don't forget, you will need to make summary data.
library(ggplot2)
library(dplyr)
library(palmerpenguins)
# Summary data for the forest plot: mean body mass with a
# normal-approximation 95% interval per species/sex combination.
summary_grouped <- penguins %>%
  # Same row filter as the homework dataset. Without it, penguins of
  # unknown sex form their own `NA` group and appear as a third category.
  filter(!is.na(species), !is.na(sex), !is.na(body_mass_g)) %>%
  group_by(species, sex) %>%
  summarise(
    mean_mass = mean(body_mass_g, na.rm = TRUE),
    sd = sd(body_mass_g, na.rm = TRUE),
    # Count only non-missing body masses so the standard error matches the
    # data actually used for the mean and sd.
    n = sum(!is.na(body_mass_g)),
    se = sd / sqrt(n),
    lower = mean_mass - 1.96 * se,
    upper = mean_mass + 1.96 * se,
    .groups = "drop"
  )

# Grouped forest plot: points and intervals share the same dodge width so
# each interval stays centred on its point.
ggplot(summary_grouped, aes(x = mean_mass, y = species, color = sex)) +
  geom_point(position = position_dodge(width = 0.5), size = 3) +
  # geom_errorbarh() is deprecated as of ggplot2 4.0.0; use geom_errorbar()
  # with `orientation = "y"` and `width` for the cap size.
  geom_errorbar(
    aes(xmin = lower, xmax = upper),
    position = position_dodge(width = 0.5),
    width = 0.2,
    orientation = "y"
  ) +
  labs(
    title = "Forest Plot of Body Mass by Species and Sex",
    x = "Mean Body Mass (g) with 95% CI",
    y = "Species",
    color = "Sex"
  ) +
  theme_minimal()
## `height` was translated to `width`.