Raincloud Plots

What they are for

A raincloud plot combines a half-density curve (the "cloud"), a boxplot summary, and the raw data points (the "rain").

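This makes it useful for showing the shape of a distribution, its summary statistics, and the individual observations in a single figure.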

Use a raincloud plot when you want to compare the distribution of a numeric variable across groups.

# Install if needed:
# install.packages(c("ggplot2", "ggdist", "palmerpenguins", "dplyr"))

library(ggplot2)
library(ggdist)
library(palmerpenguins)
## 
## Attaching package: 'palmerpenguins'
## The following objects are masked from 'package:datasets':
## 
##     penguins, penguins_raw
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Keep only the two columns this plot needs and drop rows where either one is
# missing, so every layer below is drawn from the same observations.
penguins_clean <- penguins %>%
  select(species, body_mass_g) %>%
  na.omit()

# Raincloud plot: density "cloud" + boxplot summary + raw-data "rain".
ggplot(penguins_clean, aes(x = species, y = body_mass_g, fill = species)) +
  # Cloud: half-density slab. `.width = 0` and `point_colour = NA` suppress the
  # interval and point summary that stat_halfeye() would otherwise add.
  stat_halfeye(
    adjust = 0.5,
    width = 0.6,
    justification = -0.2,
    .width = 0,
    point_colour = NA
  ) +
  # Box: outliers are hidden because every observation is drawn as a point.
  geom_boxplot(
    width = 0.12,
    outlier.shape = NA,
    alpha = 0.5
  ) +
  # Rain: jitter horizontally only. geom_jitter() also jitters vertically
  # unless `height` is given, which would move points away from their true
  # body-mass values.
  geom_jitter(
    width = 0.08,
    height = 0,
    alpha = 0.5,
    size = 1.5
  ) +
  labs(
    title = "Raincloud Plot of Penguin Body Mass",
    x = "Species",
    y = "Body Mass (g)"
  ) +
  theme_minimal() +
  # The fill colour repeats the x axis, so the legend adds nothing.
  theme(legend.position = "none")

If you have a second category you want to show, map it to `fill` and dodge each layer:

library(ggplot2)
library(ggdist)
# Two-factor raincloud: one cloud / box / rain set per sex within each species.
# Rows with a missing species, sex, or body mass are removed up front.
penguins_clean <- na.omit(select(penguins, species, sex, body_mass_g))

ggplot(
  data = penguins_clean,
  mapping = aes(x = species, y = body_mass_g, fill = sex)
) +

  # Cloud: half-density slab per sex (IMPORTANT layer). All three layers share
  # a dodge width of 0.75 so the pieces belonging to one sex line up.
  stat_halfeye(
    alpha = 0.5,
    adjust = 0.6,
    width = 0.55,
    justification = -0.2,
    .width = 0,
    point_colour = NA,
    position = position_dodge(width = 0.75)
  ) +

  # Box: summary statistics; outliers hidden because raw points are drawn.
  geom_boxplot(
    mapping = aes(colour = sex),
    position = position_dodge(width = 0.75),
    width = 0.12,
    linewidth = 0.5,
    alpha = 0.65,
    outlier.shape = NA
  ) +

  # Rain: raw observations, jittered and dodged in one position adjustment.
  geom_point(
    mapping = aes(colour = sex),
    position = position_jitterdodge(jitter.width = 0.08, dodge.width = 0.75),
    alpha = 0.25,
    size = 1.8
  ) +

  # Lighter fills for the slabs and boxes, darker outlines for boxes and points.
  scale_fill_manual(
    values = c("female" = "mistyrose3", "male" = "darkseagreen3")
  ) +
  scale_colour_manual(
    values = c("female" = "indianred3", "male" = "seagreen4")
  ) +

  labs(
    title = "Penguin Body Mass by Species and Sex",
    subtitle = "Raincloud plot showing distribution, summary statistics, and individual observations",
    x = "Species",
    y = "Body Mass (g)",
    fill = "Sex",
    color = "Sex"
  ) +

  theme_classic(base_size = 13) +
  theme(
    legend.position = "right",
    axis.title = element_text(face = "bold"),
    plot.title = element_text(face = "bold", hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5)
  )

library(ggplot2)
library(ggdist)
library(palmerpenguins)
library(dplyr)

# Keep rows with a known species and body mass, and pin the species order used
# on the x axis.
penguins_clean <- penguins %>%
  filter(!is.na(species), !is.na(body_mass_g)) %>%
  mutate(species = factor(species, levels = c("Adelie", "Chinstrap", "Gentoo")))

ggplot(penguins_clean, aes(x = species, y = body_mass_g)) +

  # half violin (raincloud shape)
  stat_halfeye(
    adjust = 0.6,
    width = 0.6,
    .width = 0,
    justification = -0.3,
    point_colour = NA,
    fill = "#74a9cf",
    alpha = 0.7
  ) +

  # boxplot (outliers hidden because every observation is drawn as a dot)
  geom_boxplot(
    width = 0.12,
    outlier.shape = NA,
    fill = "white",
    color = "black",
    linewidth = 0.8
  ) +

  # points (aligned dots instead of jitter chaos)
  geom_dotplot(
    binaxis = "y",
    stackdir = "down", # key thing: stacks the dots on one side only
    # Explicit bin width in grams. Without it ggplot2 falls back to 1/30 of the
    # data range and prints a message on every render; 120 is intended to match
    # that default for this data, so adjust it if the dots look too coarse.
    binwidth = 120,
    dotsize = 0.6,
    fill = "gray40",
    alpha = 0.7
  ) +

  labs(
    title = "Penguin Body Mass by Species",
    x = "Species",
    y = "Body Mass (g)"
  ) +

  theme_classic(base_size = 16) +
  theme(
    plot.title = element_text(face = "bold", hjust = 0.5),
    axis.title = element_text(face = "bold")
  )

Sina Plots

A sina plot is similar to a jitter plot, but the points are spread based on the density of the data. That means:

This makes it a nice alternative to:

It shows both individual observations and distribution shape.

Use a sina plot when you want to show raw data points without them overlapping too much, while also giving a sense of density.

Points are not randomly scattered. Wider sections indicate a greater concentration of values.

# Install if needed:
# install.packages(c("ggplot2", "ggforce", "palmerpenguins", "dplyr"))

library(ggplot2)
library(ggforce)
library(palmerpenguins)
library(dplyr)

# Sina plot input: species and flipper length, with incomplete rows removed.
penguins_clean <- na.omit(select(penguins, species, flipper_length_mm))

ggplot(
  data = penguins_clean,
  mapping = aes(x = species, y = flipper_length_mm, colour = species)
) +
  # key thing: geom_sina() is the layer that makes this a sina plot
  geom_sina(size = 2, alpha = 0.7) +
  labs(
    title = "Sina Plot of Penguin Flipper Length",
    x = "Species",
    y = "Flipper Length (mm)"
  ) +
  theme_minimal() +
  # Colour repeats the x axis, so the legend is switched off.
  theme(legend.position = "none")

Cleveland Plots

A Cleveland dot plot is used to compare values across categories using dots instead of bars.

It is helpful because:

Use a Cleveland dot plot when comparing one summary value across categories.

Examples:

This example compares the average body mass of penguin species.

# Install if needed:
# install.packages(c("ggplot2", "palmerpenguins", "dplyr"))

library(ggplot2)
library(palmerpenguins)
library(dplyr)

# One row per species holding its mean body mass (missing masses ignored),
# sorted from lightest to heaviest.
species_summary <- penguins %>%
  summarise(
    mean_body_mass = mean(body_mass_g, na.rm = TRUE),
    .by = species
  ) %>%
  arrange(mean_body_mass)

# Cleveland dot plot: value on x, category on y, ordered by value so the dots
# read as a ranking.
ggplot(
  data = species_summary,
  mapping = aes(mean_body_mass, reorder(species, mean_body_mass))
) +
  geom_point(size = 4) +
  labs(
    title = "Cleveland Dot Plot of Mean Penguin Body Mass",
    x = "Mean Body Mass (g)",
    y = "Species"
  ) +
  theme_minimal()

If you have groups…

# Mean bill length for every species x sex combination.
# Penguins with no recorded sex are dropped first; otherwise they form their
# own `NA` group and appear as an extra, unlabeled point in the plot.
grouped_summary <- penguins %>%
  filter(!is.na(sex)) %>%
  group_by(species, sex) %>%
  summarise(mean_bill_length = mean(bill_length_mm, na.rm = TRUE), .groups = "drop")

# Grouped Cleveland dot plot: dots for the two sexes are dodged vertically
# within each species row so they do not overlap.
ggplot(grouped_summary, aes(x = mean_bill_length, y = species, color = sex)) +
  geom_point(size = 3, position = position_dodge(width = 0.4)) +
  labs(
    title = "Grouped Cleveland Dot Plot of Mean Bill Length",
    x = "Mean Bill Length (mm)",
    y = "Species",
    color = "Sex"
  ) +
  theme_minimal()

Forest Plots

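A forest plot shows a point estimate for each group (here, the mean) together with an interval around it (here, a 95% confidence interval).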

Use a forest plot when you want to compare estimates and their uncertainty across groups.

library(ggplot2)
library(dplyr)
library(palmerpenguins)

# Per-species mean body mass with a normal-approximation 95% confidence
# interval (mean +/- 1.96 * standard error).
summary_data <- penguins %>%
  group_by(species) %>%
  summarise(
    mean_mass = mean(body_mass_g, na.rm = TRUE),
    sd = sd(body_mass_g, na.rm = TRUE),
    # Count only the observations that entered the mean and sd. n() would also
    # count rows with a missing body mass and make the standard error too small.
    n = sum(!is.na(body_mass_g)),
    se = sd / sqrt(n),
    lower = mean_mass - 1.96 * se,
    upper = mean_mass + 1.96 * se
  )


# Forest plot: point = estimate, horizontal bar = confidence interval, with
# species ordered by their mean.
ggplot(summary_data, aes(x = mean_mass, y = reorder(species, mean_mass))) +
  geom_point(size = 4) +
  # geom_errorbarh() is deprecated as of ggplot2 4.0.0; a horizontal
  # geom_errorbar() is the replacement. `width` is the cap size along y.
  geom_errorbar(
    aes(xmin = lower, xmax = upper),
    width = 0.2,
    orientation = "y"
  ) +
  labs(
    title = "Forest Plot of Mean Penguin Body Mass",
    x = "Mean Body Mass (g) with 95% CI",
    y = "Species"
  ) +
  theme_minimal()

If you have grouped data…

# Mean body mass with a normal-approximation 95% confidence interval for every
# species x sex combination.
summary_grouped <- penguins %>%
  # Penguins with no recorded sex would otherwise form their own `NA` group
  # with a very wide interval based on only a few birds.
  filter(!is.na(sex)) %>%
  group_by(species, sex) %>%
  summarise(
    mean_mass = mean(body_mass_g, na.rm = TRUE),
    sd = sd(body_mass_g, na.rm = TRUE),
    # Count only the observations that entered the mean and sd; n() would also
    # count rows with a missing body mass.
    n = sum(!is.na(body_mass_g)),
    se = sd / sqrt(n),
    lower = mean_mass - 1.96 * se,
    upper = mean_mass + 1.96 * se,
    .groups = "drop"
  )

# Grouped forest plot: points and intervals share the same dodge width so each
# interval stays centred on its own point.
ggplot(summary_grouped, aes(x = mean_mass, y = species, color = sex)) +
  geom_point(position = position_dodge(width = 0.5), size = 3) +
  # geom_errorbarh() is deprecated as of ggplot2 4.0.0; a horizontal
  # geom_errorbar() is the replacement. `width` is the cap size along y.
  geom_errorbar(
    aes(xmin = lower, xmax = upper),
    position = position_dodge(width = 0.5),
    width = 0.2,
    orientation = "y"
  ) +
  labs(
    title = "Forest Plot of Body Mass by Species and Sex",
    x = "Mean Body Mass (g) with 95% CI",
    y = "Species",
    color = "Sex"
  ) +
  theme_minimal()

Homework

You will be using this dataset for the homework

library(ggplot2)
library(dplyr)
library(palmerpenguins)

# Homework data: keep only penguins with a recorded species, sex, and body mass.
penguins_clean <- penguins %>%
  filter(!(is.na(species) | is.na(sex) | is.na(body_mass_g)))

Part 1. Create one plot of your choice to visualize the relationship between:

  • Species
  • Mass
  • Sex
# Part 1: violins show the distribution shape and the narrow boxplots inside
# them show the median and quartiles, split by sex within each species.
ggplot(
  data = penguins_clean,
  mapping = aes(x = species, y = body_mass_g, fill = sex)
) +
  geom_violin(trim = FALSE, alpha = 0.2) +
  # Dodge width 0.9 places each box inside the violin for the same sex.
  geom_boxplot(position = position_dodge(width = 0.9), width = 0.2) +
  labs(
    title = "Body Mass by Species and Sex",
    x = "Species",
    y = "Body Mass (g)",
    fill = "Sex"
  ) +
  theme_minimal()

1. What plot type did you choose?

I decided to create a boxplot overlaid on a violin plot to show the relationship between the three variables.

2. Why is this plot appropriate for this data?

This plot is appropriate because it clearly shows the relationship in two different ways. It allows us to see the median weight and the distribution of body mass for each sex within each species.

3. What patterns do you observe?

In all 3 species the males weigh more than the females.

Part 2. Create a raincloud plot showing body mass across species.

# Part 2: raincloud plot of body mass across species
# (density "cloud" + raw-data "rain" + boxplot summary).
ggplot(penguins_clean, aes(x = species, y = body_mass_g, fill = species)) +
  # Cloud: half-density slab. `.width = 0` and `point_colour = NA` suppress the
  # interval and point summary that stat_halfeye() would otherwise add.
  stat_halfeye(
    adjust = 0.5,
    width = 0.6,
    justification = -0.2,
    .width = 0,
    point_colour = NA
  ) +
  # Rain: jitter horizontally only. geom_jitter() also jitters vertically
  # unless `height` is given, which would move points away from their true
  # body-mass values.
  geom_jitter(
    width = 0.08,
    height = 0,
    alpha = 0.5,
    size = 1.5
  ) +
  # Box: drawn on top of the points; outliers hidden because every observation
  # is already shown.
  geom_boxplot(
    width = 0.12,
    outlier.shape = NA,
    alpha = 0.5
  ) +
  labs(
    title = "Raincloud Plot of Penguin Body Mass Across Species",
    x = "Species",
    y = "Body Mass (g)"
  ) +
  theme_minimal() +
  # The fill colour repeats the x axis, so the legend adds nothing.
  theme(legend.position = "none")

1. Which species has the highest body mass?

The Gentoo species has the highest body mass

2. Which species shows the greatest variability?

The Gentoo species also has the greatest variability within the species.

3. What does this plot show that a boxplot alone would not?

This plot shows the distribution by visually showing where the higher concentrations are and whether the data are skewed. It also plots each individual point, which makes outliers easy to see.

Part 3. Create a forest plot. Now summarize the data and visualize uncertainty.

# Don't forget, you will need to make summary data. 

# Per-species mean body mass with a normal-approximation 95% confidence
# interval (mean +/- 1.96 * standard error).
summary_data2 <- penguins_clean %>%
  group_by(species) %>%
  summarise(
    mean_mass = mean(body_mass_g, na.rm = TRUE),
    sd = sd(body_mass_g, na.rm = TRUE),
    # Count the same observations that mean() and sd() use, so the standard
    # error stays correct even if the input still contains missing masses.
    n = sum(!is.na(body_mass_g)),
    se = sd / sqrt(n),
    lower = mean_mass - 1.96 * se,
    upper = mean_mass + 1.96 * se
  )

# Forest plot: triangle = mean, horizontal bar = 95% CI, species ordered by mean.
ggplot(summary_data2, aes(mean_mass, reorder(species, mean_mass))) +
  geom_point(size = 5, shape = 17, color = "blue") +
  # geom_errorbarh() is deprecated as of ggplot2 4.0.0, and its cap-size
  # argument was `height`, not `width`. A horizontal geom_errorbar() takes
  # `width` directly.
  geom_errorbar(
    aes(xmin = lower, xmax = upper),
    width = 0.2,
    orientation = "y"
  ) +
  labs(
    title = "Forest Plot of Mean Penguin Body Mass",
    x = "Mean Body Mass (g) with 95% CI",
    y = "Species"
  ) +
  theme_bw()

1. Which group has the highest mean body mass?

The Gentoo Species has the highest mean body mass.

2. Which group has the widest confidence interval? Why?

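The Chinstrap group has the widest confidence interval. The interval width is driven by the standard error, sd / sqrt(n), so a group with fewer observations (or more spread) gets a wider interval; Chinstrap is the smallest group in this dataset, which makes its mean the least precisely estimated.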

3. Do any groups appear clearly different (based on overlap)?

When comparing the three species, the Gentoo species is clearly different from the Chinstrap and Adelie species.