Libraries

library(tidyverse)
library(forcats)
library(lubridate)
library(ggthemes)
library(quantreg)
library(openair)
library(lattice)
library(RColorBrewer)
library(rvest)

Data

data("iris")
data("mtcars")
data("diamonds")
data("economics")
mtcars$fcyl <- as.factor(mtcars$cyl)
mtcars$fam <- as.factor(mtcars$am)
mtcars$car <- row.names(mtcars)
load("~/R Data/fish.RData")
Vocab <- read.csv("https://raw.githubusercontent.com/sigmasigmaiota/vocab/master/vocab.csv")
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec, :
## EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec, :
## embedded nul(s) found in input
gm2007 <- read.csv("gm2007.csv")
head(Vocab)
##   X.1        X year    sex education vocabulary
## 1   1 19740001 1974   Male        14          9
## 2   2 19740002 1974   Male        16          9
## 3   3 19740003 1974 Female        10          9
## 4   4 19740004 1974 Female        10          5
## 5   5 19740005 1974 Female        12          8
## 6   6 19740006 1974   Male        16          8

5. Statistics

A picture paints a thousand words, which is why R ggplot2 is such a powerful tool for graphical data analysis. In this chapter, you’ll progress from simply plotting data to applying a variety of statistical methods. These include a variety of linear models, descriptive and inferential statistics (mean, standard deviation and confidence intervals) and custom functions.

5.1 Smoothing

To practice on the remaining layers (statistics, coordinates and facets), we’ll continue working on several datasets from the first course.

# Using mtcars, draw a scatter plot of mpg vs. wt
ggplot(mtcars, aes( x= wt, y=mpg)) + geom_point()+ theme_gray()

# Amend the plot to add a smooth layer
ggplot(mtcars, aes(x = wt, y = mpg)) +
  geom_point() + geom_smooth()+ theme_gray()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# Amend the plot. Use lin. reg. smoothing; turn off std err ribbon
ggplot(mtcars, aes(x = wt, y = mpg)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)+ theme_gray()
## `geom_smooth()` using formula 'y ~ x'

# Amend the plot. Swap geom_smooth() for stat_smooth().
ggplot(mtcars, aes(x = wt, y = mpg)) +
  geom_point() +
  stat_smooth(method = "lm", se = FALSE) + theme_gray()
## `geom_smooth()` using formula 'y ~ x'

5.2 Grouping variables

We’ll continue with the previous exercise by considering the situation of looking at sub-groups in our dataset. For this we’ll encounter the invisible group aesthetic.

# Using mtcars, plot mpg vs. wt, colored by fcyl
ggplot(mtcars, aes(x = wt, y = mpg, color = fcyl)) +
  # Add a point layer
 geom_point() +
  # Add a smooth lin reg stat, no ribbon
  stat_smooth(method = "lm", se = FALSE)+ theme_gray()
## `geom_smooth()` using formula 'y ~ x'

# Amend the plot to add another smooth layer with dummy grouping
ggplot(mtcars, aes(x = wt, y = mpg, color = fcyl)) +
  geom_point() +
  stat_smooth(method = "lm", se = FALSE) +
  stat_smooth(aes(group = 1), method = "lm", se = FALSE)+ theme_gray()
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

5.3 Modifying stat_smooth

In the previous exercise we used se = FALSE in stat_smooth() to remove the 95% Confidence Interval. Here we’ll consider another argument, span, used in LOESS smoothing, and we’ll take a look at a nice scenario of properly mapping different models.

ggplot(mtcars, aes(x = wt, y = mpg)) +
  geom_point() +
  # Add 3 smooth LOESS stats, varying span & color
  stat_smooth(se = FALSE, color = "red", span = 0.9) +
  stat_smooth(se = FALSE, color = "green", span = 0.6) +
  stat_smooth(se = FALSE, color = "blue", span = 0.3)+ theme_gray()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# Amend the plot to color by fcyl
ggplot(mtcars, aes(x = wt, y = mpg)) +
  geom_point() +
  # Add a smooth LOESS stat, no ribbon
 stat_smooth(method = "lm", se = FALSE) +
  # Add a smooth lin. reg. stat, no ribbon
    stat_smooth(method = "lm", se = FALSE)+ theme_gray()
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

# Amend the plot
ggplot(mtcars, aes(x = wt, y = mpg, color = fcyl)) +
  geom_point() +
  # Map color to dummy variable "All"
  stat_smooth(aes(color = "All"), se = FALSE) +
  stat_smooth(method = "lm", se = FALSE)+ theme_gray()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

5.4 Quantiles

Here, we’ll continue with the Vocab dataset and use stat_quantile() to apply a quantile regression.

ggplot(Vocab, aes(x = education, y = vocabulary)) +
  geom_jitter(alpha = 0.25) +
  # Add a quantile stat, at 0.05, 0.5, and 0.95
  stat_quantile(quantiles = c(0.05, 0.5, 0.95))
## Warning: Removed 1 rows containing non-finite values (stat_quantile).
## Smoothing formula not specified. Using: y ~ x
## Warning: Removed 1 rows containing missing values (geom_point).

# Amend the plot to color by year_group
ggplot(Vocab, aes(x = education, y = vocabulary, color = year)) +
  geom_jitter(alpha = 0.25) +
  stat_quantile(quantiles = c(0.05, 0.5, 0.95)) + theme_gray()
## Warning: Removed 1 rows containing non-finite values (stat_quantile).
## Smoothing formula not specified. Using: y ~ x
## Warning: Removed 1 rows containing missing values (geom_point).

5.5 Using stat_sum

In the Vocab dataset, education and vocabulary are integer variables. In the first course, you saw that this is one of the four causes of overplotting. You’d get a single point at each intersection between the two variables.

One solution, shown in the step 1, is jittering with transparency. Another solution is to use stat_sum(), which calculates the total number of overlapping observations and maps that onto the size aesthetic.

# Run this, look at the plot, then update it
ggplot(Vocab, aes(x = education, y = vocabulary)) +
  # Replace this with a sum stat
  geom_jitter(alpha = 0.25) + theme_gray()
## Warning: Removed 1 rows containing missing values (geom_point).

ggplot(Vocab, aes(x = education, y = vocabulary)) +stat_sum(alpha = 0.25) + theme_gray()
## Warning: Removed 1 rows containing non-finite values (stat_sum).

ggplot(Vocab, aes(x = education, y = vocabulary)) +
  stat_sum() +
  # Add a size scale, from 1 to 10
  scale_size(range=c(1, 10)) + theme_gray()
## Warning: Removed 1 rows containing non-finite values (stat_sum).

# Amend the stat to use proportion sizes
ggplot(Vocab, aes(x = education, y = vocabulary)) +
  stat_sum(aes(size = ..prop..))  + theme_gray() 
## Warning: Removed 1 rows containing non-finite values (stat_sum).

# Amend the plot to group by education
ggplot(Vocab, aes(x = education, y = vocabulary, group = education)) +
  stat_sum(aes(size = ..prop..)) + theme_gray()
## Warning: Removed 1 rows containing non-finite values (stat_sum).

5.6 Stats outside geoms

In the following exercises, we’ll aim to make the plot shown in the viewer. Here, we’ll establish our positions and base layer of the plot.

Establishing these items as independent objects will allow us to recycle them easily in many layers, or plots.

  • position_jitter() adds jittering (e.g. for points).
  • position_dodge() dodges geoms, (e.g. bar, col, boxplot, violin, errorbar, pointrange).
  • position_jitterdodge() jitters and dodges geoms, (e.g. points).
posn_j <- position_jitter(width = 0.2)
posn_d <- position_dodge(width = 0.1)
posn_jd <- position_jitterdodge(jitter.width = 0.2, dodge.width = 0.1)

# Create the plot base: wt vs. fcyl, colored by fam
p_wt_vs_fcyl_by_fam <- ggplot(mtcars, aes(x= fcyl, y=wt, color = fam)) + theme_gray()

# Add a point layer
p_wt_vs_fcyl_by_fam + geom_point() + theme_gray()

# Add jittering only
p_wt_vs_fcyl_by_fam_jit <- p_wt_vs_fcyl_by_fam + geom_point(position = posn_j) + theme_gray()
p_wt_vs_fcyl_by_fam_jit 

# Add jittering only
p_wt_vs_fcyl_by_fam + geom_point(position = posn_j) + theme_gray()

# Add dodging only
p_wt_vs_fcyl_by_fam + geom_point(position = posn_d) + theme_gray()

# Add jittering and dodging
p_wt_vs_fcyl_by_fam + geom_point(position = posn_jd) + theme_gray()

###5.7 Plotting variations The preparation is done; now let’s explore stat_summary().

Summary statistics refers to a combination of location (mean or median) and spread (standard deviation or confidence interval).

These metrics are calculated in stat_summary() by passing a function to the fun.data argument. mean_sdl(), calculates multiples of the standard deviation and mean_cl_normal() calculates the t-corrected 95% CI.

p_wt_vs_fcyl_by_fam_jit +
  # Add a summary stat of std deviation limits
  stat_summary(fun.data = mean_sdl, fun.args = list(mult = 1), position = posn_d) + theme_gray()

p_wt_vs_fcyl_by_fam_jit +
  # Change the geom to be an errorbar
  stat_summary(fun.data = mean_sdl, fun.args = list(mult = 1), position = posn_d, geom = "errorbar") + theme_gray()

p_wt_vs_fcyl_by_fam_jit +
  # Add a summary stat of normal confidence limits
 stat_summary(fun.data = mean_cl_normal, position = posn_d) + theme_gray()

6. Coordinates

The Coordinates layers offer specific and very useful tools for efficiently and accurately communicating data. Here we’ll look at the various ways of effectively using these layers, so you can clearly visualize lognormal datasets, variables with units, and periodic data.

head(iris)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
iris.smooth <- ggplot(iris, aes(x = Sepal.Length, y= Sepal.Width, 
        color = Species)) + geom_point(alpha = 0.7) + geom_smooth() + theme_gray()
iris.smooth
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

iris.smooth + scale_x_continuous(limits = c(4.5, 5.5)) + theme_gray()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 95 rows containing non-finite values (stat_smooth).
## Warning: Removed 95 rows containing missing values (geom_point).

iris.smooth + xlim(c(4.5, 5.5)) + theme_gray()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 95 rows containing non-finite values (stat_smooth).
## Removed 95 rows containing missing values (geom_point).

iris.smooth + coord_cartesian(xlim = c(4.5, 5.5)) + theme_gray()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

6.1 Zooming In

The big difference is that the scale functions change the underlying dataset, which affects calculations made by computed geoms (like histograms or smooth trend lines), whereas coordinate functions make no changes to the dataset.

# Run the code, view the plot, then update it
mtcars_wt_hp <- ggplot(mtcars, aes(x = wt, y = hp, color = fam)) +
  geom_point() + geom_smooth() +
  # Add a continuous x scale from 3 to 6
  scale_x_continuous(limit = c(3, 6)) + theme_gray()
mtcars_wt_hp
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 12 rows containing non-finite values (stat_smooth).
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : span too small. fewer data values than degrees of freedom.
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : at 3.168
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : radius 4e-06
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : all data on boundary of neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 3.168
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 0.002
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : at 3.572
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : radius 4e-06
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : all data on boundary of neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 4e-06
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : zero-width neighborhood. make span bigger

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : zero-width neighborhood. make span bigger
## Warning: Computation failed in `stat_smooth()`:
## NA/NaN/Inf in foreign function call (arg 5)
## Warning: Removed 12 rows containing missing values (geom_point).

mtcars_wt_hp + coord_cartesian(xlim = c(3, 6)) + theme_gray()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 12 rows containing non-finite values (stat_smooth).
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : span too small. fewer data values than degrees of freedom.
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : at 3.168
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : radius 4e-06
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : all data on boundary of neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 3.168
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 0.002
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : at 3.572
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : radius 4e-06
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : all data on boundary of neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 4e-06
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : zero-width neighborhood. make span bigger

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : zero-width neighborhood. make span bigger
## Warning: Computation failed in `stat_smooth()`:
## NA/NaN/Inf in foreign function call (arg 5)
## Warning: Removed 12 rows containing missing values (geom_point).

Aspect ratio I: 1:1 ratios We can set the aspect ratio of a plot with coord_fixed(), which uses ratio = 1 as a default. A 1:1 aspect ratio is most appropriate when two continuous variables are on the same scale, as with the iris dataset.

ggplot(iris, aes(x = Sepal.Length, y = Sepal.Width, color = Species)) +
  geom_jitter() +
  geom_smooth(method = "lm", se = FALSE) + theme_gray()+
  # Fix the coordinate ratio
coord_fixed(1)
## `geom_smooth()` using formula 'y ~ x'

6.2 Expand and clip

The coord_*() layer functions offer two useful arguments that work well together: expand and clip.

  • expand sets a buffer margin around the plot, so data and axes don’t overlap. Setting expand to 0 draws the axes to the limits of the data.
  • clip decides whether plot elements that would lie outside the plot panel are displayed or ignored (“clipped”).
ggplot(mtcars, aes(wt, mpg)) + geom_point(size = 2) +
  # Add Cartesian coordinates with zero expansion
   theme_gray() + coord_cartesian(expand = 0)

ggplot(mtcars, aes(wt, mpg)) + geom_point(size = 2) +
  # Turn clipping off
  coord_cartesian(expand = 0, clip = "off") + theme_gray() +
  # Remove axis lines
  theme(axis.line= element_blank())

6.3 Transform the Raw Data and Log-transforming scales

ggplot(msleep, aes(x = bodywt, y = 1)) + geom_jitter() + theme_gray() + 
  scale_x_continuous(limits = c(0, 7000), breaks = seq(0, 7000, 1000)) +
  labs(x = "Body Weight (kg)", title = "Raw data") + theme_gray()

ggplot(msleep, aes(x =log10(bodywt) , y = 1)) + geom_jitter() +
  scale_x_continuous(limits = c(-3, 4), breaks = -3:4) +
  labs(x = "Body Weight log10(kg)", title = "log10 trans of raw walues")+ theme_gray()

ggplot(msleep, aes(x =log10(bodywt) , y = 1)) + geom_jitter() +
  scale_x_continuous(limits = c(-3, 4), breaks = -3:4) + 
  annotation_logticks(sides = "b")

  labs(x = "Body Weight log10(kg)", title = "log10 trans of raw walues")+ theme_gray()
## NULL
ggplot(msleep, aes(x =bodywt , y = 1)) + geom_jitter() +
  scale_x_log10(limits = c(1e-03, 1e+4)) + 
  labs(x = "Body Weight log10(kg)", title = "log10 trans using scale_x_log10()")+ theme_gray()

ggplot(msleep, aes(x =bodywt , y = 1)) + geom_jitter() +
  coord_trans(x = "log10") + 
  labs(x = "Body Weight log10(kg)", title = "log10 trans using cord_tans()")+ theme_gray()

Using scale_y_log10() and scale_x_log10() is equivalent to transforming our actual dataset before getting to ggplot2.

Using coord_trans(), setting x = “log10” and/or y = “log10” arguments, transforms the data after statistics have been calculated. The plot will look the same as with using scale_*_log10(), but the scales will be different, meaning that we’ll see the original values on our log10 transformed axes. This can be useful since log scales can be somewhat unintuitive.

# Produce a scatter plot of brainwt vs. bodywt
ggplot(msleep, aes(x = bodywt, y = brainwt)) + geom_point() +
  ggtitle("Raw Values") + theme_gray()
## Warning: Removed 27 rows containing missing values (geom_point).

# Add scale_*_*() functions
ggplot(msleep, aes(bodywt, brainwt)) + geom_point() +
  scale_x_log10() + scale_y_log10() +
  ggtitle("Scale_x_log10 functions") + theme_gray()
## Warning: Removed 27 rows containing missing values (geom_point).

# Perform a log10 coordinate system transformation
ggplot(msleep, aes(bodywt, brainwt)) +
  geom_point() +
  coord_trans(x = "log10", y = "log10") + ggtitle("Cord_trans log10()") + theme_gray()
## Warning: Removed 27 rows containing missing values (geom_point).

6.4 Adding stats to transformed scales

# Plot with a scale_*_*() function:
ggplot(msleep, aes(bodywt, brainwt)) + geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  # Add a log10 x scale
  scale_x_log10() +
  # Add a log10 y scale
  scale_y_log10() +
  ggtitle("Scale_ functions")+ theme_gray()
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 27 rows containing non-finite values (stat_smooth).
## Warning: Removed 27 rows containing missing values (geom_point).

# Plot with transformed coordinates
ggplot(msleep, aes(bodywt, brainwt)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  # Add a log10 coordinate transformation for x and y axes
  coord_trans(x= "log10", y = "log10")+ theme_gray()
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 27 rows containing non-finite values (stat_smooth).
## Removed 27 rows containing missing values (geom_point).

6.5 Useful double axes

Double x and y-axes are a contentious topic in data visualization. We’ll revisit that discussion at the end of chapter 4. Here, I want to review a great use case where double axes actually do add value to a plot.

Our goal plot is displayed in the viewer. The two axes are the raw temperature values on a Fahrenheit scale and the transformed values on a Celsius scale.

You can imagine a similar scenario for Log-transformed and original values, miles and kilometers, or pounds and kilograms. A scale that is unintuitive for many people can be made easier by adding a transformation as a double axis.

airquality <- airquality %>% 
  mutate(Date = make_date(1974, Month, Day))

# Using airquality, plot Temp vs. Date, # Add a line layer
ggplot(airquality, aes(Month, y = Temp)) + geom_line() +
  labs(x = "Date (1973)", y = "Fahrenheit")+ theme_gray()

# Define breaks (Fahrenheit), # Convert y_breaks from Fahrenheit to Cels.
y_breaks <- c(59, 68, 77, 86, 95, 104)
y_labels <- (y_breaks - 32) * 5 / 9
# Create a secondary x-axis
secondary_y_axis <- sec_axis(
  # Use identity transformation
  trans = identity, name = "Celsius",
  # Define breaks and labels as above
  breaks = y_breaks, labels = y_labels)
# Examine the object
secondary_y_axis
## <ggproto object: Class AxisSecondary, gg>
##     axis: NULL
##     break_info: function
##     breaks: 59 68 77 86 95 104
##     create_scale: function
##     detail: 1000
##     empty: function
##     guide: waiver
##     init: function
##     labels: 15 20 25 30 35 40
##     make_title: function
##     mono_test: function
##     name: Celsius
##     trans: function
##     transform_range: function
##     super:  <ggproto object: Class AxisSecondary, gg>
# From previous step
y_breaks <- c(59, 68, 77, 86, 95, 104)
y_labels <- (y_breaks - 32) * 5 / 9
secondary_y_axis <- sec_axis(
  trans = identity, name = "Celsius",
  breaks = y_breaks, labels = y_labels)

# Update the plot
ggplot(airquality, aes(x = Date, y = Temp)) +
  geom_line() +
  # Add the secondary y-axis 
  scale_y_continuous(sec.axis = secondary_y_axis) +
  labs(x = "Date (1973)", y = "Fahrenheit") + theme_gray()

6.4 Flipping axes I

Flipping axes means to reverse the variables mapped onto the x and y aesthetics. We can just change the mappings in aes(), but we can also use the coord_flip() layer function.

There are two reasons to use this function: * We want a vertical geom to be horizontal, or * We’ve completed a long series of plotting functions and want to flip it without having to rewrite all our commands.

# Plot fcyl bars, filled by fam, # Place bars side by side
ggplot(mtcars, aes(fcyl, fill = fam)) + geom_bar(position = "dodge")+ theme_gray()

ggplot(mtcars, aes(fcyl, fill = fam)) + geom_bar(position = "dodge") +
  # Flip the x and y coordinates
  coord_flip()  + theme_gray()

ggplot(mtcars, aes(fcyl, fill = fam)) +
  # Set a dodge width of 0.5 for partially overlapping bars
  geom_bar(position = position_dodge(width = 0.5)) + coord_flip()+ theme_gray()

# Plot of wt vs. car
ggplot(mtcars, aes(x=car, y=wt)) +
  # Add a point layer
  geom_point() + labs(x = "car", y = "weight")+ theme_gray()

# Flip the axes to set car to the y axis
ggplot(mtcars, aes(car, wt)) + geom_point() +
  labs(x = "car", y = "weight") + coord_flip() + theme_gray()

6.5 Polar coordinates

The coord_polar() function converts a planar x-y Cartesian plot to polar coordinates. This can be useful if you are producing pie charts.

We can imagine two forms for pie charts - the typical filled circle, or a colored ring.

Typical pie charts omit all of the non-data ink. Pie charts are not really better than stacked bar charts.

# Run the code, view the plot, then update it
ggplot(mtcars, aes(x = 1, fill = fcyl)) + geom_bar() +
  # Add a polar coordinate system
  coord_polar(theta = "y")+ theme_gray()

ggplot(mtcars, aes(x = 1, fill = fcyl)) + geom_bar(width = 0.1) +
  coord_polar(theta = "y") +
  # Add a continuous x scale from 0.5 to 1.5
  scale_x_continuous(limits = c(0.5, 1.5))+ theme_gray()

6.6 Wind rose plots

Polar coordinate plots are well-suited to scales like compass direction or time of day. A popular example is the “wind rose”.

# grab a table of values from the web with a little scraping
url <- 'http://snowfence.umn.edu/Components/winddirectionanddegreeswithouttable3.htm'
page <- read_html(url)
directions_raw <- page %>% html_node('td table') %>% html_table(header = TRUE)
directions <- directions_raw %>% 
    set_names(~tolower(sub(' Direction', '', .x))) %>% 
    slice(-1) %>% 
    separate(degree, c('degree_min', 'degree_max'), sep = '\\s+-\\s+', convert = TRUE)
directions
## # A tibble: 16 × 3
##    cardinal degree_min degree_max
##    <chr>         <dbl>      <dbl>
##  1 N             349.        11.2
##  2 NNE            11.2       33.8
##  3 NE             33.8       56.2
##  4 ENE            56.2       78.8
##  5 E              78.8      101. 
##  6 ESE           101.       124. 
##  7 SE            124.       146. 
##  8 SSE           146.       169. 
##  9 S             169.       191. 
## 10 SSW           191.       214. 
## 11 SW            214.       236. 
## 12 WSW           236.       259. 
## 13 W             259.       281. 
## 14 WNW           281.       304. 
## 15 NW            304.       326. 
## 16 NNW           326.       349.
wind <- mydata %>% 
    mutate(wd_cardinal = cut(
        wd, 
        breaks = c(0, directions$degree_max, 360), 
        labels = c(directions$cardinal, 'N')
    ), ws_interval = cut(ws,
                         breaks = c(0,2,4,6,8,10,12,14),
                         include.lowest=T))

# Using wind, plot wd filled by ws
ggplot(wind, aes(wd_cardinal, fill = ws_interval)) +
  # Add a bar layer with width 1
  geom_bar(width = 1)+ theme_gray()

# Convert to polar coordinates:
ggplot(wind, aes(wd_cardinal, fill = ws_interval)) +
  geom_bar(width = 1) +
  coord_polar()+ theme_gray()

# Convert to polar coordinates:
ggplot(wind, aes(wd_cardinal, fill = ws_interval)) +
  geom_bar(width = 1) +
  coord_polar(start = -pi/16) + theme_gray()

7. Facet

Faceting splits the data up into groups, according to a categorical variable, then plots each group in its own panel. For splitting the data by one or two categorical variables, facet_grid() is best.

ggplot(mtcars, aes(wt, mpg)) +  geom_point() +
  # Facet rows by am
  facet_grid(rows = vars(am)) + theme_gray()

ggplot(mtcars, aes(wt, mpg)) + geom_point() +
  # Facet columns by cyl
 facet_grid(cols = vars(cyl))  + theme_gray()

ggplot(mtcars, aes(wt, mpg)) +  geom_point() +
  # Facet rows by am and columns by cyl
  facet_grid(rows = vars(am), cols = vars(cyl)) + theme_gray()

7.1 Many variables

In addition to aesthetics, facets are another way of encoding factor (i.e. categorical) variables. They can be used to reduce the complexity of plots with many variables.

Two variables are mapped onto the color aesthetic, using hue and lightness. To achieve this we combined fcyl and fam into a single interaction variable, fcyl_fam. This will allow us to take advantage of Color Brewer’s Paired color palette.

mtcars <- mtcars %>% 
  mutate(fcyl_fam = interaction(fcyl, fam, sep=":"))
# See the interaction column
mtcars$fcyl_fam
##  [1] 6:1 6:1 4:1 6:0 8:0 6:0 8:0 4:0 4:0 6:0 6:0 8:0 8:0 8:0 8:0 8:0 8:0 4:1 4:1
## [20] 4:1 4:0 8:0 8:0 8:0 8:0 4:1 4:1 4:1 8:1 6:1 8:1 4:1
## Levels: 4:0 6:0 8:0 4:1 6:1 8:1
# Color the points by fcyl_fam
ggplot(mtcars, aes(x = wt, y = mpg, color = fcyl_fam)) + geom_point() +
  # Use a paired color palette
  scale_color_brewer(palette = "Paired") + theme_gray()

# Update the plot to map disp to size
ggplot(mtcars, aes(x = wt, y = mpg, color = fcyl_fam, size = disp)) +
  geom_point() + scale_color_brewer(palette = "Paired") + theme_gray()

# Update the plot
ggplot(mtcars, aes(x = wt, y = mpg, color = fcyl_fam, size = disp)) +
  geom_point() + scale_color_brewer(palette = "Paired") +
  # Grid facet on gear and vs
  facet_grid(rows = vars(gear), cols = vars(vs)) + theme_gray()

7.2 Formula notation

# facet_grid(rows = vars(A))    facet_grid(A ~ .)
# facet_grid(cols = vars(B))    facet_grid(. ~ B)
# facet_grid(rows = vars(A), cols = vars(B))    facet_grid(A ~ B)
ggplot(mtcars, aes(wt, mpg)) +  geom_point() +
  # Facet rows by am using formula notation
 facet_grid(am ~ .)  + theme_gray()

ggplot(mtcars, aes(wt, mpg)) + geom_point() +
  # Facet columns by cyl using formula notation
  facet_grid(. ~ cyl) + theme_gray()

ggplot(mtcars, aes(wt, mpg)) + geom_point() +
  # Facet rows by am and columns by cyl using formula notation
  facet_grid(am ~ cyl) + theme_gray()

7.3 Labeling facets

If our factor levels are not clear, your facet labels may be confusing. We can assign proper labels in your original data before plotting or we can use the labeller argument in the facet layer.

The default value is= * label_value: Default, displays only the value

Common alternatives are:

  • label_both: Displays both the value and the variable name
  • label_context: Displays only the values or both the values and variables depending on whether multiple factors are faceted
# Plot wt by mpg
ggplot(mtcars, aes(wt, mpg)) + geom_point() +
  # The default is label_value
 facet_grid(cols= vars(cyl)) + theme_gray()

ggplot(mtcars, aes(wt, mpg)) + geom_point() +
  # Displaying both the values and the variables
  facet_grid(cols = vars(cyl), labeller = label_both) + theme_gray()

ggplot(mtcars, aes(wt, mpg)) + geom_point() +
  # Label context
  facet_grid(cols = vars(cyl), labeller = label_context) + theme_gray()

ggplot(mtcars, aes(wt, mpg)) + geom_point() +
  # Two variables
  facet_grid(cols = vars(vs, cyl), labeller = label_context) + theme_gray()

7.4 Setting order

If you want to change the order of your facets, it’s best to properly define your factor variables before plotting.

Let’s see this in action with the mtcars transmission variable am. In this case, 0 = “automatic” and 1 = “manual”.

Here, we’ll make am a factor variable and relabel the numbers to proper names. The default order is alphabetical. To rearrange them we’ll call fct_rev() from the forcats package to reverse the order.

# Make factor, set proper labels explictly
mtcars$fam <- factor(mtcars$am, labels = c(`0` = "automatic",
                                           `1` = "manual"))
# Default order is alphabetical
ggplot(mtcars, aes(wt, mpg)) + geom_point() +
  facet_grid(cols = vars(fam)) + theme_gray()

# Make factor, set proper labels explictly, and
# manually set the label order
mtcars$fam <- factor(mtcars$am,
                     levels = c(0, 1),
                     labels = c("manual", "automatic"))
# View again
ggplot(mtcars, aes(wt, mpg)) +
  geom_point() +
  facet_grid(cols = vars(fam)) + theme_gray()

7.5 Facet plotting spaces

Continuous variables

By default every facet of a plot has the same axes. If the data ranges vary wildly between facets, it can be clearer if each facet has its own scale. This is achieved with the scales argument to facet_grid().

  • “fixed” (default): axes are shared between facets.
  • free: each facet has its own axes.
  • free_x: each facet has its own x-axis, but the y-axis is shared.
  • free_y: each facet has its own y-axis, but the x-axis is shared.

When faceting by columns, “free_y” has no effect, but we can adjust the x-axis. In contrast, when faceting by rows, “free_x” has no effect, but we can adjust the y-axis.

ggplot(mtcars, aes(wt, mpg)) + geom_point() + 
  # Facet columns by cyl 
  facet_grid(cols = vars(cyl)) + theme_gray()

ggplot(mtcars, aes(wt, mpg)) + geom_point() + 
  # Update the faceting to free the x-axis scales
  facet_grid(cols = vars(cyl), scales ="free_x") + theme_gray()

ggplot(mtcars, aes(wt, mpg)) + geom_point() + 
  # Swap cols for rows; free the y-axis scales
  facet_grid(rows = vars(cyl), scales = "free_y") + theme_gray()

Categorical variables

When you have a categorical variable with many levels which are not all present in each sub-group of another variable, it’s usually desirable to drop the unused levels.

By default, each facet of a plot is the same size. This behavior can be changed with the spaces argument, which works in the same way as scales: “free_x” allows different sized facets on the x-axis, “free_y”, allows different sized facets on the y-axis, “free” allows different sizes in both directions.

ggplot(mtcars, aes(x = mpg, y = car, color = fam)) + geom_point() +
  # Facet rows by gear
  facet_grid(rows = vars(gear)) + theme_gray()

ggplot(mtcars, aes(x = mpg, y = car, color = fam)) + geom_point() +
  # Free the y scales and space
  facet_grid(rows = vars(gear), scales ="free_y", space = "free_y") + theme_gray()

mtcars2 <- mtcars %>% 
  # arrange from lo to hi
  arrange(-mpg) %>% 
  # redefine in factor leveles
  mutate(car = as_factor(car))

ggplot(mtcars2, aes(x = mpg, y = car, color = fam)) +
  geom_point() +
  # Free the y scales and space
  facet_grid(rows = vars(gear),
  scales = "free_y",
  space = "free_y") + theme_gray()

7.6 Facet wrap & margins

facet_grid() is fantastic for categorical variables with a small number of levels. Although it is possible to facet variables with many levels, the resulting plot will be very wide or very tall, which can make it difficult to view.

The solution is to use facet_wrap() which separates levels along one axis but wraps all the subsets across a given number of rows or columns.

ggplot(Vocab, aes(x = education, y = vocabulary)) + stat_smooth(method = "lm", se = FALSE) +
  # Create facets, wrapping by year, using vars()
  facet_wrap(vars(year))+ theme_gray()
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).

ggplot(Vocab, aes(x = education, y = vocabulary)) + stat_smooth(method = "lm", se = FALSE) +
  # Create facets, wrapping by year, using a formula
 facet_wrap(~ year)+ theme_gray()
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).

ggplot(Vocab, aes(x = education, y = vocabulary)) + stat_smooth(method = "lm", se = FALSE) +
  # Update the facet layout, using 11 columns
  facet_wrap(~ year, ncol = 11)+ theme_gray()
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).

7.7 Margin plots

Facets are great for seeing subsets in a variable, but sometimes you want to see both those subsets and all values in a variable.

Here, the margins argument to facet_grid() is your friend.

  • FALSE (default): no margins.
  • TRUE: add margins to every variable being faceted by.
  • c(“variable1”, “variable2”): only add margins to the variables listed.
mtcars$fvs <- factor(mtcars$vs)

ggplot(mtcars, aes(x = wt, y = mpg)) +  geom_point() +
  # Facet rows by fvs and fam, and cols by gear
   facet_grid(rows = vars(fvs, fam), cols = vars(gear))

ggplot(mtcars, aes(x = wt, y = mpg)) + 
  geom_point() +
  # Update the facets to add margins
  facet_grid(rows = vars(fvs, fam), cols = vars(gear), margins = TRUE)

ggplot(mtcars, aes(x = wt, y = mpg)) +  geom_point() +
  # Update the facets to only show margins on fam
  facet_grid(rows = vars(fvs, fam), cols = vars(gear), margins = "fam")

ggplot(mtcars, aes(x = wt, y = mpg)) +  geom_point() +
  # Update the facets to only show margins on gear and fvs
  facet_grid(rows = vars(fvs, fam), cols = vars(gear), margins = c("gear", "fvs"))

8. Best Practices

8.1 Bar plots

Dynamite plots

“Dynamite plots” (bar plots with error bars) are not well suited for their intended purpose of depicting distributions. If we really want error bars on bar plots, we can of course get them, but we’ll need to set the positions manually. A point geom will typically serve you much better.

# Plot wt vs. fcyl
ggplot(mtcars, aes(x = fcyl, y = wt)) +
  # Add a bar summary stat of means, colored skyblue
  stat_summary(fun.y = mean, geom = "bar", fill = "skyblue") +
  # Add an errorbar summary stat std deviation limits
  stat_summary(fun.data = mean_sdl, fun.args = list(mult = 1), geom = "errorbar", width = 0.1) + theme_gray()
## Warning: `fun.y` is deprecated. Use `fun` instead.

Position dodging we will add a distinction between transmission type, fam, for the dynamite plots and explore position dodging (where bars are side-by-side).

# Update the aesthetics to color and fill by fam
ggplot(mtcars, aes(x = fcyl, y = wt, color = fam, fill = fam)) +
  stat_summary(fun.y = mean, geom = "bar") +
  stat_summary(fun.data = mean_sdl, fun.args = list(mult = 1), geom = "errorbar", width = 0.1) + theme_gray()
## Warning: `fun.y` is deprecated. Use `fun` instead.

# Set alpha for the first and set position for each stat summary function
ggplot(mtcars, aes(x = fcyl, y = wt, color = fam, fill = fam)) +
  stat_summary(fun.y = mean, geom = "bar", alpha = 0.5, position = "dodge") +
  stat_summary(fun.data = mean_sdl, fun.args = list(mult = 1), geom = "errorbar", position = "dodge", width = 0.1) + theme_gray()
## Warning: `fun.y` is deprecated. Use `fun` instead.

# Define a dodge position object with width 0.9
posn_d <- position_dodge(width = 0.9)

# For each summary stat, update the position to posn_d
ggplot(mtcars, aes(x = fcyl, y = wt, color = fam, fill = fam)) +
  stat_summary(fun.y = mean, geom = "bar", position = posn_d, alpha = 0.5) +
  stat_summary(fun.data = mean_sdl, fun.args = list(mult = 1), width = 0.1, position = posn_d, geom = "errorbar") + theme_gray()
## Warning: `fun.y` is deprecated. Use `fun` instead.

8.2 Using aggregated data

If it is appropriate to use bar plots, then it nice to give an impression of the number of values in each group.

stat_summary() doesn’t keep track of the count. stat_sum() does (that’s the whole point), but it’s difficult to access. It’s more straightforward to calculate exactly what we want to plot ourselves.

mtcars_by_cyl <- mtcars %>%
  group_by(cyl) %>% 
  summarize(mean_wt = mean(wt),
          sd_wt = sd(wt),
          n_wt = n()) %>% 
          mutate( prop = n_wt/sum(n_wt))
mtcars_by_cyl
## # A tibble: 3 × 5
##     cyl mean_wt sd_wt  n_wt  prop
##   <dbl>   <dbl> <dbl> <int> <dbl>
## 1     4    2.29 0.570    11 0.344
## 2     6    3.12 0.356     7 0.219
## 3     8    4.00 0.759    14 0.438
# Using mtcars_cyl, plot mean_wt vs. cyl
ggplot(mtcars_by_cyl, aes(cyl, mean_wt)) +
  # Add a bar layer with identity stat, filled skyblue
  geom_bar(stat = "identity", fill = "skyblue") + theme_gray()

ggplot(mtcars_by_cyl, aes(x = cyl, y = mean_wt)) +
  # Swap geom_bar() for geom_col()
  geom_col(fill = "skyblue") + theme_gray()

ggplot(mtcars_by_cyl, aes(x = cyl, y = mean_wt)) +
  # Set the width aesthetic to prop
  geom_col(fill = "skyblue", aes(width = prop)) + theme_gray()
## Warning: Ignoring unknown aesthetics: width

ggplot(mtcars_by_cyl, aes(x = cyl, y = mean_wt)) +
  geom_col(aes(width = prop), fill = "skyblue") +
  # Add an errorbar layer
  geom_errorbar(
    # ... at mean weight plus or minus 1 std dev
    aes(ymin = mean_wt - sd_wt,
    ymax = mean_wt + sd_wt),
    # with width 0.1
    width = 0.1
  ) + theme_gray()
## Warning: Ignoring unknown aesthetics: width

8.3 Heat maps

Since heat maps encode color on a continuous scale, they are difficult to accurately decode. Hence, heat maps are most useful if you have a small number of boxes and/or a clear pattern that allows you to overcome decoding difficulties.

To produce them, map two categorical variables onto the x and y aesthetics, along with a continuous variable onto fill. The geom_tile() layer adds the boxes.

library(lattice)
str(barley)
## 'data.frame':    120 obs. of  4 variables:
##  $ yield  : num  27 48.9 27.4 39.9 33 ...
##  $ variety: Factor w/ 10 levels "Svansota","No. 462",..: 3 3 3 3 3 3 7 7 7 7 ...
##  $ year   : Factor w/ 2 levels "1932","1931": 2 2 2 2 2 2 2 2 2 2 ...
##  $ site   : Factor w/ 6 levels "Grand Rapids",..: 3 6 4 5 1 2 3 6 4 5 ...
# Using barley, plot variety vs. year, filled by yield
ggplot(barley, aes(year, variety, fill = yield))+
  # Add a tile geom
  geom_tile() + theme_gray()

ggplot(barley, aes(x = year, y = variety, fill = yield)) + geom_tile() + 
  # Facet, wrapping by site, with 1 column
  facet_wrap(facets = vars(site), ncol = 1) +
  # Add a fill scale using an 2-color gradient
  scale_fill_gradient(low = "white", high = "red") + theme_gray()

# A palette of 9 reds
red_brewer_palette <- brewer.pal(9, "Reds")
# Update the plot
ggplot(barley, aes(x = year, y = variety, fill = yield)) + geom_tile() + 
  facet_wrap(facets = vars(site), ncol = 1) +
  # Update scale to use n-colors from red_brewer_palette
  scale_fill_gradientn(colors = red_brewer_palette ) + theme_gray()

Heat map alternatives

There are several alternatives to heat maps. The best choice really depends on the data and the story you want to tell with this data. If there is a time component, the most obvious choice is a line plot.

# Using barley, plot yield vs. year, colored and grouped by variety
ggplot(barley, aes(x = year, y = yield, color = variety, group = variety)) +
  # Add a line layer
  geom_line() +
  # Facet, wrapping by site, with 1 row
  facet_wrap( ~ site, nrow = 1) + theme_gray()

# Using barely, plot yield vs. year, colored, grouped, and filled by site
ggplot(barley, aes(x = year, y = yield, color = site, group = site, fill = site)) +
  # Add a line summary stat aggregated by mean
  stat_summary(fun.y = mean, geom = "line") +
  # Add a ribbon summary stat with 10% opacity, no color
  stat_summary(fun.data = mean_sdl, fun.args = list(mult = 1), geom = "ribbon", alpha = 0.1, color = NA) + theme_gray()
## Warning: `fun.y` is deprecated. Use `fun` instead.

8.4 When good data makes bad plots

When you first encounter a data visualization, either from yourself or a colleague, you always want to critically ask if it’s obscuring the data in any way.

Let’s take a look at the steps we could take to produce and improve the plot in the view.

The data comes from an experiment where the effect of two different types of vitamin C sources, orange juice or ascorbic acid, were tested on the growth of the odontoblasts (cells responsible for tooth growth) in 60 guinea pigs.

The data is stored in the TG data frame, which contains three variables: dose, len, and supp.

library(datasets)
TG = (ToothGrowth)
str(TG)
## 'data.frame':    60 obs. of  3 variables:
##  $ len : num  4.2 11.5 7.3 5.8 6.4 10 11.2 11.2 5.2 7 ...
##  $ supp: Factor w/ 2 levels "OJ","VC": 2 2 2 2 2 2 2 2 2 2 ...
##  $ dose: num  0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 ...
# Initial plot
growth_by_dose <- ggplot(TG, aes(dose, len, color = supp)) +
  stat_summary(fun.data = mean_sdl,
               fun.args = list(mult = 1),
               position = position_dodge(0.1)) 
# View plot
growth_by_dose

# Change type
TG$dose <- as.numeric(as.character(TG$dose))

# Plot
growth_by_dose <- ggplot(TG, aes(dose, len, color = supp)) +
  stat_summary(fun.data = mean_sdl,
               fun.args = list(mult = 1),
               position = position_dodge(0.2))  + theme_gray()
# View plot
growth_by_dose

# Change type
TG$dose <- as.numeric(as.character(TG$dose))

# Plot
growth_by_dose <- ggplot(TG, aes(dose, len, color = supp)) +
  stat_summary(fun.data = mean_sdl,
               fun.args = list(mult = 1),
               position = position_dodge(0.2)) +
  # Use the right geometry
  stat_summary(fun.y = mean,
               geom = "line",
               position = position_dodge(0.1)) + theme_gray()
## Warning: `fun.y` is deprecated. Use `fun` instead.
# View plot
growth_by_dose

# Change type
TG$dose <- as.numeric(as.character(TG$dose))

# Plot
growth_by_dose <- ggplot(TG, aes(dose, len, color = supp)) +
  stat_summary(fun.data = mean_sdl,
               fun.args = list(mult = 1),
               position = position_dodge(0.2)) +
  stat_summary(fun.y = mean,
               geom = "line",
               position = position_dodge(0.1)) +
  theme_classic() +
  # Adjust labels and colors:
  labs(x = "Dose (mg/day)", y = "Odontoblasts length (mean, standard deviation)", color = "Supplement") +
  scale_color_brewer(palette = "Set1", labels = c("Orange juice", "Ascorbic acid")) +
  scale_y_continuous(limits = c(0,35), breaks = seq(0, 35, 5), expand = c(0,0))
## Warning: `fun.y` is deprecated. Use `fun` instead.
# View plot
growth_by_dose

8.5 Density plots

# plot 1: Density of price for each type of cut of the diamond:
ggplot(data=diamonds,aes(x=price, group=cut, fill=cut)) + 
    geom_density(adjust=1.5)

# plot 2: Density plot with transparency (using the alpha argument):
ggplot(data=diamonds,aes(x=price, group=cut, fill=cut)) + 
    geom_density(adjust=1.5 , alpha=0.2)

8.6 Correlation plots

#install.packages("GGally")
library("GGally")
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
# Prepare some data
df <- mtcars[, c(1,3,4,5,6,7)]
# Correlation plot 1
ggcorr(df, palette = "RdBu", label = TRUE)

# Correlation plot 2
ggpairs(df)

The End.

Thanks DataCamp