Data Visualization with ggplot2

Libraries

library(tidyverse)
library(forcats)
library(lubridate)
library(ggthemes)
library(quantreg)
library(openair)
library(lattice)
library(RColorBrewer)
library(rvest)

Data

# Datasets that ship with R (iris, mtcars) and with ggplot2 (diamonds, economics)
data(list = c("iris", "mtcars", "diamonds", "economics"))

# Extra mtcars columns used by the exercises: factor versions of cyl and am,
# and the car names (stored as row names) as an ordinary column
mtcars[["fcyl"]] <- factor(mtcars[["cyl"]])
mtcars[["fam"]] <- factor(mtcars[["am"]])
mtcars[["car"]] <- rownames(mtcars)

# Local workspace image (presumably the source of the fish.* objects plotted
# in section 3.6 - confirm against the file's contents)
load("~/R Data/fish.RData")

# Vocab is fetched from GitHub; gm2007 is read from the working directory
Vocab <- read.csv("https://raw.githubusercontent.com/sigmasigmaiota/vocab/master/vocab.csv")
gm2007 <- read.csv("gm2007.csv")

# Preview the first rows of Vocab
head(Vocab)

##   X.1        X year    sex education vocabulary
## 1   1 19740001 1974   Male        14          9
## 2   2 19740002 1974   Male        16          9
## 3   3 19740003 1974 Female        10          9
## 4   4 19740004 1974 Female        10          5
## 5   5 19740005 1974 Female        12          8
## 6   6 19740006 1974   Male        16          8

0. Course Description

The ability to produce meaningful and beautiful data visualizations is an essential part of your skill set as a data scientist. This course, the first R data visualization tutorial in the series, introduces you to the principles of good visualizations and the grammar of graphics plotting concepts implemented in the ggplot2 package.

1. Introduction

In this chapter we’ll get you into the right frame of mind for developing meaningful visualizations with R. You’ll understand that as a communications tool, visualizations require you to think about your audience first. You’ll also be introduced to the basics of ggplot2 - the 7 different grammatical elements (layers) and aesthetic mappings.

# Explore the mtcars data frame with str()
# (prints the dimensions plus each column's type and first few values)
str(mtcars)

## 'data.frame':    32 obs. of  14 variables:
##  $ mpg : num  21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
##  $ cyl : num  6 6 4 6 8 6 8 4 4 6 ...
##  $ disp: num  160 160 108 258 360 ...
##  $ hp  : num  110 110 93 110 175 105 245 62 95 123 ...
##  $ drat: num  3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
##  $ wt  : num  2.62 2.88 2.32 3.21 3.44 ...
##  $ qsec: num  16.5 17 18.6 19.4 17 ...
##  $ vs  : num  0 0 1 1 0 1 0 1 1 1 ...
##  $ am  : num  1 1 1 0 0 0 0 0 0 0 ...
##  $ gear: num  4 4 4 3 3 3 3 4 4 4 ...
##  $ carb: num  4 4 1 1 2 1 4 2 2 4 ...
##  $ fcyl: Factor w/ 3 levels "4","6","8": 2 2 1 2 3 2 3 1 1 2 ...
##  $ fam : Factor w/ 2 levels "0","1": 2 2 2 1 1 1 1 1 1 1 ...
##  $ car : chr  "Mazda RX4" "Mazda RX4 Wag" "Datsun 710" "Hornet 4 Drive" ...

# Scatter plot of mpg against cyl, with cyl left as a numeric variable
ggplot(data = mtcars, mapping = aes(x = cyl, y = mpg)) +
  geom_point()

# Same plot, but wrapping cyl in factor() makes the x-axis discrete
ggplot(data = mtcars, mapping = aes(x = factor(cyl), y = mpg)) +
  geom_point()

1.1 The grammar of graphics

We'll focus on the data, aesthetics, and geom layers. We’ll get to making pretty plots in the last chapter with the themes layer.

# mpg against wt, with displacement shown as point color
ggplot(data = mtcars, mapping = aes(x = wt, y = mpg, color = disp)) +
  geom_point()

# Same data, with displacement shown as point size instead of color
ggplot(data = mtcars, mapping = aes(x = wt, y = mpg, size = disp)) +
  geom_point()

1.2 Adding geometries

The diamonds dataset contains details of 1,000 diamonds. Among the variables included are carat (a measurement of the diamond’s size) and price.

geom_point() adds points (as in a scatter plot).
geom_smooth() adds a smooth trend curve.

# Scatter plot of price against carat
ggplot(data = diamonds, mapping = aes(x = carat, y = price)) +
  geom_point()

# Same scatter plot with a smooth trend curve drawn on top
ggplot(data = diamonds, mapping = aes(x = carat, y = price)) +
  geom_point() +
  geom_smooth()

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

# Color mapped to clarity in ggplot(), so both the point layer and the
# smooth layer inherit it
ggplot(data = diamonds, mapping = aes(x = carat, y = price, color = clarity)) +
  geom_point() +
  geom_smooth()

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

# As above, with the points drawn at 40% opacity
ggplot(data = diamonds, mapping = aes(x = carat, y = price, color = clarity)) +
  geom_point(alpha = 0.4) +
  geom_smooth()

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

# Base plot object: price against carat, no layers yet
plt_price_vs_carat <- ggplot(
  data = diamonds,
  mapping = aes(x = carat, y = price)
)
# Adding a point layer to the stored object and printing it draws the scatter
plt_price_vs_carat +
  geom_point()

# Stored variant whose points are 20% opaque
plt_price_vs_carat_transparent <- plt_price_vs_carat +
  geom_point(alpha = 0.2)
# Print the stored plot
plt_price_vs_carat_transparent

# Stored variant whose point color is mapped to clarity inside the layer
plt_price_vs_carat_by_clarity <- plt_price_vs_carat +
  geom_point(mapping = aes(color = clarity))
# Print the stored plot
plt_price_vs_carat_by_clarity

2. Aesthetics

Aesthetic mappings are the cornerstone of the grammar of graphics plotting concept. This is where the magic happens - converting continuous and categorical data into visual scales that provide access to a large amount of information in a very short time. In this chapter you’ll understand how to choose the best aesthetic mappings for your data.

2.1 All about aesthetics: color, shape and size

These are the aesthetics you can consider within aes() in this chapter: x, y, color, fill, size, alpha, labels and shape.

In the following exercise the fcyl column is categorical. It is cyl transformed into a factor.

# fcyl is the categorical (factor) version of the numeric cyl column
mtcars$fcyl <- factor(mtcars$cyl)

# mpg on the x-axis, one row of points per fcyl level
ggplot(data = mtcars, mapping = aes(x = mpg, y = fcyl)) +
  geom_point()

# The same two variables with the axes exchanged
ggplot(data = mtcars, mapping = aes(x = fcyl, y = mpg)) +
  geom_point()

# mpg against wt, with point color taken from fcyl
ggplot(data = mtcars, mapping = aes(x = wt, y = mpg, color = fcyl)) +
  geom_point()

# Same mapping; shape and size are fixed attributes set on the layer
# (shape 1 is a hollow circle)
ggplot(data = mtcars, mapping = aes(x = wt, y = mpg, color = fcyl)) +
  geom_point(size = 4, shape = 1)

# fcyl mapped to fill instead of color. Shape 1 is hollow and has no fill,
# so every point is drawn in the same default color.
ggplot(data = mtcars, mapping = aes(x = wt, y = mpg, fill = fcyl)) +
  geom_point(size = 4, shape = 1)

# Fill the points by fcyl. Shape 21 is a circle with separate fill and outline
# colors, so the fill mapping is visible; alpha keeps overlapping points
# readable. The factor fcyl is used rather than numeric cyl so the legend
# shows the three discrete cylinder counts instead of a continuous gradient.
ggplot(mtcars, aes(wt, mpg, fill = fcyl)) +
  # Change point shape; set alpha
  geom_point(shape = 21, size = 4, alpha = 0.6)

# Map color to fam
# The outline color follows the factor fam, as the instruction says; the
# numeric am column would produce a continuous color bar instead.
ggplot(mtcars, aes(wt, mpg, fill = fcyl, color = fam)) +
  geom_point(shape = 21, size = 4, alpha = 0.6)

# Base plot shared by the four variations below: mpg against wt
plt_mpg_vs_wt <- ggplot(data = mtcars, mapping = aes(x = wt, y = mpg))

# Variation 1: point size follows fcyl
plt_mpg_vs_wt + geom_point(mapping = aes(size = fcyl))

## Warning: Using size for a discrete variable is not advised.

# Variation 2: point transparency follows fcyl
plt_mpg_vs_wt + geom_point(mapping = aes(alpha = fcyl))

## Warning: Using alpha for a discrete variable is not advised.

# Variation 3: point shape follows fcyl
plt_mpg_vs_wt + geom_point(mapping = aes(shape = fcyl))

# Variation 4: a text layer that prints each car's fcyl value at its position
plt_mpg_vs_wt + geom_text(mapping = aes(label = fcyl))

2.2 Using attributes

This time you’ll use these arguments to set attributes of the plot, not map variables onto aesthetics.

You can specify colors in R using hex codes: a hash followed by two hexadecimal numbers each for red, green, and blue (“#RRGGBB”). Hexadecimal is base-16 counting. You have 0 to 9, and A representing 10 up to F representing 15. Pairs of hexadecimal numbers give you a range from 0 to 255. “#000000” is “black” (no color), “#FFFFFF” means “white”, and “#00FFFF” is cyan (mixed green and blue).

# Hex code for a light blue, applied below as a fixed attribute
my_blue <- "#4ABEFF"

# Every point gets the same color and 60% opacity
ggplot(data = mtcars, mapping = aes(x = wt, y = mpg)) +
  geom_point(alpha = 0.6, color = my_blue)

# cyl is mapped to fill, while color, size and shape are fixed on the layer
ggplot(data = mtcars, mapping = aes(x = wt, y = mpg, fill = cyl)) +
  geom_point(shape = 1, size = 10, color = my_blue)

# Color is mapped to fcyl; the layer only adds 50% transparency
ggplot(data = mtcars, mapping = aes(x = wt, y = mpg, color = fcyl)) +
  geom_point(alpha = 0.5)

# Text layer labelled with the car names; the fixed red set on the layer
# takes precedence over the cyl color mapping
ggplot(data = mtcars, mapping = aes(x = wt, y = mpg, color = cyl)) +
  geom_text(color = "red", label = rownames(mtcars))

# Yellow triangles (shape 24); again the layer-level color takes precedence
# over the fcyl color mapping
ggplot(data = mtcars, mapping = aes(x = wt, y = mpg, color = fcyl)) +
  geom_point(shape = 24, color = "yellow")

# 3 aesthetics: qsec vs. mpg, colored by fcyl
# Columns are referenced by bare name inside aes(). Writing mtcars$fcyl
# bypasses the plot's data argument, makes ggplot2 warn that the usage is
# discouraged, and titles the legend "mtcars$fcyl".
ggplot(mtcars, aes(mpg, qsec, color = fcyl)) +
  geom_point()

# 4 aesthetics: add a mapping of shape to fam
ggplot(mtcars, aes(mpg, qsec, color = fcyl, shape = fam)) +
  geom_point()

# 5 aesthetics: add a mapping of size to hp / wt
ggplot(mtcars, aes(mpg, qsec, color = fcyl, shape = fam, size = hp / wt)) +
  geom_point()

2.3 Modifying aesthetics

In this exercise, you’ll modify some aesthetics to make a bar plot of the number of cylinders for cars with different types of transmission.

You’ll also make use of some functions for improving the appearance of the plot.

labs() to set the x- and y-axis labels. It takes strings for each argument.
scale_color_manual() defines properties of the color scale (i.e. axis). The first argument sets the legend title. values is a named vector of colors to use.

# Stacked bar chart: number of cars per cylinder count, split by am,
# with explicit axis titles
ggplot(data = mtcars, mapping = aes(x = as.factor(cyl), fill = as.factor(am))) +
  geom_bar() +
  labs(y = "Count", x = "Number of Cylinders")

# Colors keyed by transmission label. scale_fill_manual() matches a *named*
# values vector against the levels of the fill variable by name, so the fill
# factor must carry these same labels.
palette <- c(automatic = "#377EB8", manual = "#E41A1C")
# In mtcars, am is coded 0 = automatic, 1 = manual. Relabel it so the levels
# line up with the palette names; the raw "0"/"1" levels match neither name
# and the bars would fall back to the scale's NA color.
ggplot(
  mtcars,
  aes(
    as.factor(cyl),
    fill = factor(am, levels = c(0, 1), labels = c("automatic", "manual"))
  )
) +
  geom_bar() +
  labs(x = "Number of Cylinders", y = "Count") +
  # Set the fill color scale
  scale_fill_manual("Transmission", values = palette)

# Colors keyed by transmission label; a named values vector is matched to the
# fill levels by name.
palette <- c(automatic = "#377EB8", manual = "#E41A1C")
# Set the position
# am (0 = automatic, 1 = manual) is relabelled so its levels match the palette
# names. Every layer is joined with `+`: without the `+` after labs() the
# scale is evaluated as a separate statement, is never applied to the plot,
# and is printed to the console instead.
ggplot(
  mtcars,
  aes(
    as.factor(cyl),
    fill = factor(am, levels = c(0, 1), labels = c("automatic", "manual"))
  )
) +
  geom_bar(position = "dodge") +
  labs(x = "Number of Cylinders", y = "Count") +
  scale_fill_manual("Transmission", values = palette)

2.4 Setting a dummy aesthetic

In the last chapter you saw that all the visible aesthetics can serve as attributes and aesthetics, but I very conveniently left out x and y. That’s because although you can make univariate plots (such as histograms, which you’ll get to in the next chapter), a y-axis will always be provided, even if you didn’t ask for it.

When setting y-axis limits, you can specify the limits as separate arguments, or as a single numeric vector. That is, ylim(lo, hi) or ylim(c(lo, hi)).

# One-dimensional view of mpg: y is the dummy constant 0 and the points are
# jittered so overlapping values can be told apart
ggplot(data = mtcars, mapping = aes(x = mpg, y = 0)) +
  geom_point(position = "jitter")

# The same idea via geom_jitter(), with the y-axis limited to [-2, 2]
ggplot(data = mtcars, mapping = aes(x = mpg, y = 0)) +
  geom_jitter() +
  ylim(c(-2, 2))

3. Geometries

A plot’s geometry dictates what visual elements will be used. In this chapter, we’ll familiarize you with the geometries used in the three most common plot types you’ll encounter - scatter plots, bar charts and line plots. We’ll look at a variety of different ways to construct these plots.

3.1 Scatter plots

Scatter plots (using geom_point()) are intuitive, easily understood, and very common, but we must always consider overplotting, particularly in the following four situations:

Large datasets
Aligned values on a single axis
Low-precision data
Integer data

Typically, alpha blending (i.e. adding transparency) is recommended when using solid shapes. Alternatively, you can use opaque, hollow shapes.

Small points are suitable for large datasets with regions of high density (lots of overlapping).

# Base plot: price against carat with color taken from clarity
plt_price_vs_carat_by_clarity <- ggplot(
  data = diamonds,
  mapping = aes(x = carat, y = price, color = clarity)
)
# Tiny "." points at 50% opacity
plt_price_vs_carat_by_clarity +
  geom_point(shape = ".", size = 1, alpha = 0.5)

# Solid circles (shape 16) at 50% opacity
plt_price_vs_carat_by_clarity +
  geom_point(shape = 16, alpha = 0.5)

# Base plot: mpg per fcyl level, colored by fam
plt_mpg_vs_fcyl_by_fam <- ggplot(
  data = mtcars,
  mapping = aes(x = fcyl, y = mpg, color = fam)
)
# Reference: unadjusted points
plt_mpg_vs_fcyl_by_fam +
  geom_point()

# Reference again, for comparison with the jittered version
plt_mpg_vs_fcyl_by_fam +
  geom_point()

# Points jittered horizontally with width 0.3
plt_mpg_vs_fcyl_by_fam +
  geom_point(position = position_jitter(width = 0.3))

# Reference again, for comparison with the jitter-dodged version
plt_mpg_vs_fcyl_by_fam +
  geom_point()

# Points both jittered (width 0.3) and dodged by group (width 0.3)
plt_mpg_vs_fcyl_by_fam +
  geom_point(
    position = position_jitterdodge(dodge.width = 0.3, jitter.width = 0.3)
  )

You already saw how to deal with overplotting when using geom_point() in two cases:

Large datasets
Aligned values on a single axis

We used position = ‘jitter’ inside geom_point() or geom_jitter().

# Three ways to jitter the iris points (all drawn at 50% opacity)

# 1. A dedicated jitter layer with jitter width 0.1
ggplot(
  data = iris,
  mapping = aes(x = Sepal.Length, y = Sepal.Width, color = Species)
) +
  geom_jitter(alpha = 0.5, width = 0.1)

# 2. A point layer whose position is given as the string "jitter"
ggplot(
  data = iris,
  mapping = aes(x = Sepal.Length, y = Sepal.Width, color = Species)
) +
  geom_point(alpha = 0.5, position = "jitter")

# 3. A point layer using the position function, so the width (0.1) can be set
ggplot(
  data = iris,
  mapping = aes(x = Sepal.Length, y = Sepal.Width, color = Species)
) +
  geom_point(alpha = 0.5, position = position_jitter(width = 0.1))

# vocabulary against education as plain points
ggplot(data = Vocab, mapping = aes(x = education, y = vocabulary)) +
  geom_point()

# The same data as a jitter layer
ggplot(data = Vocab, mapping = aes(x = education, y = vocabulary)) +
  geom_jitter()

# Jitter layer at 20% opacity
ggplot(data = Vocab, mapping = aes(x = education, y = vocabulary)) +
  geom_jitter(alpha = 0.2)

# Jitter layer at 20% opacity drawn with hollow circles (shape 1)
ggplot(data = Vocab, mapping = aes(x = education, y = vocabulary)) +
  geom_jitter(shape = 1, alpha = 0.2)

Notice how jittering and alpha blending serve as a great solution to the overplotting problem here. Setting the shape to 1 didn’t really help, but it was useful in the previous exercises when you had less data. You need to consider each plot individually. You’ll encounter this dataset again when you look at bar plots.

3.2 Histograms

Recall that histograms cut up a continuous variable into discrete bins and, by default, map the internally calculated count variable (the number of observations in each bin) onto the y aesthetic. An internal variable called density can be accessed by using the .. notation, i.e. `..density..`. Plotting this variable will show the relative frequency, which is the height times the width of each bin.

# Histogram of mpg; by default the y-axis shows the per-bin count
ggplot(mtcars, aes(mpg)) +
  # Set the binwidth to 1
  geom_histogram(binwidth = 1)

# Map y to the computed density
# after_stat(density) names the same stat-computed variable as the older
# ..density.. notation, which current ggplot2 releases flag as deprecated.
# after_stat() requires ggplot2 >= 3.3.0.
ggplot(mtcars, aes(mpg, after_stat(density))) +
  geom_histogram(binwidth = 1)

datacamp_light_blue <- "#51A8C9"
ggplot(mtcars, aes(mpg, after_stat(density))) +
  # Set the fill color to datacamp_light_blue
  geom_histogram(binwidth = 1, fill = datacamp_light_blue)

3.3 Positions in histograms

Here, we’ll examine the various ways of applying positions to histograms. geom_histogram(), a special case of geom_bar(), has a position argument that can take on the following values:

stack (the default): Bars for different groups are stacked on top of each other.
dodge: Bars for different groups are placed side by side.
fill: Bars for different groups are shown as proportions.
identity: Plot the values as they appear in the dataset.

# Histogram of mpg with bars filled by fam (default position: stacked)
ggplot(data = mtcars, mapping = aes(x = mpg, fill = fam)) +
  geom_histogram(binwidth = 1)

# Bars of the two fam groups placed side by side
ggplot(data = mtcars, mapping = aes(x = mpg, fill = fam)) +
  geom_histogram(binwidth = 1, position = "dodge")

# Bars rescaled so each bin shows the proportion of each fam group
ggplot(data = mtcars, mapping = aes(x = mpg, fill = fam)) +
  geom_histogram(position = "fill", binwidth = 1)

## Warning: Removed 16 rows containing missing values (geom_bar).

# Bars drawn on top of each other at 40% opacity so both groups stay visible
ggplot(data = mtcars, mapping = aes(x = mpg, fill = fam)) +
  geom_histogram(alpha = 0.4, position = "identity", binwidth = 1)

3.4 Bar plots

Let’s see how the position argument changes geom_bar().

We have three position options:

stack: The default
dodge: Preferred
fill: To show proportions

While we will be using geom_bar() here, note that the function geom_col() is just geom_bar() where both the position and stat arguments are set to “identity”. It is used when we want the heights of the bars to represent the exact values in the data.

# Count of cars per fcyl level, bars filled by fam (default position: stacked)
ggplot(data = mtcars, mapping = aes(x = fcyl, fill = fam)) +
  geom_bar()

# Each bar rescaled to show the proportion of each fam group
ggplot(data = mtcars, mapping = aes(x = fcyl, fill = fam)) +
  geom_bar(position = "fill")

# The fam groups placed side by side
ggplot(data = mtcars, mapping = aes(x = fcyl, fill = fam)) +
  geom_bar(position = "dodge")

You can customize bar plots further by adjusting the dodging so that your bars partially overlap each other. Instead of using position = “dodge”, you’re going to use position_dodge(), like you did with position_jitter() in the previous exercises. Here, you’ll save this as an object, posn_d, so that you can easily reuse it.

Remember, the reason you want to use position_dodge() (and position_jitter()) is to specify how much dodging (or jittering) you want.

# x is the factor fcyl, as in the other bar plots of this section. With the
# numeric cyl the bars sit on a continuous axis, where a dodge width of 0.2
# is small relative to the bar width and the partial overlap is hard to see.
ggplot(mtcars, aes(fcyl, fill = fam)) +
  # Change position to use the functional form, with width 0.2
  geom_bar(position = position_dodge(width = 0.2))

ggplot(mtcars, aes(fcyl, fill = fam)) +
  # Set the transparency to 0.6 so the overlapped bar remains visible
  geom_bar(position = position_dodge(width = 0.2), alpha = 0.6)

Bar plots: sequential color palette

In this bar plot, we’ll fill each segment according to an ordinal variable. The best way to do that is with a sequential color palette.

Here’s an example of using a sequential color palette with the mtcars dataset:

# mtcars bar chart using the "Set1" ColorBrewer palette for the fill
ggplot(data = mtcars, mapping = aes(x = fcyl, fill = fam)) +
  geom_bar() +
  scale_fill_brewer(palette = "Set1")

# Treat both Vocab variables as categorical from here on
Vocab[["education"]] <- as.factor(Vocab[["education"]])
Vocab[["vocabulary"]] <- as.factor(Vocab[["vocabulary"]])

# Proportion of each vocabulary score within every education level
ggplot(data = Vocab, mapping = aes(x = education, fill = vocabulary)) +
  geom_bar(position = "fill")

# The same chart with the default brewer fill scale applied
ggplot(data = Vocab, mapping = aes(x = education, fill = vocabulary)) +
  geom_bar(position = "fill") +
  scale_fill_brewer()

## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Blues is 9
## Returning the palette you asked for with that many colors

# Definition of a set of blue colors
blues <- brewer.pal(9, "Blues") # from the RColorBrewer package

# 1 - Make a color range using colorRampPalette() and the set of blues
# (blue_range(n) returns n colors interpolated across the nine blues)
blue_range <- colorRampPalette(blues)

# One color per vocabulary score present in the data, rather than a
# hard-coded 11; factor() makes the count work whether or not the column has
# already been converted to a factor.
n_vocab_levels <- nlevels(factor(Vocab$vocabulary))

# Plot education, filled by vocabulary
ggplot(Vocab, aes(education, fill = vocabulary)) +
  # Add a bar layer with position "fill"
  geom_bar(position = "fill") +
  # Use the interpolated blues as a manual fill scale
  scale_fill_manual(values = blue_range(n_vocab_levels))

3.5 Line plots

# Print the head of economics
head(economics)

## # A tibble: 6 × 6
##   date         pce    pop psavert uempmed unemploy
##   <date>     <dbl>  <dbl>   <dbl>   <dbl>    <dbl>
## 1 1967-07-01  507. 198712    12.6     4.5     2944
## 2 1967-08-01  510. 198911    12.6     4.7     2945
## 3 1967-09-01  516. 199113    11.9     4.6     2958
## 4 1967-10-01  512. 199311    12.9     4.9     3143
## 5 1967-11-01  517. 199498    12.8     4.7     3066
## 6 1967-12-01  525. 199657    11.8     4.8     3018

# Line plot of the unemploy column over time
ggplot(data = economics, mapping = aes(x = date, y = unemploy)) +
  geom_line()

# Same plot with unemploy divided by pop, i.e. as a share of the population
ggplot(data = economics, mapping = aes(x = date, y = unemploy / pop)) +
  geom_line()

3.6 Multiple time series

We already saw how the form of your data affects how you can plot it. Let’s explore that further with multiple time series. Here, it’s important that all lines are on the same scale, and if possible, on the same plot.

# Wide data (fish.species): one column per species, so each series is
# plotted on its own

# Rainbow column over Year
ggplot(data = fish.species, mapping = aes(x = Year, y = Rainbow)) +
  geom_line()

# Pink column over Year
ggplot(data = fish.species, mapping = aes(x = Year, y = Pink)) +
  geom_line()

# Long data (fish.tidy): all series in one plot, one line per Species
ggplot(data = fish.tidy, mapping = aes(x = Year, y = Capture)) +
  geom_line(mapping = aes(group = Species))

# Mapping color to Species also separates the lines and adds a legend
ggplot(data = fish.tidy, mapping = aes(x = Year, y = Capture, color = Species)) +
  geom_line()

4. Themes

In this chapter, we’ll explore how understanding the structure of your data makes data visualization much easier. Plus, it’s time to make our plots pretty. This is the last step in the data viz process. The Themes layer will enable you to make publication quality plots directly in R.

4.1 Moving legends

To change stylistic elements of a plot, call theme() and set plot properties to a new value. For example, the following changes the legend position. p + theme(legend.position = new_value)

Here, the new value can be

“top”, “bottom”, “left”, or “right”: place it at that side of the plot.
“none”: don’t draw it.
c(x, y): c(0, 0) means the bottom-left and c(1, 1) means the top-right.

# Add the unemployed share of the population (unemploy / pop) as a new column.
# NOTE(review): the data frame and the new column share the name
# prop_unemployed_over_time; inside aes() below the name refers to the column.
prop_unemployed_over_time <- economics %>% mutate(prop_unemployed_over_time= unemploy/pop)

# Build the default line plot and store it (it is not printed here).
# NOTE(review): only x and y are mapped, so this plot has no legend; the
# legend.position settings below have nothing visible to hide or move.
plt_prop_unemployed_over_time <- ggplot(prop_unemployed_over_time, aes(date, prop_unemployed_over_time)) + geom_line()
# Remove legend entirely
plt_prop_unemployed_over_time +
  theme(legend.position = "none")

# Position the legend at the bottom of the plot
plt_prop_unemployed_over_time +
 theme(legend.position = "bottom")

# Position the legend inside the plot at (0.6, 0.1)
# (coordinates are relative to the plot: c(0, 0) bottom-left, c(1, 1) top-right)
plt_prop_unemployed_over_time +
 theme(legend.position = c(0.6, 0.1))

4.2 Modifying theme elements

Many plot elements have multiple properties that can be set. For example, line elements in the plot such as axes and gridlines have a color, a thickness (size), and a line type (solid line, dashed, or dotted). To set the style of a line, you use element_line(). For example, to make the axis lines into red, dashed lines, you would use the following.

p + theme(axis.line = element_line(color = "red", linetype = "dashed"))

Similarly, element_rect() changes rectangles and element_text() changes text. You can remove a plot element using element_blank().

# Step 1: grey92 fill for every rectangle; no outline around legend keys
plt_prop_unemployed_over_time +
  theme(
    legend.key = element_rect(color = NA),
    rect = element_rect(fill = "grey92")
  )

# Step 2: additionally remove the axis ticks and the whole panel grid
plt_prop_unemployed_over_time +
  theme(
    legend.key = element_rect(color = NA),
    rect = element_rect(fill = "grey92"),
    panel.grid = element_blank(),
    axis.ticks = element_blank()
  )

# Step 3: bring back only the major horizontal grid lines, drawn as
# thin (0.5) white dotted lines
plt_prop_unemployed_over_time +
  theme(
    legend.key = element_rect(color = NA),
    rect = element_rect(fill = "grey92"),
    panel.grid = element_blank(),
    axis.ticks = element_blank(),
    panel.grid.major.y = element_line(
      linetype = "dotted",
      size = 0.5,
      color = "white"
    )
  )

# Step 4: grey25 axis text, and an italic 16-point plot title
plt_prop_unemployed_over_time +
  theme(
    legend.key = element_rect(color = NA),
    rect = element_rect(fill = "grey92"),
    panel.grid = element_blank(),
    axis.ticks = element_blank(),
    panel.grid.major.y = element_line(
      linetype = "dotted",
      size = 0.5,
      color = "white"
    ),
    plot.title = element_text(size = 16, face = "italic"),
    axis.text = element_text(color = "grey25")
  )

4.3 Modifying whitespace

Whitespace means all the non-visible margins and spacing in the plot.

To set a single whitespace value, use unit(x, unit), where x is the amount and unit is the unit of measure.

Borders require you to set 4 positions, so use margin(top, right, bottom, left, unit). To remember the margin order, think TRouBLe.

The default unit is “pt” (points), which scales well with text. Other options include “cm”, “in” (inches) and “lines” (of text).

# Base plot for the whitespace exercises: mpg against wt, colored by cylinder
# count. The x-axis label states the unit documented for mtcars$wt, thousands
# of pounds ("1000 lbs"), not the "1000/lbs" the label previously read.
plt_mpg_vs_wt_by_cyl <- ggplot(mtcars, aes(wt, mpg)) +
  geom_point(aes(color = factor(cyl))) +
  labs(x = "Weight (1000 lbs)", y = "Miles Per Gallon")
# Print the plot
plt_mpg_vs_wt_by_cyl

# Axis tick marks two text lines long
plt_mpg_vs_wt_by_cyl +
  theme(axis.ticks.length = unit(2, units = "lines"))

# Legend keys 3 cm in size
plt_mpg_vs_wt_by_cyl +
  theme(legend.key.size = unit(3, units = "cm"))

# Legend margin of 20 / 30 / 40 / 50 points (top, right, bottom, left)
plt_mpg_vs_wt_by_cyl +
  theme(legend.margin = margin(t = 20, r = 30, b = 40, l = 50, unit = "pt"))

# Plot margin of 10 / 30 / 50 / 70 millimeters (top, right, bottom, left)
plt_mpg_vs_wt_by_cyl +
  theme(plot.margin = margin(t = 10, r = 30, b = 50, l = 70, unit = "mm"))

4.4 Theme flexibility

In addition to making your own themes, there are several out-of-the-box solutions that may save you lots of time.

theme_gray() is the default.
theme_bw() is useful when you use transparency.
theme_classic() is more traditional.
theme_void() removes everything but the data.

# ggthemes provides the fivethirtyeight, Tufte and WSJ themes used below
library(ggthemes)

# ggplot2 built-in: black-and-white theme
plt_prop_unemployed_over_time +
  theme_bw()

# ggplot2 built-in: classic theme
plt_prop_unemployed_over_time +
  theme_classic()

# ggplot2 built-in: void theme
plt_prop_unemployed_over_time +
  theme_void()

# ggthemes: fivethirtyeight style
plt_prop_unemployed_over_time +
  theme_fivethirtyeight()

# ggthemes: Tufte style
plt_prop_unemployed_over_time +
  theme_tufte()

# ggthemes: Wall Street Journal style
plt_prop_unemployed_over_time + theme_wsj()

Outside of ggplot2, another source of built-in themes is the ggthemes package. The workspace already contains the plt_prop_unemployed_over_time, the line plot from before. Let’s explore some of the ready-made ggthemes themes.

# The stored line plot in the fivethirtyeight style
plt_prop_unemployed_over_time + theme_fivethirtyeight()

# The stored line plot in Tufte's style
plt_prop_unemployed_over_time + theme_tufte()

# The stored line plot in the Wall Street Journal style
plt_prop_unemployed_over_time + theme_wsj()

4.5 Setting themes

Reusing a theme across many plots helps to provide a consistent style. You have several options for this.

Assign the theme to a variable, and add it to each plot.
Set your theme as the default using theme_set().

# Reusable theme layer: grey92 rectangles, no legend-key outline, no ticks,
# only dotted white major y grid lines, grey25 axis text, italic 16-point
# title, and the legend placed inside the plot at (0.6, 0.1)
theme_recession <- theme(
  legend.position = c(0.6, 0.1),
  legend.key = element_rect(color = NA),
  rect = element_rect(fill = "grey92"),
  axis.ticks = element_blank(),
  axis.text = element_text(color = "grey25"),
  plot.title = element_text(size = 16, face = "italic"),
  panel.grid = element_blank(),
  panel.grid.major.y = element_line(
    linetype = "dotted",
    size = 0.5,
    color = "white"
  )
)

# Tufte's theme with the recession settings layered on top of it
theme_tufte_recession <- theme_tufte() + theme_recession

# Option 1: add the combined theme to a single plot explicitly
plt_prop_unemployed_over_time +
  theme_tufte_recession

# Option 2: make the combined theme the session default, so plots printed
# from here on use it without adding a theme
theme_set(theme_tufte_recession)
plt_prop_unemployed_over_time

We’ve seen many examples of beautiful, publication-quality plots. Let’s take a final look and put all the pieces together.

  # Start from Tufte's theme
plt_prop_unemployed_over_time +
  theme_tufte()

# Then switch off the legend and the axis ticks
plt_prop_unemployed_over_time +
  theme_tufte() +
  theme(
    axis.ticks = element_blank(),
    legend.position = "none"
  )

# Then soften the axis titles and the axis text to grey60
plt_prop_unemployed_over_time +
  theme_tufte() +
  theme(
    axis.ticks = element_blank(),
    legend.position = "none",
    axis.text = element_text(color = "grey60"),
    axis.title = element_text(color = "grey60")
  )

# Finally add thin (0.25) dotted grey60 major grid lines for the y-axis
plt_prop_unemployed_over_time +
  theme_tufte() +
  theme(
    axis.ticks = element_blank(),
    legend.position = "none",
    axis.text = element_text(color = "grey60"),
    axis.title = element_text(color = "grey60"),
    panel.grid.major.y = element_line(
      linetype = "dotted",
      size = 0.25,
      color = "grey60"
    )
  )

4.6 Effective explanatory plots

Let’s focus on producing beautiful and effective explanatory plots. In the next couple of exercises, you’ll create a plot that is similar to the one shown in the video using gm2007, a filtered subset of the gapminder dataset.

This type of plot will be in an info-viz style, meaning that it would be similar to something you’d see in a magazine or website for a mostly lay audience.

# Lollipop chart: one point per country at its life expectancy, joined by a
# thick segment back to x = 30; the x-axis is drawn at the top
ggplot(data = gm2007, mapping = aes(x = lifeExp, y = country, color = lifeExp)) +
  geom_point(size = 4) +
  geom_segment(mapping = aes(xend = 30, yend = country), size = 2) +
  scale_x_continuous(position = "top")

# The same chart with each lifeExp value printed in small white text
# on top of its point
ggplot(data = gm2007, mapping = aes(x = lifeExp, y = country, color = lifeExp)) +
  geom_point(size = 4) +
  geom_segment(mapping = aes(xend = 30, yend = country), size = 2) +
  geom_text(mapping = aes(label = lifeExp), size = 1.5, color = "white") +
  scale_x_continuous(position = "top")

# Two-color palette: the first and last of five "RdYlBu" brewer colors
palette <- brewer.pal(n = 5, name = "RdYlBu")[c(1, 5)]

# Stored lollipop chart with tuned scales: labels rounded to one decimal,
# an untitled top x-axis fixed to 30-90 with no expansion, and a color
# gradient running between the two palette colors
plt_country_vs_lifeExp <- ggplot(
  data = gm2007,
  mapping = aes(x = lifeExp, y = country, color = lifeExp)
) +
  geom_point(size = 4) +
  geom_segment(mapping = aes(xend = 30, yend = country), size = 2) +
  geom_text(
    mapping = aes(label = round(lifeExp, 1)),
    size = 1.5,
    color = "white"
  ) +
  scale_x_continuous(
    name = "",
    limits = c(30, 90),
    expand = c(0, 0),
    position = "top"
  ) +
  scale_color_gradientn(colors = palette)
plt_country_vs_lifeExp

# The stored chart plus a title and a source caption
plt_country_vs_lifeExp +
  labs(
    title = "Highest and lowest life expectancies, 2007",
    caption = "Source: gapminder"
  )

4.7 Using annotate() for embellishments

In the previous exercise, we completed our basic plot. Now let’s polish it by playing with the theme and adding annotations. In this exercise, you’ll use annotate() to add text and a curve to the plot.

# Mean life expectancy over the rows of gm2007.
# NOTE(review): gm2007 is described as a filtered subset of gapminder, so
# this is the mean of that subset - confirm it is the intended "global" value.
global_mean <- mean(gm2007[["lifeExp"]])

# End points of the annotation curve: it starts 4 years to the right of the
# mean at y = 5.5 and ends on the mean line at y = 7.5
x_end <- global_mean
x_start <- x_end + 4
y_start <- 5.5
y_end <- 7.5

# Classic theme without a y-axis line, y ticks, axis titles or legend,
# and with black axis text
step_1_themes <- theme_classic() +
  theme(
    legend.position = "none",
    axis.title = element_blank(),
    axis.text = element_text(color = "black"),
    axis.ticks.y = element_blank(),
    axis.line.y = element_blank()
  )

# Small grey three-line text label placed at the start of the curve
step_3_annotation <- annotate(
  geom = "text",
  label = "The\nglobal\naverage",
  x = x_start,
  y = y_start,
  color = "grey40",
  size = 3,
  vjust = 1
)



# Define the theme
# step_1_themes already contains every theme() setting needed here, so it is
# not repeated. No x scale is added either: plt_country_vs_lifeExp already
# draws its x-axis on top, and adding scale_x_continuous(position = "top")
# again would replace that scale and drop its limits (30-90), zero expansion
# and empty title, while printing a "Scale for 'x' is already present" message.
plt_country_vs_lifeExp +
  step_1_themes

# Add a vertical line at the mean life expectancy
plt_country_vs_lifeExp +
  step_1_themes +
  geom_vline(xintercept = global_mean, color = "grey40", linetype = 3)

# Add text
# step_3_annotation is the stored text annotation ("The global average")
plt_country_vs_lifeExp +
  step_1_themes +
  geom_vline(xintercept = global_mean, color = "grey40", linetype = 3) +
  step_3_annotation

# Add a curve
# The arrow runs from the text label (x_start, y_start) to the mean line
# (x_end, y_end) and ends in a closed 0.2 cm arrow head
plt_country_vs_lifeExp +
  step_1_themes +
  geom_vline(xintercept = global_mean, color = "grey40", linetype = 3) +
  step_3_annotation +
  annotate(
    "curve",
    x = x_start, y = y_start,
    xend = x_end, yend = y_end,
    arrow = arrow(length = unit(0.2, "cm"), type = "closed"),
    color = "grey40"
  )

- My Favorite Team -