Libraries
library(tidyverse)
library(forcats)
library(lubridate)
library(ggthemes)
library(quantreg)
library(openair)
library(lattice)
library(RColorBrewer)
library(rvest)
Data
data("iris")
data("mtcars")
data("diamonds")
data("economics")
mtcars$fcyl <- as.factor(mtcars$cyl)
mtcars$fam <- as.factor(mtcars$am)
mtcars$car <- row.names(mtcars)
load("~/R Data/fish.RData")
Vocab <- read.csv("https://raw.githubusercontent.com/sigmasigmaiota/vocab/master/vocab.csv")
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec, :
## EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec, :
## embedded nul(s) found in input
gm2007 <- read.csv("gm2007.csv")
head(Vocab)
## X.1 X year sex education vocabulary
## 1 1 19740001 1974 Male 14 9
## 2 2 19740002 1974 Male 16 9
## 3 3 19740003 1974 Female 10 9
## 4 4 19740004 1974 Female 10 5
## 5 5 19740005 1974 Female 12 8
## 6 6 19740006 1974 Male 16 8
A picture paints a thousand words, which is why R ggplot2 is such a powerful tool for graphical data analysis. In this chapter, you’ll progress from simply plotting data to applying a variety of statistical methods. These include a variety of linear models, descriptive and inferential statistics (mean, standard deviation and confidence intervals) and custom functions.
To practice on the remaining layers (statistics, coordinates and facets), we’ll continue working on several datasets from the first course.
# Using mtcars, draw a scatter plot of mpg vs. wt
ggplot(mtcars, aes( x= wt, y=mpg)) + geom_point()+ theme_gray()
# Amend the plot to add a smooth layer
ggplot(mtcars, aes(x = wt, y = mpg)) +
geom_point() + geom_smooth()+ theme_gray()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# Amend the plot. Use lin. reg. smoothing; turn off std err ribbon
ggplot(mtcars, aes(x = wt, y = mpg)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE)+ theme_gray()
## `geom_smooth()` using formula 'y ~ x'
# Amend the plot. Swap geom_smooth() for stat_smooth().
ggplot(mtcars, aes(x = wt, y = mpg)) +
geom_point() +
stat_smooth(method = "lm", se = FALSE) + theme_gray()
## `geom_smooth()` using formula 'y ~ x'
We’ll continue with the previous exercise by considering the situation of looking at sub-groups in our dataset. For this we’ll encounter the invisible group aesthetic.
# Using mtcars, plot mpg vs. wt, colored by fcyl
ggplot(mtcars, aes(x = wt, y = mpg, color = fcyl)) +
# Add a point layer
geom_point() +
# Add a smooth lin reg stat, no ribbon
stat_smooth(method = "lm", se = FALSE)+ theme_gray()
## `geom_smooth()` using formula 'y ~ x'
# Amend the plot to add another smooth layer with dummy grouping
ggplot(mtcars, aes(x = wt, y = mpg, color = fcyl)) +
geom_point() +
stat_smooth(method = "lm", se = FALSE) +
stat_smooth(aes(group = 1), method = "lm", se = FALSE)+ theme_gray()
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
In the previous exercise we used se = FALSE in stat_smooth() to remove the 95% Confidence Interval. Here we’ll consider another argument, span, used in LOESS smoothing, and we’ll take a look at a nice scenario of properly mapping different models.
ggplot(mtcars, aes(x = wt, y = mpg)) +
geom_point() +
# Add 3 smooth LOESS stats, varying span & color
stat_smooth(se = FALSE, color = "red", span = 0.9) +
stat_smooth(se = FALSE, color = "green", span = 0.6) +
stat_smooth(se = FALSE, color = "blue", span = 0.3)+ theme_gray()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# Amend the plot to color by fcyl
ggplot(mtcars, aes(x = wt, y = mpg)) +
geom_point() +
# Add a smooth LOESS stat, no ribbon
stat_smooth(method = "lm", se = FALSE) +
# Add a smooth lin. reg. stat, no ribbon
stat_smooth(method = "lm", se = FALSE)+ theme_gray()
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
# Amend the plot
ggplot(mtcars, aes(x = wt, y = mpg, color = fcyl)) +
geom_point() +
# Map color to dummy variable "All"
stat_smooth(aes(color = "All"), se = FALSE) +
stat_smooth(method = "lm", se = FALSE)+ theme_gray()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
Here, we’ll continue with the Vocab dataset and use stat_quantile() to apply a quantile regression.
ggplot(Vocab, aes(x = education, y = vocabulary)) +
geom_jitter(alpha = 0.25) +
# Add a quantile stat, at 0.05, 0.5, and 0.95
stat_quantile(quantiles = c(0.05, 0.5, 0.95))
## Warning: Removed 1 rows containing non-finite values (stat_quantile).
## Smoothing formula not specified. Using: y ~ x
## Warning: Removed 1 rows containing missing values (geom_point).
# Amend the plot to color by year_group
ggplot(Vocab, aes(x = education, y = vocabulary, color = year)) +
geom_jitter(alpha = 0.25) +
stat_quantile(quantiles = c(0.05, 0.5, 0.95)) + theme_gray()
## Warning: Removed 1 rows containing non-finite values (stat_quantile).
## Smoothing formula not specified. Using: y ~ x
## Warning: Removed 1 rows containing missing values (geom_point).
In the Vocab dataset, education and vocabulary are integer variables. In the first course, you saw that this is one of the four causes of overplotting. You’d get a single point at each intersection between the two variables.
One solution, shown in the step 1, is jittering with transparency. Another solution is to use stat_sum(), which calculates the total number of overlapping observations and maps that onto the size aesthetic.
# Run this, look at the plot, then update it
ggplot(Vocab, aes(x = education, y = vocabulary)) +
# Replace this with a sum stat
geom_jitter(alpha = 0.25) + theme_gray()
## Warning: Removed 1 rows containing missing values (geom_point).
ggplot(Vocab, aes(x = education, y = vocabulary)) +stat_sum(alpha = 0.25) + theme_gray()
## Warning: Removed 1 rows containing non-finite values (stat_sum).
ggplot(Vocab, aes(x = education, y = vocabulary)) +
stat_sum() +
# Add a size scale, from 1 to 10
scale_size(range=c(1, 10)) + theme_gray()
## Warning: Removed 1 rows containing non-finite values (stat_sum).
# Amend the stat to use proportion sizes
ggplot(Vocab, aes(x = education, y = vocabulary)) +
stat_sum(aes(size = ..prop..)) + theme_gray()
## Warning: Removed 1 rows containing non-finite values (stat_sum).
# Amend the plot to group by education
ggplot(Vocab, aes(x = education, y = vocabulary, group = education)) +
stat_sum(aes(size = ..prop..)) + theme_gray()
## Warning: Removed 1 rows containing non-finite values (stat_sum).
In the following exercises, we’ll aim to make the plot shown in the viewer. Here, we’ll establish our positions and base layer of the plot.
Establishing these items as independent objects will allow us to recycle them easily in many layers, or plots.
posn_j <- position_jitter(width = 0.2)
posn_d <- position_dodge(width = 0.1)
posn_jd <- position_jitterdodge(jitter.width = 0.2, dodge.width = 0.1)
# Create the plot base: wt vs. fcyl, colored by fam
p_wt_vs_fcyl_by_fam <- ggplot(mtcars, aes(x= fcyl, y=wt, color = fam)) + theme_gray()
# Add a point layer
p_wt_vs_fcyl_by_fam + geom_point() + theme_gray()
# Add jittering only
p_wt_vs_fcyl_by_fam_jit <- p_wt_vs_fcyl_by_fam + geom_point(position = posn_j) + theme_gray()
p_wt_vs_fcyl_by_fam_jit
# Add jittering only
p_wt_vs_fcyl_by_fam + geom_point(position = posn_j) + theme_gray()
# Add dodging only
p_wt_vs_fcyl_by_fam + geom_point(position = posn_d) + theme_gray()
# Add jittering and dodging
p_wt_vs_fcyl_by_fam + geom_point(position = posn_jd) + theme_gray()
###5.7 Plotting variations The preparation is done; now let’s explore stat_summary().
Summary statistics refers to a combination of location (mean or median) and spread (standard deviation or confidence interval).
These metrics are calculated in stat_summary() by passing a function to the fun.data argument. mean_sdl(), calculates multiples of the standard deviation and mean_cl_normal() calculates the t-corrected 95% CI.
p_wt_vs_fcyl_by_fam_jit +
# Add a summary stat of std deviation limits
stat_summary(fun.data = mean_sdl, fun.args = list(mult = 1), position = posn_d) + theme_gray()
p_wt_vs_fcyl_by_fam_jit +
# Change the geom to be an errorbar
stat_summary(fun.data = mean_sdl, fun.args = list(mult = 1), position = posn_d, geom = "errorbar") + theme_gray()
p_wt_vs_fcyl_by_fam_jit +
# Add a summary stat of normal confidence limits
stat_summary(fun.data = mean_cl_normal, position = posn_d) + theme_gray()
The Coordinates layers offer specific and very useful tools for efficiently and accurately communicating data. Here we’ll look at the various ways of effectively using these layers, so you can clearly visualize lognormal datasets, variables with units, and periodic data.
head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
iris.smooth <- ggplot(iris, aes(x = Sepal.Length, y= Sepal.Width,
color = Species)) + geom_point(alpha = 0.7) + geom_smooth() + theme_gray()
iris.smooth
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
iris.smooth + scale_x_continuous(limits = c(4.5, 5.5)) + theme_gray()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 95 rows containing non-finite values (stat_smooth).
## Warning: Removed 95 rows containing missing values (geom_point).
iris.smooth + xlim(c(4.5, 5.5)) + theme_gray()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 95 rows containing non-finite values (stat_smooth).
## Removed 95 rows containing missing values (geom_point).
iris.smooth + coord_cartesian(xlim = c(4.5, 5.5)) + theme_gray()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
The big difference is that the scale functions change the underlying dataset, which affects calculations made by computed geoms (like histograms or smooth trend lines), whereas coordinate functions make no changes to the dataset.
# Run the code, view the plot, then update it
mtcars_wt_hp <- ggplot(mtcars, aes(x = wt, y = hp, color = fam)) +
geom_point() + geom_smooth() +
# Add a continuous x scale from 3 to 6
scale_x_continuous(limit = c(3, 6)) + theme_gray()
mtcars_wt_hp
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 12 rows containing non-finite values (stat_smooth).
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : span too small. fewer data values than degrees of freedom.
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : at 3.168
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : radius 4e-06
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : all data on boundary of neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 3.168
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 0.002
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : at 3.572
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : radius 4e-06
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : all data on boundary of neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 4e-06
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : zero-width neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : zero-width neighborhood. make span bigger
## Warning: Computation failed in `stat_smooth()`:
## NA/NaN/Inf in foreign function call (arg 5)
## Warning: Removed 12 rows containing missing values (geom_point).
mtcars_wt_hp + coord_cartesian(xlim = c(3, 6)) + theme_gray()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 12 rows containing non-finite values (stat_smooth).
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : span too small. fewer data values than degrees of freedom.
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : at 3.168
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : radius 4e-06
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : all data on boundary of neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 3.168
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 0.002
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : at 3.572
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : radius 4e-06
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : all data on boundary of neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 4e-06
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : zero-width neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : zero-width neighborhood. make span bigger
## Warning: Computation failed in `stat_smooth()`:
## NA/NaN/Inf in foreign function call (arg 5)
## Warning: Removed 12 rows containing missing values (geom_point).
Aspect ratio I: 1:1 ratios We can set the aspect ratio of a plot with coord_fixed(), which uses ratio = 1 as a default. A 1:1 aspect ratio is most appropriate when two continuous variables are on the same scale, as with the iris dataset.
ggplot(iris, aes(x = Sepal.Length, y = Sepal.Width, color = Species)) +
geom_jitter() +
geom_smooth(method = "lm", se = FALSE) + theme_gray()+
# Fix the coordinate ratio
coord_fixed(1)
## `geom_smooth()` using formula 'y ~ x'
The coord_*() layer functions offer two useful arguments that work well together: expand and clip.
ggplot(mtcars, aes(wt, mpg)) + geom_point(size = 2) +
# Add Cartesian coordinates with zero expansion
theme_gray() + coord_cartesian(expand = 0)
ggplot(mtcars, aes(wt, mpg)) + geom_point(size = 2) +
# Turn clipping off
coord_cartesian(expand = 0, clip = "off") + theme_gray() +
# Remove axis lines
theme(axis.line= element_blank())
ggplot(msleep, aes(x = bodywt, y = 1)) + geom_jitter() + theme_gray() +
scale_x_continuous(limits = c(0, 7000), breaks = seq(0, 7000, 1000)) +
labs(x = "Body Weight (kg)", title = "Raw data") + theme_gray()
ggplot(msleep, aes(x =log10(bodywt) , y = 1)) + geom_jitter() +
scale_x_continuous(limits = c(-3, 4), breaks = -3:4) +
labs(x = "Body Weight log10(kg)", title = "log10 trans of raw walues")+ theme_gray()
ggplot(msleep, aes(x =log10(bodywt) , y = 1)) + geom_jitter() +
scale_x_continuous(limits = c(-3, 4), breaks = -3:4) +
annotation_logticks(sides = "b")
labs(x = "Body Weight log10(kg)", title = "log10 trans of raw walues")+ theme_gray()
## NULL
ggplot(msleep, aes(x =bodywt , y = 1)) + geom_jitter() +
scale_x_log10(limits = c(1e-03, 1e+4)) +
labs(x = "Body Weight log10(kg)", title = "log10 trans using scale_x_log10()")+ theme_gray()
ggplot(msleep, aes(x =bodywt , y = 1)) + geom_jitter() +
coord_trans(x = "log10") +
labs(x = "Body Weight log10(kg)", title = "log10 trans using cord_tans()")+ theme_gray()
Using scale_y_log10() and scale_x_log10() is equivalent to transforming our actual dataset before getting to ggplot2.
Using coord_trans(), setting x = “log10” and/or y = “log10” arguments, transforms the data after statistics have been calculated. The plot will look the same as with using scale_*_log10(), but the scales will be different, meaning that we’ll see the original values on our log10 transformed axes. This can be useful since log scales can be somewhat unintuitive.
# Produce a scatter plot of brainwt vs. bodywt
ggplot(msleep, aes(x = bodywt, y = brainwt)) + geom_point() +
ggtitle("Raw Values") + theme_gray()
## Warning: Removed 27 rows containing missing values (geom_point).
# Add scale_*_*() functions
ggplot(msleep, aes(bodywt, brainwt)) + geom_point() +
scale_x_log10() + scale_y_log10() +
ggtitle("Scale_x_log10 functions") + theme_gray()
## Warning: Removed 27 rows containing missing values (geom_point).
# Perform a log10 coordinate system transformation
ggplot(msleep, aes(bodywt, brainwt)) +
geom_point() +
coord_trans(x = "log10", y = "log10") + ggtitle("Cord_trans log10()") + theme_gray()
## Warning: Removed 27 rows containing missing values (geom_point).
# Plot with a scale_*_*() function:
ggplot(msleep, aes(bodywt, brainwt)) + geom_point() +
geom_smooth(method = "lm", se = FALSE) +
# Add a log10 x scale
scale_x_log10() +
# Add a log10 y scale
scale_y_log10() +
ggtitle("Scale_ functions")+ theme_gray()
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 27 rows containing non-finite values (stat_smooth).
## Warning: Removed 27 rows containing missing values (geom_point).
# Plot with transformed coordinates
ggplot(msleep, aes(bodywt, brainwt)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE) +
# Add a log10 coordinate transformation for x and y axes
coord_trans(x= "log10", y = "log10")+ theme_gray()
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 27 rows containing non-finite values (stat_smooth).
## Removed 27 rows containing missing values (geom_point).
Double x and y-axes are a contentious topic in data visualization. We’ll revisit that discussion at the end of chapter 4. Here, I want to review a great use case where double axes actually do add value to a plot.
Our goal plot is displayed in the viewer. The two axes are the raw temperature values on a Fahrenheit scale and the transformed values on a Celsius scale.
You can imagine a similar scenario for Log-transformed and original values, miles and kilometers, or pounds and kilograms. A scale that is unintuitive for many people can be made easier by adding a transformation as a double axis.
airquality <- airquality %>%
mutate(Date = make_date(1974, Month, Day))
# Using airquality, plot Temp vs. Date, # Add a line layer
ggplot(airquality, aes(Month, y = Temp)) + geom_line() +
labs(x = "Date (1973)", y = "Fahrenheit")+ theme_gray()
# Define breaks (Fahrenheit), # Convert y_breaks from Fahrenheit to Cels.
y_breaks <- c(59, 68, 77, 86, 95, 104)
y_labels <- (y_breaks - 32) * 5 / 9
# Create a secondary x-axis
secondary_y_axis <- sec_axis(
# Use identity transformation
trans = identity, name = "Celsius",
# Define breaks and labels as above
breaks = y_breaks, labels = y_labels)
# Examine the object
secondary_y_axis
## <ggproto object: Class AxisSecondary, gg>
## axis: NULL
## break_info: function
## breaks: 59 68 77 86 95 104
## create_scale: function
## detail: 1000
## empty: function
## guide: waiver
## init: function
## labels: 15 20 25 30 35 40
## make_title: function
## mono_test: function
## name: Celsius
## trans: function
## transform_range: function
## super: <ggproto object: Class AxisSecondary, gg>
# From previous step
y_breaks <- c(59, 68, 77, 86, 95, 104)
y_labels <- (y_breaks - 32) * 5 / 9
secondary_y_axis <- sec_axis(
trans = identity, name = "Celsius",
breaks = y_breaks, labels = y_labels)
# Update the plot
ggplot(airquality, aes(x = Date, y = Temp)) +
geom_line() +
# Add the secondary y-axis
scale_y_continuous(sec.axis = secondary_y_axis) +
labs(x = "Date (1973)", y = "Fahrenheit") + theme_gray()
Flipping axes means to reverse the variables mapped onto the x and y aesthetics. We can just change the mappings in aes(), but we can also use the coord_flip() layer function.
There are two reasons to use this function: * We want a vertical geom to be horizontal, or * We’ve completed a long series of plotting functions and want to flip it without having to rewrite all our commands.
# Plot fcyl bars, filled by fam, # Place bars side by side
ggplot(mtcars, aes(fcyl, fill = fam)) + geom_bar(position = "dodge")+ theme_gray()
ggplot(mtcars, aes(fcyl, fill = fam)) + geom_bar(position = "dodge") +
# Flip the x and y coordinates
coord_flip() + theme_gray()
ggplot(mtcars, aes(fcyl, fill = fam)) +
# Set a dodge width of 0.5 for partially overlapping bars
geom_bar(position = position_dodge(width = 0.5)) + coord_flip()+ theme_gray()
# Plot of wt vs. car
ggplot(mtcars, aes(x=car, y=wt)) +
# Add a point layer
geom_point() + labs(x = "car", y = "weight")+ theme_gray()
# Flip the axes to set car to the y axis
ggplot(mtcars, aes(car, wt)) + geom_point() +
labs(x = "car", y = "weight") + coord_flip() + theme_gray()
The coord_polar() function converts a planar x-y Cartesian plot to polar coordinates. This can be useful if you are producing pie charts.
We can imagine two forms for pie charts - the typical filled circle, or a colored ring.
Typical pie charts omit all of the non-data ink. Pie charts are not really better than stacked bar charts.
# Run the code, view the plot, then update it
ggplot(mtcars, aes(x = 1, fill = fcyl)) + geom_bar() +
# Add a polar coordinate system
coord_polar(theta = "y")+ theme_gray()
ggplot(mtcars, aes(x = 1, fill = fcyl)) + geom_bar(width = 0.1) +
coord_polar(theta = "y") +
# Add a continuous x scale from 0.5 to 1.5
scale_x_continuous(limits = c(0.5, 1.5))+ theme_gray()
Polar coordinate plots are well-suited to scales like compass direction or time of day. A popular example is the “wind rose”.
# grab a table of values from the web with a little scraping
url <- 'http://snowfence.umn.edu/Components/winddirectionanddegreeswithouttable3.htm'
page <- read_html(url)
directions_raw <- page %>% html_node('td table') %>% html_table(header = TRUE)
directions <- directions_raw %>%
set_names(~tolower(sub(' Direction', '', .x))) %>%
slice(-1) %>%
separate(degree, c('degree_min', 'degree_max'), sep = '\\s+-\\s+', convert = TRUE)
directions
## # A tibble: 16 × 3
## cardinal degree_min degree_max
## <chr> <dbl> <dbl>
## 1 N 349. 11.2
## 2 NNE 11.2 33.8
## 3 NE 33.8 56.2
## 4 ENE 56.2 78.8
## 5 E 78.8 101.
## 6 ESE 101. 124.
## 7 SE 124. 146.
## 8 SSE 146. 169.
## 9 S 169. 191.
## 10 SSW 191. 214.
## 11 SW 214. 236.
## 12 WSW 236. 259.
## 13 W 259. 281.
## 14 WNW 281. 304.
## 15 NW 304. 326.
## 16 NNW 326. 349.
wind <- mydata %>%
mutate(wd_cardinal = cut(
wd,
breaks = c(0, directions$degree_max, 360),
labels = c(directions$cardinal, 'N')
), ws_interval = cut(ws,
breaks = c(0,2,4,6,8,10,12,14),
include.lowest=T))
# Using wind, plot wd filled by ws
ggplot(wind, aes(wd_cardinal, fill = ws_interval)) +
# Add a bar layer with width 1
geom_bar(width = 1)+ theme_gray()
# Convert to polar coordinates:
ggplot(wind, aes(wd_cardinal, fill = ws_interval)) +
geom_bar(width = 1) +
coord_polar()+ theme_gray()
# Convert to polar coordinates:
ggplot(wind, aes(wd_cardinal, fill = ws_interval)) +
geom_bar(width = 1) +
coord_polar(start = -pi/16) + theme_gray()
Faceting splits the data up into groups, according to a categorical variable, then plots each group in its own panel. For splitting the data by one or two categorical variables, facet_grid() is best.
ggplot(mtcars, aes(wt, mpg)) + geom_point() +
# Facet rows by am
facet_grid(rows = vars(am)) + theme_gray()
ggplot(mtcars, aes(wt, mpg)) + geom_point() +
# Facet columns by cyl
facet_grid(cols = vars(cyl)) + theme_gray()
ggplot(mtcars, aes(wt, mpg)) + geom_point() +
# Facet rows by am and columns by cyl
facet_grid(rows = vars(am), cols = vars(cyl)) + theme_gray()
In addition to aesthetics, facets are another way of encoding factor (i.e. categorical) variables. They can be used to reduce the complexity of plots with many variables.
Two variables are mapped onto the color aesthetic, using hue and lightness. To achieve this we combined fcyl and fam into a single interaction variable, fcyl_fam. This will allow us to take advantage of Color Brewer’s Paired color palette.
mtcars <- mtcars %>%
mutate(fcyl_fam = interaction(fcyl, fam, sep=":"))
# See the interaction column
mtcars$fcyl_fam
## [1] 6:1 6:1 4:1 6:0 8:0 6:0 8:0 4:0 4:0 6:0 6:0 8:0 8:0 8:0 8:0 8:0 8:0 4:1 4:1
## [20] 4:1 4:0 8:0 8:0 8:0 8:0 4:1 4:1 4:1 8:1 6:1 8:1 4:1
## Levels: 4:0 6:0 8:0 4:1 6:1 8:1
# Color the points by fcyl_fam
ggplot(mtcars, aes(x = wt, y = mpg, color = fcyl_fam)) + geom_point() +
# Use a paired color palette
scale_color_brewer(palette = "Paired") + theme_gray()
# Update the plot to map disp to size
ggplot(mtcars, aes(x = wt, y = mpg, color = fcyl_fam, size = disp)) +
geom_point() + scale_color_brewer(palette = "Paired") + theme_gray()
# Update the plot
ggplot(mtcars, aes(x = wt, y = mpg, color = fcyl_fam, size = disp)) +
geom_point() + scale_color_brewer(palette = "Paired") +
# Grid facet on gear and vs
facet_grid(rows = vars(gear), cols = vars(vs)) + theme_gray()
# facet_grid(rows = vars(A)) facet_grid(A ~ .)
# facet_grid(cols = vars(B)) facet_grid(. ~ B)
# facet_grid(rows = vars(A), cols = vars(B)) facet_grid(A ~ B)
ggplot(mtcars, aes(wt, mpg)) + geom_point() +
# Facet rows by am using formula notation
facet_grid(am ~ .) + theme_gray()
ggplot(mtcars, aes(wt, mpg)) + geom_point() +
# Facet columns by cyl using formula notation
facet_grid(. ~ cyl) + theme_gray()
ggplot(mtcars, aes(wt, mpg)) + geom_point() +
# Facet rows by am and columns by cyl using formula notation
facet_grid(am ~ cyl) + theme_gray()
If our factor levels are not clear, your facet labels may be confusing. We can assign proper labels in your original data before plotting or we can use the labeller argument in the facet layer.
The default value is= * label_value: Default, displays only the value
Common alternatives are:
# Plot wt by mpg
ggplot(mtcars, aes(wt, mpg)) + geom_point() +
# The default is label_value
facet_grid(cols= vars(cyl)) + theme_gray()
ggplot(mtcars, aes(wt, mpg)) + geom_point() +
# Displaying both the values and the variables
facet_grid(cols = vars(cyl), labeller = label_both) + theme_gray()
ggplot(mtcars, aes(wt, mpg)) + geom_point() +
# Label context
facet_grid(cols = vars(cyl), labeller = label_context) + theme_gray()
ggplot(mtcars, aes(wt, mpg)) + geom_point() +
# Two variables
facet_grid(cols = vars(vs, cyl), labeller = label_context) + theme_gray()
If you want to change the order of your facets, it’s best to properly define your factor variables before plotting.
Let’s see this in action with the mtcars transmission variable am. In this case, 0 = “automatic” and 1 = “manual”.
Here, we’ll make am a factor variable and relabel the numbers to proper names. The default order is alphabetical. To rearrange them we’ll call fct_rev() from the forcats package to reverse the order.
# Make factor, set proper labels explictly
mtcars$fam <- factor(mtcars$am, labels = c(`0` = "automatic",
`1` = "manual"))
# Default order is alphabetical
ggplot(mtcars, aes(wt, mpg)) + geom_point() +
facet_grid(cols = vars(fam)) + theme_gray()
# Make factor, set proper labels explictly, and
# manually set the label order
mtcars$fam <- factor(mtcars$am,
levels = c(0, 1),
labels = c("manual", "automatic"))
# View again
ggplot(mtcars, aes(wt, mpg)) +
geom_point() +
facet_grid(cols = vars(fam)) + theme_gray()
Continuous variables
By default every facet of a plot has the same axes. If the data ranges vary wildly between facets, it can be clearer if each facet has its own scale. This is achieved with the scales argument to facet_grid().
When faceting by columns, “free_y” has no effect, but we can adjust the x-axis. In contrast, when faceting by rows, “free_x” has no effect, but we can adjust the y-axis.
ggplot(mtcars, aes(wt, mpg)) + geom_point() +
# Facet columns by cyl
facet_grid(cols = vars(cyl)) + theme_gray()
ggplot(mtcars, aes(wt, mpg)) + geom_point() +
# Update the faceting to free the x-axis scales
facet_grid(cols = vars(cyl), scales ="free_x") + theme_gray()
ggplot(mtcars, aes(wt, mpg)) + geom_point() +
# Swap cols for rows; free the y-axis scales
facet_grid(rows = vars(cyl), scales = "free_y") + theme_gray()
Categorical variables
When you have a categorical variable with many levels which are not all present in each sub-group of another variable, it’s usually desirable to drop the unused levels.
By default, each facet of a plot is the same size. This behavior can be changed with the spaces argument, which works in the same way as scales: “free_x” allows different sized facets on the x-axis, “free_y”, allows different sized facets on the y-axis, “free” allows different sizes in both directions.
ggplot(mtcars, aes(x = mpg, y = car, color = fam)) + geom_point() +
# Facet rows by gear
facet_grid(rows = vars(gear)) + theme_gray()
ggplot(mtcars, aes(x = mpg, y = car, color = fam)) + geom_point() +
# Free the y scales and space
facet_grid(rows = vars(gear), scales ="free_y", space = "free_y") + theme_gray()
mtcars2 <- mtcars %>%
# arrange from lo to hi
arrange(-mpg) %>%
# redefine in factor leveles
mutate(car = as_factor(car))
ggplot(mtcars2, aes(x = mpg, y = car, color = fam)) +
geom_point() +
# Free the y scales and space
facet_grid(rows = vars(gear),
scales = "free_y",
space = "free_y") + theme_gray()
facet_grid() is fantastic for categorical variables with a small number of levels. Although it is possible to facet variables with many levels, the resulting plot will be very wide or very tall, which can make it difficult to view.
The solution is to use facet_wrap() which separates levels along one axis but wraps all the subsets across a given number of rows or columns.
ggplot(Vocab, aes(x = education, y = vocabulary)) + stat_smooth(method = "lm", se = FALSE) +
# Create facets, wrapping by year, using vars()
facet_wrap(vars(year))+ theme_gray()
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
ggplot(Vocab, aes(x = education, y = vocabulary)) + stat_smooth(method = "lm", se = FALSE) +
# Create facets, wrapping by year, using a formula
facet_wrap(~ year)+ theme_gray()
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
ggplot(Vocab, aes(x = education, y = vocabulary)) + stat_smooth(method = "lm", se = FALSE) +
# Update the facet layout, using 11 columns
facet_wrap(~ year, ncol = 11)+ theme_gray()
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
Facets are great for seeing subsets in a variable, but sometimes you want to see both those subsets and all values in a variable.
Here, the margins argument to facet_grid() is your friend.
mtcars$fvs <- factor(mtcars$vs)
ggplot(mtcars, aes(x = wt, y = mpg)) + geom_point() +
# Facet rows by fvs and fam, and cols by gear
facet_grid(rows = vars(fvs, fam), cols = vars(gear))
ggplot(mtcars, aes(x = wt, y = mpg)) +
geom_point() +
# Update the facets to add margins
facet_grid(rows = vars(fvs, fam), cols = vars(gear), margins = TRUE)
ggplot(mtcars, aes(x = wt, y = mpg)) + geom_point() +
# Update the facets to only show margins on fam
facet_grid(rows = vars(fvs, fam), cols = vars(gear), margins = "fam")
ggplot(mtcars, aes(x = wt, y = mpg)) + geom_point() +
# Update the facets to only show margins on gear and fvs
facet_grid(rows = vars(fvs, fam), cols = vars(gear), margins = c("gear", "fvs"))
Dynamite plots
“Dynamite plots” (bar plots with error bars) are not well suited for their intended purpose of depicting distributions. If we really want error bars on bar plots, we can of course get them, but we’ll need to set the positions manually. A point geom will typically serve you much better.
# Plot wt vs. fcyl
ggplot(mtcars, aes(x = fcyl, y = wt)) +
# Add a bar summary stat of means, colored skyblue
stat_summary(fun.y = mean, geom = "bar", fill = "skyblue") +
# Add an errorbar summary stat std deviation limits
stat_summary(fun.data = mean_sdl, fun.args = list(mult = 1), geom = "errorbar", width = 0.1) + theme_gray()
## Warning: `fun.y` is deprecated. Use `fun` instead.
Position dodging we will add a distinction between transmission type, fam, for the dynamite plots and explore position dodging (where bars are side-by-side).
# Update the aesthetics to color and fill by fam
ggplot(mtcars, aes(x = fcyl, y = wt, color = fam, fill = fam)) +
stat_summary(fun.y = mean, geom = "bar") +
stat_summary(fun.data = mean_sdl, fun.args = list(mult = 1), geom = "errorbar", width = 0.1) + theme_gray()
## Warning: `fun.y` is deprecated. Use `fun` instead.
# Set alpha for the first and set position for each stat summary function
ggplot(mtcars, aes(x = fcyl, y = wt, color = fam, fill = fam)) +
stat_summary(fun.y = mean, geom = "bar", alpha = 0.5, position = "dodge") +
stat_summary(fun.data = mean_sdl, fun.args = list(mult = 1), geom = "errorbar", position = "dodge", width = 0.1) + theme_gray()
## Warning: `fun.y` is deprecated. Use `fun` instead.
# Define a dodge position object with width 0.9
posn_d <- position_dodge(width = 0.9)
# For each summary stat, update the position to posn_d
ggplot(mtcars, aes(x = fcyl, y = wt, color = fam, fill = fam)) +
stat_summary(fun.y = mean, geom = "bar", position = posn_d, alpha = 0.5) +
stat_summary(fun.data = mean_sdl, fun.args = list(mult = 1), width = 0.1, position = posn_d, geom = "errorbar") + theme_gray()
## Warning: `fun.y` is deprecated. Use `fun` instead.
If it is appropriate to use bar plots, then it nice to give an impression of the number of values in each group.
stat_summary() doesn’t keep track of the count. stat_sum() does (that’s the whole point), but it’s difficult to access. It’s more straightforward to calculate exactly what we want to plot ourselves.
mtcars_by_cyl <- mtcars %>%
group_by(cyl) %>%
summarize(mean_wt = mean(wt),
sd_wt = sd(wt),
n_wt = n()) %>%
mutate( prop = n_wt/sum(n_wt))
mtcars_by_cyl
## # A tibble: 3 × 5
## cyl mean_wt sd_wt n_wt prop
## <dbl> <dbl> <dbl> <int> <dbl>
## 1 4 2.29 0.570 11 0.344
## 2 6 3.12 0.356 7 0.219
## 3 8 4.00 0.759 14 0.438
# Using mtcars_cyl, plot mean_wt vs. cyl
ggplot(mtcars_by_cyl, aes(cyl, mean_wt)) +
# Add a bar layer with identity stat, filled skyblue
geom_bar(stat = "identity", fill = "skyblue") + theme_gray()
ggplot(mtcars_by_cyl, aes(x = cyl, y = mean_wt)) +
# Swap geom_bar() for geom_col()
geom_col(fill = "skyblue") + theme_gray()
ggplot(mtcars_by_cyl, aes(x = cyl, y = mean_wt)) +
# Set the width aesthetic to prop
geom_col(fill = "skyblue", aes(width = prop)) + theme_gray()
## Warning: Ignoring unknown aesthetics: width
ggplot(mtcars_by_cyl, aes(x = cyl, y = mean_wt)) +
geom_col(aes(width = prop), fill = "skyblue") +
# Add an errorbar layer
geom_errorbar(
# ... at mean weight plus or minus 1 std dev
aes(ymin = mean_wt - sd_wt,
ymax = mean_wt + sd_wt),
# with width 0.1
width = 0.1
) + theme_gray()
## Warning: Ignoring unknown aesthetics: width
Since heat maps encode color on a continuous scale, they are difficult to accurately decode. Hence, heat maps are most useful if you have a small number of boxes and/or a clear pattern that allows you to overcome decoding difficulties.
To produce them, map two categorical variables onto the x and y aesthetics, along with a continuous variable onto fill. The geom_tile() layer adds the boxes.
library(lattice)
str(barley)
## 'data.frame': 120 obs. of 4 variables:
## $ yield : num 27 48.9 27.4 39.9 33 ...
## $ variety: Factor w/ 10 levels "Svansota","No. 462",..: 3 3 3 3 3 3 7 7 7 7 ...
## $ year : Factor w/ 2 levels "1932","1931": 2 2 2 2 2 2 2 2 2 2 ...
## $ site : Factor w/ 6 levels "Grand Rapids",..: 3 6 4 5 1 2 3 6 4 5 ...
# Using barley, plot variety vs. year, filled by yield
ggplot(barley, aes(year, variety, fill = yield))+
# Add a tile geom
geom_tile() + theme_gray()
ggplot(barley, aes(x = year, y = variety, fill = yield)) + geom_tile() +
# Facet, wrapping by site, with 1 column
facet_wrap(facets = vars(site), ncol = 1) +
# Add a fill scale using an 2-color gradient
scale_fill_gradient(low = "white", high = "red") + theme_gray()
# A palette of 9 reds
red_brewer_palette <- brewer.pal(9, "Reds")
# Update the plot
ggplot(barley, aes(x = year, y = variety, fill = yield)) + geom_tile() +
facet_wrap(facets = vars(site), ncol = 1) +
# Update scale to use n-colors from red_brewer_palette
scale_fill_gradientn(colors = red_brewer_palette ) + theme_gray()
Heat map alternatives
There are several alternatives to heat maps. The best choice really depends on the data and the story you want to tell with this data. If there is a time component, the most obvious choice is a line plot.
# Using barley, plot yield vs. year, colored and grouped by variety
ggplot(barley, aes(x = year, y = yield, color = variety, group = variety)) +
# Add a line layer
geom_line() +
# Facet, wrapping by site, with 1 row
facet_wrap( ~ site, nrow = 1) + theme_gray()
# Using barely, plot yield vs. year, colored, grouped, and filled by site
ggplot(barley, aes(x = year, y = yield, color = site, group = site, fill = site)) +
# Add a line summary stat aggregated by mean
stat_summary(fun.y = mean, geom = "line") +
# Add a ribbon summary stat with 10% opacity, no color
stat_summary(fun.data = mean_sdl, fun.args = list(mult = 1), geom = "ribbon", alpha = 0.1, color = NA) + theme_gray()
## Warning: `fun.y` is deprecated. Use `fun` instead.
When you first encounter a data visualization, either from yourself or a colleague, you always want to critically ask if it’s obscuring the data in any way.
Let’s take a look at the steps we could take to produce and improve the plot in the view.
The data comes from an experiment where the effect of two different types of vitamin C sources, orange juice or ascorbic acid, were tested on the growth of the odontoblasts (cells responsible for tooth growth) in 60 guinea pigs.
The data is stored in the TG data frame, which contains three variables: dose, len, and supp.
library(datasets)
TG = (ToothGrowth)
str(TG)
## 'data.frame': 60 obs. of 3 variables:
## $ len : num 4.2 11.5 7.3 5.8 6.4 10 11.2 11.2 5.2 7 ...
## $ supp: Factor w/ 2 levels "OJ","VC": 2 2 2 2 2 2 2 2 2 2 ...
## $ dose: num 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 ...
# Initial plot
growth_by_dose <- ggplot(TG, aes(dose, len, color = supp)) +
stat_summary(fun.data = mean_sdl,
fun.args = list(mult = 1),
position = position_dodge(0.1))
# View plot
growth_by_dose
# Change type
TG$dose <- as.numeric(as.character(TG$dose))
# Plot
growth_by_dose <- ggplot(TG, aes(dose, len, color = supp)) +
stat_summary(fun.data = mean_sdl,
fun.args = list(mult = 1),
position = position_dodge(0.2)) + theme_gray()
# View plot
growth_by_dose
# Change type
TG$dose <- as.numeric(as.character(TG$dose))
# Plot
growth_by_dose <- ggplot(TG, aes(dose, len, color = supp)) +
stat_summary(fun.data = mean_sdl,
fun.args = list(mult = 1),
position = position_dodge(0.2)) +
# Use the right geometry
stat_summary(fun.y = mean,
geom = "line",
position = position_dodge(0.1)) + theme_gray()
## Warning: `fun.y` is deprecated. Use `fun` instead.
# View plot
growth_by_dose
# Change type
TG$dose <- as.numeric(as.character(TG$dose))
# Plot
growth_by_dose <- ggplot(TG, aes(dose, len, color = supp)) +
stat_summary(fun.data = mean_sdl,
fun.args = list(mult = 1),
position = position_dodge(0.2)) +
stat_summary(fun.y = mean,
geom = "line",
position = position_dodge(0.1)) +
theme_classic() +
# Adjust labels and colors:
labs(x = "Dose (mg/day)", y = "Odontoblasts length (mean, standard deviation)", color = "Supplement") +
scale_color_brewer(palette = "Set1", labels = c("Orange juice", "Ascorbic acid")) +
scale_y_continuous(limits = c(0,35), breaks = seq(0, 35, 5), expand = c(0,0))
## Warning: `fun.y` is deprecated. Use `fun` instead.
# View plot
growth_by_dose
# plot 1: Density of price for each type of cut of the diamond:
ggplot(data=diamonds,aes(x=price, group=cut, fill=cut)) +
geom_density(adjust=1.5)
# plot 2: Density plot with transparency (using the alpha argument):
ggplot(data=diamonds,aes(x=price, group=cut, fill=cut)) +
geom_density(adjust=1.5 , alpha=0.2)
#install.packages("GGally")
library("GGally")
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
# Prepare some data
df <- mtcars[, c(1,3,4,5,6,7)]
# Correlation plot 1
ggcorr(df, palette = "RdBu", label = TRUE)
# Correlation plot 2
ggpairs(df)
The End.
Thanks DataCamp