These notes are designed to track Section 3 of R for Data Science, by Wickham and Grolemund.
First get all of the packages included in the tidyverse. This includes ggplot2.
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.3.2
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Warning: package 'ggplot2' was built under R version 3.3.2
## Warning: package 'tidyr' was built under R version 3.3.2
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag(): dplyr, stats
Examine the mpg dataframe from ggplot2.
# Show each column's type and its first few values.
str(mpg)
## Classes 'tbl_df', 'tbl' and 'data.frame': 234 obs. of 11 variables:
## $ manufacturer: chr "audi" "audi" "audi" "audi" ...
## $ model : chr "a4" "a4" "a4" "a4" ...
## $ displ : num 1.8 1.8 2 2 2.8 2.8 3.1 1.8 1.8 2 ...
## $ year : int 1999 1999 2008 2008 1999 1999 2008 1999 1999 2008 ...
## $ cyl : int 4 4 4 4 6 6 6 4 4 4 ...
## $ trans : chr "auto(l5)" "manual(m5)" "manual(m6)" "auto(av)" ...
## $ drv : chr "f" "f" "f" "f" ...
## $ cty : int 18 21 20 21 16 18 18 18 16 20 ...
## $ hwy : int 29 29 31 30 26 26 27 26 25 28 ...
## $ fl : chr "p" "p" "p" "p" ...
## $ class : chr "compact" "compact" "compact" "compact" ...
# Per-column summary: quantiles and mean for numeric columns,
# length/class/mode for character columns.
summary(mpg)
## manufacturer model displ year
## Length:234 Length:234 Min. :1.600 Min. :1999
## Class :character Class :character 1st Qu.:2.400 1st Qu.:1999
## Mode :character Mode :character Median :3.300 Median :2004
## Mean :3.472 Mean :2004
## 3rd Qu.:4.600 3rd Qu.:2008
## Max. :7.000 Max. :2008
## cyl trans drv cty
## Min. :4.000 Length:234 Length:234 Min. : 9.00
## 1st Qu.:4.000 Class :character Class :character 1st Qu.:14.00
## Median :6.000 Mode :character Mode :character Median :17.00
## Mean :5.889 Mean :16.86
## 3rd Qu.:8.000 3rd Qu.:19.00
## Max. :8.000 Max. :35.00
## hwy fl class
## Min. :12.00 Length:234 Length:234
## 1st Qu.:18.00 Class :character Class :character
## Median :24.00 Mode :character Mode :character
## Mean :23.44
## 3rd Qu.:27.00
## Max. :44.00
The character variables include class, fl, drv and trans (manufacturer and model are character as well). Get tables of these four to see what values they take.
# Count the rows for each distinct value of class.
table(mpg$class)
##
## 2seater compact midsize minivan pickup subcompact
## 5 47 41 11 33 35
## suv
## 62
# Count the rows for each distinct value of fl.
table(mpg$fl)
##
## c d e p r
## 1 5 8 52 168
# Count the rows for each distinct value of drv.
table(mpg$drv)
##
## 4 f r
## 103 106 25
# Count the rows for each distinct value of trans.
table(mpg$trans)
##
## auto(av) auto(l3) auto(l4) auto(l5) auto(l6) auto(s4)
## 5 2 83 39 6 3
## auto(s5) auto(s6) manual(m5) manual(m6)
## 3 16 58 19
The minimal ggplot2 command does nothing but specify a dataframe. Note that ggplot only works on dataframes. Run a minimal command and create an object to examine.
# Build a plot object from the data alone: no aesthetics and no layers yet.
g1 <- ggplot(data = mpg)
# Inspect the internal structure of the ggplot object.
str(g1)
## List of 9
## $ data :Classes 'tbl_df', 'tbl' and 'data.frame': 234 obs. of 11 variables:
## ..$ manufacturer: chr [1:234] "audi" "audi" "audi" "audi" ...
## ..$ model : chr [1:234] "a4" "a4" "a4" "a4" ...
## ..$ displ : num [1:234] 1.8 1.8 2 2 2.8 2.8 3.1 1.8 1.8 2 ...
## ..$ year : int [1:234] 1999 1999 2008 2008 1999 1999 2008 1999 1999 2008 ...
## ..$ cyl : int [1:234] 4 4 4 4 6 6 6 4 4 4 ...
## ..$ trans : chr [1:234] "auto(l5)" "manual(m5)" "manual(m6)" "auto(av)" ...
## ..$ drv : chr [1:234] "f" "f" "f" "f" ...
## ..$ cty : int [1:234] 18 21 20 21 16 18 18 18 16 20 ...
## ..$ hwy : int [1:234] 29 29 31 30 26 26 27 26 25 28 ...
## ..$ fl : chr [1:234] "p" "p" "p" "p" ...
## ..$ class : chr [1:234] "compact" "compact" "compact" "compact" ...
## $ layers : list()
## $ scales :Classes 'ScalesList', 'ggproto' <ggproto object: Class ScalesList>
## add: function
## clone: function
## find: function
## get_scales: function
## has_scale: function
## input: function
## n: function
## non_position_scales: function
## scales: NULL
## super: <ggproto object: Class ScalesList>
## $ mapping : list()
## $ theme : list()
## $ coordinates:Classes 'CoordCartesian', 'Coord', 'ggproto' <ggproto object: Class CoordCartesian, Coord>
## aspect: function
## distance: function
## expand: TRUE
## is_linear: function
## labels: function
## limits: list
## range: function
## render_axis_h: function
## render_axis_v: function
## render_bg: function
## render_fg: function
## train: function
## transform: function
## super: <ggproto object: Class CoordCartesian, Coord>
## $ facet :Classes 'FacetNull', 'Facet', 'ggproto' <ggproto object: Class FacetNull, Facet>
## compute_layout: function
## draw_back: function
## draw_front: function
## draw_labels: function
## draw_panels: function
## finish_data: function
## init_scales: function
## map: function
## map_data: function
## params: list
## render_back: function
## render_front: function
## render_panels: function
## setup_data: function
## setup_params: function
## shrink: TRUE
## train: function
## train_positions: function
## train_scales: function
## vars: function
## super: <ggproto object: Class FacetNull, Facet>
## $ plot_env :<environment: R_GlobalEnv>
## $ labels : list()
## - attr(*, "class")= chr [1:2] "gg" "ggplot"
The book suggests a template starting with just a minimal command. I usually prefer to place the basic aes specification at the beginning.
# Base plot with the x/y aesthetics declared once, up front.
g1 <- ggplot(data = mpg, mapping = aes(x = displ, y = hwy))
# Inspect the object; its mapping and labels entries hold displ and hwy.
str(g1)
## List of 9
## $ data :Classes 'tbl_df', 'tbl' and 'data.frame': 234 obs. of 11 variables:
## ..$ manufacturer: chr [1:234] "audi" "audi" "audi" "audi" ...
## ..$ model : chr [1:234] "a4" "a4" "a4" "a4" ...
## ..$ displ : num [1:234] 1.8 1.8 2 2 2.8 2.8 3.1 1.8 1.8 2 ...
## ..$ year : int [1:234] 1999 1999 2008 2008 1999 1999 2008 1999 1999 2008 ...
## ..$ cyl : int [1:234] 4 4 4 4 6 6 6 4 4 4 ...
## ..$ trans : chr [1:234] "auto(l5)" "manual(m5)" "manual(m6)" "auto(av)" ...
## ..$ drv : chr [1:234] "f" "f" "f" "f" ...
## ..$ cty : int [1:234] 18 21 20 21 16 18 18 18 16 20 ...
## ..$ hwy : int [1:234] 29 29 31 30 26 26 27 26 25 28 ...
## ..$ fl : chr [1:234] "p" "p" "p" "p" ...
## ..$ class : chr [1:234] "compact" "compact" "compact" "compact" ...
## $ layers : list()
## $ scales :Classes 'ScalesList', 'ggproto' <ggproto object: Class ScalesList>
## add: function
## clone: function
## find: function
## get_scales: function
## has_scale: function
## input: function
## n: function
## non_position_scales: function
## scales: NULL
## super: <ggproto object: Class ScalesList>
## $ mapping :List of 2
## ..$ x: symbol displ
## ..$ y: symbol hwy
## $ theme : list()
## $ coordinates:Classes 'CoordCartesian', 'Coord', 'ggproto' <ggproto object: Class CoordCartesian, Coord>
## aspect: function
## distance: function
## expand: TRUE
## is_linear: function
## labels: function
## limits: list
## range: function
## render_axis_h: function
## render_axis_v: function
## render_bg: function
## render_fg: function
## train: function
## transform: function
## super: <ggproto object: Class CoordCartesian, Coord>
## $ facet :Classes 'FacetNull', 'Facet', 'ggproto' <ggproto object: Class FacetNull, Facet>
## compute_layout: function
## draw_back: function
## draw_front: function
## draw_labels: function
## draw_panels: function
## finish_data: function
## init_scales: function
## map: function
## map_data: function
## params: list
## render_back: function
## render_front: function
## render_panels: function
## setup_data: function
## setup_params: function
## shrink: TRUE
## train: function
## train_positions: function
## train_scales: function
## vars: function
## super: <ggproto object: Class FacetNull, Facet>
## $ plot_env :<environment: R_GlobalEnv>
## $ labels :List of 2
## ..$ x: chr "displ"
## ..$ y: chr "hwy"
## - attr(*, "class")= chr [1:2] "gg" "ggplot"
The object g1 has a lot of content, but attempting to display it produces an empty graph, basically a blank slate. Note that the two axes have been created with useful values.
# Auto-print the plot object to render it.
g1
With the aesthetics specified at the beginning we don’t need to repeat the aesthetics in the following layers.
Start with a layer containing a geom, geom_point() to create a scatterplot.
# Add a point layer; it uses the x and y aesthetics declared on g1.
g2 <- g1 + geom_point()
g2
We can add an aesthetic in the geom_point geom. Use color to indicate the class of vehicle.
# Map class to point colour inside the layer, on top of g1's x/y mapping.
g2 <- g1 + geom_point(mapping = aes(color = class))
g2
Note that the original aesthetics specified when we created g1 are preserved.
Now we can add another layer. Add a layer with a second geom, geom_smooth() to add a loess curve.
# Stack a smoother layer on top of the existing scatterplot g2.
g3 <- g2 + geom_smooth()
g3
## `geom_smooth()` using method = 'loess'
There are other variables which might influence highway gas mileage. How can we capture the influence of these variables? We have other visual attributes we could use to include these variables in the graph. The number of cylinders is a good example. Let’s use the size of the point. We’ll recreate g2 adding this mapping. Note that an item in an aesthetics list is of the form aesthetic = variable, for example size = cyl.
# Rebuild g2 with two layer-level aesthetics: colour by class, size by cyl.
g2 <- g1 + geom_point(mapping = aes(color = class, size = cyl))
g2
Note that the variable cyl is highly correlated with engine displacement. What about drivetrain type? We could use shape to add this variable to a new version of g2.
# Rebuild g2 again, adding point shape for drv alongside colour and size.
g2 <- g1 + geom_point(mapping = aes(color = class, size = cyl, shape = drv))
g2
What do you see?
Instead of adding categorical variables to the scatterplot, we could switch directions and create a graph placing a categorical variable on the x axis. Can we base such a graph on g1?
# Layer-level aes replaces the x aesthetic with drv for this geom only;
# y = hwy simply repeats what g1 already maps.
g2 <- g1 + geom_boxplot(mapping = aes(x = drv, y = hwy))
g2
It almost works, but the values of displ have been lost, although the x axis still has the label “displ.” It would be better to create a new base graph with the variable we want given the x axis.
# Fresh base plot with the categorical variable drv on the x axis.
h1 <- ggplot(data = mpg, mapping = aes(x = drv, y = hwy))
# One box per drv value, using the aesthetics declared on h1.
h2 <- h1 + geom_boxplot()
h2
Exercise. Create a new variable diff in the mpg dataframe. diff = hwy - cty.
Create a graph to show how diff depends on displ.
# Add the highway-minus-city mileage gap as a new column of mpg.
mpg$diff <- mpg$hwy - mpg$cty
# Scatterplot of the gap against engine displacement.
g1 <- ggplot(data = mpg, mapping = aes(x = displ, y = diff))
g2 <- g1 + geom_point()
g2
Create a graph to show how diff depends on a categorical variable like drv.
# Base plot: diff against the categorical variable drv.
g1 <- ggplot(data = mpg, mapping = aes(x = drv, y = diff))

# Boxplot of diff for each drv value.
g2 <- g1 + geom_boxplot()
g2

# Raw points for each drv value.
g3 <- g1 + geom_point()
g3

# Same points, coloured by class.
g4 <- g1 + geom_point(mapping = aes(color = class))
g4

# Jittered points.
g5 <- g1 + geom_jitter()
g5

# Jittered points, coloured by class.
g6 <- g1 + geom_jitter(mapping = aes(color = class))
g6
How do we look at the distribution of a categorical variable?
# Bar chart with class on the x axis; only the x aesthetic is supplied.
g1 <- ggplot(data = mpg, mapping = aes(x = class)) + geom_bar()
g1