ggplot2 basics

  1. Diving in: scatterplots & aesthetics
  2. Faceting
  3. Geoms
  4. Histograms and barcharts
  5. Scatterplots for large data
# install.packages("ggplot2")
library(ggplot2)
?mpg
## starting httpd help server ... done
head(mpg)
## # A tibble: 6 x 11
##   manufacturer model displ  year   cyl      trans   drv   cty   hwy    fl
##          <chr> <chr> <dbl> <int> <int>      <chr> <chr> <int> <int> <chr>
## 1         audi    a4   1.8  1999     4   auto(l5)     f    18    29     p
## 2         audi    a4   1.8  1999     4 manual(m5)     f    21    29     p
## 3         audi    a4   2.0  2008     4 manual(m6)     f    20    31     p
## 4         audi    a4   2.0  2008     4   auto(av)     f    21    30     p
## 5         audi    a4   2.8  1999     6   auto(l5)     f    16    26     p
## 6         audi    a4   2.8  1999     6 manual(m5)     f    18    26     p
## # ... with 1 more variables: class <chr>
str(mpg)
## Classes 'tbl_df', 'tbl' and 'data.frame':    234 obs. of  11 variables:
##  $ manufacturer: chr  "audi" "audi" "audi" "audi" ...
##  $ model       : chr  "a4" "a4" "a4" "a4" ...
##  $ displ       : num  1.8 1.8 2 2 2.8 2.8 3.1 1.8 1.8 2 ...
##  $ year        : int  1999 1999 2008 2008 1999 1999 2008 1999 1999 2008 ...
##  $ cyl         : int  4 4 4 4 6 6 6 4 4 4 ...
##  $ trans       : chr  "auto(l5)" "manual(m5)" "manual(m6)" "auto(av)" ...
##  $ drv         : chr  "f" "f" "f" "f" ...
##  $ cty         : int  18 21 20 21 16 18 18 18 16 20 ...
##  $ hwy         : int  29 29 31 30 26 26 27 26 25 28 ...
##  $ fl          : chr  "p" "p" "p" "p" ...
##  $ class       : chr  "compact" "compact" "compact" "compact" ...
summary(mpg)
##  manufacturer          model               displ            year     
##  Length:234         Length:234         Min.   :1.600   Min.   :1999  
##  Class :character   Class :character   1st Qu.:2.400   1st Qu.:1999  
##  Mode  :character   Mode  :character   Median :3.300   Median :2004  
##                                        Mean   :3.472   Mean   :2004  
##                                        3rd Qu.:4.600   3rd Qu.:2008  
##                                        Max.   :7.000   Max.   :2008  
##       cyl           trans               drv                 cty       
##  Min.   :4.000   Length:234         Length:234         Min.   : 9.00  
##  1st Qu.:4.000   Class :character   Class :character   1st Qu.:14.00  
##  Median :6.000   Mode  :character   Mode  :character   Median :17.00  
##  Mean   :5.889                                         Mean   :16.86  
##  3rd Qu.:8.000                                         3rd Qu.:19.00  
##  Max.   :8.000                                         Max.   :35.00  
##       hwy             fl               class          
##  Min.   :12.00   Length:234         Length:234        
##  1st Qu.:18.00   Class :character   Class :character  
##  Median :24.00   Mode  :character   Mode  :character  
##  Mean   :23.44                                        
##  3rd Qu.:27.00                                        
##  Max.   :44.00
# Basic scatterplot from the mpg data: displ on the x axis, hwy on the y axis.
qplot(displ, hwy, data = mpg)

Additional Variables

Additional variables can be displayed with aesthetics (like shape, colour, size) or faceting (small multiples displaying different subsets).

# Same scatterplot, with the class column mapped to the colour aesthetic.
qplot(displ, hwy, colour = class, data = mpg)

Faceting

Small multiples displaying different subsets of the data. Useful for exploring conditional relationships. Useful for large data.

facet_grid(): 2d grid, rows ~ cols, . for no split.
facet_wrap(): 1d ribbon wrapped into 2d.

# Build the displ-vs-hwy scatterplot once and reuse it for every facet layout.
base_scatter <- qplot(displ, hwy, data = mpg)

# Columns split by cyl; "." on the row side means no row split.
base_scatter +
  facet_grid(. ~ cyl)

# Rows split by drv; "." on the column side means no column split.
base_scatter +
  facet_grid(drv ~ .)

# Full grid: drv in rows, cyl in columns.
base_scatter +
  facet_grid(drv ~ cyl)

# One panel per class, laid out as a ribbon wrapped into 2d.
base_scatter +
  facet_wrap(~ class)

Geom Specification

# Default geom for two variables: points.
qplot(cty, hwy, data = mpg)

# Jittered points instead of plain points.
qplot(cty, hwy, data = mpg, geom = "jitter")

# class on the x axis, in its default order.
qplot(class, hwy, data = mpg)

# reorder(class, hwy) orders the classes by a per-class summary of hwy
# (reorder()'s default summary function is the mean).
qplot(reorder(class, hwy), hwy, data = mpg)

# Same ordering, drawn with jittered points.
qplot(reorder(class, hwy), hwy, data = mpg, geom = "jitter")

# Same ordering, drawn as one boxplot per class.
qplot(reorder(class, hwy), hwy, data = mpg, geom = "boxplot")

# Several geoms at once: pass a character vector of geom names.
qplot(reorder(class, hwy), hwy, data = mpg, geom = c("jitter", "boxplot"))

~54,000 round diamonds from http://www.diamondse.info/. Variables: carat, colour, clarity, cut; total depth, table, depth, width, height; price.

Histograms and barcharts

Used to display the distribution of a variable. Categorical variable → bar chart. Continuous variable → histogram.

# With only one variable, qplot guesses that
# you want a bar chart or histogram
# Only cut is supplied here, so qplot picks the chart type itself.
qplot(cut, data = diamonds)

# No binwidth is given, so stat_bin falls back to 30 bins (see the message
# echoed below).
qplot(carat, data = diamonds)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Same histogram with explicit, progressively narrower bins.
qplot(carat, data = diamonds, binwidth = 1)

qplot(carat, data = diamonds, binwidth = 0.1)

qplot(carat, data = diamonds, binwidth = 0.01)

# resolution() of the carat values; the echoed result is 0.01, matching the
# narrowest binwidth used above.
resolution(diamonds$carat)
## [1] 0.01
# Take the most recent plot and restrict its x axis to the range 0 to 3.
last_plot() + xlim(0, 3)
## Warning: Removed 32 rows containing non-finite values (stat_bin).

Experiment with Binwidth

# Histogram of table using bins one unit wide.
qplot(table, data = diamonds, binwidth = 1)

# To zoom in on a plot region use xlim() and ylim()
qplot(table, data = diamonds, binwidth = 1) +
  xlim(50, 70)
## Warning: Removed 12 rows containing non-finite values (stat_bin).

# Same x range with narrower, 0.1-wide bins.
qplot(table, data = diamonds, binwidth = 0.1) +
  xlim(50, 70)
## Warning: Removed 12 rows containing non-finite values (stat_bin).

# Additionally limit the y axis to the range 0 to 50.
qplot(table, data = diamonds, binwidth = 0.1) +
  xlim(50, 70) +
  ylim(0, 50)
## Warning: Removed 12 rows containing non-finite values (stat_bin).
## Warning: Removed 15 rows containing missing values (geom_bar).

# Note that this type of zooming discards data
# outside of the plot regions
# See coord_cartesian() for an alternative
qplot(depth, data = diamonds, binwidth = 0.2)

# Fill the bars by cut and restrict the x axis to the range 55 to 70.
qplot(depth, data = diamonds, binwidth = 0.2, fill = cut) +
  xlim(55, 70)
## Warning: Removed 45 rows containing non-finite values (stat_bin).

# Same x range, with one panel per cut instead of a fill mapping.
qplot(depth, data = diamonds, binwidth = 0.2) +
  xlim(55, 70) +
  facet_wrap(~ cut)
## Warning: Removed 45 rows containing non-finite values (stat_bin).

# Price histograms in 500-wide bins, one panel per cut.
qplot(price, data = diamonds, binwidth = 500) +
  facet_wrap(~ cut)

Problems

Each histogram is far away from the others, but we know stacking is hard to read → use another way of displaying densities. Varying relative abundance makes comparisons difficult → rescale to ensure constant area.

# Large distances make comparisons hard
qplot(price, data = diamonds, binwidth = 500) +
  facet_wrap(~ cut)

# Stacked heights hard to compare
qplot(price, data = diamonds, binwidth = 500, fill = cut)

# Much better - but still have differing relative abundance
qplot(
  price,
  data = diamonds, binwidth = 500,
  geom = "freqpoly", colour = cut
)

# Instead of displaying count on y-axis, display density
# .. indicates that variable isn't in original data
# NOTE(review): recent ggplot2 releases prefer after_stat(density) over the
# ..density.. spelling - confirm against the installed version before changing.
qplot(
  price, ..density..,
  data = diamonds, binwidth = 500,
  geom = "freqpoly", colour = cut
)

# To use with histogram, you need to be explicit
qplot(
  price, ..density..,
  data = diamonds, binwidth = 500,
  geom = "histogram"
) +
  facet_wrap(~ cut)

Additional Geoms

# There are two ways to add additional geoms
# 1) A vector of geom names:
qplot(price, carat, data = diamonds, geom = c("point", "smooth"))
## `geom_smooth()` using method = 'gam'

# 2) Add on extra geoms
qplot(price, carat, data = diamonds) +
  geom_smooth()
## `geom_smooth()` using method = 'gam'

# This is how you get help about a specific geom:
# ?geom_smooth

To set aesthetics

# To set aesthetics to a particular value, you need
# to wrap that value in I()
# Without I(), "blue" is treated as data to map to colour rather than as the
# literal colour to draw with.
qplot(price, carat, data = diamonds, colour = "blue")

# With I(), the value is used as-is, so the points are drawn in blue.
qplot(price, carat, data = diamonds, colour = I("blue"))

# Practical application: varying alpha
# The same scatterplot is redrawn with progressively smaller alpha values.
qplot(price, carat, data = diamonds, alpha = I(1/10))

qplot(price, carat, data = diamonds, alpha = I(1/50))

qplot(price, carat, data = diamonds, alpha = I(1/100))

qplot(price, carat, data = diamonds, alpha = I(1/250))

Color

# Colour the points by clarity, on the untransformed scales.
qplot(carat, price, data = diamonds, colour = clarity)

# Same variables with both axes log10-transformed.
qplot(log10(carat), log10(price), data = diamonds, colour = clarity)

# NOTE(review): the y axis is carat / price; price / carat (price per carat)
# may have been intended - confirm before relying on this plot.
qplot(log10(carat), log10(carat / price), data = diamonds, colour = clarity)

# geom = "hex" needs the hexbin package; without it every panel fails with
# "Package `hexbin` required for `stat_binhex`".
# install.packages("hexbin")
qplot(log10(carat), log10(price), data = diamonds, geom = "hex", bins = 10) +
  facet_wrap(~ clarity)
## Warning: Computation failed in `stat_binhex()`:
## Package `hexbin` required for `stat_binhex`.
## Please install and try again.

## Warning: Computation failed in `stat_binhex()`:
## Package `hexbin` required for `stat_binhex`.
## Please install and try again.

## Warning: Computation failed in `stat_binhex()`:
## Package `hexbin` required for `stat_binhex`.
## Please install and try again.

## Warning: Computation failed in `stat_binhex()`:
## Package `hexbin` required for `stat_binhex`.
## Please install and try again.

## Warning: Computation failed in `stat_binhex()`:
## Package `hexbin` required for `stat_binhex`.
## Please install and try again.

## Warning: Computation failed in `stat_binhex()`:
## Package `hexbin` required for `stat_binhex`.
## Please install and try again.

## Warning: Computation failed in `stat_binhex()`:
## Package `hexbin` required for `stat_binhex`.
## Please install and try again.

## Warning: Computation failed in `stat_binhex()`:
## Package `hexbin` required for `stat_binhex`.
## Please install and try again.

# Draw smoothed trends instead of points, with clarity mapped to colour.
qplot(
  log10(carat), log10(price),
  data = diamonds, colour = clarity, geom = "smooth"
)
## `geom_smooth()` using method = 'gam'