## Step 3: Set up key libraries and source code
# proj.path = file.path("C:/Users/uzhanou/Documents/R/Tidyverse/data visualization/code");
# setwd(proj.path)

Chapter 1: Hello ggplot2

# Scatterplot of highway mileage (y) against engine displacement (x).
qplot(x = displ, y = hwy, data = mpg)

# Colour the points by drive train so the subgroups stand out, then print
# a description of the resulting plot object.
g <- qplot(x = displ, y = hwy, data = mpg, colour = drv)
summary(g)

## data: manufacturer, model, displ, year, cyl, trans, drv, cty, hwy,
##   fl, class [234x11]
## mapping:  x = ~displ, y = ~hwy, colour = ~drv
## faceting: <ggproto object: Class FacetNull, Facet, gg>
##     compute_layout: function
##     draw_back: function
##     draw_front: function
##     draw_labels: function
##     draw_panels: function
##     finish_data: function
##     init_scales: function
##     map_data: function
##     params: list
##     setup_data: function
##     setup_params: function
##     shrink: TRUE
##     train_scales: function
##     vars: function
##     super:  <ggproto object: Class FacetNull, Facet, gg>
## -----------------------------------
## geom_point: na.rm = FALSE
## stat_identity: na.rm = FALSE
## position_identity

A graphing template

ggplot(data = <DATA>) + <GEOM_FUNCTION>(mapping = aes(<MAPPINGS>))

Chapter 2: Getting started with qplot

# Load the diamonds data and fix the RNG seed so the subsample below is
# reproducible.
data(diamonds)
set.seed(1410)

# Working subset: 100 rows drawn at random from diamonds.
dsmall <- diamonds[sample(nrow(diamonds), 100), ]

# Price against weight for the full diamonds data.
qplot(x = carat, y = price, data = diamonds)

plot log(price) vs. log(carat)

# Same relationship with both variables log-transformed.
qplot(x = log(carat), y = log(price), data = diamonds)

The relationship between the volume of the diamond (approximated by x × y × z) and its weight

# Weight against approximate volume (product of the three dimensions).
qplot(x = carat, y = x * y * z, data = diamonds)

qplot can do this automatically: it converts a categorical variable in your data into something that the plot knows how to use (such as a colour or a shape), and it automatically provides a legend that maps the displayed attributes to the data values.

# Map the diamond's colour grade to point colour.
qplot(x = carat, y = price, data = dsmall, colour = color)

# Map the cut quality to point shape.
qplot(x = carat, y = price, data = dsmall, shape = cut)

## Warning: Using shapes for an ordinal variable is not advised

For every aesthetic attribute, there is a function, called a scale, which maps data values to valid values for that aesthetic.

# Progressively more transparent points. I() marks alpha as a fixed value
# rather than a variable to be mapped through a scale.
qplot(x = carat, y = price, data = diamonds, alpha = I(1 / 10))

qplot(x = carat, y = price, data = diamonds, alpha = I(1 / 100))

qplot(x = carat, y = price, data = diamonds, alpha = I(1 / 200))

2.5 Plot geoms

Geom, short for geometric object, describes the type of object that is used to display the data.

geom = "point" draws points to produce a scatterplot. This is the default when you supply both x and y arguments to qplot().
geom = "smooth" fits a smoother to the data and displays the smooth and its standard error,
geom = "boxplot" produces a box-and-whisker plot to summarise the distribution of a set of points,
geom = "path" and geom = "line" draw lines between the data points.

For 1d distributions, your choice of geoms is guided by the variable type:

For continuous variables, geom = "histogram" draws a histogram, geom = "freqpoly" a frequency polygon, and geom = "density" creates a density plot. The histogram geom is the default when you only supply an x value to qplot().
For discrete variables, geom = "bar" makes a bar chart

2.5.1 Adding a smoother to a plot

If you have a scatterplot with many data points, it can be hard to see exactly what trend is shown by the data. In this case you may want to add a smoothed line to the plot. This is easily done using the smooth geom.

# Points plus a smoothed trend line, first on the 100-row sample ...
qplot(
  x = carat, y = price, data = dsmall,
  geom = c("point", "smooth")
)

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# ... and then on the full data set.
qplot(
  x = carat, y = price, data = diamonds,
  geom = c("point", "smooth")
)

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

Notice that we have combined multiple geoms by supplying a vector of geom names created with c().

method = "loess", the default for small n, uses a smooth local regression. More details about the algorithm used can be found in ?loess. The wiggliness of the line is controlled by the span parameter, which ranges from 0 (exceedingly wiggly) to 1 (not so wiggly).

# Control the wiggliness of the loess smoother with `span`.
#
# `span` is given to the smoothing layer only. Passing it through qplot()
# together with geom = c("point", "smooth") forwards it to every geom, and the
# point layer then warns "Ignoring unknown parameters: span".

# Very wiggly fit.
qplot(carat, price, data = dsmall) +
  geom_smooth(span = 0.2)

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# Much smoother fit.
qplot(carat, price, data = dsmall) +
  geom_smooth(span = 1)

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

2.5.2 Boxplots and jittered points

# Price per carat by colour grade, drawn as jittered points at three levels
# of transparency.
qplot(
  x = color, y = price / carat, data = diamonds,
  geom = "jitter", alpha = I(1 / 5)
)

qplot(
  x = color, y = price / carat, data = diamonds,
  geom = "jitter", alpha = I(1 / 50)
)

qplot(
  x = color, y = price / carat, data = diamonds,
  geom = "jitter", alpha = I(1 / 200)
)

2.5.3 Histogram and density plots

Histogram and density plots show the distribution of a single variable. They provide more information about the distribution of a single group than boxplots do, but it is harder to compare many groups

# Distribution of carat as a histogram ...
qplot(x = carat, data = diamonds, geom = "histogram")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# ... and as a kernel density estimate.
qplot(x = carat, data = diamonds, geom = "density")

For the density plot, the adjust argument controls the degree of smoothness (high values of adjust produce smoother plots). For the histogram, the binwidth argument controls the amount of smoothing by setting the bin size. (Break points can also be specified explicitly, using the breaks argument.)

It is very important to experiment with the level of smoothing.

we experiment with three values of binwidth: 1.0, 0.1 and 0.01.

# Histogram of carat restricted to [0, 3], with bin width 1 ...
qplot(
  x = carat, data = diamonds, geom = "histogram",
  binwidth = 1, xlim = c(0, 3)
)

## Warning: Removed 32 rows containing non-finite values (stat_bin).

## Warning: Removed 2 rows containing missing values (geom_bar).

# ... bin width 0.1 ...
qplot(
  x = carat, data = diamonds, geom = "histogram",
  binwidth = 0.1, xlim = c(0, 3)
)

## Warning: Removed 32 rows containing non-finite values (stat_bin).

## Warning: Removed 2 rows containing missing values (geom_bar).

# ... and bin width 0.01.
qplot(
  x = carat, data = diamonds, geom = "histogram",
  binwidth = 0.01, xlim = c(0, 3)
)

## Warning: Removed 32 rows containing non-finite values (stat_bin).

## Warning: Removed 2 rows containing missing values (geom_bar).

To compare the distributions of different subgroups, just add an aesthetic mapping, as in the following code.

# One density curve per colour grade.
qplot(x = carat, data = diamonds, geom = "density", colour = color)

# Histogram with bars filled by colour grade.
qplot(x = carat, data = diamonds, geom = "histogram", fill = color)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

2.5.4 Bar charts

The discrete analogue of histogram is the bar chart, geom = "bar". The bar geom counts the number of instances of each class so that you don’t need to tabulate your values beforehand.

# Number of diamonds in each colour grade.
qplot(x = color, data = diamonds, geom = "bar")

# Same bars weighted by carat, so each bar shows total weight per grade;
# the y-axis is relabelled accordingly.
qplot(x = color, data = diamonds, geom = "bar", weight = carat) +
  scale_y_continuous(name = "carat")

2.5.5 Time series with line and path plots

Line and path plots are typically used for time series data.

Line plots join the points from left to right, while
path plots join them in the order that they appear in the dataset (a line plot is just a path plot of the data sorted by x value).
- Line plots usually have time on the x-axis, showing how a single variable has changed over time.
- Path plots show how two variables have simultaneously changed over time, with time encoded in the way that the points are joined together.

Example:

Because there is no time variable in the diamonds data, we use the economics dataset, which contains economic data on the US measured over the last 40 years.

The first shows an unemployment rate and the second shows the median number of weeks unemployed.

# Unemployment as a share of the population over time.
qplot(x = date, y = unemploy / pop, data = economics, geom = "line")

# Median number of weeks unemployed over time.
qplot(x = date, y = uempmed, data = economics, geom = "line")

To examine this relationship in greater detail, we would like to draw both time series on the same plot. We could draw a scatterplot of unemployment rate vs. length of unemployment, but then we could no longer see the evolution over time. The solution is to join points adjacent in time with line segments, forming a path plot.

# Return the four-digit calendar year of a date-like vector.
year <- function(x) {
  broken_down <- as.POSIXlt(x)
  # POSIXlt stores the year as an offset from 1900.
  broken_down$year + 1900
}

# Unemployment rate against median weeks unemployed, with consecutive
# observations joined so the evolution over time stays visible.
qplot(unemploy / pop, uempmed, data = economics,
      geom = c("point", "path"))

# Same path, coloured by year so the direction of time can be read off.
# The original call appended scale_area(), which no longer exists in ggplot2
# and aborted the plot with "could not find function". It was a size scale
# and no size aesthetic is mapped here, so it is simply dropped.
qplot(unemploy / pop, uempmed, data = economics,
      geom = "path", colour = year(date))

2.6 Faceting

Faceting takes an alternative approach: It creates tables of graphics by splitting the data into subsets and displaying the same graph for each subset in an arrangement that facilitates comparison.

# Histogram of carat in one row per colour grade (counts on the y-axis).
qplot(
  x = carat, data = diamonds, facets = color ~ .,
  geom = "histogram", binwidth = 0.1, xlim = c(0, 3)
)

## Warning: Removed 32 rows containing non-finite values (stat_bin).

## Warning: Removed 14 rows containing missing values (geom_bar).

# Same layout with density on the y-axis, which makes the shapes of the
# distributions comparable across grades of different size.
qplot(
  x = carat, y = ..density.., data = diamonds, facets = color ~ .,
  geom = "histogram", binwidth = 0.1, xlim = c(0, 3)
)

## Warning: Removed 32 rows containing non-finite values (stat_bin).

## Warning: Removed 14 rows containing missing values (geom_bar).

2.7 Other options

xlim, ylim: set limits for the x- and y-axes, each a numeric vector of length two, e.g., xlim=c(0, 20) or ylim=c(-0.9, -0.5).
log: a character vector indicating which (if any) axes should be logged. For example, log="x" will log the x-axis, log="xy" will log both.
main: main title for the plot, centered in large text at the top of the plot. This can be a string (e.g., main="plot title") or an expression (e.g., main = expression(beta[1] == 1)). See ?plotmath for more examples of using mathematical formulae.
xlab, ylab: labels for the x- and y-axes. As with the plot title, these can be character strings or mathematical expressions.

Chapter 3 Mastering the grammar

3.1 Introduction

This chapter describes the theoretical basis of ggplot2: the layered grammar of graphics.

3.3 Building a scatterplot

It is a scatterplot of two continuous variables (engine displacement and highway mpg), with points coloured by a third variable (number of cylinders).

# Engine displacement vs. highway mpg, coloured by number of cylinders
# (converted to a factor so it is treated as discrete).
data(mpg)
qplot(x = displ, y = hwy, data = mpg, colour = factor(cyl))

Mapping aesthetics to data

Points, lines and bars are all examples of geometric objects, or geoms. Geoms determine the “type” of the plot.

Named Plot	Geom	Other features
scatterplot	point
bubblechart	point	size mapped to a variable
barchart	bar
box-and-whisker plot	boxplot
line chart	line

Table 3.3: A selection of named plots and the geoms that they correspond to.

Scaling

We need to convert them from data units (e.g., litres, miles per gallon and number of cylinders) to physical units (e.g., pixels and colours) that the computer can display. This conversion process is called scaling and performed by scales.

A final step determines how the two positions (x and y) are combined to form the final location on the plot. This is done by the coordinate system, or coord.

To create a complete plot we need to combine graphical objects from three sources:

the data, represented by the point geom;
the scales and coordinate system, which generate axes and legends so that we can read values from the graph; and
plot annotations, such as the background and plot title.
There is also an additional step in the process: after mapping the data to aesthetics, the data is passed to a statistical transformation, or stat, which manipulates the data in some useful way.

Scaling actually occurs in three parts: transforming, training and mapping.

3.5 Components of the layered grammar

All together, the layered grammar defines a plot as the combination of:

A default dataset and set of mappings from variables to aesthetics.
One or more layers, each composed of a geometric object, a statistical transformation, and a position adjustment, and optionally, a dataset and aesthetic mappings.
One scale for each aesthetic mapping.
A coordinate system.
The faceting specification.

3.5.1 Layers

Layers are responsible for creating the objects that we perceive on the plot.

A layer is composed of four parts:

data and aesthetic mapping,
a statistical transformation (stat),
a geometric object (geom)
and a position adjustment.

3.5.2 Scales

A scale controls the mapping from data to aesthetic attributes, and we need a scale for every aesthetic used on a plot. Each scale operates across all the data in the plot, ensuring a consistent mapping from data to aesthetics.

3.5.3 Coordinate system

A coordinate system, or coord for short, maps the position of objects onto the plane of the plot. Position is often specified by two coordinates (x, y), but could potentially be specified by three or more (although this is not yet implemented in ggplot2).

3.5.4 Faceting

There is also another thing that turns out to be sufficiently useful that we should include it in our general framework: faceting, a general case of the conditioned or trellised plots. This makes it easy to create small multiples each showing a different subset of the whole dataset. This is a powerful tool when investigating whether patterns hold across all conditions.

The faceting specification describes which variables should be used to split up the data, and whether position scales should be free or constrained.

3.6 Data structure

This grammar is encoded into R data structures in a fairly straightforward way. A plot object is a list with components data, mapping (the default aesthetic mappings), layers, scales, coordinates and facet. The plot object has one other component we haven’t discussed yet: options.

Chapter 4 Building a plot layer by layer

4.2 creating a plot

To create the plot object ourselves, we use ggplot(). This has two arguments: data and aesthetic mapping.

# Plot object with default data and aesthetic mappings but no layers yet;
# summary() prints its structure.
p <- ggplot(
  data = diamonds,
  mapping = aes(x = carat, y = price, colour = cut)
)
summary(p)

## data: carat, cut, color, clarity, depth, table, price, x, y, z
##   [53940x10]
## mapping:  x = ~carat, y = ~price, colour = ~cut
## faceting: <ggproto object: Class FacetNull, Facet, gg>
##     compute_layout: function
##     draw_back: function
##     draw_front: function
##     draw_labels: function
##     draw_panels: function
##     finish_data: function
##     init_scales: function
##     map_data: function
##     params: list
##     setup_data: function
##     setup_params: function
##     shrink: TRUE
##     train_scales: function
##     vars: function
##     super:  <ggproto object: Class FacetNull, Facet, gg>

This plot object cannot be displayed until we add a layer.

4.3 Layers

A minimal layer may do nothing more than specify a geom, a way of visually representing the data. If we add a point geom to the plot we just created, we create a scatterplot, which can then be rendered.

# Add a point layer to the plot object. layer() requires an explicit stat and
# position in addition to the geom; with only `geom = "point"` it fails with
# "Attempted to create layer with no stat." The identity stat and position
# leave the data untouched, which matches what geom_point() uses by default.
p <- p + layer(geom = "point", stat = "identity", position = "identity")

4.4 Data

The restriction on the data is simple: it must be a data frame.

This restriction also makes it very easy to produce the same plot for different data: you just change the data frame. You can replace the old dataset with %+%, as shown in the following example.

# Scatterplot built on the original mtcars data.
p <- ggplot(mtcars, aes(mpg, wt, colour = cyl)) + geom_point()
p

# Build the modified data under a new name instead of assigning to `mtcars`.
# Overwriting `mtcars` would shadow the built-in dataset, so every later
# example that uses it would silently plot squared mpg.
mtcars_sq <- transform(mtcars, mpg = mpg^2)
p %+% mtcars_sq  # replace old data

4.5 Aesthetic mappings

To describe the way that variables in the data are mapped to things that we can perceive on the plot (the “aesthetics”), we use the aes function. The aes function takes a list of aesthetic-variable pairs like these:

# An aesthetic mapping on its own: three aesthetic-variable pairs.
aes(
  x = weight,
  y = height,
  colour = age
)

## Aesthetic mapping: 
## * `x`      -> `weight`
## * `y`      -> `height`
## * `colour` -> `age`

Any variable in an aes() specification must be contained inside the plot or layer data. This is one of the ways in which ggplot2 objects are guaranteed to be entirely self-contained, so that they can be stored and re-used.

4.5.1 Plots and layers

The default aesthetic mappings can be set when the plot is initialised or modified later using +.

# Default mappings can be attached after initialisation with `+` ...
p <- ggplot(data = mtcars) + aes(x = wt, y = hp)
# ... or supplied up front. This second assignment replaces the first and is
# the plot object used by the examples that follow.
p <- ggplot(data = mtcars, mapping = aes(x = mpg, y = wt))
p + geom_point()

The default mappings in the plot p can be extended or overridden in the layers, as with the following code.

# Extend the defaults in the layer: add a colour mapping.
p + geom_point(mapping = aes(colour = factor(cyl)))

# Override a default in the layer: use disp for y instead.
p + geom_point(mapping = aes(y = disp))

4.5.2 Setting vs. mapping

Instead of mapping an aesthetic property to a variable, you can set it to a single value by specifying it in the layer parameters. Aesthetics can vary for each observation being plotted, while parameters do not.

We map an aesthetic to a variable (e.g., (aes(colour = cut))) or set it to a constant (e.g., colour = "red").

For example, the following layer sets the colour of the points, using the colour parameter of the layer:

# Setting: colour is a layer parameter, so every point is drawn dark blue.
p <- ggplot(data = mtcars, mapping = aes(x = mpg, y = wt))
p + geom_point(colour = "darkblue")

This sets the point colour to be dark blue instead of black. This is quite different from the following, which maps the colour:

# Mapping: "darkblue" inside aes() is treated as data and goes through the
# colour scale, so it is not used as the literal colour.
p + geom_point(mapping = aes(colour = "darkblue"))

This maps (not sets) the colour to the value “darkblue”. This effectively creates a new variable containing only the value “darkblue” and then maps colour to that new variable. Because this value is discrete, the default colour scale uses evenly spaced colours on the colour wheel, and since there is only one value this colour is pinkish. The difference between setting and mapping is illustrated above.

4.5.3 Grouping

In ggplot2, geoms can be roughly divided into individual and collective geoms. An individual geom has a distinctive graphical object for each row in the data frame.

How do we control which observations go in which individual graphical element?

This is the job of the group aesthetic.

By default, the group is set to the interaction of all discrete variables in the plot. This often partitions the data correctly, but when it does not, or when no discrete variable is used in the plot, you will need to explicitly define the grouping structure, by mapping group to a variable that has a different value for each group. The interaction() function is useful if a single pre-existing variable doesn’t cleanly separate groups, but a combination does.

There are three common cases where the default is not enough, and we will consider each one below. In the following examples, we will use a simple longitudinal dataset, Oxboys, from the nlme package. It records the heights (height) and centered ages (age) of 26 boys (Subject), measured on nine occasions (Occasion).

Multiple groups, one aesthetic.

In many situations, you want to separate your data into groups, but render them in the same way.

The first plot in figure below shows a set of time series plots, one for each boy. You can see the separate growth trajectories for each boy, but there is no way to see which boy belongs to which trajectory.

# One growth trajectory per boy: group = Subject gives each boy his own line.
p <- ggplot(
  data = Oxboys,
  mapping = aes(x = age, y = height, group = Subject)
) +
  geom_line()

p

We specified the Subject as the grouping variable to get a line for each boy.

Different groups on different layers.

Sometimes we want to plot summaries based on different levels of aggregation. Different layers might have different group aesthetics, so that some display individual level data while others display summaries of larger groups.

# Linear fit per boy (grouped by Subject), without a confidence band.
# FALSE is spelled out because `F` is an ordinary variable that can be
# reassigned.
p + geom_smooth(aes(group = Subject), method = "lm", se = FALSE)

This is not what we wanted; we have inadvertently added a smoothed line for each boy. This new layer needs a different group aesthetic, group = 1, so that the new line will be based on all the data, as shown in the plot below. The modified layer looks like this:

# Single linear fit across all boys: group = 1 puts every row in one group.
# FALSE is spelled out because `F` is an ordinary variable that can be
# reassigned.
p + geom_smooth(aes(group = 1), method = "lm", size = 2, se = FALSE)

Overriding the default grouping.

The plot has a discrete scale but you want to draw lines that connect across groups. This is the strategy used in interaction plots, profile plots, and parallel coordinate plots, among others.

# Boxplot of height at each measurement occasion ...
boysbox <- ggplot(data = Oxboys, mapping = aes(x = Occasion, y = height)) +
  geom_boxplot()
# ... overlaid with one line per boy, connecting his points across occasions.
boysbox + geom_line(mapping = aes(group = Subject), colour = "#3366FF")

4.5.4 Matching aesthetics to graphic objects

Another important issue with collective geom is how the aesthetics of the individual observations are mapped to the aesthetics of the complete entity.

4.6 Geoms

Geometric objects, or geoms for short, perform the actual rendering of the layer and control the type of plot that you create. For example, using a point geom will create a scatterplot, while using a line geom will create a line plot. Table 4.2 below lists all of the geoms available in ggplot2.

Name	Description
abline	Line, specified by slope and intercept
area	Area plots
bar	Bars, rectangles with bases on y-axis
blank	Blank, draws nothing
boxplot	Box-and-whisker plot
contour	Display contours of a 3d surface in 2d
crossbar	Hollow bar with middle indicated by horizontal line
density	Display a smooth density estimate
density_2d	Contours from a 2d density estimate
errorbar	Error bars
histogram	Histogram
hline	Line, horizontal
interval	Base for all interval (range) geoms
jitter	Points, jittered to reduce overplotting
line	Connect observations, in order of x value
linerange	An interval represented by a vertical line
path	Connect observations, in original order
point	Points, as for a scatterplot
pointrange	An interval represented by a vertical line, with a point in the middle
polygon	Polygon, a filled path
quantile	Add quantile lines from a quantile regression
ribbon	Ribbons, y range with continuous x values
rug	Marginal rug plots
segment	Single line segments
smooth	Add a smoothed conditional mean
step	Connect observations by stairs
text	Textual annotations
tile	Tile plot as densely as possible, assuming that every tile is the same size
vline	Line, vertical

Table 4.2 Geoms in ggplot2

Each geom has a set of aesthetics that it understands, and a set that are required for drawing. For example, a point requires x and y position, and understands colour, size and shape aesthetics. A bar requires height (ymax), and understands width, border colour and fill colour. These are listed for all geoms in Table 4.3 in the book (p. 57).

4.7 Stat

A statistical transformation, or stat, transforms the data, typically by summarising it in some manner.

All currently available stats are listed in Table 4.4 below.

Name	Description
bin	Bin data
boxplot	Calculate components of box-and-whisker plot
contour	Contours of 3d data
density	Density estimation, 1d
density_2d	Density estimation, 2d
function	Superimpose a function
identity	Don’t transform data
qq	Calculation for quantile-quantile plot
quantile	Continuous quantiles
smooth	Add a smoother
spoke	Convert angle and radius to xend and yend
step	Create stair steps
sum	Sum unique values. Useful for overplotting on scatterplots
summary	Summarise y values at every unique x
unique	Remove duplicates

Table 4.4: Stats in ggplot2

To make sense in a graphic context a stat must be location-scale invariant: \(f(x + a) = f(x) + a\) and \(f(b \cdot x) = b \cdot f(x)\). This ensures that the transformation stays the same when you change the scales of the plot. A stat takes a dataset as input and returns a dataset as output, and so a stat can add new variables to the original dataset. It is possible to map aesthetics to these new variables.

These generated variables can be used instead of the variables present in the original dataset.

4.8 Position adjustments

Position adjustments apply minor tweaks to the position of elements within a layer. Table 4.5 lists all of the position adjustments available within ggplot2. Position adjustments are normally used with discrete data. Continuous data typically doesn’t overlap exactly, and when it does (because of high data density) minor adjustments, like jittering, are usually insufficient to fix the problem.

Name	Description
dodge	Adjust position by dodging overlaps to the side
fill	Stack overlapping objects and standardise them to have equal height
identity	Don’t adjust position
jitter	Jitter points to avoid overplotting
stack	Stack overlapping objects on top of one another

Table 4.5: The five position adjustments.

4.9 Pulling it all together

Once you have become comfortable with combining layers, you will be able to create graphics that are both intricate and useful.

Chapter 5 Toolbox

5.2 Overall layering strategy

In general, there are three purposes for a layer:

To display the data.
To display a statistical summary of the data.
To add additional metadata, context and annotations.

5.3 Basic plot types

These geoms are the fundamental building blocks of ggplot2. Most of these geoms are associated with a named plot: when that geom is used by itself in a plot, that plot has a special name.

Each of these geoms is two dimensional and requires both x and y aesthetics. All understand colour and size aesthetics, and the filled geoms (bar, tile and polygon) also understand fill. The point geom uses shape and line and path geoms understand linetype. The geoms are used for displaying data, summaries computed elsewhere, and metadata.

geom_area() draws an area plot, which is a line plot filled to the y-axis (filled lines). Multiple groups will be stacked on top of each other.
geom_bar(stat = "identity") makes a barchart. We need stat = "identity" because the default stat automatically counts values (so is essentially a 1d geom, see § 5.4). The identity stat leaves the data unchanged.

By default, multiple bars in the same location will be stacked on top of one another.

geom_line() makes a line plot. The group aesthetic determines which observations are connected; see Section 4.5.3 for more details. geom_path is similar to a geom_line, but lines are connected in the order they appear in the data, not from left to right.
geom_point() produces a scatterplot.
geom_polygon() draws polygons, which are filled paths. Each vertex of the polygon requires a separate row in the data. It is often useful to merge a data frame of polygon coordinates with the data just prior to plotting. Section 5.7 illustrates this concept in more detail for map data.
geom_text() adds labels at the specified points. This is the only geom in this group that requires another aesthetic: label. It also has optional aesthetics hjust and vjust that control the horizontal and vertical position of the text; and angle which controls the rotation of the text. See Appendix B for more details.
geom_tile() makes an image plot or level plot. The tiles form a regular tessellation of the plane and typically have the fill aesthetic mapped to another variable.

# Three-point toy data set; x is deliberately not sorted so that line and
# path plots differ.
df <- data.frame(
  x = c(3, 1, 5),
  y = c(2, 4, 6),
  label = c("a", "b", "c")
)

# Base plot shared by the geom examples below, with axis labels removed.
p <- ggplot(data = df, mapping = aes(x = x, y = y, label = label)) +
  xlab(NULL) +
  ylab(NULL)

## opts(title = "mytitle") changed to labs(title = "mytitle") 

# Each call below adds one basic geom to the shared base plot `p` and uses
# the geom's name as the plot title.

# Scatterplot.
p + geom_point() + labs(title = "geom_point")

# Bar chart; stat = "identity" uses the y values as bar heights.
p + geom_bar(stat = "identity") + labs(title = "geom_bar(stat=\"identity\")")

# Line plot: points joined from left to right.
p + geom_line() + labs(title = "geom_line")

# Area plot.
p + geom_area() + labs(title = "geom_area")

# Path plot: points joined in the order they appear in the data rather
# than from left to right.
p + geom_path() + labs(title = "geom_path")

# Text labels drawn at each point.
p + geom_text() + labs(title = "geom_text")

# Tiles.
p + geom_tile() + labs(title = "geom_tile")

# Filled polygon through the points.
p + geom_polygon() + labs(title = "geom_polygon")

5.4 Displaying distributions

# Base plot for the distribution of depth, with x limited to [58, 68].
depth_dist <- ggplot(data = diamonds, mapping = aes(x = depth)) +
  xlim(58, 68)

# Density histograms, one facet row per cut.
depth_dist +
  geom_histogram(mapping = aes(y = ..density..), binwidth = 0.1) +
  facet_grid(cut ~ .)

## Warning: Removed 669 rows containing non-finite values (stat_bin).

## Warning: Removed 10 rows containing missing values (geom_bar).

# Conditional density: bars filled by cut and stretched to equal height.
depth_dist +
  geom_histogram(mapping = aes(fill = cut), binwidth = 0.1, position = "fill")

## Warning: Removed 669 rows containing non-finite values (stat_bin).

## Warning: Removed 10 rows containing missing values (geom_bar).

# Frequency polygons of density, one coloured line per cut.
depth_dist +
  geom_freqpoly(mapping = aes(y = ..density.., colour = cut), binwidth = 0.1)

## Warning: Removed 669 rows containing non-finite values (stat_bin).

## Warning: Removed 10 rows containing missing values (geom_path).

geom_boxplot = stat_boxplot + geom_boxplot: box-and-whisker plot, for a continuous variable conditioned by a categorical variable.
geom_jitter = position_jitter + geom_point: a crude way of looking at discrete distributions by adding random noise to the discrete values so that they don’t overplot.
geom_density = stat_density + geom_area: a smoothed version of the frequency polygon based on kernel smoothers.

5.5 Dealing with overplotting

The scatterplot is a very important tool for assessing the relationship between two continuous variables. However, when the data is large, often points will be plotted on top of each other, obscuring the true relationship. This problem is called overplotting and there are a number of ways to deal with it:

Small amounts of overplotting can sometimes be alleviated by making the points smaller, or using hollow glyphs.

# 2000 points with independent standard-normal x and y, used to demonstrate
# overplotting.
df <- data.frame(x = rnorm(2000), y = rnorm(2000))
norm <- ggplot(data = df, mapping = aes(x = x, y = y))

# Default solid points.
norm + geom_point()

# Hollow circles.
norm + geom_point(shape = 1)

# Pixel-sized points.
norm + geom_point(shape = ".")

For larger datasets with more overplotting, you can use alpha blending (transparency) to make the points transparent. If you specify alpha as a ratio, the denominator gives the number of points that must be overplotted to give a solid colour. In R, the lowest amount of transparency you can use is 1/256, so it will not be effective for heavy overplotting.

# Alpha blending: black points at decreasing opacity.
norm + geom_point(colour = alpha("black", 1 / 3))

norm + geom_point(colour = alpha("black", 1 / 5))

norm + geom_point(colour = alpha("black", 1 / 10))

If there is some discreteness in the data, you can randomly jitter the points to alleviate some overlaps. This is particularly useful in conjunction with transparency. By default, the amount of jitter added is 40% of the resolution of the data, which leaves a small gap between adjacent regions.

# Table vs. depth, with both axes limited to [50, 70].
td <- ggplot(data = diamonds, mapping = aes(x = table, y = depth)) +
  xlim(50, 70) +
  ylim(50, 70)

# Plain points.
td + geom_point()

## Warning: Removed 36 rows containing missing values (geom_point).

# Default jitter.
td + geom_jitter()

## Warning: Removed 41 rows containing missing values (geom_point).

# Jitter with an explicit width of 0.5 ...
jit <- position_jitter(width = 0.5)
td + geom_jitter(position = jit)

## Warning: Removed 41 rows containing missing values (geom_point).

# ... combined with increasing transparency.
td + geom_jitter(position = jit, colour = alpha("black", 1 / 10))

## Warning: Removed 41 rows containing missing values (geom_point).

td + geom_jitter(position = jit, colour = alpha("black", 1 / 50))

## Warning: Removed 41 rows containing missing values (geom_point).

td + geom_jitter(position = jit, colour = alpha("black", 1 / 200))

## Warning: Removed 41 rows containing missing values (geom_point).

Bin the points and count the number in each bin, then visualise that count in some way (the 2d generalisation of the histogram). Breaking the plot into many small squares can produce distracting visual artefacts.

# Base plot for the 2d-binning examples: carat vs. price, x limited to
# [1, 3], legend hidden.
# opts() has been removed from ggplot2; theme() is its replacement. With
# opts() the assignment failed, `d` was never created, and every plot below
# aborted with "object 'd' not found".
d <- ggplot(diamonds, aes(carat, price)) +
  xlim(1, 3) +
  theme(legend.position = "none")

# Square bins: default bin count, 10 bins per direction, then explicit bin
# widths (0.02 carat by 200 price units).
d + stat_bin2d()

d + stat_bin2d(bins = 10)

d + stat_bin2d(binwidth = c(0.02, 200))

# Hexagonal bins with the same three settings (stat_binhex() needs the
# hexbin package to be installed).
d + stat_binhex()

d + stat_binhex(bins = 10)

d + stat_binhex(binwidth = c(0.02, 200))

Estimate the 2d density with stat_density2d, and overlay contours from this distribution on the scatterplot, or display the density by itself as coloured tiles, or points with size proportional to density.

# Base plot for the 2d-density examples: carat vs. price, x limited to
# [1, 3], legend hidden.
# opts() has been removed from ggplot2; theme() is its replacement. With
# opts() the assignment failed and every plot below aborted with
# "object 'd' not found".
d <- ggplot(diamonds, aes(carat, price)) +
  xlim(1, 3) +
  theme(legend.position = "none")

# Scatterplot with density contours overlaid.
d + geom_point() + geom_density2d()

# Density shown as points whose size follows the estimated density.
# scale_area(to = ...) no longer exists; scale_size(range = ...) gives the
# same output size range. FALSE is spelled out because `F` can be reassigned.
d + stat_density2d(geom = "point", aes(size = ..density..),
                   contour = FALSE) +
  scale_size(range = c(0.2, 1.5))

# Density shown as coloured tiles.
d + stat_density2d(geom = "tile", aes(fill = ..density..),
                   contour = FALSE)

# Redraw the previous plot with the fill gradient limited to a fixed
# density range.
last_plot() + scale_fill_gradient(limits = c(1e-5, 8e-4))

Another approach to dealing with overplotting is to add data summaries to help guide the eye to the true shape of the pattern within the data.

5.6 Surface plots

ggplot2 currently does not support true 3d surfaces. However, it does support the common tools for representing 3d surfaces in 2d: contours, coloured tiles and bubble plots.

‘ggplot2 Elegant Graphics for Data Analysis’ Book Study Notes

Ou Zhang

3/14/2019