R for Data Science(2e)

9 Layers

set up

download the package and data

# Install tidyverse only when it is missing. requireNamespace() checks for
# the package without attaching it; the library(tidyverse) call further down
# is what attaches it, and it fails loudly if the package is still unavailable.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}

## Loading required package: tidyverse

## Warning: package 'tidyverse' was built under R version 4.5.2

## Warning: package 'forcats' was built under R version 4.5.2

## Warning: package 'lubridate' was built under R version 4.5.2

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.1     ✔ stringr   1.5.2
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(tidyverse)

# Map `class` to point colour: each vehicle class gets its own hue.
mpg |>
  ggplot(mapping = aes(x = displ, y = hwy, colour = class)) +
  geom_point()

# Map `class` to point shape instead of colour.
mpg |>
  ggplot(mapping = aes(x = displ, y = hwy, shape = class)) +
  geom_point()

## Warning: The shape palette can deal with a maximum of 6 discrete values because more
## than 6 becomes difficult to discriminate
## ℹ you have requested 7 values. Consider specifying shapes manually if you need
##   that many of them.

## Warning: Removed 62 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Map `class` to point size (ggplot2 warns that size is not advised for a
# discrete variable).
mpg |>
  ggplot(mapping = aes(x = displ, y = hwy, size = class)) +
  geom_point()

## Warning: Using size for a discrete variable is not advised.

#> Warning: Using size for a discrete variable is not advised.

# Map `class` to point transparency (ggplot2 warns that alpha is not advised
# for a discrete variable).
mpg |>
  ggplot(mapping = aes(x = displ, y = hwy, alpha = class)) +
  geom_point()

## Warning: Using alpha for a discrete variable is not advised.

#> Warning: Using alpha for a discrete variable is not advised.

# Aesthetics given outside aes() are constants applied to every point:
# here a filled circle (shape 21) with a yellow border and a red interior.
mpg |>
  ggplot(mapping = aes(x = displ, y = hwy)) +
  geom_point(shape = 21, size = 2, fill = "red", colour = "yellow")

9.2.1 Exercises

Create a scatterplot of hwy vs. displ where the points are pink filled in triangles.
Why did the following code not result in a plot with blue points?

ggplot(mpg) + geom_point(aes(x = displ, y = hwy, color = "blue"))

What does the stroke aesthetic do? What shapes does it work with? (Hint: use ?geom_point)

In ggplot2, the stroke aesthetic controls the width (thickness) of a point’s border, measured in mm. It functions similarly to the linewidth aesthetic used for lines and polygons, allowing you to distinguish the thickness of an outline from the overall size of the shape. It is most useful for shapes 21–24, which have a border (colour) that is drawn separately from the interior (fill).

What happens if you map an aesthetic to something other than a variable name, like aes(color = displ < 5)? Note, you’ll also need to specify x and y.

# 1. Pink filled triangles: shape 17, coloured with a constant `colour`.
mpg |>
  ggplot(mapping = aes(x = displ, y = hwy)) +
  geom_point(colour = "pink", shape = 17)

# 2. Inside aes(), "blue" is mapped like a variable rather than used as a
#    literal colour...
ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy, colour = "blue"))

#    ...whereas outside aes() it is set as the colour of every point.
ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy), colour = "blue")

# 3. stroke demonstrated on a filled circle.
mpg |>
  ggplot(mapping = aes(x = displ, y = hwy)) +
  geom_point(
    shape = 21,       # circle that takes a fill
    colour = "black", # border colour
    fill = "red",     # interior colour
    size = 5,         # size of the point
    stroke = 2,       # thickness of the black border
    alpha = 0.5
  )

# 4. An expression inside aes() is evaluated per row; here the logical result
#    of `displ < 5` drives the colour.
mpg |>
  ggplot(mapping = aes(x = displ, y = hwy, colour = displ < 5)) +
  geom_point(shape = 17)

9.3 Geometric objects

# Colour is a global mapping shared by both layers; shape and linetype are
# added per layer, so each drive train gets its own symbol and line pattern.
mpg |>
  ggplot(mapping = aes(x = displ, y = hwy, colour = drv)) +
  geom_point(mapping = aes(shape = drv)) +
  geom_smooth(mapping = aes(linetype = drv))

## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# One smoothed line fitted to the whole dataset.
mpg |>
  ggplot(mapping = aes(x = displ, y = hwy)) +
  geom_smooth()

## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# `group` alone splits the data into one line per drive train, but it adds
# neither a legend nor any distinguishing feature to the lines.
mpg |>
  ggplot(mapping = aes(x = displ, y = hwy)) +
  geom_smooth(mapping = aes(group = drv))

## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# Mapping colour and linetype to `drv` separates the lines and adds a legend.
mpg |>
  ggplot(mapping = aes(x = displ, y = hwy)) +
  geom_smooth(mapping = aes(colour = drv, linetype = drv))

## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# show.legend = FALSE suppresses the legend for this layer.
# se = FALSE hides the grey ribbon, so only the smoothed lines are displayed.
mpg |>
  ggplot(mapping = aes(x = displ, y = hwy)) +
  geom_smooth(
    mapping = aes(colour = drv, linetype = drv),
    se = FALSE,
    show.legend = FALSE
  )

## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

Local layer

# A mapping given inside one layer stays local to it: the points are coloured
# by class, while the smoother still fits a single line to all the data.
mpg |>
  ggplot(mapping = aes(x = displ, y = hwy)) +
  geom_point(mapping = aes(colour = class)) +
  geom_smooth()

## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# Highlight a subset of the data with explicitly chosen colours: all cars in
# blue, then the 2-seaters redrawn in red and circled with a larger open ring.
mpg |>
  ggplot(mapping = aes(x = displ, y = hwy)) +
  geom_point(colour = "blue") +
  geom_point(
    data = filter(mpg, class == "2seater"),
    colour = "red"
  ) +
  geom_point(
    data = filter(mpg, class == "2seater"),
    colour = "red", size = 3, shape = "circle open"
  )

# Alternative: map colour to the logical test itself. Shorter, but the colours
# are then picked by ggplot2 rather than by us.
mpg |>
  ggplot(mapping = aes(x = displ, y = hwy)) +
  geom_point(mapping = aes(colour = class == "2seater"))

Geoms are the fundamental building blocks of ggplot2

# Histogram of highway mileage, 2 mpg per bin.
mpg |>
  ggplot(mapping = aes(x = hwy)) +
  geom_histogram(binwidth = 2)

# Density estimate of the same variable.
mpg |>
  ggplot(mapping = aes(x = hwy)) +
  geom_density()

# Boxplot of the same variable.
mpg |>
  ggplot(mapping = aes(x = hwy)) +
  geom_boxplot()

### 9.3.1 Exercises

What geom would you use to draw a line chart? A boxplot? A histogram? An area chart?

In ggplot2, geometric objects (geoms) determine the visual representation of your data. To draw the specified charts, you would use the following functions:

- Line Chart: geom_line(). It connects data points from left to right to display trends, often over time.
- Boxplot: geom_boxplot(). It visualizes the distribution of data through a five-number summary (minimum, first quartile, median, third quartile, and maximum).
- Histogram: geom_histogram(). It displays the distribution of a continuous variable by “binning” values into ranges and counting the number of observations in each.
- Area Chart: geom_area(). It is similar to a line chart but fills the area between the line and the x-axis with color or shading.

# Line chart: mean engine displacement per model year, with the yearly means
# also marked as points.
mpg |>
  group_by(year) |>
  summarise(mean = mean(displ)) |>
  ggplot(mapping = aes(x = year, y = mean)) +
  geom_line() +
  geom_point()

# Boxplot: displacement, one box per model year (year treated as a factor).
mpg |>
  ggplot(mapping = aes(x = displ, colour = as.factor(year))) +
  geom_boxplot() +
  labs(colour = "Year")

# Histogram: highway mileage in 20 bins, filled by whether cty exceeds 15.
mpg |>
  ggplot(mapping = aes(x = hwy, fill = cty > 15)) +
  geom_histogram(bins = 20)

# Area chart: the same yearly means as the line chart, drawn as a filled area.
mpg |>
  group_by(year) |>
  summarise(mean = mean(displ)) |>
  ggplot(mapping = aes(x = year, y = mean)) +
  geom_area(alpha = 0.5, fill = "green")

Earlier in this chapter we used show.legend without explaining it:

ggplot(mpg, aes(x = displ, y = hwy)) + geom_smooth(aes(color = drv), show.legend = FALSE)

What does show.legend = FALSE do here? What happens if you remove it? Why do you think we used it earlier?

show.legend = FALSE suppresses the legend for this layer. If you remove it, a legend for drv is drawn next to the plot.

What does the se argument to geom_smooth() do?
The se argument controls whether geom_smooth() draws the grey ribbon around the smoothed line; when you set se to FALSE, you are telling R to hide that ribbon. The grey ribbon represents the confidence interval around the smoothed line. Statistical meaning: by default, it displays a 95% confidence interval for the smoothed mean. This means that if you were to repeat the sampling process many times, about 95% of the ribbons constructed this way would contain the true underlying trend.
Recreate the R code necessary to generate the following graphs. Note that wherever a categorical variable is used in the plot, it’s drv.

# Plot 1: all points plus a single smoothed line without the ribbon.
mpg |>
  ggplot(mapping = aes(x = displ, y = hwy)) +
  geom_point(size = 2) +
  geom_smooth(se = FALSE)

## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# Plot 2: one smoothed line per drive train, all in the same colour.
mpg |>
  ggplot(mapping = aes(x = displ, y = hwy)) +
  geom_point(size = 2) +
  geom_smooth(mapping = aes(group = drv), se = FALSE)

## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# Plot 3: colour mapped globally, so points and lines are both split by drv.
mpg |>
  ggplot(mapping = aes(x = displ, y = hwy, colour = drv)) +
  geom_point(size = 2) +
  geom_smooth(se = FALSE)

## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# Plot 4: colour mapped only in the point layer; the smoother stays a single
# line over all the data.
mpg |>
  ggplot(mapping = aes(x = displ, y = hwy)) +
  geom_point(mapping = aes(colour = drv), size = 2) +
  geom_smooth(se = FALSE)

## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# Plot 5: points coloured by drv, lines distinguished by linetype.
mpg |>
  ggplot(mapping = aes(x = displ, y = hwy)) +
  geom_point(mapping = aes(colour = drv), size = 2) +
  geom_smooth(mapping = aes(linetype = drv), se = FALSE)

## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# Plot 6: a thick white open circle is drawn first, then the point coloured by
# drv is drawn on top of it.
mpg |>
  ggplot(mapping = aes(x = displ, y = hwy)) +
  geom_point(colour = "white", shape = 1, size = 2, stroke = 2) +
  geom_point(mapping = aes(colour = drv), size = 2)

9.4 Facets

# facet_wrap(): one panel per value of a single variable.
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point() +
  facet_wrap(~cyl)

# facet_grid(): a grid of panels from two variables; formula is rows ~ cols.
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point() +
  facet_grid(drv ~ cyl)

# Free scales: by default every panel shares the same x and y scales.
# - "free"   frees both axes,
# - "free_x" allows different x-axis scales across columns,
# - "free_y" allows different y-axis scales across rows.
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point() +
  facet_grid(drv ~ cyl, scales = "free")

ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point() +
  facet_grid(drv ~ cyl, scales = "free_y")

9.4.1 Exercises

What happens if you facet on a continuous variable?

# Faceting creates a separate panel for every unique value of the variable.
mpg |>
  ggplot(mapping = aes(x = displ, y = hwy)) +
  geom_point() +
  facet_wrap(~cyl)

mpg |>
  ggplot(mapping = aes(x = displ, y = hwy)) +
  geom_point() +
  facet_wrap(~cty)

# Number of distinct cty values, i.e. the number of panels above.
mpg$cty |>
  unique() |>
  length()

## [1] 21

# Faceting on displ: again one panel per distinct value.
mpg |>
  ggplot(mapping = aes(x = cty, y = hwy)) +
  geom_point() +
  facet_wrap(~displ)

# Number of distinct displ values, i.e. the number of panels above.
mpg$displ |>
  unique() |>
  length()

## [1] 35

# Bin the continuous variable first (here into intervals of width 2) so that
# each panel covers a range of values instead of a single value.
mpg |>
  ggplot(mapping = aes(x = cty, y = hwy)) +
  geom_point() +
  facet_wrap(~ cut_width(displ, 2))

a graph for each value of the continuous variable: create a separate facet for every unique value in that dataset

cut function

- cut_interval(): divides data into a fixed number of bins (e.g., exactly 5 bins).
- cut_width(): divides data by a fixed width (e.g., a bin for every 10 units).
- cut_number(): divides data so each facet has roughly the same number of data points.
- cut(): standard R function for custom, manual break points.

What do the empty cells in the plot above with facet_grid(drv ~ cyl) mean? Run the following code. How do they relate to the resulting plot?

ggplot(mpg) + geom_point(aes(x = drv, y = cyl))

# The faceted plot whose empty panels are in question...
mpg |>
  ggplot(mapping = aes(x = displ, y = hwy)) +
  geom_point() +
  facet_grid(drv ~ cyl)

# ...and the drv/cyl combinations that actually occur in the data.
ggplot(data = mpg) +
  geom_point(mapping = aes(x = drv, y = cyl))

3. What plots does the following code make? What does . do?

# `drv ~ .` facets on rows only.
ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy)) +
  facet_grid(drv ~ .)

# `. ~ cyl` facets on columns only.
ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy)) +
  facet_grid(. ~ cyl)

# facet_wrap(~drv) for comparison: panels laid out as columns.
ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy)) +
  facet_wrap(~drv)

4. Take the first faceted plot in this section:

ggplot(mpg) + geom_point(aes(x = displ, y = hwy)) + facet_wrap(~ cyl, nrow = 2)

# Faceting by cyl, panels arranged in two rows.
ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy)) +
  facet_wrap(~ cyl, nrow = 2)

# The same split expressed with colour: cyl as a number...
ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy, colour = cyl))

# ...and cyl converted to a factor.
ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy, colour = as.factor(cyl)))

What are the advantages to using faceting instead of the color aesthetic? What are the disadvantages? How might the balance change if you had a larger dataset?

Read ?facet_wrap. What does nrow do? What does ncol do? What other options control the layout of the individual panels? Why doesn’t facet_grid() have nrow and ncol arguments?

# Base scatterplot reused by the faceting examples in this block.
p <- ggplot(mpg, aes(displ, hwy)) + geom_point()

# Use vars() to supply faceting variables:
p + facet_wrap(vars(class))

# Control the number of rows and columns with nrow and ncol
p + facet_wrap(vars(class), nrow = 4)

p + facet_wrap(vars(class), ncol = 4)

# You can facet by multiple variables
ggplot(mpg, aes(displ, hwy)) +
  geom_point() +
  facet_wrap(vars(cyl, drv))

# Use the `labeller` option to control how labels are printed:
ggplot(mpg, aes(displ, hwy)) +
  geom_point() +
  facet_wrap(vars(cyl, drv), labeller = "label_both")

# Use of facet_grid. The stray empty argument between the formula and
# `labeller` has been removed; `labeller` is passed by name.
p + facet_grid(cyl ~ drv, labeller = "label_both") # row: cyl, column: drv

p + facet_grid(drv ~ cyl, labeller = "label_both") # row: drv, column: cyl

Which of the following plots makes it easier to compare engine size (displ) across cars with different drive trains? What does this say about when to place a faceting variable across rows or columns?

# Histograms of displ stacked in rows by drive train (shared x-axis).
mpg |>
  ggplot(mapping = aes(x = displ)) +
  geom_histogram() +
  facet_grid(drv ~ .)

## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.

# The same histograms placed side by side in columns by drive train.
mpg |>
  ggplot(mapping = aes(x = displ)) +
  geom_histogram() +
  facet_grid(. ~ drv)

## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.

7. Recreate the following plot using facet_wrap() instead of facet_grid(). How do the positions of the facet labels change?

ggplot(mpg) + geom_point(aes(x = displ, y = hwy)) + facet_grid(drv ~ .)

# Original: facet_grid() with drv on rows.
ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy)) +
  facet_grid(drv ~ .)

# facet_wrap() with its default layout.
ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy)) +
  facet_wrap(~drv)

# facet_wrap() forced into a single column, so the panels are stacked.
ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy)) +
  facet_wrap(~drv, ncol = 1)

9.5.1 Exercises

What is the default geom associated with stat_summary()? How could you rewrite the previous plot to use that geom function instead of the stat function?
What does geom_col() do? How is it different from geom_bar()?

# geom_bar(): only x is supplied; y is the frequency count of x.
g <- mpg |> ggplot(mapping = aes(class))
g + geom_bar()

# geom_col(): both x and y are variables taken from the data.
mpg |>
  ggplot(mapping = aes(x = class, y = hwy)) +
  geom_col(mapping = aes(fill = drv))

# Use the computed `prop` instead of the default count (global mapping).
mpg |>
  ggplot(mapping = aes(x = class, y = after_stat(prop), group = 1)) +
  geom_bar()

# Same proportions, with the mapping supplied locally in the layer.
g + geom_bar(mapping = aes(y = after_stat(prop), group = 1))

Most geoms and stats come in pairs that are almost always used in concert. Make a list of all the pairs. What do they have in common? (Hint: Read through the documentation.)

| Visual Goal | Geom Function | Stat Function |
|---|---|---|
| Bar Chart (counting) | geom_bar() | stat_count() |
| Histogram | geom_histogram() | stat_bin() |
| Frequency Polygon | geom_freqpoly() | stat_bin() |
| Boxplot | geom_boxplot() | stat_boxplot() |
| Density Plot | geom_density() | stat_density() |
| Smooth Trend Line | geom_smooth() | stat_smooth() |
| Violin Plot | geom_violin() | stat_ydensity() |
| 2D Heatmap (Squares) | geom_bin2d() | stat_bin2d() |
| 2D Heatmap (Hexagons) | geom_hex() | stat_binhex() |
| Quantile Regression | geom_quantile() | stat_quantile() |
| Contours | geom_contour() | stat_contour() |
| Point Counts | geom_count() | stat_sum() |
| QQ Plot | geom_qq() | stat_qq() |
What variables does stat_smooth() compute? What arguments control its behavior?
In our proportion bar chart, we needed to set group = 1. Why? In other words, what is the problem with these two graphs?

ggplot(diamonds, aes(x = cut, y = after_stat(prop))) + geom_bar()

ggplot(diamonds, aes(x = cut, fill = color, y = after_stat(prop))) + geom_bar()

# Baseline: the default y is after_stat(count).
diamonds |>
  ggplot(mapping = aes(x = cut)) +
  geom_bar()

# The two problem plots from the exercise, without a group aesthetic.
diamonds |>
  ggplot(mapping = aes(x = cut, y = after_stat(prop))) +
  geom_bar()

diamonds |>
  ggplot(mapping = aes(x = cut, y = after_stat(prop), fill = color)) +
  geom_bar()

# group = 1: the denominator is the whole dataset.
diamonds |>
  ggplot(mapping = aes(x = cut, y = after_stat(prop), group = 1)) +
  geom_bar()

# group = 1 together with fill: the denominator is still the whole dataset,
# but the fill aesthetic is lost.
diamonds |>
  ggplot(
    mapping = aes(x = cut, y = after_stat(prop), fill = color, group = 1)
  ) +
  geom_bar()

## Warning: The following aesthetics were dropped during statistical transformation: fill.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?

# group = color: the denominator is the count within each color.
diamonds |>
  ggplot(
    mapping = aes(x = cut, y = after_stat(prop), fill = color, group = color)
  ) +
  geom_bar()

# To show each cut/color combination as a proportion of the entire dataset
# (not just within its own color), use after_stat(count/sum(count)).
diamonds |>
  ggplot(
    mapping = aes(x = cut, y = after_stat(count/sum(count)), fill = color)
  ) +
  geom_bar()

9.6 Position adjustments

# `colour` affects the outline of each bar...
mpg |>
  ggplot(mapping = aes(x = drv, colour = drv)) +
  geom_bar()

# ...while `fill` affects its interior.
mpg |>
  ggplot(mapping = aes(x = drv, fill = drv)) +
  geom_bar()

# Mapping fill to a different variable stacks the bars, because the default
# for geom_bar() is position = "stack".
mpg |>
  ggplot(mapping = aes(x = drv, fill = class)) +
  geom_bar()

# position = "identity" places each object exactly where it falls in the
# context of the graph. For bars this means they overlap, so to see the
# overlap either make them slightly transparent with a small alpha...
mpg |>
  ggplot(mapping = aes(x = drv, fill = class)) +
  geom_bar(position = "identity", alpha = 1/5)

# ...or completely transparent by setting fill = NA.
mpg |>
  ggplot(mapping = aes(x = drv, colour = class)) +
  geom_bar(position = "identity", fill = NA)

# position = "fill" stacks too, but makes every stack the same height, which
# makes proportions easier to compare across groups.
mpg |>
  ggplot(mapping = aes(x = drv, fill = class)) +
  geom_bar(position = "fill")

# position = "dodge" puts overlapping objects directly beside one another,
# which makes individual values easier to compare.
mpg |>
  ggplot(mapping = aes(x = drv, fill = class)) +
  geom_bar(position = "dodge")

# position = "jitter" adds a small amount of random noise to each point,
# spreading the points out because no two are likely to get the same noise.
mpg |>
  ggplot(mapping = aes(x = displ, y = hwy)) +
  geom_point(position = "jitter")

9.6.1 Exercises

What is the problem with the following plot? How could you improve it? ggplot(mpg, aes(x = cty, y = hwy)) + geom_point()

# Base plot for the exercise, first drawn with plain points.
q <- mpg |> ggplot(mapping = aes(x = cty, y = hwy))
q + geom_point()

# Alternative 1: semi-transparent points.
q + geom_point(alpha = 0.3)

# Alternative 2: jittered points.
q + geom_jitter()

# Alternative 3: geom_count().
q + geom_count()

What, if anything, is the difference between the two plots? Why?

# geom_point() without a position argument...
mpg |>
  ggplot(mapping = aes(x = displ, y = hwy)) +
  geom_point()

# ...and with position = "identity" spelled out.
mpg |>
  ggplot(mapping = aes(x = displ, y = hwy)) +
  geom_point(position = "identity")

# Result: no difference between the two plots.

What parameters to geom_jitter() control the amount of jittering?

# Base plot: hwy against cyl.
p <- mpg |> ggplot(mapping = aes(cyl, hwy))
p + geom_point()

p + geom_jitter()

# Aesthetic mappings can be added to the jittered layer as usual.
p + geom_jitter(mapping = aes(colour = class))

# Smaller width/height emphasise the categories: default first, then the
# horizontal and vertical amounts reduced separately and together.
p + geom_jitter()

p + geom_jitter(width = 0.15)

p + geom_jitter(height = 0.15)

p + geom_jitter(height = 0.15, width = 0.15)

p + geom_jitter(height = 0.15, width = 0.15, alpha = 0.3)

# Larger width/height completely smooth away the discreteness.
mpg |>
  ggplot(mapping = aes(cty, hwy)) +
  geom_jitter()

mpg |>
  ggplot(mapping = aes(cty, hwy)) +
  geom_jitter(height = 0.75, width = 0.75)

4. Compare and contrast geom_jitter() with geom_count().

What’s the default position adjustment for geom_boxplot()? Create a visualization of the mpg dataset that demonstrates it.

position = "dodge2"

# Boxplots of hwy by class: default position, then "dodge", then "identity".
p <- mpg |> ggplot(mapping = aes(x = class, y = hwy))
p + geom_boxplot()

p + geom_boxplot(position = "dodge")

p + geom_boxplot(position = "identity")

# Colouring by drv puts several boxes in one class; with the default position
# they are placed side by side.
mpg |>
  ggplot(mapping = aes(x = class, y = hwy, colour = drv)) +
  geom_boxplot() +
  labs(
    title = "Default 'dodge2' Position Adjustment in geom_boxplot()",
    x = "Vehicle Class",
    y = "Highway MPG",
    colour = "Drive Type"
  )

# The same plot with position = "identity" for comparison.
mpg |>
  ggplot(mapping = aes(x = class, y = hwy, colour = drv)) +
  geom_boxplot(position = "identity") +
  labs(
    title = " 'identity' Position Adjustment in geom_boxplot()",
    x = "Vehicle Class",
    y = "Highway MPG",
    colour = "Drive Type"
  )

9.8 The layered grammar of graphics

ggplot(data = <DATA>) +
  <GEOM_FUNCTION>(
    mapping = aes(<MAPPINGS>),
    stat = <STAT>,
    position = <POSITION>
  ) +
  <COORDINATE_FUNCTION> +
  <FACET_FUNCTION>