--metadata title="R for Data Science"

R for Data Science (2e)

1 Data visualization

Download the packages and data

# Install tidyverse only when require() reports that it cannot be attached.
if (!require(tidyverse)) {
  install.packages("tidyverse")
}
## Loading required package: tidyverse
## Warning: package 'tidyverse' was built under R version 4.5.2
## Warning: package 'forcats' was built under R version 4.5.2
## Warning: package 'lubridate' was built under R version 4.5.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.1     ✔ stringr   1.5.2
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidyverse)
# Install palmerpenguins only when require() reports it cannot be attached.
# Package site: https://allisonhorst.github.io/palmerpenguins/
if (!require(palmerpenguins)) {
  install.packages("palmerpenguins")
}
## Loading required package: palmerpenguins
## Warning: package 'palmerpenguins' was built under R version 4.5.2
## 
## Attaching package: 'palmerpenguins'
## 
## The following objects are masked from 'package:datasets':
## 
##     penguins, penguins_raw
library(palmerpenguins)
# Install ggthemes only when require() reports that it cannot be attached.
if (!require(ggthemes)) {
  install.packages("ggthemes")
}
## Loading required package: ggthemes
## Warning: package 'ggthemes' was built under R version 4.5.2
library(ggthemes)
# Compact overview of penguins: dimensions, then one line per column with its
# type and leading values.
glimpse(penguins)
## Rows: 344
## Columns: 8
## $ species           <fct> Adelie, Adelie, Adelie, Adelie, Adelie, Adelie, Adel…
## $ island            <fct> Torgersen, Torgersen, Torgersen, Torgersen, Torgerse…
## $ bill_length_mm    <dbl> 39.1, 39.5, 40.3, NA, 36.7, 39.3, 38.9, 39.2, 34.1, …
## $ bill_depth_mm     <dbl> 18.7, 17.4, 18.0, NA, 19.3, 20.6, 17.8, 19.6, 18.1, …
## $ flipper_length_mm <int> 181, 186, 195, NA, 193, 190, 181, 195, 193, 190, 186…
## $ body_mass_g       <int> 3750, 3800, 3250, NA, 3450, 3650, 3625, 4675, 3475, …
## $ sex               <fct> male, female, female, NA, female, male, female, male…
## $ year              <int> 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007…

ggplot

# Data and x/y mappings are declared, but no geom layer is added yet.
ggplot(penguins, aes(x = flipper_length_mm, y = body_mass_g))

geom: the geometrical object that a plot uses to represent data

  • bar charts use bar geoms (geom_bar())
  • line charts use line geoms (geom_line())
  • boxplots use boxplot geoms (geom_boxplot())
  • scatterplots use point geoms (geom_point())
# Scatterplot of body mass against flipper length. na.rm = TRUE drops rows with
# missing values silently instead of with a warning.
penguins |>
  ggplot(aes(x = flipper_length_mm, y = body_mass_g)) +
  geom_point(na.rm = TRUE)

It’s always a good idea to be skeptical of any apparent relationship between two variables and ask if there may be other variables that explain or change the nature of this apparent relationship.

# Same scatterplot, with species mapped to colour at the global level.
ggplot(
  penguins,
  aes(x = flipper_length_mm, y = body_mass_g, colour = species)
) +
  geom_point(na.rm = TRUE)

Adding a smooth curve displaying the relationship between body mass and flipper length

When aesthetic mappings are defined in ggplot(), at the global level,
→ they are passed down to each of the subsequent geom layers of the plot.

Each geom function in ggplot2 can also take a mapping argument
→ this allows aesthetic mappings at the local level that are added to those inherited from the global level.

# colour = species is mapped globally, so it is inherited by both the point
# layer and the "lm" smoother.
ggplot(
  penguins,
  aes(x = flipper_length_mm, y = body_mass_g, colour = species)
) +
  geom_point(na.rm = TRUE) +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_smooth()`).

# colour = species is mapped only inside geom_point(), so it applies to the
# points but is not inherited by geom_smooth().
ggplot(penguins, aes(x = flipper_length_mm, y = body_mass_g)) +
  geom_point(aes(colour = species), na.rm = TRUE) +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_smooth()`).

# Final version: species shown by both colour and shape on the points, with
# title, axis and legend labels, and the colorblind colour scale.
penguins |>
  ggplot(aes(x = flipper_length_mm, y = body_mass_g)) +
  geom_point(aes(color = species, shape = species)) +
  geom_smooth(method = "lm") +
  labs(
    title = "Body mass and flipper length",
    subtitle = "Dimensions for Adelie, Chinstrap, and Gentoo Penguins",
    x = "Flipper length (mm)",
    y = "Body mass (g)",
    color = "Species",
    shape = "Species"
  ) +
  scale_color_colorblind()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

### 1.2.5 Exercises

  1. How many rows are in penguins? How many columns?
# The row and column counts are the first two lines of the glimpse() output.
glimpse(penguins)
## Rows: 344
## Columns: 8
## $ species           <fct> Adelie, Adelie, Adelie, Adelie, Adelie, Adelie, Adel…
## $ island            <fct> Torgersen, Torgersen, Torgersen, Torgersen, Torgerse…
## $ bill_length_mm    <dbl> 39.1, 39.5, 40.3, NA, 36.7, 39.3, 38.9, 39.2, 34.1, …
## $ bill_depth_mm     <dbl> 18.7, 17.4, 18.0, NA, 19.3, 20.6, 17.8, 19.6, 18.1, …
## $ flipper_length_mm <int> 181, 186, 195, NA, 193, 190, 181, 195, 193, 190, 186…
## $ body_mass_g       <int> 3750, 3800, 3250, NA, 3450, 3650, 3625, 4675, 3475, …
## $ sex               <fct> male, female, female, NA, female, male, female, male…
## $ year              <int> 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007…

Answer: penguins has 344 rows and 8 columns.

  2. What does the bill_depth_mm variable in the penguins data frame describe? Read the help for ?penguins to find out.
    Answer: bill_depth_mm is a number denoting bill depth in millimeters.

  3. Make a scatterplot of bill_depth_mm vs. bill_length_mm. That is, make a scatterplot with bill_depth_mm on the y-axis and bill_length_mm on the x-axis. Describe the relationship between these two variables.

# Bill depth against bill length. Species is mapped to colour and shape on the
# point layer only; the "lm" smoother inherits just the global x/y mapping.
ggplot(penguins, aes(x = bill_length_mm, y = bill_depth_mm)) +
  geom_point(aes(colour = species, shape = species), na.rm = TRUE) +
  geom_smooth(method = "lm") +
  labs(
    title = "Scatterplot of Bill Depth vs Bill Length",
    x = "Bill Length (mm)",
    y = "Bill Depth (mm)"
  )
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_smooth()`).

  4. What happens if you make a scatterplot of species vs. bill_depth_mm? What might be a better choice of geom?
# Scatterplot attempt: species (a factor) on the y-axis, bill depth on x.
ggplot(penguins, aes(x = bill_depth_mm, y = species)) +
  geom_point(na.rm = TRUE) +
  labs(
    title = "Scatterplot of Species vs Bill Depth",
    x = "Bill Depth (mm)",
    y = "Species"
  )

# Alternative geom: a boxplot of bill depth for each species.
ggplot(penguins, aes(x = bill_depth_mm, y = species)) +
  geom_boxplot(na.rm = TRUE) +
  labs(
    title = "Boxplot of Species vs Bill Depth",
    x = "Bill Depth (mm)",
    y = "Species"
  )

  5. Why does the following give an error and how would you fix it?

ggplot(data = penguins) + geom_point()

# Fixed version: x and y aesthetics are supplied globally for geom_point().
ggplot(penguins, aes(x = flipper_length_mm, y = bill_length_mm)) +
  geom_point()
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

  6. What does the na.rm argument do in geom_point()? What is the default value of the argument? Create a scatterplot where you successfully use this argument set to TRUE.

  7. Add the following caption to the plot you made in the previous exercise: “Data come from the palmerpenguins package.” Hint: Take a look at the documentation for labs()

# Boxplot of bill depth by species, with a caption added through labs().
ggplot(penguins, aes(x = bill_depth_mm, y = species)) +
  geom_boxplot(na.rm = TRUE) +
  labs(
    title = "Boxplot of Species vs Bill Depth",
    x = "Bill Depth (mm)",
    y = "Species",
    caption = "Data come from the palmerpenguins package."
  )

  8. Recreate the following visualization. What aesthetic should bill_depth_mm be mapped to? And should it be mapped at the global level or at the geom level?
# bill_depth_mm is mapped to point colour at the geom level; geom_smooth()
# adds a loess curve using only the global x/y mapping.
ggplot(penguins, aes(x = flipper_length_mm, y = body_mass_g)) +
  geom_point(aes(colour = bill_depth_mm), na.rm = TRUE) +
  geom_smooth(method = "loess", na.rm = TRUE)
## `geom_smooth()` using formula = 'y ~ x'

  9. Run this code in your head and predict what the output will look like. Then, run the code in R and check your predictions.
# color = island is mapped globally, so both layers inherit it.
# se = FALSE draws the smooth curves without their confidence ribbons.
penguins |>
  ggplot(aes(x = flipper_length_mm, y = body_mass_g, color = island)) +
  geom_point() +
  geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Setting se = FALSE tells geom_smooth() to hide the grey confidence ribbon
# and to display only the smoothed line itself.
  10. Will these two graphs look different? Why/why not?
# Version A: data and mapping are declared once in ggplot() and inherited by
# both geom_point() and geom_smooth().
ggplot(
  data = penguins,
  mapping = aes(x = flipper_length_mm, y = body_mass_g)
) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Version B: ggplot() is empty; the same data and mapping are repeated locally
# in each geom.
ggplot() +
  geom_point(
    data = penguins,
    mapping = aes(x = flipper_length_mm, y = body_mass_g)
  ) +
  geom_smooth(
    data = penguins,
    mapping = aes(x = flipper_length_mm, y = body_mass_g)
  )
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Warning: Removed 2 rows containing non-finite outside the scale range (`stat_smooth()`).
## Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

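The two graphs look the same. The first defines the data and mapping once, globally, in ggplot(); the second repeats the identical data and mapping locally in each geom.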

1.3 ggplot2 calls

# Call style 1: fully explicit, with the data and mapping arguments named.
ggplot(
  data = penguins,
  mapping = aes(x = flipper_length_mm, y = body_mass_g)
) +
  geom_point()
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Call style 2: the same plot with data and mapping passed positionally.
ggplot(penguins, aes(x = flipper_length_mm, y = body_mass_g)) + 
  geom_point()
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Call style 3: the same plot with the data piped into ggplot() via |>.
penguins |> 
  ggplot(aes(x = flipper_length_mm, y = body_mass_g)) + 
  geom_point()
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

1.4.3 Exercises

  1. Make a bar plot of species of penguins, where you assign species to the y aesthetic. How is this plot different?
# Bar chart with species on the y-axis, so the bars run horizontally;
# fct_infreq() orders the species levels by frequency.
ggplot(penguins, aes(y = fct_infreq(species))) +
  geom_bar(na.rm = TRUE)

  2. How are the following two plots different? Which aesthetic, color or fill, is more useful for changing the color of bars?

ggplot(penguins, aes(x = species)) + geom_bar(color = "red")

ggplot(penguins, aes(x = species)) + geom_bar(fill = "red")

# color controls only the outline of each bar.
ggplot(penguins, aes(x = species)) +
  geom_bar(color = "red") # outline colour

# fill controls the interior of each bar, so it is the more useful of the two
# for changing the colour of bars.
ggplot(penguins, aes(x = species)) +
  geom_bar(fill = "red") # fill colour

3. What does the bins argument in geom_histogram() do?

# binwidth sets the width of each bin in units of x; here each bar spans a
# 10 mm range of bill lengths.
ggplot(penguins, aes(x = bill_length_mm)) +
  geom_histogram(binwidth = 10) 
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_bin()`).

# binwidth = 100: each bar spans 100 mm.
# NOTE(review): presumably wider than the whole range of bill lengths, so the
# histogram collapses to very few bars; confirm on the rendered plot.
ggplot(penguins, aes(x = bill_length_mm)) +
  geom_histogram(binwidth = 100) 
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_bin()`).

# binwidth = 1: each bar spans a 1 mm range, giving a much finer histogram.
ggplot(penguins, aes(x = bill_length_mm)) +
  geom_histogram(binwidth = 1) 
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_bin()`).

# Request 20 bins explicitly. The argument is spelled `bins`; the earlier
# `bin = 20` was not a recognised parameter, so it was ignored and the default
# of 30 bins was used instead.
ggplot(penguins, aes(x = bill_length_mm)) +
  geom_histogram(bins = 20)
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_bin()`).

4. Make a histogram of the carat variable in the diamonds dataset that is available when you load the tidyverse package. Experiment with different binwidths. What binwidth reveals the most interesting patterns?

# Overview of diamonds; carat is the <dbl> column plotted below.
glimpse(diamonds)
## Rows: 53,940
## Columns: 10
## $ carat   <dbl> 0.23, 0.21, 0.23, 0.29, 0.31, 0.24, 0.24, 0.26, 0.22, 0.23, 0.…
## $ cut     <ord> Ideal, Premium, Good, Premium, Good, Very Good, Very Good, Ver…
## $ color   <ord> E, E, E, I, J, J, I, H, E, H, J, J, F, J, E, E, I, J, J, J, I,…
## $ clarity <ord> SI2, SI1, VS1, VS2, SI2, VVS2, VVS1, SI1, VS2, VS1, SI1, VS1, …
## $ depth   <dbl> 61.5, 59.8, 56.9, 62.4, 63.3, 62.8, 62.3, 61.9, 65.1, 59.4, 64…
## $ table   <dbl> 55, 61, 65, 58, 58, 57, 57, 55, 61, 61, 55, 56, 61, 54, 62, 58…
## $ price   <int> 326, 326, 327, 334, 335, 336, 336, 337, 337, 338, 339, 340, 34…
## $ x       <dbl> 3.95, 3.89, 4.05, 4.20, 4.34, 3.94, 3.95, 4.07, 3.87, 4.00, 4.…
## $ y       <dbl> 3.98, 3.84, 4.07, 4.23, 4.35, 3.96, 3.98, 4.11, 3.78, 4.05, 4.…
## $ z       <dbl> 2.43, 2.31, 2.31, 2.63, 2.75, 2.48, 2.47, 2.53, 2.49, 2.39, 2.…
# Min, quartiles, mean and max of carat: useful context when choosing a
# binwidth for the histograms below.
summary(diamonds$carat)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.2000  0.4000  0.7000  0.7979  1.0400  5.0100
# Histograms of carat, filled by diamond colour, at four bin widths
# (0.1, 0.2, 0.5 and 1 carat).
ggplot(diamonds, aes(x = carat, fill = color)) +
  geom_histogram(binwidth = 0.1)

ggplot(diamonds, aes(x = carat, fill = color)) +
  geom_histogram(binwidth = 0.2)

ggplot(diamonds, aes(x = carat, fill = color)) +
  geom_histogram(binwidth = 0.5)

ggplot(diamonds, aes(x = carat, fill = color)) +
  geom_histogram(binwidth = 1)

### 1.5 Visualizing relationships

### 1.5.1 A numerical and a categorical variable

# Boxplot of body mass for each species.
penguins |>
  ggplot(aes(x = species, y = body_mass_g)) +
  geom_boxplot()
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

# In the boxplot above: middle line = median (50th percentile); box = 25th to
# 75th percentile, so box height = interquartile range (IQR); whiskers extend
# to the most extreme points within 1.5 * IQR of the box.
# Density curves of body mass, one per species; outline and fill both follow
# species, with semi-transparent fill (alpha = 0.4).
ggplot(penguins, aes(x = body_mass_g, colour = species, fill = species)) +
  geom_density(na.rm = TRUE, linewidth = 1, alpha = 0.4) +
  labs(
    title = "Density graph of Body Mass",
    x = "Body Mass in g",
    y = "Density"
  )

### 1.5.2 Two categorical variables

# Stacked bar chart: penguin counts per island, split by species.
ggplot(penguins, aes(x = island, fill = species)) +
  geom_bar()

# position = "fill" rescales every bar to the same height, showing the
# proportion of each species on each island.
ggplot(penguins, aes(x = island, fill = species)) +
  geom_bar(position = "fill")

# Points for two categorical variables, shown as a counter-example.
ggplot(penguins, aes(x = island, y = species)) +
  geom_point() # not suitable for two categorical variables

1.5.4 Three or more variables

# Base scatterplot of two numerical variables.
penguins |>
  ggplot(aes(x = flipper_length_mm, y = body_mass_g)) +
  geom_point()
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Two more variables added through aesthetics: species as colour, island as
# shape.
penguins |>
  ggplot(aes(x = flipper_length_mm, y = body_mass_g)) +
  geom_point(aes(colour = species, shape = island))
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Facets instead of shape: one panel per island, points coloured by species.
penguins |>
  ggplot(aes(x = flipper_length_mm, y = body_mass_g)) +
  geom_point(aes(colour = species)) +
  facet_wrap(~island)
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

### 1.5.5 Exercises

  1. The mpg data frame that is bundled with the ggplot2 package contains 234 observations collected by the US Environmental Protection Agency on 38 car models. Which variables in mpg are categorical? Which variables are numerical? (Hint: Type ?mpg to read the documentation for the dataset.) How can you see this information when you run mpg?
# The type tags in the glimpse() output (<chr>, <int>, <dbl>) show which
# variables are stored as text and which as numbers.
glimpse(mpg)
## Rows: 234
## Columns: 11
## $ manufacturer <chr> "audi", "audi", "audi", "audi", "audi", "audi", "audi", "…
## $ model        <chr> "a4", "a4", "a4", "a4", "a4", "a4", "a4", "a4 quattro", "…
## $ displ        <dbl> 1.8, 1.8, 2.0, 2.0, 2.8, 2.8, 3.1, 1.8, 1.8, 2.0, 2.0, 2.…
## $ year         <int> 1999, 1999, 2008, 2008, 1999, 1999, 2008, 1999, 1999, 200…
## $ cyl          <int> 4, 4, 4, 4, 6, 6, 6, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 8, 8, …
## $ trans        <chr> "auto(l5)", "manual(m5)", "manual(m6)", "auto(av)", "auto…
## $ drv          <chr> "f", "f", "f", "f", "f", "f", "f", "4", "4", "4", "4", "4…
## $ cty          <int> 18, 21, 20, 21, 16, 18, 18, 18, 16, 20, 19, 15, 17, 17, 1…
## $ hwy          <int> 29, 29, 31, 30, 26, 26, 27, 26, 25, 28, 27, 25, 25, 25, 2…
## $ fl           <chr> "p", "p", "p", "p", "p", "p", "p", "p", "p", "p", "p", "p…
## $ class        <chr> "compact", "compact", "compact", "compact", "compact", "c…
  2. Make a scatterplot of hwy vs. displ using the mpg data frame. Next, map a third, numerical variable to color, then size, then both color and size, then shape. How do these aesthetics behave differently for categorical vs. numerical variables?

  4. What happens if you map the same variable to multiple aesthetics?

# Numerical third variable (cty) mapped to colour.
ggplot(mpg, aes(x = displ, y = hwy, colour = cty)) +
  geom_point(na.rm = TRUE)

# cty mapped to both colour and size.
# Note: a continuous variable cannot be mapped to shape.
ggplot(mpg, aes(x = displ, y = hwy, colour = cty, size = cty)) +
  geom_point(na.rm = TRUE)

# Categorical variables on colour (fl), shape (class) and size (drv).
ggplot(
  mpg,
  aes(x = displ, y = hwy, colour = fl, shape = class, size = drv)
) +
  geom_point(na.rm = TRUE)
## Warning: Using size for a discrete variable is not advised.
## Warning: The shape palette can deal with a maximum of 6 discrete values because more
## than 6 becomes difficult to discriminate
## ℹ you have requested 7 values. Consider specifying shapes manually if you need
##   that many of them.

# The same variable (fl) mapped to colour, shape and size at once.
ggplot(
  mpg,
  aes(x = displ, y = hwy, colour = fl, shape = fl, size = fl)
) +
  geom_point(na.rm = TRUE)
## Warning: Using size for a discrete variable is not advised.

3. In the scatterplot of hwy vs. displ, what happens if you map a third variable to linewidth?

# cty is mapped to linewidth, which geom_point() does not list among its
# aesthetics.
ggplot(mpg, aes(x = displ, y = hwy, linewidth = cty)) +
  geom_point(na.rm = TRUE)

  5. Make a scatterplot of bill_depth_mm vs. bill_length_mm and color the points by species. What does adding coloring by species reveal about the relationship between these two variables? What about faceting by species?
# Bill depth against bill length with species mapped to colour globally, so
# the linear smoother is fitted per species. method is given as the string
# "lm", matching every other geom_smooth() call in this file.
ggplot(
  penguins,
  aes(x = bill_length_mm, y = bill_depth_mm, colour = species)
) +
  geom_point(na.rm = TRUE) +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_smooth()`).

# Same relationship, faceted by species: each panel gets its own points and
# its own linear fit. method is given as the string "lm", matching every other
# geom_smooth() call in this file.
ggplot(penguins, aes(x = bill_length_mm, y = bill_depth_mm)) +
  geom_point(na.rm = TRUE) +
  geom_smooth(method = "lm") +
  facet_wrap(vars(species))
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_smooth()`).

  6. Why does the following yield two separate legends? How would you fix it to combine the two legends?

ggplot( data = penguins, mapping = aes( x = bill_length_mm, y = bill_depth_mm, color = species, shape = species ) ) + geom_point() + labs(color = "Species")

# Reproduces the problem: labs() retitles only the colour legend ("Species"),
# while the shape legend keeps its default title, so the two are drawn
# separately.
ggplot(
  data = penguins,
  mapping = aes(
    x = bill_length_mm, y = bill_depth_mm, 
    color = species, shape = species
  )
) +
  geom_point() +
  labs(color = "Species")
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Fix 1: give the colour and shape legends the same title so they are
# combined into one.
ggplot(
  data = penguins,
  mapping = aes(
    x = bill_length_mm, y = bill_depth_mm, 
    color = species, shape = species
  )
) +
  geom_point() +
  labs(color = "Species", shape = "Species")
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Fix 2: drop labs() entirely so both legends keep the same default title.
ggplot(
  data = penguins,
  mapping = aes(
    x = bill_length_mm, y = bill_depth_mm, 
    color = species, shape = species
  )
) +
  geom_point()
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

  7. Create the two following stacked bar plots. Which question can you answer with the first one? Which question can you answer with the second one?

ggplot(penguins, aes(x = island, fill = species)) + geom_bar(position = "fill")
ggplot(penguins, aes(x = species, fill = island)) + geom_bar(position = "fill")

# First plot: the proportion of each species on each island.
ggplot(penguins, aes(x = island, fill = species)) +
  geom_bar(position = "fill")

# Second plot: the geographical distribution of each species across the
# three islands.
ggplot(penguins, aes(x = species, fill = island)) +
  geom_bar(position = "fill")