1 Loading libraries

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(palmerpenguins)
library(ggthemes)

2 Exploration

# Print the penguins tibble: shows the first 10 rows plus each column's type.
penguins
## # A tibble: 344 × 8
##    species island    bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
##    <fct>   <fct>              <dbl>         <dbl>             <int>       <int>
##  1 Adelie  Torgersen           39.1          18.7               181        3750
##  2 Adelie  Torgersen           39.5          17.4               186        3800
##  3 Adelie  Torgersen           40.3          18                 195        3250
##  4 Adelie  Torgersen           NA            NA                  NA          NA
##  5 Adelie  Torgersen           36.7          19.3               193        3450
##  6 Adelie  Torgersen           39.3          20.6               190        3650
##  7 Adelie  Torgersen           38.9          17.8               181        3625
##  8 Adelie  Torgersen           39.2          19.6               195        4675
##  9 Adelie  Torgersen           34.1          18.1               193        3475
## 10 Adelie  Torgersen           42            20.2               190        4250
## # ℹ 334 more rows
## # ℹ 2 more variables: sex <fct>, year <int>
# Transposed overview: one line per column with its type and first values.
glimpse(penguins)
## Rows: 344
## Columns: 8
## $ species           <fct> Adelie, Adelie, Adelie, Adelie, Adelie, Adelie, Adel…
## $ island            <fct> Torgersen, Torgersen, Torgersen, Torgersen, Torgerse…
## $ bill_length_mm    <dbl> 39.1, 39.5, 40.3, NA, 36.7, 39.3, 38.9, 39.2, 34.1, …
## $ bill_depth_mm     <dbl> 18.7, 17.4, 18.0, NA, 19.3, 20.6, 17.8, 19.6, 18.1, …
## $ flipper_length_mm <int> 181, 186, 195, NA, 193, 190, 181, 195, 193, 190, 186…
## $ body_mass_g       <int> 3750, 3800, 3250, NA, 3450, 3650, 3625, 4675, 3475, …
## $ sex               <fct> male, female, female, NA, female, male, female, male…
## $ year              <int> 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007…

3 Ggplot

# Scatterplot with flipper length on the x axis and body mass on the y axis.
ggplot(penguins, aes(x = flipper_length_mm, y = body_mass_g)) +
  geom_point()
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

4 Visuals

# Bar chart of the number of penguins per species, in factor-level order.
ggplot(data = penguins, mapping = aes(x = species)) +
  geom_bar()

# Same chart, with the species reordered by frequency via fct_infreq().
ggplot(data = penguins, mapping = aes(x = fct_infreq(species))) +
  geom_bar()

5 Numerical variables

# Histogram of body mass with a bin width of 200.
ggplot(data = penguins, mapping = aes(x = body_mass_g)) +
  geom_histogram(binwidth = 200)
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_bin()`).

# Much narrower bins (width 20) for comparison.
ggplot(data = penguins, mapping = aes(x = body_mass_g)) +
  geom_histogram(binwidth = 20)
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_bin()`).

# Much wider bins (width 2000) for comparison.
ggplot(data = penguins, mapping = aes(x = body_mass_g)) +
  geom_histogram(binwidth = 2000)
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_bin()`).

# Density curve of the same variable as an alternative to the histogram.
ggplot(data = penguins, mapping = aes(x = body_mass_g)) +
  geom_density()
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_density()`).

6 Exercises

6.1 Make a bar plot of species of penguins, where you assign species to the y aesthetic. How is this plot different?

# Mapping species to y instead of x draws the bars horizontally.
ggplot(data = penguins, mapping = aes(y = species)) +
  geom_bar()

With species assigned to the y aesthetic, the bars are drawn horizontally instead of vertically. The counts are the same as when species is on the x axis; only the orientation of the plot changes.

6.2 How are the following two plots different? Which aesthetic, color or fill, is more useful for changing the color of bars?

# color is set as a constant (outside aes()) on the bar layer.
ggplot(data = penguins, mapping = aes(x = species)) +
  geom_bar(color = "red")

# fill is set as a constant (outside aes()) on the bar layer.
ggplot(data = penguins, mapping = aes(x = species)) +
  geom_bar(fill = "red")

The first plot uses the color argument, which colours only the outline of each bar, while the second plot uses the fill argument, which fills the bars with the specified colour. The fill option is more useful for changing the colour of bars, and it can be used later to show differences between groups by mapping it to a variable.

7 Numerical and categorical variables

# Boxplot of body mass for each species.
ggplot(data = penguins, mapping = aes(x = species, y = body_mass_g)) +
  geom_boxplot() +
  labs(title = "Body mass by Species (Boxplot)")
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

# One density curve per species, distinguished by line colour.
ggplot(data = penguins, mapping = aes(x = body_mass_g, color = species)) +
  geom_density(linewidth = 0.75) +
  labs(title = "Body mass by Species (Density)")
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_density()`).

# Filled density curves; alpha = 0.5 makes the fills semi-transparent.
ggplot(
  data = penguins,
  mapping = aes(x = body_mass_g, color = species, fill = species)
) +
  geom_density(alpha = 0.5) +
  labs(title = "Body mass by Species (w/ alpha 0.5)")
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_density()`).

8 Two categorical variables

# Stacked bar chart: number of penguins on each island, split by species.
ggplot(penguins, aes(x = island, fill = species)) +
  geom_bar() +
  labs(title = "Island and Species")

# position = "fill" rescales every bar to the same height, so the chart shows
# the proportion of each species per island instead of raw counts.
ggplot(penguins, aes(x = island, fill = species)) +
  geom_bar(position = "fill") +
  labs(title = "Island and Species (w/ position fill)")

8.1 Three or more variables

# Baseline scatterplot of flipper length against body mass.
ggplot(
  data = penguins,
  mapping = aes(x = flipper_length_mm, y = body_mass_g)
) +
  geom_point()
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Add two more variables: species as colour and island as point shape.
ggplot(
  data = penguins,
  mapping = aes(x = flipper_length_mm, y = body_mass_g)
) +
  geom_point(mapping = aes(color = species, shape = island))
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Species shown by both colour and shape, with one panel per island.
ggplot(
  data = penguins,
  mapping = aes(x = flipper_length_mm, y = body_mass_g)
) +
  geom_point(mapping = aes(color = species, shape = species)) +
  facet_wrap(~island)
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

9 Exercises

9.1 The mpg data frame that is bundled with the ggplot2 package contains 234 observations collected by the US Environmental Protection Agency on 38 car models. Which variables in mpg are categorical? Which variables are numerical? (Hint: Type ?mpg to read the documentation for the dataset.) How can you see this information when you run mpg?

# Print the mpg tibble; the type of each column is shown under its name.
mpg
## # A tibble: 234 × 11
##    manufacturer model      displ  year   cyl trans drv     cty   hwy fl    class
##    <chr>        <chr>      <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
##  1 audi         a4           1.8  1999     4 auto… f        18    29 p     comp…
##  2 audi         a4           1.8  1999     4 manu… f        21    29 p     comp…
##  3 audi         a4           2    2008     4 manu… f        20    31 p     comp…
##  4 audi         a4           2    2008     4 auto… f        21    30 p     comp…
##  5 audi         a4           2.8  1999     6 auto… f        16    26 p     comp…
##  6 audi         a4           2.8  1999     6 manu… f        18    26 p     comp…
##  7 audi         a4           3.1  2008     6 auto… f        18    27 p     comp…
##  8 audi         a4 quattro   1.8  1999     4 manu… 4        18    26 p     comp…
##  9 audi         a4 quattro   1.8  1999     4 auto… 4        16    25 p     comp…
## 10 audi         a4 quattro   2    2008     4 manu… 4        20    28 p     comp…
## # ℹ 224 more rows
# Transposed overview of mpg: one line per column with its type and values.
glimpse(mpg)
## Rows: 234
## Columns: 11
## $ manufacturer <chr> "audi", "audi", "audi", "audi", "audi", "audi", "audi", "…
## $ model        <chr> "a4", "a4", "a4", "a4", "a4", "a4", "a4", "a4 quattro", "…
## $ displ        <dbl> 1.8, 1.8, 2.0, 2.0, 2.8, 2.8, 3.1, 1.8, 1.8, 2.0, 2.0, 2.…
## $ year         <int> 1999, 1999, 2008, 2008, 1999, 1999, 2008, 1999, 1999, 200…
## $ cyl          <int> 4, 4, 4, 4, 6, 6, 6, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 8, 8, …
## $ trans        <chr> "auto(l5)", "manual(m5)", "manual(m6)", "auto(av)", "auto…
## $ drv          <chr> "f", "f", "f", "f", "f", "f", "f", "4", "4", "4", "4", "4…
## $ cty          <int> 18, 21, 20, 21, 16, 18, 18, 18, 16, 20, 19, 15, 17, 17, 1…
## $ hwy          <int> 29, 29, 31, 30, 26, 26, 27, 26, 25, 28, 27, 25, 25, 25, 2…
## $ fl           <chr> "p", "p", "p", "p", "p", "p", "p", "p", "p", "p", "p", "p…
## $ class        <chr> "compact", "compact", "compact", "compact", "compact", "c…

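Categorical: manufacturer, model, trans, drv, fl, class
Numerical: displ, year, cyl, cty, hwy

Printing mpg shows each column's type beneath its name: the <chr> columns are categorical, while the <int> and <dbl> columns are numerical.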

9.2 Make a scatterplot of hwy vs. displ using the mpg data frame. Next, map a third, numerical variable to color, then size, then both color and size, then shape. How do these aesthetics behave differently for categorical vs. numerical variables?

# "hwy vs. displ" means hwy on the y axis and displ on the x axis.
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point() +
  labs(title = "Highway miles by displacement")

# A numerical variable mapped to colour.
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point(aes(colour = year)) +
  labs(title = "Highway miles by displacement and year")

# A numerical variable mapped to size.
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point(aes(size = cty)) +
  labs(title = "Highway miles by displacement and city miles")

# Numerical variables mapped to both colour and size.
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point(aes(colour = year, size = cty)) +
  labs(title = "Highway miles by displacement, year and city miles")

# Shape is mapped to the categorical drv: ggplot2 raises an error when a
# continuous variable is mapped to shape, so a numerical variable is not used.
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point(aes(colour = year, shape = drv)) +
  labs(title = "Highway miles by displacement, year and drive type")

The aesthetics behave differently: the x and y variables are mapped onto their corresponding axes, while colour changes the colour of each observation based on its numerical value. Because year is numerical, the colour is drawn as a gradient: for example, a car made in 2008 is a light blue colour, and the colour gets darker the older the car is. The shape reflects the drive train of the car, which is categorical, so each category gets its own symbol and it is easier to find either category.

9.3 In the scatterplot of hwy vs. displ, what happens if you map a third variable to linewidth?

# hwy vs. displ (hwy on y, displ on x) with a third variable on linewidth.
# geom_point() has no linewidth aesthetic, so the mapping changes nothing.
ggplot(mpg, aes(x = displ, y = hwy, linewidth = cty)) +
  geom_point()

Linewidth is ignored.

9.4 What happens if you map the same variable to multiple aesthetics?

# Map one variable (cty) to two aesthetics at once, colour and size, to see
# what happens when the same variable drives multiple aesthetics.
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point(aes(colour = cty, size = cty))

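Mapping the same variable to more than one aesthetic (for example, cty to both colour and size) is allowed, but it is redundant: each additional aesthetic repeats the same information rather than revealing anything new about the data.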

9.5 Make a scatterplot of bill_depth_mm vs. bill_length_mm and color the points by species. What does adding coloring by species reveal about the relationship between these two variables? What about faceting by species?

# "bill_depth_mm vs. bill_length_mm": depth on the y axis, length on the x
# axis, with the points coloured by species.
ggplot(
  penguins,
  aes(x = bill_length_mm, y = bill_depth_mm, colour = species)
) +
  geom_point()

# The same relationship with one panel per species.
ggplot(penguins, aes(x = bill_length_mm, y = bill_depth_mm)) +
  geom_point() +
  facet_wrap(~species)
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

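The colouring reveals that Gentoo penguins have the smallest bill depth but long bills. Chinstrap penguins have bill lengths similar to the Gentoo penguins and longer than the Adelie penguins. The Adelie penguins have the shortest bill length but have slightly more observations where their bill depth is greater than that of the Chinstrap penguins. Colouring by species also shows that, within each species, penguins with longer bills tend to have deeper bills, a pattern that is hidden when all the points are the same colour. Faceting by species shows the same within-species pattern, with each species in its own panel.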

9.6 Why does the following yield two separate legends? How would you fix it to combine the two legends?

# species is mapped to both colour and shape on the point layer; no legend
# title is overridden, so both aesthetics share a single legend.
ggplot(penguins, aes(x = bill_length_mm, y = bill_depth_mm)) +
  geom_point(aes(color = species, shape = species))
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

To fix this visualisation, I removed the additional label (the labs() call) which was present at the end of the code. It gave the colour legend a different title from the shape legend, so the two could not be merged and the legend was duplicated.

9.7 Create the two following stacked bar plots. Which question can you answer with the first one? Which question can you answer with the second one?

# Proportion of each species within every island.
ggplot(data = penguins, mapping = aes(x = island, fill = species)) +
  geom_bar(position = "fill")

# Proportion of each island within every species.
ggplot(data = penguins, mapping = aes(x = species, fill = island)) +
  geom_bar(position = "fill")

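From the first plot, you can see that Torgersen island is populated by Adelie penguins only. Dream island is populated by both Adelie and Chinstrap penguins, with Chinstrap penguins having a slightly higher population. Biscoe island is roughly 75% populated by Gentoo penguins with a pocket population of roughly 25% Adelie penguins. The second plot answers the reverse question, namely how each species is distributed across the islands: Adelie penguins live on all three islands, while Chinstrap penguins are found only on Dream and Gentoo penguins only on Biscoe.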

10 Saving your plots

# Draw the scatterplot that is about to be written to disk.
ggplot(
  data = penguins,
  mapping = aes(x = flipper_length_mm, y = body_mass_g)
) +
  geom_point()
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

# No plot argument is given, so ggsave() saves the last plot displayed.
ggsave("penguin-plot.png")
## Saving 7 x 5 in image
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

11 Last exercise

11.1 Run the following lines of code. Which of the two plots is saved as mpg-plot.png? Why?

11.2 What do you need to change in the code above to save the plot as a PDF instead of a PNG? How could you find out what types of image files would work in ggsave()?

# First plot: bar chart of the number of cars in each class.
ggplot(data = mpg, mapping = aes(x = class)) +
  geom_bar()

# Second plot: scatterplot of cty (x) against hwy (y).
ggplot(data = mpg, mapping = aes(x = cty, y = hwy)) +
  geom_point()

# No plot argument is given, so ggsave() saves the last plot displayed.
ggsave(filename = "mpg-plot.png")
## Saving 7 x 5 in image

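The second plot (the scatterplot of cty and hwy) is the one saved as mpg-plot.png, because ggsave() saves the most recently displayed plot unless a plot is passed to it explicitly. To save it as a PDF instead, change the file name to mpg-plot.pdf; ggsave() chooses the file type from the extension. The supported file types are listed under the device argument in the help page, which you can open with ?ggsave.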