Setup

# Global chunk options: echo the code of every chunk and center all figures.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned, whereas `TRUE` is a reserved word.
knitr::opts_chunk$set(
  echo = TRUE,
  fig.align = "center"
)

# Load the tidyverse
library(tidyverse)

The diamonds data

We’ll start by using the diamonds data frame, stored in ggplot2.
Take a look at it:

# Assign the data set that ships with ggplot2 to a global variable
diamonds <- ggplot2::diamonds

# Print the first rows together with the column types
as_tibble(diamonds)
## # A tibble: 53,940 × 10
##    carat cut       color clarity depth table price     x     y     z
##    <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
##  1  0.23 Ideal     E     SI2      61.5    55   326  3.95  3.98  2.43
##  2  0.21 Premium   E     SI1      59.8    61   326  3.89  3.84  2.31
##  3  0.23 Good      E     VS1      56.9    65   327  4.05  4.07  2.31
##  4  0.29 Premium   I     VS2      62.4    58   334  4.2   4.23  2.63
##  5  0.31 Good      J     SI2      63.3    58   335  4.34  4.35  2.75
##  6  0.24 Very Good J     VVS2     62.8    57   336  3.94  3.96  2.48
##  7  0.24 Very Good I     VVS1     62.3    57   336  3.95  3.98  2.47
##  8  0.26 Very Good H     SI1      61.9    55   337  4.07  4.11  2.53
##  9  0.22 Fair      E     VS2      65.1    61   337  3.87  3.78  2.49
## 10  0.23 Very Good H     VS1      59.4    61   338  4     4.05  2.39
## # ℹ 53,930 more rows

For more info, run help(diamonds) in the console.

Q1) Look at diamond weight (carat)

Start by creating a blank map called gg_carat with carat mapped to the x-axis. Make sure to include an appropriate theme!

# Base plot for the carat questions: carat on the x-axis, no geoms yet
gg_carat <-
  ggplot(diamonds, aes(x = carat)) +
  theme_bw() +
  # No padding at the bottom of the y-axis, 5% padding at the top
  scale_y_continuous(expand = expansion(mult = c(0, 0.05)))

# Display the (still empty) plot
gg_carat

Create histograms below using the blank map with 20, 30, and 40 bins.

The lines should be black and the bars white.

Which would you recommend?

# The three histograms share the same styling (white bars, black outlines)
# and differ only in the number of bins, so they can be compared directly.

# histogram with 20 bins
gg_carat + 
  geom_histogram(
    fill = "white",
    color = "black",
    bins = 20
  )

# histogram with 30 bins
gg_carat + 
  geom_histogram(
    fill = "white",
    color = "black",
    bins = 30
  )

# histogram with 40 bins
gg_carat + 
  geom_histogram(
    fill = "white",
    color = "black",
    bins = 40
  )

density plot for diamond weight

Repeat the graph above, but create a density plot instead of a histogram

# Density curve of carat, with the area under the curve filled white
gg_carat + geom_density(fill = "white")

Describe the important features of the data using the plots above.

The shape of carat is unimodal and right skewed

Mean vs Median

How should the mean and median of carat compare to each other? Briefly explain why

Since it is right skewed, the mean will be larger than the median

Find the mean and the median in the code chunk below:

# Mean carat weight
mean(diamonds[["carat"]])
## [1] 0.7979397
# Median carat weight
median(diamonds[["carat"]])
## [1] 0.7

Question 2) Price by cut, color, and clarity

Create box plots of price by cut of the diamond. Save it as gg_price_cut

# Horizontal box plots of price, one box per cut
gg_price_cut <- 
  ggplot(
    data = diamonds,
    mapping = aes(
      x = price,
      y = cut
    )
  ) + 
  
  geom_boxplot(
    mapping = aes(fill = cut),
    # The y-axis already names each cut, so the fill legend is redundant.
    # `FALSE` is spelled out because `F` can be reassigned.
    show.legend = FALSE
  ) +
  
  # Drop the x-axis title
  labs(x = NULL) +
  
  # Add this line of code to change the x-axis labels to be in dollars
  scale_x_continuous(labels = scales::dollar)


gg_price_cut

Create small multiples of the box plots above by diamond color.

# Faceting on a single variable -> facet_wrap, laid out in two columns
gg_price_cut +
  facet_wrap(~ color, ncol = 2)

Create small multiples of the box plots from this section by color and clarity

# Faceting on two variables -> facet_grid (clarity in rows, color in columns)
gg_price_cut +
  facet_grid(clarity ~ color)

Question 3) Looking at length and width of diamonds

3.1) width by length

The code below will create a data set removing certain outliers from the data

# Build diamonds2: a seeded random sample of 2000 diamonds without outliers
RNGversion("4.1.0")
set.seed(1870)
diamonds2 <-
  diamonds |>
  # Keep the diamonds that aren't outliers (x becomes length, y becomes width)
  filter(x > 3 & y < 20) |>
  # Give the dimension columns descriptive names
  rename(length = x, width = y, height = z) |>
  # Draw the random 2000 diamonds
  slice_sample(n = 2000)

as_tibble(diamonds2)
## # A tibble: 2,000 × 10
##    carat cut       color clarity depth table price length width height
##    <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int>  <dbl> <dbl>  <dbl>
##  1  0.62 Very Good D     VVS2     58.1  63    3050   5.59  5.66   3.27
##  2  0.37 Very Good E     VVS1     60.9  56    1055   4.65  4.67   2.84
##  3  0.91 Premium   F     SI2      60.5  59    2461   6.21  6.19   3.75
##  4  2.32 Fair      H     SI1      62    62   18026   8.47  8.31   5.2 
##  5  0.78 Ideal     D     SI1      61.1  57    3214   5.95  5.97   3.64
##  6  0.9  Good      D     SI1      63.8  58    4193   6.09  6.13   3.9 
##  7  0.33 Ideal     H     VS1      61.7  54     579   4.45  4.47   2.75
##  8  0.39 Ideal     E     VVS2     62.1  54.6  1100   4.67  4.7    2.91
##  9  0.42 Very Good G     SI1      62    54     751   4.8   4.82   2.98
## 10  1.71 Good      J     VS2      62.8  58    8831   7.48  7.55   4.72
## # ℹ 1,990 more rows

Using diamonds2, create a plot for length (x) and width (y). Save the blank map as gg_diamond_size

# Base plot for the size questions: length on x, width on y, no geoms yet
gg_diamond_size <-
  ggplot(diamonds2, aes(x = length, y = width)) +
  labs(title = "Length and Width of 2000 Randomly Selected Diamonds") +
  theme_bw()

Add the correct geom to make the scatterplot

# Scatterplot: one point per diamond
gg_diamond_size + geom_point()

Repeat the previous code chunk, but include a solution to the overplotting problem

# Semi-transparent points, so regions where many diamonds overlap look darker
gg_diamond_size + geom_point(alpha = 0.1)

3.2) Create a similar plot as above, but also map price to the color

# Same scatterplot, with each point colored by the diamond's price
gg_diamond_size +
  geom_point(aes(color = price), alpha = 0.5) +
  # Add this line of code to change the labels and the colors used to represent price
  scale_color_viridis_c(labels = scales::dollar)

Question 4) Penguin mass by flipper length

# Load palmerpenguins (pacman installs it first if it is missing)
pacman::p_load(palmerpenguins)

# The code below will get the penguin data set to appear in the global
# environment. The namespace is explicit because R >= 4.5 also ships a
# `penguins` data set in the datasets package, with different column names.
penguins <- palmerpenguins::penguins

4a) Create a scatterplot of body mass vs flipper length

First, create a blank map with body_mass_g mapped to the y-axis and flipper_length_mm to the x-axis. Improve the theme and the labels on the axes as well.

# Base plot for the penguin questions: flipper length (x) vs body mass (y)
gg_penguin <-
  ggplot(penguins, aes(x = flipper_length_mm, y = body_mass_g)) +
  theme_bw() +
  labs(x = "Flipper Length (mm)", y = "Body Mass (g)")

# Display the (still empty) plot
gg_penguin

Next, add the correct geom to the blank map to create the scatterplot, then describe the 4 important features of the scatterplot

# Scatterplot: one point per penguin
gg_penguin + geom_point()
## Warning: Removed 2 rows containing missing values (`geom_point()`).

Direction = Positive; Outliers = None; Trend = Linear; Strength = Moderately Strong

Finally, calculate the correlation between the two variables. Does it support your answer?

# Correlation between flipper length and body mass across all penguins
cor(
  x = penguins$flipper_length_mm,
  y = penguins$body_mass_g,
  # Ignore any missing values in flipper length or body mass. The option is
  # spelled out in full rather than relying on partial matching of "pair".
  use = "pairwise.complete.obs"
)
## [1] 0.8712018

Yes, the correlation indicates a strong, positive association

4b) Including species

Recreate the scatterplot for 4a), but include species in the plot as well.

# Same scatterplot, with each point colored by species
gg_penguin + geom_point(aes(color = species))
## Warning: Removed 2 rows containing missing values (`geom_point()`).

Briefly describe any conclusions you can reach from the scatterplot with species included.

Adelie and Chinstrap penguins are very similar with respect to flipper length and body weight. Gentoo penguins have longer flippers and weigh more overall than the other two species of penguin

4c) Small multiples

Instead of including the 3 species in the same scatterplot, create small multiples of the plot by species.

# One panel per species
gg_penguin + 
  
  geom_point(
    mapping = aes(color = species),
    # Each panel is already labelled with its species, so no legend is needed.
    # `FALSE` is spelled out because `F` can be reassigned.
    show.legend = FALSE
  ) +
  
  facet_wrap(
    facets = vars(species),
    # Let every panel pick its own x- and y-axis ranges
    scales = "free",
    ncol = 2
  )
## Warning: Removed 2 rows containing missing values (`geom_point()`).

If you were to describe the association between flipper length and body mass for any one species, how would you describe it? Is it different than your answer when we ignored species?

While there is still a positive, linear association between flipper length and body weight, the correlation appears to be weaker than if we looked at the scatterplot with all 3 species together.

Not done in the practice, but we can calculate the correlation between flipper length and body mass for each species separately:

# Correlation between flipper length and body mass within each species
penguins |> 
  summarize(
    .by = species,  # Calculating the correlation for each species
    flip_weight_cor = cor(
      # Same x/y order as the other cor() calls; correlation is symmetric,
      # so the value does not change
      x = flipper_length_mm,
      y = body_mass_g,
      # Spelled out in full rather than relying on partial matching of "pair"
      use = "pairwise.complete.obs"
    )
  )
## # A tibble: 3 × 2
##   species   flip_weight_cor
##   <fct>               <dbl>
## 1 Adelie              0.468
## 2 Gentoo              0.703
## 3 Chinstrap           0.642
# Overall correlation with all species pooled, for comparison with the
# per-species values
cor(
  x = penguins$flipper_length_mm,
  y = penguins$body_mass_g,
  # Spelled out in full rather than relying on partial matching of "pair"
  use = "pairwise.complete.obs"
)
## [1] 0.8712018