# Global chunk options: echo the code of every chunk and centre all figures.
# TRUE is spelled out because the shorthand `T` is an ordinary variable that
# can be reassigned.
knitr::opts_chunk$set(
  echo = TRUE,
  fig.align = "center"
)
# Load the tidyverse
library(tidyverse)
We’ll start by using the diamonds data frame, stored in
ggplot2.
Take a look at it:
# Copy the data set that ships with ggplot2 into the global environment
diamonds <- ggplot2::diamonds
# Print a preview of the data
tibble(diamonds)
## # A tibble: 53,940 × 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
## 4 0.29 Premium I VS2 62.4 58 334 4.2 4.23 2.63
## 5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
## 6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
## 7 0.24 Very Good I VVS1 62.3 57 336 3.95 3.98 2.47
## 8 0.26 Very Good H SI1 61.9 55 337 4.07 4.11 2.53
## 9 0.22 Fair E VS2 65.1 61 337 3.87 3.78 2.49
## 10 0.23 Very Good H VS1 59.4 61 338 4 4.05 2.39
## # ℹ 53,930 more rows
For more info, run help(diamonds)
in the console
Start by creating a blank map called gg_carat with carat mapped to the x-axis. Make sure to include an appropriate theme!
# Blank canvas named gg_carat: carat on the x-axis, no geom added yet
gg_carat <- ggplot(diamonds, aes(x = carat)) +
  # Lower end of the y-axis gets no padding; upper end gets 5% head room
  scale_y_continuous(expand = expansion(mult = c(0, 0.05))) +
  theme_bw()

gg_carat
Create histograms below using the blank map with 20, 30, and 40 bins.
The lines should be black and the bars white.
Which would you recommend?
# Three histograms of carat built on gg_carat; only the number of bins differs.
# All use white bars with black outlines.
# histogram with 20 bins
gg_carat +
geom_histogram(
fill = "white",
color = "black",
bins = 20
)
# histogram with 30 bins
gg_carat +
geom_histogram(
fill = "white",
color = "black",
bins = 30
)
# histogram with 40 bins
gg_carat +
geom_histogram(
fill = "white",
color = "black",
bins = 40
)
Repeat the graph above, but create a density plot instead of a histogram
# Density curve of carat on the same blank map, with a white fill under the curve
gg_carat + geom_density(fill = "white")
Describe the important features of the data using the plots above.
The shape of carat is unimodal and right skewed
How should the mean and median of carat compare to each other? Briefly explain why
Since it is right skewed, the mean will be larger than the median
Find the mean and the median in the code chunk below:
# Mean carat across all diamonds
with(diamonds, mean(carat))
## [1] 0.7979397
# Median carat across all diamonds
with(diamonds, median(carat))
## [1] 0.7
Create box plots of price by cut of the diamond. Save it as gg_price_cut
# Horizontal box plots of price for each cut, saved as gg_price_cut so that
# facets can be added to it later
gg_price_cut <-
  ggplot(
    data = diamonds,
    mapping = aes(
      x = price,
      y = cut
    )
  ) +
  geom_boxplot(
    mapping = aes(fill = cut),
    # The fill repeats what the y-axis already shows, so the legend is dropped
    show.legend = FALSE
  ) +
  # Remove the x-axis title
  labs(x = NULL) +
  # Add this line of code to change the x-axis labels to be in dollars
  scale_x_continuous(labels = scales::dollar)

gg_price_cut
Create small multiples of the box plots above by diamond color.
# Small multiples from 1 variable -> facet_wrap
# One panel per diamond color, arranged in 2 columns
gg_price_cut + facet_wrap(~ color, ncol = 2)
Create small multiples of the boxplot of this section by color and clarity
# Small multiples from 2 columns -> facet_grid
# Clarity runs down the rows, color runs across the columns
gg_price_cut + facet_grid(clarity ~ color)
The code below will create a data set removing certain outliers from the data
# Form the diamonds2 data set: a seeded random sample of 2000 diamonds
# taken after renaming the size columns and dropping outliers
RNGversion("4.1.0")
set.seed(1870)

diamonds2 <- diamonds |>
  # x, y, z get descriptive names
  rename(length = x, width = y, height = z) |>
  # keep only diamonds that aren't outliers
  filter(length > 3 & width < 20) |>
  # draw the random 2000 diamonds
  slice_sample(n = 2000)

tibble(diamonds2)
## # A tibble: 2,000 × 10
## carat cut color clarity depth table price length width height
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.62 Very Good D VVS2 58.1 63 3050 5.59 5.66 3.27
## 2 0.37 Very Good E VVS1 60.9 56 1055 4.65 4.67 2.84
## 3 0.91 Premium F SI2 60.5 59 2461 6.21 6.19 3.75
## 4 2.32 Fair H SI1 62 62 18026 8.47 8.31 5.2
## 5 0.78 Ideal D SI1 61.1 57 3214 5.95 5.97 3.64
## 6 0.9 Good D SI1 63.8 58 4193 6.09 6.13 3.9
## 7 0.33 Ideal H VS1 61.7 54 579 4.45 4.47 2.75
## 8 0.39 Ideal E VVS2 62.1 54.6 1100 4.67 4.7 2.91
## 9 0.42 Very Good G SI1 62 54 751 4.8 4.82 2.98
## 10 1.71 Good J VS2 62.8 58 8831 7.48 7.55 4.72
## # ℹ 1,990 more rows
Using diamonds2, create a plot for length (x) and width (y). Save the blank map as gg_diamond_size
# Blank map for diamond size: length on x, width on y; geoms are added later
gg_diamond_size <- ggplot(diamonds2, aes(x = length, y = width)) +
  labs(title = "Length and Width of 2000 Randomly Selected Diamonds") +
  theme_bw()
Add the correct geom to make the scatterplot
# Scatterplot: one point per diamond
gg_diamond_size + geom_point()
Repeat the previous code chunk, but include a solution to the overplotting problem
# Same scatterplot with highly transparent points (alpha = 0.1)
gg_diamond_size + geom_point(alpha = 0.1)

# Same scatterplot with semi-transparent points colored by price
gg_diamond_size +
  geom_point(aes(color = price), alpha = 0.5) +
  # Add this line of code to change the labels and the colors used to represent price
  scale_color_viridis_c(labels = scales::dollar)
# Load the palmerpenguins package through pacman
pacman::p_load(palmerpenguins)
# The code below will get the penguin data set to appear in the global environment.
# The name is qualified so the palmerpenguins version is always the one copied,
# even when another attached package also provides an object called `penguins`
# (e.g. the datasets package in R >= 4.5, whose columns are named differently).
penguins <- palmerpenguins::penguins
First, create a blank map with body_mass_g mapped to the y-axis and flipper_length_mm to the x-axis. Improve the theme and the labels on the axes as well.
# Blank map for the penguins: flipper length on x, body mass on y,
# with readable axis titles and the black-and-white theme
gg_penguin <- ggplot(penguins, aes(x = flipper_length_mm, y = body_mass_g)) +
  theme_bw() +
  labs(x = "Flipper Length (mm)", y = "Body Mass (g)")

gg_penguin
Next, add the correct geom to the blank map to create the scatterplot, then describe the 4 important features of the scatterplot
# Scatterplot of body mass against flipper length
gg_penguin + geom_point()
## Warning: Removed 2 rows containing missing values (`geom_point()`).
Direction = Positive; Outliers = None; Trend = Linear; Strength = Moderately Strong
Finally, calculate the correlation between the two variables. Does it support your answer?
# Correlation between flipper length and body mass across all penguins
cor(
  x = penguins$flipper_length_mm,
  y = penguins$body_mass_g,
  # Ignore any missing values in flipper length or body mass.
  # The option name is written in full instead of the abbreviation "pair".
  use = "pairwise.complete.obs"
)
## [1] 0.8712018
Yes, the correlation indicates a strong, positive association
Recreate the scatterplot for 4a), but include species in the plot as well.
# Scatterplot with the points colored by species
gg_penguin + geom_point(aes(color = species))
## Warning: Removed 2 rows containing missing values (`geom_point()`).
Briefly describe any conclusions you can reach from the scatterplot with species included.
Adelie and Chinstrap penguins are very similar with respect to flipper length and body weight. Gentoo penguins have longer flippers and weigh more overall than the other two species of penguin
Instead of including the 3 species in the same scatterplot, create small multiples of the plot by species.
# Small multiples: one scatterplot panel per species
gg_penguin +
  geom_point(
    mapping = aes(color = species),
    # Each panel holds a single species, so a color legend would be redundant
    show.legend = FALSE
  ) +
  facet_wrap(
    facets = vars(species),
    # Let every panel use its own axis ranges
    scales = "free",
    ncol = 2
  )
## Warning: Removed 2 rows containing missing values (`geom_point()`).
If you were to describe the association between flipper length and body mass for any one species, how would you describe it? Is it different than your answer when we ignored species?
While there is still a positive, linear association between flipper length and body weight, the correlation appears to be weaker than if we looked at the scatterplot with all 3 species together.
Not done in the practice, but we can calculate the correlation between flipper and body weight for each species separately:
# Correlation between body mass and flipper length, one row per species
penguins |>
  summarize(
    .by = species, # Calculating the correlation for each species
    flip_weight_cor = cor(
      x = body_mass_g,
      y = flipper_length_mm,
      # Option name written in full instead of the abbreviation "pair"
      use = "pairwise.complete.obs"
    )
  )
## # A tibble: 3 × 2
## species flip_weight_cor
## <fct> <dbl>
## 1 Adelie 0.468
## 2 Gentoo 0.703
## 3 Chinstrap 0.642
# Overall correlation (species ignored), for comparison with the per-species values
cor(
  x = penguins$flipper_length_mm,
  y = penguins$body_mass_g,
  # Option name written in full instead of the abbreviation "pair"
  use = "pairwise.complete.obs"
)
## [1] 0.8712018