--metadata title="R for Data Science"
download the package and data
# Install tidyverse only when it is missing. requireNamespace() checks for the
# package without attaching it, so library() below is the single place where
# the package is attached and where a failed install surfaces as an error.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.5.2
## Warning: package 'forcats' was built under R version 4.5.2
## Warning: package 'lubridate' was built under R version 4.5.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.1 ✔ stringr 1.5.2
## ✔ ggplot2 4.0.0 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Install palmerpenguins only when it is missing, then attach it once with
# library(). Package site: https://allisonhorst.github.io/palmerpenguins/
if (!requireNamespace("palmerpenguins", quietly = TRUE)) {
  install.packages("palmerpenguins")
}
library(palmerpenguins)
## Warning: package 'palmerpenguins' was built under R version 4.5.2
##
## Attaching package: 'palmerpenguins'
##
## The following objects are masked from 'package:datasets':
##
## penguins, penguins_raw
# Install ggthemes only when it is missing, then attach it once with library().
if (!requireNamespace("ggthemes", quietly = TRUE)) {
  install.packages("ggthemes")
}
library(ggthemes)
## Warning: package 'ggthemes' was built under R version 4.5.2
# Compact overview of the penguins data: one line per column.
penguins |> glimpse()
## Rows: 344
## Columns: 8
## $ species <fct> Adelie, Adelie, Adelie, Adelie, Adelie, Adelie, Adel…
## $ island <fct> Torgersen, Torgersen, Torgersen, Torgersen, Torgerse…
## $ bill_length_mm <dbl> 39.1, 39.5, 40.3, NA, 36.7, 39.3, 38.9, 39.2, 34.1, …
## $ bill_depth_mm <dbl> 18.7, 17.4, 18.0, NA, 19.3, 20.6, 17.8, 19.6, 18.1, …
## $ flipper_length_mm <int> 181, 186, 195, NA, 193, 190, 181, 195, 193, 190, 186…
## $ body_mass_g <int> 3750, 3800, 3250, NA, 3450, 3650, 3625, 4675, 3475, …
## $ sex <fct> male, female, female, NA, female, male, female, male…
## $ year <int> 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007…
ggplot
# Declare the data and the x/y mappings only. No geom layer is added yet,
# so this sets up the axes without drawing any observations.
penguins |>
  ggplot(aes(x = flipper_length_mm, y = body_mass_g))
geom: the geometrical object that a plot uses to represent data
# Scatterplot of body mass against flipper length. na.rm = TRUE drops rows
# with missing values without emitting a warning.
ggplot(penguins, aes(x = flipper_length_mm, y = body_mass_g)) +
  geom_point(na.rm = TRUE)
It’s always a good idea to be skeptical of any apparent relationship between two variables and ask if there may be other variables that explain or change the nature of this apparent relationship.
# Same scatterplot with species mapped to point colour, to check whether
# species accounts for part of the apparent relationship.
ggplot(
  penguins,
  aes(flipper_length_mm, body_mass_g, colour = species)
) +
  geom_point(na.rm = TRUE)
Adding a smooth curve displaying the relationship between body mass and flipper length
When aesthetic mappings are defined in ggplot(), at the global level,
they are passed down to each of the subsequent geom layers of the plot.
Each geom function in ggplot2 can also take a mapping argument, which
allows for aesthetic mappings at the local level that are added to those
inherited from the global level.
# colour = species is a global mapping here, so geom_smooth() inherits it and
# fits one straight line (method = "lm") per species.
penguins |>
  ggplot(aes(x = flipper_length_mm, y = body_mass_g, colour = species)) +
  geom_point(na.rm = TRUE) +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_smooth()`).
# colour = species is now local to geom_point(): the points are coloured by
# species, while geom_smooth() fits a single line through all penguins.
ggplot(penguins, aes(flipper_length_mm, body_mass_g)) +
  geom_point(aes(colour = species), na.rm = TRUE) +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_smooth()`).
# Final version of the plot: species shown by both colour and shape on the
# points, one linear trend line for all penguins, descriptive labels, and the
# colour-blind-friendly palette from ggthemes.
penguins |>
  ggplot(aes(x = flipper_length_mm, y = body_mass_g)) +
  geom_point(aes(color = species, shape = species)) +
  geom_smooth(method = "lm") +
  labs(
    title = "Body mass and flipper length",
    subtitle = "Dimensions for Adelie, Chinstrap, and Gentoo Penguins",
    x = "Flipper length (mm)",
    y = "Body mass (g)",
    color = "Species",
    shape = "Species"
  ) +
  scale_color_colorblind()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).
### 1.2.5
Exercises
# Exercise 1: the glimpse header reports the number of rows and columns.
penguins |> glimpse()
## Rows: 344
## Columns: 8
## $ species <fct> Adelie, Adelie, Adelie, Adelie, Adelie, Adelie, Adel…
## $ island <fct> Torgersen, Torgersen, Torgersen, Torgersen, Torgerse…
## $ bill_length_mm <dbl> 39.1, 39.5, 40.3, NA, 36.7, 39.3, 38.9, 39.2, 34.1, …
## $ bill_depth_mm <dbl> 18.7, 17.4, 18.0, NA, 19.3, 20.6, 17.8, 19.6, 18.1, …
## $ flipper_length_mm <int> 181, 186, 195, NA, 193, 190, 181, 195, 193, 190, 186…
## $ body_mass_g <int> 3750, 3800, 3250, NA, 3450, 3650, 3625, 4675, 3475, …
## $ sex <fct> male, female, female, NA, female, male, female, male…
## $ year <int> 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007…
The penguins data frame has 344 rows and 8 columns.
What does the bill_depth_mm variable in the penguins data frame
describe? Read the help for ?penguins to find out.
bill_depth_mm: a number denoting bill depth (millimeters).
Make a scatterplot of bill_depth_mm vs. bill_length_mm. That is, make a scatterplot with bill_depth_mm on the y-axis and bill_length_mm on the x-axis. Describe the relationship between these two variables.
# Bill depth (y) against bill length (x). Species is shown by colour and shape
# on the points only, so the linear fit is computed over all penguins.
penguins |>
  ggplot(aes(x = bill_length_mm, y = bill_depth_mm)) +
  geom_point(aes(colour = species, shape = species), na.rm = TRUE) +
  geom_smooth(method = "lm") +
  labs(
    title = "Scatterplot of Bill Depth vs Bill Length",
    x = "Bill Length (mm)",
    y = "Bill Depth (mm)"
  )
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_smooth()`).
# Species against bill depth drawn as points: one row of points per species.
ggplot(penguins, aes(bill_depth_mm, species)) +
  geom_point(na.rm = TRUE) +
  labs(
    title = "Scatterplot of Species vs Bill Depth",
    x = "Bill Depth (mm)",
    y = "Species"
  )

# The same two variables as a boxplot, one box per species.
ggplot(penguins, aes(bill_depth_mm, species)) +
  geom_boxplot(na.rm = TRUE) +
  labs(
    title = "Boxplot of Species vs Bill Depth",
    x = "Bill Depth (mm)",
    y = "Species"
  )
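ggplot(data = penguins) + geom_point() gives an error because geom_point() requires the x and y aesthetics, which are not mapped here. The version below fixes this by supplying them: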
# Working version: x and y are mapped, so geom_point() has what it needs.
penguins |>
  ggplot(aes(flipper_length_mm, bill_length_mm)) +
  geom_point()
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).
What does the na.rm argument do in geom_point()? What is the default value of the argument? Create a scatterplot where you successfully use this argument set to TRUE.
Add the following caption to the plot you made in the previous exercise: “Data come from the palmerpenguins package.” Hint: Take a look at the documentation for labs()
# Boxplot from the previous exercise with a caption added through labs().
penguins |>
  ggplot(aes(bill_depth_mm, species)) +
  geom_boxplot(na.rm = TRUE) +
  labs(
    title = "Boxplot of Species vs Bill Depth",
    x = "Bill Depth (mm)",
    y = "Species",
    caption = "Data come from the palmerpenguins package."
  )
# Points coloured by bill depth (a numerical variable), with a single loess
# curve over all penguins. Colour is local to the points.
ggplot(penguins, aes(flipper_length_mm, body_mass_g)) +
  geom_point(aes(colour = bill_depth_mm), na.rm = TRUE) +
  geom_smooth(method = "loess", na.rm = TRUE)
## `geom_smooth()` using formula = 'y ~ x'
# Exercise code, kept as given: color = island is a global mapping, so it
# applies to both the point layer and the smooth layer.
ggplot(
data = penguins,
mapping = aes(x = flipper_length_mm, y = body_mass_g, color = island)
) +
geom_point() +
# se = FALSE: draw only the fitted curve, without the confidence ribbon.
geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).
# When se is set to FALSE, geom_smooth() hides the grey confidence ribbon
# and only displays the smoothed line itself.
# Global form: the data and the x/y mappings are declared once in ggplot()
# and inherited by both geom_point() and geom_smooth().
ggplot(
data = penguins,
mapping = aes(x = flipper_length_mm, y = body_mass_g)
) +
geom_point() +
geom_smooth()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Local form: ggplot() is called with no arguments, so the data and the x/y
# mappings are repeated inside each geom.
ggplot() +
geom_point(
data = penguins,
mapping = aes(x = flipper_length_mm, y = body_mass_g)
) +
geom_smooth(
data = penguins,
mapping = aes(x = flipper_length_mm, y = body_mass_g)
)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Warning: Removed 2 rows containing non-finite outside the scale range (`stat_smooth()`).
## Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).
The two plots are the same: the first sets the mappings globally in ggplot(), the second repeats the same mappings locally in each geom.
# Fully explicit call: the data and mapping arguments are named.
ggplot(
data = penguins,
mapping = aes(x = flipper_length_mm, y = body_mass_g)
) +
geom_point()
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Concise call: the data and the mapping are passed by position.
ggplot(penguins, aes(x = flipper_length_mm, y = body_mass_g)) +
geom_point()
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Pipe form: penguins is piped into ggplot() as its first argument.
penguins |>
ggplot(aes(x = flipper_length_mm, y = body_mass_g)) +
geom_point()
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Horizontal bar chart of species counts: species goes on the y-axis and
# fct_infreq() orders the levels by how often they occur.
penguins |>
  ggplot(aes(y = fct_infreq(species))) +
  geom_bar(na.rm = TRUE)
ggplot(penguins, aes(x = species)) + geom_bar(color = "red")
ggplot(penguins, aes(x = species)) + geom_bar(fill = "red")
# color sets the outline colour of the bars.
penguins |>
  ggplot(aes(x = species)) +
  geom_bar(color = "red")

# fill sets the interior (fill) colour of the bars.
penguins |>
  ggplot(aes(x = species)) +
  geom_bar(fill = "red")
3. What does the bins argument in geom_histogram() do?
# Histogram of bill length with 10 mm wide bins.
penguins |>
  ggplot(aes(bill_length_mm)) +
  geom_histogram(binwidth = 10)
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Histogram of bill length with 100 mm wide bins.
penguins |>
  ggplot(aes(bill_length_mm)) +
  geom_histogram(binwidth = 100)
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Histogram of bill length with 1 mm wide bins.
penguins |>
  ggplot(aes(bill_length_mm)) +
  geom_histogram(binwidth = 1)
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_bin()`).
# bins sets the number of bins directly, whereas binwidth sets their width.
# The argument must be spelled `bins`: `bin` is not a geom_histogram()
# parameter, so it is ignored with a warning and the default of 30 bins is used.
ggplot(penguins, aes(x = bill_length_mm)) +
  geom_histogram(bins = 20)
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_bin()`).
4. Make a histogram of the carat variable in the diamonds dataset that
is available when you load the tidyverse package. Experiment with
different binwidths. What binwidth reveals the most interesting
patterns?
# Overview of the diamonds data shipped with ggplot2.
diamonds |> glimpse()
## Rows: 53,940
## Columns: 10
## $ carat <dbl> 0.23, 0.21, 0.23, 0.29, 0.31, 0.24, 0.24, 0.26, 0.22, 0.23, 0.…
## $ cut <ord> Ideal, Premium, Good, Premium, Good, Very Good, Very Good, Ver…
## $ color <ord> E, E, E, I, J, J, I, H, E, H, J, J, F, J, E, E, I, J, J, J, I,…
## $ clarity <ord> SI2, SI1, VS1, VS2, SI2, VVS2, VVS1, SI1, VS2, VS1, SI1, VS1, …
## $ depth <dbl> 61.5, 59.8, 56.9, 62.4, 63.3, 62.8, 62.3, 61.9, 65.1, 59.4, 64…
## $ table <dbl> 55, 61, 65, 58, 58, 57, 57, 55, 61, 61, 55, 56, 61, 54, 62, 58…
## $ price <int> 326, 326, 327, 334, 335, 336, 336, 337, 337, 338, 339, 340, 34…
## $ x <dbl> 3.95, 3.89, 4.05, 4.20, 4.34, 3.94, 3.95, 4.07, 3.87, 4.00, 4.…
## $ y <dbl> 3.98, 3.84, 4.07, 4.23, 4.35, 3.96, 3.98, 4.11, 3.78, 4.05, 4.…
## $ z <dbl> 2.43, 2.31, 2.31, 2.63, 2.75, 2.48, 2.47, 2.53, 2.49, 2.39, 2.…
# Five-number summary plus mean of carat, to pick sensible bin widths.
summary(diamonds[["carat"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.2000 0.4000 0.7000 0.7979 1.0400 5.0100
# Histograms of carat, filled by diamond color, at four bin widths.

# binwidth = 0.1
ggplot(diamonds, aes(x = carat, fill = color)) +
  geom_histogram(binwidth = 0.1)

# binwidth = 0.2
ggplot(diamonds, aes(x = carat, fill = color)) +
  geom_histogram(binwidth = 0.2)

# binwidth = 0.5
ggplot(diamonds, aes(x = carat, fill = color)) +
  geom_histogram(binwidth = 0.5)

# binwidth = 1
ggplot(diamonds, aes(x = carat, fill = color)) +
  geom_histogram(binwidth = 1)
### 1.5
Visualizing relationships 1.5.1 A numerical and a categorical
variable
# Distribution of body mass for each species, as side-by-side boxplots.
penguins |>
  ggplot(aes(x = species, y = body_mass_g)) +
  geom_boxplot()
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
# Middle line: median (50th percentile); box: 25th to 75th percentile;
# box height: interquartile range (IQR); whiskers: extend up to 1.5 * IQR from the box.
# Overlaid density curves of body mass, one per species. Both the outline
# (colour) and the area (fill) follow species; alpha = 0.4 keeps overlapping
# areas visible.
penguins |>
  ggplot(aes(x = body_mass_g, colour = species, fill = species)) +
  geom_density(linewidth = 1, alpha = 0.4, na.rm = TRUE) +
  labs(
    title = "Density graph of Body Mass",
    x = "Body Mass in g",
    y = "Density"
  )
### 1.5.2
Two categorical variables
# Stacked bar chart: number of penguins of each species on each island.
penguins |>
  ggplot(aes(x = island, fill = species)) +
  geom_bar()

# position = "fill" rescales every bar to the same height, showing the
# species proportions within each island.
penguins |>
  ggplot(aes(x = island, fill = species)) +
  geom_bar(position = "fill")

# Points are not suitable for two categorical variables.
penguins |>
  ggplot(aes(x = island, y = species)) +
  geom_point()

# Two numerical variables: a scatterplot is the natural choice.
penguins |>
  ggplot(aes(x = flipper_length_mm, y = body_mass_g)) +
  geom_point()
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Three or more variables: species as colour and island as shape.
penguins |>
  ggplot(aes(flipper_length_mm, body_mass_g)) +
  geom_point(aes(colour = species, shape = island))
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Alternative to mapping island to shape: one panel (facet) per island.
penguins |>
  ggplot(aes(flipper_length_mm, body_mass_g)) +
  geom_point(aes(colour = species)) +
  facet_wrap(vars(island))
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).
### 1.5.5
Exercises
# Overview of the mpg data shipped with ggplot2.
mpg |> glimpse()
## Rows: 234
## Columns: 11
## $ manufacturer <chr> "audi", "audi", "audi", "audi", "audi", "audi", "audi", "…
## $ model <chr> "a4", "a4", "a4", "a4", "a4", "a4", "a4", "a4 quattro", "…
## $ displ <dbl> 1.8, 1.8, 2.0, 2.0, 2.8, 2.8, 3.1, 1.8, 1.8, 2.0, 2.0, 2.…
## $ year <int> 1999, 1999, 2008, 2008, 1999, 1999, 2008, 1999, 1999, 200…
## $ cyl <int> 4, 4, 4, 4, 6, 6, 6, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 8, 8, …
## $ trans <chr> "auto(l5)", "manual(m5)", "manual(m6)", "auto(av)", "auto…
## $ drv <chr> "f", "f", "f", "f", "f", "f", "f", "4", "4", "4", "4", "4…
## $ cty <int> 18, 21, 20, 21, 16, 18, 18, 18, 16, 20, 19, 15, 17, 17, 1…
## $ hwy <int> 29, 29, 31, 30, 26, 26, 27, 26, 25, 28, 27, 25, 25, 25, 2…
## $ fl <chr> "p", "p", "p", "p", "p", "p", "p", "p", "p", "p", "p", "p…
## $ class <chr> "compact", "compact", "compact", "compact", "compact", "c…
Make a scatterplot of hwy vs. displ using the mpg data frame. Next, map a third, numerical variable to color, then size, then both color and size, then shape. How do these aesthetics behave differently for categorical vs. numerical variables?
What happens if you map the same variable to multiple aesthetics?
# A numerical variable (cty) mapped to colour.
mpg |>
  ggplot(aes(x = displ, y = hwy, colour = cty)) +
  geom_point(na.rm = TRUE)

# The same numerical variable mapped to both colour and size.
# A continuous variable cannot be mapped to shape.
mpg |>
  ggplot(aes(x = displ, y = hwy, colour = cty, size = cty)) +
  geom_point(na.rm = TRUE)

# Categorical variables mapped to colour, shape and size.
mpg |>
  ggplot(aes(x = displ, y = hwy, colour = fl, shape = class, size = drv)) +
  geom_point(na.rm = TRUE)
## Warning: Using size for a discrete variable is not advised.
## Warning: The shape palette can deal with a maximum of 6 discrete values because more
## than 6 becomes difficult to discriminate
## ℹ you have requested 7 values. Consider specifying shapes manually if you need
## that many of them.
# One categorical variable (fl) mapped to three aesthetics at once.
mpg |>
  ggplot(aes(x = displ, y = hwy, colour = fl, shape = fl, size = fl)) +
  geom_point(na.rm = TRUE)
## Warning: Using size for a discrete variable is not advised.
3. In the scatterplot of hwy vs. displ, what happens if you map a third
variable to linewidth?
# Exercise: cty is mapped to linewidth in a plot whose only layer is points.
# NOTE(review): geom_point() does not appear to use the linewidth aesthetic,
# so this mapping presumably has no visible effect; confirm against the plot.
ggplot(mpg,
aes(x = displ, y = hwy, linewidth = cty))+
geom_point(na.rm = TRUE)
# Bill depth against bill length with species as a global colour mapping, so
# a separate linear fit is drawn for each species. The method is given as the
# string "lm", matching the other geom_smooth() calls in this file.
ggplot(
  penguins,
  aes(x = bill_length_mm, y = bill_depth_mm, colour = species)
) +
  geom_point(na.rm = TRUE) +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_smooth()`).
# Same relationship with one panel per species instead of colour; each panel
# gets its own linear fit. The method is given as the string "lm", matching
# the other geom_smooth() calls in this file.
ggplot(penguins, aes(x = bill_length_mm, y = bill_depth_mm)) +
  geom_point(na.rm = TRUE) +
  geom_smooth(method = "lm") +
  facet_wrap(vars(species))
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_smooth()`).
ggplot(data = penguins, mapping = aes(x = bill_length_mm, y = bill_depth_mm, color = species, shape = species)) + geom_point() + labs(color = "Species")
# Exercise code: species is mapped to both color and shape, but labs() only
# titles the color legend "Species"; the shape legend keeps its default title.
# NOTE(review): the differing titles are presumably why this version shows two
# separate legends; confirm against the rendered plot.
ggplot(
data = penguins,
mapping = aes(
x = bill_length_mm, y = bill_depth_mm,
color = species, shape = species
)
) +
geom_point() +
labs(color = "Species")
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Variant: both the color and the shape legend are titled "Species".
# NOTE(review): with identical titles the two legends presumably merge into
# one; confirm against the rendered plot.
ggplot(
data = penguins,
mapping = aes(
x = bill_length_mm, y = bill_depth_mm,
color = species, shape = species
)
) +
geom_point() +
labs(color = "Species", shape = "Species")
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Variant without labs(): the color and shape legends keep their default titles.
ggplot(
data = penguins,
mapping = aes(
x = bill_length_mm, y = bill_depth_mm,
color = species, shape = species
)
) +
geom_point()
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).
ggplot(penguins, aes(x = island, fill = species)) + geom_bar(position = "fill")
ggplot(penguins, aes(x = species, fill = island)) + geom_bar(position = "fill")
# One bar per island, filled by species: the proportion of each species
# within every island.
penguins |>
  ggplot(aes(x = island, fill = species)) +
  geom_bar(position = "fill")

# One bar per species, filled by island: how each species is distributed
# geographically among the three islands.
penguins |>
  ggplot(aes(x = species, fill = island)) +
  geom_bar(position = "fill")