#Penguin Activity! Review your wrangling skills!
Install the palmerpenguins package if you do not have it
#install.packages("palmerpenguins")
library(tidyverse) # to load dplyr, tidyr and the other wrangling packages
Warning: package 'ggplot2' was built under R version 4.5.2
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.4 ✔ readr 2.1.5
✔ forcats 1.0.0 ✔ stringr 1.5.1
✔ ggplot2 4.0.0 ✔ tibble 3.3.0
✔ lubridate 1.9.4 ✔ tidyr 1.3.1
✔ purrr 1.1.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library (palmerpenguins) #to load the penguins dataset
Warning: package 'palmerpenguins' was built under R version 4.5.2
Attaching package: 'palmerpenguins'
The following objects are masked from 'package:datasets':
penguins, penguins_raw
#load the dataset penguins
# Load `penguins` into the environment so it shows up in the Environment pane.
# The datasets package also provides an object called `penguins` (see the
# masking message printed when palmerpenguins was attached), so name the
# package explicitly instead of relying on search-path order.
data("penguins", package = "palmerpenguins")
Count penguins by species
# Tally the number of rows for each species.
penguins |>
  count(species)
# A tibble: 3 × 2
species n
<fct> <int>
1 Adelie 152
2 Chinstrap 68
3 Gentoo 124
Count penguins by species and island
# Tally the rows for every species/island combination present in the data.
penguins |>
  count(species, island)
# A tibble: 5 × 3
species island n
<fct> <fct> <int>
1 Adelie Biscoe 44
2 Adelie Dream 56
3 Adelie Torgersen 52
4 Chinstrap Dream 68
5 Gentoo Biscoe 124
Filter penguins by species (Adelie)
# Keep only the Adelie rows and store them as `penguinsA` for the next step.
penguinsA <- penguins |>
  filter(species == "Adelie")
Count Adelie penguins by island and sex
# Tally Adelie penguins for each island/sex combination. `penguinsA` holds a
# single species, so the grouping columns are island and sex (the task asks
# for sex, which the earlier species/island count never broke out).
# Penguins whose sex was not recorded appear in their own NA rows.
penguinsA |>
  count(island, sex)
# A tibble: 8 × 3
island sex n
<fct> <fct> <int>
1 Biscoe female 22
2 Biscoe male 22
3 Dream female 27
4 Dream male 28
5 Dream NA 1
6 Torgersen female 24
7 Torgersen male 23
8 Torgersen NA 5
Select specific columns (species, island, flipper_length_mm)
# Keep just the three columns of interest; all rows are retained.
penguins |>
  select(species, island, flipper_length_mm)
# A tibble: 344 × 3
species island flipper_length_mm
<fct> <fct> <int>
1 Adelie Torgersen 181
2 Adelie Torgersen 186
3 Adelie Torgersen 195
4 Adelie Torgersen NA
5 Adelie Torgersen 193
6 Adelie Torgersen 190
7 Adelie Torgersen 181
8 Adelie Torgersen 195
9 Adelie Torgersen 193
10 Adelie Torgersen 190
# ℹ 334 more rows
Mutate new columns for flipper length in cm and inches (cm = mm / 10; in = mm / 25.4)
# Add flipper length converted from millimetres to centimetres and inches.
# `penguins` is overwritten so the new columns are available in later steps.
penguins <- penguins |>
  mutate(
    flipperLengthCM = flipper_length_mm / 10,
    flipperLengthIN = flipper_length_mm / 25.4
  )
Group penguins by species
# Mark the data as grouped by species; the rows and columns are unchanged,
# only the grouping shown in the printed header is added.
penguins |>
  group_by(species)
# A tibble: 344 × 10
# Groups: species [3]
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
<fct> <fct> <dbl> <dbl> <int> <int>
1 Adelie Torgersen 39.1 18.7 181 3750
2 Adelie Torgersen 39.5 17.4 186 3800
3 Adelie Torgersen 40.3 18 195 3250
4 Adelie Torgersen NA NA NA NA
5 Adelie Torgersen 36.7 19.3 193 3450
6 Adelie Torgersen 39.3 20.6 190 3650
7 Adelie Torgersen 38.9 17.8 181 3625
8 Adelie Torgersen 39.2 19.6 195 4675
9 Adelie Torgersen 34.1 18.1 193 3475
10 Adelie Torgersen 42 20.2 190 4250
# ℹ 334 more rows
# ℹ 4 more variables: sex <fct>, year <int>, flipperLengthCM <dbl>,
# flipperLengthIN <dbl>
Summarize penguin data by species (basic)
# Collapse each species group to a single row holding its row count.
penguins |>
  group_by(species) |>
  summarize(n = n())
# A tibble: 3 × 2
species n
<fct> <int>
1 Adelie 152
2 Chinstrap 68
3 Gentoo 124
Summarize penguin data by species (additional statistics)
# Per-species summary with no missing-value handling: any group that has an
# NA in a column gets NA back for the statistic computed from that column.
penguins |>
  group_by(species) |>
  summarize(
    n = n(),
    mean_mass = mean(body_mass_g),
    max_flipper_length = max(flipper_length_mm),
    percent_female = sum(sex == "female") / n()
  )
# A tibble: 3 × 5
species n mean_mass max_flipper_length percent_female
<fct> <int> <dbl> <int> <dbl>
1 Adelie 152 NA NA NA
2 Chinstrap 68 3733. 212 0.5
3 Gentoo 124 NA NA NA
Summarize penguin data by species (handling missing values)
# Drop every row that is missing body mass, flipper length, or sex before
# summarising, so each statistic is computed from complete rows only and
# `n` counts just those rows.
penguins |>
  drop_na(body_mass_g, flipper_length_mm, sex) |>
  group_by(species) |>
  summarize(
    n = n(),
    mean_mass = mean(body_mass_g),
    max_flipper_length = max(flipper_length_mm),
    percent_female = sum(sex == "female") / n()
  )
# A tibble: 3 × 5
species n mean_mass max_flipper_length percent_female
<fct> <int> <dbl> <int> <dbl>
1 Adelie 146 3706. 210 0.5
2 Chinstrap 68 3733. 212 0.5
3 Gentoo 119 5092. 231 0.487
OR keep every row and skip the missing values inside each summary function with na.rm = TRUE
# Alternative: keep every row and skip NAs inside each summary function.
# `n` therefore still counts penguins that have missing measurements.
penguins |>
  group_by(species) |>
  summarize(
    n = n(),
    mean_mass = mean(body_mass_g, na.rm = TRUE),
    max_flipper_length = max(flipper_length_mm, na.rm = TRUE),
    # mean() of the logical divides by the number of penguins with a known
    # sex; dividing the count by n() would treat unknown-sex rows as
    # "not female" and understate the share.
    percent_female = mean(sex == "female", na.rm = TRUE)
  )
# A tibble: 3 × 5
species n mean_mass max_flipper_length percent_female
<fct> <int> <dbl> <int> <dbl>
1 Adelie 152 3701. 210 0.5
2 Chinstrap 68 3733. 212 0.5
3 Gentoo 124 5076. 231 0.487