# Packages
library(tidyverse)R for Marine Science Workshop 2
Introduction
This file covers more advanced data wrangling techniques, including:
- Pivoting data to move around rows and columns in tables
- Separating and uniting tables or specific columns
- Standardizing text and dates in data sets
- Dealing with NA values
Rules for tidy data:
Each variable must have its own column.
Each observation must have its own row.
Each value must have its own cell.
ALWAYS: Put each dataset in a tibble and each variable in a column.
Example of working with tidy data
# Compute rate per 10,000 (existing data set)
table1 %>%
mutate(rate = cases / population * 10000)# A tibble: 6 × 5
country year cases population rate
<chr> <dbl> <dbl> <dbl> <dbl>
1 Afghanistan 1999 745 19987071 0.373
2 Afghanistan 2000 2666 20595360 1.29
3 Brazil 1999 37737 172006362 2.19
4 Brazil 2000 80488 174504898 4.61
5 China 1999 212258 1272915272 1.67
6 China 2000 213766 1280428583 1.67
# Compute cases per year
table1 %>%
count(year, wt = cases)# A tibble: 2 × 2
year n
<dbl> <dbl>
1 1999 250740
2 2000 296920
# Visualise changes over time
library(ggplot2)
ggplot(table1, aes(year, cases)) +
geom_line(aes(group = country), colour = "grey50") +
geom_point(aes(colour = country))Pivoting data: Lengthening data sets
# Usually used to fix when one variable is spread across multiple columns or one observation across multiple rows
# Increases rows & decreases columns: pivot_longer()
#Data on ranked songs of 2000 (each song's rank listed over 76 weeks)
billboard# A tibble: 317 × 79
artist track date.entered wk1 wk2 wk3 wk4 wk5 wk6 wk7 wk8
<chr> <chr> <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 2 Pac Baby… 2000-02-26 87 82 72 77 87 94 99 NA
2 2Ge+her The … 2000-09-02 91 87 92 NA NA NA NA NA
3 3 Doors D… Kryp… 2000-04-08 81 70 68 67 66 57 54 53
4 3 Doors D… Loser 2000-10-21 76 76 72 69 67 65 55 59
5 504 Boyz Wobb… 2000-04-15 57 34 25 17 17 31 36 49
6 98^0 Give… 2000-08-19 51 39 34 26 26 19 2 2
7 A*Teens Danc… 2000-07-08 97 97 96 95 100 NA NA NA
8 Aaliyah I Do… 2000-01-29 84 62 51 41 38 35 35 38
9 Aaliyah Try … 2000-03-18 59 53 38 28 21 18 16 14
10 Adams, Yo… Open… 2000-08-26 76 76 74 69 68 67 61 58
# ℹ 307 more rows
# ℹ 68 more variables: wk9 <dbl>, wk10 <dbl>, wk11 <dbl>, wk12 <dbl>,
# wk13 <dbl>, wk14 <dbl>, wk15 <dbl>, wk16 <dbl>, wk17 <dbl>, wk18 <dbl>,
# wk19 <dbl>, wk20 <dbl>, wk21 <dbl>, wk22 <dbl>, wk23 <dbl>, wk24 <dbl>,
# wk25 <dbl>, wk26 <dbl>, wk27 <dbl>, wk28 <dbl>, wk29 <dbl>, wk30 <dbl>,
# wk31 <dbl>, wk32 <dbl>, wk33 <dbl>, wk34 <dbl>, wk35 <dbl>, wk36 <dbl>,
# wk37 <dbl>, wk38 <dbl>, wk39 <dbl>, wk40 <dbl>, wk41 <dbl>, wk42 <dbl>, …
# Move weeks into one column to tidy
billboard |>
pivot_longer(
cols = starts_with("wk"), # specifies which columns to pivot
names_to = "week", # names the variable stored in the column names
values_to = "rank", # names the variable stored in the cell values that we named rank (had to create those 2 variables)
values_drop_na = TRUE
)# A tibble: 5,307 × 5
artist track date.entered week rank
<chr> <chr> <date> <chr> <dbl>
1 2 Pac Baby Don't Cry (Keep... 2000-02-26 wk1 87
2 2 Pac Baby Don't Cry (Keep... 2000-02-26 wk2 82
3 2 Pac Baby Don't Cry (Keep... 2000-02-26 wk3 72
4 2 Pac Baby Don't Cry (Keep... 2000-02-26 wk4 77
5 2 Pac Baby Don't Cry (Keep... 2000-02-26 wk5 87
6 2 Pac Baby Don't Cry (Keep... 2000-02-26 wk6 94
7 2 Pac Baby Don't Cry (Keep... 2000-02-26 wk7 99
8 2Ge+her The Hardest Part Of ... 2000-09-02 wk1 91
9 2Ge+her The Hardest Part Of ... 2000-09-02 wk2 87
10 2Ge+her The Hardest Part Of ... 2000-09-02 wk3 92
# ℹ 5,297 more rows
Pivoting: longer example
# Create dataset "df" w/3 varriables
df <- tribble(
~id, ~bp1, ~bp2,
"A", 100, 120,
"B", 140, 115,
"C", 120, 125
)
# Tidy data to have 3 variables: id (already exists), measurement (the column names), & value (the cell values)
df |>
pivot_longer(
cols = bp1:bp2,
names_to = "measurement",
values_to = "value"
)# A tibble: 6 × 3
id measurement value
<chr> <chr> <dbl>
1 A bp1 100
2 A bp2 120
3 B bp1 140
4 B bp2 115
5 C bp1 120
6 C bp2 125
Pivoting data: widening data sets
cms_patient_experience #load buit in data set expressing patient satisfaction w/ health organizations# A tibble: 500 × 5
org_pac_id org_nm measure_cd measure_title prf_rate
<chr> <chr> <chr> <chr> <dbl>
1 0446157747 USC CARE MEDICAL GROUP INC CAHPS_GRP… CAHPS for MI… 63
2 0446157747 USC CARE MEDICAL GROUP INC CAHPS_GRP… CAHPS for MI… 87
3 0446157747 USC CARE MEDICAL GROUP INC CAHPS_GRP… CAHPS for MI… 86
4 0446157747 USC CARE MEDICAL GROUP INC CAHPS_GRP… CAHPS for MI… 57
5 0446157747 USC CARE MEDICAL GROUP INC CAHPS_GRP… CAHPS for MI… 85
6 0446157747 USC CARE MEDICAL GROUP INC CAHPS_GRP… CAHPS for MI… 24
7 0446162697 ASSOCIATION OF UNIVERSITY PHYSI… CAHPS_GRP… CAHPS for MI… 59
8 0446162697 ASSOCIATION OF UNIVERSITY PHYSI… CAHPS_GRP… CAHPS for MI… 85
9 0446162697 ASSOCIATION OF UNIVERSITY PHYSI… CAHPS_GRP… CAHPS for MI… 83
10 0446162697 ASSOCIATION OF UNIVERSITY PHYSI… CAHPS_GRP… CAHPS for MI… 63
# ℹ 490 more rows
cms_patient_experience |>
distinct(measure_cd, measure_title) #See complete set of variables# A tibble: 6 × 2
measure_cd measure_title
<chr> <chr>
1 CAHPS_GRP_1 CAHPS for MIPS SSM: Getting Timely Care, Appointments, and Infor…
2 CAHPS_GRP_2 CAHPS for MIPS SSM: How Well Providers Communicate
3 CAHPS_GRP_3 CAHPS for MIPS SSM: Patient's Rating of Provider
4 CAHPS_GRP_5 CAHPS for MIPS SSM: Health Promotion and Education
5 CAHPS_GRP_8 CAHPS for MIPS SSM: Courteous and Helpful Office Staff
6 CAHPS_GRP_12 CAHPS for MIPS SSM: Stewardship of Patient Resources
# measure_cd doesn’t hint at the meaning of the variable and measure_title is a long sentence containing spacesCode below still results in multiple rows for each organization since we still need to tell pivot_wider() which column or columns have values that uniquely identify each row
# add line to specify organizations
cms_patient_experience |>
pivot_wider(
id_cols = starts_with("org"),
names_from = measure_cd,
values_from = prf_rate
)# A tibble: 95 × 8
org_pac_id org_nm CAHPS_GRP_1 CAHPS_GRP_2 CAHPS_GRP_3 CAHPS_GRP_5 CAHPS_GRP_8
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0446157747 USC C… 63 87 86 57 85
2 0446162697 ASSOC… 59 85 83 63 88
3 0547164295 BEAVE… 49 NA 75 44 73
4 0749333730 CAPE … 67 84 85 65 82
5 0840104360 ALLIA… 66 87 87 64 87
6 0840109864 REX H… 73 87 84 67 91
7 0840513552 SCL H… 58 83 76 58 78
8 0941545784 GRITM… 46 86 81 54 NA
9 1052612785 COMMU… 65 84 80 58 87
10 1254237779 OUR L… 61 NA NA 65 NA
# ℹ 85 more rows
# ℹ 1 more variable: CAHPS_GRP_12 <dbl>
Widening data sets example:
# Create data set with blood pressure measurments from 2 patients
df <- tribble(
~id, ~measurement, ~value,
"A", "bp1", 100,
"B", "bp1", 140,
"B", "bp2", 115,
"A", "bp2", 120,
"A", "bp3", 105
)
# Rearrange data with patients as rows
df |>
pivot_wider(
names_from = measurement,
values_from = value
)# A tibble: 2 × 4
id bp1 bp2 bp3
<chr> <dbl> <dbl> <dbl>
1 A 100 120 105
2 B 140 115 NA
Pivoting Data: Palmer Penguins
Lengthening data
library(palmerpenguins)
head(penguins)# A tibble: 6 × 8
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
<fct> <fct> <dbl> <dbl> <int> <int>
1 Adelie Torgersen 39.1 18.7 181 3750
2 Adelie Torgersen 39.5 17.4 186 3800
3 Adelie Torgersen 40.3 18 195 3250
4 Adelie Torgersen NA NA NA NA
5 Adelie Torgersen 36.7 19.3 193 3450
6 Adelie Torgersen 39.3 20.6 190 3650
# ℹ 2 more variables: sex <fct>, year <int>
# Create a long version of the penguins data set
penguins_long <- penguins |>
pivot_longer(
cols = c(bill_length_mm, bill_depth_mm, flipper_length_mm, body_mass_g),
names_to = "measurement_type",
values_to = "value"
)
# View the result
head(penguins_long)# A tibble: 6 × 6
species island sex year measurement_type value
<fct> <fct> <fct> <int> <chr> <dbl>
1 Adelie Torgersen male 2007 bill_length_mm 39.1
2 Adelie Torgersen male 2007 bill_depth_mm 18.7
3 Adelie Torgersen male 2007 flipper_length_mm 181
4 Adelie Torgersen male 2007 body_mass_g 3750
5 Adelie Torgersen female 2007 bill_length_mm 39.5
6 Adelie Torgersen female 2007 bill_depth_mm 17.4
#Create histogram of long data
penguins_long |>
drop_na(value) |>
ggplot(aes(x = value, fill = species)) +
geom_histogram(bins = 30, alpha = 0.7, colour = "black") +
facet_wrap(~ measurement_type, scales = "free_x") +
theme_minimal() +
labs(
title = "Morphometric distributions across penguin species",
x = "Measurement value",
y = "Frequency"
)Widening data (can use to present summary statistics as a cross-tabulated matrix. It is much easier for a reader to parse a wide table than a long, repetitive list).
# Calculate mean body mass for each species on each island
mass_summary <- penguins |>
drop_na(body_mass_g) |>
group_by(species, island) |>
summarise(mean_mass = mean(body_mass_g))
head(mass_summary)# A tibble: 5 × 3
# Groups: species [3]
species island mean_mass
<fct> <fct> <dbl>
1 Adelie Biscoe 3710.
2 Adelie Dream 3688.
3 Adelie Torgersen 3706.
4 Chinstrap Dream 3733.
5 Gentoo Biscoe 5076.
# Output is technically "tidy" and computationally useful, but it is not ideal for publication presentation. We can use pivot_wider() to pull the island names up into columns, creating a clean matrix of species by island.
mass_matrix <- mass_summary |>
pivot_wider(
names_from = island,
values_from = mean_mass
)
head(mass_matrix)# A tibble: 3 × 4
# Groups: species [3]
species Biscoe Dream Torgersen
<fct> <dbl> <dbl> <dbl>
1 Adelie 3710. 3688. 3706.
2 Chinstrap NA 3733. NA
3 Gentoo 5076. NA NA
# names_from dictates which column provides our new column headers (the islands), and values_from dictates which column populates the cells beneath them (the mean mass)Separating and uniting data tables
# Table where cases and population are combined into one column to show rate of infection per 10,000 people
table3# A tibble: 6 × 3
country year rate
<chr> <dbl> <chr>
1 Afghanistan 1999 745/19987071
2 Afghanistan 2000 2666/20595360
3 Brazil 1999 37737/172006362
4 Brazil 2000 80488/174504898
5 China 1999 212258/1272915272
6 China 2000 213766/1280428583
# We need to split the rate column up into two variables: 1) cases and 2) population
table3 %>%
separate(rate, into = c("cases", "population"))# A tibble: 6 × 4
country year cases population
<chr> <dbl> <chr> <chr>
1 Afghanistan 1999 745 19987071
2 Afghanistan 2000 2666 20595360
3 Brazil 1999 37737 172006362
4 Brazil 2000 80488 174504898
5 China 1999 212258 1272915272
6 China 2000 213766 1280428583
# Can also specify what character to use when separating:
# separate(rate, into = c("cases", "population"), sep = "/")
# Need to use convert = TRUE to set new variables as numerical since separate() defaults to character
table3 %>%
separate(rate, into = c("cases", "population"), convert = TRUE)# A tibble: 6 × 4
country year cases population
<chr> <dbl> <int> <int>
1 Afghanistan 1999 745 19987071
2 Afghanistan 2000 2666 20595360
3 Brazil 1999 37737 172006362
4 Brazil 2000 80488 174504898
5 China 1999 212258 1272915272
6 China 2000 213766 1280428583
# Can also separate numerical varriables by position instead of character (less useful)
table3 %>%
separate(year, into = c("century", "year"), sep = 2)# A tibble: 6 × 4
country century year rate
<chr> <chr> <chr> <chr>
1 Afghanistan 19 99 745/19987071
2 Afghanistan 20 00 2666/20595360
3 Brazil 19 99 37737/172006362
4 Brazil 20 00 80488/174504898
5 China 19 99 212258/1272915272
6 China 20 00 213766/1280428583
# Uniting columns is also an option
table5# A tibble: 6 × 4
country century year rate
<chr> <chr> <chr> <chr>
1 Afghanistan 19 99 745/19987071
2 Afghanistan 20 00 2666/20595360
3 Brazil 19 99 37737/172006362
4 Brazil 20 00 80488/174504898
5 China 19 99 212258/1272915272
6 China 20 00 213766/1280428583
table5 %>%
unite(new, century, year, sep = "") #unites century and year into column called "new" without a separator# A tibble: 6 × 3
country new rate
<chr> <chr> <chr>
1 Afghanistan 1999 745/19987071
2 Afghanistan 2000 2666/20595360
3 Brazil 1999 37737/172006362
4 Brazil 2000 80488/174504898
5 China 1999 212258/1272915272
6 China 2000 213766/1280428583
Wrangling Strings and Dates
Before we can merge datasets or plot timelines, we must standardize our text strings and date-times. The tidyverse includes two specialized packages for this: stringr for text and lubridate for dates.
Using Stringr to standardize text
library(tidyverse)
# A remarkably messy data frame of field sites
messy_sites <- tibble(
site_id = c(" Nelly Bay", "nelly_bay", "NELLY BAY", " Geoffrey_Bay ", "geoffrey bay")
)
# Using stringr within mutate to standardize the text
clean_sites <- messy_sites |>
mutate(
# 1. Convert everything to lowercase
site_clean = str_to_lower(site_id),
# 2. Trim any leading or trailing whitespace (invisible spaces at the ends)
site_clean = str_trim(site_clean),
# 3. Replace any spaces with underscores
site_clean = str_replace_all(site_clean, pattern = " ", replacement = "_")
)
print(clean_sites)# A tibble: 5 × 2
site_id site_clean
<chr> <chr>
1 " Nelly Bay" nelly_bay
2 "nelly_bay" nelly_bay
3 "NELLY BAY" nelly_bay
4 " Geoffrey_Bay " geoffrey_bay
5 "geoffrey bay" geoffrey_bay
Using Lubridate to standardize Dates
library(lubridate)
# Parsing different date formats
date_1 <- dmy("25/12/2026") # if day is listed first
date_2 <- ymd("2026-12-25") # if year is listed first
# R now recognizes these as identical Date objects
date_1 == date_2[1] TRUE
# A tibble of raw sensor data with a messy character timestamp
sensor_data <- tibble(
raw_time = c("14-05-2026 08:30:00", "14-05-2026 08:45:00", "14-05-2026 09:00:00"),
temperature = c(24.5, 24.6, 24.4)
)
# Converting character strings to true POSIXct datetime objects
sensor_clean <- sensor_data |>
mutate(
true_time = dmy_hms(raw_time)
)
print(sensor_clean)# A tibble: 3 × 3
raw_time temperature true_time
<chr> <dbl> <dttm>
1 14-05-2026 08:30:00 24.5 2026-05-14 08:30:00
2 14-05-2026 08:45:00 24.6 2026-05-14 08:45:00
3 14-05-2026 09:00:00 24.4 2026-05-14 09:00:00
Joining tables
Set up 2 tables
# Table 1: Biological observation data
observations <- tibble(
site_code = c("NB", "GB", "MI", "NB", "HB"),
species = c("Trout", "Snapper", "Trout", "Cod", "Trout"),
count = c(5, 2, 1, 3, 8)
)
# Table 2: Spatial metadata
site_metadata <- tibble(
site_code = c("NB", "GB", "MI", "RP", "WP"),
zone = c("Marine National Park", "Conservation Park", "Habitat Protection", "General Use", "Other Use"),
lat = c(-19.16, -19.15, -19.14, -19.12, -19.11)
)
print(observations)# A tibble: 5 × 3
site_code species count
<chr> <chr> <dbl>
1 NB Trout 5
2 GB Snapper 2
3 MI Trout 1
4 NB Cod 3
5 HB Trout 8
print(site_metadata)# A tibble: 5 × 3
site_code zone lat
<chr> <chr> <dbl>
1 NB Marine National Park -19.2
2 GB Conservation Park -19.2
3 MI Habitat Protection -19.1
4 RP General Use -19.1
5 WP Other Use -19.1
left_join(): keeps all the rows from your left table (observations) and matches data from your right table (site_metadata) wherever the key matches. If a match is missing, it fills the gaps with NA.
# Joining metadata to our observations
joined_data <- observations |>
left_join(site_metadata, by = join_by(site_code))
print(joined_data)# A tibble: 5 × 5
site_code species count zone lat
<chr> <chr> <dbl> <chr> <dbl>
1 NB Trout 5 Marine National Park -19.2
2 GB Snapper 2 Conservation Park -19.2
3 MI Trout 1 Habitat Protection -19.1
4 NB Cod 3 Marine National Park -19.2
5 HB Trout 8 <NA> NA
inner_join(): only keeps rows that have complete matches in both tables.
matched_data <- observations |>
inner_join(site_metadata, by = join_by(site_code))
print(matched_data)# A tibble: 4 × 5
site_code species count zone lat
<chr> <chr> <dbl> <chr> <dbl>
1 NB Trout 5 Marine National Park -19.2
2 GB Snapper 2 Conservation Park -19.2
3 MI Trout 1 Habitat Protection -19.1
4 NB Cod 3 Marine National Park -19.2
anti_join(): filters your left table to only show rows that do not have a match in the right table (run after left join if you end up with NA values)
# Which observations are missing from our metadata dictionary?
missing_context <- observations |>
anti_join(site_metadata, by = join_by(site_code))
print(missing_context)# A tibble: 1 × 3
site_code species count
<chr> <chr> <dbl>
1 HB Trout 8
Dealing with NA Values
Converting legacy errors with na_if()
#Raw data from an old temperature logger
logger_data <- tibble(
depth_m = c(10, 20, 30, 40),
temp_c = c(24.5, 24.1, -999, 23.5)) # -999 is a known sensor error code
# Convert the -999 error codes to true NA values
fixed_logger <- logger_data |>
mutate(temp_c = na_if(temp_c, -999))
print(fixed_logger)# A tibble: 4 × 2
depth_m temp_c
<dbl> <dbl>
1 10 24.5
2 20 24.1
3 30 NA
4 40 23.5
Replacing NA values with coalesce()
# Simulate some count data
shark_counts <- tibble(
site = c("Reef_A", "Reef_B", "Reef_C"),
shark_count = c(3, NA, 5)) # The NA here actually means 0 sharks were seen
# Replace NA with 0
shark_fixed <- shark_counts |>
mutate(shark_count = coalesce(shark_count, 0))
print(shark_fixed)# A tibble: 3 × 2
site shark_count
<chr> <dbl>
1 Reef_A 3
2 Reef_B 0
3 Reef_C 5
Important: coalesce() is incredibly powerful. If you have two columns, like a primary_sensor and a backup_sensor, coalesce(primary_sensor, backup_sensor) will automatically use the primary reading, but seamlessly fill in the gaps with the backup reading if the primary had failed!)
NaN (not a number), often happens when calculating metrics like Catch Per Unit Effort (CPUE) for a site where both the catch and the effort were zero and the function divided by zero.
cpue_data <- tibble(
site = c("Bay_1", "Bay_2"),
catch = c(10, 0),
effort_hours = c(2, 0))
# Calculating CPUE (catch / effort)
cpue_calc <- cpue_data |>
mutate(
cpue = catch / effort_hours)
print(cpue_calc)# A tibble: 2 × 4
site catch effort_hours cpue
<chr> <dbl> <dbl> <dbl>
1 Bay_1 10 2 5
2 Bay_2 0 0 NaN
# Added line to check for NaN values
cpue_calc <- cpue_data |>
mutate( is.nan(catch/ effort_hours), cpue = NA_real_,
cpue = catch / effort_hours)
print(cpue_calc)# A tibble: 2 × 5
site catch effort_hours `is.nan(catch/effort_hours)` cpue
<chr> <dbl> <dbl> <lgl> <dbl>
1 Bay_1 10 2 FALSE 5
2 Bay_2 0 0 TRUE NaN
Implicit missing values: complete()
# Important in ecology to know where species are not found, however tibbles don't typically include rows for zero catch data
raw_catch <- tibble(
site = c("Reef_1", "Reef_1", "Reef_2"),
species = c("Pmaculatus", "Pleopardus", "Pmaculatus"),
count = c(5, 2, 8))
print(raw_catch)# A tibble: 3 × 3
site species count
<chr> <chr> <dbl>
1 Reef_1 Pmaculatus 5
2 Reef_1 Pleopardus 2
3 Reef_2 Pmaculatus 8
# Force inclusion of hidden zero-catch data (imo)
full_catch_matrix <- raw_catch |>
complete(site, species, fill = list(count = 0))
print(full_catch_matrix)# A tibble: 4 × 3
site species count
<chr> <chr> <dbl>
1 Reef_1 Pleopardus 2
2 Reef_1 Pmaculatus 5
3 Reef_2 Pleopardus 0
4 Reef_2 Pmaculatus 8
The nuclear option: drop_na(). May have to remove entire row if NA value is core response variable
sensor_log <- tibble(
day = 1:4,
salinity = c(35.2, 35.1, NA, 35.3)
)
print(sensor_log)# A tibble: 4 × 2
day salinity
<int> <dbl>
1 1 35.2
2 2 35.1
3 3 NA
4 4 35.3
# Remove any row where salinity is missing
clean_log <- sensor_log |>
drop_na(salinity)
print(clean_log)# A tibble: 3 × 2
day salinity
<int> <dbl>
1 1 35.2
2 2 35.1
3 4 35.3
Example problem: Penguins, Pivots, and Relational Data
You have the clean penguins dataset, but a colleague has just emailed you a supplementary metadata file containing the GPS coordinates and the date the primary weather stations were installed on each island.
# Make sure packages are loaded
library(tidyverse)
library(palmerpenguins)
# The messy metadata provided by your colleague
island_metadata <- tibble(
island_name = c(" biscoe", "Dream ", "Torgersen"),
station_install = c("15/01/2003", "22-03-2004", "05/11/2001"),
latitude = c(-64.81, -64.73, -64.76)
)
print(island_metadata)# A tibble: 3 × 3
island_name station_install latitude
<chr> <chr> <dbl>
1 " biscoe" 15/01/2003 -64.8
2 "Dream " 22-03-2004 -64.7
3 "Torgersen" 05/11/2001 -64.8
# Using mutate(), stringr, and lubridate, fix the island_name column so that it perfectly matches the island column in the penguins dataset Then, convert the station_install column into a proper Date object.
clean_metadata <- island_metadata |>
mutate(
island_name = str_trim(island_name), # Remove leading/trailing whitespace
island_name = str_to_title(island_name), # Convert to title case
station_install = dmy(station_install) # Convert to Date object
)
print(clean_metadata)# A tibble: 3 × 3
island_name station_install latitude
<chr> <date> <dbl>
1 Biscoe 2003-01-15 -64.8
2 Dream 2004-03-22 -64.7
3 Torgersen 2001-11-05 -64.8
# Use left_join() to attach your clean_metadata to the main penguins dataset.
penguins_spatial <- penguins |>
left_join(clean_metadata, by = join_by(island == island_name))
head(penguins_spatial)# A tibble: 6 × 10
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
<fct> <chr> <dbl> <dbl> <int> <int>
1 Adelie Torgersen 39.1 18.7 181 3750
2 Adelie Torgersen 39.5 17.4 186 3800
3 Adelie Torgersen 40.3 18 195 3250
4 Adelie Torgersen NA NA NA NA
5 Adelie Torgersen 36.7 19.3 193 3450
6 Adelie Torgersen 39.3 20.6 190 3650
# ℹ 4 more variables: sex <fct>, year <int>, station_install <date>,
# latitude <dbl>
A collaborator has asked for a clean, publication-ready table showing the maximum body mass recorded for each species, separated by island
penguin_summary = penguins_spatial |>
drop_na(body_mass_g) |> #Drop missing values in the body_mass_g column
group_by(species, island) |> #Group by species and island
summarise(max_mass = max(body_mass_g), .groups = "drop") |> #Summarise data to find maximum body mass
pivot_wider(names_from = island, values_from = max_mass) #Pivot dataset wider so species form the rows, and islands form the columns.
print(penguin_summary)# A tibble: 3 × 4
species Biscoe Dream Torgersen
<fct> <int> <int> <int>
1 Adelie 4775 4650 4700
2 Chinstrap NA 4800 NA
3 Gentoo 6300 NA NA