R for Marine Science Workshop 2

Author

Ryan Waln

Published

June 11, 2026

Introduction

This file covers more advanced data wrangling techniques, including:

Pivoting data to move around rows and columns in tables
Separating and uniting tables or specific columns
Standardizing text and dates in data sets
Dealing with NA values

# Packages
library(tidyverse)

Rules for tidy data:

Each variable must have its own column.
Each observation must have its own row.
Each value must have its own cell.
ALWAYS: Put each dataset in a tibble and each variable in a column.

Example of working with tidy data

# Compute rate per 10,000 (existing data set)
table1 %>% 
  mutate(rate = cases / population * 10000)

# A tibble: 6 × 5
  country      year  cases population  rate
  <chr>       <dbl>  <dbl>      <dbl> <dbl>
1 Afghanistan  1999    745   19987071 0.373
2 Afghanistan  2000   2666   20595360 1.29 
3 Brazil       1999  37737  172006362 2.19 
4 Brazil       2000  80488  174504898 4.61 
5 China        1999 212258 1272915272 1.67 
6 China        2000 213766 1280428583 1.67

# Compute cases per year
table1 %>% 
  count(year, wt = cases)

# A tibble: 2 × 2
   year      n
  <dbl>  <dbl>
1  1999 250740
2  2000 296920

# Visualise changes over time
library(ggplot2)
ggplot(table1, aes(year, cases)) + 
  geom_line(aes(group = country), colour = "grey50") + 
  geom_point(aes(colour = country))

Pivoting data: Lengthening data sets

# Usually used to fix when one variable is spread across multiple columns or one observation across multiple rows

# Increases rows & decreases columns:  pivot_longer() 

#Data on ranked songs of 2000 (each song's rank listed over 76 weeks)
billboard

# A tibble: 317 × 79
   artist     track date.entered   wk1   wk2   wk3   wk4   wk5   wk6   wk7   wk8
   <chr>      <chr> <date>       <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
 1 2 Pac      Baby… 2000-02-26      87    82    72    77    87    94    99    NA
 2 2Ge+her    The … 2000-09-02      91    87    92    NA    NA    NA    NA    NA
 3 3 Doors D… Kryp… 2000-04-08      81    70    68    67    66    57    54    53
 4 3 Doors D… Loser 2000-10-21      76    76    72    69    67    65    55    59
 5 504 Boyz   Wobb… 2000-04-15      57    34    25    17    17    31    36    49
 6 98^0       Give… 2000-08-19      51    39    34    26    26    19     2     2
 7 A*Teens    Danc… 2000-07-08      97    97    96    95   100    NA    NA    NA
 8 Aaliyah    I Do… 2000-01-29      84    62    51    41    38    35    35    38
 9 Aaliyah    Try … 2000-03-18      59    53    38    28    21    18    16    14
10 Adams, Yo… Open… 2000-08-26      76    76    74    69    68    67    61    58
# ℹ 307 more rows
# ℹ 68 more variables: wk9 <dbl>, wk10 <dbl>, wk11 <dbl>, wk12 <dbl>,
#   wk13 <dbl>, wk14 <dbl>, wk15 <dbl>, wk16 <dbl>, wk17 <dbl>, wk18 <dbl>,
#   wk19 <dbl>, wk20 <dbl>, wk21 <dbl>, wk22 <dbl>, wk23 <dbl>, wk24 <dbl>,
#   wk25 <dbl>, wk26 <dbl>, wk27 <dbl>, wk28 <dbl>, wk29 <dbl>, wk30 <dbl>,
#   wk31 <dbl>, wk32 <dbl>, wk33 <dbl>, wk34 <dbl>, wk35 <dbl>, wk36 <dbl>,
#   wk37 <dbl>, wk38 <dbl>, wk39 <dbl>, wk40 <dbl>, wk41 <dbl>, wk42 <dbl>, …

# Move weeks into one column to tidy
billboard |> 
  pivot_longer(
    cols = starts_with("wk"), # specifies which columns to pivot
    names_to = "week",        # names the variable stored in the column names
    values_to = "rank",       # names the variable stored in the cell values that we named rank (had to create those 2 variables)
    values_drop_na = TRUE
  )

# A tibble: 5,307 × 5
   artist  track                   date.entered week   rank
   <chr>   <chr>                   <date>       <chr> <dbl>
 1 2 Pac   Baby Don't Cry (Keep... 2000-02-26   wk1      87
 2 2 Pac   Baby Don't Cry (Keep... 2000-02-26   wk2      82
 3 2 Pac   Baby Don't Cry (Keep... 2000-02-26   wk3      72
 4 2 Pac   Baby Don't Cry (Keep... 2000-02-26   wk4      77
 5 2 Pac   Baby Don't Cry (Keep... 2000-02-26   wk5      87
 6 2 Pac   Baby Don't Cry (Keep... 2000-02-26   wk6      94
 7 2 Pac   Baby Don't Cry (Keep... 2000-02-26   wk7      99
 8 2Ge+her The Hardest Part Of ... 2000-09-02   wk1      91
 9 2Ge+her The Hardest Part Of ... 2000-09-02   wk2      87
10 2Ge+her The Hardest Part Of ... 2000-09-02   wk3      92
# ℹ 5,297 more rows

Pivoting: longer example

# Create dataset "df" w/3 varriables
df <- tribble(
  ~id,  ~bp1, ~bp2,
   "A",  100,  120,
   "B",  140,  115,
   "C",  120,  125
)

# Tidy data to have 3 variables: id (already exists), measurement (the column names), & value (the cell values)
df |> 
  pivot_longer(
    cols = bp1:bp2,
    names_to = "measurement",  
    values_to = "value"    
  )

# A tibble: 6 × 3
  id    measurement value
  <chr> <chr>       <dbl>
1 A     bp1           100
2 A     bp2           120
3 B     bp1           140
4 B     bp2           115
5 C     bp1           120
6 C     bp2           125

Pivoting data: widening data sets

cms_patient_experience  #load buit in data set expressing patient satisfaction w/ health organizations

# A tibble: 500 × 5
   org_pac_id org_nm                           measure_cd measure_title prf_rate
   <chr>      <chr>                            <chr>      <chr>            <dbl>
 1 0446157747 USC CARE MEDICAL GROUP INC       CAHPS_GRP… CAHPS for MI…       63
 2 0446157747 USC CARE MEDICAL GROUP INC       CAHPS_GRP… CAHPS for MI…       87
 3 0446157747 USC CARE MEDICAL GROUP INC       CAHPS_GRP… CAHPS for MI…       86
 4 0446157747 USC CARE MEDICAL GROUP INC       CAHPS_GRP… CAHPS for MI…       57
 5 0446157747 USC CARE MEDICAL GROUP INC       CAHPS_GRP… CAHPS for MI…       85
 6 0446157747 USC CARE MEDICAL GROUP INC       CAHPS_GRP… CAHPS for MI…       24
 7 0446162697 ASSOCIATION OF UNIVERSITY PHYSI… CAHPS_GRP… CAHPS for MI…       59
 8 0446162697 ASSOCIATION OF UNIVERSITY PHYSI… CAHPS_GRP… CAHPS for MI…       85
 9 0446162697 ASSOCIATION OF UNIVERSITY PHYSI… CAHPS_GRP… CAHPS for MI…       83
10 0446162697 ASSOCIATION OF UNIVERSITY PHYSI… CAHPS_GRP… CAHPS for MI…       63
# ℹ 490 more rows

cms_patient_experience |> 
  distinct(measure_cd, measure_title)   #See complete set of variables

# A tibble: 6 × 2
  measure_cd   measure_title                                                    
  <chr>        <chr>                                                            
1 CAHPS_GRP_1  CAHPS for MIPS SSM: Getting Timely Care, Appointments, and Infor…
2 CAHPS_GRP_2  CAHPS for MIPS SSM: How Well Providers Communicate               
3 CAHPS_GRP_3  CAHPS for MIPS SSM: Patient's Rating of Provider                 
4 CAHPS_GRP_5  CAHPS for MIPS SSM: Health Promotion and Education               
5 CAHPS_GRP_8  CAHPS for MIPS SSM: Courteous and Helpful Office Staff           
6 CAHPS_GRP_12 CAHPS for MIPS SSM: Stewardship of Patient Resources

# measure_cd doesn’t hint at the meaning of the variable and measure_title is a long sentence containing spaces

Code below still results in multiple rows for each organization since we still need to tell pivot_wider() which column or columns have values that uniquely identify each row

# add line to specify organizations
cms_patient_experience |> 
  pivot_wider(
    id_cols = starts_with("org"),
    names_from = measure_cd,
    values_from = prf_rate
  )

# A tibble: 95 × 8
   org_pac_id org_nm CAHPS_GRP_1 CAHPS_GRP_2 CAHPS_GRP_3 CAHPS_GRP_5 CAHPS_GRP_8
   <chr>      <chr>        <dbl>       <dbl>       <dbl>       <dbl>       <dbl>
 1 0446157747 USC C…          63          87          86          57          85
 2 0446162697 ASSOC…          59          85          83          63          88
 3 0547164295 BEAVE…          49          NA          75          44          73
 4 0749333730 CAPE …          67          84          85          65          82
 5 0840104360 ALLIA…          66          87          87          64          87
 6 0840109864 REX H…          73          87          84          67          91
 7 0840513552 SCL H…          58          83          76          58          78
 8 0941545784 GRITM…          46          86          81          54          NA
 9 1052612785 COMMU…          65          84          80          58          87
10 1254237779 OUR L…          61          NA          NA          65          NA
# ℹ 85 more rows
# ℹ 1 more variable: CAHPS_GRP_12 <dbl>

Widening data sets example:

# Create data set with blood pressure measurments from 2 patients
df <- tribble(
  ~id, ~measurement, ~value,
  "A",        "bp1",    100,
  "B",        "bp1",    140,
  "B",        "bp2",    115, 
  "A",        "bp2",    120,
  "A",        "bp3",    105
)

# Rearrange data with patients as rows
df |> 
  pivot_wider(
    names_from = measurement,
    values_from = value
  )

# A tibble: 2 × 4
  id      bp1   bp2   bp3
  <chr> <dbl> <dbl> <dbl>
1 A       100   120   105
2 B       140   115    NA

Pivoting Data: Palmer Penguins

Lengthening data

library(palmerpenguins)
head(penguins)

# A tibble: 6 × 8
  species island    bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
  <fct>   <fct>              <dbl>         <dbl>             <int>       <int>
1 Adelie  Torgersen           39.1          18.7               181        3750
2 Adelie  Torgersen           39.5          17.4               186        3800
3 Adelie  Torgersen           40.3          18                 195        3250
4 Adelie  Torgersen           NA            NA                  NA          NA
5 Adelie  Torgersen           36.7          19.3               193        3450
6 Adelie  Torgersen           39.3          20.6               190        3650
# ℹ 2 more variables: sex <fct>, year <int>

# Create a long version of the penguins data set

penguins_long <- penguins |>
  pivot_longer(
    cols = c(bill_length_mm, bill_depth_mm, flipper_length_mm, body_mass_g),
    names_to = "measurement_type",
    values_to = "value"
  )
# View the result
head(penguins_long)

# A tibble: 6 × 6
  species island    sex     year measurement_type   value
  <fct>   <fct>     <fct>  <int> <chr>              <dbl>
1 Adelie  Torgersen male    2007 bill_length_mm      39.1
2 Adelie  Torgersen male    2007 bill_depth_mm       18.7
3 Adelie  Torgersen male    2007 flipper_length_mm  181  
4 Adelie  Torgersen male    2007 body_mass_g       3750  
5 Adelie  Torgersen female  2007 bill_length_mm      39.5
6 Adelie  Torgersen female  2007 bill_depth_mm       17.4

#Create histogram of long data
penguins_long |>
  drop_na(value) |>
  ggplot(aes(x = value, fill = species)) +
  geom_histogram(bins = 30, alpha = 0.7, colour = "black") +
  facet_wrap(~ measurement_type, scales = "free_x") +
  theme_minimal() +
  labs(
    title = "Morphometric distributions across penguin species",
    x = "Measurement value",
    y = "Frequency"
  )

Widening data (can use to present summary statistics as a cross-tabulated matrix. It is much easier for a reader to parse a wide table than a long, repetitive list).

# Calculate mean body mass for each species on each island 
mass_summary <- penguins |>
  drop_na(body_mass_g) |>
  group_by(species, island) |>
  summarise(mean_mass = mean(body_mass_g))

head(mass_summary)

# A tibble: 5 × 3
# Groups:   species [3]
  species   island    mean_mass
  <fct>     <fct>         <dbl>
1 Adelie    Biscoe        3710.
2 Adelie    Dream         3688.
3 Adelie    Torgersen     3706.
4 Chinstrap Dream         3733.
5 Gentoo    Biscoe        5076.

# Output is technically "tidy" and computationally useful, but it is not ideal for publication presentation. We can use pivot_wider() to pull the island names up into columns, creating a clean matrix of species by island.
mass_matrix <- mass_summary |>
  pivot_wider(
    names_from = island,
    values_from = mean_mass
  )
head(mass_matrix)

# A tibble: 3 × 4
# Groups:   species [3]
  species   Biscoe Dream Torgersen
  <fct>      <dbl> <dbl>     <dbl>
1 Adelie     3710. 3688.     3706.
2 Chinstrap    NA  3733.       NA 
3 Gentoo     5076.   NA        NA

# names_from dictates which column provides our new column headers (the islands), and values_from dictates which column populates the cells beneath them (the mean mass)

Separating and uniting data tables

# Table where cases and population are combined into one column to show rate of infection per 10,000 people
table3

# A tibble: 6 × 3
  country      year rate             
  <chr>       <dbl> <chr>            
1 Afghanistan  1999 745/19987071     
2 Afghanistan  2000 2666/20595360    
3 Brazil       1999 37737/172006362  
4 Brazil       2000 80488/174504898  
5 China        1999 212258/1272915272
6 China        2000 213766/1280428583

# We need to split the rate column up into two variables: 1) cases and 2) population
table3 %>% 
  separate(rate, into = c("cases", "population"))

# A tibble: 6 × 4
  country      year cases  population
  <chr>       <dbl> <chr>  <chr>     
1 Afghanistan  1999 745    19987071  
2 Afghanistan  2000 2666   20595360  
3 Brazil       1999 37737  172006362 
4 Brazil       2000 80488  174504898 
5 China        1999 212258 1272915272
6 China        2000 213766 1280428583

# Can also specify what character to use when separating:
# separate(rate, into = c("cases", "population"), sep = "/")

# Need to use convert = TRUE to set new variables as numerical since separate() defaults to character
table3 %>% 
  separate(rate, into = c("cases", "population"), convert = TRUE)

# A tibble: 6 × 4
  country      year  cases population
  <chr>       <dbl>  <int>      <int>
1 Afghanistan  1999    745   19987071
2 Afghanistan  2000   2666   20595360
3 Brazil       1999  37737  172006362
4 Brazil       2000  80488  174504898
5 China        1999 212258 1272915272
6 China        2000 213766 1280428583

# Can also separate numerical varriables by position instead of character (less useful)
table3 %>% 
  separate(year, into = c("century", "year"), sep = 2)

# A tibble: 6 × 4
  country     century year  rate             
  <chr>       <chr>   <chr> <chr>            
1 Afghanistan 19      99    745/19987071     
2 Afghanistan 20      00    2666/20595360    
3 Brazil      19      99    37737/172006362  
4 Brazil      20      00    80488/174504898  
5 China       19      99    212258/1272915272
6 China       20      00    213766/1280428583

# Uniting columns is also an option
table5

# A tibble: 6 × 4
  country     century year  rate             
  <chr>       <chr>   <chr> <chr>            
1 Afghanistan 19      99    745/19987071     
2 Afghanistan 20      00    2666/20595360    
3 Brazil      19      99    37737/172006362  
4 Brazil      20      00    80488/174504898  
5 China       19      99    212258/1272915272
6 China       20      00    213766/1280428583

table5 %>% 
  unite(new, century, year, sep = "")  #unites century and year into column called "new" without a separator

# A tibble: 6 × 3
  country     new   rate             
  <chr>       <chr> <chr>            
1 Afghanistan 1999  745/19987071     
2 Afghanistan 2000  2666/20595360    
3 Brazil      1999  37737/172006362  
4 Brazil      2000  80488/174504898  
5 China       1999  212258/1272915272
6 China       2000  213766/1280428583

Wrangling Strings and Dates

Before we can merge datasets or plot timelines, we must standardize our text strings and date-times. The tidyverse includes two specialized packages for this: stringr for text and lubridate for dates.

Using Stringr to standardize text

library(tidyverse)

# A remarkably messy data frame of field sites
messy_sites <- tibble(
  site_id = c(" Nelly Bay", "nelly_bay", "NELLY BAY", " Geoffrey_Bay ", "geoffrey bay")
)

# Using stringr within mutate to standardize the text
clean_sites <- messy_sites |>
  mutate(
    # 1. Convert everything to lowercase
    site_clean = str_to_lower(site_id),
    # 2. Trim any leading or trailing whitespace (invisible spaces at the ends)
    site_clean = str_trim(site_clean),
    # 3. Replace any spaces with underscores
    site_clean = str_replace_all(site_clean, pattern = " ", replacement = "_")
  )

print(clean_sites)

# A tibble: 5 × 2
  site_id          site_clean  
  <chr>            <chr>       
1 " Nelly Bay"     nelly_bay   
2 "nelly_bay"      nelly_bay   
3 "NELLY BAY"      nelly_bay   
4 " Geoffrey_Bay " geoffrey_bay
5 "geoffrey bay"   geoffrey_bay

Using Lubridate to standardize Dates

library(lubridate)

# Parsing different date formats
date_1 <- dmy("25/12/2026") # if day is listed first
date_2 <- ymd("2026-12-25") # if year is listed first

# R now recognizes these as identical Date objects
date_1 == date_2

[1] TRUE

# A tibble of raw sensor data with a messy character timestamp
sensor_data <- tibble(
  raw_time = c("14-05-2026 08:30:00", "14-05-2026 08:45:00", "14-05-2026 09:00:00"),
  temperature = c(24.5, 24.6, 24.4)
)

# Converting character strings to true POSIXct datetime objects
sensor_clean <- sensor_data |>
  mutate(
    true_time = dmy_hms(raw_time)
  )

print(sensor_clean)

# A tibble: 3 × 3
  raw_time            temperature true_time          
  <chr>                     <dbl> <dttm>             
1 14-05-2026 08:30:00        24.5 2026-05-14 08:30:00
2 14-05-2026 08:45:00        24.6 2026-05-14 08:45:00
3 14-05-2026 09:00:00        24.4 2026-05-14 09:00:00

Joining tables

Set up 2 tables

# Table 1: Biological observation data
observations <- tibble(
  site_code = c("NB", "GB", "MI", "NB", "HB"),
  species = c("Trout", "Snapper", "Trout", "Cod", "Trout"),
  count = c(5, 2, 1, 3, 8)
)

# Table 2: Spatial metadata
site_metadata <- tibble(
  site_code = c("NB", "GB", "MI", "RP", "WP"),
  zone = c("Marine National Park", "Conservation Park", "Habitat Protection", "General Use", "Other Use"),
  lat = c(-19.16, -19.15, -19.14, -19.12, -19.11)
)

print(observations)

# A tibble: 5 × 3
  site_code species count
  <chr>     <chr>   <dbl>
1 NB        Trout       5
2 GB        Snapper     2
3 MI        Trout       1
4 NB        Cod         3
5 HB        Trout       8

print(site_metadata)

# A tibble: 5 × 3
  site_code zone                   lat
  <chr>     <chr>                <dbl>
1 NB        Marine National Park -19.2
2 GB        Conservation Park    -19.2
3 MI        Habitat Protection   -19.1
4 RP        General Use          -19.1
5 WP        Other Use            -19.1

left_join(): keeps all the rows from your left table (observations) and matches data from your right table (site_metadata) wherever the key matches. If a match is missing, it fills the gaps with NA.

# Joining metadata to our observations
joined_data <- observations |>
  left_join(site_metadata, by = join_by(site_code))

print(joined_data)

# A tibble: 5 × 5
  site_code species count zone                   lat
  <chr>     <chr>   <dbl> <chr>                <dbl>
1 NB        Trout       5 Marine National Park -19.2
2 GB        Snapper     2 Conservation Park    -19.2
3 MI        Trout       1 Habitat Protection   -19.1
4 NB        Cod         3 Marine National Park -19.2
5 HB        Trout       8 <NA>                  NA

inner_join(): only keeps rows that have complete matches in both tables.

matched_data <- observations |>
  inner_join(site_metadata, by = join_by(site_code))

print(matched_data)

# A tibble: 4 × 5
  site_code species count zone                   lat
  <chr>     <chr>   <dbl> <chr>                <dbl>
1 NB        Trout       5 Marine National Park -19.2
2 GB        Snapper     2 Conservation Park    -19.2
3 MI        Trout       1 Habitat Protection   -19.1
4 NB        Cod         3 Marine National Park -19.2

anti_join(): filters your left table to only show rows that do not have a match in the right table (run after left join if you end up with NA values)

# Which observations are missing from our metadata dictionary?
missing_context <- observations |>
  anti_join(site_metadata, by = join_by(site_code))

print(missing_context)

# A tibble: 1 × 3
  site_code species count
  <chr>     <chr>   <dbl>
1 HB        Trout       8

Dealing with NA Values

Converting legacy errors with na_if()

 #Raw data from an old temperature logger
logger_data <- tibble(
  depth_m = c(10, 20, 30, 40),
  temp_c = c(24.5, 24.1, -999, 23.5)) # -999 is a known sensor error code

# Convert the -999 error codes to true NA values
fixed_logger <- logger_data |>
  mutate(temp_c = na_if(temp_c, -999))

print(fixed_logger)

# A tibble: 4 × 2
  depth_m temp_c
    <dbl>  <dbl>
1      10   24.5
2      20   24.1
3      30   NA  
4      40   23.5

Replacing NA values with coalesce()

# Simulate some count data
shark_counts <- tibble(
  site = c("Reef_A", "Reef_B", "Reef_C"),
  shark_count = c(3, NA, 5)) # The NA here actually means 0 sharks were seen

# Replace NA with 0 
shark_fixed <- shark_counts |>
  mutate(shark_count = coalesce(shark_count, 0))

print(shark_fixed)

# A tibble: 3 × 2
  site   shark_count
  <chr>        <dbl>
1 Reef_A           3
2 Reef_B           0
3 Reef_C           5

Important: coalesce() is incredibly powerful. If you have two columns, like a primary_sensor and a backup_sensor, coalesce(primary_sensor, backup_sensor) will automatically use the primary reading, but seamlessly fill in the gaps with the backup reading if the primary had failed!)

NaN (not a number), often happens when calculating metrics like Catch Per Unit Effort (CPUE) for a site where both the catch and the effort were zero and the function divided by zero.

cpue_data <- tibble(
  site = c("Bay_1", "Bay_2"),
  catch = c(10, 0),
  effort_hours = c(2, 0))

# Calculating CPUE (catch / effort)
cpue_calc <- cpue_data |>
  mutate(
    cpue = catch / effort_hours)

print(cpue_calc)

# A tibble: 2 × 4
  site  catch effort_hours  cpue
  <chr> <dbl>        <dbl> <dbl>
1 Bay_1    10            2     5
2 Bay_2     0            0   NaN

# Added line to check for NaN values
cpue_calc <- cpue_data |>
  mutate( is.nan(catch/ effort_hours), cpue = NA_real_,
    cpue = catch / effort_hours)

print(cpue_calc)

# A tibble: 2 × 5
  site  catch effort_hours `is.nan(catch/effort_hours)`  cpue
  <chr> <dbl>        <dbl> <lgl>                        <dbl>
1 Bay_1    10            2 FALSE                            5
2 Bay_2     0            0 TRUE                           NaN

Implicit missing values: complete()

# Important in ecology to know where species are not found, however tibbles don't typically include rows for zero catch data
raw_catch <- tibble(
  site = c("Reef_1", "Reef_1", "Reef_2"),
  species = c("Pmaculatus", "Pleopardus", "Pmaculatus"),
  count = c(5, 2, 8))

print(raw_catch)

# A tibble: 3 × 3
  site   species    count
  <chr>  <chr>      <dbl>
1 Reef_1 Pmaculatus     5
2 Reef_1 Pleopardus     2
3 Reef_2 Pmaculatus     8

# Force inclusion of hidden zero-catch data (imo)
full_catch_matrix <- raw_catch |>
  complete(site, species, fill = list(count = 0))

print(full_catch_matrix)

# A tibble: 4 × 3
  site   species    count
  <chr>  <chr>      <dbl>
1 Reef_1 Pleopardus     2
2 Reef_1 Pmaculatus     5
3 Reef_2 Pleopardus     0
4 Reef_2 Pmaculatus     8

The nuclear option: drop_na(). May have to remove entire row if NA value is core response variable

sensor_log <- tibble(
  day = 1:4,
  salinity = c(35.2, 35.1, NA, 35.3)
)
print(sensor_log)

# A tibble: 4 × 2
    day salinity
  <int>    <dbl>
1     1     35.2
2     2     35.1
3     3     NA  
4     4     35.3

# Remove any row where salinity is missing
clean_log <- sensor_log |>
  drop_na(salinity)

print(clean_log)

# A tibble: 3 × 2
    day salinity
  <int>    <dbl>
1     1     35.2
2     2     35.1
3     4     35.3

Example problem: Penguins, Pivots, and Relational Data

You have the clean penguins dataset, but a colleague has just emailed you a supplementary metadata file containing the GPS coordinates and the date the primary weather stations were installed on each island.

# Make sure packages are loaded
library(tidyverse)
library(palmerpenguins)

# The messy metadata provided by your colleague
island_metadata <- tibble(
  island_name = c(" biscoe", "Dream ", "Torgersen"),
  station_install = c("15/01/2003", "22-03-2004", "05/11/2001"),
  latitude = c(-64.81, -64.73, -64.76)
)

print(island_metadata)

# A tibble: 3 × 3
  island_name station_install latitude
  <chr>       <chr>              <dbl>
1 " biscoe"   15/01/2003         -64.8
2 "Dream "    22-03-2004         -64.7
3 "Torgersen" 05/11/2001         -64.8

# Using mutate(), stringr, and lubridate, fix the island_name column so that it perfectly matches the island column in the penguins dataset Then, convert the station_install column into a proper Date object.

clean_metadata <- island_metadata |>
  mutate(
    island_name = str_trim(island_name), # Remove leading/trailing whitespace
    island_name = str_to_title(island_name), # Convert to title case
    station_install = dmy(station_install) # Convert to Date object
  )

print(clean_metadata)

# A tibble: 3 × 3
  island_name station_install latitude
  <chr>       <date>             <dbl>
1 Biscoe      2003-01-15         -64.8
2 Dream       2004-03-22         -64.7
3 Torgersen   2001-11-05         -64.8

# Use left_join() to attach your clean_metadata to the main penguins dataset. 
penguins_spatial <- penguins |>
  left_join(clean_metadata, by = join_by(island == island_name))

head(penguins_spatial)

# A tibble: 6 × 10
  species island    bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
  <fct>   <chr>              <dbl>         <dbl>             <int>       <int>
1 Adelie  Torgersen           39.1          18.7               181        3750
2 Adelie  Torgersen           39.5          17.4               186        3800
3 Adelie  Torgersen           40.3          18                 195        3250
4 Adelie  Torgersen           NA            NA                  NA          NA
5 Adelie  Torgersen           36.7          19.3               193        3450
6 Adelie  Torgersen           39.3          20.6               190        3650
# ℹ 4 more variables: sex <fct>, year <int>, station_install <date>,
#   latitude <dbl>

A collaborator has asked for a clean, publication-ready table showing the maximum body mass recorded for each species, separated by island

penguin_summary = penguins_spatial |>
  drop_na(body_mass_g) |>    #Drop missing values in the body_mass_g column
  group_by(species, island) |>   #Group by species and island
  summarise(max_mass = max(body_mass_g), .groups = "drop") |>   #Summarise data to find maximum body mass
  pivot_wider(names_from = island, values_from = max_mass)   #Pivot dataset wider so species form the rows, and islands form the columns.


print(penguin_summary)

# A tibble: 3 × 4
  species   Biscoe Dream Torgersen
  <fct>      <int> <int>     <int>
1 Adelie      4775  4650      4700
2 Chinstrap     NA  4800        NA
3 Gentoo      6300    NA        NA