1. Introduction and setup

1.4.3. R packages

“The examples in this guide use a variety of R packages for accessing eBird data, working with spatial data, data processing and manipulation, and model training. To install all the packages necessary to work through this guide, run the following code:”

if (!requireNamespace("remotes", quietly = TRUE)) {
  install.packages("remotes")
}
remotes::install_github("ebird/ebird-best-practices")

1.4.5 GIS data

“The data package mentioned in Section 1.4.1 contains a GeoPackage with all the necessary GIS data. However, for reference, the following code was used to generate the GIS dataset. Running this code will create a GeoPackage containing the necessary spatial layers in data/gis-data.gpkg.”

library(dplyr)
library(rnaturalearth)
library(sf)

# file to save spatial data
gpkg_file <- "data/gis-data.gpkg"
dir.create(dirname(gpkg_file), showWarnings = FALSE, recursive = TRUE)

# political boundaries
# land border with lakes removed
ne_land <- ne_download(scale = 50, category = "cultural",
  type = "admin_0_countries_lakes",
  returnclass = "sf") |>
  filter(CONTINENT %in% c("North America", "South America")) |>
  st_set_precision(1e6) |>
  st_union()

# country boundaries
ne_countries <- ne_download(scale = 50, category = "cultural",
  type = "admin_0_countries_lakes",
  returnclass = "sf") |>
  select(country = ADMIN, country_code = ISO_A2)

# state boundaries for united states
ne_states <- ne_download(scale = 50, category = "cultural",
  type = "admin_1_states_provinces",
  returnclass = "sf") |>
  filter(iso_a2 == "US") |>
  select(state = name, state_code = iso_3166_2)

# country lines
# downloaded globally then filtered to north america with st_intersects()
ne_country_lines <- ne_download(scale = 50, category = "cultural",
  type = "admin_0_boundary_lines_land",
  returnclass = "sf") |>
  st_geometry()
lines_on_land <- st_intersects(ne_country_lines, ne_land, sparse = FALSE) |>
  as.logical()
ne_country_lines <- ne_country_lines[lines_on_land]

# states, north america
ne_state_lines <- ne_download(scale = 50, category = "cultural",
  type = "admin_1_states_provinces_lines",
  returnclass = "sf") |>
  filter(ADM0_A3 %in% c("USA", "CAN")) |>
  # recode to two-letter ISO country codes
  mutate(iso_a2 = recode(ADM0_A3, USA = "US", CAN = "CA")) |>
  select(country = ADM0_NAME, country_code = iso_a2)

# save all layers to a geopackage
unlink(gpkg_file)
write_sf(ne_land, gpkg_file, "ne_land")
write_sf(ne_countries, gpkg_file, "ne_countries")
write_sf(ne_states, gpkg_file, "ne_states")
write_sf(ne_country_lines, gpkg_file, "ne_country_lines")
write_sf(ne_state_lines, gpkg_file, "ne_state_lines")

2 eBird data

Importing eBird data into R

I downloaded two tab-separated text files, one for the EBD (i.e., observation data) and one for the SED (i.e., checklist data).

The auk functions read_ebd() and read_sampling() are designed to import the EBD and SED, respectively, into R. First let's import the checklist data (SED).

library(auk)
## auk 0.8.0 is designed for EBD files downloaded after 2024-10-29. 
## No EBD data directory set, see ?auk_set_ebd_path to set EBD_PATH 
## eBird taxonomy version:  2024
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(readr)
library(sf)
## Linking to GEOS 3.13.0, GDAL 3.8.5, PROJ 9.5.1; sf_use_s2() is TRUE

Read in sampling / checklist data (SED)

I had to update the file name and the path (i.e., remove “data-raw”).

#setwd("~/Documents/best practices using eBird")
#setwd("~/Library/CloudStorage/GoogleDrive-mcpheem@uwosh.edu/My Drive/Research _ Active/Solar and wildlife/eBird data/best practices using eBird/practice data sets")
f_sed <- "ebd_US-GA_woothr_smp_relAug-2024/ebd_US-GA_woothr_smp_relAug-2024_sampling.txt"
checklists <- read_sampling(f_sed)
#glimpse(checklists)
2 Exercise: histogram of the distribution of distance traveled for traveling protocol checklists

This is the code from the eBird document.
https://rpubs.com/techanswers88/after_stat_in_ggplot describes the use of “after_stat”. In that document, they used after_stat to show percentages in a bar chart.

So I'm guessing that in the eBird case, the y value is the count in a given distance bin divided by the total count, i.e., a percentage. Thus, in their histogram, the y-axis is a percentage, not a raw frequency.

checklists_traveling <- filter(checklists, protocol_type == "Traveling")

ggplot(checklists_traveling) +
  aes(x = effort_distance_km) +
  geom_histogram(binwidth = 1, 
                 aes(y = after_stat(count / sum(count)))) +
  
  scale_y_continuous(limits = c(0, NA), labels = scales::label_percent()) +
  labs(x = "Distance traveled [km]",
       y = "% of eBird checklists",
       title = "Distribution of distance traveled on eBird checklists")

Read in observation data (EBD)

f_ebd <- "ebd_US-GA_woothr_smp_relAug-2024 copy.txt"
observations <- read_ebd(f_ebd)
#glimpse(observations)

2.5 Filtering to study region and season

For the examples used throughout this guide we'll only want observations from June for the last 10 years (2014-2023). In addition, we'll only use complete checklists (i.e., those for which all birds seen or heard were reported), which will allow us to produce detection/non-detection data. We can apply these filters using the filter() function from dplyr.

#names(checklists)
checklists <- checklists |> 
  filter(all_species_reported,
    between(year(observation_date), 2014, 2023),
    month(observation_date) == 6)

# this keeps rows where all_species_reported is TRUE (complete checklists), the year of observation_date is between 2014 and 2023, and the month of observation_date is June (month 6)
observations <- observations |> 
  filter(all_species_reported,
    between(year(observation_date), 2014, 2023),
    month(observation_date) == 6)

** It's absolutely critical that we filter the observation and checklist data in exactly the same way to produce exactly the same population of checklists. Otherwise, the zero-filling we do in the next section will fail. **

Finally, there are rare situations in which some observers in a group of shared checklists quite drastically change their checklists, for example, changing the location or switching their checklist from complete to incomplete. In these cases, it's possible to end up with a mismatch between the checklists in the observation dataset and the checklist dataset. We can resolve this very rare issue by removing any checklists from the observation dataset that do not appear in the checklist dataset.

observations <- semi_join(observations, checklists, by = "checklist_id")
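For reference, here is a toy illustration (made-up data, not from the eBird files) of what semi_join() does: rows of the first table are kept only when their key appears in the second table, and no columns from the second table are added.

x <- data.frame(checklist_id = c("S1", "S2", "S3"), obs = 1:3)
y <- data.frame(checklist_id = c("S1", "S3"))
semi_join(x, y, by = "checklist_id")
# returns only the rows for S1 and S3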

2.6 Zero-filling

We can use auk_zerofill() to combine the checklist and observation data together to produce zero-filled, detection/non-detection data.

zf <- auk_zerofill(observations, checklists, collapse = TRUE)
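A quick look at the key columns (my own addition; these column names are produced by auk_zerofill()) shows what zero-filling did: every checklist now has a row for Wood Thrush, with species_observed = FALSE and observation_count = "0" where the species was not reported.

zf |>
  select(checklist_id, observation_count, species_observed) |>
  head()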

Before continuing, we'll transform some of the variables to a more useful form for modelling. We:

    1. convert time to a decimal value between 0 and 24,
    2. force the distance traveled to 0 for stationary checklists,
    3. create a new variable for speed,
    4. convert duration to hours, and
    5. split date into year and day of year.

Notably, eBirders have the option of entering an “X” rather than a count for a species, to indicate that the species was present but they didn't keep track of how many individuals were observed. During the modeling stage, we'll want the observation_count variable stored as an integer, so we also:

    6. convert “X” to NA to allow for this.
# convert an HH:MM:SS time string to decimal hours since midnight
time_to_decimal <- function(x) {
  x <- hms(x, quiet = TRUE)
  hour(x) + minute(x) / 60 + second(x) / 3600
}
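# quick sanity check of the helper (my own addition): "06:30:00" should give 6.5
time_to_decimal("06:30:00")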
zf <- zf |>
  # (1) convert time to decimal hours since midnight
  mutate(
    hours_of_day = time_to_decimal(time_observations_started),

  # (2) effort_distance_km to 0 for stationary counts
    effort_distance_km = if_else(protocol_type == "Stationary", 0, effort_distance_km),
      
  # (4) convert duration to hours
    effort_hours = (duration_minutes / 60),

  # (3) speed km/h
    effort_speed_kmph = (effort_distance_km / effort_hours),

  # (5) split date into year and day of year
    year = year(observation_date), 
    day_of_year = yday(observation_date),
    
  # (6) convert count to integer and X to NA; ignore the warning "NAs introduced by coercion" 
    observation_count = as.integer(observation_count)
  )
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `observation_count = as.integer(observation_count)`.
## Caused by warning:
## ! NAs introduced by coercion

2.7 Accounting for variation in effort

To deal with variation in detectability, we impose some more consistent structure on the data by filtering observations on the effort variables. This reduces the variation in detectability between checklists. We suggest restricting checklists to traveling or stationary counts less than 6 hours in duration and 10 km in length, at speeds below 100 km/h, and with 10 or fewer observers.

  • Note that these filtering parameters are based on making predictions at weekly temporal resolution and 3 km spatial resolution. For applications requiring higher spatial precision, stricter filtering on effort_distance_km should be used.
zf_filtered <- zf |>
  filter(protocol_type %in% c("Stationary","Traveling"),
    effort_hours <= 6,
    effort_distance_km <= 10,
    effort_speed_kmph <= 100,
    number_observers <= 10)
2.7 Exercise:

Pick one of the four effort variables we filtered on above and explore how much variation remains.

mean(zf_filtered$effort_hours)
## [1] 0.7544921
var(zf_filtered$effort_hours)
## [1] 0.7500433
ggplot(zf_filtered) +
  aes(x = effort_hours) +
  geom_histogram(binwidth = 0.5, 
                 aes(y = after_stat(count / sum(count)))) +
  scale_y_continuous(limits = c(0, NA), labels = scales::label_percent()) +
  labs(x = "Duration [hours]",
       y = "% of eBird checklists",
       title = "Distribution of eBird checklist duration")

2.8 Test-train split

For the modeling exercises used in this guide, we’ll hold aside a portion of the data from training to be used as an independent test set to assess the predictive performance of the model. Specifically, we’ll randomly split the data into 80% of checklists for training and 20% for testing. To facilitate this, we create a new variable type that will indicate whether the observation falls in the test set or training set.

zf_filtered$type <- if_else(runif(nrow(zf_filtered)) <= 0.8, "train", "test")
# confirm the proportion in each set is correct
table(zf_filtered$type) / nrow(zf_filtered)
## 
##      test     train 
## 0.1995728 0.8004272
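One thing to note (my addition, not from the guide): because the assignment uses runif(), the exact split will change every time the code is run. For a reproducible split, set a random seed immediately before making the assignment:

# any fixed seed works; 1 is arbitrary
set.seed(1)
zf_filtered$type <- if_else(runif(nrow(zf_filtered)) <= 0.8, "train", "test")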

Finally, there are a large number of variables in the EBD that are redundant (e.g. both state names and codes are present) or unnecessary for most modeling exercises (e.g. checklist comments and Important Bird Area codes). These can be removed at this point, keeping only the variables we want for modelling. Then we’ll save the resulting zero-filled observations for use in later chapters.

checklists <- zf_filtered |> 
  select(checklist_id, observer_id, type,
         observation_count, species_observed, 
         state_code, locality_id, latitude, longitude,
         protocol_type, all_species_reported,
         observation_date, year, day_of_year,
         hours_of_day, 
         effort_hours, effort_distance_km, effort_speed_kmph,
         number_observers)
write_csv(checklists, "checklists-zf_woothr_jun_us-ga copy.csv", na = "")

2.9 Exploratory analysis and visualization

Make a simple map of the observations. “This map uses GIS data available for download in the data package. Unzip this data package and place the contents in your RStudio project folder.”

** When I clicked on the link to download the data package I got a 404 error, but I think the same GIS layers were generated by the code in chunk 2 above. **

ne_land <- read_sf("ebird-best-practices-data _ complete/data/gis-data.gpkg","ne_land") |>
  st_geometry()
ne_country_lines <- read_sf("ebird-best-practices-data _ complete/data/gis-data.gpkg", "ne_country_lines") |>
  st_geometry()
ne_state_lines <- read_sf("ebird-best-practices-data _ complete/data/gis-data.gpkg", "ne_state_lines") |>
  st_geometry()
study_region <- read_sf("ebird-best-practices-data _ complete/data/gis-data.gpkg", "ne_states") |>
  filter(state_code == "US-GA") |>
  st_geometry()
checklists_sf <- checklists |> 
  # convert to spatial points
  st_as_sf(coords = c("longitude", "latitude"), crs = 4326) |> 
  select(species_observed)
par(mar = c(0.25, 0.25, 4, 0.25))

# set up plot area
plot(st_geometry(checklists_sf), 
     main = "Wood Thrush eBird Observations\nJune 2014-2023",
     col = NA, border = NA)

# contextual gis data
plot(ne_land, col = "#cfcfcf", border = "#888888", lwd = 0.5, add = TRUE)
plot(study_region, col = "#e6e6e6", border = NA, add = TRUE)
plot(ne_state_lines, col = "#ffffff", lwd = 0.75, add = TRUE)
plot(ne_country_lines, col = "#ffffff", lwd = 1.5, add = TRUE)

# ebird observations
# not observed
plot(filter(checklists_sf, !species_observed),
     pch = 19, cex = 0.1, col = alpha("#555555", 0.25),
     add = TRUE)

# observed
plot(filter(checklists_sf, species_observed),
     pch = 19, cex = 0.3, col = alpha("#4daf4a", 1),
     add = TRUE)

# legend
legend("bottomright", bty = "n",
       col = c("#555555", "#4daf4a"),
       legend = c("eBird checklist", "Wood Thrush sighting"),
       pch = 19)
box()

Exploring the effort variables is also a valuable exercise. For each effort variable, we’ll produce both a histogram and a plot of frequency of detection as a function of that effort variable. The histogram will tell us something about birder behavior. For example, what time of day are most people going birding, and for how long? We may also want to note values of the effort variable that have very few observations; predictions made in these regions may be unreliable due to a lack of data. The detection frequency plots tell us how the probability of detecting a species changes with effort.

2.9.1 Time of day

The first predictor of detection that we’ll explore is the time of day at which a checklist was started. We’ll summarize the data in 1 hour intervals, then plot them. Since estimates of detection frequency are unreliable when only a small number of checklists are available, we’ll only plot hours for which at least 100 checklists are present.

breaks <- seq(0, 24)
labels <- breaks[-length(breaks)] + diff(breaks) / 2

# breaks has 25 elements (0 through 24); breaks[-length(breaks)] drops the last edge, leaving 0:23, and adding half the bin width (diff(breaks) / 2 = 0.5) gives the bin midpoints 0.5, 1.5, ..., 23.5, which are used as the labels
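# quick demonstration (my own check): cut() assigns each start time to its
# one-hour interval and returns the matching midpoint label, which
# as.numeric(as.character(...)) later converts back to a number
cut(c(6.25, 17.9), breaks = breaks, labels = labels, include.lowest = TRUE)
# expected: 6.5 and 17.5 (as factor levels)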
checklists_time <- checklists |> 
  mutate(hour_bins = cut(hours_of_day, 
  # cut divides the range of x into intervals and codes the values in x according to which interval they fall. The leftmost interval corresponds to level one, the next leftmost to level two and so on.
  # hours_of_day was created in chunk 13 and put into checklists in chunk 18
                         breaks = breaks, 
                         labels = labels,
                         include.lowest = TRUE),
  # breaks defines the edges of the one-hour intervals; labels assigns each interval its midpoint so the bins can be converted back to numbers on the next line
         hour_bins = as.numeric(as.character(hour_bins))) |> 
  group_by(hour_bins) |> 
  summarise(n_checklists = n(),
            # n() returns the number of rows in the current group (one row per
            # hour bin); length() works on a single vector, not on grouped data
            n_detected = sum(species_observed),
            # mean of a logical vector is the proportion TRUE, i.e. the
            # detection frequency
            det_freq = mean(species_observed))
# nice description of n() versus count or just n:  https://stackoverflow.com/questions/67418568/what-is-the-difference-between-n-and-count-in-r-when-should-one-favour-the
g_tod_hist <- ggplot(checklists_time) +
  aes(x = hour_bins, y = n_checklists) +
  geom_segment(aes(xend = hour_bins, y = 0, yend = n_checklists),
               color = "grey50") +
  geom_point() +
  scale_x_continuous(breaks = seq(0, 24, by = 3), limits = c(0, 24)) +
  scale_y_continuous(labels = scales::comma) +
  # scales::comma presents numbers using commas to separate the thousands
  labs(x = "Hours since midnight",
       y = "# checklists",
       title = "Distribution of observation start times")
g_tod_freq <- ggplot(checklists_time |> filter(n_checklists > 100)) +
  aes(x = hour_bins, y = det_freq) +
  geom_line() +
  geom_point() +
  scale_x_continuous(breaks = seq(0, 24, by = 3), limits = c(0, 24)) +
  scale_y_continuous(labels = scales::percent) +
  labs(x = "Hours since midnight",
       y = "% checklists with detections",
       title = "Detection frequency")
grid.arrange(g_tod_hist, g_tod_freq)

2.9.2 Checklist duration

When we filtered the eBird data in Section 2.7 [chunk 14: additional filtering], we restricted observations to those from checklists 6 hours in duration or shorter to reduce variability. Let’s see what sort of variation remains in checklist duration.

breaks <- seq(0, 6)
labels <- breaks[-length(breaks)] + diff(breaks) / 2
checklists_duration <- checklists |> 
  mutate(duration_bins = cut(effort_hours, 
                             breaks = breaks, 
                             labels = labels,
                             include.lowest = TRUE),
         duration_bins = as.numeric(as.character(duration_bins))) |> 
  group_by(duration_bins) |> 
  summarise(n_checklists = n(),
            n_detected = sum(species_observed),
            det_freq = mean(species_observed))
g_duration_hist <- ggplot(checklists_duration) +
  aes(x = duration_bins, y = n_checklists) +
  geom_segment(aes(xend = duration_bins, y = 0, yend = n_checklists),
               color = "grey50") +
  geom_point() +
  scale_x_continuous(breaks = breaks) +
  scale_y_continuous(labels = scales::comma) +
  labs(x = "Checklist duration [hours]",
       y = "# checklists",
       title = "Distribution of checklist durations")
g_duration_freq <- ggplot(checklists_duration |> filter(n_checklists > 100)) +
  aes(x = duration_bins, y = det_freq) +
  geom_line() +
  geom_point() +
  scale_x_continuous(breaks = breaks) +
  scale_y_continuous(labels = scales::percent) +
  labs(x = "Checklist duration [hours]",
       y = "% checklists with detections",
       title = "Detection frequency")
grid.arrange(g_duration_hist, g_duration_freq)

2.9.3 Distance traveled

breaks <- seq(0, 10)
labels <- breaks[-length(breaks)] + diff(breaks) / 2
checklists_dist <- checklists |> 
  mutate(dist_bins = cut(effort_distance_km, 
                         breaks = breaks, 
                         labels = labels,
                         include.lowest = TRUE),
         dist_bins = as.numeric(as.character(dist_bins))) |> 
  group_by(dist_bins) |> 
  summarise(n_checklists = n(),
            n_detected = sum(species_observed),
            det_freq = mean(species_observed))
g_dist_hist <- ggplot(checklists_dist) +
  aes(x = dist_bins, y = n_checklists) +
  geom_segment(aes(xend = dist_bins, y = 0, yend = n_checklists),
               color = "grey50") +
  geom_point() +
  scale_x_continuous(breaks = breaks) +
  scale_y_continuous(labels = scales::comma) +
  labs(x = "Distance travelled [km]",
       y = "# checklists",
       title = "Distribution of distance travelled")
g_dist_freq <- ggplot(checklists_dist |> filter(n_checklists > 100)) +
  aes(x = dist_bins, y = det_freq) +
  geom_line() +
  geom_point() +
  scale_x_continuous(breaks = breaks) +
  scale_y_continuous(labels = scales::percent) +
  labs(x = "Distance travelled [km]",
       y = "% checklists with detections",
       title = "Detection frequency")
# combine
grid.arrange(g_dist_hist, g_dist_freq)

2.9.4 Number of observers

breaks <- seq(0, 10)
labels <- seq(1, 10)
checklists_obs <- checklists |> 
  mutate(obs_bins = cut(number_observers, 
                        breaks = breaks, 
                        labels = labels,
                        include.lowest = TRUE),
         obs_bins = as.numeric(as.character(obs_bins))) |> 
  group_by(obs_bins) |> 
  summarise(n_checklists = n(),
            n_detected = sum(species_observed),
            det_freq = mean(species_observed))
g_obs_hist <- ggplot(checklists_obs) +
  aes(x = obs_bins, y = n_checklists) +
  geom_segment(aes(xend = obs_bins, y = 0, yend = n_checklists),
               color = "grey50") +
  geom_point() +
  scale_x_continuous(breaks = breaks) +
  scale_y_continuous(labels = scales::comma) +
  labs(x = "# observers",
       y = "# checklists",
       title = "Distribution of the number of observers")
g_obs_freq <- ggplot(checklists_obs |> filter(n_checklists > 100)) +
  aes(x = obs_bins, y = det_freq) +
  geom_line() +
  geom_point() +
  scale_x_continuous(breaks = breaks) +
  scale_y_continuous(labels = scales::percent) +
  labs(x = "# observers",
       y = "% checklists with detections",
       title = "Detection frequency")
# combine
grid.arrange(g_obs_hist, g_obs_freq)

3 Environmental variables

3.2 Landcover

library(exactextractr)
library(landscapemetrics)
library(stringr)
library(terra)
## terra 1.8.42
library(tidyr)
## 
## Attaching package: 'tidyr'
## The following object is masked from 'package:terra':
## 
##     extract
library(units)
## udunits database from /Users/mcpheem/Library/R/arm64/4.5/library/units/share/udunits/udunits2.xml
library(viridis)
## Loading required package: viridisLite
landcover <- rast("ebird-best-practices-data _ complete/data-raw/landcover_mcd12q1_umd_us-ga_2014-2022.tif")
print(landcover)
## class       : SpatRaster 
## dimensions  : 1174, 1249, 9  (nrow, ncol, nlyr)
## resolution  : 463.3127, 463.3127  (x, y)
## extent      : -8132528, -7553851, 3362724, 3906653  (xmin, xmax, ymin, ymax)
## coord. ref. : +proj=sinu +lon_0=0 +x_0=0 +y_0=0 +R=6371007.181 +units=m +no_defs 
## source      : landcover_mcd12q1_umd_us-ga_2014-2022.tif 
## names       : 2014, 2015, 2016, 2017, 2018, 2019, ... 
## min values  :    0,    0,    0,    0,    0,    0, ... 
## max values  :   15,   15,   15,   15,   15,   15, ...
plot(as.factor(landcover[["2022"]]), 
     main = "MODIS Landcover 2022",
     axes = FALSE)

# My colors are very different than those in the BP doc

We've also included a lookup table in the data package (data-raw/mcd12q1_umd_classes.csv) that provides descriptions of each of these classes.

lc_classes <- read_csv("ebird-best-practices-data _ complete/data-raw/mcd12q1_umd_classes.csv")
## Rows: 15 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): name, label, description
## dbl (1): class
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
knitr::kable(lc_classes)
| class | name | label | description |
|------:|------|-------|-------------|
| 0 | Water bodies | water | At least 60% of area is covered by permanent water bodies. |
| 1 | Evergreen Needleleaf Forests | evergreen_needleleaf | Dominated by evergreen conifer trees (canopy >2m). Tree cover >60%. |
| 2 | Evergreen Broadleaf Forests | evergreen_broadleaf | Dominated by evergreen broadleaf and palmate trees (canopy >2m). Tree cover >60%. |
| 3 | Deciduous Needleleaf Forests | deciduous_needleleaf | Dominated by deciduous needleleaf (e.g. larch) trees (canopy >2m). Tree cover >60%. |
| 4 | Deciduous Broadleaf Forests | deciduous_broadleaf | Dominated by deciduous broadleaf trees (canopy >2m). Tree cover >60%. |
| 5 | Mixed Forests | mixed_forest | Dominated by neither deciduous nor evergreen (40-60% of each) tree type (canopy >2m). Tree cover >60%. |
| 6 | Closed Shrublands | closed_shrubland | Dominated by woody perennials (1-2m height) >60% cover. |
| 7 | Open Shrublands | open_shrubland | Dominated by woody perennials (1-2m height) 10-60% cover. |
| 8 | Woody Savannas | woody_savanna | Tree cover 30-60% (canopy >2m). |
| 9 | Savannas | savanna | Tree cover 10-30% (canopy >2m). |
| 10 | Grasslands | grassland | Dominated by herbaceous annuals (<2m). |
| 12 | Croplands | cropland | At least 60% of area is cultivated cropland. |
| 13 | Urban and Built-up Lands | urban | At least 30% impervious surface area including building materials, asphalt, and vehicles. |
| 15 | Non-Vegetated Lands | nonvegetated | At least 60% of area is non-vegetated barren (sand, rock, soil) or permanent snow and ice with less than 10% vegetation. |
| 255 | Unclassified | unclassified | Has not received a map label because of missing inputs. |

We’ll start by finding the full set of unique checklist locations for each year in the eBird data and buffer the points by 1.5km to generate 3 km diameter circular neighborhoods centered on each checklist location.

checklists <- read_csv("ebird-best-practices-data _ complete/data/checklists-zf_woothr_jun_us-ga.csv") |> 
  # landcover data not available for the full period, so we use the closest year
  mutate(year_lc = as.character(pmin(year, 2022)))
## Rows: 47863 Columns: 19
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (6): checklist_id, observer_id, type, state_code, locality_id, protoco...
## dbl  (10): observation_count, latitude, longitude, year, day_of_year, hours_...
## lgl   (2): species_observed, all_species_reported
## date  (1): observation_date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# pmin() returns the "parallel" (element-wise) minimum: pmin(c(2019, 2023, 2024), 2022) gives 2019 2022 2022, so checklist years after 2022 get matched to the most recent landcover year
checklists_sf <- checklists |> 
  # identify unique location/year combinations
  distinct(locality_id, year_lc, latitude, longitude) |> 
  # convert to spatial points; the 3 km diameter neighborhoods are generated below with st_buffer()
  st_as_sf(coords = c("longitude", "latitude"), crs = 4326)
buffers <- st_buffer(checklists_sf, dist = set_units(1.5, "km"))
From https://www.jessesadler.com/post/simple-feature-objects/:
“At its most basic, an sf object is a collection of simple features that includes attributes and geometries in the form of a data frame. In other words, it is a data frame (or tibble) with rows of features, columns of attributes, and a special geometry column that contains the spatial aspects of the features.”

st_as_sf converts a foreign object to an sf object.

Next, for each location, we crop and mask the landcover layer corresponding to the checklist year to the circular neighborhood around that checklist. Then we use calculate_lsm() from landscapemetrics to calculate pland and ed metrics within each neighborhood.

This step may take 30 minutes or longer to run.

lsm <- NULL
for (i in seq_len(nrow(buffers))) {
  buffer_i <- st_transform(buffers[i, ], crs = crs(landcover))
  year <- as.character(buffer_i$year_lc)
  
  # crop and mask landcover raster so all values outside buffer are missing
  lsm[[i]] <- crop(landcover[[year]], buffer_i) |> 
    mask(buffer_i) |> 
    # calculate landscape metrics
    calculate_lsm(level = "class", metric = c("pland", "ed")) |> 
    # add variables to uniquely identify each point
    mutate(locality_id = buffer_i$locality_id, 
           year_lc = buffer_i$year_lc) |> 
    select(locality_id, year_lc, class, metric, value)
}
lsm <- bind_rows(lsm)

Finally, we’ll transform the data frame so there’s one row per location and all the environmental variables appear as columns. For each location, any landcover classes that don’t appear within the buffer will not have associated pland and ed metrics; at this stage, we replace these implicit missing values with zeros using the complete list of classes in lc_classes. We also replace the opaque class numbers with more meaningful names from the class description file data-raw/mcd12q1_umd_classes.csv.
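Before running that on the full dataset, here is a toy illustration (hypothetical values, simplified to a single metric and location) of what complete() does: landcover classes that never occurred within a buffer get explicit rows with value = 0.

toy <- data.frame(locality_id = "L1", class = c(4, 9), value = c(61.2, 38.8))
complete(toy, locality_id, class = c(0, 4, 9, 13), fill = list(value = 0))
# adds rows for classes 0 and 13 with value = 0, keeping the observed rows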

lsm_wide <- lsm |> 
  # fill missing classes with zeros
  complete(nesting(locality_id, year_lc),
           class = lc_classes$class,
           metric = c("ed", "pland"),
           fill = list(value = 0)) |> 
  # bring in more descriptive names
  inner_join(select(lc_classes, class, label), by = "class") |> 
  # transform from long to wide format
  pivot_wider(values_from = value,
              names_from = c(class, label, metric),
              names_glue = "{metric}_c{str_pad(class, 2, pad = '0')}_{label}",
              names_sort = TRUE) |> 
  arrange(locality_id, year_lc)

3.3 Elevation

I originally skipped this because elevation is probably not an issue in Wisconsin, but the chunk “Map percent cover of deciduous broadleaf forest (class 4)” wouldn't run, so I'm adding this section back in to see if that solves the problem.

# elevation raster
elevation <- rast("ebird-best-practices-data _ complete/data-raw/elevation_gmted_1km_us-ga.tif")

# mean and standard deviation within each circular neighborhood
elev_buffer <- exact_extract(elevation, buffers, fun = c("mean", "stdev"),
                             progress = FALSE) |> 
  # add variables to uniquely identify each point
  mutate(locality_id = buffers$locality_id, year_lc = buffers$year_lc) |> 
  select(locality_id, year_lc, 
         elevation_mean = mean,
         elevation_sd = stdev)
env_variables <- inner_join(elev_buffer, lsm_wide,
                            by = c("locality_id", "year_lc"))

# attach and expand to checklists
env_variables <- checklists |> 
  select(checklist_id, locality_id, year_lc) |> 
  inner_join(env_variables, by = c("locality_id", "year_lc")) |> 
  select(-locality_id, -year_lc)

# save to csv, dropping any rows with missing variables
write_csv(drop_na(env_variables), 
          "ebird-best-practices-data _ complete/data/environmental-variables_checklists_jun_us-ga.csv", 
          na = "")

3.4 Prediction grid

In this section, we’ll create such a prediction grid for our study region (Georgia) using the MODIS landcover data from the most recent year for which they’re available in addition to elevation data. [Note - I never calculated the elevation data].

To start, we’ll create a template raster with cells equal in dimension to the diameter of the circular neighborhoods we used above. It’s important to use an equal area coordinate reference system for the prediction grid; we’ll use a Lambert’s azimuthal equal area projection centered on our study region.

# lambert's azimuthal equal area projection for georgia
laea_crs <- st_crs("+proj=laea +lat_0=33.2 +lon_0=-83.7")

# study region: georgia
study_region <- read_sf("ebird-best-practices-data _ complete/data/gis-data.gpkg", layer = "ne_states") |> 
  filter(state_code == "US-GA") |> 
  st_transform(crs = laea_crs)

# create a raster template covering the region with 3 km resolution
r <- rast(study_region, res = c(3000, 3000))

# fill the raster with 1s inside the study region
r <- rasterize(study_region, r, values = 1) |> 
  setNames("study_region")
## Warning: [rasterize] unexpected additional argument(s): values
# save for later use
r <- writeRaster(r, "ebird-best-practices-data _ complete/data/prediction-grid_us-ga.tif",
                 overwrite = TRUE,
                 gdal = "COMPRESS=DEFLATE")

Next, we extract the coordinates of the cell centers from the raster we just created, convert these to sf point features, and buffer these to generate 3 km circular neighborhoods.

buffers_pg <- as.data.frame(r, cells = TRUE, xy = TRUE) |> 
  select(cell_id = cell, x, y) |> 
  st_as_sf(coords = c("x", "y"), crs = laea_crs, remove = FALSE) |> 
  st_transform(crs = 4326) |> 
  st_buffer(set_units(3, "km"))

Now we can calculate the landcover and elevation variables exactly as we did for the eBird checklists in the previous two sections. First, the landscape metrics pland and ed from the landcover data. Note that we use the most recent year of landcover data (i.e. 2022) in this case.

# estimate landscape metrics for each cell in the prediction grid
lsm_pg <- NULL
for (i in seq_len(nrow(buffers_pg))) {
  buffer_i <- st_transform(buffers_pg[i, ], crs = crs(landcover))
  
  # crop and mask landcover raster so all values outside buffer are missing
  lsm_pg[[i]] <- crop(landcover[["2022"]], buffer_i) |> 
    mask(buffer_i) |> 
    # calculate landscape metrics
    calculate_lsm(level = "class", metric = c("pland", "ed")) |> 
    # add variable to uniquely identify each point
    mutate(cell_id = buffer_i$cell_id) |> 
    select(cell_id, class, metric, value)
}
lsm_pg <- bind_rows(lsm_pg)

# transform to wide format
lsm_wide_pg <- lsm_pg |> 
  # fill missing classes with zeros
  complete(cell_id,
           class = lc_classes$class,
           metric = c("ed", "pland"),
           fill = list(value = 0)) |> 
  # bring in more descriptive names
  inner_join(select(lc_classes, class, label), by = "class") |> 
  # transform from long to wide format
  pivot_wider(values_from = value,
              names_from = c(class, label, metric),
              names_glue = "{metric}_c{str_pad(class, 2, pad = '0')}_{label}",
              names_sort = TRUE,
              values_fill = 0) |> 
  arrange(cell_id)
elev_buffer_pg <- exact_extract(elevation, buffers_pg, 
                                fun = c("mean", "stdev"),
                                progress = FALSE) |> 
  # add variables to uniquely identify each point
  mutate(cell_id = buffers_pg$cell_id) |> 
  select(cell_id, elevation_mean = mean, elevation_sd = stdev)
env_variables_pg <- inner_join(elev_buffer_pg, lsm_wide_pg, by = "cell_id")

# attach the xy coordinates of the cell centers
env_variables_pg <- buffers_pg |> 
  st_drop_geometry() |> 
  select(cell_id, x, y) |> 
  inner_join(env_variables_pg, by = "cell_id")

# save as csv, dropping any rows with missing variables
write_csv(drop_na(env_variables_pg),
          "ebird-best-practices-data _ complete/data/environmental-variables_prediction-grid_us-ga.csv", 
          na = "")

Map percent cover of deciduous broadleaf forest (class 4).

forest_cover <- env_variables_pg |> 
  # convert to spatial features
  st_as_sf(coords = c("x", "y"), crs = laea_crs) |> 
  # rasterize points
  rasterize(r, field = "pland_c04_deciduous_broadleaf")

# make a map
par(mar = c(0.25, 0.25, 2, 0.25))
plot(forest_cover, 
     axes = FALSE, box = FALSE, col = viridis(10), 
     main = "Deciduous Broadleaf Forest (% cover)")

4 Encounter rate

4.2 Data preparation

library(dplyr)
library(ebirdst)
## Please cite the eBird Status and Trends data using: 
##   Fink, D., T. Auer, A. Johnston, M. Strimas-Mackey, S. Ligocki, O. Robinson, 
##   W. Hochachka, L. Jaromczyk, C. Crowley, K. Dunham, A. Stillman, I. Davies, 
##   A. Rodewald, V. Ruiz-Gutierrez, C. Wood. 2023.
##   eBird Status and Trends, Data Version: 2022; Released: 2023. Cornell Lab of
##   Ornithology, Ithaca, New York. https://doi.org/10.2173/ebirdst.2022
## 
## This version of the package provides access to the 2022 version of the eBird
## Status and Trends Data Products. Access to the 2022 data will be provided 
## until November 2024 when it will be replaced by the 2023 data. At that 
## point, you will be required to update this R package and transition to using 
## the new data.
library(fields)
## Loading required package: spam
## Spam version 2.11-1 (2025-01-20) is loaded.
## Type 'help( Spam)' or 'demo( spam)' for a short introduction 
## and overview of this package.
## Help for individual functions is also obtained by adding the
## suffix '.spam' to the function name, e.g. 'help( chol.spam)'.
## 
## Attaching package: 'spam'
## The following objects are masked from 'package:base':
## 
##     backsolve, forwardsolve
## 
## Try help(fields) to get started.
## 
## Attaching package: 'fields'
## The following object is masked from 'package:terra':
## 
##     describe
library(ggplot2)
library(gridExtra)
library(lubridate)
library(mccf1)
library(ranger)
library(readr)
library(scam)
## Loading required package: mgcv
## Loading required package: nlme
## 
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
## 
##     collapse
## This is mgcv 1.9-1. For overview type 'help("mgcv-package")'.
## This is scam 1.2-18.
library(sf)
library(terra)
library(tidyr)
# set random number seed for reproducibility
set.seed(1)

# environmental variables: landcover and elevation
env_vars <- read_csv("ebird-best-practices-data _ complete/data/environmental-variables_checklists_jun_us-ga.csv")
## Rows: 47863 Columns: 33
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): checklist_id
## dbl (32): elevation_mean, elevation_sd, ed_c00_water, pland_c00_water, ed_c0...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# zero-filled ebird data combined with environmental data
checklists_env <- read_csv("ebird-best-practices-data _ complete/data/checklists-zf_woothr_jun_us-ga.csv") |> 
  inner_join(env_vars, by = "checklist_id")
## Rows: 47863 Columns: 19
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (6): checklist_id, observer_id, type, state_code, locality_id, protoco...
## dbl  (10): observation_count, latitude, longitude, year, day_of_year, hours_...
## lgl   (2): species_observed, all_species_reported
## date  (1): observation_date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# prediction grid
pred_grid <- read_csv("ebird-best-practices-data _ complete/data/environmental-variables_prediction-grid_us-ga.csv")
## Rows: 16940 Columns: 35
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (35): cell_id, x, y, elevation_mean, elevation_sd, ed_c00_water, pland_c...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# raster template for the grid
r <- rast("ebird-best-practices-data _ complete/data/prediction-grid_us-ga.tif")
# get the coordinate reference system of the prediction grid
crs <- st_crs(r)

# load gis data for making maps
study_region <- read_sf("ebird-best-practices-data _ complete/data/gis-data.gpkg", "ne_states") |> 
  filter(state_code == "US-GA") |> 
  st_transform(crs = crs) |> 
  st_geometry()
ne_land <- read_sf("ebird-best-practices-data _ complete/data/gis-data.gpkg", "ne_land") |> 
  st_transform(crs = crs) |> 
  st_geometry()
ne_country_lines <- read_sf("ebird-best-practices-data _ complete/data/gis-data.gpkg", "ne_country_lines") |> 
  st_transform(crs = crs) |> 
  st_geometry()
ne_state_lines <- read_sf("ebird-best-practices-data _ complete/data/gis-data.gpkg", "ne_state_lines") |> 
  st_transform(crs = crs) |> 
  st_geometry()

4.3 Spatiotemporal subsampling

This is an example of subsampling. The full set of points is in black; the subsampled points are in red.

# generate random points for a single week of the year
pts <- data.frame(longitude = runif(500, -0.1, 0.1),
                  latitude = runif(500, -0.1, 0.1),
                  day_of_year = sample(1:7, 500, replace = TRUE))

# sample one checklist per grid cell
# by default grid_sample() uses a 3km x 3km x 1 week grid
pts_ss <- grid_sample(pts)

# generate polygons for the grid cells
ggplot(pts) +
  aes(x = longitude, y = latitude) +
  geom_point(size = 0.5) +
  geom_point(data = pts_ss, col = "red") +
  theme_bw() # this gets rid of the gray

Now let’s apply exactly the same approach to subsampling the real eBird checklists; however, now we subsample temporally in addition to spatially, and sample detections and non-detections separately.

Also, recall from Section 2.8 that we split the data 80/20 into train/test sets. Using the sample_by argument to grid_sample_stratified(), we can independently sample from the train and test sets to remove bias from both.

# sample one checklist per 3km x 3km x 1 week grid for each year
# sample detection/non-detection independently 
checklists_ss <- grid_sample_stratified(checklists_env,
                                        obs_column = "species_observed",
                                        sample_by = "type")
4.3 Exercise:

Compare the full set of eBird checklists to the set of checklists remaining after subsampling. What was the change in sample size, and how did the subsampling affect the prevalence of detections relative to non-detections?

ggplot(checklists_env) +
  aes(x = longitude, y = latitude) +
  geom_point(size = 0.5) +
  geom_point(data = checklists_ss, col = "red") +
  theme_bw()

nrow(checklists_env)
## [1] 47863
count(checklists_env, species_observed) |> 
  mutate(percent = n / sum(n))
## # A tibble: 2 × 3
##   species_observed     n percent
##   <lgl>            <int>   <dbl>
## 1 FALSE            42724   0.893
## 2 TRUE              5139   0.107
nrow(checklists_ss)
## [1] 20642
count(checklists_ss, species_observed) |> 
  mutate(percent = n / sum(n))
## # A tibble: 2 × 3
##   species_observed     n percent
##   <lgl>            <int>   <dbl>
## 1 FALSE            17744   0.860
## 2 TRUE              2898   0.140

The subsampling decreased the overall number of checklists by a factor of about 2.3, but increased the prevalence of detections from 10.7% to 14.0%.
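A quick arithmetic check of those two numbers:

nrow(checklists_env) / nrow(checklists_ss)   # 47863 / 20642 = ~2.32
mean(checklists_env$species_observed)        # ~0.107 before subsampling
mean(checklists_ss$species_observed)         # ~0.140 after subsampling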

Now let’s look at how the subsampling affects the spatial distribution of the training observations.

# convert checklists to spatial features
all_pts <- checklists_env |>  
  filter(type == "train") |> 
  st_as_sf(coords = c("longitude","latitude"), crs = 4326) |>
  st_transform(crs = crs) |> 
  select(species_observed)
ss_pts <- checklists_ss |>  
  filter(type == "train") |> 
  st_as_sf(coords = c("longitude","latitude"), crs = 4326) |>
  st_transform(crs = crs) |> 
  select(species_observed)
both_pts <- list(before_ss = all_pts, after_ss = ss_pts)

# map
p <- par(mfrow = c(1, 2))
for (i in seq_along(both_pts)) {
  par(mar = c(0.25, 0.25, 0.25, 0.25))
  # set up plot area
  plot(st_geometry(both_pts[[i]]), col = NA)
  # contextual gis data
  plot(ne_land, col = "#dddddd", border = "#888888", lwd = 0.5, add = TRUE)
  plot(study_region, col = "#cccccc", border = NA, add = TRUE)
  plot(ne_state_lines, col = "#ffffff", lwd = 0.75, add = TRUE)
  plot(ne_country_lines, col = "#ffffff", lwd = 1.5, add = TRUE)
  # ebird observations
  # not observed
  plot(st_geometry(both_pts[[i]]),
       pch = 19, cex = 0.1, col = alpha("#555555", 0.25),
       add = TRUE)
  # observed
  plot(filter(both_pts[[i]], species_observed) |> st_geometry(),
       pch = 19, cex = 0.3, col = alpha("#4daf4a", 0.5),
       add = TRUE)
  # legend
  legend("bottomright", bty = "n",
         col = c("#555555", "#4daf4a"),
         legend = c("Non-detection", "Detection"),
         pch = 19)
  box()
  par(new = TRUE, mar = c(0, 0, 3, 0))
  if (names(both_pts)[i] == "before_ss") {
    title("Wood Thrush eBird Observations\nBefore subsampling")
  } else {
    title("After subsampling")
  }
}

par(p)

4.4 Random forests

checklists_train <- checklists_ss |> 
  filter(type == "train") |> 
  # select only the columns to be used in the model
  select(species_observed,
         year, day_of_year, hours_of_day,
         effort_hours, effort_distance_km, effort_speed_kmph,
         number_observers, 
         starts_with("pland_"),
         starts_with("ed_"),
         starts_with("elevation_"))
# frequency of detections, used below to balance the two classes
detection_freq <- mean(checklists_train$species_observed)
# ranger requires a factor response to do classification
er_model <- ranger(formula = as.factor(species_observed) ~ ., 
                   data = checklists_train,
                   importance = "impurity",
                   probability = TRUE,
                   # sample detections and non-detections in equal numbers for
                   # each tree (a balanced random forest)
                   replace = TRUE, 
                   sample.fraction = c(detection_freq, detection_freq))

4.4.1 Calibration

To train a calibration model for our predictions, we predict encounter rate for each checklist in the training set, then fit a binomial Generalized Additive Model (GAM) with the real observed encounter rate as the response and the predicted encounter rate as the predictor variable.

# predicted encounter rate based on out-of-bag samples; column 2 of the
# prediction matrix is the probability of the TRUE (detected) class
er_pred <- er_model$predictions[, 2]
# observed detection/non-detection as 0/1 (species_observed is logical)
det_obs <- as.integer(checklists_train$species_observed)
# construct a data frame to train the scam model
obs_pred <- data.frame(obs = det_obs, pred = er_pred)

# train calibration model
# bs = "mpi" fits a monotonically increasing smooth, so the calibrated
# encounter rate can never decrease as the predicted encounter rate increases
calibration_model <- scam(obs ~ s(pred, k = 6, bs = "mpi"), 
                          gamma = 2,
                          data = obs_pred)

To use the calibration model as a diagnostic tool, we’ll group the predicted encounter rates into bins, then calculate the mean predicted and observed encounter rates within each bin. This can be compared to predictions from the calibration model.

# group the predicted encounter rate into bins of width 0.02
# then calculate the mean observed encounter rates in each bin
er_breaks <- seq(0, 1, by = 0.02)
mean_er <- obs_pred |>
  mutate(er_bin = cut(pred, breaks = er_breaks, include.lowest = TRUE)) |>
  group_by(er_bin) |>
  summarise(n_checklists = n(),
            pred = mean(pred), 
            obs = mean(obs),
            .groups = "drop")

# make predictions from the calibration model
calibration_curve <- data.frame(pred = er_breaks)
cal_pred <- predict(calibration_model, calibration_curve, type = "response")
calibration_curve$calibrated <- cal_pred

# compare binned mean encounter rates to the calibration model
ggplot(calibration_curve) +
  aes(x = pred, y = calibrated) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
  geom_line(color = "blue") +
  geom_point(data = mean_er, 
             aes(x = pred, y = obs),
             size = 2, alpha = 0.6,
             show.legend = FALSE) +
  labs(x = "Estimated encounter rate",
       y = "Observed encounter rate",
       title = "Calibration model") +
  coord_equal(xlim = c(0, 1), ylim = c(0, 1))
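Although my notes stop here, the purpose of the calibration model is to adjust encounter rate predictions made later (e.g., on the test set or the prediction grid). A minimal sketch of how that would look, assuming a vector of raw random forest predictions called pred_raw (a name I made up for illustration):

# hypothetical raw predictions from the random forest
pred_raw <- c(0.05, 0.20, 0.60)
# apply the calibration model, then constrain to valid probabilities in [0, 1]
pred_cal <- predict(calibration_model,
                    newdata = data.frame(pred = pred_raw),
                    type = "response") |>
  as.numeric() |>
  pmin(1) |>
  pmax(0)
pred_cal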