Learn More about the Red-eyed Verio from All About Birds (https://www.allaboutbirds.org/guide/Red-eyed_Vireo/id)
library(auk)
library(lubridate)
library(sf)
library(gridExtra)
library(tidyverse)
# resolve namespace conflicts
select <- dplyr::select
# setup
dir.create("data", showWarnings = FALSE)
# Read eBird data using auk
ebd <- auk_ebd("C:/PENNSTATE/GEOG588_Analytical Approaches_/Term_Project/Feb_22_TermProject/Term_Project/Data/ebd_US-WI-007_200201_202501_smp_relJan-2025.txt",
file_sampling = "C:/PENNSTATE/GEOG588_Analytical Approaches_/Term_Project/Feb_22_TermProject/Term_Project/Data/ebd_US-WI-007_200201_202501_smp_relJan-2025_sampling.txt")
ebd_filters <- ebd %>%
auk_species("Red-eyed Vireo") %>%
auk_date(date = c("2002-01-01", "2025-01-01")) %>%
# restrict to the standard traveling and stationary count protocols
auk_protocol(protocol = c("Stationary", "Traveling")) %>%
auk_complete()
# Define and create data dictionary
data_dir <- "data"
if (!dir.exists(data_dir)) {
dir.create(data_dir)
}
f_ebd <- file.path(data_dir, "ebd_Red_Eyed_Vireo.txt")
f_sampling <- file.path(data_dir, "ebd_checklists_Red_eye_Vireo.txt")
# run if the files don't already exist
if (!file.exists(f_ebd)) {
auk_filter(ebd_filters, file = f_ebd, file_sampling = f_sampling)
}
#Importing and Zero-filling
ebd_zf <- auk_zerofill(f_ebd, f_sampling, collapse = TRUE)
# function to convert time observation to hours since midnight
time_to_decimal <- function(x) {
x <- hms(x, quiet = TRUE)
hour(x) + minute(x) / 60 + second(x) / 3600
}
# clean up variables
ebd_zf <- ebd_zf %>%
mutate(
# convert X to NA
observation_count = if_else(observation_count == "X",
NA_character_, observation_count),
observation_count = as.integer(observation_count),
# effort_distance_km to 0 for non-travelling counts
effort_distance_km = if_else(protocol_type != "Traveling",
0, effort_distance_km),
# convert time to decimal hours since midnight
time_observations_started = time_to_decimal(time_observations_started),
# split date into year and day of year
year = year(observation_date),
day_of_year = yday(observation_date)
)
# additional filtering
ebd_zf_filtered <- ebd_zf %>%
filter(
# effort filters
duration_minutes <= 5 * 60,
effort_distance_km <= 5,
# 10 or fewer observers
number_observers <= 10)
ebird <- ebd_zf_filtered %>%
select(checklist_id, observer_id, sampling_event_identifier,
scientific_name,
observation_count, species_observed,
state_code, locality_id, latitude, longitude,
protocol_type, all_species_reported,
observation_date, year, day_of_year,
time_observations_started,
duration_minutes, effort_distance_km,
number_observers)
write_csv(ebird, "data/ebd_Red_Eyed_Vireo_zf.csv", na = "")
#Plotting Observations types-Checklist vs sightings.
library(ggplot2)
library(sf)
library(scales) # for alpha color transparency
# Prepare eBird data for mapping (already cleaned and filtered)
ebird_sf <- ebd_zf_filtered %>%
# Convert to spatial points
st_as_sf(coords = c("longitude", "latitude"), crs = 4326)
ggplot() +
# Plot eBird observations
geom_sf(data = ebird_sf, aes(color = species_observed), size = 1, alpha = 0.7) +
scale_color_manual(
values = c("FALSE" = alpha("#FF0000", 0.25), # Red for checklists
"TRUE" = alpha("#4daf4a", 1)), # Green for sightings
labels = c("eBird checklists", "Red-eyed Vireo Observitions")
) +
theme_minimal() +
labs(
title = "Red-eyed Vireo eBird Observations",
subtitle = "January 2002 to January 2025",
color = "Observation Type",
x = "Longitude",
y = "Latitude"
) +
theme(legend.position = "bottom")
# Filter for traveling protocol type
checklists_traveling <- filter(ebird, protocol_type == "Traveling")
# Calculate the 95th percentile for effort_distance_km within the Traveling protocol
distance_95th <- quantile(checklists_traveling$effort_distance_km, 0.95, na.rm = TRUE)
ggplot(checklists_traveling) +
aes(x = effort_distance_km, fill = effort_distance_km <= distance_95th) +
geom_histogram(binwidth = 1, boundary = 0, aes(y = after_stat(count / sum(count)))) +
scale_fill_manual(values = c("TRUE" = "blue", "FALSE" = "red"), guide = FALSE) +
scale_x_continuous(limits = c(0, NA)) + # Ensure x-axis starts at 0
scale_y_continuous(limits = c(0, NA), labels = scales::label_percent()) +
labs(
x = "Distance Traveled [km]",
y = "% of Traveling Protocol Checklists",
title = "Distribution of Distance Traveled on Traveling Protocol Checklists",
fill = "Within 95% Range"
) +
theme_minimal()
`
# Ensure the `observation_date` column is in Date format
ebird <- ebird %>%
mutate(observation_date = as.Date(observation_date, format = "%Y-%m-%d"))
# Add a "Year" column based on `observation_date`
ebird <- ebird %>%
mutate(Year = as.numeric(format(observation_date, "%Y")))
# Group by year and summarize the data
yearly_counts <- ebird %>%
group_by(Year) %>%
summarize(Count = n(), Unique_Observers = n_distinct(observer_id))
# Plot observations and unique observers
ggplot(yearly_counts, aes(x = Year)) +
geom_bar(aes(y = Count), stat = "identity", fill = "skyblue") +
geom_line(aes(y = Unique_Observers), color = "darkred", linewidth = 1) +
geom_point(aes(y = Unique_Observers), color = "darkred", size = 2) +
labs(x = "Year", y = "Number of Observations",
title = "Observations and Unique Observers by Year") +
scale_y_continuous(name = "Number of Observations",
sec.axis = sec_axis(~., name = "Number of Unique Observers")) +
theme_minimal()
# Ensure the `observation_date` column is in Date format
ebird <- ebird %>%
mutate(observation_date = as.Date(observation_date, format = "%Y-%m-%d"))
# Create a complete list of months in calendar order
all_months <- tibble(Month = factor(month.name, levels = month.name)) # Ensure months are treated as ordered factors
# Summarize the data by month, filtering out rows where `observation_count` is 0
monthly_counts <- ebird %>%
filter(observation_count != 0) %>% # Keep only rows with non-zero observation counts
mutate(Month = factor(format(observation_date, "%B"), levels = month.name)) %>% # Extract and order month names
group_by(Month) %>%
summarize(Count = n(), .groups = 'drop') %>%
right_join(all_months, by = "Month") %>% # Join with all months
replace_na(list(Count = 0)) # Replace missing counts with 0
# Plot
ggplot(monthly_counts, aes(x = Month, y = Count)) +
geom_bar(stat = "identity", fill = "skyblue") +
labs(x = "Month", y = "Observation Count",
title = "Count of Observations by Month (Non-Zero Counts Only)") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) # Rotate month labels
library(auk)
library(lubridate)
library(sf)
library(gridExtra)
library(tidyverse)
glimpse(ebird)
## Rows: 22,904
## Columns: 20
## $ checklist_id <chr> "S71389058", "S71410330", "S71410088", "S699…
## $ observer_id <chr> "obsr398748", "obsr652990", "obsr652990", "o…
## $ sampling_event_identifier <chr> "S71389058", "S71410330", "S71410088", "S699…
## $ scientific_name <chr> "Vireo olivaceus", "Vireo olivaceus", "Vireo…
## $ observation_count <int> 1, 2, 6, 4, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ species_observed <lgl> TRUE, TRUE, TRUE, TRUE, FALSE, FALSE, TRUE, …
## $ state_code <chr> "US-WI", "US-WI", "US-WI", "US-WI", "US-WI",…
## $ locality_id <chr> "L11881816", "L11885894", "L11885879", "L118…
## $ latitude <dbl> 46.67769, 46.22388, 46.22452, 46.57175, 46.2…
## $ longitude <dbl> -90.87994, -91.14767, -91.16111, -91.49045, …
## $ protocol_type <chr> "Traveling", "Traveling", "Traveling", "Trav…
## $ all_species_reported <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TR…
## $ observation_date <date> 2020-07-12, 2020-07-12, 2020-07-12, 2020-06…
## $ year <dbl> 2020, 2020, 2020, 2020, 2020, 2020, 2020, 20…
## $ day_of_year <dbl> 194, 194, 194, 154, 216, 216, 202, 203, 203,…
## $ time_observations_started <dbl> 10.016667, 16.166667, 15.416667, 9.750000, 9…
## $ duration_minutes <int> 38, 120, 40, 20, 20, 11, 31, 58, 13, 22, 6, …
## $ effort_distance_km <dbl> 1.094, 0.400, 0.500, 0.402, 1.282, 0.000, 2.…
## $ number_observers <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1,…
## $ Year <dbl> 2020, 2020, 2020, 2020, 2020, 2020, 2020, 20…
# `ebird` as data set
ebird_sf <- st_as_sf(ebird, coords = c("longitude", "latitude"), crs = 4326)
# Load NLCD data
nlcd_data <- st_read("C:/PENNSTATE/GEOG588_Analytical Approaches_/Term_Project/Feb_22_TermProject/Term_Project/Data/Term_Spatial_Data.gdb",
layer = "NLCD_Bayfield_Dissolve")
## Reading layer `NLCD_Bayfield_Dissolve' from data source
## `C:\PENNSTATE\GEOG588_Analytical Approaches_\Term_Project\Feb_22_TermProject\Term_Project\Data\Term_Spatial_Data.gdb'
## using driver `OpenFileGDB'
## Simple feature collection with 15 features and 3 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: 318006 ymin: 734596.1 xmax: 381486 ymax: 837736.1
## Projected CRS: North_America_Albers_Equal_Area_Conic
#Change coordinate system of ebird_sf to match the NLCD data
ebird_sf <- st_transform(ebird_sf, crs = st_crs(nlcd_data))
#Join NLCD with ebird Data
ebird_nlcd <- st_join(ebird_sf, nlcd_data, join = st_intersects)
# Filter for Traveling protocol type and calculate encounter rates
Traveling_encounter_rates <- ebird_nlcd %>%
filter(protocol_type == "Traveling") %>% # Include only Traveling protocol type
group_by(ClassName) %>%
summarize(TotalObservations = sum(observation_count, na.rm = TRUE),
TotalEffort = sum(effort_distance_km, na.rm = TRUE),
EncounterRate = TotalObservations / TotalEffort)
library(ggplot2)
ggplot(Traveling_encounter_rates, aes(x = reorder(ClassName, EncounterRate), y = EncounterRate)) +
geom_bar(stat = "identity", fill = "skyblue") +
coord_flip() +
labs(
title = "Encounter Rates by NLCD Class\n(Traveling Protocol)",
x = "NLCD Class",
y = "Encounter Rate (Observations per Km)"
) +
theme_minimal() +
theme(
plot.title = element_text(hjust = 0.5, size = 16, face = "bold"),
axis.text = element_text(size = 12),
axis.title = element_text(size = 14)
)
library(dplyr)
library(sf)
library(randomForest)
library(ggplot2)
# Load NLCD data
nlcd_data <- st_read("C:/PENNSTATE/GEOG588_Analytical Approaches_/Term_Project/Feb_22_TermProject/Term_Project/Data/Term_Spatial_Data.gdb",
layer = "NLCD_Bayfield_Dissolve")
## Reading layer `NLCD_Bayfield_Dissolve' from data source
## `C:\PENNSTATE\GEOG588_Analytical Approaches_\Term_Project\Feb_22_TermProject\Term_Project\Data\Term_Spatial_Data.gdb'
## using driver `OpenFileGDB'
## Simple feature collection with 15 features and 3 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: 318006 ymin: 734596.1 xmax: 381486 ymax: 837736.1
## Projected CRS: North_America_Albers_Equal_Area_Conic
# transform it to an sf object
ebird_sf <- st_as_sf(ebird, coords = c("longitude", "latitude"), crs = 4326)
# Change coordinate system of ebird_sf to match NLCD data
ebird_sf <- st_transform(ebird_sf, crs = st_crs(nlcd_data))
# Join NLCD with ebird data
ebird_nlcd <- st_join(ebird_sf, nlcd_data, join = st_intersects)
# Prepare data for Random Forest
rf_data_Traveling <- ebird_nlcd %>%
filter(protocol_type == "Traveling") %>%
mutate(EncounterRate = observation_count / effort_distance_km) %>%
filter(effort_distance_km > 0)
# Glimpse the prepared data
glimpse(rf_data_Traveling)
## Rows: 10,811
## Columns: 23
## $ checklist_id <chr> "S71389058", "S71410330", "S71410088", "S699…
## $ observer_id <chr> "obsr398748", "obsr652990", "obsr652990", "o…
## $ sampling_event_identifier <chr> "S71389058", "S71410330", "S71410088", "S699…
## $ scientific_name <chr> "Vireo olivaceus", "Vireo olivaceus", "Vireo…
## $ observation_count <int> 1, 2, 6, 4, 0, 5, 0, 0, 0, 0, 1, 0, 0, 0, 0,…
## $ species_observed <lgl> TRUE, TRUE, TRUE, TRUE, FALSE, TRUE, FALSE, …
## $ state_code <chr> "US-WI", "US-WI", "US-WI", "US-WI", "US-WI",…
## $ locality_id <chr> "L11881816", "L11885894", "L11885879", "L118…
## $ protocol_type <chr> "Traveling", "Traveling", "Traveling", "Trav…
## $ all_species_reported <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TR…
## $ observation_date <date> 2020-07-12, 2020-07-12, 2020-07-12, 2020-06…
## $ year <dbl> 2020, 2020, 2020, 2020, 2020, 2020, 2020, 20…
## $ day_of_year <dbl> 194, 194, 194, 154, 216, 202, 203, 215, 215,…
## $ time_observations_started <dbl> 10.016667, 16.166667, 15.416667, 9.750000, 9…
## $ duration_minutes <int> 38, 120, 40, 20, 20, 31, 58, 1, 22, 48, 18, …
## $ effort_distance_km <dbl> 1.094, 0.400, 0.500, 0.402, 1.282, 2.832, 0.…
## $ number_observers <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,…
## $ Year <dbl> 2020, 2020, 2020, 2020, 2020, 2020, 2020, 20…
## $ geometry <POINT [m]> POINT (368762.3 798819.6), POINT (3522…
## $ ClassName <chr> "Developed Open Space", "Woody Wetlands", "M…
## $ Shape_Length <dbl> 10048109, 14844551, 41588907, 10048109, 1484…
## $ Shape_Area <dbl> 137810129, 622722888, 1489469752, 137810129,…
## $ EncounterRate <dbl> 0.9140768, 5.0000000, 12.0000000, 9.9502488,…
# Summarize data for training
training_data <- rf_data_Traveling %>%
group_by(ClassName) %>%
summarize(
TotalObservations = sum(observation_count, na.rm = TRUE),
TotalEffort = sum(effort_distance_km, na.rm = TRUE),
MeanDuration = mean(duration_minutes, na.rm = TRUE),
EncounterRate = TotalObservations / TotalEffort, # Response variable
.groups = "drop"
)
# Train Random Forest Model
set.seed(123)
rf_model <- randomForest(
EncounterRate ~ ClassName + MeanDuration,
data = training_data,
importance = TRUE,
ntree = 500
)
print(rf_model)
##
## Call:
## randomForest(formula = EncounterRate ~ ClassName + MeanDuration, data = training_data, importance = TRUE, ntree = 500)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 1
##
## Mean of squared residuals: 0.1042197
## % Var explained: -18.03
# Prepare prediction dataset
prediction_data <- nlcd_data %>%
group_by(ClassName) %>%
summarize(
MeanArea = mean(Shape_Area, na.rm = TRUE),
MeanLength = mean(Shape_Length, na.rm = TRUE),
.groups = "drop"
) %>%
mutate(MeanDuration = mean(training_data$MeanDuration, na.rm = TRUE))
# Update Random Forest
set.seed(123)
rf_model_updated <- randomForest(
EncounterRate ~ ClassName,
data = training_data,
importance = TRUE,
ntree = 500
)
print(rf_model_updated)
##
## Call:
## randomForest(formula = EncounterRate ~ ClassName, data = training_data, importance = TRUE, ntree = 500)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 1
##
## Mean of squared residuals: 0.1634497
## % Var explained: -85.11
# Predict encounter rates using the model
prediction_data$PredictedEncounterRate <- predict(rf_model_updated, newdata = prediction_data)
# Plot
ggplot(prediction_data, aes(x = reorder(ClassName, PredictedEncounterRate), y = PredictedEncounterRate)) +
geom_bar(stat = "identity", fill = "cornflowerblue") +
coord_flip() +
labs(
title = "Predicted Encounter Rates by NLCD Class",
x = "NLCD Class",
y = "Predicted Encounter Rate"
) +
theme_minimal()
# Evaluate variable importance
varImpPlot(rf_model_updated)
# Summarize encounter rates for the Traveling protocol
Traveling_encounter_rates <- rf_data_Traveling %>%
group_by(ClassName) %>%
summarize(
EncounterRate = sum(observation_count, na.rm = TRUE) / sum(effort_distance_km, na.rm = TRUE), # Observations per Km
.groups = "drop"
)
# Bar plot of encounter rates for the Traveling protocol
library(ggplot2)
ggplot(Traveling_encounter_rates, aes(x = reorder(ClassName, EncounterRate), y = EncounterRate)) +
geom_bar(stat = "identity", fill = "skyblue") +
coord_flip() +
labs(
title = "Encounter Rates by NLCD Class (Traveling Protocol)",
x = "NLCD Class",
y = "Encounter Rate (Observations per Km)"
) +
theme_minimal() +
theme(
plot.title = element_text(hjust = 0.5, size = 16, face = "bold"),
axis.text = element_text(size = 12),
axis.title = element_text(size = 14)
)