Question 1: Most Observed Bird in California

Approach

  • Filtered the dataset to include only observations from California based on latitude and longitude.
  • Identified the most observed bird species by summing observation counts per species.
  • Created a map visualization using ggplot2 to display observation distribution.
# Fetch the bird observation subset from its hosted CSV.
recent_bird_subset <- read_csv(file = "https://tinyurl.com/recent-bird-subset")
## Rows: 100000 Columns: 22
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (8): loc_id, subnational1_code, entry_technique, sub_id, obs_id, PROJ_P...
## dbl (14): latitude, longitude, Month, Day, Year, how_many, valid, reviewed, ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Fetch the observation-site information table from its hosted CSV.
site_info <- read_csv(file = "https://tinyurl.com/observation-site-info")
## Rows: 254355 Columns: 62
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (2): loc_id, proj_period_id
## dbl (60): yard_type_pavement, yard_type_garden, yard_type_landsca, yard_type...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Keep only California observations.
#
# Every column Question 1 needs (state code, coordinates, species_code,
# how_many) already lives in recent_bird_subset, so site_info is not joined
# here. Joining on loc_id alone matches each observation to several site_info
# rows (a many-to-many join), which duplicates observations and inflates the
# per-species totals computed from this table.
#
# The state code is the primary filter because the latitude/longitude box on
# its own also covers parts of neighbouring states. The box is kept as a
# sanity check on the coordinates that end up on the map.
california_birds <- recent_bird_subset %>%
  filter(
    subnational1_code == "US-CA",
    latitude > 32, latitude < 42,
    longitude > -125, longitude < -114
  )
# Species code with the largest summed count among the California records.
most_observed_bird <- california_birds %>%
  group_by(species_code) %>%
  summarise(total_count = sum(how_many, na.rm = TRUE)) %>%
  arrange(desc(total_count)) %>%
  slice_head(n = 1) %>%
  pull(species_code)

# Summed count for that one species.
most_observed_count <- california_birds %>%
  filter(species_code == most_observed_bird) %>%
  pull(how_many) %>%
  sum(na.rm = TRUE)

# Label every record so the map can colour the top species separately.
california_birds <- mutate(
  california_birds,
  is_most_observed = if_else(
    condition = species_code == most_observed_bird,
    true = "Most Observed",
    false = "Other Birds"
  )
)

Figure 1. This map shows bird sightings across California, highlighting the most observed species. Data was filtered by latitude/longitude, and species counts were summed to identify the most frequently reported bird. Color coding differentiates the most observed species from others. The most observed bird species in California is houfin with 7347 sightings.

# Draw the California outline, then overlay every observation, coloured by
# whether it belongs to the most observed species.
ggplot() +
  geom_polygon(
    data = filter(map_data("state"), region == "california"),
    mapping = aes(x = long, y = lat, group = group),
    fill = "white",
    color = "black"
  ) +
  geom_point(
    data = california_birds,
    mapping = aes(x = longitude, y = latitude, color = is_most_observed)
  ) +
  scale_color_manual(
    values = c("Most Observed" = "red", "Other Birds" = "black")
  ) +
  labs(x = "Longitude", y = "Latitude", color = "Bird Type") +
  theme_minimal()

Question 2: Bird Observations by Habitat Type

Approach

  • Categorized observation sites into three habitat types: Residential, Industrial, and Agricultural.
  • Filtered observations to exclude extreme outliers using the 99th percentile threshold.
  • Created a boxplot to compare bird observation counts across habitat types.
# Classify each site record into a single habitat type. case_when() uses the
# first branch that matches, so a record flagged for several habitats is
# labelled by priority: Residential, then Industrial, then Agricultural.
# Records with none of the three flags above zero (including all-missing) fall
# through to "Other" and are dropped.
#
# proj_period_id is kept because site_info holds several rows per loc_id; it is
# needed to match each observation to the site record for its own period.
habitat_data <- site_info %>%
  select(
    loc_id, proj_period_id,
    hab_residential, hab_industrial, hab_agricultural
  ) %>%
  mutate(
    habitat_type = case_when(
      hab_residential > 0 ~ "Residential",
      hab_industrial > 0 ~ "Industrial",
      hab_agricultural > 0 ~ "Agricultural",
      TRUE ~ "Other"
    )
  ) %>%
  filter(habitat_type != "Other")

# Attach the habitat type to each observation, matching on site AND project
# period. Matching on loc_id alone pairs every observation with every period's
# site record (a many-to-many join) and duplicates observations.
# NOTE(review): the observation table's period column is assumed to be named
# PROJ_PERIOD_ID (the read_csv summary truncates it to "PROJ_P...") - confirm.
# If a many-to-many warning still appears, site_info is not unique per
# (loc_id, proj_period_id) and needs de-duplicating first.
#
# Counts above the 99th percentile of how_many are then removed as outliers.
bird_habitat_data <- recent_bird_subset %>%
  inner_join(
    habitat_data,
    by = join_by(loc_id, PROJ_PERIOD_ID == proj_period_id)
  ) %>%
  filter(how_many <= quantile(how_many, 0.99, na.rm = TRUE))
# Median count per habitat type, computed from the same outlier-filtered table
# that the boxplot draws, so the reported medians describe the plotted data.
# bird_habitat_data already carries habitat_type, so no further joins are
# needed; re-joining recent_bird_subset to site_info and habitat_data on loc_id
# alone would duplicate observations (many-to-many) and skip the outlier
# filter.
bird_habitat_summary <- bird_habitat_data %>%
  group_by(habitat_type) %>%
  summarize(median_count = median(how_many, na.rm = TRUE))
# Smallest and largest per-habitat medians, plus the habitat type holding the
# largest one (first match when several tie).
min_median_count <- with(bird_habitat_summary, min(median_count))
max_median_count <- with(bird_habitat_summary, max(median_count))
top_habitat <- with(bird_habitat_summary, habitat_type[which.max(median_count)])

Figure 2. This boxplot compares bird counts across Residential, Industrial, and Agricultural habitats. Sites were categorized based on environmental data, and extreme outliers (above the 99th percentile) were removed. The plot highlights differences in the distribution of counts across habitats. The median count was 2 in every habitat type, so the medians alone do not distinguish the habitats.

# Boxplot of observation counts per habitat type, zoomed to counts of 0-10.
if (nrow(bird_habitat_data) > 0) {
  ggplot(bird_habitat_data, aes(x = habitat_type, y = how_many, fill = habitat_type)) +
    geom_boxplot(outlier.shape = NA, alpha = 0.6) +
    # Zoom with coord_cartesian() rather than ylim(): ylim() discards every row
    # above 10 before the boxplot statistics are computed, which distorts the
    # quartiles and whiskers; coord_cartesian() only crops the view.
    coord_cartesian(ylim = c(0, 10)) +
    labs(x = "Habitat Type", y = "Number of Birds Observed", fill = "Habitat Type") +
    theme_minimal() +
    theme(legend.position = "none")
} else {
  print("No data available for plotting.")
}

Question 3: Bird Species Diversity by U.S. Region

Approach

  • Mapped U.S. states to four geographic regions: West, Midwest, South, and Northeast.
  • Calculated species diversity in each region by counting unique species codes.
  • Created a bar chart to compare bird species diversity across regions.
# Lookup table from state code to region, following the U.S. Census Bureau's
# four regions (50 states plus DC = 51 codes). Arizona and New Mexico belong
# to the West; without them their observations are silently dropped from the
# regional analysis.
region_mapping <- bind_rows(
  tibble(
    subnational1_code = c(
      "US-WA", "US-OR", "US-CA", "US-NV", "US-ID", "US-MT", "US-WY",
      "US-UT", "US-CO", "US-AZ", "US-NM", "US-AK", "US-HI"
    ),
    region = "West"
  ),
  tibble(
    subnational1_code = c(
      "US-ND", "US-SD", "US-NE", "US-KS", "US-MN", "US-IA", "US-MO",
      "US-WI", "US-IL", "US-IN", "US-MI", "US-OH"
    ),
    region = "Midwest"
  ),
  tibble(
    subnational1_code = c(
      "US-TX", "US-OK", "US-AR", "US-LA", "US-MS", "US-AL", "US-TN",
      "US-KY", "US-GA", "US-FL", "US-SC", "US-NC", "US-VA", "US-WV",
      "US-MD", "US-DE", "US-DC"
    ),
    region = "South"
  ),
  tibble(
    subnational1_code = c(
      "US-ME", "US-NH", "US-VT", "US-MA", "US-RI", "US-CT", "US-NY",
      "US-PA", "US-NJ"
    ),
    region = "Northeast"
  )
)

# Attach a region to each observation; records whose state code has no entry
# in region_mapping are dropped.
bird_region_data <- inner_join(
  recent_bird_subset,
  region_mapping,
  by = "subnational1_code"
)

# Number of distinct species codes per region, largest first.
#
# Built from bird_region_data, which already has a region on every row:
#   * observations with no mapped region are excluded, so an NA "region" can
#     never end up as the most or least diverse region;
#   * site_info is not joined, because none of its columns are used and
#     joining on loc_id alone is a many-to-many match that only multiplies rows.
region_species_diversity <- bird_region_data %>%
  group_by(region) %>%
  summarize(unique_species = n_distinct(species_code)) %>%
  arrange(desc(unique_species))
# region_species_diversity is sorted by descending species count, so the first
# row is the most diverse region and the last row the least diverse.
region_max <- with(region_species_diversity, region[1])
max_species <- with(region_species_diversity, unique_species[1])
region_min <- with(
  region_species_diversity,
  region[nrow(region_species_diversity)]
)
min_species <- with(
  region_species_diversity,
  unique_species[nrow(region_species_diversity)]
)

Figure 3. This bar chart displays species diversity across four U.S. regions—West, Midwest, South, and Northeast. Observations were grouped by region, and diversity was measured by counting unique species codes. The chart shows geographic differences in bird richness. Bird species diversity varies significantly across U.S. regions. The South region recorded the highest diversity with 209 unique species. In contrast, the Midwest region had the lowest diversity with only 116 unique species.

# Bar chart of species diversity (distinct species codes) per region.
# The bars are drawn from region_species_diversity so the chart shows the
# quantity the analysis reports; geom_bar() on the raw observations would
# count observation rows instead. Rows without a mapped region are left out.
if (nrow(region_species_diversity) > 0) {
  ggplot(
    filter(region_species_diversity, !is.na(region)),
    aes(x = region, y = unique_species, fill = region)
  ) +
    geom_col() +
    labs(x = "Region", y = "Number of Unique Species", fill = "Region") +
    theme_minimal()
} else {
  print("No data available for region-based plot.")
}

Question 4: Rhode Island’s State Bird vs. Most Observed Bird

Approach

  • Filtered the dataset to only include Rhode Island observations using subnational1_code.
  • Identified the state bird and compared its observation count with the most observed species.
  • Created a map visualization of Rhode Island, highlighting locations of the state bird and most observed bird.
# Rhode Island observations. Every column used below (state code, coordinates,
# species_code, how_many) is in recent_bird_subset, so site_info is not joined:
# matching on loc_id alone is a many-to-many join that duplicates observations
# and inflates the summed counts.
rhode_island_birds <- recent_bird_subset %>%
  filter(subnational1_code == "US-RI")

# Species code for the American Robin. The dataset uses six-letter species
# codes (e.g. "houfin", "blujay"), so the four-letter code "amro" never matches
# any row and always yields zero sightings.
# NOTE(review): Rhode Island's official state bird is the Rhode Island Red;
# confirm that the American Robin is the intended comparison species.
state_bird <- "amerob"
state_bird_sightings <- rhode_island_birds %>%
  filter(species_code == state_bird) %>%
  summarize(total_sightings = sum(how_many, na.rm = TRUE)) %>%
  pull(total_sightings)

# No fallback when the state bird has zero sightings: substituting the
# second-most-observed species would plot an unrelated bird under the
# "State Bird" label. A zero count is reported as zero.

# Species code with the largest summed count in Rhode Island.
most_observed_bird <- rhode_island_birds %>%
  group_by(species_code) %>%
  summarize(total_count = sum(how_many, na.rm = TRUE)) %>%
  arrange(desc(total_count)) %>%
  slice(1) %>%
  pull(species_code)

# Keep only the two species being compared. case_when() checks the state bird
# first, so if it is also the most observed species its rows are labelled
# "State Bird".
rhode_island_birds <- rhode_island_birds %>%
  mutate(bird_category = case_when(
    species_code == state_bird ~ "State Bird",
    species_code == most_observed_bird ~ "Most Observed Bird",
    TRUE ~ "Other Birds"
  )) %>%
  filter(bird_category != "Other Birds")
print(dim(rhode_island_birds))  # Debugging: Check dataset size

Figure 4. This map compares sightings of Rhode Island’s state bird, the American Robin, to the most frequently observed species. Data was filtered for Rhode Island, and species counts were summed. The map shows locations where both species were sighted. Rhode Island’s state bird, the American Robin, was sighted 0 times, compared to the most observed species, blujay.

# Map the retained Rhode Island sightings over the state outline, coloured by
# category; print a notice instead when there is nothing to plot.
if (nrow(rhode_island_birds) == 0) {
  print("No data available for Rhode Island plot.")
} else {
  ggplot() +
    geom_polygon(
      data = filter(map_data("state"), region == "rhode island"),
      mapping = aes(x = long, y = lat, group = group),
      fill = "gray90",
      color = "black"
    ) +
    geom_point(
      data = rhode_island_birds,
      mapping = aes(x = longitude, y = latitude, color = bird_category),
      alpha = 0.7
    ) +
    scale_color_manual(
      values = c("State Bird" = "red", "Most Observed Bird" = "blue")
    ) +
    labs(x = "Longitude", y = "Latitude", color = "Bird Type") +
    theme_minimal()
}