PNS eBird analysis

R Markdown

# Install packages if not already installed
required_packages <- c("tidyverse", "sf", "ggplot2", "rnaturalearth", "rnaturalearthdata", "knitr", "rlang", "gt", "webshot2", "rsconnect", "devtools", "raster", "geodata")
for (pkg in required_packages) {
  if (!require(pkg, character.only = TRUE)) {
    install.packages(pkg)
    library(pkg, character.only = TRUE)
  }
}

## Loading required package: tidyverse

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: sf
## 
## Linking to GEOS 3.11.0, GDAL 3.5.3, PROJ 9.1.0; sf_use_s2() is TRUE
## 
## Loading required package: rnaturalearth
## 
## Loading required package: rnaturalearthdata
## 
## 
## Attaching package: 'rnaturalearthdata'
## 
## 
## The following object is masked from 'package:rnaturalearth':
## 
##     countries110
## 
## 
## Loading required package: knitr
## 
## Loading required package: rlang
## 
## 
## Attaching package: 'rlang'
## 
## 
## The following objects are masked from 'package:purrr':
## 
##     %@%, flatten, flatten_chr, flatten_dbl, flatten_int, flatten_lgl,
##     flatten_raw, invoke, splice
## 
## 
## Loading required package: gt
## 
## Loading required package: webshot2
## 
## Loading required package: rsconnect
## 
## Loading required package: devtools
## 
## Loading required package: usethis
## 
## 
## Attaching package: 'devtools'
## 
## 
## The following object is masked from 'package:rsconnect':
## 
##     lint
## 
## 
## Loading required package: raster
## 
## Loading required package: sp
## 
## 
## Attaching package: 'raster'
## 
## 
## The following object is masked from 'package:dplyr':
## 
##     select
## 
## 
## Loading required package: geodata
## 
## Loading required package: terra
## 
## terra 1.8.5
## 
## 
## Attaching package: 'terra'
## 
## 
## The following object is masked from 'package:knitr':
## 
##     spin
## 
## 
## The following object is masked from 'package:tidyr':
## 
##     extract

# Load required libraries
# This shoudl work but didnt so did local installation devtools::install_github("ropensci/rnaturalearthhires", force = TRUE) 
devtools::install_local("rnaturalearthhires-master/")

## Skipping install of 'rnaturalearthhires' from a local remote, the SHA1 (1.0.0.90) has not changed since last install.
##   Use `force = TRUE` to force installation

library(tidyverse)     # Data manipulation and visualization
library(sf)            # Handling spatial data
library(ggplot2)       # Visualization
library(rnaturalearth) # To get world map data
library(rnaturalearthdata)
library(knitr)         # For dynamic reporting
library(rlang)         # Utilities for programming
library(gt)            # For creating tables
library(webshot2)      # For saving tables as images
library(rsconnect)     # For RPubs md
library(devtools)
library(rnaturalearthhires)
library(raster)
library(geodata)

## 1. Load the dataset
data <- read.csv("MyEBirdData 2.csv", stringsAsFactors = FALSE)
glimpse(data)

## Rows: 48,395
## Columns: 23
## $ Submission.ID          <chr> "S33291577", "S33291584", "S33291523", "S944804…
## $ Common.Name            <chr> "Black-bellied Whistling-Duck", "Black-bellied …
## $ Scientific.Name        <chr> "Dendrocygna autumnalis", "Dendrocygna autumnal…
## $ Taxonomic.Order        <int> 233, 233, 239, 248, 248, 248, 248, 248, 248, 24…
## $ Count                  <chr> "20", "10", "6", "1", "1", "10", "2", "2", "2",…
## $ State.Province         <chr> "NI-RI", "NI-RI", "NI-RI", "IN-MP", "IN-MP", "I…
## $ County                 <chr> "", "", "", "Anuppur", "Anuppur", "Bengaluru Ru…
## $ Location.ID            <chr> "L1004596", "L1004596", "L3670273", "L16346038"…
## $ Location               <chr> "Isla Ometepe--Charco Verde ", "Isla Ometepe--C…
## $ Latitude               <dbl> 11.48486, 11.48486, 11.51136, 23.11376, 23.0426…
## $ Longitude              <dbl> -85.62478, -85.62478, -85.71801, 81.72472, 81.4…
## $ Date                   <chr> "2016-12-30", "2016-12-30", "2016-12-29", "2021…
## $ Time                   <chr> "08:58 AM", "09:13 AM", "04:37 PM", "07:32 AM",…
## $ Protocol               <chr> "eBird - Stationary Count", "eBird - Traveling …
## $ Duration..Min.         <int> 20, 97, 54, 22, 9, 150, 92, 14, 90, 46, 0, 21, …
## $ All.Obs.Reported       <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1,…
## $ Distance.Traveled..km. <dbl> NA, 2.000, 0.300, 2.128, 3.836, 0.500, 1.500, N…
## $ Area.Covered..ha.      <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Number.of.Observers    <int> 2, 2, 2, 1, 1, 3, 2, 1, 5, 5, 12, 10, 4, 2, 5, …
## $ Breeding.Code          <chr> "", "", "", "", "", "", "", "", "", "", "", "",…
## $ Observation.Details    <chr> "", "", "", "", "", "", "", "", "", "", "", "",…
## $ Checklist.Comments     <chr> "Charco verde lagoon", "From charco verde lagoo…
## $ ML.Catalog.Numbers     <chr> "127451251", "", "", "", "", "", "", "", "", ""…

## 2. Clean the data
# Remove missing values for Latitude, Longitude, and Common Name
data_clean <- data %>%
  filter(!is.na(Latitude), !is.na(Longitude), !is.na(Common.Name))

missing_values <- data %>%
  summarise(across(everything(), ~ sum(is.na(.)), .names = "missing_{col}")) %>%
  pivot_longer(everything(), names_to = "Variable", values_to = "Missing_Count")

print("Missing values by variable:")

## [1] "Missing values by variable:"

print(missing_values)

## # A tibble: 23 × 2
##    Variable                Missing_Count
##    <chr>                           <int>
##  1 missing_Submission.ID               0
##  2 missing_Common.Name                 0
##  3 missing_Scientific.Name             0
##  4 missing_Taxonomic.Order             0
##  5 missing_Count                       0
##  6 missing_State.Province              0
##  7 missing_County                      0
##  8 missing_Location.ID                 0
##  9 missing_Location                    0
## 10 missing_Latitude                    0
## # ℹ 13 more rows

# Summarize numeric variables
numeric_summary <- data %>%
  dplyr::select(where(is.numeric)) %>%
  summarise(across(everything(), list(mean = mean, median = median, sd = sd, min = min, max = max), na.rm = TRUE))

## Warning: There was 1 warning in `summarise()`.
## ℹ In argument: `across(...)`.
## Caused by warning:
## ! The `...` argument of `across()` is deprecated as of dplyr 1.1.0.
## Supply arguments directly to `.fns` through an anonymous function instead.
## 
##   # Previously
##   across(a:b, mean, na.rm = TRUE)
## 
##   # Now
##   across(a:b, \(x) mean(x, na.rm = TRUE))

print("Summary of numeric variables:")

## [1] "Summary of numeric variables:"

print(numeric_summary)

##   Taxonomic.Order_mean Taxonomic.Order_median Taxonomic.Order_sd
## 1             17277.36                  21210           9951.142
##   Taxonomic.Order_min Taxonomic.Order_max Latitude_mean Latitude_median
## 1                 233               35564      13.59148        12.31475
##   Latitude_sd Latitude_min Latitude_max Longitude_mean Longitude_median
## 1    5.779635    -33.95959     56.34958       73.01723         77.14248
##   Longitude_sd Longitude_min Longitude_max Duration..Min._mean
## 1     26.27715     -123.9558      140.0584            27.85706
##   Duration..Min._median Duration..Min._sd Duration..Min._min Duration..Min._max
## 1                    17          42.00343                  0               1220
##   All.Obs.Reported_mean All.Obs.Reported_median All.Obs.Reported_sd
## 1             0.9147433                       1           0.2792663
##   All.Obs.Reported_min All.Obs.Reported_max Distance.Traveled..km._mean
## 1                    0                    1                    2.024088
##   Distance.Traveled..km._median Distance.Traveled..km._sd
## 1                           0.6                  4.165263
##   Distance.Traveled..km._min Distance.Traveled..km._max Area.Covered..ha._mean
## 1                       0.08                         70                 4.0469
##   Area.Covered..ha._median Area.Covered..ha._sd Area.Covered..ha._min
## 1                   4.0469                    0                4.0469
##   Area.Covered..ha._max Number.of.Observers_mean Number.of.Observers_median
## 1                4.0469                 2.524811                          2
##   Number.of.Observers_sd Number.of.Observers_min Number.of.Observers_max
## 1               2.178464                       1                      34

## 3. Group data by location (Latitude and Longitude)
# Calculate the number of unique species at each location
species_counts <- data_clean %>%
  group_by(Latitude, Longitude, Location) %>%
  summarize(Species_Count = n_distinct(Common.Name), .groups = "drop")

# Convert to spatial data
species_sf <- st_as_sf(species_counts, coords = c("Longitude", "Latitude"), crs = 4326)

## 4. Get world map data
world_map <- ne_countries(scale = "medium", returnclass = "sf")

## 5. Plot the number of unique species observed globally
ggplot() +
  geom_sf(data = world_map, fill = "lightgrey", color = "black") +
  geom_sf(data = species_sf, aes(size = Species_Count), color = "darkgreen", alpha = 0.6) +
  scale_size_continuous(name = "Unique Species Count", range = c(1, 8)) +
  labs(title = "PNS eBird",
       subtitle = "Based on Uploaded eBird Data",
       x = "Longitude", y = "Latitude") +
  theme_minimal()

## 6. Get the top 10 most observed species
top_species <- data %>%
  count(`Common.Name`, sort = TRUE) %>%
  slice_head(n = 10)

print("Top 10 most observed species:")

## [1] "Top 10 most observed species:"

print(top_species)

##                                  Common.Name    n
## 1                       Red-whiskered Bulbul 1558
## 2                 Rock Pigeon (Feral Pigeon) 1408
## 3  White-cheeked Barbet (Small Green Barbet) 1407
## 4                          Large-billed Crow 1276
## 5                                 Black Kite 1254
## 6                          Common Tailorbird 1226
## 7                                Common Myna 1203
## 8                                Ashy Prinia 1086
## 9                               Spotted Dove 1073
## 10                      Rose-ringed Parakeet 1069

# Display as a table with caption
print("Top 10 Most Observed Species:")

## [1] "Top 10 Most Observed Species:"

top_species %>%
  kable(caption = "Top 10 Most Observed Bird Species observed by Prashanth N S on eBird")

Top 10 Most Observed Bird Species observed by Prashanth N S on eBird
Common.Name	n
Red-whiskered Bulbul	1558
Rock Pigeon (Feral Pigeon)	1408
White-cheeked Barbet (Small Green Barbet)	1407
Large-billed Crow	1276
Black Kite	1254
Common Tailorbird	1226
Common Myna	1203
Ashy Prinia	1086
Spotted Dove	1073
Rose-ringed Parakeet	1069

# Create a table and save as an image
species_table <- top_species %>%
  gt() %>%
  tab_header(
    title = "Top 10 Most Observed Bird Species by Prashanth N S",
    subtitle = "Based on his eBird Dataset"
  ) %>%
  fmt_number(
    columns = "n",
    decimals = 0
  ) %>%
  cols_label(
    `Common.Name` = "Bird Species",
    n = "Observation Count"
  )

# Save the table as an image
gtsave(species_table, "top_species_table.png")

## file:////var/folders/0q/ys17wz2548z1x7tlclsfw7kr0000gn/T//RtmpZeUdVa/file18161603ab741.html screenshot completed

## 7. Get the most observed locations
top_locations <- data %>%
  count(Location, sort = TRUE) %>%
  slice_head(n = 10)

print("Top 10 most observed locations:")

## [1] "Top 10 most observed locations:"

print(top_locations)

##                                                                                  Location
## 1                                                             BR Hills--IPH field station
## 2                                                              Judicial layout--BBMP Park
## 3                                                                     Near Zuari/Hulikere
## 4  BR Hills ಬಿ ಆರ್‌ ಹಿಲ್ಸ್--Biligiri Rangaswamy Temple Tiger Reserve ಬಿಳಿಗಿರಿರಂಗನಾಥಸ್ವಾಮಿ ಹುಲಿ ಅಭಯಾರಣ್ಯ
## 5                                                          Judicial Layout--Adithya Homes
## 6                                                                        BR Hills—Shaheen
## 7                                                                             Mysore home
## 8                                                           Kumaraswamy Layout Water Tank
## 9                                                                              Muntipalya
## 10                                                              BR Hills--Someshwara Kere
##       n
## 1  6226
## 2  3546
## 3  3204
## 4  2646
## 5  1501
## 6  1465
## 7  1265
## 8  1137
## 9   805
## 10  745

# Create a table and save as an image
locations_table <- top_locations %>%
  gt() %>%
  tab_header(
    title = "Top 10 Most Observed Locations",
    subtitle = "Based on the eBird Dataset"
  ) %>%
  fmt_number(
    columns = "n",
    decimals = 0
  ) %>%
  cols_label(
    Location = "Location",
    n = "Observation Count"
  )

# Save the table as an image
gtsave(locations_table, "top_locations_table.png")

## file:////var/folders/0q/ys17wz2548z1x7tlclsfw7kr0000gn/T//RtmpZeUdVa/file18161618617cc.html screenshot completed

top_locations %>%
  kable(caption = "Top 10 locations from which Bird Species observed by Prashanth N S on eBird")

Top 10 locations from which Bird Species observed by Prashanth N S on eBird
Location	n
BR Hills–IPH field station	6226
Judicial layout–BBMP Park	3546
Near Zuari/Hulikere	3204
BR Hills ಬಿ ಆರ್‌ ಹಿಲ್ಸ್–Biligiri Rangaswamy Temple Tiger Reserve ಬಿಳಿಗಿರಿರಂಗನಾಥಸ್ವಾಮಿ ಹುಲಿ ಅಭಯಾರಣ್ಯ	2646
Judicial Layout–Adithya Homes	1501
BR Hills—Shaheen	1465
Mysore home	1265
Kumaraswamy Layout Water Tank	1137
Muntipalya	805
BR Hills–Someshwara Kere	745

## 8. Birding in Karnataka

# Filter data for Karnataka state
data_karnataka <- data %>%
  filter(State.Province == "IN-KA") %>%  
  filter(!is.na(Latitude), !is.na(Longitude)) 

# Convert to spatial data
data_karnataka_sf <- st_as_sf(data_karnataka, coords = c("Longitude", "Latitude"), crs = 4326)

# Download and load India shapefile from GADM
# india <- gadm("India", level = 1, path = tempdir()) # Level 1: State boundaries
india1 <- st_read("southern-zone-latest-free/gis_osm_landuse_a_free_1.shp") # base map from https://download.geofabrik.de/asia/india.html

## Reading layer `gis_osm_landuse_a_free_1' from data source 
##   `/Users/prashanthns/Documents/R/Learning/eBird/eBird/southern-zone-latest-free/gis_osm_landuse_a_free_1.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 167393 features and 4 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: 72.18424 ymin: 6.756267 xmax: 94.27662 ymax: 19.91997
## Geodetic CRS:  WGS 84

india <- st_read("maps-master/Country/india-osm.geojson") # base map from https://projects.datameet.org/maps/states/#repository

## Reading layer `india-osm' from data source 
##   `/Users/prashanthns/Documents/R/Learning/eBird/eBird/maps-master/Country/india-osm.geojson' 
##   using driver `GeoJSON'
## Simple feature collection with 1 feature and 2 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: 68.2056 ymin: 6.755997 xmax: 97.39556 ymax: 37.08411
## Geodetic CRS:  WGS 84

# Convert to sf object
india_sf <- st_as_sf(india)
india_sf1 <- st_as_sf(india1)


# Plot the map
ggplot() +
  geom_sf(data = india_sf, fill = "lightgrey", color = "black") +
  geom_sf(data = data_karnataka_sf, aes(color = Location), alpha = 0.7, size = 2) +
  labs(
    title = "Bird Lists from Karnataka by Prashanth N S",
    subtitle = "Locations of eBird Observations in Karnataka (2000-2024)",
    x = "Longitude", y = "Latitude"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

ggplot() +
  geom_sf(data = india_sf1, fill = "lightgrey", color = "black") +
  geom_sf(data = data_karnataka_sf, aes(color = Location), alpha = 0.7, size = 2) +
  labs(
    title = "Bird Lists from Karnataka by Prashanth N S",
    subtitle = "Locations of eBird Observations in Karnataka (2000-2024)",
    x = "Longitude", y = "Latitude"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

# 9. type of lists

# Count the types of lists
list_counts <- data %>%
  count(Protocol, sort = TRUE)  # Group by 'Protocol' column and count occurrences

# Display the counts in a table
print(list_counts)

##                     Protocol     n
## 1   eBird - Stationary Count 27249
## 2    eBird - Traveling Count 18702
## 3 eBird - Casual Observation  2314
## 4                 Historical   130

# Plot the types of lists as a bar chart
ggplot(list_counts, aes(x = reorder(Protocol, n), y = n)) +
  geom_bar(stat = "identity", fill = "steelblue", color = "black") +
  coord_flip() +  # Flip the coordinates for better readability
  labs(
    title = "Type of Lists by Number",
    x = "List Type (Protocol)",
    y = "Number of Lists"
  ) +
  theme_minimal()

# 10.Over time
# Load required libraries
library(lubridate)

# Convert Count to numeric (replacing invalid entries with NA)
data <- data %>%
  mutate(
    Count = as.numeric(Count),     # Convert Count to numeric
    Date = as.Date(Date),          # Ensure Date is in the correct format
    Year = year(Date),             # Extract Year
    Month = month(Date, label = TRUE)  # Extract Month (labeled)
  )

## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Count = as.numeric(Count)`.
## Caused by warning:
## ! NAs introduced by coercion

# Summarize bird observations over time
# Lists per month
monthly_observations <- data %>%
  group_by(Year, Month) %>%
  summarise(Observations = n(), .groups = "drop")

# Sum the bird counts (if Count is numeric and cleaned)
monthly_counts <- data %>%
  filter(!is.na(Count)) %>%  # Exclude rows with invalid Count values
  group_by(Year, Month) %>%
  summarise(Total_Birds = sum(Count, na.rm = TRUE), .groups = "drop")

#  trend of observations over time
ggplot(monthly_observations, aes(x = interaction(Year, Month, sep = "-"), y = Observations)) +
  geom_line(color = "blue", size = 1) +
  geom_point(color = "red", size = 2) +
  labs(
    title = "Number of Bird Observations Over Time",
    x = "Time (Year-Month)",
    y = "Number of Observations"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## `geom_line()`: Each group consists of only one observation.
## ℹ Do you need to adjust the group aesthetic?

# Plot the trend of bird counts over time
ggplot(monthly_counts, aes(x = interaction(Year, Month, sep = "-"), y = Total_Birds)) +
  geom_line(color = "green", size = 1) +
  geom_point(color = "orange", size = 2) +
  labs(
    title = "Total Birds Observed Over Time",
    x = "Time (Year-Month)",
    y = "Total Bird Count"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

## `geom_line()`: Each group consists of only one observation.
## ℹ Do you need to adjust the group aesthetic?

# Summarize by Quarter
quarterly_observations <- data %>%
  mutate(Quarter = paste0("Q", quarter(Date), "-", Year)) %>%
  group_by(Quarter) %>%
  summarise(Observations = n(), .groups = "drop")

# Plot the quarterly trend
ggplot(quarterly_observations, aes(x = Quarter, y = Observations)) +
  geom_line(color = "blue", size = 1) +
  geom_point(color = "red", size = 2) +
  labs(
    title = "Number of Bird Observations Over Time (Quarterly)",
    x = "Quarter",
    y = "Number of Observations"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

## `geom_line()`: Each group consists of only one observation.
## ℹ Do you need to adjust the group aesthetic?

# Summarize by Year instead of Year-Month
data <- data %>%
  mutate(
    Date = as.Date(Date),          # Ensure Date is in the correct format
    Year = year(Date)              # Extract the Year
  ) %>%
  filter(Year >= 2000 & Year <= 2024)  # Filter for years 2000 to 2024

# Summarize by Year
yearly_observations <- data %>%
  group_by(Year) %>%
  summarise(Observations = n(), .groups = "drop")  # Count the number of lists per year

# Plot the yearly trend from 2000-2024
ggplot(yearly_observations, aes(x = Year, y = Observations)) +
  geom_line(color = "blue", size = 1) +
  geom_point(color = "red", size = 2) +
  labs(
    title = "Number of Bird Observations Over Time (2000-2024)",
    x = "Year",
    y = "Number of Observations"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

PNS eBird analysis

Prashanth N S

2024-12-27

R Markdown