Translating HawkEars Confidence to Probability

Author

Barbara Frei

Published

May 26, 2026

Overview

This document translates HawkEars confidence scores into the probability that a detection is a true positive, using species-specific logistic regression models.

The script loops through every species code found in the `species_code` column and creates, for each species:

- a logistic regression model
- a probability plot
- optional saved output figures

Code

# load libraries

library(tidyverse)
library(dplyr)
library(ggplot2)
library(knitr)

# load-data

# Import the tag table produced by Ana's script
# (Adding_confidence_levels.R)

Spp_threshold.df <- read.csv("Data/Spp_threshold.df")

# Derive the binary response used by the logistic regressions:
#   tag_rating 1 -> observed = 0 (false positive)
#   tag_rating 5 -> observed = 1 (true positive)
# Rows with any other rating, or with a missing rating, are discarded.

Spp_threshold.df <- Spp_threshold.df %>%
  filter(tag_rating %in% c(1, 5)) %>%
  mutate(observed = as.numeric(tag_rating == 5))

# Inspect the resulting columns and their types
glimpse(Spp_threshold.df)

Rows: 12,243
Columns: 34
$ X                       <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,…
$ organization            <chr> "ECCC- NbS", "ECCC- NbS", "ECCC- NbS", "ECCC- …
$ project                 <chr> "Montreal Species-specific confidence threshol…
$ project_id              <int> 4170, 4170, 4170, 4170, 4170, 4170, 4170, 4170…
$ location                <chr> "ANGELL-01", "ANGELL-01", "ANGELL-01", "ANGELL…
$ location_id             <int> 310868, 310868, 310868, 310868, 310868, 310868…
$ recording_date_time     <chr> "2023-05-10 05:02:00", "2023-05-10 07:02:00", …
$ recording_id            <int> 901475, 901471, 901470, 901468, 901465, 901462…
$ task_method             <chr> "1SPT", "1SPT", "1SPT", "1SPT", "1SPT", "1SPT"…
$ task_id                 <int> 3408334, 3408306, 3408305, 3408304, 3408303, 3…
$ task_is_complete        <chr> "t", "t", "t", "t", "t", "t", "t", "t", "t", "…
$ species_code            <chr> "WOTH", "BEKI", "RUBL", "YRWA", "BAOR", "BWWA"…
$ species_common_name     <chr> "Wood Thrush", "Belted Kingfisher", "Rusty Bla…
$ species_scientific_name <chr> "HYLOCICHLA MUSTELINA", "MEGACERYLE ALCYON", "…
$ species_class           <chr> "Aves", "Aves", "Aves", "Aves", "Aves", "Aves"…
$ detection_time          <int> 568, 382, 352, 136, 348, 538, 136, 370, 310, 2…
$ task_duration           <dbl> 599.5, 599.5, 599.5, 599.5, 599.5, 599.5, 599.…
$ tag_duration            <int> 3, 17, 3, 5, 3, 5, 3, 9, 5, 3, 3, 5, 3, 5, 3, …
$ min_tag_freq            <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ max_tag_freq            <int> 12000, 12000, 12000, 12000, 12000, 12000, 1200…
$ tag_id                  <int> 6109879, 6109933, 6109952, 6109990, 6110130, 6…
$ individual_order        <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ vocalization            <chr> "Song", "Song", "Song", "Song", "Song", "Song"…
$ abundance               <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ tag_rating              <int> 1, 1, 5, 1, 1, 1, 1, 5, 1, 1, 1, 5, 1, 1, 5, 1…
$ tag_is_verified         <chr> "t", "t", "t", "t", "t", "t", "t", "t", "t", "…
$ clip_channel_used       <int> 2, 1, 2, 1, 1, 1, 1, 2, 1, 2, 1, 1, 2, 2, 1, 2…
$ observer                <chr> "Ana Morales", "Ana Morales", "Ana Morales", "…
$ observer_id             <int> 1467, 1467, 1467, 1467, 1467, 1467, 1467, 1467…
$ verifier_id             <int> 1467, 4892, 4892, 4892, 4892, 4892, 4892, 4892…
$ needs_review            <chr> "f", "f", "f", "f", "f", "f", "f", "f", "f", "…
$ tag_ID                  <chr> "WOTH 2023-05-10 05:02:00 ANGELL-01 1", "BEKI …
$ confidence              <dbl> 0.286, 0.637, 0.579, 0.424, 0.712, 0.351, 0.21…
$ observed                <dbl> 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0…

Species-Specific Logistic Regression Plots

Code

library(knitr)

# Fit one logistic regression per species (observed ~ confidence) and print
# one plot per species. Fitted models are kept in `model_list`, keyed by
# species code, for species that pass both data checks below.
#
# NOTE(review): the headings are written with cat() as raw markdown, so this
# presumably runs in a chunk with `results: asis` -- confirm chunk options.

# Species codes present in the data, in alphabetical order
species_list <- sort(unique(Spp_threshold.df$species_code))

# Fitted models, keyed by species code
model_list <- list()

# Minimum number of usable tags required before a model is fitted
min_tags <- 10

for (sp in species_list) {

  # Markdown heading for this species
  cat("\n\n# ", sp, "\n\n")

  # Rows for this species; rows without a confidence score are excluded
  # because glm() would drop them anyway, so they must not count toward
  # the minimum sample size.
  spp.df <- subset(
    Spp_threshold.df,
    species_code == sp & !is.na(confidence)
  )

  # Skip species with insufficient data
  if (nrow(spp.df) < min_tags) {

    cat("Insufficient data for this species.\n\n")

    next
  }

  # Skip species whose tags are all true positives or all false positives:
  # with a single outcome class the slope on confidence is not estimable
  # and the fitted curve carries no information.
  if (length(unique(spp.df$observed)) < 2) {

    cat("Only one outcome class for this species; model not fitted.\n\n")

    next
  }

  # Fit logistic regression: probability of a true positive vs. confidence
  spp.mod <- glm(
    observed ~ confidence,
    data = spp.df,
    family = binomial
  )

  # Store model under the species code
  model_list[[sp]] <- spp.mod

  # Raw 0/1 outcomes as translucent points, with the logistic curve on top.
  # NOTE(review): scale limits remove observations outside [0.1, 1] before
  # geom_smooth() fits its curve, so the drawn line can differ from
  # `spp.mod` if any confidence value is below 0.1 -- confirm the input
  # never contains such values.
  p <- ggplot(spp.df, aes(x = confidence, y = observed)) +

    geom_point(
      size = 4,
      alpha = 0.1
    ) +

    geom_smooth(
      method = "glm",
      method.args = list(family = binomial),
      se = FALSE,
      linewidth = 1.5
    ) +

    scale_x_continuous(
      limits = c(0.1, 1),
      expand = c(0, 0),
      breaks = seq(0.1, 1, by = 0.3)
    ) +

    scale_y_continuous(
      limits = c(0, 1)
    ) +

    theme_bw() +

    labs(
      title = sp,
      x = "HawkEars confidence",
      y = "True positive rate"
    )

  # Explicit print() is required for ggplot objects created inside a loop
  print(p)

  cat("\n\n")
}

Overview

Species-Specific Logistic Regression Plots

ALFL

AMCR

AMKE

AMRE

AMRO

AMWO

ATSP

BAOR

BARS

BAWW

BBCU

BBWA

BCCH

BEKI

BHCO

BHVI

BLBW

BLJA

BLPW

BOBO

BOWA

BRCR

BRTH

BTBW

BTNW

BWHA

BWWA

CANG

CAWA

CEDW

CHSP

CHSW

CMWA

COGR

COHA

CONI

CORA

COTE

COYE

CSWA

DEJU

DICK

DOWO

EABL

EAKI

EAME

EAPH

EASO

EAWP

EUST

EVGR

EWPW

FOSP

GCFL

GCKI

GCTH

GHOW

GRCA

GWWA

HAWO

HETH

HOFI

HOSP

INBU

KILL

LEFL

LISP

MALL

MAWA

MERL

MODO

MOWA

NAWA

NOCA

NOFL

NOHA

NOPA

NOWA