Translating HawkEars Confidence to Probability

Author

Barbara Frei

Published

May 26, 2026

Overview

This document translates HawkEars confidence scores into the probability that a detection is a true positive, using species-specific logistic regression models.

The script loops through all species listed in `species_code` and, for each species, creates:

- a logistic regression model
- a probability plot
- optional saved output figures

Code
# load libraries

library(tidyverse)
library(dplyr)
library(ggplot2)
library(knitr)

# load-data

# Input: the table produced by Ana's script Adding_confidence_levels.R.
#
# The reviewer's tag rating is recoded into the binary response used by the
# logistic regression:
#   tag_rating == 5  ->  observed = 1 (true positive)
#   tag_rating == 1  ->  observed = 0 (false positive)
# Any other rating becomes NA and that row is dropped.

Spp_threshold.df <- read.csv("Data/Spp_threshold.df") %>%
  mutate(
    observed = case_when(
      tag_rating == 5 ~ 1,
      tag_rating == 1 ~ 0,
      TRUE ~ NA_real_
    )
  ) %>%
  drop_na(observed)

# Print a compact column-by-column summary of the prepared data
glimpse(Spp_threshold.df)
Rows: 12,243
Columns: 34
$ X                       <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,…
$ organization            <chr> "ECCC- NbS", "ECCC- NbS", "ECCC- NbS", "ECCC- …
$ project                 <chr> "Montreal Species-specific confidence threshol…
$ project_id              <int> 4170, 4170, 4170, 4170, 4170, 4170, 4170, 4170…
$ location                <chr> "ANGELL-01", "ANGELL-01", "ANGELL-01", "ANGELL…
$ location_id             <int> 310868, 310868, 310868, 310868, 310868, 310868…
$ recording_date_time     <chr> "2023-05-10 05:02:00", "2023-05-10 07:02:00", …
$ recording_id            <int> 901475, 901471, 901470, 901468, 901465, 901462…
$ task_method             <chr> "1SPT", "1SPT", "1SPT", "1SPT", "1SPT", "1SPT"…
$ task_id                 <int> 3408334, 3408306, 3408305, 3408304, 3408303, 3…
$ task_is_complete        <chr> "t", "t", "t", "t", "t", "t", "t", "t", "t", "…
$ species_code            <chr> "WOTH", "BEKI", "RUBL", "YRWA", "BAOR", "BWWA"…
$ species_common_name     <chr> "Wood Thrush", "Belted Kingfisher", "Rusty Bla…
$ species_scientific_name <chr> "HYLOCICHLA MUSTELINA", "MEGACERYLE ALCYON", "…
$ species_class           <chr> "Aves", "Aves", "Aves", "Aves", "Aves", "Aves"…
$ detection_time          <int> 568, 382, 352, 136, 348, 538, 136, 370, 310, 2…
$ task_duration           <dbl> 599.5, 599.5, 599.5, 599.5, 599.5, 599.5, 599.…
$ tag_duration            <int> 3, 17, 3, 5, 3, 5, 3, 9, 5, 3, 3, 5, 3, 5, 3, …
$ min_tag_freq            <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ max_tag_freq            <int> 12000, 12000, 12000, 12000, 12000, 12000, 1200…
$ tag_id                  <int> 6109879, 6109933, 6109952, 6109990, 6110130, 6…
$ individual_order        <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ vocalization            <chr> "Song", "Song", "Song", "Song", "Song", "Song"…
$ abundance               <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ tag_rating              <int> 1, 1, 5, 1, 1, 1, 1, 5, 1, 1, 1, 5, 1, 1, 5, 1…
$ tag_is_verified         <chr> "t", "t", "t", "t", "t", "t", "t", "t", "t", "…
$ clip_channel_used       <int> 2, 1, 2, 1, 1, 1, 1, 2, 1, 2, 1, 1, 2, 2, 1, 2…
$ observer                <chr> "Ana Morales", "Ana Morales", "Ana Morales", "…
$ observer_id             <int> 1467, 1467, 1467, 1467, 1467, 1467, 1467, 1467…
$ verifier_id             <int> 1467, 4892, 4892, 4892, 4892, 4892, 4892, 4892…
$ needs_review            <chr> "f", "f", "f", "f", "f", "f", "f", "f", "f", "…
$ tag_ID                  <chr> "WOTH 2023-05-10 05:02:00 ANGELL-01 1", "BEKI …
$ confidence              <dbl> 0.286, 0.637, 0.579, 0.424, 0.712, 0.351, 0.21…
$ observed                <dbl> 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0…

Species-Specific Logistic Regression Plots

Code
library(knitr)

# Get unique species codes (sort() also drops any NA codes)
species_list <- sort(unique(Spp_threshold.df$species_code))

# Minimum number of usable tags a species needs before a model is fitted
min_tags <- 10

# Create empty list to store the fitted models, keyed by species code
model_list <- list()

# Loop through species: print a heading, fit observed ~ confidence, plot it
for (sp in species_list) {

  # Create species heading
  # NOTE(review): intended as a markdown heading; assumes the chunk is
  # rendered with `results: asis` -- confirm in the chunk options.
  cat("\n\n# ", sp, "\n\n")

  # Subset data; rows with missing confidence are excluded up front so the
  # row count below reflects what glm() will actually use
  spp.df <- subset(
    Spp_threshold.df,
    species_code == sp & !is.na(confidence)
  )

  # Skip species with insufficient data
  if (nrow(spp.df) < min_tags) {

    cat("Insufficient data for this species.\n\n")

    next
  }

  # Skip species whose tags are all true positives or all false positives:
  # a logistic regression cannot be estimated from a single outcome class
  if (length(unique(spp.df$observed)) < 2) {

    cat("Only one outcome class for this species; model not fitted.\n\n")

    next
  }

  # Fit logistic regression of verification outcome on HawkEars confidence
  spp.mod <- glm(
    observed ~ confidence,
    data = spp.df,
    family = binomial
  )

  # Store model
  model_list[[sp]] <- spp.mod

  # Create plot
  p <- ggplot(spp.df, aes(x = confidence, y = observed)) +

    geom_point(
      size = 4,
      alpha = 0.1
    ) +

    # Same model as spp.mod; the explicit formula avoids the per-plot
    # "using formula" message
    geom_smooth(
      method = "glm",
      formula = y ~ x,
      method.args = list(family = binomial),
      se = FALSE,
      linewidth = 1.5
    ) +

    scale_x_continuous(
      expand = c(0, 0),
      breaks = seq(0.1, 1, by = 0.3)
    ) +

    # Zoom with coord_cartesian() instead of scale limits: scale limits
    # discard out-of-range rows before the smoother is fitted, which would
    # make the plotted curve differ from spp.mod
    coord_cartesian(
      xlim = c(0.1, 1),
      ylim = c(0, 1)
    ) +

    theme_bw() +

    labs(
      title = sp,
      x = "HawkEars confidence",
      y = "True positive rate"
    )

  # Force plot rendering in Quarto (plots are not auto-printed inside a loop)
  print(p)

  cat("\n\n")
}

ALFL

AMCR

AMKE

AMRE

AMRO

AMWO

ATSP

BAOR

BARS

BAWW

BBCU

BBWA

BCCH

BEKI

BHCO

BHVI

BLBW

BLJA

BLPW

BOBO

BOWA

BRCR

BRTH

BTBW

BTNW

BWHA

BWWA

CANG

CAWA

CEDW

CHSP

CHSW

CMWA

COGR

COHA

CONI

CORA

COTE

COYE

CSWA

DEJU

DICK

DOWO

EABL

EAKI

EAME

EAPH

EASO

EAWP

EUST

EVGR

EWPW

FOSP

GCFL

GCKI

GCTH

GHOW

GRCA

GWWA

HAWO

HETH

HOFI

HOSP

INBU

KILL

LEFL

LISP

MALL

MAWA

MERL

MODO

MOWA

NAWA

NOCA

NOFL

NOHA

NOPA

NOWA

NRWS

NSHR

NSWO

OSFL

OVEN

PAWA

PHVI

PIGR

PISI

PIWA

PIWO

PUFI

PUMA

RBGR

RBGU

RBNU

RBWO

RCKI

RECR

REVI

ROPI

RSHA

RTHA

RTHU

RUBL

RUGR

RWBL

SAVS

SCTA

SOSP

SSHA

SWSP

SWTH

TEWA

TRES

VEER

VESP

WAVI

WBNU

WCSP

WIFL

WISN

WITU

WIWA

WIWR

WODU

WOTH

WTSP

WWCR

YBFL

YBSA

YEWA

YRWA