1 Overview

This notebook explores your blue whale (Bm) integrated dataset (visual + acoustics) and OMP water mass covariates prior to building an integrated SDM (sdmTMB).

We will: - sanity check columns, missingness, duplicates, ranges
- characterize zeros (effort but no detections) across year/season/method
- explore distributions (raw + log1p), and uncertainty fields (if present)
- look at annual/seasonal summaries
- map spatial coverage and density
- inspect OMP covariates (composition constraints + collinearity)


```{r setup, include=FALSE} knitr::opts_chunk$set( echo = TRUE, message = FALSE, warning = FALSE, fig.width = 9, fig.height = 5 ) pkgs <- c( “tidyverse”, “lubridate”, “janitor”, “skimr”, “GGally”, “corrplot”, “patchwork”, “sf”, “rnaturalearth”, “rnaturalearthdata”, “viridis” )

to_install <- pkgs[!pkgs %in% rownames(installed.packages())] if (length(to_install) > 0) install.packages(to_install)

library(tidyverse) library(lubridate) library(janitor) library(skimr) library(GGally) library(corrplot) library(patchwork) library(sf) library(rnaturalearth) library(rnaturalearthdata) library(viridis) 1) Read data dat_raw <- readr::read_csv(params$data_path, show_col_types = FALSE) %>% janitor::clean_names()

glimpse(dat_raw) 2) Parse / standardize columns dat <- dat_raw

2 —- time column —-

3 If your time column is named differently, change time_col here.

time_col <- “time_utc” if (!(time_col %in% names(dat))) { stop(paste0( “Expected a time column named ‘“, time_col,”’ but didn’t find it.”, “Look at names(dat_raw) and set time_col to the correct name.” )) }

4 —- required columns —-

needed <- c(“year”, “lat”, “lon”, “density”, “method”, time_col) missing_needed <- setdiff(needed, names(dat)) if (length(missing_needed) > 0) { stop(paste(“Missing required columns:”, paste(missing_needed, collapse = “,”))) }

5 —- optional columns —-

if (!(“season” %in% names(dat))) dat\(season <- NA_character_ if (!("cv" %in% names(dat))) dat\)cv <- NA_real_ if (!(“se” %in% names(dat))) dat$se <- NA_real_

dat <- dat %>% mutate( time_utc = ymd_hms(.data[[time_col]], tz = “UTC”), season = factor(season, levels = c(“Winter”, “Spring”, “Summer”, “Fall”)), method = factor(method), is_zero = density == 0, is_acoustic = str_detect(as.character(method), “^A”), is_visual = as.character(method) == “V”, log1p_density = log1p(density) )

summary(dat\(time_utc) levels(dat\)method) 3) Sanity checks 3.1 Missingness (proportion NA per column) dat %>% summarise(across(everything(), ~ mean(is.na(.)))) %>% pivot_longer(everything(), names_to = “column”, values_to = “prop_na”) %>% arrange(desc(prop_na)) %>% print(n = Inf) 3.2 Duplicate records (same time/lat/lon/method) dups <- dat %>% count(time_utc, lat, lon, method) %>% filter(n > 1) %>% arrange(desc(n))

cat(“Number of duplicate (time, lat, lon, method) combinations:”, nrow(dups), “”) if (nrow(dups) > 0) print(head(dups, 30)) 3.3 Ranges dat %>% summarise( year_min = min(year, na.rm = TRUE), year_max = max(year, na.rm = TRUE), lat_min = min(lat, na.rm = TRUE), lat_max = max(lat, na.rm = TRUE), lon_min = min(lon, na.rm = TRUE), lon_max = max(lon, na.rm = TRUE), density_min = min(density, na.rm = TRUE), density_max = max(density, na.rm = TRUE) ) 4) Summary stats skimr::skim(dat %>% select( year, season, time_utc, lat, lon, density, log1p_density, method, cv, se, starts_with(“pew_”), starts_with(“enpcw_”), starts_with(“psuw_”) ) ) 5) Zeros (effort but no detections) dat %>% group_by(method) %>% summarise( n = n(), prop_zero = mean(is_zero, na.rm = TRUE), median_nonzero = median(density[density > 0], na.rm = TRUE), .groups = “drop” ) %>% arrange(desc(prop_zero)) dat %>% group_by(year, method) %>% summarise( n = n(), prop_zero = mean(is_zero, na.rm = TRUE), .groups = “drop” ) %>% ggplot(aes(year, prop_zero, color = method)) + geom_line() + geom_point(size = 1.5) + scale_y_continuous(labels = scales::percent_format(accuracy = 1)) + labs(title = “Proportion of zeros by year”, x = “Year”, y = “Zero proportion”) + theme_minimal() dat %>% filter(!is.na(season)) %>% group_by(season, method) %>% summarise( n = n(), prop_zero = mean(is_zero, na.rm = TRUE), .groups = “drop” ) %>% ggplot(aes(season, prop_zero, fill = method)) + geom_col(position = position_dodge(width = 0.8), width = 0.7) + scale_y_continuous(labels = scales::percent_format(accuracy = 1)) + labs(title = “Zero proportion by season”, x = NULL, y = “Zero proportion”) + theme_minimal() 6) Distributions p_raw <- dat %>% ggplot(aes(density)) + geom_histogram(bins = 60) + facet_wrap(~ method, scales = “free_y”) + labs(title = “Density distribution (raw)”, x = “Density”, y = “Count”) + theme_minimal()

p_log <- dat %>% ggplot(aes(log1p_density)) + geom_histogram(bins = 60) + facet_wrap(~ method, scales = “free_y”) + labs(title = “Density distribution (log1p)”, x = “log(1 + density)”, y = “Count”) + theme_minimal()

p_raw / p_log dat %>% ggplot(aes(method, density)) + geom_boxplot(outlier.alpha = 0.15) + scale_y_continuous(trans = “log1p”) + labs(title = “Density by method (log1p scale)”, x = NULL, y = “Density”) + theme_minimal() 7) Acoustic uncertainty (if present) dat %>% group_by(is_acoustic) %>% summarise( n = n(), prop_cv_missing = mean(is.na(cv)), prop_se_missing = mean(is.na(se)), .groups = “drop” ) dat %>% filter(is_acoustic, !is.na(cv)) %>% ggplot(aes(density, cv)) + geom_point(alpha = 0.25) + scale_x_continuous(trans = “log1p”) + labs(title = “Acoustic density vs CV”, x = “Density (log1p scale)”, y = “CV”) + theme_minimal() dat %>% filter(is_acoustic, !is.na(se)) %>% ggplot(aes(density, se)) + geom_point(alpha = 0.25) + scale_x_continuous(trans = “log1p”) + labs(title = “Acoustic density vs SE”, x = “Density (log1p scale)”, y = “SE”) + theme_minimal() 8) Time trends dat_year <- dat %>% group_by(year, method) %>% summarise( n = n(), prop_zero = mean(is_zero), med = median(density, na.rm = TRUE), q25 = quantile(density, 0.25, na.rm = TRUE), q75 = quantile(density, 0.75, na.rm = TRUE), .groups = “drop” )

ggplot(dat_year, aes(year, med, color = method)) + geom_line() + geom_ribbon(aes(ymin = q25, ymax = q75, fill = method), alpha = 0.15, color = NA) + scale_y_continuous(trans = “log1p”) + labs(title = “Annual density (median with IQR)”, x = “Year”, y = “Density (log1p scale)”) + theme_minimal() dat %>% filter(!is.na(season)) %>% group_by(season, method) %>% summarise( med = median(density, na.rm = TRUE), q25 = quantile(density, 0.25, na.rm = TRUE), q75 = quantile(density, 0.75, na.rm = TRUE), .groups = “drop” ) %>% ggplot(aes(season, med, color = method, group = method)) + geom_line() + geom_point(size = 2) + geom_errorbar(aes(ymin = q25, ymax = q75), width = 0.15, alpha = 0.4) + scale_y_continuous(trans = “log1p”) + labs(title = “Seasonal density (median with IQR)”, x = NULL, y = “Density (log1p scale)”) + theme_minimal() 9) Spatial maps coast <- ne_countries(scale = “medium”, returnclass = “sf”) %>% filter(admin %in% c(“United States”, “Mexico”))

coast_crop <- st_crop(coast, xmin = -126, xmax = -112, ymin = 28, ymax = 38) ggplot() + geom_sf(data = coast_crop, fill = “gray95”, color = “gray70”, linewidth = 0.2) + geom_point(data = dat, aes(lon, lat, color = method), alpha = 0.35, size = 0.8) + coord_sf(xlim = c(-125, -114), ylim = c(30, 36), expand = FALSE) + labs(title = “Sampling locations by method”, x = NULL, y = NULL) + theme_minimal() dat %>% ggplot() + geom_sf(data = coast_crop, fill = “gray95”, color = “gray70”, linewidth = 0.2) + geom_point(aes(lon, lat, color = log1p_density), alpha = 0.6, size = 0.9) + facet_wrap(~ method) + coord_sf(xlim = c(-125, -114), ylim = c(30, 36), expand = FALSE) + scale_color_viridis_c(option = “C”) + labs(title = “Spatial distribution of density (log1p)”, x = NULL, y = NULL, color = “log(1 + density)”) + theme_minimal() 10) OMP covariates: composition checks + collinearity omp_cols <- names(dat) %>% keep(~ str_detect(.x, “^(pew|enpcw|psuw)”)) omp_cols length(omp_cols) has_triplet <- function(depth) { all(c(paste0(”pew”, depth), paste0(“enpcw_”, depth), paste0(“psuw_”, depth)) %in% names(dat)) }

depths <- c(“55”, “105”, “280”) triplet_ok <- tibble(depth = depths, ok = map_lgl(depths, has_triplet)) triplet_ok

dat_sums <- dat %>% mutate( sum_55 = if (has_triplet(“55”)) rowSums(across(all_of(c(“pew_55”,“enpcw_55”,“psuw_55”))), na.rm = TRUE) else NA_real_, sum_105 = if (has_triplet(“105”)) rowSums(across(all_of(c(“pew_105”,“enpcw_105”,“psuw_105”))), na.rm = TRUE) else NA_real_, sum_280 = if (has_triplet(“280”)) rowSums(across(all_of(c(“pew_280”,“enpcw_280”,“psuw_280”))), na.rm = TRUE) else NA_real_ )

dat_sums %>% select(sum_55, sum_105, sum_280) %>% pivot_longer(everything(), names_to = “depth”, values_to = “sum”) %>% filter(!is.na(sum)) %>% ggplot(aes(sum)) + geom_histogram(bins = 60) + facet_wrap(~ depth, scales = “free_y”) + labs(title = “OMP composition sums (by depth)”, x = “Sum of contributions”, y = “Count”) + theme_minimal() if (length(omp_cols) > 0) { dat %>% select(all_of(omp_cols)) %>% pivot_longer(everything(), names_to = “covariate”, values_to = “value”) %>% ggplot(aes(value)) + geom_histogram(bins = 60) + facet_wrap(~ covariate, scales = “free_y”) + labs(title = “OMP covariate distributions”, x = “Contribution”, y = “Count”) + theme_minimal() } if (length(omp_cols) > 1) { omp_mat <- dat %>% select(all_of(omp_cols)) %>% drop_na() if (nrow(omp_mat) > 5) { cor_omp <- cor(omp_mat, use = “pairwise.complete.obs”) corrplot(cor_omp, method = “color”, type = “upper”, tl.cex = 0.8) } } if (length(omp_cols) > 1) { omp_mat <- dat %>% select(all_of(omp_cols)) %>% drop_na() if (nrow(omp_mat) > 50) { set.seed(1) GGally::ggpairs(omp_mat %>% sample_n(min(nrow(omp_mat), 1500))) } } 11) Quick-look: density vs a few OMP covariates plot_covs <- omp_cols[1:min(length(omp_cols), 6)] plot_covs

if (length(plot_covs) > 0) { for (cc in plot_covs) { p <- dat %>% filter(!is.na(.data[[cc]])) %>% ggplot(aes(.data[[cc]], log1p_density, color = method)) + geom_point(alpha = 0.20) + geom_smooth(se = FALSE, method = “gam”, formula = y ~ s(x, k = 8)) + labs(title = paste(“log(1 + density) vs”, cc), x = cc, y = “log(1 + density)”) + theme_minimal() print(p) } } 12) Session info sessionInfo()