This document contains diagnostic plots and summary statistics for the fin whale joint visual-acoustic density model with group-size likelihood structure.
Model features: - Hilbert-space Gaussian Process (HSGP) for spatial smoothing - Separate GP per season (Winter, Spring, Summer, Fall) - Group-based visual likelihood (models number of groups, not individuals) - Poisson model for group size distribution - Joint acoustic likelihood (20Hz and 40Hz calls)
library(rstan)
library(dplyr)
library(tidyr)
library(ggplot2)
library(stringr)
library(bayesplot)
library(posterior)
library(gridExtra)
# Set paths
code_dir <- "/Users/michaelaalksne/Desktop/PhD/Chapter2/CalCOFI/CalCOFI_integrated_SDM/code/stan/"
data_dir <- "/Users/michaelaalksne/Desktop/PhD/Chapter2/CalCOFI/CalCOFI_integrated_SDM/data/"
fig_dir <- "/Users/michaelaalksne/Desktop/PhD/Chapter2/CalCOFI/CalCOFI_integrated_SDM/figures/modeled_outputs/fin_gp_group_size/"
season_labels <- c("1"="Winter", "2"="Spring", "3"="Summer", "4"="Fall")
# Load fitted model
fit <- readRDS(paste0(code_dir, "fitted_finwhale_gp_groupsize.rds"))
posterior_samples <- readRDS(paste0(code_dir, "posterior_samples_finwhale_gp_groupsize.rds"))
# Load original data for comparison
sighting_raw <- readr::read_csv(paste0(data_dir, "Bp_sighting_data.csv"),
show_col_types = FALSE)
subsegment_raw <- readr::read_csv(paste0(data_dir, "Bp_groupsize_data.csv"),
show_col_types = FALSE)
cat("Posterior samples:", nrow(posterior_samples), "\n")
## Posterior samples: 2000
cat("Total sightings:", nrow(sighting_raw), "\n")
## Total sightings: 288
cat("Total subsegments:", nrow(subsegment_raw), "\n")
## Total subsegments: 10694
cat("FIN WHALE GROUP-SIZE GP MODEL - SUMMARY\n")
## FIN WHALE GROUP-SIZE GP MODEL - SUMMARY
cat("DATA:\n")
## DATA:
cat(" Visual subsegments:", nrow(subsegment_raw), "\n")
## Visual subsegments: 10694
cat(" Subsegments with groups:", sum(subsegment_raw$num_groups_observed > 0), "\n")
## Subsegments with groups: 213
cat(" Total sightings:", nrow(sighting_raw), "\n")
## Total sightings: 288
cat(" Mean observed group size:", round(mean(sighting_raw$group_size), 2), "\n\n")
## Mean observed group size: 1.92
# Get convergence info early
summary_df <- as.data.frame(summary(fit)$summary)
rhat_issues <- rownames(summary_df)[summary_df$Rhat > 1.01 & !is.nan(summary_df$Rhat)]
cat("CONVERGENCE:\n")
## CONVERGENCE:
cat(" Rhat > 1.01:", length(rhat_issues), "parameters\n")
## Rhat > 1.01: 0 parameters
cat(" Status:", ifelse(length(rhat_issues) == 0, "✓ CONVERGED", "⚠ CHECK"), "\n\n")
## Status: ✓ CONVERGED
# Get key posteriors
exp_group_size_posterior <- posterior_samples$exp_group_size
seasonal_dens <- data.frame(
Season = factor(season_labels, levels = c("Winter", "Spring", "Summer", "Fall")),
mean_density = colMeans(as.matrix(posterior_samples[, paste0("mean_density_season.", 1:4)])),
lower_95 = apply(as.matrix(posterior_samples[, paste0("mean_density_season.", 1:4)]),
2, quantile, 0.025),
upper_95 = apply(as.matrix(posterior_samples[, paste0("mean_density_season.", 1:4)]),
2, quantile, 0.975)
)
rho_long <- bind_rows(lapply(1:4, function(m) {
data.frame(
rho = posterior_samples[[paste0("rho.", m)]],
Season = season_labels[as.character(m)]
)
})) %>%
mutate(Season = factor(Season, levels = c("Winter", "Spring", "Summer", "Fall")))
cat("KEY PARAMETERS:\n")
## KEY PARAMETERS:
cat(" Expected group size:", round(mean(exp_group_size_posterior), 2),
" (95% CI: ", round(quantile(exp_group_size_posterior, 0.025), 2), "-",
round(quantile(exp_group_size_posterior, 0.975), 2), ")\n", sep = "")
## Expected group size:1.93 (95% CI: 1.82-2.05)
cat("\n Seasonal density (whales/km²):\n")
##
## Seasonal density (whales/km²):
for (i in 1:4) {
cat(" ", season_labels[as.character(i)], ": ",
sprintf("%.5f", seasonal_dens$mean_density[i]), "\n", sep = "")
}
## Winter: 0.00138
## Spring: 0.00470
## Summer: 0.00863
## Fall: 0.00412
cat("\n GP length scales (km):\n")
##
## GP length scales (km):
rho_summary <- rho_long %>%
group_by(Season) %>%
summarise(mean_rho = mean(rho), .groups = "drop")
for (i in 1:4) {
cat("rho summary", rho_summary$Season[i], ": ",
round(rho_summary$mean_rho[i], 1), " km\n", sep = "")
}
## rho summary1: 151.4 km
## rho summary2: 124.3 km
## rho summary3: 131.6 km
## rho summary4: 91.6 km
measure of distance of spatial correlation rho = “How far apart can two locations be before their densities become uncorrelated?”
rho = 90 km means: - Two locations 20 km apart: HIGHLY correlated (similar density) - Two locations 90 km apart: MODERATELY correlated (the threshold) - Two locations 200 km apart: UNCORRELATED (independent)
Bigger rho = smoother surface (correlation extends further) Smaller rho = bumpier surface (density varies on shorter scales)
interpretation:
Winter (151 km) - Most Dispersed: Whales are spread out in small groups across the SCB Density varies on large spatial scales Fewer distinct hotspots, more gradual gradients
Spring (124 km) - Intermediate: Some aggregation but not as tight as Fall. Similar to winter/summer.
Summer (131 km) - Intermediate: Some aggregation but not as tight as Fall. Similar to winter/spring. However, we know that there are WAY more whales during this time of year. Interesting that spatial correlation remains similar. High densities are kind of all over the place cause there’s so many whales?
Fall (92 km) - Most Localized: Fin whales appear to be in more concentrated hotspots Density varies on smaller spatial scales local aggregations separated by empty areas Matches the acoustics (lots of 20Hz in specific areas!)
# Print summary for key parameters
print(fit, pars = c("mu_intercept", "gp_alpha", "rho",
"beta_20", "beta_40", "mu_group_size", "exp_group_size",
"phi_v", "phi_20", "phi_40",
"mean_density_season", "spatial_sd"))
## Inference for Stan model: anon_model.
## 4 chains, each with iter=1000; warmup=500; thin=1;
## post-warmup draws per chain=500, total post-warmup draws=2000.
##
## mean se_mean sd 2.5% 25% 50% 75% 97.5%
## mu_intercept[1] -7.41 0.01 0.23 -7.86 -7.56 -7.40 -7.25 -6.96
## mu_intercept[2] -7.40 0.01 0.24 -7.87 -7.55 -7.39 -7.24 -6.95
## mu_intercept[3] -6.64 0.00 0.17 -6.99 -6.75 -6.64 -6.53 -6.32
## mu_intercept[4] -6.98 0.01 0.23 -7.46 -7.14 -6.98 -6.83 -6.54
## gp_alpha[1] 1.31 0.02 0.43 0.75 1.02 1.23 1.50 2.35
## gp_alpha[2] 1.58 0.01 0.48 0.92 1.25 1.49 1.80 2.63
## gp_alpha[3] 1.89 0.02 0.61 1.14 1.49 1.77 2.14 3.41
## gp_alpha[4] 1.83 0.02 0.52 1.05 1.46 1.76 2.11 3.10
## rho[1] 151.43 2.31 59.88 68.45 112.92 139.16 176.56 304.06
## rho[2] 124.34 1.89 45.19 55.74 95.61 117.66 144.51 247.36
## rho[3] 131.56 1.19 30.47 79.30 111.19 128.66 149.22 198.02
## rho[4] 91.63 1.05 26.65 43.73 73.60 90.14 107.55 145.28
## beta_20[1] 0.30 0.00 0.11 0.15 0.23 0.28 0.36 0.56
## beta_20[2] 0.15 0.00 0.05 0.08 0.11 0.14 0.17 0.27
## beta_20[3] 0.01 0.00 0.00 0.01 0.01 0.01 0.02 0.02
## beta_20[4] 0.31 0.00 0.10 0.17 0.24 0.30 0.37 0.54
## beta_40[1] 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.01
## beta_40[2] 0.01 0.00 0.00 0.00 0.00 0.01 0.01 0.01
## beta_40[3] 0.01 0.00 0.00 0.00 0.01 0.01 0.01 0.02
## beta_40[4] 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
## mu_group_size 0.93 0.00 0.06 0.82 0.89 0.93 0.97 1.05
## exp_group_size 1.93 0.00 0.06 1.82 1.89 1.93 1.97 2.05
## phi_v 0.11 0.00 0.02 0.08 0.09 0.11 0.12 0.15
## phi_20 0.03 0.00 0.00 0.03 0.03 0.03 0.03 0.04
## phi_40 0.02 0.00 0.00 0.02 0.02 0.02 0.02 0.02
## mean_density_season[1] 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
## mean_density_season[2] 0.00 0.00 0.13 0.00 0.00 0.00 0.00 0.00
## mean_density_season[3] 0.01 0.00 0.03 0.00 0.00 0.00 0.01 0.03
## mean_density_season[4] 0.00 0.00 0.01 0.00 0.00 0.00 0.00 0.01
## spatial_sd[1] 1.21 0.01 0.27 0.78 1.03 1.18 1.34 1.84
## spatial_sd[2] 1.47 0.01 0.34 0.98 1.24 1.42 1.64 2.24
## spatial_sd[3] 1.74 0.02 0.41 1.16 1.46 1.67 1.95 2.78
## spatial_sd[4] 1.64 0.01 0.38 1.01 1.36 1.60 1.86 2.50
## n_eff Rhat
## mu_intercept[1] 1715 1.00
## mu_intercept[2] 1249 1.00
## mu_intercept[3] 1831 1.00
## mu_intercept[4] 1284 1.00
## gp_alpha[1] 703 1.01
## gp_alpha[2] 1049 1.00
## gp_alpha[3] 591 1.01
## gp_alpha[4] 611 1.01
## rho[1] 669 1.00
## rho[2] 571 1.01
## rho[3] 652 1.01
## rho[4] 647 1.00
## beta_20[1] 1579 1.00
## beta_20[2] 1686 1.00
## beta_20[3] 2308 1.00
## beta_20[4] 1962 1.00
## beta_40[1] 2028 1.00
## beta_40[2] 1995 1.00
## beta_40[3] 2014 1.00
## beta_40[4] 1969 1.00
## mu_group_size 3142 1.00
## exp_group_size 3142 1.00
## phi_v 2552 1.00
## phi_20 3670 1.00
## phi_40 3426 1.00
## mean_density_season[1] 1073 1.00
## mean_density_season[2] 2008 1.00
## mean_density_season[3] 1312 1.00
## mean_density_season[4] 1643 1.00
## spatial_sd[1] 1228 1.00
## spatial_sd[2] 1090 1.00
## spatial_sd[3] 710 1.01
## spatial_sd[4] 842 1.00
##
## Samples were drawn using NUTS(diag_e) at Wed Mar 11 01:41:04 2026.
## For each parameter, n_eff is a crude measure of effective sample size,
## and Rhat is the potential scale reduction factor on split chains (at
## convergence, Rhat=1).
cat("\nParameters with Rhat > 1.005:", length(rhat_issues), "\n")
##
## Parameters with Rhat > 1.005: 0
if (length(rhat_issues) > 0) {
cat("\nProblematic parameters:\n")
print(summary_df[rhat_issues, c("mean", "sd", "Rhat", "n_eff")])
} else {
cat("✓ All Rhat < 1.005 - convergence!\n")
}
## ✓ All Rhat < 1.005 - convergence!
# Rhat histogram
ggplot(data.frame(Rhat = summary_df$Rhat), aes(x = Rhat)) +
geom_histogram(bins = 50, fill = "steelblue", alpha = 0.7) +
geom_vline(xintercept = 1.005, color = "red", linetype = "dashed") +
labs(title = "Rhat Distribution Across All Parameters",
subtitle = "Red line at 1.005 threshold",
x = "Rhat", y = "Count") +
theme_minimal()
R_hat - a measure of chain equilibrium, must be within 0.05 of 1.0 looks good
ggplot(data.frame(n_eff = summary_df$n_eff), aes(x = n_eff)) +
geom_histogram(bins = 50, fill = "darkgreen", alpha = 0.7) +
geom_vline(xintercept = 400, color = "red", linetype = "dashed") +
labs(title = "Effective Sample Size Distribution",
subtitle = "Red line at 400",
x = "Effective Sample Size", y = "Count") +
theme_minimal()
I ran: 4 chains × 500 post-warmup iterations = 2000 total samples
N_eff - effective sample size - the number of independent draws in the sample
n_eff ≈ 2000 means: - almost NO autocorrelation in the chains - Each iteration is providing independent information - good mixing and exploration - efficient sampling
mcmc_trace(fit, pars = c("mu_intercept[1]", "mu_intercept[2]",
"mu_intercept[3]", "mu_intercept[4]",
"gp_alpha[1]", "gp_alpha[2]",
"gp_alpha[3]", "gp_alpha[4]")) +
labs(title = "Trace Plots - Density Intercepts and GP Amplitude") +
theme_minimal()
All chains mixed together, can’t tell them apart Looks like thick fuzzy horizontal bands No trends, no patterns, just random wiggling around a stable value. Looks good.
Example: ───────────────── ← all 4 chains overlapping ───────────────── ─────────────────
mcmc_trace(fit, pars = c("rho[1]", "rho[2]", "rho[3]", "rho[4]",
"beta_20[1]", "beta_20[2]", "beta_20[3]", "beta_20[4]",
"beta_40[1]", "beta_40[2]", "beta_40[3]", "beta_40[4]")) +
labs(title = "Trace Plots - GP Length Scale and Call Rates") +
theme_minimal()