Step 1: Generate Lipid-normalized Dataset (Mortality only)

Concentration and effect data was pulled from Berninger and Tillitt (2019). Wet weight concentrations were converted to lipid weight based on the following:

1. For eggs lipid concentration was assumed to be 100%

2. For the one sample where lipid-normalized concentration reported, that concentration was used.

3. For species typically used for OECD test guideline 305, the 5% lipid content was used

4. For all other species a lipid range of 0.5-20% was used to span the maximum potential range

3 and 4 above were based on the following:

library(readxl)
library(DT)

my_data <- read_excel("C:/Users/faberma/Desktop/Salmon Pop Modeling/Berninger and Tillit SI Data_for upload.xlsx")
DT::datatable(my_data)
# Re-create the dataset (outlier removed as in Berninger and Tillitt (2019))
df <- tribble(
  ~lipid_conc_high, ~lipid_conc_low, ~lipid_conc, ~effect,
  150, 3.75, NA, 57,
  306, 7.65, NA, 32,
  420, 10.5, NA, 44,
  NA, NA, 54, 18,
  NA, NA, 5.1, 22,
  1020, 25.5, NA, 30,
  1200, 30, NA, 5.8,
  1320, 33, NA, 13,
  2800, 70, NA, 66,
  2850, 71.25, NA, 54,
  3000, 75, NA, 28,
  NA, NA, 346, 50,
  9200, 230, NA, 51,
  15580, 389.5, NA, 92,
  21200, 530, NA, 47,
  NA, NA, 2400, 44,
  NA, NA, 2400, 31,
  25000, 625, NA, 18,
  NA, NA, 2760, 25,
  34000, 850, NA, 83,
  40000, 1000, NA, 82,
  50000, 1250, NA, 50,
  NA, NA, 5200, 69,
  60000, 1500, NA, 95,
  73000, 1825, NA, 91,
  129000, 3225, NA, 100,
  NA, NA, 15900, 100,
  220000, 5500, NA, 88,
  NA, NA, 59000, 100
)

Step 2: For each concentration with a range of lipid contents, perform bootstrapping by randomly sampling values within the range (1000 iterations) and fit a regression model for each resulting dataset.

get_sampled_conc <- function(low, high, fixed) {
  if (!is.na(fixed)) {
    return(fixed)
  } else if (!is.na(low) & !is.na(high)) {
    return(runif(1, low, high))
  } else {
    return(NA)
  }
}

n_iter <- 1000
results <- vector("list", n_iter)
sampled_points_list <- vector("list", n_iter)

set.seed(915)

for (i in seq_len(n_iter)) {
  sampled_df <- df %>%
    rowwise() %>%
    mutate(concentration = get_sampled_conc(lipid_conc_low, lipid_conc_high, lipid_conc)) %>%
    ungroup() %>%
    filter(!is.na(concentration), !is.na(effect))
  
  model <- lm(effect ~ log(concentration), data = sampled_df)
  summary_model <- summary(model)
  
  results[[i]] <- list(
    intercept = coef(model)[1],
    slope = coef(model)[2],
    r_squared = summary_model$adj.r.squared,
    p_value = coef(summary_model)[2, 4]
  )
  
  sampled_points_list[[i]] <- sampled_df %>%
    mutate(log_conc = log10(concentration),
           iteration = i) %>%
    select(iteration, log_conc, effect, concentration)
}

results_df <- bind_rows(results)
sampled_points_df <- bind_rows(sampled_points_list)

summary_stats <- results_df %>%
  summarise(
    mean_intercept = mean(intercept),
    mean_slope = mean(slope),
    mean_r2 = mean(r_squared),
    mean_p = mean(p_value),
    sd_r2 = sd(r_squared),
    sd_p = sd(p_value)
  )

kable(summary_stats, digits = 4, caption = "Summary Statistics of Bootstrapped Regressions")
Summary Statistics of Bootstrapped Regressions
mean_intercept mean_slope mean_r2 mean_p sd_r2 sd_p
-7.5971 8.0315 0.4209 2e-04 0.0529 3e-04

Step 3: Plot distribution of key summary statistics

ggplot(results_df, aes(x = r_squared)) +
  geom_histogram(bins = 30, fill = "steelblue", color = "white") +
  labs(title = "Distribution of R² Values", x = "R²", y = "Count")

ggplot(results_df, aes(x = p_value)) +
  geom_histogram(bins = 30, fill = "purple", color = "white") +
  labs(title = "Distribution of p-values", x = "p-value", y = "Count")

Step 4: Plot each regression

ggplot() +
  geom_point(data = sampled_points_df, aes(x = concentration, y = effect), alpha = 0.05, color = "gray40") +
  geom_abline(data = results_df, aes(intercept = intercept, slope = slope),
              color = "#1f77b4", alpha = 0.1) +
  geom_abline(slope = 22.1, intercept = -12.53, color = "red", linetype = "dashed",
              cex = 1) +
  geom_vline(xintercept = 2.4, color = "grey", linetype= "dashed", cex = 1)+
  labs(
    title = "Bootstrapped Regressions and Sampled Points",
    x = "Concentration ug/g lipid weight",
    y = "Effect"
  ) +
     scale_x_continuous(
    trans = "log10",
    limits = c(1, 500000),
    breaks = c(1, 10, 100, 1000, 10000, 100000),
    labels = c("1", "10", "100", "1000", "10000", "100000")
  ) +
  theme_minimal()

The red line represents the current lipid-normalized regression model. The blue lines show the bootstrapped regressions generated by randomly sampling from the available lipid concentration ranges. Verticle grey line is the Meador et al. (2002) lw threshold for sublethal effects. Points with horizontal scatter indicate values sampled from a range of possible lipid concentrations, while points without scatter had a single estimated value (see Table 1). Seems like the current model we’re using (estimated using a 4.6% lipid content across the data) fits the data better.

A few thoughts:

It looks like the horizontal noise is flattening the slope compared to the slope we’re currently using. Or maybe the data points don’t meet the assumptions of a linear regression (heteroscedastic) and the model is compensating? If this could be the case, not sure how we would check the residuals vs. fitted values per each iteration. Or maybe I’m doing something wrong?

Compared to the bootstrapped regressions, the current regression model we’re employing will consistently estimate higher effects. The mean r-squared value for the bootleg regressions (0.42) were similar to the original regression using all raw datapoints (0.46).

As a reminder: Original Wet-Weight Regression

df_original <- read.csv("C:/Users/faberma/Desktop/Salmon Pop Modeling/Original_data.csv")

ggplot() +
  geom_point(data = df_original, aes(x = concentration, y = effect), color = "black") +
  geom_abline(slope = 21.15, intercept = 18.94, color = "red", linetype = "dashed",
             cex = 1) +
  labs(
    title = "Original Berninger and Tillitt Regression",
    x = "Concentration ug/g wet weight",
    y = "Effect"
  ) +
   scale_x_continuous(
    trans = "log10",
    limits = c(0.01, 1000),
    breaks = c(0.01, 0.1, 1, 10, 100, 1000),
    labels = c("0.01", "0.1", "1", "10", "100", "1000")
  ) +
  scale_y_continuous(limits = c(0,100))+
  theme_bw()

The red line is the raw data regression line (not quintile regression line). R2 value was 0.46 for this line.