Step 1: Generate Dataframe
Concentration and effect data was pulled from Berninger and Tillitt (2019). Wet weight concentrations were converted to lipid weight based on the following:
1. For eggs lipid concentration was assumed to be 100%
2. For the one sample where lipid-normalized concentration reported, that concentration was used.
3. For species typically used for OECD test guideline 305, the 5% lipid content was used
4. For all other species a lipid range of 0.5-20% was used to span the maximum potential range
3 and 4 above were based on the following:
library(readxl)
library(DT)
my_data <- read_excel("C:/Users/faberma/Desktop/Salmon Pop Modeling/Berninger and Tillit SI Data_for upload.xlsx")
DT::datatable(my_data)# Re-create the dataset
df <- tribble(
~lipid_conc_high, ~lipid_conc_low, ~lipid_conc, ~effect,
52, 1.3, NA, 92,
150, 3.75, NA, 57,
306, 7.65, NA, 32,
420, 10.5, NA, 44,
NA, NA, 54, 18,
NA, NA, 5.1, 22,
1020, 25.5, NA, 30,
1200, 30, NA, 5.8,
1320, 33, NA, 13,
2800, 70, NA, 66,
2850, 71.25, NA, 54,
3000, 75, NA, 28,
NA, NA, 346, 50,
9200, 230, NA, 51,
15580, 389.5, NA, 92,
21200, 530, NA, 47,
NA, NA, 2400, 44,
NA, NA, 2400, 31,
25000, 625, NA, 18,
NA, NA, 2760, 25,
34000, 850, NA, 83,
40000, 1000, NA, 82,
50000, 1250, NA, 50,
NA, NA, 5200, 69,
60000, 1500, NA, 95,
73000, 1825, NA, 91,
129000, 3225, NA, 100,
NA, NA, 15900, 100,
220000, 5500, NA, 88,
NA, NA, 59000, 100,
NA, NA, 2.8, 26,
420, 10.5, NA, 12,
570, 14.25, NA, 12.5,
600, 15, NA, 16.9,
NA, NA, 3.5, 4.1,
1330, 33.25, NA, 26,
2866, 71.65, NA, 39,
NA, NA, 720, 20,
8600, 215, NA, 25,
10000, 250, NA, 33,
10000, 250, NA, 18,
NA, NA, 1200, 50,
14200, 355, NA, 23,
NA, NA, 2400, 66,
NA, NA, 2400, 11,
NA, NA, 3000, 39,
NA, NA, 5200, 30
)Step 2: For each concentration with a range of lipid contents, perform bootstrapping by randomly sampling values within the range (1000 iterations) and fit a regression model for each resulting dataset.
get_sampled_conc <- function(low, high, fixed) {
if (!is.na(fixed)) {
return(fixed)
} else if (!is.na(low) & !is.na(high)) {
return(runif(1, low, high))
} else {
return(NA)
}
}
n_iter <- 1000
results <- vector("list", n_iter)
sampled_points_list <- vector("list", n_iter)
set.seed(915)
for (i in seq_len(n_iter)) {
sampled_df <- df %>%
rowwise() %>%
mutate(concentration = get_sampled_conc(lipid_conc_low, lipid_conc_high, lipid_conc)) %>%
ungroup() %>%
filter(!is.na(concentration), !is.na(effect))
model <- lm(effect ~ log(concentration), data = sampled_df)
summary_model <- summary(model)
results[[i]] <- list(
intercept = coef(model)[1],
slope = coef(model)[2],
r_squared = summary_model$adj.r.squared,
p_value = coef(summary_model)[2, 4]
)
sampled_points_list[[i]] <- sampled_df %>%
mutate(log_conc = log10(concentration),
iteration = i) %>%
select(iteration, log_conc, effect)
}
results_df <- bind_rows(results)
sampled_points_df <- bind_rows(sampled_points_list)
summary_stats <- results_df %>%
summarise(
mean_intercept = mean(intercept),
mean_slope = mean(slope),
mean_r2 = mean(r_squared),
mean_p = mean(p_value),
sd_r2 = sd(r_squared),
sd_p = sd(p_value)
)
kable(summary_stats, digits = 4, caption = "Summary Statistics of Bootstrapped Regressions")| mean_intercept | mean_slope | mean_r2 | mean_p | sd_r2 | sd_p |
|---|---|---|---|---|---|
| 2.2117 | 5.9756 | 0.2527 | 4e-04 | 0.0363 | 6e-04 |
Step 3: Plot distribution of key summary statistics
ggplot(results_df, aes(x = r_squared)) +
geom_histogram(bins = 30, fill = "steelblue", color = "white") +
labs(title = "Distribution of R² Values", x = "R²", y = "Count")ggplot(results_df, aes(x = p_value)) +
geom_histogram(bins = 30, fill = "purple", color = "white") +
labs(title = "Distribution of p-values", x = "p-value", y = "Count")Step 4: Plot each regression
ggplot() +
geom_point(data = sampled_points_df, aes(x = log_conc, y = effect), alpha = 0.05, color = "gray40") +
geom_abline(data = results_df, aes(intercept = intercept, slope = slope),
color = "#1f77b4", alpha = 0.1) +
geom_abline(slope = 22.1, intercept = -12.53, color = "red", linetype = "dashed",
cex = 1) +
labs(
title = "Bootstrapped Regressions and Sampled Points",
x = "log10(Lipid Concentration)",
y = "Effect"
) +
theme_minimal()The red line represents the current lipid-normalized regression model. The blue lines show the bootstrapped regressions generated by randomly sampling from the available lipid concentration ranges. Points with horizontal scatter indicate values sampled from a range of possible lipid concentrations, while points without scatter had a single reported value (see Table 1).
A few thoughts:
It looks like the horizontal noise is flattening the slope compared to the slope we’re currently using. Or maybe the data points don’t meet the assumptions of a linear regression (heteroscedastic) and the model is compensating? If this could be the case, not sure how we would check the residuals vs. fitted values per each iteration. Or maybe I’m doing something wrong?