Code for this is here.
library(tidyverse)
Data obtained by Trevor by email 7.3.2023. This excludes the results from the VEGAS model in order to reproduce results in the submitted version of the manuscript (for Nature CC).
## NEW (NATURE CC) VERSION
df <- read_csv("../data/SlandAndBetas.csv") |>
rename(model = modelNames, sland = Sland_Cumulative, beta = betaGPP) |>
# # remove to reproduce exact numbers as in submitted ms
# filter(!(model %in% c("VEGAS"))) |>
# add RS models' beta
bind_rows(
tibble(model = "MODIS", sland = NA, beta = 0.1672)
) |>
bind_rows(
tibble(model = "MPI", sland = NA, beta = 0.1625)
)
Define prediction error function based on methods in Cox et al., 2018 (and 2013).
Add line for prediction error \(\sigma_f(x)\) manually.
# sum of square errors
linmod <- lm(beta ~ sland, data = df |> drop_na())
# Cox et al., 2013 use N, and not N-1 in denominator of variance. Why?
coxvar <- function(vec){
vec <- vec[!is.na(vec)]
sum((vec - mean(vec))^2)/length(vec)
}
# prediction error (sigma_f)
get_prediction_error <- function(x, linmod, x_mean = NA, x_var = NA, use_method = "default"){
if (use_method == "default"){
# 95% CI (shown by geom_smooth())
out <- predict(linmod,
newdata = data.frame(sland = x),
se.fit = TRUE,
interval = "none",
type = "response")$se.fit
} else if (use_method == "cox18"){
# Calculate prediction error following Cox et al., 2018, but omitting the '1 + ...' in the square root
sse <- sum(linmod$residuals^2)
s_squared <- (1 / (length(linmod$residuals) - 2)) * sse # Eq. 9
out <- sqrt(s_squared) * sqrt( 1/length(linmod$residuals) + (x - x_mean)^2 / (length(linmod$residuals) * x_var) )
}
return(out)
}
# prediction +/- prediction error
predict_range_upper <- function(x, linmod){
predict(linmod, data.frame(sland = x)) +
get_prediction_error(x,
linmod)
}
predict_range_lower <- function(x, linmod){
predict(linmod, data.frame(sland = x)) -
get_prediction_error(x,
linmod)
}
# prediction +/- prediction error
predict_range_upper_cox18 <- function(x, linmod){
predict(linmod, data.frame(sland = x)) +
get_prediction_error(x,
linmod,
x_mean = mean(x),
x_var = coxvar(x),
use_method = "cox18")
}
predict_range_lower_cox18 <- function(x, linmod){
predict(linmod, data.frame(sland = x)) -
get_prediction_error(x,
linmod,
x_mean = mean(x),
x_var = coxvar(x),
use_method = "cox18")
}
# black line: prediction error based on built-in function
# blue line: prediction error based on Cox et al., 2018 (without the 1 + ...)
df |>
ggplot(aes(sland, beta)) +
geom_point() +
geom_smooth(method = "lm", color = "red", se = FALSE) +
geom_function(fun = function(x) predict_range_upper(x, linmod = linmod)) +
geom_function(fun = function(x) predict_range_lower(x, linmod = linmod)) +
geom_function(fun = function(x) predict_range_upper_cox18(x, linmod = linmod), color = "royalblue", linetype = "dashed") +
geom_function(fun = function(x) predict_range_lower_cox18(x, linmod = linmod), color = "royalblue", linetype = "dashed") +
theme_classic()
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing missing values (geom_point).
# get metrics
out <- rbeni::analyse_modobs2(df, "sland", "beta")
out$df_metrics |> filter(.metric == "cor")
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 cor standard 0.702
Issues:
The land sink is assumed to be normally distributed with \(\mu\) = 92.13 and \(\sigma\) = 17.04.
prob_x <- function(x){
dnorm(x, mean = 92.13, sd = 17.04)
}
\[ P(y) = \int_{-\infty }^{+\infty} P(y|x) P(x) \; dx \]
# P(y|x) is a Gaussian function with mean = prediction and variance = prediction error
prob_y_given_x <- function(x, y, x_mean, x_var, linmod){
# out <- dnorm(y,
# mean = predict(linmod, data.frame(sland = x)),
# sd = get_prediction_error(x, x_mean, x_var, linmod))
# or "manually":
f_x <- predict(linmod, data.frame(sland = x))
sigma_f <- get_prediction_error(x, linmod, x_mean, x_var, use_method = "cox18")
out <- (1 / sqrt(2 * pi * sigma_f^2)) * exp(-(y - f_x)^2 / (2 * sigma_f^2))
return(out)
}
d_constrained_prob <- function(x, y, x_mean, x_var, linmod){
prob_y_given_x(x, y, x_mean, x_var, linmod) * prob_x(x)
}
constrained_prob <- function(y, x_mean, x_var, linmod){
integrate(d_constrained_prob,
lower = 0,
upper = 200,
y = y,
x_mean = x_mean,
x_var = x_var,
linmod = linmod
)$value
}
Issues:
Plot unconstrained (black) and constrained (brownish) probability distributions.
# create values for constrained probability
df_prob <- tibble(y = seq(from = 0, to = 1, by = 0.001)) |>
rowwise() |>
mutate(prob_y = constrained_prob(y,
x_mean = mean(df$sland, na.rm = TRUE),
x_var = var(df$sland, na.rm = TRUE),
linmod = linmod))
df |>
ggplot() +
geom_histogram(aes(beta, ..density..),
bins = 12,
color = "black",
fill = "grey70") +
stat_function(fun = dnorm,
args = list(mean = mean(df$beta, na.rm = TRUE),
sd = sd(df$beta, na.rm = TRUE)),
color = "grey40",
linetype = "dashed") +
stat_function(fun = dnorm,
args = list(mean = mean(df |>
drop_na() |>
pull(beta)
, na.rm = TRUE),
sd = sd(df |>
drop_na() |>
pull(beta)
, na.rm = TRUE)),
color = "black") +
geom_line(data = df_prob, aes(y, prob_y), color = "darkgoldenrod") +
theme_classic() +
xlim(0,1)
## Warning: Removed 2 rows containing missing values (geom_bar).
nboot <- 5000
# sample Sland
vec_sland <- rnorm(nboot, mean = 92.13, sd = 17.04)
# function to generate a sample of predictions, given Sland
get_vec_beta_given_sland <- function(x, n_beta){
# predict beta, given sland
pred_beta <- predict(linmod,
newdata = data.frame(sland = x),
se.fit = TRUE,
level = 0.95,
interval = "none",
type = "response"
)
# generate vector of predictions
rnorm(n_beta, mean = pred_beta$fit, sd = pred_beta$se.fit)
}
# generate beta-prediction samples of Sland samples and convert to vector
vec_beta <- purrr::map(as.list(vec_sland),
~get_vec_beta_given_sland(., n_beta = nboot)) |>
unlist()
Plot bootstrapped: Brownish is constrained distribution of predictions. Solid brownish line is the based on the bootstrapping, dotted is the quasi-analytical curve (same as in plot above).
df |>
ggplot() +
geom_histogram(aes(beta, ..density..),
bins = 12,
color = "black",
fill = "grey70") +
stat_function(fun = dnorm,
args = list(mean = mean(df$beta, na.rm = TRUE),
sd = sd(df$beta, na.rm = TRUE)),
color = "grey40",
linetype = "dashed") +
stat_function(fun = dnorm,
args = list(mean = mean(df |>
drop_na() |>
pull(beta)
, na.rm = TRUE),
sd = sd(df |>
drop_na() |>
pull(beta)
, na.rm = TRUE)),
color = "black") +
geom_density(data = tibble(beta = vec_beta), aes(beta, ..density..), color = "darkgoldenrod") +
geom_line(data = df_prob, aes(y, prob_y), color = "darkgoldenrod4", linetype = "dotted") +
theme_classic() +
xlim(0,1)
## Warning: Removed 3129 rows containing non-finite values (stat_density).
## Warning: Removed 2 rows containing missing values (geom_bar).
The bootstrapping-based approach is perfectly consistent with the
quasi-analytical approach using integrate()
, and using the
Cox et al. (2018) formulae (but omitting the ‘1 + …’!) for getting the
prediction error.