Code for this is here.

library(tidyverse)

Read data

Data obtained by Trevor by email 7.3.2023. This excludes the results from the VEGAS model in order to reproduce results in the submitted version of the manuscript (for Nature CC).

## NEW (NATURE CC) VERSION
df <- read_csv("../data/SlandAndBetas.csv") |> 
  rename(model = modelNames, sland = Sland_Cumulative, beta = betaGPP) |> 
  
  # # remove to reproduce exact numbers as in submitted ms
  # filter(!(model %in% c("VEGAS"))) |>

  # add RS models' beta
  bind_rows(
    tibble(model = "MODIS", sland = NA, beta = 0.1672)
  ) |> 
  bind_rows(
    tibble(model = "MPI", sland = NA, beta = 0.1625)
  )

Regression

Define prediction error function based on methods in Cox et al., 2018 (and 2013).

Add line for prediction error \(\sigma_f(x)\) manually.

# sum of square errors
linmod <- lm(beta ~ sland, data = df |> drop_na())

# Cox et al., 2013 use N, and not N-1 in denominator of variance. Why?
coxvar <- function(vec){
  vec <- vec[!is.na(vec)]
  sum((vec - mean(vec))^2)/length(vec)
}

# prediction error (sigma_f)
get_prediction_error <- function(x, linmod, x_mean = NA, x_var = NA, use_method = "default"){
  
  if (use_method == "default"){
    # 95% CI (shown by geom_smooth())
    out <- predict(linmod,
                   newdata = data.frame(sland = x),
                   se.fit = TRUE,
                   interval = "none",
                   type = "response")$se.fit
      
  } else if (use_method == "cox18"){
    # Calculate prediction error following Cox et al., 2018, but omitting the '1 + ...' in the square root
    sse <- sum(linmod$residuals^2)
    s_squared <- (1 / (length(linmod$residuals) - 2)) * sse  # Eq. 9
    out <- sqrt(s_squared) * sqrt( 1/length(linmod$residuals) + (x - x_mean)^2 / (length(linmod$residuals) * x_var) )
      
  }

  return(out)
}

# prediction +/- prediction error
predict_range_upper <- function(x, linmod){
  predict(linmod, data.frame(sland = x)) + 
    get_prediction_error(x, 
                         linmod)
}
predict_range_lower <- function(x, linmod){
  predict(linmod, data.frame(sland = x)) -
    get_prediction_error(x, 
                         linmod)
}

# prediction +/- prediction error
predict_range_upper_cox18 <- function(x, linmod){
  predict(linmod, data.frame(sland = x)) + 
    get_prediction_error(x, 
                         linmod,
                         x_mean = mean(x),
                         x_var = coxvar(x),
                         use_method = "cox18")
}
predict_range_lower_cox18 <- function(x, linmod){
  predict(linmod, data.frame(sland = x)) -
    get_prediction_error(x, 
                         linmod,
                         x_mean = mean(x),
                         x_var = coxvar(x),
                         use_method = "cox18")
}

# black line: prediction error based on built-in function
# blue line: prediction error based on Cox et al., 2018 (without the 1 + ...)
df |> 
  ggplot(aes(sland, beta)) + 
  geom_point() +
  geom_smooth(method = "lm", color = "red", se = FALSE) +
  geom_function(fun = function(x) predict_range_upper(x, linmod = linmod)) +
  geom_function(fun = function(x) predict_range_lower(x, linmod = linmod)) +
  geom_function(fun = function(x) predict_range_upper_cox18(x, linmod = linmod), color = "royalblue", linetype = "dashed") +
  geom_function(fun = function(x) predict_range_lower_cox18(x, linmod = linmod), color = "royalblue", linetype = "dashed") +
  theme_classic()
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing missing values (geom_point).

# get metrics
out <- rbeni::analyse_modobs2(df, "sland", "beta")
out$df_metrics |> filter(.metric == "cor")
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 cor     standard       0.702

Issues:

Distribution of S_land

The land sink is assumed to be normally distributed with \(\mu\) = 92.13 and \(\sigma\) = 17.04.

prob_x <- function(x){
  dnorm(x, mean = 92.13, sd = 17.04)
}

Constrained probability distribution

\[ P(y) = \int_{-\infty }^{+\infty} P(y|x) P(x) \; dx \]

# P(y|x) is a Gaussian function with mean = prediction and variance = prediction error
prob_y_given_x <- function(x, y, x_mean, x_var, linmod){
  
  # out <- dnorm(y, 
  #              mean = predict(linmod, data.frame(sland = x)),
  #              sd = get_prediction_error(x, x_mean, x_var, linmod))
  
  # or "manually":
  f_x <- predict(linmod, data.frame(sland = x))
  sigma_f <- get_prediction_error(x, linmod, x_mean, x_var, use_method = "cox18")
  out <- (1 / sqrt(2 * pi * sigma_f^2)) * exp(-(y - f_x)^2 / (2 * sigma_f^2))
  
  return(out)
}

d_constrained_prob <- function(x, y, x_mean, x_var, linmod){
  prob_y_given_x(x, y, x_mean, x_var, linmod) * prob_x(x)
}

constrained_prob <- function(y, x_mean, x_var, linmod){
  integrate(d_constrained_prob, 
            lower = 0, 
            upper = 200, 
            y = y, 
            x_mean = x_mean, 
            x_var = x_var, 
            linmod = linmod
            )$value
}

Issues:

Plot unconstrained (black) and constrained (brownish) probability distributions.

# create values for constrained probability
df_prob <- tibble(y = seq(from = 0, to = 1, by = 0.001)) |> 
  rowwise() |> 
  mutate(prob_y = constrained_prob(y, 
                                   x_mean = mean(df$sland, na.rm = TRUE), 
                                   x_var = var(df$sland, na.rm = TRUE), 
                                   linmod = linmod))

df |> 
  ggplot() +
  geom_histogram(aes(beta, ..density..), 
                 bins = 12, 
                 color = "black", 
                 fill = "grey70") +
  stat_function(fun = dnorm, 
                args = list(mean = mean(df$beta, na.rm = TRUE), 
                            sd = sd(df$beta, na.rm = TRUE)),
                color = "grey40", 
                linetype = "dashed") +
  stat_function(fun = dnorm, 
                args = list(mean = mean(df |> 
                                          drop_na() |> 
                                          pull(beta)
                                        , na.rm = TRUE), 
                            sd = sd(df |> 
                                          drop_na() |> 
                                          pull(beta)
                                        , na.rm = TRUE)),
                color = "black") +
  geom_line(data = df_prob, aes(y, prob_y), color = "darkgoldenrod") +
  theme_classic() +
  xlim(0,1)
## Warning: Removed 2 rows containing missing values (geom_bar).

Bootstrapping

nboot <- 5000

# sample Sland
vec_sland <- rnorm(nboot, mean = 92.13, sd = 17.04)

# function to generate a sample of predictions, given Sland
get_vec_beta_given_sland <- function(x, n_beta){
  
  # predict beta, given sland
  pred_beta <- predict(linmod,
    newdata = data.frame(sland = x),
    se.fit = TRUE, 
    level = 0.95,
    interval = "none",
    type = "response"
    )
  
  # generate vector of predictions
  rnorm(n_beta, mean = pred_beta$fit, sd = pred_beta$se.fit)
}

# generate beta-prediction samples of Sland samples and convert to vector
vec_beta <- purrr::map(as.list(vec_sland),
                       ~get_vec_beta_given_sland(., n_beta = nboot)) |> 
  unlist()

Plot bootstrapped: Brownish is constrained distribution of predictions. Solid brownish line is the based on the bootstrapping, dotted is the quasi-analytical curve (same as in plot above).

df |> 
  ggplot() +
  geom_histogram(aes(beta, ..density..), 
                 bins = 12, 
                 color = "black", 
                 fill = "grey70") +
  stat_function(fun = dnorm, 
                args = list(mean = mean(df$beta, na.rm = TRUE), 
                            sd = sd(df$beta, na.rm = TRUE)),
                color = "grey40", 
                linetype = "dashed") +
  stat_function(fun = dnorm, 
                args = list(mean = mean(df |> 
                                          drop_na() |> 
                                          pull(beta)
                                        , na.rm = TRUE), 
                            sd = sd(df |> 
                                          drop_na() |> 
                                          pull(beta)
                                        , na.rm = TRUE)),
                color = "black") +
  geom_density(data = tibble(beta = vec_beta), aes(beta, ..density..), color = "darkgoldenrod") +
  geom_line(data = df_prob, aes(y, prob_y), color = "darkgoldenrod4", linetype = "dotted") +
  theme_classic() +
  xlim(0,1)
## Warning: Removed 3129 rows containing non-finite values (stat_density).
## Warning: Removed 2 rows containing missing values (geom_bar).

The bootstrapping-based approach is perfectly consistent with the quasi-analytical approach using integrate(), and using the Cox et al. (2018) formulae (but omitting the ‘1 + …’!) for getting the prediction error.