Emergent constraint on beta-GPP

Code for this is here.

library(tidyverse)

Read data

Data obtained by Trevor by email 7.3.2023. This excludes the results from the VEGAS model in order to reproduce results in the submitted version of the manuscript (for Nature CC).

## NEW (NATURE CC) VERSION
df <- read_csv("../data/SlandAndBetas.csv") |> 
  rename(model = modelNames, sland = Sland_Cumulative, beta = betaGPP) |> 
  
  # # remove to reproduce exact numbers as in submitted ms
  # filter(!(model %in% c("VEGAS"))) |>

  # add RS models' beta
  bind_rows(
    tibble(model = "MODIS", sland = NA, beta = 0.1672)
  ) |> 
  bind_rows(
    tibble(model = "MPI", sland = NA, beta = 0.1625)
  )

Regression

Define prediction error function based on methods in Cox et al., 2018 (and 2013).

Add line for prediction error \(\sigma_f(x)\) manually.

# sum of square errors
linmod <- lm(beta ~ sland, data = df |> drop_na())

# Cox et al., 2013 use N, and not N-1 in denominator of variance. Why?
coxvar <- function(vec){
  vec <- vec[!is.na(vec)]
  sum((vec - mean(vec))^2)/length(vec)
}

# prediction error (sigma_f)
get_prediction_error <- function(x, linmod, x_mean = NA, x_var = NA, use_method = "default"){
  
  if (use_method == "default"){
    # 95% CI (shown by geom_smooth())
    out <- predict(linmod,
                   newdata = data.frame(sland = x),
                   se.fit = TRUE,
                   interval = "none",
                   type = "response")$se.fit
      
  } else if (use_method == "cox18"){
    # Calculate prediction error following Cox et al., 2018, but omitting the '1 + ...' in the square root
    sse <- sum(linmod$residuals^2)
    s_squared <- (1 / (length(linmod$residuals) - 2)) * sse  # Eq. 9
    out <- sqrt(s_squared) * sqrt( 1/length(linmod$residuals) + (x - x_mean)^2 / (length(linmod$residuals) * x_var) )
      
  }

  return(out)
}

# prediction +/- prediction error
predict_range_upper <- function(x, linmod){
  predict(linmod, data.frame(sland = x)) + 
    get_prediction_error(x, 
                         linmod)
}
predict_range_lower <- function(x, linmod){
  predict(linmod, data.frame(sland = x)) -
    get_prediction_error(x, 
                         linmod)
}

# prediction +/- prediction error
predict_range_upper_cox18 <- function(x, linmod){
  predict(linmod, data.frame(sland = x)) + 
    get_prediction_error(x, 
                         linmod,
                         x_mean = mean(x),
                         x_var = coxvar(x),
                         use_method = "cox18")
}
predict_range_lower_cox18 <- function(x, linmod){
  predict(linmod, data.frame(sland = x)) -
    get_prediction_error(x, 
                         linmod,
                         x_mean = mean(x),
                         x_var = coxvar(x),
                         use_method = "cox18")
}

# black line: prediction error based on built-in function
# blue line: prediction error based on Cox et al., 2018 (without the 1 + ...)
df |> 
  ggplot(aes(sland, beta)) + 
  geom_point() +
  geom_smooth(method = "lm", color = "red", se = FALSE) +
  geom_function(fun = function(x) predict_range_upper(x, linmod = linmod)) +
  geom_function(fun = function(x) predict_range_lower(x, linmod = linmod)) +
  geom_function(fun = function(x) predict_range_upper_cox18(x, linmod = linmod), color = "royalblue", linetype = "dashed") +
  geom_function(fun = function(x) predict_range_lower_cox18(x, linmod = linmod), color = "royalblue", linetype = "dashed") +
  theme_classic()

## Warning: Removed 2 rows containing non-finite values (stat_smooth).

## Warning: Removed 2 rows containing missing values (geom_point).

# get metrics
out <- rbeni::analyse_modobs2(df, "sland", "beta")
out$df_metrics |> filter(.metric == "cor")

## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 cor     standard       0.702

Issues:

Cox et al. (2013 and 2018) use \(N\), and not \(N-1\) in denominator of variance. This is because you can use N when you are calculating exactly the variance of a known population, and use N-1 when estimating the variance of a population based on a sample (thanks, Trevor).
Cox et al. (2013 and 2018) use \(1 + ...\) in the formula for \(\sigma_f(x)\). Unclear why. This is omitted here.

Distribution of S_land

The land sink is assumed to be normally distributed with \(\mu\) = 92.13 and \(\sigma\) = 17.04.

prob_x <- function(x){
  dnorm(x, mean = 92.13, sd = 17.04)
}

Constrained probability distribution

\[ P(y) = \int_{-\infty }^{+\infty} P(y|x) P(x) \; dx \]

# P(y|x) is a Gaussian function with mean = prediction and variance = prediction error
prob_y_given_x <- function(x, y, x_mean, x_var, linmod){
  
  # out <- dnorm(y, 
  #              mean = predict(linmod, data.frame(sland = x)),
  #              sd = get_prediction_error(x, x_mean, x_var, linmod))
  
  # or "manually":
  f_x <- predict(linmod, data.frame(sland = x))
  sigma_f <- get_prediction_error(x, linmod, x_mean, x_var, use_method = "cox18")
  out <- (1 / sqrt(2 * pi * sigma_f^2)) * exp(-(y - f_x)^2 / (2 * sigma_f^2))
  
  return(out)
}

d_constrained_prob <- function(x, y, x_mean, x_var, linmod){
  prob_y_given_x(x, y, x_mean, x_var, linmod) * prob_x(x)
}

constrained_prob <- function(y, x_mean, x_var, linmod){
  integrate(d_constrained_prob, 
            lower = 0, 
            upper = 200, 
            y = y, 
            x_mean = x_mean, 
            x_var = x_var, 
            linmod = linmod
            )$value
}

Issues:

Cox et al. 2018 use the following for the constrained probability. Note: The variance is used in the denominator of the exponent, not the standard deviation (as probably erroneously described in Cox et al., 2013): \[ P(y|x) = \frac{1}{\sqrt{2\pi \sigma_f^2}} \; \exp{\left(-\frac{(y-f(x))^2}{2\sigma_f^2}\right)} \]

Plot unconstrained (black) and constrained (brownish) probability distributions.

# create values for constrained probability
df_prob <- tibble(y = seq(from = 0, to = 1, by = 0.001)) |> 
  rowwise() |> 
  mutate(prob_y = constrained_prob(y, 
                                   x_mean = mean(df$sland, na.rm = TRUE), 
                                   x_var = var(df$sland, na.rm = TRUE), 
                                   linmod = linmod))

df |> 
  ggplot() +
  geom_histogram(aes(beta, ..density..), 
                 bins = 12, 
                 color = "black", 
                 fill = "grey70") +
  stat_function(fun = dnorm, 
                args = list(mean = mean(df$beta, na.rm = TRUE), 
                            sd = sd(df$beta, na.rm = TRUE)),
                color = "grey40", 
                linetype = "dashed") +
  stat_function(fun = dnorm, 
                args = list(mean = mean(df |> 
                                          drop_na() |> 
                                          pull(beta)
                                        , na.rm = TRUE), 
                            sd = sd(df |> 
                                          drop_na() |> 
                                          pull(beta)
                                        , na.rm = TRUE)),
                color = "black") +
  geom_line(data = df_prob, aes(y, prob_y), color = "darkgoldenrod") +
  theme_classic() +
  xlim(0,1)

## Warning: Removed 2 rows containing missing values (geom_bar).

Bootstrapping

nboot <- 5000

# sample Sland
vec_sland <- rnorm(nboot, mean = 92.13, sd = 17.04)

# function to generate a sample of predictions, given Sland
get_vec_beta_given_sland <- function(x, n_beta){
  
  # predict beta, given sland
  pred_beta <- predict(linmod,
    newdata = data.frame(sland = x),
    se.fit = TRUE, 
    level = 0.95,
    interval = "none",
    type = "response"
    )
  
  # generate vector of predictions
  rnorm(n_beta, mean = pred_beta$fit, sd = pred_beta$se.fit)
}

# generate beta-prediction samples of Sland samples and convert to vector
vec_beta <- purrr::map(as.list(vec_sland),
                       ~get_vec_beta_given_sland(., n_beta = nboot)) |> 
  unlist()

Plot bootstrapped: Brownish is constrained distribution of predictions. Solid brownish line is the based on the bootstrapping, dotted is the quasi-analytical curve (same as in plot above).

df |> 
  ggplot() +
  geom_histogram(aes(beta, ..density..), 
                 bins = 12, 
                 color = "black", 
                 fill = "grey70") +
  stat_function(fun = dnorm, 
                args = list(mean = mean(df$beta, na.rm = TRUE), 
                            sd = sd(df$beta, na.rm = TRUE)),
                color = "grey40", 
                linetype = "dashed") +
  stat_function(fun = dnorm, 
                args = list(mean = mean(df |> 
                                          drop_na() |> 
                                          pull(beta)
                                        , na.rm = TRUE), 
                            sd = sd(df |> 
                                          drop_na() |> 
                                          pull(beta)
                                        , na.rm = TRUE)),
                color = "black") +
  geom_density(data = tibble(beta = vec_beta), aes(beta, ..density..), color = "darkgoldenrod") +
  geom_line(data = df_prob, aes(y, prob_y), color = "darkgoldenrod4", linetype = "dotted") +
  theme_classic() +
  xlim(0,1)

## Warning: Removed 3129 rows containing non-finite values (stat_density).

## Warning: Removed 2 rows containing missing values (geom_bar).

The bootstrapping-based approach is perfectly consistent with the quasi-analytical approach using integrate(), and using the Cox et al. (2018) formulae (but omitting the ‘1 + …’!) for getting the prediction error.