When reading the files you sent me, some data is missing.

source sland beta_gpp beta_reco nrf
CABLE 3.875213 0.8191375 0.8025959 8.7791036
CLASS_CTEM 3.204068 0.4930046 0.4030876 0.9971417
CLM 2.240896 0.3903936 0.3515676 2.7268429
DLEM 2.288555 0.6974570 0.6394701 2.3818175
ISAM 2.291047 0.6654760 0.6007053 0.5101950
JSBACH 3.038597 0.6165090 0.5579764 6.3021586
JULES 2.731302 0.6043650 0.5205092 -0.6808847
LPJ_GUESS 2.063041 0.5558467 NA NA
LPJ 1.883699 0.4430937 0.4283084 NA
LPX 1.226619 0.4430766 0.3732446 17.1920030
ORCHIDEE_MICT 2.938975 0.5963677 0.5206025 4.6774682
ORCHIDEE 2.498581 0.4696402 0.4305576 8.3658094
VEGAS NA 0.2253959 0.1916185 NA
VISIT 3.047290 0.5818867 0.4834557 4.5331730
GCPresidualSink 2.549365 NA NA NA

Olot original Beta-GPP vs the sink. Note that this displays the 95% confidence interval to be somewhat larger than in ED Fig. 1.

df %>% 
    ggplot(aes(sland, beta_gpp)) +
    geom_point() +
    geom_smooth(method = "lm", color = "red", level = 0.95) +
    theme_classic()

Fit a linear regression model. Note that the R-squared is 0.95, not 0.99 as in the paper.

linmod <- lm(sland ~ beta_gpp + beta_reco * nrf,
             data = df %>% 
                 filter(source != "GCPresidualSink")
             )
summary(linmod)
## 
## Call:
## lm(formula = sland ~ beta_gpp + beta_reco * nrf, data = df %>% 
##     filter(source != "GCPresidualSink"))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.23264 -0.09567  0.01719  0.04165  0.32497 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     3.65371    0.50724   7.203 0.000362 ***
## beta_gpp       12.23562    3.31821   3.687 0.010240 *  
## beta_reco     -15.96950    3.56665  -4.477 0.004204 ** 
## nrf            -0.49814    0.05727  -8.698 0.000128 ***
## beta_reco:nrf   1.05961    0.12854   8.243 0.000172 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1961 on 6 degrees of freedom
##   (3 observations deleted due to missingness)
## Multiple R-squared:  0.9507, Adjusted R-squared:  0.9178 
## F-statistic: 28.91 on 4 and 6 DF,  p-value: 0.0004625

Apply variance normalization on the sink

cf <- coef(linmod)

df <- df %>% 
    mutate(betareco_nrf = beta_reco * nrf) %>% 
    mutate(b_beta_reco = cf["beta_reco"] * beta_reco,
           c_betareco_nrf = cf["beta_reco:nrf"] * betareco_nrf )

b_beta_reco_mean <- mean(df$b_beta_reco, na.rm = TRUE)
c_betareco_nrf_mean <- mean(df$c_betareco_nrf, na.rm = TRUE)

df <- df %>% 
    mutate(sland_norm = sland - ((b_beta_reco + c_betareco_nrf) - (b_beta_reco_mean + c_betareco_nrf_mean)))

Plot normalised sink vs beta-gpp. Doesn’t look the way it should…

df %>% 
    ggplot(aes(sland_norm, beta_gpp)) +
    geom_point() +
    geom_smooth(method = "lm", color = "red", level = 0.95) +
    theme_classic()

Should it also be normalised for the non-respired flux (nrf)?

cf <- coef(linmod)

df <- df %>% 
    mutate(d_nrf = cf["nrf"] * nrf )

d_nrf_mean <- mean(df$d_nrf, na.rm = TRUE)

df <- df %>% 
    mutate(sland_norm2 = sland - ((b_beta_reco + c_betareco_nrf + d_nrf) - (b_beta_reco_mean + c_betareco_nrf_mean + d_nrf_mean)))

Yes!

df %>% 
    ggplot(aes(sland_norm2, beta_gpp)) +
    geom_point() +
    geom_smooth(method = "lm", color = "red", level = 0.95) +
    theme_classic()

That’s the one that gets the R-squared of 0.99:

yardstick::rsq(df, sland_norm2, beta_gpp) %>% pull(.estimate)
## [1] 0.9901483

Are the means of the original and normalised (red) sinks different? No!!!

df %>% 
  ggplot() +
  geom_density(aes(sland, ..density..)) +
  geom_density(aes(sland_norm2, ..density..), color = "red") +
  geom_vline(xintercept = mean(df$sland_norm, na.rm = TRUE), size = 2) +
  geom_vline(xintercept = mean(df$sland_norm2, na.rm = TRUE), color = "red") +
  theme_classic()
## Warning: Removed 1 rows containing non-finite values (stat_density).
## Warning: Removed 4 rows containing non-finite values (stat_density).

Now, perturb the distribution by removing a model (the first one: CABLE). Let’s make this a function.

normalize_sink <- function(df){
  linmod <- lm(sland ~ beta_gpp + beta_reco * nrf,
             data = df %>% 
                 filter(source != "GCPresidualSink")
             )
  cf <- coef(linmod)
  out <- df %>% 
      mutate(betareco_nrf = beta_reco * nrf) %>% 
      mutate(b_beta_reco = cf["beta_reco"] * beta_reco,
             c_betareco_nrf = cf["beta_reco:nrf"] * betareco_nrf,
             d_nrf = cf["nrf"] * nrf)
  
  b_beta_reco_mean <- mean(out$b_beta_reco, na.rm = TRUE)
  c_betareco_nrf_mean <- mean(out$c_betareco_nrf, na.rm = TRUE)
  d_nrf_mean <- mean(out$d_nrf, na.rm = TRUE)

  out <- out %>% 
    mutate(sland_norm2 = sland - ((b_beta_reco + c_betareco_nrf + d_nrf) - (b_beta_reco_mean + c_betareco_nrf_mean + d_nrf_mean)))
  
  return(out)
}

df2 <- df %>% 
  slice(-1) %>%   # perturb sample by removing one model
  normalize_sink()

Plot both distributions and means: it does affect the mean (vertical lines).

ggplot() +
  geom_density(aes(sland_norm2, ..density..), data = df) +
  geom_density(aes(sland_norm2, ..density..), data = df2, color = "red") +
  geom_vline(xintercept = mean(df$sland_norm2, na.rm = TRUE), lintetype = "dashed") +
  geom_vline(xintercept = mean(df2$sland_norm2, na.rm = TRUE), color = "red", lintetype = "dashed") +
  theme_classic()
## Warning: Ignoring unknown parameters: lintetype

## Warning: Ignoring unknown parameters: lintetype
## Warning: Removed 4 rows containing non-finite values (stat_density).

## Warning: Removed 4 rows containing non-finite values (stat_density).