Sink normalization

When reading the files you sent me, some data is missing.

source	sland	beta_gpp	beta_reco	nrf
CABLE	3.875213	0.8191375	0.8025959	8.7791036
CLASS_CTEM	3.204068	0.4930046	0.4030876	0.9971417
CLM	2.240896	0.3903936	0.3515676	2.7268429
DLEM	2.288555	0.6974570	0.6394701	2.3818175
ISAM	2.291047	0.6654760	0.6007053	0.5101950
JSBACH	3.038597	0.6165090	0.5579764	6.3021586
JULES	2.731302	0.6043650	0.5205092	-0.6808847
LPJ_GUESS	2.063041	0.5558467	NA	NA
LPJ	1.883699	0.4430937	0.4283084	NA
LPX	1.226619	0.4430766	0.3732446	17.1920030
ORCHIDEE_MICT	2.938975	0.5963677	0.5206025	4.6774682
ORCHIDEE	2.498581	0.4696402	0.4305576	8.3658094
VEGAS	NA	0.2253959	0.1916185	NA
VISIT	3.047290	0.5818867	0.4834557	4.5331730
GCPresidualSink	2.549365	NA	NA	NA

Olot original Beta-GPP vs the sink. Note that this displays the 95% confidence interval to be somewhat larger than in ED Fig. 1.

df %>% 
    ggplot(aes(sland, beta_gpp)) +
    geom_point() +
    geom_smooth(method = "lm", color = "red", level = 0.95) +
    theme_classic()

Fit a linear regression model. Note that the R-squared is 0.95, not 0.99 as in the paper.

linmod <- lm(sland ~ beta_gpp + beta_reco * nrf,
             data = df %>% 
                 filter(source != "GCPresidualSink")
             )
summary(linmod)

## 
## Call:
## lm(formula = sland ~ beta_gpp + beta_reco * nrf, data = df %>% 
##     filter(source != "GCPresidualSink"))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.23264 -0.09567  0.01719  0.04165  0.32497 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     3.65371    0.50724   7.203 0.000362 ***
## beta_gpp       12.23562    3.31821   3.687 0.010240 *  
## beta_reco     -15.96950    3.56665  -4.477 0.004204 ** 
## nrf            -0.49814    0.05727  -8.698 0.000128 ***
## beta_reco:nrf   1.05961    0.12854   8.243 0.000172 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1961 on 6 degrees of freedom
##   (3 observations deleted due to missingness)
## Multiple R-squared:  0.9507, Adjusted R-squared:  0.9178 
## F-statistic: 28.91 on 4 and 6 DF,  p-value: 0.0004625

Apply variance normalization on the sink

cf <- coef(linmod)

df <- df %>% 
    mutate(betareco_nrf = beta_reco * nrf) %>% 
    mutate(b_beta_reco = cf["beta_reco"] * beta_reco,
           c_betareco_nrf = cf["beta_reco:nrf"] * betareco_nrf )

b_beta_reco_mean <- mean(df$b_beta_reco, na.rm = TRUE)
c_betareco_nrf_mean <- mean(df$c_betareco_nrf, na.rm = TRUE)

df <- df %>% 
    mutate(sland_norm = sland - ((b_beta_reco + c_betareco_nrf) - (b_beta_reco_mean + c_betareco_nrf_mean)))

Plot normalised sink vs beta-gpp. Doesn’t look the way it should…

df %>% 
    ggplot(aes(sland_norm, beta_gpp)) +
    geom_point() +
    geom_smooth(method = "lm", color = "red", level = 0.95) +
    theme_classic()

Should it also be normalised for the non-respired flux (nrf)?

cf <- coef(linmod)

df <- df %>% 
    mutate(d_nrf = cf["nrf"] * nrf )

d_nrf_mean <- mean(df$d_nrf, na.rm = TRUE)

df <- df %>% 
    mutate(sland_norm2 = sland - ((b_beta_reco + c_betareco_nrf + d_nrf) - (b_beta_reco_mean + c_betareco_nrf_mean + d_nrf_mean)))

Yes!

df %>% 
    ggplot(aes(sland_norm2, beta_gpp)) +
    geom_point() +
    geom_smooth(method = "lm", color = "red", level = 0.95) +
    theme_classic()

That’s the one that gets the R-squared of 0.99:

yardstick::rsq(df, sland_norm2, beta_gpp) %>% pull(.estimate)

## [1] 0.9901483

Are the means of the original and normalised (red) sinks different? No!!!

df %>% 
  ggplot() +
  geom_density(aes(sland, ..density..)) +
  geom_density(aes(sland_norm2, ..density..), color = "red") +
  geom_vline(xintercept = mean(df$sland_norm, na.rm = TRUE), size = 2) +
  geom_vline(xintercept = mean(df$sland_norm2, na.rm = TRUE), color = "red") +
  theme_classic()

## Warning: Removed 1 rows containing non-finite values (stat_density).

## Warning: Removed 4 rows containing non-finite values (stat_density).

Now, perturb the distribution by removing a model (the first one: CABLE). Let’s make this a function.

normalize_sink <- function(df){
  linmod <- lm(sland ~ beta_gpp + beta_reco * nrf,
             data = df %>% 
                 filter(source != "GCPresidualSink")
             )
  cf <- coef(linmod)
  out <- df %>% 
      mutate(betareco_nrf = beta_reco * nrf) %>% 
      mutate(b_beta_reco = cf["beta_reco"] * beta_reco,
             c_betareco_nrf = cf["beta_reco:nrf"] * betareco_nrf,
             d_nrf = cf["nrf"] * nrf)
  
  b_beta_reco_mean <- mean(out$b_beta_reco, na.rm = TRUE)
  c_betareco_nrf_mean <- mean(out$c_betareco_nrf, na.rm = TRUE)
  d_nrf_mean <- mean(out$d_nrf, na.rm = TRUE)

  out <- out %>% 
    mutate(sland_norm2 = sland - ((b_beta_reco + c_betareco_nrf + d_nrf) - (b_beta_reco_mean + c_betareco_nrf_mean + d_nrf_mean)))
  
  return(out)
}

df2 <- df %>% 
  slice(-1) %>%   # perturb sample by removing one model
  normalize_sink()

Plot both distributions and means: it does affect the mean (vertical lines).

ggplot() +
  geom_density(aes(sland_norm2, ..density..), data = df) +
  geom_density(aes(sland_norm2, ..density..), data = df2, color = "red") +
  geom_vline(xintercept = mean(df$sland_norm2, na.rm = TRUE), lintetype = "dashed") +
  geom_vline(xintercept = mean(df2$sland_norm2, na.rm = TRUE), color = "red", lintetype = "dashed") +
  theme_classic()

## Warning: Ignoring unknown parameters: lintetype

## Warning: Ignoring unknown parameters: lintetype

## Warning: Removed 4 rows containing non-finite values (stat_density).

## Warning: Removed 4 rows containing non-finite values (stat_density).

Sink normalization

Beni Stocker

1/28/2022