gsbe

Author

bbl

Code
SSD <- 123
message("set.seed = ", SSD)
set.seed = 123
Code
set.seed(SSD)

dat <- read_csv("srdb_joined_data.csv", show_col_types = FALSE)

message(nrow(dat), " rows of data")
4903 rows of data
Code
dat %>% 
    filter(is.na(Rs_growingseason) | (Rs_growingseason > 0 & Rs_growingseason < 20)) %>% 
    filter(is.na(Rs_annual) | (Rs_annual > 0 & Rs_annual < 4000)) %>% 
    filter(is.na(Rh_annual) | (Rh_annual > 0 & Rh_annual < 2000)) %>% 
    replace_na(list(Ecosystem_type = "Unknown", Biome = "Unknown")) ->
    dat_filtered

message("After filtering, ", nrow(dat_filtered), " rows of data")
After filtering, 4867 rows of data

Independent variable distribution

We have three potential independent variables:

  • Rs_annual (annual soil respiration)

  • Rh_annual (annual heterotrophic respiration)

  • Rs_growingseason (mean growing season soil respiration)

Their distributions are expected to be non-normal, which is a problem for linear models and not ideal for anything. Evaluate possible transformations.

Code
dat_filtered %>% 
    select(Rs_annual, Rh_annual, Rs_growingseason) %>% 
    pivot_longer(everything()) %>% 
    filter(!is.na(value), value > 0) %>% 
    rename(none = value) %>% 
    mutate(log = log(none), sqrt = sqrt(none)) %>% 
    
    pivot_longer(-name, names_to = "Transformation") %>% 
    ggplot(aes(value, color = Transformation)) + 
    geom_density() +
    facet_wrap(~ name + Transformation, scales = "free")

Code
dat_filtered %>% 
    mutate(sqrt_Rs_annual = sqrt(Rs_annual),
           sqrt_Rh_annual = sqrt(Rh_annual),
           sqrt_Rs_growingseason = sqrt(Rs_growingseason)) ->
    dat_filtered

We have a transformation winner: sqrt()!

Dataset overview

Code
dat_filtered %>% 
    select(Biome, Ecosystem_type, sqrt_Rs_annual, sqrt_Rh_annual, sqrt_Rs_growingseason,
           Tair, Tsoil_lev1, VWC_lev1, Precip, LAI_high, 
           MODIS_NPP, MODIS_GPP) %>% 
    pivot_longer(Tair:MODIS_GPP, names_to = "covariate_name", values_to = "covariate_value") %>%
    pivot_longer(sqrt_Rs_annual:sqrt_Rs_growingseason) %>% 
    ggplot(aes(value, covariate_value, color = Biome)) + 
        facet_grid(covariate_name ~ name, scales = "free") +
        geom_point(na.rm = TRUE)

Code
dat_filtered %>% 
    filter(!Biome == "Semi-arid", !Biome == "Unknown") %>% 
    select(Biome, Ecosystem_type, sqrt_Rs_annual, 
           Tair, Tsoil_lev1, VWC_lev1, Precip, LAI_high) %>% 
    pivot_longer(Tair:LAI_high, names_to = "covariate_name", values_to = "covariate_value") %>% 
    ggplot(aes(sqrt_Rs_annual, covariate_value, color = Biome)) + 
        facet_grid(covariate_name ~ Biome, scales = "free") +
        geom_point(na.rm = TRUE)

Code
stn <- as.factor(dat_filtered$Soil_type_number)
nst <- length(unique(stn))
dat_filtered %>% 
    select(sqrt_Rs_growingseason, sqrt_Rs_annual, sqrt_Rh_annual) %>% 
    pairs(col = hcl.colors(nst, "Temps")[stn])

Test Random Forest model

We are using the ranger package.

Wright, M. N. and Ziegler, A.: Ranger: A fast implementation of random forests for high dimensional data in C++ and R, J. Stat. Softw., 77, https://doi.org/10.18637/jss.v077.i01, 2017.

Code
# Fit random forest model; i.v. in first column
library(ranger)
fit_and_test_rf <- function(x_train, x_val) {
    f <- as.formula(paste(colnames(x_train)[1], "~ ."))
    rf <- ranger::ranger(formula = f, data = x_train, importance = "impurity")
    # use importance = "permutation" for importance_pvalues()
    
    preds <- predict(rf, data = x_val)$predictions
    obs <- pull(x_val[1])
    rss <- sum((preds - obs) ^ 2)
    tss <- sum((obs - mean(obs)) ^ 2)

    return(list(model = rf,
                training_r2 = rf$r.squared, 
                importance = importance(rf), 
                validation_r2 = 1 - rss / tss))
} # fit random forest model

dat_filtered %>% 
    # vivid::pdpVars() below needs things as factors not char
    mutate(Ecosystem_type = as.factor(Ecosystem_type),
           Soil_type_name = as.factor(Soil_type_name),
           PBE = SPEI12_y1 <= -1 & SPEI12 > 0) %>% 
    select(sqrt_Rs_annual, #Longitude, Latitude,
           Ecosystem_type, SPEI12, PBE, 
           Tair:LAI_high, Soil_type_name) %>% 
    filter(complete.cases(.)) -> 
    test

message("The test complete-cases dataset has ", dim(test)[1],
        " rows and ", dim(test)[2], " columns")
The test complete-cases dataset has 3335 rows and 12 columns
Code
message("Fitting all-data model")
Fitting all-data model
Code
rf_all_data <- ranger(sqrt_Rs_annual ~ ., data = test, importance = "impurity")
print(rf_all_data)
Ranger result

Call:
 ranger(sqrt_Rs_annual ~ ., data = test, importance = "impurity") 

Type:                             Regression 
Number of trees:                  500 
Sample size:                      3335 
Number of independent variables:  11 
Mtry:                             3 
Target node size:                 5 
Variable importance mode:         impurity 
Splitrule:                        variance 
OOB prediction error (MSE):       27.15355 
R squared (OOB):                  0.6434321 
Code
imps <- sort(rf_all_data$variable.importance, decreasing = TRUE)

library(vivid)
Registered S3 method overwritten by 'seriation':
  method         from 
  reorder.hclust gclus
Code
# Top three variables...
pdpVars(data = test, 
        fit = rf_all_data, 
        response = "sqrt_Rs_annual",
        vars = names(imps)[1:3])

Code
# ...and the next three
pdpVars(data = test, 
        fit = rf_all_data, 
        response = "sqrt_Rs_annual",
        vars = names(imps)[4:6])

Code
# Fit a model without PBE and compare
rf_all_data_no_PBE <- ranger(sqrt_Rs_annual ~ ., 
                             data = select(test, -PBE),
                             importance = "impurity")

tibble(model_with_PBE =rf_all_data$predictions,
       model_without_PBE = rf_all_data_no_PBE$predictions,
       PBE = test$PBE) %>% 
    ggplot(aes(model_without_PBE, model_with_PBE)) + 
    geom_point(aes(color = PBE)) + 
    geom_abline() + 
    ggtitle("Random Forest: effect of including PBE")

How important are the MODIS data?

The MODIS data only start in 2001 (see gsbe-data-prep output), so requiring them in a model cuts off a lot of early data – on the order of 20%. Are they needed?

Code
dat_filtered %>% 
    select(sqrt_Rs_annual,
           Ecosystem_type, SPEI12, MODIS_NPP, MODIS_GPP,
           Tair:LAI_high, Soil_type_name) %>% 
    filter(complete.cases(.)) -> 
    test1
rf_modis <- ranger(sqrt_Rs_annual ~ ., data = test1)

dat_filtered %>% 
    select(sqrt_Rs_annual, Longitude, Latitude,
           Ecosystem_type, SPEI12, #MODIS_NPP, MODIS_GPP,
           Tair:LAI_high, Soil_type_name) %>% 
    filter(complete.cases(.)) -> 
    test2
rf_no_modis <- ranger(sqrt_Rs_annual ~ ., data = test1)

message("Model *with* MODIS data OOB R2 = ", round(rf_modis$r.squared, 4))
Model *with* MODIS data OOB R2 = 0.6297
Code
message("Model *without* MODIS data OOB R2 = ", round(rf_no_modis$r.squared, 4))
Model *without* MODIS data OOB R2 = 0.6308

It doesn’t seem like they’re needed.

Test spatial Random Forest

Spatial autocorrelation wouldn’t change the fit of the model, but it could bias the variable importance metrics. Evaluate.

Code
# This needs the newest version 1.1.5, not yet on CRAN
# remotes::install_github(repo = "blasbenito/spatialRF",
#   ref = "main", force = TRUE, quiet = TRUE)
stopifnot(packageVersion("spatialRF") >= "1.1.5")

dat_filtered %>% 
    select(sqrt_Rs_annual, Longitude, Latitude,
           Ecosystem_type, SPEI12, #MODIS_NPP, MODIS_GPP,
           Tair:LAI_high, Soil_type_name) %>% 
    filter(complete.cases(.)) -> 
    test

# Compute distance between all pairs of points
library(geodist)
dm <- geodist(select(test, Longitude, Latitude), measure = "geodesic")
test$Longitude <- test$Latitude <- NULL

library(spatialRF)

Attaching package: 'spatialRF'
The following object is masked from 'package:stats':

    rf
Code
rf_sp <- spatialRF::rf(data = test, 
                       dependent.variable.name = colnames(test)[1],
                       predictor.variable.names = setdiff(colnames(test), colnames(test)[1]),
                       distance.matrix = dm)

Model type
  - Fitted with:                     ranger()
  - Response variable:               sqrt_Rs_annual

Random forest parameters
  - Type:                            Regression
  - Number of trees:                 500
  - Sample size:                     3339
  - Number of predictors:            10
  - Mtry:                            3
  - Minimum node size:               5


Model performance 
  - R squared (oob):                  0.6423978
  - R squared (cor(obs, pred)^2):     0.8886928
  - Pseudo R squared (cor(obs, pred)):0.942705
  - RMSE (oob):                       5.218019
  - RMSE:                             3.0377
  - Normalized RMSE:                  0.2850532

Model residuals 
  - Stats: 
             ┌────────┬────────┬────────┬───────┬────────┬───────┐
             │ Min.   │ 1st Q. │ Median │ Mean  │ 3rd Q. │ Max.  │
             ├────────┼────────┼────────┼───────┼────────┼───────┤
             │ -18.24 │  -1.67 │  -0.04 │ -0.02 │   1.55 │ 16.06 │
             └────────┴────────┴────────┴───────┴────────┴───────┘
  - Normality: 
      - Shapiro-Wilks W: 0.972 
      - p-value        : 0 
      - Interpretation : Residuals are not normal 

  - Spatial autocorrelation: 
             ┌───────────┬───────────┬─────────┬──────────────────┐
             │  Distance │ Moran's I │ P value │ Interpretation   │
             ├───────────┼───────────┼─────────┼──────────────────┤
             │       0.0 │    -0.017 │   0.205 │ No spatial       │
             │           │           │         │ correlation      │
             ├───────────┼───────────┼─────────┼──────────────────┤
             │ 3331584.0 │     0.000 │   0.696 │ No spatial       │
             │           │           │         │ correlation      │
             ├───────────┼───────────┼─────────┼──────────────────┤
             │ 6663168.0 │     0.000 │   0.357 │ No spatial       │
             │           │           │         │ correlation      │
             ├───────────┼───────────┼─────────┼──────────────────┤
             │ 9994752.0 │     0.000 │   0.089 │ No spatial       │
             │           │           │         │ correlation      │
             └───────────┴───────────┴─────────┴──────────────────┘

Variable importance: 
                        ┌────────────────┬────────────┐
                        │ Variable       │ Importance │
                        ├────────────────┼────────────┤
                        │ Tair           │      8.934 │
                        ├────────────────┼────────────┤
                        │ Tsoil_lev1     │      7.748 │
                        ├────────────────┼────────────┤
                        │ Tsoil_lev2     │      7.557 │
                        ├────────────────┼────────────┤
                        │ Precip         │      6.796 │
                        ├────────────────┼────────────┤
                        │ LAI_high       │      5.525 │
                        ├────────────────┼────────────┤
                        │ VWC_lev2       │      5.389 │
                        ├────────────────┼────────────┤
                        │ VWC_lev1       │      5.372 │
                        ├────────────────┼────────────┤
                        │ Ecosystem_type │      3.994 │
                        ├────────────────┼────────────┤
                        │ SPEI12         │      3.949 │
                        ├────────────────┼────────────┤
                        │ Soil_type_name │      3.413 │
                        └────────────────┴────────────┘

Analysis1

Code
fit_lm <- function(x) {} # fit linear model; i.v. in first column

do_k_fold <- function(x, k = 10) {
    if(!all(complete.cases(x))) {
        stop("x should not have any NAs at this stage!")
    }
    
    set.seed(123)
    groups <- sample.int(n = k, size = nrow(x), replace = TRUE)
    training_r2 <- rep.int(NA_real_, k)
    validation_r2 <- rep.int(NA_real_, k)
    for(i in seq_len(k)) {
        x_val <- x[groups == i,]
        x_train <- x[groups != i,]
        # message("\tk-fold ", i, ":")
        # message("\t\tTraining is ", nrow(x_train), " x ", ncol(x_train))
        # message("\t\tValidation is ", nrow(x_val), " x ", ncol(x_val))

        rf_out <- fit_and_test_rf(x_train = x_train, x_val = x_val)
        training_r2[i] <- rf_out$training_r2
        validation_r2[i] <- rf_out$validation_r2
        
        # message("\t\tTraining R2 = ", round(training_r2[i], 3))
        # message("\t\tValidation R2 = ", round(validation_r2[i], 3))
    }
    tibble(k = seq_len(k), 
           training_r2 = training_r2,
           validation_r2 = validation_r2)
} # compute fit statistics using k-fold



# These are the three most common fluxes recorded in SRDB
#iv_set <- c("Rs_annual")#, "Rh_annual", "Rs_growingseason")
iv_set <- c("sqrt_Rs_annual", "sqrt_Rh_annual", "sqrt_Rs_growingseason")

# The following SPEI values are from Table 1 of Keune et al. (2025)
# https://www.nature.com/articles/s41597-025-04896-y
# and correspond to "severely dry", "moderately dry", and "mildly dry"
# (there are no "extremely dry"=-2 in the dataset)
spei_cutoffs <- c(-1.5, -1.0, 0.0)

# What SPEI window should be used for the cutoffs above?
spei_windows <- c(12, 24) # months

# Model independent variables
predictors <- c("Ecosystem_type",
                "SPEI12",
                "Tair", "Precip",
                "Tsoil_lev1", "Tsoil_lev2", "VWC_lev1", "VWC_lev2", 
                "LAI_high", "Soil_type_name")

do_full_analysis <- function(x, iv_set, predictors, spei_windows, spei_cutoffs, k = 10) {
    results <- list()
    SPEI_cols <- colnames(x)[grep("^SPEI", colnames(x))]
    for(iv in iv_set) {
        
        for(w in spei_windows) {
            spei_past_col <- paste0("SPEI", w, "_y1")
            
            for(spei_cut in spei_cutoffs) {
                # Remove other potential independent variables
                this_x <- x[c(iv, union(predictors, SPEI_cols))]
                # Complete cases only
                this_x <- this_x[complete.cases(this_x),]
                
                message("---------------------------------------------")
                message("iv = ", iv)
                message("spei_cut = ", spei_cut, " window = ", w)
                message("\tn = ", nrow(this_x))
                
                # identify observations not currently in a drought but that
                # WERE in a drought the previous year
                # PBE = potential Birch effect
                this_x$PBE = this_x$SPEI12 > 0 & this_x[spei_past_col] <= spei_cut
                message("\tPBE = ", sum(this_x$PBE))
                
                # Drop non-predictors entirely now
                this_x <- this_x[c(iv, predictors, "PBE")]
                
                # Run the k-fold and save results
                k_fold_out <- do_k_fold(this_x, k = k)
                k_fold_out$spei_window = w
                k_fold_out$spei_cut <- spei_cut
                k_fold_out$iv = iv
                results[[paste(iv, w, spei_cut)]] <- k_fold_out
            }
        }
    }
    bind_rows(results)
}

out <- do_full_analysis(dat_filtered,
                        iv_set, predictors,
                        spei_windows = c(12,24),
                        spei_cutoffs = spei_cutoffs,
                        k = 10)
---------------------------------------------
iv = sqrt_Rs_annual
spei_cut = -1.5 window = 12
    n = 3334
    PBE = 23
---------------------------------------------
iv = sqrt_Rs_annual
spei_cut = -1 window = 12
    n = 3334
    PBE = 93
---------------------------------------------
iv = sqrt_Rs_annual
spei_cut = 0 window = 12
    n = 3334
    PBE = 550
---------------------------------------------
iv = sqrt_Rs_annual
spei_cut = -1.5 window = 24
    n = 3334
    PBE = 95
---------------------------------------------
iv = sqrt_Rs_annual
spei_cut = -1 window = 24
    n = 3334
    PBE = 181
---------------------------------------------
iv = sqrt_Rs_annual
spei_cut = 0 window = 24
    n = 3334
    PBE = 674
---------------------------------------------
iv = sqrt_Rh_annual
spei_cut = -1.5 window = 12
    n = 728
    PBE = 4
---------------------------------------------
iv = sqrt_Rh_annual
spei_cut = -1 window = 12
    n = 728
    PBE = 17
---------------------------------------------
iv = sqrt_Rh_annual
spei_cut = 0 window = 12
    n = 728
    PBE = 125
---------------------------------------------
iv = sqrt_Rh_annual
spei_cut = -1.5 window = 24
    n = 728
    PBE = 22
---------------------------------------------
iv = sqrt_Rh_annual
spei_cut = -1 window = 24
    n = 728
    PBE = 35
---------------------------------------------
iv = sqrt_Rh_annual
spei_cut = 0 window = 24
    n = 728
    PBE = 135
---------------------------------------------
iv = sqrt_Rs_growingseason
spei_cut = -1.5 window = 12
    n = 1637
    PBE = 7
---------------------------------------------
iv = sqrt_Rs_growingseason
spei_cut = -1 window = 12
    n = 1637
    PBE = 25
---------------------------------------------
iv = sqrt_Rs_growingseason
spei_cut = 0 window = 12
    n = 1637
    PBE = 309
---------------------------------------------
iv = sqrt_Rs_growingseason
spei_cut = -1.5 window = 24
    n = 1637
    PBE = 43
---------------------------------------------
iv = sqrt_Rs_growingseason
spei_cut = -1 window = 24
    n = 1637
    PBE = 112
---------------------------------------------
iv = sqrt_Rs_growingseason
spei_cut = 0 window = 24
    n = 1637
    PBE = 332
Code
# Compute means across the k-folds
out %>% 
    group_by(spei_window, spei_cut, iv) %>% 
    summarise(training_r2 = mean(training_r2), 
              validation_r2 = mean(validation_r2)) %>% 
    mutate(spei_cut = as.factor(spei_cut),
           spei_window = paste(spei_window, "months")) ->
    out_smry

ggplot(out_smry, aes(spei_cut, iv, fill = validation_r2)) + 
    geom_tile() + 
    facet_wrap(~spei_window) +
    xlab("Previous-year SPEI cutoff") + ylab("")

Code
out_smry %>% 
    filter(spei_window == "12 months") %>% 
    pivot_longer(cols = c(training_r2, validation_r2)) %>% 
    ggplot(aes(spei_cut, iv, fill = value)) +
    geom_tile() +
    facet_wrap(~name) +
    xlab("Previous-year SPEI cutoff") + ylab("") +
    ggtitle("12-month SPEI window")

Test linear model

Code
dat_filtered %>% 
    mutate(Ecosystem_type = as.factor(Ecosystem_type),
           Soil_type_name = as.factor(Soil_type_name),
           PBE = SPEI12_y1 <= -1 & SPEI12 > 0) %>% 
    select(sqrt_Rs_annual, #Longitude, Latitude,
           Ecosystem_type, SPEI12, PBE, 
           Tair:LAI_high, Soil_type_name) %>% 
    filter(complete.cases(.)) -> 
    dat_linear

message("Fitting linear model with PBE (SPEI12, -1.0) effect...")
Fitting linear model with PBE (SPEI12, -1.0) effect...
Code
m <- lm(sqrt_Rs_annual ~ (Tair + Tsoil_lev1 + Tsoil_lev2 +
            VWC_lev1 + I(VWC_lev1 ^ 2) + VWC_lev2 ^ 2 + I(VWC_lev2 ^ 2)) * PBE +
            Ecosystem_type,
        data = dat_linear)
m_reduced <- MASS::stepAIC(m, trace = FALSE)
summary(m_reduced)

Call:
lm(formula = sqrt_Rs_annual ~ Tair + Tsoil_lev1 + Tsoil_lev2 + 
    VWC_lev1 + I(VWC_lev1^2) + VWC_lev2 + I(VWC_lev2^2) + PBE + 
    Ecosystem_type + Tair:PBE + Tsoil_lev2:PBE + VWC_lev1:PBE + 
    I(VWC_lev1^2):PBE + VWC_lev2:PBE + I(VWC_lev2^2):PBE, data = dat_linear)

Residuals:
     Min       1Q   Median       3Q      Max 
-25.7298  -4.8857  -0.2356   4.2593  31.0512 

Coefficients:
                          Estimate Std. Error t value Pr(>|t|)    
(Intercept)              1.511e+01  1.593e+00   9.486  < 2e-16 ***
Tair                     2.135e-01  1.084e-01   1.970 0.048903 *  
Tsoil_lev1              -9.179e+00  1.948e+00  -4.713 2.54e-06 ***
Tsoil_lev2               9.342e+00  1.955e+00   4.779 1.84e-06 ***
VWC_lev1                 6.472e+01  2.655e+01   2.437 0.014851 *  
I(VWC_lev1^2)            1.333e+00  5.663e+01   0.024 0.981225    
VWC_lev2                -2.346e+01  2.878e+01  -0.815 0.414916    
I(VWC_lev2^2)           -5.497e+01  5.872e+01  -0.936 0.349271    
PBETRUE                  2.392e+00  8.435e+00   0.284 0.776753    
Ecosystem_typeBare      -2.238e+01  7.611e+00  -2.941 0.003292 ** 
Ecosystem_typeCropland   3.110e+00  4.430e+00   0.702 0.482751    
Ecosystem_typeDesert    -5.296e+00  1.452e+00  -3.649 0.000268 ***
Ecosystem_typeForest     2.005e+00  5.239e-01   3.827 0.000132 ***
Ecosystem_typeGrassland  2.750e+00  6.187e-01   4.445 9.07e-06 ***
Ecosystem_typeMixed      2.851e+00  3.140e+00   0.908 0.364027    
Ecosystem_typeOrchard   -4.702e+00  4.443e+00  -1.058 0.290054    
Ecosystem_typeSavanna    3.242e-02  1.832e+00   0.018 0.985883    
Ecosystem_typeShrubland -2.116e+00  8.123e-01  -2.604 0.009242 ** 
Ecosystem_typeTundra    -2.171e+00  2.748e+00  -0.790 0.429459    
Ecosystem_typeUnknown    5.504e+00  3.155e+00   1.745 0.081153 .  
Ecosystem_typeUrban      5.720e+00  2.165e+00   2.642 0.008285 ** 
Ecosystem_typeWetland   -3.224e+00  8.128e-01  -3.966 7.46e-05 ***
Tair:PBETRUE            -2.713e+00  6.947e-01  -3.905 9.62e-05 ***
Tsoil_lev2:PBETRUE       2.972e+00  7.724e-01   3.848 0.000122 ***
VWC_lev1:PBETRUE        -1.709e+03  6.367e+02  -2.684 0.007319 ** 
I(VWC_lev1^2):PBETRUE    4.171e+03  1.158e+03   3.602 0.000320 ***
VWC_lev2:PBETRUE         1.664e+03  6.645e+02   2.504 0.012339 *  
I(VWC_lev2^2):PBETRUE   -4.112e+03  1.185e+03  -3.471 0.000526 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 7.591 on 3307 degrees of freedom
Multiple R-squared:  0.2494,    Adjusted R-squared:  0.2433 
F-statistic:  40.7 on 27 and 3307 DF,  p-value: < 2.2e-16
Code
par(mfrow = c(2, 2))
plot(m_reduced)
Warning: not plotting observations with leverage one:
  3044

Code
library(car)
Loading required package: carData

Attaching package: 'car'
The following object is masked from 'package:dplyr':

    recode
Code
car::Anova(m_reduced, type = "III")
Anova Table (Type III tests)

Response: sqrt_Rs_annual
                  Sum Sq   Df F value    Pr(>F)    
(Intercept)         5185    1 89.9797 < 2.2e-16 ***
Tair                 224    1  3.8815 0.0489032 *  
Tsoil_lev1          1280    1 22.2132 2.541e-06 ***
Tsoil_lev2          1316    1 22.8402 1.837e-06 ***
VWC_lev1             342    1  5.9402 0.0148514 *  
I(VWC_lev1^2)          0    1  0.0006 0.9812251    
VWC_lev2              38    1  0.6648 0.4149159    
I(VWC_lev2^2)         51    1  0.8764 0.3492711    
PBE                    5    1  0.0804 0.7767535    
Ecosystem_type      8916   13 11.9014 < 2.2e-16 ***
Tair:PBE             879    1 15.2477 9.617e-05 ***
Tsoil_lev2:PBE       853    1 14.8036 0.0001216 ***
VWC_lev1:PBE         415    1  7.2019 0.0073190 ** 
I(VWC_lev1^2):PBE    748    1 12.9771 0.0003200 ***
VWC_lev2:PBE         361    1  6.2684 0.0123390 *  
I(VWC_lev2^2):PBE    694    1 12.0462 0.0005256 ***
Residuals         190570 3307                      
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Code
message("Fitting linear model without PBE effect...")
Fitting linear model without PBE effect...
Code
m_no_PBE <- lm(sqrt_Rs_annual ~ (Tair + Tsoil_lev1 + Tsoil_lev2 +
            VWC_lev1 + I(VWC_lev1 ^ 2) + VWC_lev2 ^ 2 + I(VWC_lev2 ^ 2)) +
            Ecosystem_type,
        data = dat_linear)
m_no_PBE_reduced <- MASS::stepAIC(m, trace = FALSE)

tibble(model_with_PBE = predict(m),
       model_without_PBE = predict(m_no_PBE),
       PBE = dat_linear$PBE) %>% 
    ggplot(aes(model_without_PBE, model_with_PBE)) + 
    geom_point(aes(color = PBE)) + 
    geom_abline() + 
    ggtitle("Linear model: effect of including PBE")

Reproducibility

R version 4.5.2 (2025-10-31)
Platform: aarch64-apple-darwin20
Running under: macOS Sequoia 15.7.2

Matrix products: default
BLAS:   /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib 
LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

time zone: America/New_York
tzcode source: internal

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] car_3.1-3       carData_3.0-5   spatialRF_1.1.5 geodist_0.1.1  
 [5] vivid_0.3.0     ranger_0.17.0   lubridate_1.9.4 readr_2.1.5    
 [9] ggplot2_4.0.1   tidyr_1.3.1     dplyr_1.1.4    

loaded via a namespace (and not attached):
 [1] tidyselect_1.2.1   viridisLite_0.4.2  farver_2.1.2       viridis_0.6.5     
 [5] S7_0.2.1           fastmap_1.2.0      TSP_1.2.6          GGally_2.4.0      
 [9] promises_1.3.3     digest_0.6.39      timechange_0.3.0   mime_0.13         
[13] lifecycle_1.0.4    cluster_2.1.8.1    magrittr_2.0.4     compiler_4.5.2    
[17] rlang_1.1.6        tools_4.5.2        igraph_2.1.4       yaml_2.3.10       
[21] huxtable_5.8.0     knitr_1.50         labeling_0.4.3     htmlwidgets_1.6.4 
[25] bit_4.6.0          RColorBrewer_1.1-3 abind_1.4-8        registry_0.5-1    
[29] ca_0.71.1          withr_3.0.2        purrr_1.2.0        grid_4.5.2        
[33] flashlight_1.0.0   fansi_1.0.7        xtable_1.8-4       colorspace_2.1-2  
[37] scales_1.4.0       iterators_1.0.14   MASS_7.3-65        cli_3.6.5         
[41] rmarkdown_2.29     crayon_1.5.3       generics_0.1.4     rstudioapi_0.17.1 
[45] tzdb_0.5.0         stringr_1.6.0      assertthat_0.2.1   parallel_4.5.2    
[49] vctrs_0.6.5        Matrix_1.7-4       jsonlite_2.0.0     seriation_1.5.8   
[53] hms_1.1.3          patchwork_1.3.2    gclus_1.3.3        bit64_4.6.0-1     
[57] Formula_1.2-5      foreach_1.5.2      ggnewscale_0.5.2   glue_1.8.0        
[61] ggstats_0.11.0     codetools_0.2-20   stringi_1.8.7      gtable_0.3.6      
[65] DendSer_1.0.2      later_1.4.2        tibble_3.3.0       pillar_1.11.1     
[69] htmltools_0.5.9    R6_2.6.1           doParallel_1.0.17  vroom_1.6.5       
[73] evaluate_1.0.3     shiny_1.11.1       lattice_0.22-7     httpuv_1.6.16     
[77] condvis2_0.1.2     Rcpp_1.1.0         gridExtra_2.3      xfun_0.52         
[81] pkgconfig_2.0.3