Wright, M. N. and Ziegler, A.: Ranger: A fast implementation of random forests for high dimensional data in C++ and R, J. Stat. Softw., 77, https://doi.org/10.18637/jss.v077.i01, 2017.
Code
# Fit random forest model; i.v. in first columnlibrary(ranger)fit_and_test_rf <-function(x_train, x_val) { f <-as.formula(paste(colnames(x_train)[1], "~ .")) rf <- ranger::ranger(formula = f, data = x_train, importance ="impurity")# use importance = "permutation" for importance_pvalues() preds <-predict(rf, data = x_val)$predictions obs <-pull(x_val[1]) rss <-sum((preds - obs) ^2) tss <-sum((obs -mean(obs)) ^2)return(list(model = rf,training_r2 = rf$r.squared, importance =importance(rf), validation_r2 =1- rss / tss))} # fit random forest modeldat_filtered %>%# vivid::pdpVars() below needs things as factors not charmutate(Ecosystem_type =as.factor(Ecosystem_type),Soil_type_name =as.factor(Soil_type_name),PBE = SPEI12_y1 <=-1& SPEI12 >0) %>%select(sqrt_Rs_annual, #Longitude, Latitude, Ecosystem_type, SPEI12, PBE, Tair:LAI_high, Soil_type_name) %>%filter(complete.cases(.)) -> testmessage("The test complete-cases dataset has ", dim(test)[1]," rows and ", dim(test)[2], " columns")
The test complete-cases dataset has 3335 rows and 12 columns
Code
message("Fitting all-data model")
Fitting all-data model
Code
rf_all_data <-ranger(sqrt_Rs_annual ~ ., data = test, importance ="impurity")print(rf_all_data)
Ranger result
Call:
ranger(sqrt_Rs_annual ~ ., data = test, importance = "impurity")
Type: Regression
Number of trees: 500
Sample size: 3335
Number of independent variables: 11
Mtry: 3
Target node size: 5
Variable importance mode: impurity
Splitrule: variance
OOB prediction error (MSE): 27.15355
R squared (OOB): 0.6434321
Registered S3 method overwritten by 'seriation':
method from
reorder.hclust gclus
Code
# Top three variables...pdpVars(data = test, fit = rf_all_data, response ="sqrt_Rs_annual",vars =names(imps)[1:3])
Code
# ...and the next threepdpVars(data = test, fit = rf_all_data, response ="sqrt_Rs_annual",vars =names(imps)[4:6])
Code
# Fit a model without PBE and comparerf_all_data_no_PBE <-ranger(sqrt_Rs_annual ~ ., data =select(test, -PBE),importance ="impurity")tibble(model_with_PBE =rf_all_data$predictions,model_without_PBE = rf_all_data_no_PBE$predictions,PBE = test$PBE) %>%ggplot(aes(model_without_PBE, model_with_PBE)) +geom_point(aes(color = PBE)) +geom_abline() +ggtitle("Random Forest: effect of including PBE")
How important are the MODIS data?
The MODIS data only start in 2001 (see gsbe-data-prep output), so requiring them in a model cuts off a lot of early data – on the order of 20%. Are they needed?
fit_lm <-function(x) {} # fit linear model; i.v. in first columndo_k_fold <-function(x, k =10) {if(!all(complete.cases(x))) {stop("x should not have any NAs at this stage!") }set.seed(123) groups <-sample.int(n = k, size =nrow(x), replace =TRUE) training_r2 <-rep.int(NA_real_, k) validation_r2 <-rep.int(NA_real_, k)for(i inseq_len(k)) { x_val <- x[groups == i,] x_train <- x[groups != i,]# message("\tk-fold ", i, ":")# message("\t\tTraining is ", nrow(x_train), " x ", ncol(x_train))# message("\t\tValidation is ", nrow(x_val), " x ", ncol(x_val)) rf_out <-fit_and_test_rf(x_train = x_train, x_val = x_val) training_r2[i] <- rf_out$training_r2 validation_r2[i] <- rf_out$validation_r2# message("\t\tTraining R2 = ", round(training_r2[i], 3))# message("\t\tValidation R2 = ", round(validation_r2[i], 3)) }tibble(k =seq_len(k), training_r2 = training_r2,validation_r2 = validation_r2)} # compute fit statistics using k-fold# These are the three most common fluxes recorded in SRDB#iv_set <- c("Rs_annual")#, "Rh_annual", "Rs_growingseason")iv_set <-c("sqrt_Rs_annual", "sqrt_Rh_annual", "sqrt_Rs_growingseason")# The following SPEI values are from Table 1 of Keune et al. (2025)# https://www.nature.com/articles/s41597-025-04896-y# and correspond to "severely dry", "moderately dry", and "mildly dry"# (there are no "extremely dry"=-2 in the dataset)spei_cutoffs <-c(-1.5, -1.0, 0.0)# What SPEI window should be used for the cutoffs above?spei_windows <-c(12, 24) # months# Model independent variablespredictors <-c("Ecosystem_type","SPEI12","Tair", "Precip","Tsoil_lev1", "Tsoil_lev2", "VWC_lev1", "VWC_lev2", "LAI_high", "Soil_type_name")do_full_analysis <-function(x, iv_set, predictors, spei_windows, spei_cutoffs, k =10) { results <-list() SPEI_cols <-colnames(x)[grep("^SPEI", colnames(x))]for(iv in iv_set) {for(w in spei_windows) { spei_past_col <-paste0("SPEI", w, "_y1")for(spei_cut in spei_cutoffs) {# Remove other potential independent variables this_x <- x[c(iv, union(predictors, SPEI_cols))]# Complete cases only this_x <- this_x[complete.cases(this_x),]message("---------------------------------------------")message("iv = ", iv)message("spei_cut = ", spei_cut, " window = ", w)message("\tn = ", nrow(this_x))# identify observations not currently in a drought but that# WERE in a drought the previous year# PBE = potential Birch effect this_x$PBE = this_x$SPEI12 >0& this_x[spei_past_col] <= spei_cutmessage("\tPBE = ", sum(this_x$PBE))# Drop non-predictors entirely now this_x <- this_x[c(iv, predictors, "PBE")]# Run the k-fold and save results k_fold_out <-do_k_fold(this_x, k = k) k_fold_out$spei_window = w k_fold_out$spei_cut <- spei_cut k_fold_out$iv = iv results[[paste(iv, w, spei_cut)]] <- k_fold_out } } }bind_rows(results)}out <-do_full_analysis(dat_filtered, iv_set, predictors,spei_windows =c(12,24),spei_cutoffs = spei_cutoffs,k =10)
---------------------------------------------
iv = sqrt_Rs_annual
spei_cut = -1.5 window = 12
n = 3334
PBE = 23
---------------------------------------------
iv = sqrt_Rs_annual
spei_cut = -1 window = 12
n = 3334
PBE = 93
---------------------------------------------
iv = sqrt_Rs_annual
spei_cut = 0 window = 12
n = 3334
PBE = 550
---------------------------------------------
iv = sqrt_Rs_annual
spei_cut = -1.5 window = 24
n = 3334
PBE = 95
---------------------------------------------
iv = sqrt_Rs_annual
spei_cut = -1 window = 24
n = 3334
PBE = 181
---------------------------------------------
iv = sqrt_Rs_annual
spei_cut = 0 window = 24
n = 3334
PBE = 674
---------------------------------------------
iv = sqrt_Rh_annual
spei_cut = -1.5 window = 12
n = 728
PBE = 4
---------------------------------------------
iv = sqrt_Rh_annual
spei_cut = -1 window = 12
n = 728
PBE = 17
---------------------------------------------
iv = sqrt_Rh_annual
spei_cut = 0 window = 12
n = 728
PBE = 125
---------------------------------------------
iv = sqrt_Rh_annual
spei_cut = -1.5 window = 24
n = 728
PBE = 22
---------------------------------------------
iv = sqrt_Rh_annual
spei_cut = -1 window = 24
n = 728
PBE = 35
---------------------------------------------
iv = sqrt_Rh_annual
spei_cut = 0 window = 24
n = 728
PBE = 135
---------------------------------------------
iv = sqrt_Rs_growingseason
spei_cut = -1.5 window = 12
n = 1637
PBE = 7
---------------------------------------------
iv = sqrt_Rs_growingseason
spei_cut = -1 window = 12
n = 1637
PBE = 25
---------------------------------------------
iv = sqrt_Rs_growingseason
spei_cut = 0 window = 12
n = 1637
PBE = 309
---------------------------------------------
iv = sqrt_Rs_growingseason
spei_cut = -1.5 window = 24
n = 1637
PBE = 43
---------------------------------------------
iv = sqrt_Rs_growingseason
spei_cut = -1 window = 24
n = 1637
PBE = 112
---------------------------------------------
iv = sqrt_Rs_growingseason
spei_cut = 0 window = 24
n = 1637
PBE = 332
Code
# Compute means across the k-foldsout %>%group_by(spei_window, spei_cut, iv) %>%summarise(training_r2 =mean(training_r2), validation_r2 =mean(validation_r2)) %>%mutate(spei_cut =as.factor(spei_cut),spei_window =paste(spei_window, "months")) -> out_smryggplot(out_smry, aes(spei_cut, iv, fill = validation_r2)) +geom_tile() +facet_wrap(~spei_window) +xlab("Previous-year SPEI cutoff") +ylab("")