#Load RF database

library(readr)
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.3.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(DataExplorer)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0     ✔ tibble  3.2.1
## ✔ purrr   1.0.2     ✔ tidyr   1.3.1
## ✔ stringr 1.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(sf)
## Warning: package 'sf' was built under R version 4.3.3
## Linking to GEOS 3.11.2, GDAL 3.8.2, PROJ 9.3.1; sf_use_s2() is TRUE
library(rnaturalearth)
## Warning: package 'rnaturalearth' was built under R version 4.3.3
library(lme4)         
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
library(emmeans)      
## Welcome to emmeans.
## Caution: You lose important information if you filter this package's results.
## See '? untidy'
library(tidyr)
library(broom)
library(readxl)

RF_data <- read_csv(
  "D:/Mes Donnees/OAF_CIRAD/CA_MoU/data/processed/drivers_data.csv")
## Rows: 473 Columns: 34
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (15): IQR_block, IQR_TF_tubura_client, IQR_plot_fert_quality, IQR_soil_t...
## dbl (19): ICR_N_perc_23A, ICR_Org_C_avg, ICR_K_perc_23A, ICR_Avail_P_avg, IC...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Convert binary variables (0/1) to factors
RF_data$Couch_Consit <- factor(RF_data$Couch_Consit,
                               levels = c(0,1),
                               labels = c("Absent","Present"))

RF_data$Whitch_weed <- factor(RF_data$Whitch_weed,
                              levels = c(0,1),
                              labels = c("Absent","Present"))

# Convert categorical variables to factors
RF_data$CA_typology_09 <- as.factor(RF_data$CA_typology_09)
RF_data$IQR_soil_texture <- as.factor(RF_data$IQR_soil_texture)
RF_data$IQR_soil_color <- as.factor(RF_data$IQR_soil_color)
RF_data$IQF_agzone <- as.factor(RF_data$IQF_agzone)
RF_data$IQF_environment <- as.factor(RF_data$IQF_environment)

# Optional (often categorical as well)
RF_data$IQR_plot_fert_quality <- as.factor(RF_data$IQR_plot_fert_quality)

Select variables and prepare dataset

library(tidyverse)
library(caret)
## Warning: package 'caret' was built under R version 4.3.3
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.3.3
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:dplyr':
## 
##     combine
library(usdm)
## Warning: package 'usdm' was built under R version 4.3.3
## Loading required package: terra
## Warning: package 'terra' was built under R version 4.3.3
## terra 1.8.29
## 
## Attaching package: 'terra'
## The following object is masked from 'package:tidyr':
## 
##     extract
# Select response + explanatory variables
data_model <- RF_data %>%
  dplyr::select(
    avg_Y_ratio,
    ICR_N_perc_23A,
    ICR_K_perc_23A,
    K_factor,
    Couch_Consit,
    Whitch_weed,
    ICF_Alt_avg,
    ICR_Avail_P_avg,
    P_factor,
    ICR_arable_land_avg,
    ICR_rainfall_avg,
    acc_rain_p60_maize,
    acc_rain_p60_beans,
    Comp_amount_corr_quali,
    ICF_planting_date,
    Slope,
    IQR_TF_tubura_client,
    IQR_plot_fert_quality,
    IQR_soil_texture,
    IQR_soil_color,
    IQF_position_slope,
    Weed_species_A_combined,
    Weed_Overall_avg,
    avg_ICR_mulch_perc_planti,
    IQF_agzone,
    IQF_environment
  ) %>%
  na.omit()

Train/ Test split

set.seed(42)

parts <- createDataPartition(data_model$avg_Y_ratio, p = 0.7, list = FALSE)

data_train <- data_model[parts, ]
data_test  <- data_model[-parts, ]

x_train <- data_train[, -1]
y_train <- data_train$avg_Y_ratio

Variable selection (RFE)

set.seed(627)

control_RFE <- rfeControl(
  functions = rfFuncs,
  method = "repeatedcv",
  repeats = 1,
  number = 5
)

start <- Sys.time()

result_RFE <- rfe(
  x = x_train,
  y = y_train,
  sizes = 1:ncol(x_train),
  rfeControl = control_RFE
)

end <- Sys.time()
print(end - start)
## Time difference of 32.51653 secs

Selected:

predictors(result_RFE)
## [1] "ICR_rainfall_avg"          "IQF_agzone"               
## [3] "ICF_Alt_avg"               "avg_ICR_mulch_perc_planti"
## [5] "Couch_Consit"              "Weed_Overall_avg"         
## [7] "ICF_planting_date"         "acc_rain_p60_beans"       
## [9] "ICR_K_perc_23A"

non selected

names(x_train)[!names(x_train) %in% result_RFE$optVariables]
##  [1] "ICR_N_perc_23A"          "K_factor"               
##  [3] "Whitch_weed"             "ICR_Avail_P_avg"        
##  [5] "P_factor"                "ICR_arable_land_avg"    
##  [7] "acc_rain_p60_maize"      "Comp_amount_corr_quali" 
##  [9] "Slope"                   "IQR_TF_tubura_client"   
## [11] "IQR_plot_fert_quality"   "IQR_soil_texture"       
## [13] "IQR_soil_color"          "IQF_position_slope"     
## [15] "Weed_species_A_combined" "IQF_environment"

Keep only selected variables

dataRFE <- subset(data_train, select = result_RFE$optVariables)

I will force it to keep “Slope” and “IQR_soil_texture” and “acc_rain_p60_maize”

forced_vars <- c("Slope", "IQR_soil_texture", "acc_rain_p60_maize", "ICR_Avail_P_avg", "K_factor" , "Comp_amount_corr_quali", "IQF_position_slope", "IQR_soil_texture", "IQR_soil_color")

final_vars <- unique(c(result_RFE$optVariables, forced_vars))

dataRFE <- data_train[, final_vars]

Test colinearity

First we separate numeric from factoria as VIF only handles numeric

library(dplyr)

data_numeric <- dataRFE %>% select(where(is.numeric))
data_factor  <- dataRFE %>% select(where(is.factor))

names(data_numeric)
##  [1] "ICR_rainfall_avg"          "ICF_Alt_avg"              
##  [3] "avg_ICR_mulch_perc_planti" "Weed_Overall_avg"         
##  [5] "ICF_planting_date"         "acc_rain_p60_beans"       
##  [7] "ICR_K_perc_23A"            "Slope"                    
##  [9] "acc_rain_p60_maize"        "ICR_Avail_P_avg"          
## [11] "Comp_amount_corr_quali"
names(data_factor)
## [1] "IQF_agzone"       "Couch_Consit"     "IQR_soil_texture" "IQR_soil_color"

Colineary test: VIF

Common thresholds:

VIF < 5 → safe VIF < 10 → acceptable VIF > 10 → problematic collinearity

So we keep all the numeric

library(usdm)
data_numeric_df <- as.data.frame(data_numeric)

vif_step <- vifstep(data_numeric_df, th = 10)

vif_step
## No variable from the 11 input variables has collinearity problem. 
## 
## The linear correlation coefficients ranges between: 
## min correlation ( ICR_Avail_P_avg ~ acc_rain_p60_maize ):  -0.001750004 
## max correlation ( acc_rain_p60_beans ~ ICR_rainfall_avg ):  0.7007355 
## 
## ---------- VIFs of the remained variables -------- 
##                    Variables      VIF
## 1           ICR_rainfall_avg 2.733473
## 2                ICF_Alt_avg 1.666398
## 3  avg_ICR_mulch_perc_planti 1.708313
## 4           Weed_Overall_avg 1.316005
## 5          ICF_planting_date 1.520338
## 6         acc_rain_p60_beans 2.160719
## 7             ICR_K_perc_23A 1.197928
## 8                      Slope 1.429011
## 9         acc_rain_p60_maize 1.548176
## 10           ICR_Avail_P_avg 1.066016
## 11    Comp_amount_corr_quali 1.166478

Final RF database

data_train_final <- cbind(
  avg_Y_ratio = data_train$avg_Y_ratio,
  dataRFE
)

RF model calibration

# model calibration -------------------------------------------------------

fitControl <- trainControl(
  method = "repeatedcv",
  repeats = 1,
  number = 5,
  search = "grid"
)

# grid for ranger RF
rfGrid <- expand.grid(
  mtry = c(1:length(dataRFE)),
  min.node.size = c(5, 10, 50, 100),
  splitrule = "variance"
)

set.seed(627)

start <- Sys.time()

RF_model_train <- train(
  formula(paste0("avg_Y_ratio ~ ",
                 paste0(names(dataRFE), collapse = " + "))),
  data = data_train,
  method = "ranger",
  trControl = fitControl,
  tuneGrid = rfGrid,
  metric = "RMSE",
  verbose = FALSE
)

end <- Sys.time()
print(end - start)
## Time difference of 16.3722 secs

Final model using best hyperparameters

fitControl <- trainControl(method = "none", search = "grid")

RFFGrid <- expand.grid(
  mtry = RF_model_train$bestTune$mtry,
  splitrule = RF_model_train$bestTune$splitrule,
  min.node.size = RF_model_train$bestTune$min.node.size
)

RFFGrid
##   mtry splitrule min.node.size
## 1    3  variance             5

Train final RF

fit_RF <- train(
  formula(paste0("avg_Y_ratio ~ ",
                 paste0(names(dataRFE), collapse = " + "))),
  data = data_train,
  method = "ranger",
  trControl = fitControl,
  metric = "RMSE",
  verbose = FALSE,
  tuneGrid = RFFGrid
)

Train set

library(Metrics)
## Warning: package 'Metrics' was built under R version 4.3.3
## 
## Attaching package: 'Metrics'
## The following objects are masked from 'package:caret':
## 
##     precision, recall
results_RF_train <- data.frame(
  avg_Y_ratio = data_train$avg_Y_ratio,
  y_pred = predict(fit_RF)
)

results_RF_train$residual <- results_RF_train$avg_Y_ratio - results_RF_train$y_pred

RMSE_RF_train <- rmse(results_RF_train$avg_Y_ratio, results_RF_train$y_pred)
RMSE_RF_train
## [1] 0.09362188
R2_RF_train <- cor(results_RF_train$y_pred, results_RF_train$avg_Y_ratio)^2
R2_RF_train
## [1] 0.8980759

For the test set:

results_RF_test <- data.frame(
  avg_Y_ratio = data_test$avg_Y_ratio,
  y_pred = predict(fit_RF, newdata = data_test)
)

results_RF_test$residual <- results_RF_test$avg_Y_ratio - results_RF_test$y_pred

RMSE_RF_test <- rmse(results_RF_test$avg_Y_ratio, results_RF_test$y_pred)
RMSE_RF_test
## [1] 0.1663057
R2_RF_test <- cor(results_RF_test$y_pred, results_RF_test$avg_Y_ratio)^2
R2_RF_test
## [1] 0.3364586

Observed vs Predicted plots

Trained data

plot(results_RF_train$y_pred, results_RF_train$avg_Y_ratio,
     xlab = "Predicted Y ratio",
     ylab = "Observed Y ratio",
     pch = 20,
     main = "RF Train")

abline(lm(avg_Y_ratio ~ y_pred, data = results_RF_train), col = "red")
abline(0,1,col="red", lty=2)

legend("topleft",
       legend = c(paste("R2 =", round(R2_RF_train,3)),
                  paste("RMSE =", round(RMSE_RF_train,3))),
       bty="n")

## tested

plot(results_RF_test$y_pred, results_RF_test$avg_Y_ratio,
     xlab = "Predicted Y ratio",
     ylab = "Observed Y ratio",
     pch = 20,
     main = "RF Test")

abline(lm(avg_Y_ratio ~ y_pred, data = results_RF_test), col = "red")
abline(0,1,col="red", lty=2)

legend("topleft",
       legend = c(paste("R2 =", round(R2_RF_test,3)),
                  paste("RMSE =", round(RMSE_RF_test,3))),
       bty="n")

## Stave model objects

save(
  data_model,
  parts, data_train, data_test,
  result_RFE,
  RF_model_train, fit_RF,
  results_RF_train, RMSE_RF_train, R2_RF_train,
  results_RF_test, RMSE_RF_test, R2_RF_test,
  file = "RF_avg_Y_ratio_model.Rdata"
)

names(dataRFE) %in% names(data_train)

names(dataRFE) %in% names(data_train)
##  [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE TRUE

Visualizetion of variables importance

fit_RF <- train(
  formula(paste0("avg_Y_ratio ~ ",
                 paste0(names(dataRFE), collapse = " + "))),
  data = data_train,
  method = "ranger",
  trControl = fitControl,
  metric = "RMSE",
  verbose = FALSE,
  tuneGrid = RFFGrid,
  importance = "permutation"
)

importance_RF <- varImp(fit_RF)

importance_RF
## ranger variable importance
## 
##   only 20 most important variables shown (out of 34)
## 
##                           Overall
## ICR_rainfall_avg           100.00
## ICF_Alt_avg                 70.96
## Couch_ConsitPresent         65.45
## avg_ICR_mulch_perc_planti   57.30
## Weed_Overall_avg            55.44
## ICR_K_perc_23A              44.93
## ICF_planting_date           44.67
## Slope                       41.42
## acc_rain_p60_beans          39.15
## acc_rain_p60_maize          36.80
## Comp_amount_corr_quali      35.17
## IQF_agzoneCentral Plateau   24.92
## ICR_Avail_P_avg             23.51
## IQF_agzoneMayaga-Bugesera   22.78
## IQR_soil_colorred           21.55
## IQF_agzoneEastern Savanah   20.42
## K_factorVLow                17.95
## IQR_soil_textureFine        17.27
## IQR_soil_textureMix         16.79
## IQF_agzoneCongo Nile        16.07
plot(importance_RF, top = 20)