#Load RF database
library(readr)
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.3.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(DataExplorer)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ✔ stringr 1.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(sf)
## Warning: package 'sf' was built under R version 4.3.3
## Linking to GEOS 3.11.2, GDAL 3.8.2, PROJ 9.3.1; sf_use_s2() is TRUE
library(rnaturalearth)
## Warning: package 'rnaturalearth' was built under R version 4.3.3
library(lme4)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
library(emmeans)
## Welcome to emmeans.
## Caution: You lose important information if you filter this package's results.
## See '? untidy'
library(tidyr)
library(broom)
library(readxl)
RF_data <- read_csv(
"D:/Mes Donnees/OAF_CIRAD/CA_MoU/data/processed/drivers_data.csv")
## Rows: 473 Columns: 34
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (15): IQR_block, IQR_TF_tubura_client, IQR_plot_fert_quality, IQR_soil_t...
## dbl (19): ICR_N_perc_23A, ICR_Org_C_avg, ICR_K_perc_23A, ICR_Avail_P_avg, IC...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Convert binary variables (0/1) to factors
RF_data$Couch_Consit <- factor(RF_data$Couch_Consit,
levels = c(0,1),
labels = c("Absent","Present"))
RF_data$Whitch_weed <- factor(RF_data$Whitch_weed,
levels = c(0,1),
labels = c("Absent","Present"))
# Convert categorical variables to factors
RF_data$CA_typology_09 <- as.factor(RF_data$CA_typology_09)
RF_data$IQR_soil_texture <- as.factor(RF_data$IQR_soil_texture)
RF_data$IQR_soil_color <- as.factor(RF_data$IQR_soil_color)
RF_data$IQF_agzone <- as.factor(RF_data$IQF_agzone)
RF_data$IQF_environment <- as.factor(RF_data$IQF_environment)
# Optional (often categorical as well)
RF_data$IQR_plot_fert_quality <- as.factor(RF_data$IQR_plot_fert_quality)
library(tidyverse)
library(caret)
## Warning: package 'caret' was built under R version 4.3.3
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.3.3
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
library(usdm)
## Warning: package 'usdm' was built under R version 4.3.3
## Loading required package: terra
## Warning: package 'terra' was built under R version 4.3.3
## terra 1.8.29
##
## Attaching package: 'terra'
## The following object is masked from 'package:tidyr':
##
## extract
# Select response + explanatory variables
data_model <- RF_data %>%
dplyr::select(
avg_Y_ratio,
ICR_N_perc_23A,
ICR_K_perc_23A,
K_factor,
Couch_Consit,
Whitch_weed,
ICF_Alt_avg,
ICR_Avail_P_avg,
P_factor,
ICR_arable_land_avg,
ICR_rainfall_avg,
acc_rain_p60_maize,
acc_rain_p60_beans,
Comp_amount_corr_quali,
ICF_planting_date,
Slope,
IQR_TF_tubura_client,
IQR_plot_fert_quality,
IQR_soil_texture,
IQR_soil_color,
IQF_position_slope,
Weed_species_A_combined,
Weed_Overall_avg,
avg_ICR_mulch_perc_planti,
IQF_agzone,
IQF_environment
) %>%
na.omit()
set.seed(42)
parts <- createDataPartition(data_model$avg_Y_ratio, p = 0.7, list = FALSE)
data_train <- data_model[parts, ]
data_test <- data_model[-parts, ]
x_train <- data_train[, -1]
y_train <- data_train$avg_Y_ratio
set.seed(627)
control_RFE <- rfeControl(
functions = rfFuncs,
method = "repeatedcv",
repeats = 1,
number = 5
)
start <- Sys.time()
result_RFE <- rfe(
x = x_train,
y = y_train,
sizes = 1:ncol(x_train),
rfeControl = control_RFE
)
end <- Sys.time()
print(end - start)
## Time difference of 32.51653 secs
predictors(result_RFE)
## [1] "ICR_rainfall_avg" "IQF_agzone"
## [3] "ICF_Alt_avg" "avg_ICR_mulch_perc_planti"
## [5] "Couch_Consit" "Weed_Overall_avg"
## [7] "ICF_planting_date" "acc_rain_p60_beans"
## [9] "ICR_K_perc_23A"
names(x_train)[!names(x_train) %in% result_RFE$optVariables]
## [1] "ICR_N_perc_23A" "K_factor"
## [3] "Whitch_weed" "ICR_Avail_P_avg"
## [5] "P_factor" "ICR_arable_land_avg"
## [7] "acc_rain_p60_maize" "Comp_amount_corr_quali"
## [9] "Slope" "IQR_TF_tubura_client"
## [11] "IQR_plot_fert_quality" "IQR_soil_texture"
## [13] "IQR_soil_color" "IQF_position_slope"
## [15] "Weed_species_A_combined" "IQF_environment"
dataRFE <- subset(data_train, select = result_RFE$optVariables)
forced_vars <- c("Slope", "IQR_soil_texture", "acc_rain_p60_maize", "ICR_Avail_P_avg", "K_factor" , "Comp_amount_corr_quali", "IQF_position_slope", "IQR_soil_texture", "IQR_soil_color")
final_vars <- unique(c(result_RFE$optVariables, forced_vars))
dataRFE <- data_train[, final_vars]
library(dplyr)
data_numeric <- dataRFE %>% select(where(is.numeric))
data_factor <- dataRFE %>% select(where(is.factor))
names(data_numeric)
## [1] "ICR_rainfall_avg" "ICF_Alt_avg"
## [3] "avg_ICR_mulch_perc_planti" "Weed_Overall_avg"
## [5] "ICF_planting_date" "acc_rain_p60_beans"
## [7] "ICR_K_perc_23A" "Slope"
## [9] "acc_rain_p60_maize" "ICR_Avail_P_avg"
## [11] "Comp_amount_corr_quali"
names(data_factor)
## [1] "IQF_agzone" "Couch_Consit" "IQR_soil_texture" "IQR_soil_color"
Common thresholds:
VIF < 5 → safe VIF < 10 → acceptable VIF > 10 → problematic collinearity
So we keep all the numeric
library(usdm)
data_numeric_df <- as.data.frame(data_numeric)
vif_step <- vifstep(data_numeric_df, th = 10)
vif_step
## No variable from the 11 input variables has collinearity problem.
##
## The linear correlation coefficients ranges between:
## min correlation ( ICR_Avail_P_avg ~ acc_rain_p60_maize ): -0.001750004
## max correlation ( acc_rain_p60_beans ~ ICR_rainfall_avg ): 0.7007355
##
## ---------- VIFs of the remained variables --------
## Variables VIF
## 1 ICR_rainfall_avg 2.733473
## 2 ICF_Alt_avg 1.666398
## 3 avg_ICR_mulch_perc_planti 1.708313
## 4 Weed_Overall_avg 1.316005
## 5 ICF_planting_date 1.520338
## 6 acc_rain_p60_beans 2.160719
## 7 ICR_K_perc_23A 1.197928
## 8 Slope 1.429011
## 9 acc_rain_p60_maize 1.548176
## 10 ICR_Avail_P_avg 1.066016
## 11 Comp_amount_corr_quali 1.166478
data_train_final <- cbind(
avg_Y_ratio = data_train$avg_Y_ratio,
dataRFE
)
# model calibration -------------------------------------------------------
fitControl <- trainControl(
method = "repeatedcv",
repeats = 1,
number = 5,
search = "grid"
)
# grid for ranger RF
rfGrid <- expand.grid(
mtry = c(1:length(dataRFE)),
min.node.size = c(5, 10, 50, 100),
splitrule = "variance"
)
set.seed(627)
start <- Sys.time()
RF_model_train <- train(
formula(paste0("avg_Y_ratio ~ ",
paste0(names(dataRFE), collapse = " + "))),
data = data_train,
method = "ranger",
trControl = fitControl,
tuneGrid = rfGrid,
metric = "RMSE",
verbose = FALSE
)
end <- Sys.time()
print(end - start)
## Time difference of 16.3722 secs
fitControl <- trainControl(method = "none", search = "grid")
RFFGrid <- expand.grid(
mtry = RF_model_train$bestTune$mtry,
splitrule = RF_model_train$bestTune$splitrule,
min.node.size = RF_model_train$bestTune$min.node.size
)
RFFGrid
## mtry splitrule min.node.size
## 1 3 variance 5
fit_RF <- train(
formula(paste0("avg_Y_ratio ~ ",
paste0(names(dataRFE), collapse = " + "))),
data = data_train,
method = "ranger",
trControl = fitControl,
metric = "RMSE",
verbose = FALSE,
tuneGrid = RFFGrid
)
library(Metrics)
## Warning: package 'Metrics' was built under R version 4.3.3
##
## Attaching package: 'Metrics'
## The following objects are masked from 'package:caret':
##
## precision, recall
results_RF_train <- data.frame(
avg_Y_ratio = data_train$avg_Y_ratio,
y_pred = predict(fit_RF)
)
results_RF_train$residual <- results_RF_train$avg_Y_ratio - results_RF_train$y_pred
RMSE_RF_train <- rmse(results_RF_train$avg_Y_ratio, results_RF_train$y_pred)
RMSE_RF_train
## [1] 0.09362188
R2_RF_train <- cor(results_RF_train$y_pred, results_RF_train$avg_Y_ratio)^2
R2_RF_train
## [1] 0.8980759
results_RF_test <- data.frame(
avg_Y_ratio = data_test$avg_Y_ratio,
y_pred = predict(fit_RF, newdata = data_test)
)
results_RF_test$residual <- results_RF_test$avg_Y_ratio - results_RF_test$y_pred
RMSE_RF_test <- rmse(results_RF_test$avg_Y_ratio, results_RF_test$y_pred)
RMSE_RF_test
## [1] 0.1663057
R2_RF_test <- cor(results_RF_test$y_pred, results_RF_test$avg_Y_ratio)^2
R2_RF_test
## [1] 0.3364586
plot(results_RF_train$y_pred, results_RF_train$avg_Y_ratio,
xlab = "Predicted Y ratio",
ylab = "Observed Y ratio",
pch = 20,
main = "RF Train")
abline(lm(avg_Y_ratio ~ y_pred, data = results_RF_train), col = "red")
abline(0,1,col="red", lty=2)
legend("topleft",
legend = c(paste("R2 =", round(R2_RF_train,3)),
paste("RMSE =", round(RMSE_RF_train,3))),
bty="n")
## tested
plot(results_RF_test$y_pred, results_RF_test$avg_Y_ratio,
xlab = "Predicted Y ratio",
ylab = "Observed Y ratio",
pch = 20,
main = "RF Test")
abline(lm(avg_Y_ratio ~ y_pred, data = results_RF_test), col = "red")
abline(0,1,col="red", lty=2)
legend("topleft",
legend = c(paste("R2 =", round(R2_RF_test,3)),
paste("RMSE =", round(RMSE_RF_test,3))),
bty="n")
## Stave model objects
save(
data_model,
parts, data_train, data_test,
result_RFE,
RF_model_train, fit_RF,
results_RF_train, RMSE_RF_train, R2_RF_train,
results_RF_test, RMSE_RF_test, R2_RF_test,
file = "RF_avg_Y_ratio_model.Rdata"
)
names(dataRFE) %in% names(data_train)
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE TRUE
fit_RF <- train(
formula(paste0("avg_Y_ratio ~ ",
paste0(names(dataRFE), collapse = " + "))),
data = data_train,
method = "ranger",
trControl = fitControl,
metric = "RMSE",
verbose = FALSE,
tuneGrid = RFFGrid,
importance = "permutation"
)
importance_RF <- varImp(fit_RF)
importance_RF
## ranger variable importance
##
## only 20 most important variables shown (out of 34)
##
## Overall
## ICR_rainfall_avg 100.00
## ICF_Alt_avg 70.96
## Couch_ConsitPresent 65.45
## avg_ICR_mulch_perc_planti 57.30
## Weed_Overall_avg 55.44
## ICR_K_perc_23A 44.93
## ICF_planting_date 44.67
## Slope 41.42
## acc_rain_p60_beans 39.15
## acc_rain_p60_maize 36.80
## Comp_amount_corr_quali 35.17
## IQF_agzoneCentral Plateau 24.92
## ICR_Avail_P_avg 23.51
## IQF_agzoneMayaga-Bugesera 22.78
## IQR_soil_colorred 21.55
## IQF_agzoneEastern Savanah 20.42
## K_factorVLow 17.95
## IQR_soil_textureFine 17.27
## IQR_soil_textureMix 16.79
## IQF_agzoneCongo Nile 16.07
plot(importance_RF, top = 20)