Modeling and predicting earthworm diversity and distribution in France
2024-07-22
Packages
library(tidyverse)
# library(glme)
# library(lsmeans)
# library(agricolae)
# library(RVAideMemoire)
library(corrplot)
# library(emmeans)
library(lme4)
library(multcomp)
library(MASS)
# library(R2WinBUGS)
library(arm)
# library(performance)
# library(AER)
# library(AICcmodavg)
# library(MuMIn)
library(ade4)
library(Hmisc)
library(labdsv)
library(vegan)
library(cowplot)
library(ggpubr)
library(rstatix)
library(patchwork)
library(multcompView)
library(ggsignif)
library(grid)
library(FactoMineR)
library(factoextra)
library(explore)
library(ggrepel)
library(naniar)
library(outliers)
library(leaps)
library(fastDummies)
library(caret) # pour l'entrainement des models
library(mgcv)
library(ggeffects)
library(gratia)
library(GGally) # pour ggpair
# library(caTools)
# library(rpart)
# library(rpart.plot)
library(openxlsx)
library(readxl)
library(leaflet) # pour la carto
library(quarto)
library(raster)
library(knitr)
library(kableExtra)
library(stringr)
library(plotly)
# library(PerformanceAnalytics)
# library(usdm)
library(vcd) # pour la distribution des var reponse
library(prospectr)# pour split data avec kenSton()
# library(glmnet)
library(randomForest)
# library(doParallel)
library(gbm)
library(kernlab)
# library(e1071)
library(ggforce)
library(keras)
library(tensorflow)
library(neuralnet)
# library(parallel)
library(iml) # pour l'interpretabilité des models https://cran.r-project.org/web/packages/iml/vignettes/intro.html
library(stats)
# library(Boruta) # importance des predicteurs
library(bestNormalize)
library(rmarkdown)
library(DT)
library(gtExtras) # pour la
library(reshape2)
# library(mapview)
library(sf)
library(ggplot2)
library(maptools)
library(ggsn)
library(spThin)
library(sp)
library(gstat)
library(rms)
Functions
## Identification des NA dans un df -----------------------------------------------
taux_completion<-
function(df, afficher_zero_percent = FALSE, seuil, trie=FALSE) {
# Calcule du pourcentage de NA dans le dataframe
pourcentage_total <-
round(sum(is.na(df)) / (nrow(df) * ncol(df)) * 100, 1)
# Calcule du pourcentage de NA par colonne
pourcentage_colonnes <- round(colMeans(is.na(df)) * 100, 1)
# Creation d'un dataframe résultat avec deux colonnes
result <-
data.frame(
Variables = names(df),
CR = pourcentage_colonnes,
row.names = NULL
)
if (afficher_zero_percent) {
result <- result[result$CR == 0, ]
result$CR = 100 -result$CR
} else {
result <- result[result$CR > 0, ]
result$CR = 100 -result$CR
}
result <- rbind(result, c("Total", pourcentage_total))
#result <- rbind(result, c("Total", paste0(pourcentage_total, "")))
result <- result[, c("Variables", "CR")]
result$CR = as.numeric(result$CR)
result$CR = round(result$CR,1)
if (trie){
result = result %>% arrange(desc(CR))
}
result$CR = paste0(result$CR,"%")
return(result)
}
# Converssion des colonne en num ou factor-----------------------------------------------
conv_col <- function (data, columns_to_convert, to_types) {
if (to_types == "numeric") {
# Conversion des colonnes en numeric
for (col in columns_to_convert) {
data[, col] <- as.numeric(data[, col])
}
} else {
# Conversion des colonnes en facteurs
for (col in columns_to_convert) {
data[, col] <- as.factor(data[, col])
}
}
return(data)
}
#data_converted <- conv_col(data, names(data [, c(1, 3)]), "factor")
# exploration graphiques des variables numeriques -----------------------------------------------
explo_num <- function(nom_col, titre, df = landworm, ligne_col = c(2, 2),mini = min(df[[nom_col]]), maxi=max(df[[nom_col]]) ) {
par(mfrow = ligne_col)
df[complete.cases(df[[nom_col]]), ]
df <- df %>%dplyr::filter(!is.na(df[[nom_col]]))
df[[nom_col]] = as.numeric(df[[nom_col]])
# Boxplot
boxplot(df[[nom_col]], col = 'blue', ylab = titre, ylim = c(mini, maxi))
# Cleveland plot
# dotchart(df[[nom_col]], pch = 16, col = 'blue', xlab = titre)
plot(df[[nom_col]], pch = 16, col = 'blue', ylab = titre)
# Histogram
hist(df[[nom_col]], col = 'blue', xlab = titre, main = "")
# Quantile-Quantile plot
qqnorm(df[[nom_col]], pch = 16, col = 'blue', xlab = '')
qqline(df[[nom_col]], col = 'red')
}
# Extraction des predictors + moyennes -----------------------------------------------
extraction <- function(nom_col, tif_file_path, df = landworm, conv = 1) {
#df <- df %>%filter(!is.na(gps_x) & !is.na(gps_y))
raster_data <- raster(tif_file_path)
# Création d'un dataframe pour stocker les valeurs extraites
df_interne <- data.frame(gps_x = df$gps_x, gps_y = df$gps_y)
proj4Str <- "+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs"
# Transformer les coordonnées GPS en système de coordonnées du raster
gps_coords_sp <- SpatialPoints(df_interne, proj4string = CRS(proj4Str))
gps_coords_proj <- spTransform(gps_coords_sp, crs(raster_data))
# Extraction des valeurs du raster
values <- raster::extract(raster_data, gps_coords_proj)
# Ajout des valeurs extraites comme nouvelles colonnes a df
#df_save = data.frame()
#df_save[[nom_col]] <- values / conv
df[[nom_col]] <- values / conv
return(df)
}
# la moyenne des predictores -----------------------------------------------
moyenne_val_extrct <- function(nom_col, vec_col, df=landworm) {
df[[nom_col]] <- rowMeans(as.matrix(df[, vec_col, drop = FALSE]), na.rm = TRUE)
df[[nom_col]] = round(df[[nom_col]],1)
return(as.data.frame(df))
}
# tests de corrélation avec un seuil -----------------------------------------------
cor_function_seuil <- function(data, seuil,affiche=FALSE) {
# Création d'un vecteur pour stocker les paires de variables corrélées
variables_corr <- c()
# Boucle pour tester la corrélation entre chaque paire de variables
for (i in 1:(ncol(data) - 1)) {
for (j in (i + 1):ncol(data)) {
# Calcul de la corrélation entre les variables i et j
cor_value <- stats::cor(data[, i], data[, j], use = "na.or.complete")
# Stockage du résultat dans le vecteur si supérieur au seuil
if (cor_value >= seuil | cor_value <= -seuil) {
if(affiche){
cat(
"***",
colnames(data)[i],
" __est correlee a__ ",
colnames(data)[j],
"avec un R =",
cor_value,
"\n \n \n"
)
}
variables_corr <-
c(variables_corr, colnames(data)[i], colnames(data)[j])
}
}
}
return(variables_corr)
}
# tests de valeurs aberant -----------------------------------------------
test_grub <- function(data, variable, direction = "maxi") {
if (direction == "maxi") {
repeat {
#le test de Grubbs
test_aberrant <- grubbs.test(data[[variable]], opposite = FALSE)
# Obtenir la p-valeur du test
p.value <- test_aberrant$p.value
# Si la p-valeur est inférieure au seuil de 0.05, on supprime la valeur aberrante
if (p.value < 0.05) {
max_value <- max(data[[variable]],na.rm=TRUE)
data <- subset(data, data[[variable]] != max_value | is.na(data[[variable]]))
} else {
# S'il n'y a plus de valeurs aberrantes, sortir de la boucle
break
}
}
}
if (direction == "mini") {
repeat {
test_aberrant <- grubbs.test(data[[variable]], opposite = TRUE)
# Obtenir la p-valeur du test
p.value <- test_aberrant$p.value
# Si la p-valeur est inférieure au seuil de 0.05, on supprime la valeur aberrante
if (p.value < 0.05) {
min_value <- min(data[[variable]],na.rm=TRUE)
data <- subset(data, data[[variable]] != min_value | is.na(data[[variable]]))
} else {
# S'il n'y a plus de valeurs aberrantes, sortir de la boucle
break
}
}
}
return(data)
}
# boxplote -----------------------------------------------
plot_boxplot <-function(donnee,
x_col,y_col,x_label,y_label,title,legend_title,
couleurs,
affiche_point = TRUE,
ymin = min(donnee[[y_col]]),
ymax = 1.2 * max(donnee[[y_col]])) {
graphe <-ggplot(donnee,
aes_string(
x = x_col,
y = y_col,
colour = x_col
)) +
geom_boxplot(
outlier.shape = NA,
outlier.colour = "black",
alpha = 0.20,
size = 1.5
) +
labs(title = title,x = x_label,y = y_label) +
scale_color_manual(values = couleurs, name = legend_title) +
theme_classic(base_size = 12, base_family = "Arial") +
theme(axis.text = element_text(size = 10),
axis.title.y = element_text(
vjust = 5, size = 12, face = "bold"),
axis.title.x = element_text(face = "bold"),
axis.ticks.length = unit(0.2, "cm"),
legend.position = "none", # Cette ligne supprime la legende
#legend.position = "right",
legend.text = element_text(size = 10),
legend.title = element_text(size = 12, face = "bold"),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
plot.title = element_text(size = 14, face = "bold"),
plot.margin = unit(c(0.5, 0.5, 0.5, 0.5), "cm")
)
if (affiche_point) {
graphe <-
graphe + geom_jitter(position = position_jitter(seed = 0.5), size = 0.8)
}
if (y_col %in% names(donnee)) {
graphe <- graphe +
coord_cartesian(ylim = c(ymin, ymax))
}
graphe = graphe + stat_summary(
fun.y = mean,
geom = "point",
shape = 15,
size = 1.5,
col = "black",
fill = "black"
)
return(graphe)
}
#pour le pairwise.t.test() -----------------------------------------------------
tri.to.squ <- function(x) {
rn <- row.names(x)
cn <- colnames(x)
an <- unique(c(cn, rn))
myval <- x[!is.na(x)]
mymat <-
matrix(
1,
nrow = length(an),
ncol = length(an),
dimnames = list(an, an)
)
for (ext in 1:length(cn))
{
for (int in 1:length(rn))
{
if (is.na(x[row.names(x) == rn[int], colnames(x) == cn[ext]]))
next
mymat[row.names(mymat) == rn[int], colnames(mymat) == cn[ext]] <-
x[row.names(x) == rn[int], colnames(x) == cn[ext]]
mymat[row.names(mymat) == cn[ext], colnames(mymat) == rn[int]] <-
x[row.names(x) == rn[int], colnames(x) == cn[ext]]
}
}
return(mymat)
}
# Selection interaction -------------------------------
select_inter <- function(response_var, df, explanatory_vars) {
results <- data.frame()
combinations <- combn(explanatory_vars, 2, simplify = FALSE)
for(i in seq_along(combinations)) {
formula <- as.formula(paste(response_var, "~", paste(combinations[[i]], collapse = "*")))
model <- gam(formula, data = df)
r_squared <- summary(model)$r.sq
aic <- AIC(model)
results <- rbind(results, data.frame("variables" = paste0(combinations[[i]], collapse = ".inter."),
"r_squared" = r_squared,
"aic" = aic))
}
return(results)
}
# Comparaion betwen predtited and observed -----------------------------------
plot_comp = function (df,ylabel, title_class, legende = TRUE,plotly = FALSE,xlabel = "observations",title=""){
p = ggplot(df, aes(x = observation)) +
#graph representant observed
geom_point(aes(y = Observed, color = "Observed values")) +
geom_line(aes(y = Observed, color = "Observed values")) +
#graph representant preticted
geom_point(aes(y = Predicted, color="Predicted values")) +
geom_line(aes(y = Predicted, color="Predicted values")) +
# ggtitle(title)
theme(plot.title = element_text(hjust = 0.5)) +
labs(title = title,x=xlabel, y=ylabel, color = "Legend :") +
ylim(min(c(min(df$Predicted), min(df$Observed))),
max(c(max(df$Predicted), max(df$Observed)))+1 ) +
scale_color_manual(values = c("Observed values"='red', "Predicted values"='green')) +
annotate("text", x = 8, y = max(c(max(df$Predicted), max(df$Observed)))+1,
label = title_class, col = "black", size = 3)
if (!legende) {
p <- p + theme(legend.position = "none")
}
if(plotly){
p = ggplotly(p)
}
return (p)
}
# Calcul R² -----------------------------------
# calcule_R2 = function(x, y) {cor(x, y)^2}
calcule_R2 <- function(y_true, y_pred) {
sse <- sum((y_true - y_pred)^2)
sst <- sum((y_true - mean(y_true))^2)
r_squared <- 1 - (sse / sst)
return(r_squared)
}
# Fonction d'exploration graphique des relations entre les vd et les predictors -----------------------------------
nuage_point <- function(df, var_rep, predicteur, titre_x, titre_y) {
# Convertir les variables en numeriques
df[[predicteur]] <- as.numeric(df[[predicteur]])
df[[var_rep]] <- as.numeric(df[[var_rep]])
df= df[,c(predicteur,var_rep)]
df = drop_na(df)
# correlation entre les deux variables
correlation <- round(cor(df[[predicteur]], df[[var_rep]]), 3)
# le graphique
g <- ggplot(df, aes(x = !!rlang::sym(predicteur), y = !!rlang::sym(var_rep))) +
geom_point() + # Ajout des points
geom_smooth(method = "lm", se = TRUE, color = "red") +
labs(
# title = paste("Relationship between",var_rep, "and", predicteur),
subtitle = paste0(" r = ", correlation),
x = titre_x,
y = titre_y) +
theme_classic()
return(g)
}
box_plot <- function(df, var_rep, predicteur, titre_x, titre_y) {
g <- ggplot(df, aes(x = !!rlang::sym(predicteur), y = !!rlang::sym(var_rep))) +
geom_boxplot() + # Ajout des boxplots
labs(
# title = paste("Boxplot of", var_rep, "by", predicteur),
x = titre_x,
y = titre_y) +
theme_classic()
print(g)
}
Columns with only NAs
To make the dataset easier to read, I identify and remove all columns with only NA:
[1] "thermal_weed_control" "amdmt_orga_freq" "amdmt_orga_names"
[4] "amdmt_orga_qtty" "amdmt_calcic" "amdmt_calcic_names"
[7] "amdmt_calcic_qtty"
Numbers | |
---|---|
cp | 468 |
dc | 4324 |
gp | 299 |
mh | 855 |
sg | 357 |
df_suivi = landworm
n_line = nrow(df_suivi)
# landworm$gps_x <- as.numeric(gsub("[^0-9.-]", "", landworm$gps_x))
# landworm$gps_y <- as.numeric(gsub("[^0-9.-]", "", landworm$gps_y))
landworm <- landworm[complete.cases(landworm$gps_x, landworm$gps_y), ]
landworm <- landworm %>%dplyr::filter(!is.na(gps_x) & !is.na(gps_y))
#sum(is.na(landworm$gps_x))
#sum(is.na(landworm$gps_y))
df_suivi = landworm
n_line = nrow(df_suivi)
df_coord <- landworm[, c("gps_x", "gps_y")] %>% mutate(gps_x = as.numeric(gps_x),gps_y = as.numeric(gps_y))
df_coord$num_ligne <- seq(nrow(df_coord))
carte <- leaflet(df_coord) %>%
addTiles() %>%
addCircleMarkers(lng = ~gps_x, lat = ~gps_y, radius = 0.8, fillOpacity = 0.8, fillColor = "blue")
carte
# sum(is.na(landworm$Annee))
landworm <- landworm %>%dplyr::filter(!is.na(Annee))# on enleve les NA
annes_omit= c("1821", "1960", "1978", "1982", "1983", "1984", "1986", "1988", "1989", "1905") # annee sup
landworm <- landworm[!landworm$Annee %in% annes_omit, ]
landworm=droplevels(landworm)
summary_df <- data.frame(Numbers = summary(landworm$Annee))
summary_df$Annee = rownames(summary_df)
summary_df = summary_df[,c("Annee", "Numbers")]
rownames(summary_df) = NULL
summary_df = summary_df %>% arrange(Annee)
summary_df %>% datatable(options = list(pageLength = 10))
Numbers | |
---|---|
1_Naturel | 1 |
2_Agricole | 3 |
Agricultural areas | 3706 |
Artificial surfaces | 834 |
Forest and semi natural areas | 541 |
Territoires agricoles | 1 |
Wetlands | 30 |
NA's | 894 |
levels(landworm$clcm_lvl1)[levels(landworm$clcm_lvl1) == "1_Naturel"] <- "Forest and semi natural areas"
levels(landworm$clcm_lvl1)[levels(landworm$clcm_lvl1) == "2_Agricole"] <- "Agricultural areas"
levels(landworm$clcm_lvl1)[levels(landworm$clcm_lvl1) == "Territoires agricoles"] <- "Agricultural areas"
landworm$clcm_lvl1= as.factor(landworm$clcm_lvl1)
summary_df <- as.data.frame(summary(landworm$clcm_lvl1))
colnames(summary_df) <- c("Numbers")
kable(summary_df,padding = 5)
Numbers | |
---|---|
Forest and semi natural areas | 542 |
Agricultural areas | 3710 |
Artificial surfaces | 834 |
Wetlands | 30 |
NA's | 894 |
levels(landworm$clcm_lvl2)[levels(landworm$clcm_lvl2) == "21_Agricole ouvert"] <- "Arable land"
landworm$clcm_lvl2= as.factor(landworm$clcm_lvl2)
summary_df <- as.data.frame(summary(landworm$clcm_lvl2))
colnames(summary_df) <- c("Numbers")
# kable(summary_df,padding = 5)
summary_df %>% datatable(options = list(pageLength = 10), rownames = TRUE)
Broad-leaved forest
Indetermined forest
Mixed forest
Urban forest and woodland
Pastures, meadows and other permanent grasslands under agricultural use
Non-irrigated arable land
Vineyards
Green urban areas
Natural grasslands
# select_os= c("Broad-leaved forest", "Coniferous forest", "Mixed forest",
# "Pastures, meadows and other permanent grasslands under agricultural use", "Non-irrigated arable land", "Vineyards","Green urban areas","Natural grasslands")
select_os= c("Broad-leaved forest", "Mixed forest", "Indetermined forest",
"Urban forest and woodland", "Pastures, meadows and other permanent grasslands under agricultural use", "Non-irrigated arable land", "Vineyards","Green urban areas","Natural grasslands")
landworm <- landworm[landworm$clcm_lvl3 %in% select_os, ]
landworm=droplevels(landworm)
landworm$clcm_lvl3 = as.factor(landworm$clcm_lvl3)
summary_df <- as.data.frame(summary(landworm$clcm_lvl3))
colnames(summary_df) <- c("Numbers")
summary_df %>% datatable(options = list(pageLength = 5))
We merge the 4 types of forest:
Merging levels :
levels(landworm$Protocole)[levels(landworm$Protocole) == "FHS"] <- "F_HS"
levels(landworm$Protocole)[levels(landworm$Protocole) == "hand sorting"] <- "HS"
landworm$Protocole = as.factor(landworm$Protocole)
summary_df <- as.data.frame(summary(landworm$Protocole))
colnames(summary_df) <- c("Numbers")
summary_df %>% datatable(options = list(pageLength = 5))
# kable (table(landworm$clcm_lvl1, landworm$Protocole,exclude = NULL), align = "c", format = "pipe", padding = 10)
# kable (table(landworm$clcm_lvl2, landworm$Protocole,exclude = NULL), align = "c", format = "pipe", padding = 10)
df = table(landworm$clcm_lvl3, landworm$Protocole,exclude = NULL)
# df = as.data.frame(df)
kable (df, align = "c", format = "pipe", padding = 10)
AITC_HS | AITCTM | F_HS | HS | HS_F_16 | HS_M_16 | M_HS | |
---|---|---|---|---|---|---|---|
Forest | 4 | 2 | 22 | 147 | 0 | 0 | 3 |
Green urban areas | 2 | 0 | 0 | 595 | 0 | 0 | 10 |
Natural grasslands | 0 | 66 | 3 | 62 | 0 | 0 | 9 |
Non-irrigated arable land | 80 | 81 | 358 | 1267 | 24 | 90 | 36 |
Pastures, meadows and other permanent grasslands under agricultural use | 60 | 0 | 227 | 264 | 0 | 42 | 0 |
Vineyards | 0 | 1 | 374 | 375 | 0 | 0 | 0 |
Sppression des valeurs aberrantes
Sppression des valeurs aberrantes
Total richness calculation method
Voici la liste des sp a fusionnée:
# function pour fusion les sous especes a leurs especes
sp_identique = function(df, nom_sous_espece, nom_espece ) {
# Si les deux colonnes existent dans df
if (nom_sous_espece %in% names(df) && nom_espece %in% names(df)) {
# Addition des valeurs des deux colonnes et stockage du résultat dans nom_espece
# df[[nom_espece]] = df[[nom_sous_espece]] + df[[nom_espece]]
df[[nom_espece]] = rowSums(df[,c(nom_sous_espece,nom_espece)], na.rm = TRUE)
df[[nom_sous_espece]] = NULL
# Si nom_espece n'existe pas dans df mais nom_sous_espece existe
} else if (nom_sous_espece %in% names(df) && !(nom_espece %in% names(df))) {
# on renome nom_sous_espece par nom_espece
names(df)[names(df) == nom_sous_espece] <- nom_espece
df[[nom_sous_espece]] = NULL
}
return(df)
}
for (i in 1:nrow(df_sp_sous_sp)){
landworm_sp = sp_identique(df = landworm_sp,nom_sous_espece = df_sp_sous_sp[i,"col_sp_origines"],
nom_espece = df_sp_sous_sp[i,"col_sp_concatener"])
}
# On supprimme les colonnes AB_ qui ne sont pas des espèces dans le calcule
ab_supprimee = c("AB_AD",
"AB_JV",
"AB_SA",
"AB_STAD_X",
"AB_indéterminable",
"AB_Indéterminable",
"AB_indéterminable_endogeic",
"AB_tot",
"AB_Indéterminable_epigeic",
"AB_indéterminable_endogeic",
"AB_Ep.X",
"AB_vide",
"AB_Ep.X1",
"AB_Ep.X2",
"AB_A.X",
"AB_Adult",
"AB_cocon",
"AB_indéterminé",
"AB_Juvenile",
"AB_Sub.adult",
"AB_Indéterminé")
colonnes_AB <- colonnes_AB[!colonnes_AB %in% ab_supprimee]
cat("A ce stade, la liste des especes est: \n")
A ce stade, la liste des especes est:
A ce stade, la richesse varie de:
# On calcule la richesse en attribiant 1 à chaque colonne si la valeur est différent de 0 et de NA
landworm_sp$Richesse_tot <- 0
landworm_sp$Richesse_tot <- rowSums(!is.na(landworm_sp[colonnes_AB]) & landworm_sp[colonnes_AB] != 0)
#sum (is.na(landworm_sp$Richesse_tot) )
summary(landworm_sp$Richesse_tot)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.000 3.000 5.000 4.923 7.000 14.000
Voir code:
# Apres calcule richesse ------------------------------------------------------
sp_prorata <- function(df, sp_x, sp) {
name_col <- names(df)
# si la colonne sp_x est présente
if (sp_x %in% name_col) {
# pour chaque ligne
for (i in 1:nrow(df)) {
# si les valeurs de la colonne sp_x ne sont ni 0 ni NA
if (!is.na(df[i, sp_x]) && df[i, sp_x] != 0) {
# Si la somme des ab des sp est différente de 0
if (rowSums(df[i, sp], na.rm = TRUE) != 0) {
# La richesse diminue de 1
df[i, "Richesse_tot"] <- df[i, "Richesse_tot"] - 1
}
}
}
}
return(df)
}
# Pour AB_Allolobophora_sp ------------------------------------------
sp_x = "AB_Allolobophora_sp"
sp=c("AB_Allolobophora_burgondiae",
"AB_Allolobophora_chlorotica",
"AB_Aporrectodea_cupulifera",
"AB_Aporrectodea_icterica",
"AB_Aporrectodea_limicola",
"AB_Aporrectodea_rosea")
# df= landworm_sp[, c(sp_x,sp, "Richesse_tot")]
# df=df[!df$AB_Allolobophora_sp == 0,]
# df=df[! is.na(df$AB_Allolobophora_sp),]
# df$som = rowSums(df[,sp], na.rm=TRUE)
# View(df)
#
# dff <- sp_prorata(landworm_sp, sp_x, sp)
# dff= dff[, c(sp_x,sp, "Richesse_tot")]
# dff=dff[!dff$AB_Allolobophora_sp == 0,]
# dff=dff[! is.na(dff$AB_Allolobophora_sp),]
# dff$som = rowSums(dff[,sp], na.rm=TRUE)
# View(dff)
landworm_sp <- sp_prorata(landworm_sp, sp_x, sp)
# Pour AB_Aporrectodea_indéterminable ------------------------------------------
sp_x = "AB_Aporrectodea_indéterminable"
sp=c("AB_Allolobophora_burgondiae",
"AB_Allolobophora_chlorotica",
"AB_Aporrectodea_cupulifera",
"AB_Aporrectodea_icterica",
"AB_Aporrectodea_limicola",
"AB_Aporrectodea_rosea")
landworm_sp <- sp_prorata(landworm_sp, sp_x, sp)
# Pour AB_Aporrectodea_sp ------------------------------------------
sp_x = "AB_Aporrectodea_sp"
sp=c("AB_Aporrectodea_giardi",
"AB_Aporrectodea_longa",
"AB_Aporrectodea_nocturna",
"AB_Aporrectodea_ripicola",
"AB_Aporrectodea_longa/giardi")
landworm_sp <- sp_prorata(landworm_sp, sp_x, sp)
# Pour AB_Dendrobaena_sp ------------------------------------------
# Tous les "Dendrobaena" et "Dendrodrilus_rubidus" + Satchellius mammalis
sp_x = "AB_Dendrobaena_sp"
sp=c("AB_Dendrobaena_alpina",
"AB_Dendrobaena_attemsi",
"AB_Dendrobaena_cognettii",
"AB_Dendrobaena_hortensis",
"AB_Dendrobaena_octaedra",
"AB_Dendrodrilus_rubidus",
"AB_Satchellius_mammalis")
landworm_sp <- sp_prorata(landworm_sp, sp_x, sp)
# Pour AB_Eisenia_sp ------------------------------------------
# Tous les Eisenia possible
sp_x = "AB_Eisenia_sp"
sp=c("AB_Eisenia_andrei",
"AB_Eisenia_fetida",
"AB_Eisenia_veneta",
"AB_Eiseniella_tetraedra")
landworm_sp <- sp_prorata(landworm_sp, sp_x, sp)
# Pour AB_Lumbricus_sp ------------------------------------------
# Tous les Lumbricus sauf Lumbricus castaneus ?
sp_x = "AB_Lumbricus_sp"
sp=c("AB_Lumbricus_centralis",
"AB_Lumbricus_festivus",
"AB_Lumbricus_friendi",
"AB_Lumbricus_herculeus",
"AB_Lumbricus_meliboeus",
"AB_Lumbricus_rubellus" ,
"AB_Lumbricus_terrestris",
"AB_Lumbricus_friendi/centralis")
landworm_sp <- sp_prorata(landworm_sp, sp_x, sp)
# Pour AB_Microscolex_sp ------------------------------------------
# Tous les AB_Microscolex
sp_x = "AB_Microscolex_sp"
sp=c("AB_Microscolex_dubius",
"AB_Microscolex_phosphoreus")
landworm_sp <- sp_prorata(landworm_sp, sp_x, sp)
# Pour AB_Octolasion_sp ------------------------------------------
# Tous les AB_Octolasion
sp_x = "AB_Octolasion_sp"
sp=c("AB_Octodrilus_complanatus",
"AB_Octolasion_cyaneum",
"AB_Octolasion_lacteum")
landworm_sp <- sp_prorata(landworm_sp, sp_x, sp)
# Pour AB_Pheretima_indéterminable ------------------------------------------
# AB_Pheritima_Diffringens
sp_x = "AB_Pheretima_indéterminable"
sp=c("AB_Pheritima_Diffringens", "AB_Pheritima_Diffringens")
landworm_sp <- sp_prorata(landworm_sp, sp_x, sp)
# Pour AB_Prosellodrilus_sp ------------------------------------------
# Tous les Prosellodrilus
sp_x = "AB_Prosellodrilus_sp"
sp=c("AB_Prosellodrilus_amplisetosus",
"AB_Prosellodrilus_fragilis",
"AB_Prosellodrilus_occidentalis",
"AB_Prosellodrilus_praticola",
"AB_Prosellodrilus_pyrenaicus")
landworm_sp <- sp_prorata(landworm_sp, sp_x, sp)
# Pour AB_Scherotheca_sp ------------------------------------------
# Tous les AB_Scherotheca_sp
sp_x = "AB_Scherotheca_sp"
sp=c("AB_Satchellius_mammalis",
"AB_Scherotheca_aquitana",
"AB_Scherotheca_dinoscolex",
"AB_Scherotheca_nivicola",
"AB_Scherotheca_porotheca",
"AB_Scherotheca_rhodana",
"AB_Scherotheca_savignyi")
landworm_sp <- sp_prorata(landworm_sp, sp_x, sp)
# dff <- data.frame(
# owner = c("DC", "GP", "XX", "DC"),
# AB_Aporrectodea_trapezoides = c(1, NA, 0, 2),
# AB_Aporrectodea_tuberculata = c(NA, 0, 1, 3),
# AB_Aporrectodea_caliginosa = c(2, 1, NA, 4),
# Richesse_tot = c(10, 20, 30, 40)
# )
# df <- dff
df=landworm_sp
for (i in 1:nrow(df)) {
# Si owner est "DC" ou "GP"
if (df[i, "owner"] %in% c("DC", "GP")) {
# si AB_Aporrectodea_trapezoides ou AB_Aporrectodea_tuberculata sont présents et non nuls
if ((!is.na(df[i, "AB_Aporrectodea_trapezoides"]) &&
df[i, "AB_Aporrectodea_trapezoides"] != 0)
||
(!is.na(df[i, "AB_Aporrectodea_tuberculata"]) &&
df[i, "AB_Aporrectodea_tuberculata"] != 0)) {
# si AB_Aporrectodea_caliginosa est présente et non nulle
if (!is.na(df[i, "AB_Aporrectodea_caliginosa"]) &&
df[i, "AB_Aporrectodea_caliginosa"] != 0) {
# La richesse diminue de 1 pour cette ligne
df[i, "Richesse_tot"] <- df[i, "Richesse_tot"] - 1
}
}
}
}
landworm_sp = df
Sppression des valeurs aberrantes
Il n y a pas de valeurs aberant. Cependant, il y a 11 parcelles qui ont une richesse superieur a 12, a garder ou a supprimer ?
Details des parcelles a forte richesse (sup à 12):
I remove the 11 plots with richness greater than 12.
Saving the database after applying the previous filters.
At this stage, the database has 4132 rows and 260 columns without the climatic and pedological variables.
# Ajout variables du soil (voir script Add_soil_dats.R)
soil_datas <- read.xlsx("datas/soil_datas.xlsx" , sheet = "Sheet 1")
soil_datas <- subset(soil_datas, select = -c(Programme,Annee,Date_Prelevement,ID_Site,gps_x, gps_y))
rows_not_in_soil_datas <- anti_join(landworm, soil_datas, by = "ID_2")
rows_not_in_landworm <- anti_join(soil_datas,landworm, by = "ID_2")
# plot(soil_datas$ID_2, landworm$ID_2)
merged_df <- merge(landworm, soil_datas, by = "ID_2")
ids_not_matching <- anti_join( merged_df,landworm, by = "ID_2")
# colSums(is.na(soil_datas))
nn = nrow(merged_df)
merged_df = drop_na(merged_df,names(soil_datas))
p_sans_pedo = nn - nrow(merged_df)
landworm = merged_df
196 plots have at least one NA in the pedological variables, so I am removing them, and the database goes from 4132 to 3936.
Extracted values (g/kg, 0 - 30 cm)
Extracted values (g/kg, 0 - 30 cm)
Sppression des valeurs aberrantes
Extracted values (g/kg, 0 - 30 cm)
Sppression des valeurs aberrantes
Sppression des valeurs aberrantes
Sppression des valeurs aberrantes
Sppression des valeurs aberrantes
# Lire le fichier Excel
chemin_fichier_excel <- "C:/Users/diall/Downloads/datas/ODMAP.xlsx"
climat <- read.xlsx(chemin_fichier_excel, sheet = "climat")
# Fusions des cellules des colonnes avec des éléments dupliqués
for (col in names(climat)) {
climat[[col]] <- ifelse(duplicated(climat[[col]]), "", climat[[col]])
}
# Affichage du tableau avec kableExtra et centrage du contenu des cellules
# kableExtra::kable(climat)
For each of the following variables, I calculate the mean and the standard deviation at 3 months, 6 months, and 120 months (30 variables in total).
Data available only until 2018/2019.
See file Script 1 DownloadChelsaData
See file Script 2 AddchelsaData
Steps:
Obtain a vector of “month_Year” periods:
Initialize the necessary columns:
Main loop through periods and variables:
Save the updated data
Merging database and climat database
# Ajout variables du climate (voir script Add_climate_dats.R)
climate_datas <- read.xlsx("datas/climate_datas.xlsx" , sheet = "Sheet 1")
climate_datas <- subset(climate_datas, select = -c(Programme,Annee,Date_Prelevement,ID_Site,gps_x, gps_y))
rows_not_in_climate_datas <- anti_join(landworm, climate_datas, by = "ID_2")
rows_not_in_landworm <- anti_join(climate_datas,landworm, by = "ID_2")
# plot(climate_datas$ID_2, landworm$ID_2)
merged_df <- merge(landworm, climate_datas, by = "ID_2")
ids_not_matching <- anti_join( merged_df,landworm, by = "ID_2")
# colSums(is.na(climate_datas))
nn = nrow(merged_df)
merged_df = drop_na(merged_df,names(climate_datas))
p_sans_climat = nn - nrow(merged_df)
landworm = merged_df
0 plots have at least one NA in the climate variables, so I am removing them, and the database goes from 3882 to 3882.
::: {.panel-tabset}
The database therefore changes from 3882 to 3876 observations.
The database therefore changes from 3876 to 3876 observations.
Standard deviation
The database therefore changes from 3876 to 3870 observations.
Standard deviation
:::
The database therefore changes from 3870 to 3849 observations.
The database therefore changes from 3849 to 3849 observations.
Standard deviation
The database therefore changes from 3849 to 3849 observations.
Standard deviation
The database therefore changes from 3849 to 3849 observations.
The database therefore changes from 3849 to 3849 observations.
Standard deviation
The database therefore changes from 3849 to 3847 observations.
Standard deviation
The database therefore changes from 3847 to 3826 observations.
The database therefore changes from 3822 to 3803 observations.
Standard deviation
The database therefore changes from 3803 to 3793 observations.
Standard deviation
The database therefore changes from 3784 to 3784 observations.
The database therefore changes from 3784 to 3550 observations.
Standard deviation
The database therefore changes from 3545 to 3523 observations.
Standard deviation
Data set reduction:
3 Response variables: Abundance, biomass, and richness
41 Explanatory variables (including 30 climatic, 10 pedological, and land use)
id_col=c("ID_2","Programme","Annee","ID_Site","Protocole")
vdt_col=c("AB_tot", "BM_tot", "Richesse_tot")
os_and_gps = c('clcm_lvl3','gps_x', 'gps_y')
soils_variables = c("sable.0_30","limon.0_30","argile.0_30",'jrc_pH_H2O','N','P','K','CN', 'CaCO3',"c_orga_0_a_30")
climates_variables = c("tas_mean_3","tas_mean_6","tas_mean_120",
"tas_sd_3","tas_sd_6","tas_sd_120",
"tasmax_mean_3","tasmax_mean_6","tasmax_mean_120",
"tasmax_sd_3","tasmax_sd_6","tasmax_sd_120",
"tasmin_mean_3","tasmin_mean_6","tasmin_mean_120",
"tasmin_sd_3","tasmin_sd_6","tasmin_sd_120",
"pr_mean_3","pr_mean_6","pr_mean_120",
"pr_sd_3","pr_sd_6","pr_sd_120",
"pet_mean_3","pet_mean_6","pet_mean_120",
"pet_sd_3","pet_sd_6","pet_sd_120")
landworm_explo = landworm[,c(id_col,vdt_col,os_and_gps,soils_variables,climates_variables)]
landworm_explo <- landworm_explo %>%
dplyr::rename(
Sand = sable.0_30,
Silt = limon.0_30,
Clay = argile.0_30,
pH = jrc_pH_H2O,
C_org = c_orga_0_a_30,
LC_ = clcm_lvl3,
Long = gps_x ,
Lat = gps_y
)
cl_original <- levels(landworm_explo$LC_)
new_cl <- c("For","Gua", "Nag", "Nial", "Pmo", "Viny")
landworm_explo$LC_ <- factor(landworm_explo$LC_, levels = cl_original, labels = new_cl)
soils_variables = c("Sand","Silt","Clay",'pH','N','P','K','CN', 'CaCO3',"C_org")
variables_factor = c("LC_")
all_predictors = c(variables_factor,soils_variables,climates_variables)
all_predictors_num = all_predictors[!all_predictors %in% c(variables_factor)]
Il reste donc 3521 lignes et 51 colonnes.
Séparation entre les variables pédologiques et climatiques
a). Variables pédologiques
Conclusion: je supprime Silt et je garde Sand. De même je suprime CaCO3 et je garde pH.
b). Variables climatiques:
On enlève d’abord les variables sd:
pet_sd_120, pet_sd_3, pr_sd_120, pr_sd_6, tasmin_sd_120, tasmin_sd_6, tasmin_sd_3, tasmax_sd_120, tasmax_sd_6, tasmax_sd_3
Maintenat on enleve les variables qui ont le plus de correlation
On enleve donc:
pet_mean_3, pet_mean_120, tas_mean_3, tasmin_mean_3, tasmin_mean_6, tasmin_mean_120, tasmax_mean_6, tasmax_mean_3, tasmax_mean_120, pr_mean_6
Synthese
Il reste donc 18 variables explicatives non correlee
=> 22 variables supprimees
No variable from the 18 input variables has collinearity problem.
The linear correlation coefficients ranges between:
min correlation ( pr_sd_3 ~ P ): -0.0006721748
max correlation ( tas_sd_6 ~ tas_mean_6 ): 0.6810085
---------- VIFs of the remained variables --------
Variables VIF
1 Sand 2.352594
2 Clay 3.059920
3 pH 2.802590
4 N 2.796920
5 P 2.056419
6 K 1.389062
7 CN 2.948456
8 C_org 1.847050
9 tas_mean_6 7.077602
10 tas_mean_120 3.282206
11 tas_sd_3 3.032422
12 tas_sd_6 8.558721
13 tas_sd_120 3.927902
14 pr_mean_3 3.725966
15 pr_mean_120 2.572277
16 pr_sd_3 2.058873
17 pet_mean_6 5.740856
18 pet_sd_6 5.842996
- AB_tot & LC_
- AB_tot & Sand
- AB_tot & Silt
- AB_tot & Clay
- AB_tot & pH
- AB_tot & N
- AB_tot & P
- AB_tot & K
- AB_tot & CN
- AB_tot & CaCO3
- AB_tot & C_org
- AB_tot & tas_mean_3
- AB_tot & tas_mean_6
- AB_tot & tas_mean_120
- AB_tot & tas_sd_3
- AB_tot & tas_sd_6
- AB_tot & tas_sd_120
- AB_tot & tasmax_mean_3
- AB_tot & tasmax_mean_6
- AB_tot & tasmax_mean_120
- AB_tot & tasmax_sd_3
- AB_tot & tasmax_sd_6
- AB_tot & tasmax_sd_120
- AB_tot & tasmin_mean_3
- AB_tot & tasmin_mean_6
- AB_tot & tasmin_mean_120
- AB_tot & tasmin_sd_3
- AB_tot & tasmin_sd_6
- AB_tot & tasmin_sd_120
- AB_tot & pr_mean_3
- AB_tot & pr_mean_6
- AB_tot & pr_mean_120
- AB_tot & pr_sd_3
- AB_tot & pr_sd_6
- AB_tot & pr_sd_120
- AB_tot & pet_mean_3
- AB_tot & pet_mean_6
- AB_tot & pet_mean_120
- AB_tot & pet_sd_3
- AB_tot & pet_sd_6
- AB_tot & pet_sd_120
- BM_tot & LC_
- BM_tot & Sand
- BM_tot & Silt
- BM_tot & Clay
- BM_tot & pH
- BM_tot & N
- BM_tot & P
- BM_tot & K
- BM_tot & CN
- BM_tot & CaCO3
- BM_tot & C_org
- BM_tot & tas_mean_3
- BM_tot & tas_mean_6
- BM_tot & tas_mean_120
- BM_tot & tas_sd_3
- BM_tot & tas_sd_6
- BM_tot & tas_sd_120
- BM_tot & tasmax_mean_3
- BM_tot & tasmax_mean_6
- BM_tot & tasmax_mean_120
- BM_tot & tasmax_sd_3
- BM_tot & tasmax_sd_6
- BM_tot & tasmax_sd_120
- BM_tot & tasmin_mean_3
- BM_tot & tasmin_mean_6
- BM_tot & tasmin_mean_120
- BM_tot & tasmin_sd_3
- BM_tot & tasmin_sd_6
- BM_tot & tasmin_sd_120
- BM_tot & pr_mean_3
- BM_tot & pr_mean_6
- BM_tot & pr_mean_120
- BM_tot & pr_sd_3
- BM_tot & pr_sd_6
- BM_tot & pr_sd_120
- BM_tot & pet_mean_3
- BM_tot & pet_mean_6
- BM_tot & pet_mean_120
- BM_tot & pet_sd_3
- BM_tot & pet_sd_6
- BM_tot & pet_sd_120
- Richesse_tot & LC_
- Richesse_tot & Sand
- Richesse_tot & Silt
- Richesse_tot & Clay
- Richesse_tot & pH
- Richesse_tot & N
- Richesse_tot & P
- Richesse_tot & K
- Richesse_tot & CN
- Richesse_tot & CaCO3
- Richesse_tot & C_org
- Richesse_tot & tas_mean_3
- Richesse_tot & tas_mean_6
- Richesse_tot & tas_mean_120
- Richesse_tot & tas_sd_3
- Richesse_tot & tas_sd_6
- Richesse_tot & tas_sd_120
- Richesse_tot & tasmax_mean_3
- Richesse_tot & tasmax_mean_6
- Richesse_tot & tasmax_mean_120
- Richesse_tot & tasmax_sd_3
- Richesse_tot & tasmax_sd_6
- Richesse_tot & tasmax_sd_120
- Richesse_tot & tasmin_mean_3
- Richesse_tot & tasmin_mean_6
- Richesse_tot & tasmin_mean_120
- Richesse_tot & tasmin_sd_3
- Richesse_tot & tasmin_sd_6
- Richesse_tot & tasmin_sd_120
- Richesse_tot & pr_mean_3
- Richesse_tot & pr_mean_6
- Richesse_tot & pr_mean_120
- Richesse_tot & pr_sd_3
- Richesse_tot & pr_sd_6
- Richesse_tot & pr_sd_120
- Richesse_tot & pet_mean_3
- Richesse_tot & pet_mean_6
- Richesse_tot & pet_mean_120
- Richesse_tot & pet_sd_3
- Richesse_tot & pet_sd_6
- Richesse_tot & pet_sd_120
lamda = 0.3
lamda = 0.4
lamda = 0.6
Transformation sqrt de l’abondance et de la biomasse
Transformation centrée reduite des prédicteurs
All predictors: => RF, GBM et ANN
[1] "LC_" "Sand" "Silt" "Clay"
[5] "pH" "N" "P" "K"
[9] "CN" "CaCO3" "C_org" "tas_mean_3"
[13] "tas_mean_6" "tas_mean_120" "tas_sd_3" "tas_sd_6"
[17] "tas_sd_120" "tasmax_mean_3" "tasmax_mean_6" "tasmax_mean_120"
[21] "tasmax_sd_3" "tasmax_sd_6" "tasmax_sd_120" "tasmin_mean_3"
[25] "tasmin_mean_6" "tasmin_mean_120" "tasmin_sd_3" "tasmin_sd_6"
[29] "tasmin_sd_120" "pr_mean_3" "pr_mean_6" "pr_mean_120"
[33] "pr_sd_3" "pr_sd_6" "pr_sd_120" "pet_mean_3"
[37] "pet_mean_6" "pet_mean_120" "pet_sd_3" "pet_sd_6"
[41] "pet_sd_120"
No correlation: => GLM et GAM |0.7|
[1] "LC_" "Sand" "Clay" "pH" "N"
[6] "P" "K" "CN" "C_org" "tas_mean_6"
[11] "tas_mean_120" "tas_sd_3" "tas_sd_6" "tas_sd_120" "pr_mean_3"
[16] "pr_mean_120" "pr_sd_3" "pet_mean_6" "pet_sd_6"
Initialization
Model creation and evaluation Loop
For each iteration i
up to length(variables) - 1
:
Model creation
Calculation of variable importance
Determination of the least important variable
Predictions on Test Data
Calculation of performance metrics
RMSE
, adjusted R²
for training, R²
for testing, and MAE
.Update of Train and Test datasets
train
& test
.Recording of Results
# AB_tot --------------------------------------------------------------------------
df_mod_AB_tot = data_deep[,c("AB_tot",predictor_deep, LC_levels)]
df_mod_AB_tot = drop_na(df_mod_AB_tot)
df_mod_AB_tot = droplevels(df_mod_AB_tot)
# Partition
# set.seed(123)
# ind <- sample(2, nrow(df_mod_AB_tot), replace = T, prob = c(.8, .2))
# AB_tot_train <- df_mod_AB_tot[ind==1,]
# AB_tot_test <- df_mod_AB_tot[ind==2,]
set.seed(42)
split <- rsample::initial_split(df_mod_AB_tot,prop = 0.8, strata = "AB_tot")
AB_tot_train <- rsample::training(split)
AB_tot_test <- rsample::testing(split)
write.csv2(x =AB_tot_train,file = "datas/landW/AB_tot_train.csv", row.names = FALSE)
write.csv2(x =AB_tot_test,file = "datas/landW/AB_tot_test.csv", row.names = FALSE)
#
# AB_tot_train = read.csv2("datas/landW/AB_tot_train.csv")
# AB_tot_test = read.csv2("datas/landW/AB_tot_test.csv")
# df_mod_AB_tot = rbind(AB_tot_train,AB_tot_test)
AB_tot_train = as.data.frame(AB_tot_train)
AB_tot_test = as.data.frame(AB_tot_test)
df <- data.frame(y =AB_tot_train[,"AB_tot"])
abundance_dist_train = ggplot(df, aes(x=y)) +
geom_histogram( fill="#69b3a2", color="#e9ecef", bins=30, alpha=2) +
geom_density(fill="black", alpha=0.2) +
theme_gray() +
labs(title="Abundance: Train", x="Value", y="Density") +
theme(plot.title = element_text(hjust = 0.5))
# ggsave("Results/landW/abundance_dist_train.png", plot = abundance_dist_train, dpi = 300,width = 3,height = 2)
df <- data.frame(y =AB_tot_test[,"AB_tot"])
abundance_dist_test = ggplot(df, aes(x=y)) +
geom_histogram( fill="#69b3a2", color="#e9ecef", bins=30, alpha=2) +
geom_density(fill="black", alpha=0.2) +
theme_gray() +
labs(title="Abundance: Test", x="Value", y="Density") +
theme(plot.title = element_text(hjust = 0.5))
# ggsave("Results/landW/abundance_dist_test.png", plot = abundance_dist_test, dpi = 300,width = 3,height = 2)
# Distrvitbution de var rep dans train et de test: est ce homogene ?
abundance_dist_train_and_test = ggarrange(abundance_dist_train, abundance_dist_test,
labels = c('(a)', '(b)'),
common.legend = TRUE,
legend = 'right'
)
ggsave("Results/landW/abundance_dist_train_and_test.png", plot = abundance_dist_train_and_test, dpi = 300,height = 2,width = 4)
Abundance
Data partition (3521, 48):
train data (80 %) = 2816, 48
test data (20 %) = 705, 48
Biomasse
Data partition (2304, 48):
train data (80 %) = 1840, 48
test data (20 %) = 464, 48
Richness
Data partition (3521, 48):
train data (80 %) = 2816, 48
test data (20 %) = 705, 48
GLM <- function(var_rep, df_app, df_valid,family = 'gaussian'){
var_predicteurs = names(df_app[,-1])
df_app = df_app[,c(var_rep,var_predicteurs)]
df_valid = df_valid[,c(var_rep,var_predicteurs)]
formula <- substitute(var_rep ~ ., list(var_rep = as.name(var_rep)))
# entrainement du modele sur le jeu d'entrainement
modelglm<-glm(formula,family = family ,data = df_app)
# Prediction sur le jeu de validation
pred.GLM<-predict(modelglm,newdata=as.data.frame(df_valid[,var_predicteurs]))
# Calcul du RMSE pour évaluer la qualite du modele
rmse <- round (sqrt(mean((df_valid[,var_rep] - pred.GLM)^2,na.rm=TRUE)),2)
# Calcul du R² ajusté pour train
R_adj_train <- calcule_R2(df_app[,var_rep], predict(modelglm, newdata=df_app))
n_train <- nrow(df_app)
p_train <- ncol(df_app) - 1
r_adj_train <- 1 - ((1 - R_adj_train) * (n_train - 1) / (n_train - p_train - 1))
# Calcul du R²
# R_adj_test <-calcule_R2(df_valid[,var_rep],pred.GLM)
# n_test <- nrow(df_valid)
# p_test <- ncol(df_valid) - 1
# r_adj_test <- 1 - ((1 - R_adj_test) * (n_test - 1) / (n_test - p_test - 1))
res <- rms::lrm(df_valid[,var_rep] ~ pred.GLM, x= TRUE, y = TRUE)
res = res$stats
r_adj_test = round (res[["R2"]],2)
MAE <- mean(abs(pred.GLM - df_valid[,var_rep]),na.rm=TRUE)
# Round results
rmse <- round(rmse, 2)
r_adj_train <- round(r_adj_train, 2)
r_adj_test <- round(r_adj_test, 2)
MAE <- round(MAE, 2)
# output
results_df <- data.frame(Algorithms = "GLM",
Response_variables = var_rep,
R2_adjusted_train = r_adj_train,
R2_adjusted_test = r_adj_test,
RMSE = rmse,
MAE = MAE)
results <- list(RMSE = rmse, R_adj_train = r_adj_train, R_adj_test = r_adj_test, MAE = MAE, model = modelglm,predit = pred.GLM, df = results_df)
return(results)
}
GAM <- function(var_rep, df_app, df_valid, family = 'gaussian',method = "REML", interaction = FALSE){
var_predicteurs = names(df_app[,-1])
if (var_rep == "AB_tot"){
modelgam<-gam(AB_tot ~ s(CaCO3) + s(gps_x) + s(N) + s(tasmax) + s(gps_y) + s(clay) + s(silt) + s(P) + s(bio12) + clcm_lvl3f + clcm_lvl3gua + clcm_lvl3ng + clcm_lvl3nial + clcm_lvl3p + clcm_lvl3v,
family=family,method = method,data = df_app)
}
if (var_rep == "BM_tot"){
modelgam<-gam(BM_tot ~ s(CaCO3) + s(gps_x) + s(N) + s(tasmax) + s(gps_y) + s(clay) + s(silt) + s(P) + s(bio12) + clcm_lvl3f + clcm_lvl3gua + clcm_lvl3ng + clcm_lvl3nial + clcm_lvl3p + clcm_lvl3v,
family=family,method = method,data = df_app)
}
if(var_rep == "Richesse_tot"){
modelgam<-gam(Richesse_tot ~ s(CaCO3) + s(gps_x) + s(N) + s(tasmax) + s(gps_y) + s(clay) + s(silt) + s(P) + s(bio12) + clcm_lvl3f + clcm_lvl3gua + clcm_lvl3ng + clcm_lvl3nial + clcm_lvl3p + clcm_lvl3v ,
family=family,method = method,data = df_app)
}
# Prediction sur le jeu de validation
pred.GAM <- predict(modelgam,newdata=as.data.frame(df_valid[,var_predicteurs]))
# Calcul du RMSE pour évaluer la qualite du modele
rmse <- sqrt(mean((df_valid[,var_rep] - pred.GAM)^2,na.rm=TRUE))
# Calcul du R² ajusté pour train
R_adj_train <- calcule_R2(df_app[,var_rep], predict(modelgam, newdata=df_app))
n_train <- nrow(df_app)
p_train <- ncol(df_app) - 1
r_adj_train <- 1 - ((1 - R_adj_train) * (n_train - 1) / (n_train - p_train - 1))
# Calcul du R² ajusté pour test
# R_adj_test <-calcule_R2(df_valid[,var_rep],pred.GAM)
# n_test <- nrow(df_valid)
# p_test <- ncol(df_valid) - 1
# r_adj_test <- 1 - ((1 - R_adj_test) * (n_test - 1) / (n_test - p_test - 1))
res <- rms::lrm(df_valid[,var_rep] ~ pred.GAM, x= TRUE, y = TRUE)
res = res$stats
r_adj_test = round (res[["R2"]],2)
# Calcule le MAE
MAE <- mean(abs(pred.GAM - df_valid[,var_rep]))
# Round results
rmse <- round(rmse, 2)
r_adj_train <- round(r_adj_train, 2)
r_adj_test <- round(r_adj_test, 2)
MAE <- round(MAE, 2)
# output
results_df <- data.frame(Algorithms = "GAM",
Response_variables = var_rep,
R2_adjusted_train = r_adj_train,
R2_adjusted_test = r_adj_test,
RMSE = rmse,
MAE = MAE)
results <- list(RMSE = rmse, R_adj_train = r_adj_train, R_adj_test = r_adj_test, MAE = MAE, model = modelgam, predit = pred.GAM, df = results_df)
return(results)
}
Family = gaussian
Link function = identity
Method = REML
Tuning
RF model tuning by grid
ntree = \(100,300,500,700,900,1000,1300,1500,1700,2000\)
mtry = \(2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24\)
maxnodes = \(10 , 20, 30, 40, 50, 60, 70, 80, 90, 100\)
Total number of models = \(ntree * mtry * maxnode = 960\)
ForetAlea <- function(var_rep, df_app, df_valid, mtry, ntree, maxnodes) {
set.seed(1863)
col_posi <- which(names(df_app) == var_rep)
ForeVDT <- randomForest::randomForest(df_app[-col_posi], df_app[[col_posi]], mtry = mtry, ntree = ntree, maxnodes = maxnodes)
# Prediction on the validation dataset
col_posi <- which(names(df_valid) == var_rep)
pred.RF <- predict(ForeVDT, newdata = df_valid[, -col_posi])
# Calculate RMSE to evaluate model quality
rmse <- sqrt(mean((df_valid[, col_posi] - pred.RF)^2))
# Calcul du R² ajusté pour train
R_adj_train <- calcule_R2(df_app[,var_rep], predict(ForeVDT, newdata=df_app))
n_train <- nrow(df_app)
p_train <- ncol(df_app) - 1
r_adj_train <- 1 - ((1 - R_adj_train) * (n_train - 1) / (n_train - p_train - 1))
# Calcul du R² ajusté pour test
# R_adj_test <-calcule_R2(df_valid[,col_posi],pred.RF)
# n_test <- nrow(df_valid)
# p_test <- ncol(df_valid) - 1
# r_adj_test <- 1 - ((1 - R_adj_test) * (n_test - 1) / (n_test - p_test - 1))
res <- rms::lrm(df_valid[,var_rep] ~ pred.RF, x= TRUE, y = TRUE)
res = res$stats
r_adj_test = round (res[["R2"]],2)
# Calculate MAE
MAE <- mean(abs(pred.RF - df_valid[, col_posi]))
# Round results
rmse <- round(rmse, 2)
r_adj_train <- round(r_adj_train, 2)
r_adj_test <- round(r_adj_test, 2)
MAE <- round(MAE, 2)
# output
results_df <- data.frame(Algorithms = "RF",
Response_variables = var_rep,
R2_adjusted_train = r_adj_train,
R2_adjusted_test = r_adj_test,
RMSE = rmse,
MAE = MAE)
results <- list(RMSE = rmse, R_adj_train = r_adj_train, R_adj_test = r_adj_test, MAE = MAE, model = ForeVDT, predit = pred.RF, df = results_df)
return(results)
}
Default model
GBM model tuning by grid
n.trees = \(1000, 1500, 1700, 2000, 3000\)
shrinkage = \(0.01, 0.02, 0.05, 0.001, 0.002, 0.005\)
interaction.depth = \(3, 5, 6, 8, 10\)
n.minobsinnode = \(2, 5, 10, 30, 50, 70\)
Total number of models = \(n.trees * shrinkage * interaction.depth * n.minobsinnode = 900\)
GBM <- function(var_rep, df_app, df_valid,distribution = 'gaussian',n.trees ,shrinkage,interaction.depth,n.minobsinnode){
formula <- substitute(var_rep ~ ., list(var_rep = as.name(var_rep)))
Gradboost<-gbm(formula, data = df_app,
distribution = distribution,
n.trees = n.trees,
shrinkage = shrinkage,
interaction.depth = interaction.depth,
n.minobsinnode = n.minobsinnode)
# Prediction sur le jeu de validation :
col_posi <- which(names(df_valid) == var_rep)
prev.GBM<-predict(Gradboost,newdata=as.data.frame(df_valid[,-col_posi]))
# Calcul du RMSE pour évaluer la qualité du modele
rmse <- sqrt(mean((df_valid[,var_rep] - prev.GBM)^2))
# Calcul du R² ajusté pour train
R_adj_train <- calcule_R2(df_app[,var_rep], predict(Gradboost, newdata=df_app))
n_train <- nrow(df_app)
p_train <- ncol(df_app) - 1
r_adj_train <- 1 - ((1 - R_adj_train) * (n_train - 1) / (n_train - p_train - 1))
# Calcul du R² ajusté pour test
# R_adj_test <-calcule_R2(df_valid[,col_posi],prev.GBM)
# n_test <- nrow(df_valid)
# p_test <- ncol(df_valid) - 1
# r_adj_test <- 1 - ((1 - R_adj_test) * (n_test - 1) / (n_test - p_test - 1))
res <- rms::lrm(df_valid[,var_rep] ~ prev.GBM, x= TRUE, y = TRUE)
res = res$stats
r_adj_test = round (res[["R2"]],2)
# calcule MAE
MAE <- mean(abs(prev.GBM - df_valid[,col_posi]))
# Round results
rmse <- round(rmse, 2)
r_adj_train <- round(r_adj_train, 2)
r_adj_test <- round(r_adj_test, 2)
MAE <- round(MAE, 2)
# output
results_df <- data.frame(Algorithms = "GBM",
Response_variables = var_rep,
R2_adjusted_train = r_adj_train,
R2_adjusted_test = r_adj_test,
RMSE = rmse,
MAE = MAE)
results <- list(RMSE = rmse, R_adj_train = r_adj_train, R_adj_test = r_adj_test, MAE = MAE, model = Gradboost, predit = prev.GBM, df = results_df)
return(results)
}
Summary of Compilations
For GLM and GAM, there were 19 explanatory variables => \(18*5*3*2 = 540\)
For RF, GBM, and ANN, there were 41 variables => \(40*5*3*3 = 1800\)
Excluding optimization models
“Le R² ajusté est utile pour les modèles de régression linéaire, il est moins pertinent pour les modèles de machine learning en raison de la complexité des modèle.”
La formule du coefficient de détermination \(R^2\) est :
\[\begin{equation} R^2 = 1 - \frac{\sum_{i=1}^{n} (y_i - \hat{y}_i)^2}{\sum_{i=1}^{n} (y_i - \bar{y})^2} \end{equation}\]
où :
La formule du \(R²\) ajusté est :
\[\begin{equation} R^2_{\text{ajusté}} = 1 - \left( \frac{(1 - R^2) \cdot (n - 1)}{n - k - 1} \right) \end{equation}\]
où :
Justificatifs
“La procédure d’ajustement du modèle des models non linéaires ne repose pas sur la minimisation progressive de la somme des carrés des erreurs résiduelles (RSS) et, par conséquent, le modèle ajusté de manière optimale pourrait avoir une somme des carrés résiduels supérieure à la somme totale des carrés. Cela signifie que R² pour de tels modèles peut être une quantité négative. En tant que tel, R² n’est pas une mesure utile de la qualité de l’ajustement pour la plupart des modèles non linéaires.”
“Les modèles non linéaires utilisent souvent des techniques d’ajustement de modèle telles que l’estimation de vraisemblance maximale (MLE) qui ne minimisent pas nécessairement la somme des carrés résiduels (RSS). Ainsi, étant donné deux modèles non linéaires ajustés à l’aide de MLE, celui qui présente la meilleure qualité d’ajustement peut se révéler avoir un R² ou un R² ajusté plus faible. Une autre conséquence de ce fait est que l’ajout de variables de régression aux modèles non linéaires peut réduire le R². Globalement, le R² ou le R² ajusté ne doivent pas être utilisés pour juger de la qualité d’ajustement d’un modèle de régression non linéaire.”
“De nombreux modèles de régression non linéaires n’utilisent pas la technique d’estimation des moindres carrés ordinaires pour ajuster le modèle.”
The best model of GLM is :
\(mod_11 = glm(AB_tot ~ pet_sd_6 + tas_sd_6 + pr_mean_120 + P + Clay + \\ Sand + CN + pH + LC_, df = data)\)
The best model of gam is:
\(mod_7 = gam(AB_tot ~ tas_sd_6 + tas_sd_3 + Sand + tas_sd_120 +\\ K + pet_sd_6 + pH + CN + P + tas_mean_6 + pr_mean_3 + tas_mean_120 + LC_, df = data)\)
The best model of RF is:
\(mod_28 = rf(AB_tot ~ CN + pr_mean_6 + pet_mean_3 + K + tasmin_sd_6 + \\ tasmin_mean_120 + tasmin_mean_6 + Clay + pr_sd_120 + tas_mean_120 + CaCO3 + \\tas_sd_120 + pH + 4LC_ , df = data)\)
The best model of GBM is:
\(mod_17 = gbm(AB_tot ~ tasmin_sd_6 + tasmax_mean_6 + Sand + pr_mean_120 + tas_mean_6 +\\ pet_sd_3 + CaCO3 + tasmin_mean_120 + CN + K + tasmax_mean_120 + pet_sd_6 + pr_sd_6 +\\ P + Silt + tas_sd_6 + pr_mean_6 + pet_mean_3 + tasmin_mean_6 + pr_sd_120 + Clay + pH + \\ tas_mean_120 + pr_mean_3 + LC_, df = data)\)
The best model of ANN is:
\(mod_27 = ann(AB_tot ~ pr_sd_6 + pH + P + pr_sd_120 + K + tasmin_sd_3 + \\ pet_sd_3 + pr_sd_3 + pet_mean_120 + tas_sd_120 + CaCO3 + Silt + tasmax_mean_120 + \\ tasmin_sd_120 + LC_ , df = data)\)
Models | Rep.var | R2_test | RMSE | MAE |
---|---|---|---|---|
GLM | Abundance | 0.31 | 5.65 | 4.52 |
GAM | Abundance | 0.36 | 5.43 | 4.28 |
RF | Abundance | 0.46 | 5.00 | 3.87 |
GBM | Abundance | 0.36 | 5.55 | 4.39 |
ANN | Abundance | 0.33 | 6.20 | 4.72 |
Meta models
Models | Rep.var | R2_test | RMSE | MAE |
---|---|---|---|---|
averaging | Abundance | 0.44 | 5.29 | 4.15 |
Stacking | Abundance | 0.47 | 4.94 | 3.83 |
GLM
\(mod = glm(AB_tot ~ pet_sd_6 + tas_sd_6 + pr_mean_120 + P + Clay + \\ Sand + CN + pH + LC_, df = data)\)
GAM
\(mod = gam(AB_tot ~ tas_sd_6 + tas_sd_3 + Sand + tas_sd_120 +\\ K + pet_sd_6 + pH + CN + P + tas_mean_6 + pr_mean_3 + tas_mean_120 + LC_, df = data)\)
RF
\(mod = rf(AB_tot ~ CN + pr_mean_6 + pet_mean_3 + K + tasmin_sd_6 + \\ tasmin_mean_120 + tasmin_mean_6 + Clay + pr_sd_120 + tas_mean_120 + CaCO3 + \\tas_sd_120 + pH + 4LC_ , df = data)\)
GBM
\(mod= gbm(AB_tot ~ tasmin_sd_6 + tasmax_mean_6 + Sand + pr_mean_120 + tas_mean_6 +\\ pet_sd_3 + CaCO3 + tasmin_mean_120 + CN + K + tasmax_mean_120 + pet_sd_6 + pr_sd_6 +\\ P + Silt + tas_sd_6 + pr_mean_6 + pet_mean_3 + tasmin_mean_6 + pr_sd_120 + Clay + pH + \\ tas_mean_120 + pr_mean_3 + LC_, df = data)\)
ANN
\(mod = ann(AB_tot ~ pr_sd_6 + pH + P + pr_sd_120 + K + tasmin_sd_3 + \\ pet_sd_3 + pr_sd_3 + pet_mean_120 + tas_sd_120 + CaCO3 + Silt + tasmax_mean_120 + \\ tasmin_sd_120 + LC_ , df = data)\)
Conclusion: le meillleur model pour l’abondance est le RF
The best model of GLM is :
\(mod_6 = glm(BM_tot ~ N + pet_mean_6 + pet_sd_6 + tas_sd_120 + P + Sand + \\ Clay + C_org + tas_sd_6 + tas_sd_3 + pr_sd_3 + CN + tas_mean_6 + pH + \\ pr_mean_120 + LC_, df = data)\)
The best model of gam is:
\(mod_5 = gam(BM_tot ~ P + CN + K + C_org + Sand + tas_sd_3 + pet_sd_6 + pr_sd_3 + tas_sd_6 + tas_mean_120 + pr_mean_3 + pH + pr_mean_120 + LC_ + LC_ , df = data)\)
The best model of RF is:
\(mod_34 = rf(BM_tot ~ pr_mean_120 + Sand + pH + pet_sd_120 + tasmax_sd_3 + LC_ + tasmin_sd_120 + pet_mean_3 , df = data)\)
The best model of GBM is:
\(mod_25 = gbm(BM_tot ~ pr_mean_6 + pet_sd_6 + tasmax_sd_3 + tasmin_mean_120 + \\P + pr_mean_120 + pr_sd_3 + pH + pr_sd_120 + tasmax_sd_6 + pet_mean_3 + Silt + Sand +\\ tasmax_mean_6 + pr_mean_3 + LC_ + tasmin_sd_3 , df = data)\)
The best model of ANN is:
\(mod_15 = ann(BM_tot ~ pr_mean_3 + tasmax_mean_120 + pet_mean_120 + pr_sd_6 + \\Sand + tasmin_sd_6 + pet_sd_120 + K + Silt + tasmax_mean_6 + tasmin_sd_3 +\\ pet_sd_6 + pet_mean_3 + tas_mean_6 + tasmax_mean_3 + P + C_org + CN + tas_sd_3 + pr_sd_3 + \\tasmin_sd_120 + pet_sd_3 + tasmin_mean_3 + pr_mean_6 + pr_mean_120 + pH + LC_, df = data)\)
Models | Rep.var | R2_test | RMSE | MAE |
---|---|---|---|---|
GLM | Biomass | 0.39 | 3.10 | 2.46 |
GAM | Biomass | 0.49 | 2.86 | 2.24 |
RF | Biomass | 0.56 | 2.62 | 2.08 |
GBM | Biomass | 0.47 | 2.91 | 2.31 |
ANN | Biomass | 0.50 | 3.05 | 2.35 |
Meta models
Models | Rep.var | R2_test | RMSE | MAE |
---|---|---|---|---|
Averaging | Biomass | 0.54 | 2.74 | 2.15 |
Stacking | Biomass | 0.57 | 2.58 | 2.05 |
GLM
\(mod = glm(BM_tot ~ N + pet_mean_6 + pet_sd_6 + tas_sd_120 + P + Sand + \\ Clay + C_org + tas_sd_6 + tas_sd_3 + pr_sd_3 + CN + tas_mean_6 + pH + \\ pr_mean_120 + LC_, df = data)\)
GAM
\(mod = gam(BM_tot ~ P + CN + K + C_org + Sand + tas_sd_3 + pet_sd_6 + pr_sd_3 + tas_sd_6 + tas_mean_120 + pr_mean_3 + pH + pr_mean_120 + LC_ + LC_ , df = data)\)
RF
\(mod = rf(BM_tot ~ pr_mean_120 + Sand + pH + pet_sd_120 + tasmax_sd_3 + LC_ + tasmin_sd_120 + pet_mean_3 , df = data)\)
GBM
\(mod= gbm(BM_tot ~ pr_mean_6 + pet_sd_6 + tasmax_sd_3 + tasmin_mean_120 + \\P + pr_mean_120 + pr_sd_3 + pH + pr_sd_120 + tasmax_sd_6 + pet_mean_3 + Silt + Sand +\\ tasmax_mean_6 + pr_mean_3 + LC_ + tasmin_sd_3 , df = data)\)
ANN
\(mod = ann(BM_tot ~ pr_mean_3 + tasmax_mean_120 + pet_mean_120 + pr_sd_6 + \\Sand + tasmin_sd_6 + pet_sd_120 + K + Silt + tasmax_mean_6 + tasmin_sd_3 +\\ pet_sd_6 + pet_mean_3 + tas_mean_6 + tasmax_mean_3 + P + C_org + CN + tas_sd_3 + pr_sd_3 + \\tasmin_sd_120 + pet_sd_3 + tasmin_mean_3 + pr_mean_6 + pr_mean_120 + pH + LC_, df = data)\)
Conclusion: le meillleur model pour la biomasse est le RF
The best model of GLM is :
\(mod_7 = glm(Richesse_tot ~ N + pr_sd_3 + tas_mean_120 + CN + P + tas_sd_6 + pet_sd_6 + pet_mean_6 + tas_mean_6 + tas_sd_120 + pr_mean_120 + pH + LC_, df = data)\)
The best model of gam is:
\(mod_4 = gam(Richesse_tot ~ Sand + K + pr_sd_3 + tas_sd_3 + CN + tas_sd_120 +\\ P + tas_sd_6 + tas_mean_120 + pet_sd_6 + pet_mean_6 + N + tas_mean_6 + pr_mean_120 +\\ pH + LC_ , df = data)\)
The best model of RF is:
\(mod_24 = rf(Richesse_tot ~ pr_sd_3 + tasmin_sd_120 + pr_sd_120 + tasmin_sd_6 + K + tas_mean_120 + pr_mean_6 + Silt + P + CaCO3 + pr_mean_120 + tasmax_sd_3 + tas_sd_6 + tas_sd_120 + tasmax_sd_120 + pH + pet_sd_120 + LC_, df = data)\)
The best model of GBM is:
\(mod_9 = gbm(Richesse_tot ~ pr_mean_3 + Clay + tasmax_mean_6 + pr_mean_120 + tasmax_sd_120 + tasmin_sd_6 + CaCO3 + pr_sd_6 + N + K + pr_mean_6 + pr_sd_120 + Sand + tasmin_sd_3 + P + pH + pet_sd_120 + LC_, df = data)\)
The best model of ANN is:
\(mod_33 = ann(Richesse_tot ~ P + tasmin_sd_120 + pr_sd_3 + tasmin_sd_3 + pr_sd_120 + pr_mean_6 + Silt + pH + LC_, df = data)\)
Models | Rep.var | R2_test | RMSE | MAE |
---|---|---|---|---|
GLM | Richness | 0.37 | 1.92 | 1.54 |
GAM | Richness | 0.48 | 1.75 | 1.38 |
RF | Richness | 0.57 | 1.61 | 1.24 |
GBM | Richness | 0.43 | 1.85 | 1.48 |
ANN | Richness | 0.44 | 2.13 | 1.65 |
Meta models
Models | Rep.var | R2_test | RMSE | MAE |
---|---|---|---|---|
Averaging | Richness | 0.52 | 1.73 | 1.36 |
Stacking | Richness | 0.58 | 1.58 | 1.23 |
GLM
\(mod = glm(Richesse_tot ~ N + pr_sd_3 + tas_mean_120 + CN + P + tas_sd_6 + pet_sd_6 + pet_mean_6 + tas_mean_6 + tas_sd_120 + pr_mean_120 + pH + LC_, df = data)\)
GAM
\(mod = gam(BM_tot ~ P + CN + K + C_org + Sand + tas_sd_3 + pet_sd_6 + pr_sd_3 + tas_sd_6 + tas_mean_120 + pr_mean_3 + pH + pr_mean_120 + LC_ + LC_ , df = data)\)
RF
\(mod = rf(Richesse_tot ~ pr_sd_3 + tasmin_sd_120 + pr_sd_120 + tasmin_sd_6 + K + tas_mean_120 + pr_mean_6 + Silt + P + CaCO3 + pr_mean_120 + tasmax_sd_3 + tas_sd_6 + tas_sd_120 + tasmax_sd_120 + pH + pet_sd_120 + LC_, df = data)\)
GBM
\(mod= gbm(Richesse_tot ~ pr_mean_3 + Clay + tasmax_mean_6 + pr_mean_120 + tasmax_sd_120 + tasmin_sd_6 + CaCO3 + pr_sd_6 + N + K + pr_mean_6 + pr_sd_120 + Sand + tasmin_sd_3 + P + pH + pet_sd_120 + LC_, df = data)\)
ANN
\(mod = ann(Richesse_tot ~ P + tasmin_sd_120 + pr_sd_3 + tasmin_sd_3 + pr_sd_120 + pr_mean_6 + Silt + pH + LC_, df = data)\)
Conclusion: le meillleur model pour la Richesse_tot est le RF
Models | Rep.var | R2_adj_train | R2_test | RMSE | MAE |
---|---|---|---|---|---|
RF | Abundance | 0.83 | 0.46 | 5.00 | 3.87 |
RF | Biomass | 0.84 | 0.56 | 2.62 | 2.08 |
RF | Richness | 0.87 | 0.57 | 1.61 | 1.24 |
Abundance
\(mod = rf(AB_tot ~ CN + pr_mean_6 + pet_mean_3 + K + tasmin_sd_6 + \\ tasmin_mean_120 + tasmin_mean_6 + Clay + pr_sd_120 + tas_mean_120 + CaCO3 + \\tas_sd_120 + pH + 4LC_ , df = data)\)
Biomass
\(mod = rf(BM_tot ~ pr_mean_120 + Sand + pH + pet_sd_120 + tasmax_sd_3 + \\ LC_ + tasmin_sd_120 + pet_mean_3 , df = data)\)
Richness
\(mod = rf(Richesse_tot ~ pr_sd_3 + tasmin_sd_120 + pr_sd_120 + tasmin_sd_6 +\\ K + tas_mean_120 + pr_mean_6 + Silt + P + CaCO3 + pr_mean_120 + tasmax_sd_3 +\\ tas_sd_6 + tas_sd_120 + tasmax_sd_120 + pH + pet_sd_120 + LC_, df = data)\)
____________Befor_____________
Models | Rep.var | R2_adj_train | R2_test | RMSE | MAE |
---|---|---|---|---|---|
RF | Abundance | 0.83 | 0.46 | 5.00 | 3.87 |
RF | Biomass | 0.84 | 0.56 | 2.62 | 2.08 |
RF | Richness | 0.87 | 0.57 | 1.61 | 1.24 |
____________After_____________
Models | Rep.var | R2_adj_train | R2_test | RMSE | MAE |
---|---|---|---|---|---|
RF | Abundance | 0.82 | 0.46 | 4.99 | 3.87 |
RF | Biomass | 0.85 | 0.56 | 2.62 | 2.08 |
RF | Richness | 0.87 | 0.57 | 1.59 | 1.23 |
Reading layer `france_shapefile_sans_corse' from data source
`C:\Users\diall\OneDrive\Bureau\M2_MODE\stage_abdou_m2\R_Stage_M2\cartographie\france_shapefile_sans_corse.shp'
using driver `ESRI Shapefile'
Simple feature collection with 12 features and 4 fields
Geometry type: MULTIPOLYGON
Dimension: XY
Bounding box: xmin: -5.141277 ymin: 42.33292 xmax: 8.232497 ymax: 51.08899
Geodetic CRS: +proj=longlat +datum=WGS84 +no_defs
Datas preparations
Land use
Soils predictors
Climate predictors
Abundance
# mod_AB_tot = readRDS("cartographie/best_mod/RF_mod_AB_tot.RDS")
#
# AB_tot_pred<- predict(mod_AB_tot, newdata = predictors_trans)
# AB_tot_pred = round(as.numeric(AB_tot_pred))
# # summary(AB_tot_pred)
# AB_tot_pred = data.frame(Abundance = AB_tot_pred, Longitude = predictors$gps_x, Latitude = predictors$gps_y)
#
# write.csv(x =AB_tot_pred,file = "cartographie/AB_tot_pred.csv",row.names = FALSE)
#
# saveRDS(AB_tot_pred, "cartographie/AB_tot_pred.rds")
#
# AB_tot_pred = readRDS("cartographie/AB_tot_pred.rds")
Biomass
# mod_BM_tot = readRDS("cartographie/best_mod/RF_mod_BM_tot.RDS")
#
# BM_tot_pred<- predict(mod_BM_tot, newdata = predictors_trans)
# BM_tot_pred = round(as.numeric(BM_tot_pred))
# # summary(BM_tot_pred)
# BM_tot_pred = data.frame(Biomass = BM_tot_pred, Longitude = predictors$gps_x, Latitude = predictors$gps_y)
#
# write.csv(x =BM_tot_pred,file = "cartographie/BM_tot_pred.csv",row.names = FALSE)
#
# saveRDS(BM_tot_pred, "cartographie/BM_tot_pred.rds")
# BM_tot_pred = readRDS("cartographie/BM_tot_pred.rds")
Richness
# mod_Richesse_tot = readRDS("cartographie/best_mod/RF_mod_Richesse_tot.RDS")
#
# Richesse_tot_pred<- predict(mod_Richesse_tot, newdata = predictors_trans)
# Richesse_tot_pred = round(as.numeric(Richesse_tot_pred))
# # summary(Richesse_tot_pred)
# Richesse_tot_pred = data.frame(Richness = Richesse_tot_pred, Longitude = predictors$gps_x, Latitude = predictors$gps_y)
#
# write.csv(x =Richesse_tot_pred,file = "cartographie/Richesse_tot_pred.csv",row.names = FALSE)
#
# saveRDS(Richesse_tot_pred, "cartographie/Richesse_tot_pred.rds")
#
# Richesse_tot_pred = readRDS("cartographie/Richesse_tot_pred.rds")
Sampling
800 m regular grid
Stores gps coordinates
Fig. Interpolation
Quelles dates de prevelement pour la prediction
All the material from my internship, including scripts and datasets, is available on my GitHub and my Rpubs.