ProDij

Abdourahmane Diallo

2024-07-03

Setting

Packages

Code
  library(tidyverse)
  library(corrplot)
  library(lme4)
  library(multcomp)
  library(MASS)
  library(arm)
  library(ade4)
  library(Hmisc)
  library(labdsv)
  library(vegan)
  library(cowplot)
  library(ggpubr)
  library(rstatix)
  library(patchwork)
  library(multcompView)
  library(ggsignif)
  library(grid)
  library(FactoMineR)
  library(factoextra)
  library(explore)
  library(ggrepel)
  library(naniar)
  library(outliers)
  library(leaps)
  library(fastDummies)
  library(caret) # for model training
  library(mgcv)
  library(ggeffects)
  library(gratia)
  library(GGally) # for ggpairs
  library(openxlsx)
  library(readxl)
  library(leaflet) # for mapping
  library(quarto)
  library(raster)
  library(knitr)
  library(kableExtra)
  library(stringr)
  library(plotly)
  library(vcd) # for the distribution of the response variables
  library(prospectr)# to split data with kenStone()
  library(randomForest)
  library(gbm)
  library(kernlab)
  library(ggforce)
  library(keras)
  library(tensorflow)
  library(neuralnet)
  library(iml) # for model interpretability https://cran.r-project.org/web/packages/iml/vignettes/intro.html
  library(stats)
  library(bestNormalize)
  library(rmarkdown)
  library(DT)
  library(gtExtras) # for the
  library(reshape2)
  library(sf)
  library(ggplot2)
  library(maptools)
  library(ggsn)
  library(spThin)
  library(sp)
  library(gstat)

Functions

Code
## Identify NAs in a data frame -----------------------------------------------
taux_completion <-
  function(df, afficher_zero_percent = FALSE, trie = FALSE) {
    # Overall percentage of NA in the data frame
    pourcentage_total <-
      round(sum(is.na(df)) / (nrow(df) * ncol(df)) * 100, 1)
    
    # Percentage of NA per column
    pourcentage_colonnes <- round(colMeans(is.na(df)) * 100, 1)
    
    # Result data frame with two columns
    result <-
      data.frame(
        Variables = names(df),
        CR = pourcentage_colonnes,
        row.names = NULL
      )
    
    if (afficher_zero_percent) {
      result <- result[result$CR == 0, ]  # fully complete columns
    } else {
      result <- result[result$CR > 0, ]   # columns with at least one NA
    }
    result$CR <- 100 - result$CR          # convert the NA rate to a completion rate
    
    # Overall NA percentage appended as a "Total" row
    result <- rbind(result, c("Total", pourcentage_total))
    
    result <- result[, c("Variables", "CR")]
    result$CR <- round(as.numeric(result$CR), 1)
    if (trie) {
      result <- result %>% arrange(desc(CR))
    }
    result$CR <- paste0(result$CR, "%")
    
    return(result)
  }
# Convert columns to numeric or factor -----------------------------------------------
conv_col <- function(data, columns_to_convert, to_types) {
  if (to_types == "numeric") {
    # Convert the columns to numeric
    for (col in columns_to_convert) {
      data[[col]] <- as.numeric(data[[col]])
    }
  } else {
    # Convert the columns to factor
    for (col in columns_to_convert) {
      data[[col]] <- as.factor(data[[col]])
    }
  }
  return(data)
}
#data_converted <- conv_col(data, names(data [, c(1, 3)]), "factor")

# Graphical exploration of numeric variables -----------------------------------------------
explo_num <- function(nom_col, titre, df = prodij, ligne_col = c(2, 2),
                      mini = min(df[[nom_col]], na.rm = TRUE),
                      maxi = max(df[[nom_col]], na.rm = TRUE)) {
  par(mfrow = ligne_col)
  
  # Drop NAs and make sure the column is numeric
  df <- df %>% filter(!is.na(.data[[nom_col]]))
  df[[nom_col]] <- as.numeric(df[[nom_col]])
  # Boxplot
  boxplot(df[[nom_col]], col = 'blue', ylab = titre, ylim = c(mini, maxi))
  # Cleveland plot
  dotchart(df[[nom_col]], pch = 16, col = 'blue', xlab = titre)
  # Histogram
  hist(df[[nom_col]], col = 'blue', xlab = titre, main = "")
  # Quantile-quantile plot
  qqnorm(df[[nom_col]], pch = 16, col = 'blue', xlab = '')
  qqline(df[[nom_col]], col = 'red')
}
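
# Example call (added for illustration; assumes prodij is loaded and AB_tot is numeric):
# explo_num(nom_col = "AB_tot", titre = "Total abundance")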

# Extraction of predictors + means -----------------------------------------------

extraction <- function(nom_col, tif_file_path, df = prodij, conv = 1) {
  raster_data <- raster(tif_file_path)
  
  # Data frame holding the coordinates to extract at
  df_interne <- data.frame(GPS_X = df$GPS_X, GPS_Y = df$GPS_Y)
  proj4Str <- "+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs"
  # Project the GPS coordinates into the raster's coordinate system
  gps_coords_sp <- SpatialPoints(df_interne, proj4string = CRS(proj4Str))
  gps_coords_proj <- spTransform(gps_coords_sp, crs(raster_data))
  
  # Extract the raster values
  values <- raster::extract(raster_data, gps_coords_proj)
  
  # Add the extracted values to df as a new column
  df[[nom_col]] <- values / conv
  
  return(df)
}

# Mean of the extracted predictor values -----------------------------------------------
moyenne_val_extrct <- function(nom_col, vec_col, df=prodij) {
  df[[nom_col]] <- rowMeans(as.matrix(df[, vec_col, drop = FALSE]), na.rm = TRUE)
  df[[nom_col]] = round(df[[nom_col]],1)
  return(as.data.frame(df))
}
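
# Illustrative calls (the raster path and column names are hypothetical):
# prodij <- extraction("clay_pct", "rasters/clay_0_5cm.tif", df = prodij, conv = 10)
# prodij <- moyenne_val_extrct("temp_moy", c("temp_2021", "temp_2022"), df = prodij)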


# Correlation tests with a threshold -----------------------------------------------
cor_function_seuil <- function(data, seuil, affiche = FALSE) {
  # Vector collecting the names of the correlated variable pairs
  variables_corr <- c()
  
  # Test the correlation of every pair of variables
  for (i in 1:(ncol(data) - 1)) {
    for (j in (i + 1):ncol(data)) {
      # Correlation between variables i and j
      cor_value <- stats::cor(data[, i], data[, j], use = "na.or.complete")
      
      # Store the pair if |correlation| exceeds the threshold
      if (abs(cor_value) >= seuil) {
        if (affiche) {
          cat(
            "***",
            colnames(data)[i],
            "  __is correlated with__  ",
            colnames(data)[j],
            "with R =",
            cor_value,
            "\n \n \n"
          )
        }
        
        variables_corr <-
          c(variables_corr, colnames(data)[i], colnames(data)[j])
      }
    }
  }
  
  return(variables_corr)
}


# Outlier tests (Grubbs) -----------------------------------------------
test_grub <- function(data, variable, direction = "maxi") {
  
  if (direction == "maxi") {
    repeat {
      # Grubbs test on the maximum value
      test_aberrant <- grubbs.test(data[[variable]], opposite = FALSE)
      
      # p-value of the test
      p.value <- test_aberrant$p.value
      # If the p-value is below 0.05, remove the outlying value
      if (p.value < 0.05) {
        max_value <- max(data[[variable]], na.rm = TRUE)
        data <- subset(data, data[[variable]] != max_value | is.na(data[[variable]]))
      } else {
        # No more outliers: leave the loop
        break
      }
    }
  }
  
  if (direction == "mini") {
    repeat {
      # Grubbs test on the minimum value
      test_aberrant <- grubbs.test(data[[variable]], opposite = TRUE)
      # p-value of the test
      p.value <- test_aberrant$p.value
      # If the p-value is below 0.05, remove the outlying value
      if (p.value < 0.05) {
        min_value <- min(data[[variable]], na.rm = TRUE)
        data <- subset(data, data[[variable]] != min_value | is.na(data[[variable]]))
      } else {
        # No more outliers: leave the loop
        break
      }
    }
  }
  
  return(data)
}
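
# Example (illustrative): iteratively strip high then low outliers from AB_tot
# prodij <- test_grub(prodij, "AB_tot", direction = "maxi")
# prodij <- test_grub(prodij, "AB_tot", direction = "mini")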




# Boxplot -----------------------------------------------
plot_boxplot <-function(donnee,
           x_col,y_col,x_label,y_label,title,legend_title,
           couleurs,
           affiche_point = TRUE,
           ymin = min(donnee[[y_col]]),
           ymax = 1.2 * max(donnee[[y_col]])) {
    
  # aes_string() is deprecated; use .data[[ ]] pronouns instead
  graphe <- ggplot(donnee,
             aes(
               x = .data[[x_col]],
               y = .data[[y_col]],
               colour = .data[[x_col]]
             )) +
  geom_boxplot(
        outlier.shape = NA,
        outlier.colour = "black",
        alpha = 0.20,
        size = 1.5 
      ) +
  labs(title = title,x = x_label,y = y_label) +
  scale_color_manual(values = couleurs, name = legend_title) +
  theme_classic(base_size = 12, base_family = "Arial") +
  theme(axis.text = element_text(size = 10),
        axis.title.y = element_text(
          vjust = 5, size = 12, face = "bold"),
        axis.title.x = element_text(face = "bold"),
        axis.ticks.length = unit(0.2, "cm"),
        legend.position = "none",  # Cette ligne supprime la légende
        #legend.position = "right",
        legend.text = element_text(size = 10),
        legend.title = element_text(size = 12, face = "bold"),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        plot.title = element_text(size = 14, face = "bold"),
        plot.margin = unit(c(0.5, 0.5, 0.5, 0.5), "cm")
      )
    if (affiche_point) {
      graphe <-
        graphe + geom_jitter(position = position_jitter(seed = 0.5), size = 0.8)
    }
    
    if (y_col %in% names(donnee)) {
      graphe <- graphe +
        coord_cartesian(ylim = c(ymin, ymax))
    }
  
    # Mean shown as a black square (fun.y was deprecated in ggplot2 3.3.0)
    graphe <- graphe + stat_summary(
      fun = mean,
      geom = "point",
      shape = 15,
      size = 1.5,
      col = "black",
      fill = "black"
    )
    
    return(graphe)
}



# For pairwise.t.test(): triangular p-value matrix to square matrix -----------------------------------------------------
tri.to.squ <- function(x) {
  rn <- row.names(x)
  cn <- colnames(x)
  an <- unique(c(cn, rn))
  myval <- x[!is.na(x)]
  mymat <-
    matrix(
      1,
      nrow = length(an),
      ncol = length(an),
      dimnames = list(an, an)
    )
  for (ext in 1:length(cn))
  {
    for (int in 1:length(rn))
    {
      if (is.na(x[row.names(x) == rn[int], colnames(x) == cn[ext]]))
        next
      mymat[row.names(mymat) == rn[int], colnames(mymat) == cn[ext]] <-
        x[row.names(x) == rn[int], colnames(x) == cn[ext]]
      mymat[row.names(mymat) == cn[ext], colnames(mymat) == rn[int]] <-
        x[row.names(x) == rn[int], colnames(x) == cn[ext]]
    }
  }
  return(mymat)
}



# Interaction selection -------------------------------
select_inter <- function(response_var, df, explanatory_vars) {
  results <- data.frame()
  combinations <- combn(explanatory_vars, 2, simplify = FALSE)

  for(i in seq_along(combinations)) {

    formula <- as.formula(paste(response_var, "~", paste(combinations[[i]], collapse = "*")))
    model <- gam(formula, data = df)
    r_squared <- summary(model)$r.sq
    aic <- AIC(model)
    results <- rbind(results, data.frame("variables" = paste0(combinations[[i]], collapse = ".inter."), 
                                         "r_squared" = r_squared, 
                                 "aic" = aic))
  }
  return(results)
}
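
# Example (illustrative, arbitrary variable subset): rank pairwise interactions
# inter_AB <- select_inter("AB_tot", prodij, c("Argile", "C_org", "pH_eau"))
# inter_AB[order(inter_AB$aic), ]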

# Comparison between predicted and observed -----------------------------------
plot_comp = function (df,ylabel, title_class, legende = TRUE,plotly = FALSE,xlabel = "observations",title=""){ 

  
  p = ggplot(df, aes(x = observation)) + 
  # observed series
  geom_point(aes(y = Observed, color = "Observed values")) +
  geom_line(aes(y = Observed, color = "Observed values")) + 
  
  # predicted series
  geom_point(aes(y = Predicted, color="Predicted values")) +
  geom_line(aes(y = Predicted, color="Predicted values")) + 
  # ggtitle(title)
  theme(plot.title = element_text(hjust = 0.5)) + 
  labs(title = title,x=xlabel, y=ylabel, color = "Legend :") + 
  ylim(min(c(min(df$Predicted), min(df$Observed))),
            max(c(max(df$Predicted), max(df$Observed)))+1  ) +
    
  scale_color_manual(values = c("Observed values"='red', "Predicted values"='green')) +
  annotate("text", x = 8, y =  max(c(max(df$Predicted), max(df$Observed)))+1, 
           label = title_class, col = "black", size = 3)

  
  if (!legende) {
    p <- p + theme(legend.position = "none")
  }
  
  if(plotly){
    p = ggplotly(p)
  }

return (p)

}


# Compute R²
# calcule_R2 = function(x, y) {cor(x, y)^2}
calcule_R2 <- function(y_true, y_pred) {
  sse <- sum((y_true - y_pred)^2)
  sst <- sum((y_true - mean(y_true))^2)
  r_squared <- 1 - (sse / sst)
  return(r_squared)
}
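
# Adjusted R², as used by the model helpers below:
# R²_adj = 1 - (1 - R²) * (n - 1) / (n - p - 1)
# Small wrapper added for clarity (not in the original script); p = number of predictors
calcule_R2_adj <- function(y_true, y_pred, p) {
  n <- length(y_true)
  r2 <- calcule_R2(y_true, y_pred)
  1 - (1 - r2) * (n - 1) / (n - p - 1)
}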

1 Database import

  • Import of database T_synth_sit_TIDM_21_22_23.xlsx (June 16, 2024)
  • The database contains 530 rows and 153 columns

1.1 Columns containing only NAs

To make the dataset easier to read, I identify and remove all columns that contain only NAs:

 [1] "Specificites_Parcelle_Niv4"     "H_debut"                       
 [3] "H_fin"                          "Duree_Prelevement"             
 [5] "Ensoleillement"                 "Pluie"                         
 [7] "Vent"                           "Humidite_Sol"                  
 [9] "Date_Derniere_Pluie"            "Date_Derniere_Gelee"           
[11] "Temperature_sol"                "Temperature_air"               
[13] "Profondeur_Sol_cm"              "Observateur"                   
[15] "Referent"                       "Mode_Production"               
[17] "Texture_Dominante"              "Texture_classement_indetermine"
[19] "N_org"                          "pH_KCl"                        
[21] "Code_Piluliers"                 "MO_pourc"                      
[23] "Gestion_Herbe"                 
  • The database contains 530 rows and 130 columns

2 Database exploration

  • CR = Completion rate

2.1 Complete columns

The following columns are 100% complete (0 NAs).

Code
df_col=taux_completion(prodij,TRUE,trie=FALSE)
df_col = df_col[df_col$Variables != "Total",]
#print("table")
# kable(df_col, caption = "", col.width = c("75%", "25%"))
DT::datatable(df_col, options = list(pageLength = 5))
Code
rm("df_col")

2.2 Incomplete columns

The following columns contain NAs.

2.3 Focus on protocols

  • List of protocols available in the database (1 level)
Code
prodij$Protocole = as.factor(prodij$Protocole)
df <- as.data.frame(summary(prodij$Protocole))
colnames(df) <- c("Numbers")
kable(df)
Numbers
TB 530
Code
rm("df")

All plots were sampled with the TB protocol.

2.4 Focus on GPS coordinates

  • There are 0 NAs in GPS_X
  • There are 0 NAs in GPS_Y
Code
df_suivi = prodij
n_line = nrow(df_suivi)

prodij$GPS_X <- as.numeric(prodij$GPS_X)
prodij$GPS_Y <- as.numeric(prodij$GPS_Y)
prodij <- prodij[complete.cases(prodij$GPS_X, prodij$GPS_Y), ]
prodij <- prodij %>%filter(!is.na(GPS_X) & !is.na(GPS_Y))
#sum(is.na(prodij$GPS_X))
#sum(is.na(prodij$GPS_Y))
  • We remove the rows with NAs in the GPS coordinates
  • The database therefore changes from 530 to 530 observations.

2.5 Cartography

Code
df_coord <- prodij[, c("GPS_X", "GPS_Y")] %>% mutate(GPS_X = as.numeric(GPS_X),GPS_Y = as.numeric(GPS_Y))

df_coord$num_ligne <- seq(nrow(df_coord))
carte <- leaflet(df_coord) %>%
  addTiles() %>%
  addCircleMarkers(lng = ~GPS_X, lat = ~GPS_Y, radius = 0.8, fillOpacity = 0.8, fillColor = "blue")
carte

2.6 Focus on years

  • Cleaning the Annee column
Numbers
2021 400
2022 80
2023 50
  • The database therefore changes from 530 to 530 observations.

2.7 Focus on Categorie_Milieu_Niv1

Code
prodij$Categorie_Milieu_Niv1= as.factor(prodij$Categorie_Milieu_Niv1)
summary_df <- as.data.frame(summary(prodij$Categorie_Milieu_Niv1))
colnames(summary_df) <- c("Numbers")
kable(summary_df)
Numbers
1_Naturel 75
2_Agricole 361
3_Artificialisé 94
Code
rm("summary_df")

2.8 Focus on SousCategorie_Milieu_Niv2

Code
prodij$SousCategorie_Milieu_Niv2= as.factor(prodij$SousCategorie_Milieu_Niv2)
summary_df <- as.data.frame(summary(prodij$SousCategorie_Milieu_Niv2))
colnames(summary_df) <- c("Numbers")
kable(summary_df)
Numbers
11_Naturel fermé 75
21_Agricole ouvert 361
31_Espace vert boisé 5
32_Espace vert ouvert 43
34_Espace cultivé urbain 15
35_Bordure de voirie 26
38_Friche industrielle 5
Code
rm("summary_df")

2.9 Focus on Details_Milieu_Niv3

Code
prodij$Details_Milieu_Niv3 = as.factor(prodij$Details_Milieu_Niv3)
summary_df <- as.data.frame(summary(prodij$Details_Milieu_Niv3))
colnames(summary_df) <- c("Numbers")
datatable(summary_df, options = list(pageLength = 5))
Code
rm("summary_df")

2.10 Land-cover selection

I chose to use the level-3 land cover: Details_Milieu_Niv3

Rural classes only:

Code
df_suivi = prodij
n_line = nrow(df_suivi)

# prodij = s_prodij
# prodij$Details_Milieu_Niv3 = as.factor(prodij$Details_Milieu_Niv3)
# summary_df <- as.data.frame(summary(prodij$Details_Milieu_Niv3))
# colnames(summary_df) <- c("Numbers")
# datatable(summary_df, options = list(pageLength = 5))
# rm("summary_df")


select_os= c("111_Forêt de feuillus", "210_Prairie agricole permanente", 
"214_Culture annuelle", "218_Vignes et autres Cultures pérennes")

s_prodij = prodij
prodij <- prodij[prodij$Details_Milieu_Niv3 %in% select_os, ]
prodij=droplevels(prodij)
prodij$Details_Milieu_Niv3 = as.factor(prodij$Details_Milieu_Niv3)
summary_df <- as.data.frame(summary(prodij$Details_Milieu_Niv3))
colnames(summary_df) <- c("Numbers")
datatable(summary_df, options = list(pageLength = 5))
Code
rm("summary_df")


levels(prodij$Details_Milieu_Niv3)[levels(prodij$Details_Milieu_Niv3)=="111_Forêt de feuillus"]="OS_111"

levels(prodij$Details_Milieu_Niv3)[levels(prodij$Details_Milieu_Niv3)=="210_Prairie agricole permanente"]="OS_210"

levels(prodij$Details_Milieu_Niv3)[levels(prodij$Details_Milieu_Niv3)=="214_Culture annuelle"]="OS_214"

levels(prodij$Details_Milieu_Niv3)[levels(prodij$Details_Milieu_Niv3)=="218_Vignes et autres Cultures pérennes"]="OS_218"




df_l <- data.frame(
  Niveau_Original = c("111_Forêt de feuillus", 
                      "210_Prairie agricole permanente", 
                      "214_Culture annuelle", 
                      "218_Vignes et autres Cultures pérennes"),
  Abréviations = c("OS_111", "OS_210", "OS_214", "OS_218"))

kable(df_l)
Niveau_Original Abréviations
111_Forêt de feuillus OS_111
210_Prairie agricole permanente OS_210
214_Culture annuelle OS_214
218_Vignes et autres Cultures pérennes OS_218
  • The database therefore changes from 530 to 416 observations.

3 Earthworm data

3.1 Total abundance

Removal of outliers

  • The database therefore changes from 416 to 414 observations.
Code
# summary(AB_tot_aberant)
 AB_tot_aberant_2 = AB_tot_aberant
AB_tot_aberant_2 = AB_tot_aberant_2[AB_tot_aberant_2$AB_tot > max(prodij$AB_tot),]



kable(unique(AB_tot_aberant_2[,c("Programme","Annee","Details_Milieu_Niv3")]))

df = AB_tot_aberant_2
df$observation = 1:nrow(df)
df$Richesse_100 = df$Richesse*100
g_AB_tot_aberant = ggplot(df, aes(x = observation)) + 
  geom_point(aes(y = AB_tot, color = "Abundance")) +
  geom_line(aes(y = AB_tot, color = "Abundance")) + 
  geom_point(aes(y = Richesse_100, color="Richness*100")) +
  geom_line(aes(y = Richesse_100, color="Richness*100")) + 
  # ggtitle(title)
  theme(plot.title = element_text(hjust = 0.5)) + 
  labs(title = "  ",x="Observation", y="Values", color = "Legend:") +
  scale_color_manual(values = c("Abundance"='red', "Richness*100"='green'))
# ggsave("g_AB_tot_aberant.png", plot = g_AB_tot_aberant, dpi = 300)
ggplotly(g_AB_tot_aberant)

3.2 Total biomass

Removal of outliers

  • The database therefore changes from 414 to 411 observations.

3.3 Total taxonomic richness

Removal of outliers

  • The database therefore changes from 411 to 411 observations.

4 Exploration of potential explanatory variables

4.1 List of explanatory variables

  • I remove Texture_GEPPA from the explanatory variables because it has many NAs and texture is already covered by the sands, silts, and clays.

Travail_du_sol:

 non  oui NA's 
 169  152   90 

Fertilisation:

 non  oui NA's 
 100  256   55 
  • I keep Travail_du_sol and Fertilisation, removing their NA rows.
  • The database therefore changes from 411 to 319 observations.

I remove CaCO3_tot because pH is already well populated.

The remaining variables are:

 [1] "GPS_X"               "GPS_Y"               "Details_Milieu_Niv3"
 [4] "Fertilisation"       "Travail_du_sol"      "SableF"             
 [7] "SableG"              "LimonF"              "LimonG"             
[10] "Argile"              "Sables"              "Limons"             
[13] "MO"                  "C_tot"               "C_org"              
[16] "C.N"                 "N_tot"               "pH_eau"             

Conclusion: 15 numeric variables and 3 categorical variables (OS, Fertilisation, and Travail_du_sol)

4.2 Correlation matrix

Correlation test

Values below 0.6 were set to 0 for better readability.

I remove Sables and Limons because they are highly correlated with the other variables and:

  • \(Sables = SableF + SableG\)
  • \(Limons = LimonF + LimonG\)

I remove MO because it is highly correlated with C_tot, C_org, and N_tot.

I remove C_tot and N_tot because they are highly correlated and:

  • \(C_tot = C_org + X\) (with X ≈ C_min)

Back to Predictors

Conclusion: 10 numeric explanatory variables remain.
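
A minimal sketch of the thresholded correlation plot described above, assuming num_vars holds the data frame of numeric predictors (the object name is illustrative):

Code
library(corrplot)
mat_cor <- cor(num_vars, use = "na.or.complete")
mat_cor[abs(mat_cor) < 0.6] <- 0   # hide weak correlations for readability
corrplot(mat_cor, method = "circle", type = "upper", tl.cex = 0.7)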

4.3 VIF

2 variables from the 10 input variables have collinearity problem: 
 
SableF SableG 

After excluding the collinear variables, the linear correlation coefficients ranges between: 
min correlation ( pH_eau ~ C_org ):  0.009204932 
max correlation ( pH_eau ~ C.N ):  -0.5074557 

---------- VIFs of the remained variables -------- 
  Variables      VIF
1     GPS_X 1.723465
2     GPS_Y 1.718448
3    LimonF 1.654946
4    LimonG 1.994132
5    Argile 1.312747
6     C_org 1.896763
7       C.N 2.243654
8    pH_eau 1.957370

SableG and SableF are multicollinear, so we remove one of the two:

  • SableG
No variable from the 9 input variables has collinearity problem. 

The linear correlation coefficients ranges between: 
min correlation ( pH_eau ~ C_org ):  0.009204932 
max correlation ( LimonF ~ SableF ):  -0.6233721 

---------- VIFs of the remained variables -------- 
  Variables      VIF
1     GPS_X 1.745123
2     GPS_Y 1.773541
3    SableF 4.029032
4    LimonF 3.301790
5    LimonG 2.008510
6    Argile 2.422163
7     C_org 2.003050
8       C.N 2.260907
9    pH_eau 1.959802

Conclusion: 9 numeric explanatory variables remain, plus 3 factor variables.
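
The printed output above matches usdm::vifstep(); a minimal sketch under that assumption, with num_vars again standing in for the numeric predictors:

Code
library(usdm)
v <- vifstep(num_vars, th = 10)      # stepwise exclusion of collinear variables
v                                    # excluded variables and VIFs of the rest
num_vars_ok <- exclude(num_vars, v)  # drop the flagged columns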

4.4 Outlier checks

Code
# Loop over each variable in the list
# for (var in variables_num) {
#   # Generate the code chunk for each variable
#   cat(paste0(
#     "## ", var, "\n\n",
#     "```{r fig_", var, ",fig.align='center',fig.height=10}\n",
#     "df_suivi = prodij\n",
#     "n_line = nrow(df_suivi)\n\n",
#     "# summary(prodij$", var, ")\n",
#     "df_cleaned = prodij\n\n",
#     "df_cleaned$", var, " = as.numeric(df_cleaned$", var, ")\n",
#     "explo_num(nom_col = '", var, "', titre = '", var, " (before cleaning)', df = df_cleaned)\n",
#     "df_cleaned <- test_grub(df_cleaned, '", var, "', direction = 'maxi')\n",
#     "df_cleaned <- test_grub(df_cleaned, '", var, "', direction = 'mini')\n",
#     "cat('Removal of outliers')\n",
#     "explo_num(nom_col = '", var, "', titre = '", var, " (after cleaning)', df = df_cleaned)\n",
#     "# summary(df_cleaned$", var, ")\n",
#     "# prodij = df_cleaned\n",
#     "```\n\n",
#     "The database therefore changes from **`r n_line`** to **`r nrow(df_cleaned)`** observations.\n\n\n"
#   ))
# }

4.4.1 GPS_X

Removal of outliers

The database therefore changes from 319 to 319 observations.

4.4.2 GPS_Y

Removal of outliers

The database therefore changes from 319 to 319 observations.

4.4.3 SableF

Removal of outliers

The database therefore changes from 319 to 313 observations.

4.4.4 LimonF

Removal of outliers

The database therefore changes from 313 to 313 observations.

4.4.5 LimonG

Removal of outliers

The database therefore changes from 313 to 313 observations.

4.4.6 Argile

Removal of outliers

The database therefore changes from 313 to 313 observations.

4.4.7 C_org

Removal of outliers

The database therefore changes from 313 to 302 observations.

4.4.8 C.N

Removal of outliers

The database therefore changes from 302 to 296 observations.

4.4.9 pH_eau

Removal of outliers

The database therefore changes from 296 to 296 observations.

5 Relationships between earthworms and the explanatory variables

5.1 Abundance

- AB_tot & Details_Milieu_Niv3

- AB_tot & Fertilisation

- AB_tot & Travail_du_sol

-  AB_tot & GPS_X

-  AB_tot & GPS_Y

-  AB_tot & SableF

-  AB_tot & LimonF

-  AB_tot & LimonG

-  AB_tot & Argile

-  AB_tot & C_org

-  AB_tot & C.N

-  AB_tot & pH_eau

5.2 Biomass

- BM_tot & Details_Milieu_Niv3

- AB_tot & Fertilisation

- AB_tot & Travail_du_sol

-  BM_tot & GPS_X

-  BM_tot & GPS_Y

-  BM_tot & SableF

-  BM_tot & LimonF

-  BM_tot & LimonG

-  BM_tot & Argile

-  BM_tot & C_org

-  BM_tot & C.N

-  BM_tot & pH_eau

5.3 Richness

- Richesse & Details_Milieu_Niv3

- AB_tot & Fertilisation

- AB_tot & Travail_du_sol

-  Richesse & GPS_X

-  Richesse & GPS_Y

-  Richesse & SableF

-  Richesse & LimonF

-  Richesse & LimonG

-  Richesse & Argile

-  Richesse & C_org

-  Richesse & C.N

-  Richesse & pH_eau

6 Exploratory analysis

Data set reduction

Code
id_col=c("Programme","Annee","ID_Site","Protocole")

predictor_non_r = c("Long", "Lat" ,"SableF" , "LimonF" ,"LimonG" ,"Argile" ,"C_org" ,"C.N", "pH_eau","OS","Ferti_","w_sol_")

predictor_deep = c(predictor_non_r,c("Sables","SableG","Limons","MO","C_tot","N_tot"))
vdt_col=c("AB_tot", "BM_tot", "Richesse")

prodij_explo = prodij[, c(id_col, vdt_col,predictor_deep)]



cl_original <- levels(prodij_explo$OS)

6.1 Total abundance distributions


  • sqrt transformation

lambda = 0.4




6.2 Total biomass distributions


  • sqrt transformation

lambda = 0.4




6.3 Total taxonomic richness distributions


  • sqrt transformation

lambda = 0.6




6.4 Standardization

  • sqrt transformation of abundance and biomass

  • Centering and scaling (z-score) of the predictors
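
A minimal sketch of these transformations, assuming the prodij_explo table built in the data set reduction step above:

Code
prodij_std <- prodij_explo
# sqrt transformation of abundance and biomass
prodij_std$AB_tot <- sqrt(prodij_std$AB_tot)
prodij_std$BM_tot <- sqrt(prodij_std$BM_tot)
# centre and scale (z-score) the numeric predictors
num_pred <- c("Long", "Lat", "SableF", "LimonF", "LimonG",
              "Argile", "C_org", "C.N", "pH_eau")
prodij_std[num_pred] <- scale(prodij_std[num_pred])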

7 Modeling

7.1 Predictors



All explanatory variables: => RF, GBM, and ANN


 [1] "Long"   "Lat"    "SableF" "LimonF" "LimonG" "Argile" "C_org"  "C.N"   
 [9] "pH_eau" "OS"     "Ferti_" "w_sol_" "Sables" "SableG" "Limons" "MO"    
[17] "C_tot"  "N_tot" 


No correlated predictors (r < 0.7): => GLM and GAM

 [1] "Long"   "Lat"    "SableF" "LimonF" "LimonG" "Argile" "C_org"  "C.N"   
 [9] "pH_eau" "OS"     "Ferti_" "w_sol_"

See the Correlation matrix

7.2 Variable selection process


Initialization


Model creation and evaluation loop

For each iteration i, up to length(variables) - 1:

  1. Model creation

    • We fit a model using all the available predictor variables.
  2. Computation of variable importance

    • We use the permutation feature importance algorithm based on Fisher, Rudin, and Dominici (2018), sketched after this list:
      • Input: model \(\hat{F}\), variables \(X\), target \(y\), error measure \(L(y, \hat{F}(X))\)
      • Estimate the error of the original model, \(L_{original}\).
      • For each variable \(j\):
        1. Permute \(j\) in \(X\) to obtain \(X_{perm}\), breaking the association between \(j\) and \(y\).
        2. Estimate the error \(L_{perm}\) with \(X_{perm}\).
        3. Compute the importance \(FI_j = L_{perm} / L_{original}\).
      • Sort the variables by decreasing FI.
  3. Identification of the least important variable

  4. Predictions on the test data

  5. Computation of the performance metrics

    • RMSE, adjusted R² for training and for test, and MAE.
  6. Update of the train and test sets

    • The least important variable is removed from train & test.

Saving the results
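
A minimal sketch of the permutation importance step, assuming a fitted regression model `model` with a predict() method and a test set df_test (the names are illustrative; RMSE is used as the error measure):

Code
permutation_importance <- function(model, df_test, y_col) {
  rmse <- function(obs, pred) sqrt(mean((obs - pred)^2, na.rm = TRUE))
  y <- df_test[[y_col]]
  predictors <- setdiff(names(df_test), y_col)
  # error of the original, unpermuted model
  L_original <- rmse(y, predict(model, newdata = df_test))
  FI <- sapply(predictors, function(j) {
    df_perm <- df_test
    df_perm[[j]] <- sample(df_perm[[j]])   # break the j ~ y association
    rmse(y, predict(model, newdata = df_perm)) / L_original   # FI_j
  })
  sort(FI, decreasing = TRUE)   # the least important variable comes last
}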

7.3 Data preparation

Code
# AB_tot --------------------------------------------------------------------------
df_mod_AB_tot = data_deep[,c("AB_tot",predictor_deep)]
# dummy_vars <- model.matrix(~ OS - 1, data = df_mod_AB_tot)
# df_mod_AB_tot <- cbind(df_mod_AB_tot, dummy_vars)
# df_mod_AB_tot <- df_mod_AB_tot[, -which(names(df_mod_AB_tot) == "OS")]

df_mod_AB_tot = drop_na(df_mod_AB_tot)
df_mod_AB_tot = droplevels(df_mod_AB_tot)


# Partition
# set.seed(123)
# ind <- sample(2, nrow(df_mod_AB_tot), replace = T, prob = c(.8, .2))
# AB_tot_train <- df_mod_AB_tot[ind==1,]
# AB_tot_test <- df_mod_AB_tot[ind==2,]

set.seed(42)
split <- rsample::initial_split(df_mod_AB_tot, prop = 0.8, strata = "AB_tot")
AB_tot_train <- rsample::training(split)
AB_tot_test <- rsample::testing(split)

write.csv2(x =AB_tot_train,file = "datas/proDij/AB_tot_train.csv", row.names = FALSE)
write.csv2(x =AB_tot_test,file = "datas/proDij/AB_tot_test.csv", row.names = FALSE)
# 
# AB_tot_train = read.csv2("datas/proDij/AB_tot_train.csv")
# AB_tot_test = read.csv2("datas/proDij/AB_tot_test.csv")
# df_mod_AB_tot = rbind(AB_tot_train,AB_tot_test)

AB_tot_train = as.data.frame(AB_tot_train)
AB_tot_test = as.data.frame(AB_tot_test)

df <- data.frame(y =AB_tot_train[,"AB_tot"])
abundance_dist_train = ggplot(df, aes(x=y)) +
  geom_histogram(fill="#69b3a2", color="#e9ecef", bins=30, alpha=2) +
  geom_density(fill="black", alpha=0.2) +
  theme_gray() +
  labs(title="Abundance: Train",  x="Values", y = "Counts") +
  theme(plot.title = element_text(hjust = 0.5))
# ggsave("Results/proDij/abundance_dist_train.png", plot = abundance_dist_train, dpi = 300,width = 3,height = 2)



df <- data.frame(y =AB_tot_test[,"AB_tot"])
abundance_dist_test = ggplot(df, aes(x=y)) +
  geom_histogram( fill="#69b3a2", color="#e9ecef", bins=30, alpha=2) +
  geom_density(fill="black", alpha=0.2) +
  theme_gray() +
  labs(title="Abundance: Test",  x="Values", y = "Counts") +
  theme(plot.title = element_text(hjust = 0.5))
# ggsave("Results/proDij/abundance_dist_test.png", plot = abundance_dist_test, dpi = 300,width = 3,height = 2)

# Distribution of the response variable in train and test: is it homogeneous?
abundance_dist_train_and_test = ggarrange(abundance_dist_train, abundance_dist_test,
                          labels = c('(a)', '(b)'),
                          common.legend = TRUE,
                          legend = 'right'
)


ggsave("Results/proDij/abundance_dist_train_and_test.png", plot = abundance_dist_train_and_test, dpi = 300,height = 2,width = 4)

# Kolmogorov-Smirnov test: do train and test come from the same distribution?
# ks.test(AB_tot_train$AB_tot, AB_tot_test$AB_tot)

Abundance

  • Data partition (296, 19):

    • train data (80 %) = 235, 19

    • test data (20 %) = 61, 19

Biomass

  • Data partition (296, 19):

    • train data (80 %) = 236, 19

    • test data (20 %) = 60, 19

Richness

  • Data partition (296, 19):

    • train data (80 %) = 234, 19

    • test data (20 %) = 62, 19

7.4 GLM

Code
GLM <- function(var_rep, df_app, df_valid,family = 'gaussian'){
  
  
  var_predicteurs = names(df_app[,-1])
 
  df_app = df_app[,c(var_rep,var_predicteurs)]
  df_valid = df_valid[,c(var_rep,var_predicteurs)]
  
  formula <- substitute(var_rep ~ ., list(var_rep = as.name(var_rep)))
  
  
  # Train the model on the training set
  modelglm<-glm(formula,family = family ,data = df_app)
  
  # Predict on the validation set
  pred.GLM<-predict(modelglm,newdata=as.data.frame(df_valid[,var_predicteurs]))
  
  # RMSE to assess model quality
  rmse <- round (sqrt(mean((df_valid[,var_rep] - pred.GLM)^2,na.rm=TRUE)),2)
  
  
  # Adjusted R² on the training set
  R_adj_train <- calcule_R2(df_app[,var_rep],  predict(modelglm, newdata=df_app))
  n_train <- nrow(df_app)
  p_train <- ncol(df_app) - 1
  r_adj_train <- 1 - ((1 - R_adj_train) * (n_train - 1) / (n_train - p_train - 1))
  
  # R² on the test set
  # R_adj_test <-calcule_R2(df_valid[,var_rep],pred.GLM)
  # n_test <- nrow(df_valid)
  # p_test <- ncol(df_valid) - 1
  # r_adj_test <- 1 - ((1 - R_adj_test) * (n_test - 1) / (n_test - p_test - 1))
  # r_adj_test = R_adj_test
  res <- rms::lrm(df_valid[,var_rep]  ~ pred.GLM, x= TRUE, y = TRUE)
  res = res$stats
  r_adj_test = round (res[["R2"]],2)
  
  MAE <- mean(abs(pred.GLM - df_valid[,var_rep]),na.rm=TRUE)
  
  # Round results
  rmse <- round(rmse, 2)
  r_adj_train <- round(r_adj_train, 2)
  r_adj_test <- round(r_adj_test, 2)
  MAE <- round(MAE, 2)
  
  # output
  results_df <- data.frame(Algorithms = "GLM",
                         Response_variables = var_rep,
                         R2_adjusted_train = r_adj_train,
                         R2_adjusted_test = r_adj_test,
                         RMSE = rmse,
                         MAE = MAE)
    
  
  results <- list(RMSE = rmse, R_adj_train = r_adj_train, R_adj_test = r_adj_test, MAE = MAE, model = modelglm,predit = pred.GLM, df = results_df)
  return(results)
}
  • Gaussian distribution

7.5 GAM

Code
GAM <- function(var_rep, df_app, df_valid, family = 'gaussian',method = "REML", interaction = FALSE){
  
  var_predicteurs = names(df_app[,-1])
  
  
  if (var_rep == "AB_tot"){ 

  modelgam<-gam(AB_tot ~  s(Long) + s(Lat) + s(SableF) + s(LimonF) + s(LimonG) + s(Argile) + s(C_org) + s(C.N) + s(pH_eau) + OS1_Naturel + OS2_Agricole + OS3_Artificialisé,
        family=family,method = method,data = df_app)
  }
  
  
  
  
  if (var_rep == "BM_tot"){ 

  modelgam<-gam(BM_tot ~ s(Long) + s(Lat) + s(SableF) + s(LimonF) + s(LimonG) + s(Argile) + s(C_org) + s(C.N) + s(pH_eau) + OS1_Naturel + OS2_Agricole + OS3_Artificialisé,
        family=family,method = method,data = df_app)
  
    
  }
  
  
  
  if(var_rep == "Richesse"){ 
    
  modelgam<-gam(Richesse ~ s(Long) + s(Lat) + s(SableF) + s(LimonF) + s(LimonG) + s(Argile) + s(C_org) + s(C.N) + s(pH_eau) + OS1_Naturel + OS2_Agricole + OS3_Artificialisé,
        family=family,method = method,data = df_app)
   
  }
  
  
  # Predict on the validation set
  pred.GAM <- predict(modelgam,newdata=as.data.frame(df_valid[,var_predicteurs]))
  
  # RMSE to assess model quality
  rmse <- sqrt(mean((df_valid[,var_rep] - pred.GAM)^2,na.rm=TRUE))

  
# Adjusted R² on the training set
  R_adj_train <- calcule_R2(df_app[,var_rep],  predict(modelgam, newdata=df_app))
  n_train <- nrow(df_app)
  p_train <- ncol(df_app) - 1
  r_adj_train <- 1 - ((1 - R_adj_train) * (n_train - 1) / (n_train - p_train - 1))
  
  # Adjusted R² on the test set
  # R_adj_test <-calcule_R2(df_valid[,var_rep],pred.GAM)
  # n_test <- nrow(df_valid)
  # p_test <- ncol(df_valid) - 1
  # r_adj_test <- 1 - ((1 - R_adj_test) * (n_test - 1) / (n_test - p_test - 1))
  # R_adj_test = r_adj_test
  res <- rms::lrm(df_valid[,var_rep]  ~ pred.GAM, x= TRUE, y = TRUE)
  res = res$stats
  r_adj_test = round (res[["R2"]],2)
  

  # Compute the MAE
  MAE <- mean(abs(pred.GAM - df_valid[,var_rep]))
  
  # Round results
  rmse <- round(rmse, 2)
  r_adj_train <- round(r_adj_train, 2)
  r_adj_test <- round(r_adj_test, 2)
  MAE <- round(MAE, 2)
  
  
  # output
  results_df <- data.frame(Algorithms = "GAM",
                         Response_variables = var_rep,
                         R2_adjusted_train = r_adj_train,
                         R2_adjusted_test = r_adj_test,
                         RMSE = rmse,
                         MAE = MAE)
  
  
  results <- list(RMSE = rmse, R_adj_train = r_adj_train, R_adj_test = r_adj_test, MAE = MAE, model = modelgam, predit = pred.GAM, df = results_df)
  
  return(results)
}
  • Family = gaussian

  • Link function = identity

  • Method = REML

  • Tuning

7.6 RF

  • Default model
  • RF model tuning by grid

  • ntree = \(100,300,500,700,900,1000,1300,1500,1700,2000\)

  • mtry = \(1, 2, 3, 4, 5, 6, 7, 8, 9\)

  • maxnodes = \(5, 10 , 20, 30, 50, 70, 100\)

Total number of models = \(ntree * mtry * maxnodes = 630\)

  • Validation of models on test data
Code
ForetAlea <- function(var_rep, df_app, df_valid, mtry, ntree, maxnodes) {
  
  set.seed(1863)
  col_posi <- which(names(df_app) == var_rep)
  ForeVDT <- randomForest::randomForest(df_app[-col_posi], df_app[[col_posi]], mtry = mtry, ntree = ntree, maxnodes = maxnodes)
  
  # Prediction on the validation dataset
  col_posi <- which(names(df_valid) == var_rep)
  pred.RF <- predict(ForeVDT, newdata = df_valid[, -col_posi])
  
  # Calculate RMSE to evaluate model quality
  rmse <- sqrt(mean((df_valid[, col_posi] - pred.RF)^2))
  
  
  # Adjusted R² on the training set
  R_adj_train <- calcule_R2(df_app[,var_rep],  predict(ForeVDT, newdata=df_app))
  n_train <- nrow(df_app)
  p_train <- ncol(df_app) - 1
  r_adj_train <- 1 - ((1 - R_adj_train) * (n_train - 1) / (n_train - p_train - 1))
  
  # Adjusted R² on the test set
  R_adj_test <-calcule_R2(df_valid[,col_posi],pred.RF)
  # n_test <- nrow(df_valid)
  # p_test <- ncol(df_valid) - 1
  # r_adj_test <- 1 - ((1 - R_adj_test) * (n_test - 1) / (n_test - p_test - 1))
   # R_adj_test = r_adj_test
  res <- rms::lrm(df_valid[,var_rep]  ~ pred.RF, x= TRUE, y = TRUE)
  res = res$stats
  r_adj_test = round (res[["R2"]],2)
  
  # Calculate MAE
  MAE <- mean(abs(pred.RF - df_valid[, col_posi]))
  
  # Round results
  rmse <- round(rmse, 2)
  r_adj_train <- round(r_adj_train, 2)
  r_adj_test <- round(r_adj_test, 2)
  MAE <- round(MAE, 2)
  
    # output
  results_df <- data.frame(Algorithms = "RF",
                         Response_variables = var_rep,
                         R2_adjusted_train = r_adj_train,
                         R2_adjusted_test = r_adj_test,
                         RMSE = rmse,
                         MAE = MAE)
  
  
  results <- list(RMSE = rmse, R_adj_train = r_adj_train, R_adj_test = r_adj_test, MAE = MAE, model = ForeVDT, predit = pred.RF, df = results_df)
  
  return(results)
}
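
The grid search itself is not shown; a minimal sketch reusing the ForetAlea() helper above and the abundance train/test sets (10 ntree x 9 mtry x 7 maxnodes = 630 fits):

Code
grille <- expand.grid(
  ntree    = c(100, 300, 500, 700, 900, 1000, 1300, 1500, 1700, 2000),
  mtry     = 1:9,
  maxnodes = c(5, 10, 20, 30, 50, 70, 100)
)
res_rf <- do.call(rbind, lapply(seq_len(nrow(grille)), function(i) {
  fit <- ForetAlea("AB_tot", AB_tot_train, AB_tot_test,
                   mtry = grille$mtry[i], ntree = grille$ntree[i],
                   maxnodes = grille$maxnodes[i])
  cbind(grille[i, ], RMSE = fit$RMSE, MAE = fit$MAE)
}))
res_rf[which.min(res_rf$RMSE), ]   # best hyper-parameter combination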

7.7 GBM

  • Default model

  • GBM model tuning by grid

  • n.trees = \(500, 1000,1500,1700,2000,3000\)

  • shrinkage = \(0.01, 0.02, 0.05, 0.001, 0.002, 0.005\)

  • interaction.depth = \(3, 5, 6, 8\)

  • n.minobsinnode = \(2, 5, 10, 30, 50, 70\)

Total number of models = \(n.trees * shrinkage * interaction.depth * n.minobsinnode = 864\)

  • Validation of models on test data
Code
GBM <- function(var_rep, df_app, df_valid,distribution = 'gaussian',n.trees ,shrinkage,interaction.depth,n.minobsinnode){
  set.seed(1863)

  formula <- substitute(var_rep ~ ., list(var_rep = as.name(var_rep)))

  Gradboost<-gbm(formula, data = df_app,
    distribution = distribution, 
    n.trees = n.trees,
    shrinkage = shrinkage,
    interaction.depth = interaction.depth,
    n.minobsinnode = n.minobsinnode) 
  
  # Predict on the validation set:
   col_posi <- which(names(df_valid) == var_rep)
  prev.GBM<-predict(Gradboost,newdata=as.data.frame(df_valid[,-col_posi]))
 
  # RMSE to assess model quality
  rmse <- sqrt(mean((df_valid[,var_rep] - prev.GBM)^2))


# Adjusted R² on the training set
  R_adj_train <- calcule_R2(df_app[,var_rep],  predict(Gradboost, newdata=df_app))
  n_train <- nrow(df_app)
  p_train <- ncol(df_app) - 1
  r_adj_train <- 1 - ((1 - R_adj_train) * (n_train - 1) / (n_train - p_train - 1))
  
  # Adjusted R² on the test set
  # R_adj_test <-calcule_R2(df_valid[,col_posi],prev.GBM)
  # n_test <- nrow(df_valid)
  # p_test <- ncol(df_valid) - 1
  # r_adj_test <- 1 - ((1 - R_adj_test) * (n_test - 1) / (n_test - p_test - 1))
   # R_adj_test = r_adj_test
  res <- rms::lrm(df_valid[,var_rep]  ~ prev.GBM, x= TRUE, y = TRUE)
  res = res$stats
  r_adj_test = round (res[["R2"]],2)
  

  # Compute the MAE
  MAE <- mean(abs(prev.GBM - df_valid[,col_posi])) 
  
    
    # Round results
  rmse <- round(rmse, 2)
  r_adj_train <- round(r_adj_train, 2)
  r_adj_test <- round(r_adj_test, 2)
  MAE <- round(MAE, 2)
  
  
      # output
  results_df <- data.frame(Algorithms = "GBM",
                         Response_variables = var_rep,
                         R2_adjusted_train = r_adj_train,
                         R2_adjusted_test = r_adj_test,
                         RMSE = rmse,
                         MAE = MAE)
  
  
  results <- list(RMSE = rmse, R_adj_train = r_adj_train, R_adj_test = r_adj_test, MAE = MAE, model = Gradboost, predit = prev.GBM, df = results_df)
  
  
  return(results)
}
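
The same pattern applies to the GBM grid (6 x 6 x 4 x 6 = 864 fits), reusing the GBM() helper above:

Code
grille_gbm <- expand.grid(
  n.trees           = c(500, 1000, 1500, 1700, 2000, 3000),
  shrinkage         = c(0.01, 0.02, 0.05, 0.001, 0.002, 0.005),
  interaction.depth = c(3, 5, 6, 8),
  n.minobsinnode    = c(2, 5, 10, 30, 50, 70)
)
res_gbm <- do.call(rbind, lapply(seq_len(nrow(grille_gbm)), function(i) {
  fit <- GBM("AB_tot", AB_tot_train, AB_tot_test,
             n.trees = grille_gbm$n.trees[i],
             shrinkage = grille_gbm$shrinkage[i],
             interaction.depth = grille_gbm$interaction.depth[i],
             n.minobsinnode = grille_gbm$n.minobsinnode[i])
  cbind(grille_gbm[i, ], RMSE = fit$RMSE, MAE = fit$MAE)
}))
res_gbm[which.min(res_gbm$RMSE), ]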

7.8 ANN

  • Default model
  • Tuning

runs <- tuning_run(“Experiment.R”, flags = list(dense_units1 = c(32, 64), dense_units2 = c(16, 32), dense_units3 = c(8, 16), dense_units4 = c(4, 8), dropout1 = c(0.4, 0.5), dropout2 = c(0.3, 0.4), dropout3 = c(0.2, 0.3), dropout4 = c(0.1, 0.2), batch_size = c(32, 64)))

  • hidden = c(32,32,16,8)
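
A minimal keras sketch of the retained architecture hidden = c(32, 32, 16, 8); the dropout rates are taken from the first values of the tuning flags above and are an assumption, not the final tuned values:

Code
library(keras)
build_ann <- function(n_pred) {
  keras_model_sequential() %>%
    layer_dense(units = 32, activation = "relu", input_shape = n_pred) %>%
    layer_dropout(rate = 0.4) %>%   # dropout1 (assumed)
    layer_dense(units = 32, activation = "relu") %>%
    layer_dropout(rate = 0.3) %>%   # dropout2 (assumed)
    layer_dense(units = 16, activation = "relu") %>%
    layer_dropout(rate = 0.2) %>%   # dropout3 (assumed)
    layer_dense(units = 8, activation = "relu") %>%
    layer_dense(units = 1) %>%      # single regression output
    compile(optimizer = "adam", loss = "mse", metrics = "mae")
}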

7.9 Compilation

  • ANN AB_tot
  • ANN BM_tot
  • ANN Richesse

8 ProDij results: total abundance

8.1 Abundance: GLM

The best GLM model is:

\(mod_6 = glm(AB_tot ~ SableF + LimonF + Argile + C_org + C.N + pH_eau + OS, df = data)\)

8.2 Abundance: GAM

The best GAM model is:

\(mod_4 = gam(AB_tot ~ Long + Lat + Argile + C_org + C.N + pH_eau + OS, df = data)\)

8.3 Abundance: RF

The best RF model is:

\(mod_9 = rf(AB_tot ~ Long + Lat + SableF + LimonF + Argile + C.N + pH_eau + OS + Sables + MO, df = data)\)

8.4 Abundance: GBM

The best GBM model is:

\(mod_4 = gbm(AB_tot ~ Long + Lat + SableF + LimonF + LimonG + Argile + C.N + pH_eau + OS + w_sol_ + Sables + SableG + Limons + C_tot + N_tot, df = data)\)

8.5 Abundance: ANN

The best ANN model is:

\(mod_11 = ann(AB_tot ~ SableF + LimonG + pH_eau + OS + Ferti_ + w_sol_ + Sables + N_tot, df = data)\)

8.6 Best models for: Abundance


GLM

\(mod = glm(AB_tot ~ SableF + LimonF + Argile + C_org + C.N + pH_eau + \\ OS, df = data)\)


GAM

\(mod = gam(AB_tot ~ Long + Lat + Argile + C_org + \\ C.N + pH_eau + OS, df = data)\)


RF

\(mod = rf(AB_tot ~ Long + Lat + SableF + LimonF + Argile + C.N + \\ pH_eau + OS + Sables + MO, df = data)\)


GBM

\(mod= gbm(AB_tot ~ Long + Lat + SableF + LimonF + LimonG + \\ Argile + C.N + pH_eau + OS + w_sol_ + Sables + SableG + Limons + \\ C_tot + N_tot, df = data)\)


ANN

\(mod = ann(AB_tot ~ SableF + LimonG + pH_eau + OS + \\ Ferti_ + w_sol_ + Sables + N_tot, df = data)\)



Conclusion: the best model for abundance is the RF

9 ProDij results: total biomass

9.1 Biomass: GLM

The best GLM model is:

\(mod_11 = glm(BM_tot ~ OS + C.N, df = data)\)

9.2 Biomass: GAM

The best GAM model is:

\(mod_11 = gam(BM_tot ~ Long + C.N + OS , df = data)\)

9.3 Biomass: RF

The best RF model is:

\(mod_14 = rf(BM_tot ~ Lat + C.N + pH_eau + LimonF + OS, df = data)\)

9.4 Biomass: GBM

The best GBM model is:

\(mod_4 = gbm(BM_tot ~ LimonF + SableG + C.N + N_tot + OS, df = data)\)

9.5 Biomass: ANN

The best ANN model is:

\(mod_11 = ann(BM_tot ~ C_tot + Limons + C.N + Long + w_sol_ + Ferti_ + OS, df = data)\)

9.6 Best models for: Biomass


GLM

\(mod = glm(BM_tot ~ OS + C.N, df = data)\)


GAM

\(mod = gam(BM_tot ~ Long + C.N + OS , df = data)\)


RF

\(mod = rf(BM_tot ~ Lat + C.N + pH_eau + LimonF + OS, df = data)\)


GBM

\(mod= gbm(BM_tot ~ LimonF + SableG + C.N + N_tot + OS, df = data)\)


ANN

\(mod = ann(BM_tot ~ C_tot + Limons + C.N + Long + w_sol_ + \\ Ferti_ + OS, df = data)\)



Conclusion: the best model for biomass is the RF

10 ProDij results: total richness

10.1 Richness: GLM

The best GLM model is:

\(mod_10 = glm(Richesse ~ LimonF + SableF + OS, df = data)\)

10.2 Richness: GAM

The best GAM model is:

\(mod_9 = gam(Richesse ~ SableF + C_org + pH_eau + OS, df = data)\)

10.3 Richness: RF

The best RF model is:

\(mod_2 = rf(Richesse ~ Long + Lat + SableF + LimonF + LimonG + Argile + C_org + C.N + pH_eau + OS + w_sol_ + Sables + SableG + Limons + MO + C_tot + N_tot, df = data)\)

10.4 Richness: GBM

The best GBM model is:

\(mod_13 = gbm(Richesse ~ C_tot + Limons + SableF + N_tot + pH_eau + Long, df = data)\)

10.5 Richness: ANN

The best ANN model is:

\(mod_2 = ann(Richesse ~ Long + Lat + SableF + LimonF + LimonG + Argile + C_org + pH_eau + OS + Ferti_ + w_sol_ + Sables + SableG + Limons + MO + C_tot + N_tot, df = data)\)

10.6 Best models for: Richness


GLM

\(mod = glm(Richesse ~ LimonF + SableF + OS, df = data)\)


GAM

\(mod = gam(Richesse ~ SableF + C_org + pH_eau + OS, df = data)\)


RF

\(mod = rf(Richesse ~ Long + Lat + SableF + LimonF + LimonG + Argile + \\ C_org + C.N + pH_eau + OS + w_sol_ + Sables + SableG + Limons + MO + \\ C_tot + N_tot, df = data)\)


GBM

\(mod= gbm(Richesse ~ C_tot + Limons + SableF + N_tot + pH_eau + \\ Long, df = data)\)


ANN

\(mod = ann(Richesse ~ Long + Lat + SableF + LimonF + LimonG + \\ Argile + C_org + pH_eau + OS + Ferti_ + w_sol_ + Sables + SableG + \\ Limons + MO + C_tot + N_tot, df = data)\)



Conclusion: the best model for richness is the GBM

11 Best of the best models

11.1 Performances

Models Rep.var R2_adj_train R2_test RMSE MAE
RF Abundance 0.83 0.19 6.28 4.71
RF Biomass 0.79 0.07 2.62 2.07
GBM Richness 0.30 0.24 0.82 0.62

11.2 Models


Abundance

\(mod = rf(AB_tot ~ Long + Lat + SableF + LimonF + Argile + C.N + \\ pH_eau + OS + Sables + MO, df = data)\)


Biomass

\(mod = rf(BM_tot ~ Lat + C.N + pH_eau + LimonF + OS, df = data)\)


Richness

\(mod = gbm(Richesse ~ C_tot + Limons + SableF + N_tot + pH_eau + \\ Long, df = data)\)

11.3 Optimization / reduction of overfitting in the best models


_____________Before_____________

Models Rep.var R2_adj_train R2_test RMSE MAE
RF Abundance 0.83 0.19 6.28 4.71
RF Biomass 0.79 0.07 2.62 2.07
GBM Richness 0.30 0.24 0.82 0.62


_____________After_____________

Models Rep.var R2_adj_train R2_test RMSE MAE
RF Abundance 0.52 0.19 6.33 4.83
RF Biomass 0.34 0.01 2.77 2.18
GBM Richness 0.3 0.24 0.82 0.62

12 Variable importance

12.1 Abundance

12.2 Biomass

12.3 Richness



12.4 Miscellaneous

Date of the next meeting about the publication

Variable selection strategy