LandWorm 2.0

Modeling and predicting earthworm diversity and distribution in France

Abdourahmane Diallo

2024-07-22

Setting

Packages

Code
  library(tidyverse)
  # library(glme)
  # library(lsmeans)
  # library(agricolae)
  # library(RVAideMemoire)
  library(corrplot)
  # library(emmeans)
  library(lme4)
  library(multcomp)
  library(MASS)
  # library(R2WinBUGS)
  library(arm)
  # library(performance)
  # library(AER)
  # library(AICcmodavg)
  # library(MuMIn)
  library(ade4)
  library(Hmisc)
  library(labdsv)
  library(vegan)
  library(cowplot)
  library(ggpubr)
  library(rstatix)
  library(patchwork)
  library(multcompView)
  library(ggsignif)
  library(grid)
  library(FactoMineR)
  library(factoextra)
  library(explore)
  library(ggrepel)
  library(naniar)
  library(outliers)
  library(leaps)
  library(fastDummies)
  library(caret) # pour l'entrainement des models
  library(mgcv)
  library(ggeffects)
  library(gratia)
  library(GGally) # pour ggpair
  # library(caTools)
  # library(rpart)
  # library(rpart.plot)
  library(openxlsx)
  library(readxl)
  library(leaflet) # pour la carto
  library(quarto)
  library(raster)
  library(knitr)
  library(kableExtra)
  library(stringr)
  library(plotly)
  # library(PerformanceAnalytics)
  # library(usdm)
  library(vcd) # pour la distribution des var reponse
  library(prospectr)# pour split data avec kenSton()
  # library(glmnet)
  library(randomForest)
  # library(doParallel)
  library(gbm)
  library(kernlab)
  # library(e1071)
  library(ggforce)
  library(keras)
  library(tensorflow)
  library(neuralnet)
  # library(parallel)
  library(iml) # pour l'interpretabilité des models https://cran.r-project.org/web/packages/iml/vignettes/intro.html
  library(stats)
  # library(Boruta) # importance des predicteurs
  library(bestNormalize)
  library(rmarkdown)
  library(DT)
  library(gtExtras) # pour la
  library(reshape2)
  # library(mapview)
  library(sf)
  library(ggplot2)
  library(maptools)
  library(ggsn)
  library(spThin)
  library(sp)
  library(gstat)
  library(rms)

Functions

Code
## Identification des NA dans un df -----------------------------------------------
taux_completion<-
  function(df, afficher_zero_percent = FALSE, seuil, trie=FALSE) {
    # Calcule du pourcentage de NA dans le dataframe
    pourcentage_total <-
      round(sum(is.na(df)) / (nrow(df) * ncol(df)) * 100, 1)
    
    # Calcule du pourcentage de NA par colonne
    pourcentage_colonnes <- round(colMeans(is.na(df)) * 100, 1)
    
    # Creation d'un dataframe résultat avec deux colonnes
    result <-
      data.frame(
        Variables = names(df),
        CR = pourcentage_colonnes,
        row.names = NULL
      )
    
    if (afficher_zero_percent) {
      result <- result[result$CR == 0, ]
      result$CR = 100 -result$CR
    } else {
      result <- result[result$CR > 0, ]
      result$CR = 100 -result$CR
      
    }
    
    result <- rbind(result, c("Total", pourcentage_total))
    #result <- rbind(result, c("Total", paste0(pourcentage_total, "")))
    
    result <- result[, c("Variables", "CR")]
    result$CR = as.numeric(result$CR)
    result$CR = round(result$CR,1)
    if (trie){
      result = result %>% arrange(desc(CR))
    }
    result$CR = paste0(result$CR,"%")
    
    return(result)
  }
# Converssion des colonne en num ou factor-----------------------------------------------
conv_col <- function (data, columns_to_convert, to_types) {
  if (to_types == "numeric") {
    # Conversion des colonnes en numeric
    for (col in columns_to_convert) {
      data[, col] <- as.numeric(data[, col])
    }
  } else {
    # Conversion des colonnes en facteurs
    for (col in columns_to_convert) {
      data[, col] <- as.factor(data[, col])
    }
  }
  return(data)
}
#data_converted <- conv_col(data, names(data [, c(1, 3)]), "factor")

# exploration graphiques des variables numeriques -----------------------------------------------
explo_num <- function(nom_col, titre, df = landworm, ligne_col = c(2, 2),mini = min(df[[nom_col]]), maxi=max(df[[nom_col]]) ) {
  par(mfrow = ligne_col)
  
  df[complete.cases(df[[nom_col]]), ]
  df <- df %>%dplyr::filter(!is.na(df[[nom_col]]))
  df[[nom_col]] = as.numeric(df[[nom_col]])
  # Boxplot
  boxplot(df[[nom_col]], col = 'blue', ylab = titre, ylim = c(mini, maxi))
  # Cleveland plot
  # dotchart(df[[nom_col]], pch = 16, col = 'blue', xlab = titre)
  plot(df[[nom_col]], pch = 16, col = 'blue', ylab = titre)
  # Histogram
  hist(df[[nom_col]], col = 'blue', xlab = titre, main = "")
  # Quantile-Quantile plot
  qqnorm(df[[nom_col]], pch = 16, col = 'blue', xlab = '')
  qqline(df[[nom_col]], col = 'red') 
}

# Extraction des predictors + moyennes -----------------------------------------------

extraction <- function(nom_col, tif_file_path, df = landworm, conv = 1) {
  #df <- df %>%filter(!is.na(gps_x) & !is.na(gps_y))
  raster_data <- raster(tif_file_path)
  
  # Création d'un dataframe pour stocker les valeurs extraites
  df_interne <- data.frame(gps_x = df$gps_x, gps_y = df$gps_y)
  proj4Str <- "+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs"
  # Transformer les coordonnées GPS en système de coordonnées du raster
  gps_coords_sp <- SpatialPoints(df_interne, proj4string = CRS(proj4Str))
  gps_coords_proj <- spTransform(gps_coords_sp, crs(raster_data))
  
  # Extraction des valeurs du raster 
  values <- raster::extract(raster_data, gps_coords_proj)
  
  # Ajout des valeurs extraites comme nouvelles colonnes a df
  #df_save = data.frame()
  #df_save[[nom_col]] <- values / conv
  
  df[[nom_col]] <- values / conv
  
  return(df)
}

# la moyenne des predictores -----------------------------------------------
moyenne_val_extrct <- function(nom_col, vec_col, df=landworm) {
  df[[nom_col]] <- rowMeans(as.matrix(df[, vec_col, drop = FALSE]), na.rm = TRUE)
  df[[nom_col]] = round(df[[nom_col]],1)
  return(as.data.frame(df))
}


# tests de corrélation avec un seuil -----------------------------------------------
cor_function_seuil <- function(data, seuil,affiche=FALSE) {
  # Création d'un vecteur pour stocker les paires de variables corrélées
  variables_corr <- c()
  
  # Boucle pour tester la corrélation entre chaque paire de variables
  for (i in 1:(ncol(data) - 1)) {
    for (j in (i + 1):ncol(data)) {
      # Calcul de la corrélation entre les variables i et j
      cor_value <- stats::cor(data[, i], data[, j], use = "na.or.complete")
      
      # Stockage du résultat dans le vecteur si supérieur au seuil
      if (cor_value >= seuil | cor_value <= -seuil) {
        if(affiche){
        cat(
          "***",
          colnames(data)[i],
          "  __est correlee a__  ",
          colnames(data)[j],
          "avec un R =",
          cor_value,
          "\n \n \n"
        )
      }
        
        variables_corr <-
          c(variables_corr, colnames(data)[i], colnames(data)[j])
      }
    }
  }
  
  return(variables_corr)
}


# tests de valeurs aberant -----------------------------------------------
test_grub <- function(data, variable, direction = "maxi") {
  
  if (direction == "maxi") { 
    repeat {
      #le test de Grubbs
      test_aberrant <- grubbs.test(data[[variable]], opposite = FALSE)
      
      # Obtenir la p-valeur du test
      p.value <- test_aberrant$p.value
      # Si la p-valeur est inférieure au seuil de 0.05, on supprime la valeur aberrante
      if (p.value < 0.05) {
        max_value <- max(data[[variable]],na.rm=TRUE)
        data <- subset(data, data[[variable]] != max_value | is.na(data[[variable]]))
      } else {
        # S'il n'y a plus de valeurs aberrantes, sortir de la boucle
        break
      }
    }
  }
  
  
  if (direction == "mini") { 
    repeat {
      test_aberrant <- grubbs.test(data[[variable]], opposite = TRUE)
      # Obtenir la p-valeur du test
      p.value <- test_aberrant$p.value
      # Si la p-valeur est inférieure au seuil de 0.05, on supprime la valeur aberrante
      if (p.value < 0.05) {
        min_value <- min(data[[variable]],na.rm=TRUE)
        data <- subset(data, data[[variable]] != min_value | is.na(data[[variable]]))
      } else {
        # S'il n'y a plus de valeurs aberrantes, sortir de la boucle
        break
      }
    }
  }
  
  
  return(data)
}




# boxplote -----------------------------------------------
plot_boxplot <-function(donnee,
           x_col,y_col,x_label,y_label,title,legend_title,
           couleurs,
           affiche_point = TRUE,
           ymin = min(donnee[[y_col]]),
           ymax = 1.2 * max(donnee[[y_col]])) {
    
  graphe <-ggplot(donnee,
             aes_string(
               x = x_col,
               y = y_col,
               colour = x_col
             )) +
  geom_boxplot(
        outlier.shape = NA,
        outlier.colour = "black",
        alpha = 0.20,
        size = 1.5 
      ) +
  labs(title = title,x = x_label,y = y_label) +
  scale_color_manual(values = couleurs, name = legend_title) +
  theme_classic(base_size = 12, base_family = "Arial") +
  theme(axis.text = element_text(size = 10),
        axis.title.y = element_text(
          vjust = 5, size = 12, face = "bold"),
        axis.title.x = element_text(face = "bold"),
        axis.ticks.length = unit(0.2, "cm"),
        legend.position = "none",  # Cette ligne supprime la legende
        #legend.position = "right",
        legend.text = element_text(size = 10),
        legend.title = element_text(size = 12, face = "bold"),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        plot.title = element_text(size = 14, face = "bold"),
        plot.margin = unit(c(0.5, 0.5, 0.5, 0.5), "cm")
      )
    if (affiche_point) {
      graphe <-
        graphe + geom_jitter(position = position_jitter(seed = 0.5), size = 0.8)
    }
    
    if (y_col %in% names(donnee)) {
      graphe <- graphe +
        coord_cartesian(ylim = c(ymin, ymax))
    }
  
    graphe = graphe + stat_summary(
      fun.y = mean,
      geom = "point",
      shape = 15,
      size = 1.5,
      col = "black",
      fill = "black"
    )
    
    return(graphe)
}



#pour le  pairwise.t.test() -----------------------------------------------------
tri.to.squ <- function(x) {
  rn <- row.names(x)
  cn <- colnames(x)
  an <- unique(c(cn, rn))
  myval <- x[!is.na(x)]
  mymat <-
    matrix(
      1,
      nrow = length(an),
      ncol = length(an),
      dimnames = list(an, an)
    )
  for (ext in 1:length(cn))
  {
    for (int in 1:length(rn))
    {
      if (is.na(x[row.names(x) == rn[int], colnames(x) == cn[ext]]))
        next
      mymat[row.names(mymat) == rn[int], colnames(mymat) == cn[ext]] <-
        x[row.names(x) == rn[int], colnames(x) == cn[ext]]
      mymat[row.names(mymat) == cn[ext], colnames(mymat) == rn[int]] <-
        x[row.names(x) == rn[int], colnames(x) == cn[ext]]
    }
  }
  return(mymat)
}



# Selection interaction -------------------------------
select_inter <- function(response_var, df, explanatory_vars) {
  results <- data.frame()
  combinations <- combn(explanatory_vars, 2, simplify = FALSE)

  for(i in seq_along(combinations)) {

    formula <- as.formula(paste(response_var, "~", paste(combinations[[i]], collapse = "*")))
    model <- gam(formula, data = df)
    r_squared <- summary(model)$r.sq
    aic <- AIC(model)
    results <- rbind(results, data.frame("variables" = paste0(combinations[[i]], collapse = ".inter."), 
                                         "r_squared" = r_squared, 
                                 "aic" = aic))
  }
  return(results)
}

# Comparaion betwen predtited and observed -----------------------------------
plot_comp = function (df,ylabel, title_class, legende = TRUE,plotly = FALSE,xlabel = "observations",title=""){ 

  
  p = ggplot(df, aes(x = observation)) + 
  #graph representant observed
  geom_point(aes(y = Observed, color = "Observed values")) +
  geom_line(aes(y = Observed, color = "Observed values")) + 
  
  #graph representant  preticted
  geom_point(aes(y = Predicted, color="Predicted values")) +
  geom_line(aes(y = Predicted, color="Predicted values")) + 
  # ggtitle(title)
  theme(plot.title = element_text(hjust = 0.5)) + 
  labs(title = title,x=xlabel, y=ylabel, color = "Legend :") + 
  ylim(min(c(min(df$Predicted), min(df$Observed))),
            max(c(max(df$Predicted), max(df$Observed)))+1  ) +
    
  scale_color_manual(values = c("Observed values"='red', "Predicted values"='green')) +
  annotate("text", x = 8, y =  max(c(max(df$Predicted), max(df$Observed)))+1, 
           label = title_class, col = "black", size = 3)

  
  if (!legende) {
    p <- p + theme(legend.position = "none")
  }
  
  if(plotly){
    p = ggplotly(p)
  }

return (p)

}


# Calcul R² -----------------------------------
# calcule_R2 = function(x, y) {cor(x, y)^2}
calcule_R2 <- function(y_true, y_pred) {
  sse <- sum((y_true - y_pred)^2)
  sst <- sum((y_true - mean(y_true))^2)
  r_squared <- 1 - (sse / sst)
  return(r_squared)
}



# Fonction d'exploration graphique des relations entre les vd et les predictors -----------------------------------
nuage_point <- function(df, var_rep, predicteur, titre_x, titre_y) {
  # Convertir les variables en numeriques
  df[[predicteur]] <- as.numeric(df[[predicteur]])
  df[[var_rep]] <- as.numeric(df[[var_rep]])
  
  df= df[,c(predicteur,var_rep)]
  df = drop_na(df)
  # correlation entre les deux variables
  correlation <- round(cor(df[[predicteur]], df[[var_rep]]), 3)
  
  #  le graphique
  g <- ggplot(df, aes(x = !!rlang::sym(predicteur), y = !!rlang::sym(var_rep))) +
    geom_point() + # Ajout des points
    geom_smooth(method = "lm", se = TRUE, color = "red") + 
    labs(
      # title = paste("Relationship between",var_rep, "and", predicteur),
         subtitle = paste0(" r = ", correlation),
         x = titre_x, 
         y = titre_y) +
    theme_classic()   
  
  return(g)
}

box_plot <- function(df, var_rep, predicteur, titre_x, titre_y) {
  g <- ggplot(df, aes(x = !!rlang::sym(predicteur), y = !!rlang::sym(var_rep))) +
    geom_boxplot() + # Ajout des boxplots
    labs(
      # title = paste("Boxplot of", var_rep, "by", predicteur),
         x = titre_x, 
         y = titre_y) +
    theme_classic()   
  
  print(g)
}

1 Database import

  • Import of database LandWorm_dataset_site_01_07_2024.xlsx (february 22, 2024)
  • The database contains 8143 rows and 479 columns

Columns with only NAs

To make the dataset easier to read, I identify and remove all columns with only NA:

[1] "thermal_weed_control" "amdmt_orga_freq"      "amdmt_orga_names"    
[4] "amdmt_orga_qtty"      "amdmt_calcic"         "amdmt_calcic_names"  
[7] "amdmt_calcic_qtty"   
  • The database contains 8143 rows and 472 columns

1.1 Protocols (avant séléction)

  • List of protocols available on the database ( 22 levels)
Code
landworm$Protocole = as.factor(landworm$Protocole)
df <- as.data.frame(summary(landworm$Protocole))
colnames(df) <- c("Numbers")
DT::datatable(df, options = list(pageLength = 5))
Code
rm("df")
  • Selection of protocols: 16
  • The database therefore changes from 8143 to 6303 observations.

1.2 Data selection: LandWorm

  • On séléctionne tout sauf NA
Numbers
cp 468
dc 4324
gp 299
mh 855
sg 357
Code
df_suivi = landworm
n_line = nrow(df_suivi)

landworm <- subset(landworm, owner %in% c("dc","cp","gp","sg", "mh"))
landworm$owner=droplevels(landworm$owner)
  • The database therefore changes from 6303 to 6303 observations.

2 Database exploration

  • CR = Completion rate

2.1 Complete columns

Code
df_col=taux_completion(landworm,TRUE,trie=FALSE)
df_col = df_col[df_col$Variables != "Total",]
#print("table")
kable(df_col, caption = "", col.width = c("75%", "25%"))
Variables CR
72 ID 100%
73 Protocole 100%
75 owner 100%
76 AB_tot 100%
Code
# cat(                                                    )
# head(landworm[, "ID"])

2.2 Non-complete columns

2.3 Focus on GPS coordinates

  • There is 266 NA in GPS_X
  • There is 269 NA in GPS_Y
Code
df_suivi = landworm
n_line = nrow(df_suivi)

# landworm$gps_x <- as.numeric(gsub("[^0-9.-]", "", landworm$gps_x))
# landworm$gps_y <- as.numeric(gsub("[^0-9.-]", "", landworm$gps_y))
landworm <- landworm[complete.cases(landworm$gps_x, landworm$gps_y), ]
landworm <- landworm %>%dplyr::filter(!is.na(gps_x) & !is.na(gps_y))
#sum(is.na(landworm$gps_x))
#sum(is.na(landworm$gps_y))
  • We delete the NA lines in the GPS coordinates
  • The database therefore changes from 6303 to 6034 observations.

2.4 Cartography

Code
df_suivi = landworm
n_line = nrow(df_suivi)

df_coord <- landworm[, c("gps_x", "gps_y")] %>% mutate(gps_x = as.numeric(gps_x),gps_y = as.numeric(gps_y))

df_coord$num_ligne <- seq(nrow(df_coord))
carte <- leaflet(df_coord) %>%
  addTiles() %>%
  addCircleMarkers(lng = ~gps_x, lat = ~gps_y, radius = 0.8, fillOpacity = 0.8, fillColor = "blue")
carte
  • We delete points outside France (22)
  • The database therefore changes from 6034 to 6012 observations.

2.5 Focus on years

  • Cleaning the Annee column
  • CR of Annee = 98.9% (35 levels)
  • We remove all the years before 1990 and the NA ( 0 observations)
Code
# sum(is.na(landworm$Annee))
landworm <- landworm %>%dplyr::filter(!is.na(Annee))# on enleve les NA
annes_omit= c("1821", "1960", "1978", "1982", "1983", "1984", "1986", "1988", "1989", "1905") # annee sup
landworm <- landworm[!landworm$Annee %in% annes_omit, ]
landworm=droplevels(landworm)
summary_df <- data.frame(Numbers = summary(landworm$Annee))
summary_df$Annee = rownames(summary_df)
summary_df = summary_df[,c("Annee", "Numbers")]
rownames(summary_df) = NULL
summary_df = summary_df %>% arrange(Annee)
summary_df %>% datatable(options = list(pageLength = 10))
Code
rm("summary_df")
  • The database therefore changes from 6012 to 6010 observations.

2.6 Focus on clcm_lvl1

  • CR of clcm_lvl1 = 84.2% (7 levels)
Code
landworm$clcm_lvl1= as.factor(landworm$clcm_lvl1)
summary_df <- as.data.frame(summary(landworm$clcm_lvl1))
colnames(summary_df) <- c("Numbers")
kable(summary_df,padding = 5)
Numbers
1_Naturel 1
2_Agricole 3
Agricultural areas 3706
Artificial surfaces 834
Forest and semi natural areas 541
Territoires agricoles 1
Wetlands 30
NA's 894
  • Merging levels
Code
levels(landworm$clcm_lvl1)[levels(landworm$clcm_lvl1) == "1_Naturel"] <- "Forest and semi natural areas"
levels(landworm$clcm_lvl1)[levels(landworm$clcm_lvl1) == "2_Agricole"] <- "Agricultural areas"

levels(landworm$clcm_lvl1)[levels(landworm$clcm_lvl1) == "Territoires agricoles"] <- "Agricultural areas"

landworm$clcm_lvl1= as.factor(landworm$clcm_lvl1)
summary_df <- as.data.frame(summary(landworm$clcm_lvl1))
colnames(summary_df) <- c("Numbers")
kable(summary_df,padding = 5)
Numbers
Forest and semi natural areas 542
Agricultural areas 3710
Artificial surfaces 834
Wetlands 30
NA's 894
  • Update code_clcm_lvl1
Code
#landworm$code_clcm_lvl1 = as.factor(landworm$code_clcm_lvl1)

landworm$code_clcm_lvl1 <- ifelse(landworm$clcm_lvl1 == "Forest and semi natural areas", 3, landworm$code_clcm_lvl1)

landworm$code_clcm_lvl1 <- ifelse(landworm$clcm_lvl1 == "Agricultural areas", 2, landworm$code_clcm_lvl1)
  • For the moment, we will keep the NA of clcm_lvl1

2.7 Focus on clcm_lvl2

  • CR of clcm_lvl2 = 84.2% (13 levels)
Code
landworm$clcm_lvl2= as.factor(landworm$clcm_lvl2)
summary_df <- as.data.frame(summary(landworm$clcm_lvl2))
colnames(summary_df) <- c("Numbers")
summary_df %>% datatable(options = list(pageLength = 5))
  • Merging levels
Code
levels(landworm$clcm_lvl2)[levels(landworm$clcm_lvl2) == "21_Agricole ouvert"] <- "Arable land"

landworm$clcm_lvl2= as.factor(landworm$clcm_lvl2)
summary_df <- as.data.frame(summary(landworm$clcm_lvl2))
colnames(summary_df) <- c("Numbers")
# kable(summary_df,padding = 5)
summary_df %>% datatable(options = list(pageLength = 10), rownames = TRUE)
  • Update code_clcm_lvl2
Code
landworm$code_clcm_lvl2 <- ifelse(landworm$clcm_lvl2 == "Arable land", 21, landworm$code_clcm_lvl2)

2.8 Focus on clcm_lvl3

  • CR of clcm_lvl3 = 78.5% (29 levels)
Code
df_suivi = landworm
n_line = nrow(df_suivi)

landworm$clcm_lvl3= as.factor(landworm$clcm_lvl3)
summary_df <- as.data.frame(summary(landworm$clcm_lvl3))
colnames(summary_df) <- c("Numbers")
# kable(summary_df,padding = 5)
summary_df %>% datatable(options = list(pageLength = 10), rownames = TRUE)

2.9 Land use selection (clcm_lvl3)

  • Broad-leaved forest

  • Indetermined forest

  • Mixed forest

  • Urban forest and woodland

  • Pastures, meadows and other permanent grasslands under agricultural use

  • Non-irrigated arable land

  • Vineyards

  • Green urban areas

  • Natural grasslands

Code
# select_os= c("Broad-leaved forest", "Coniferous forest", "Mixed forest", 
# "Pastures, meadows and other permanent grasslands under agricultural use", "Non-irrigated arable land", "Vineyards","Green urban areas","Natural grasslands")

select_os= c("Broad-leaved forest", "Mixed forest", "Indetermined forest", 
             "Urban forest and woodland", "Pastures, meadows and other permanent grasslands under agricultural use", "Non-irrigated arable land", "Vineyards","Green urban areas","Natural grasslands")


landworm <- landworm[landworm$clcm_lvl3 %in% select_os, ]
landworm=droplevels(landworm)
landworm$clcm_lvl3 = as.factor(landworm$clcm_lvl3)
summary_df <- as.data.frame(summary(landworm$clcm_lvl3))
colnames(summary_df) <- c("Numbers")
summary_df %>% datatable(options = list(pageLength = 5))
  • We merge the 4 types of forest:

    • Broad-leaved forest
    • Indetermined forest
    • Mixed forest
    • Urban forest and woodland
  • The database therefore changes from 6010 to 4204 observations.

2.10 Focus on protocols (après séléction)

  • List of protocols available on the database ( 7 levels)
Code
landworm$Protocole = as.factor(landworm$Protocole)
summary_df <- as.data.frame(summary(landworm$Protocole))
colnames(summary_df) <- c("Numbers")
summary_df %>% datatable(options = list(pageLength = 5))
  • Merging levels :

    • F_HS \(=\) F_HS \(+\) FHS
    • HS \(=\) HS \(+\) hand sorting
Code
levels(landworm$Protocole)[levels(landworm$Protocole) == "FHS"] <- "F_HS"
levels(landworm$Protocole)[levels(landworm$Protocole) == "hand sorting"] <- "HS"
landworm$Protocole = as.factor(landworm$Protocole)
summary_df <- as.data.frame(summary(landworm$Protocole))
colnames(summary_df) <- c("Numbers")
summary_df %>% datatable(options = list(pageLength = 5))

2.11 Land use & protocol overview

Code
# kable (table(landworm$clcm_lvl1, landworm$Protocole,exclude = NULL), align = "c", format = "pipe", padding = 10)
# kable (table(landworm$clcm_lvl2, landworm$Protocole,exclude = NULL), align = "c", format = "pipe", padding = 10)
df = table(landworm$clcm_lvl3, landworm$Protocole,exclude = NULL)
# df = as.data.frame(df)
kable (df, align = "c", format = "pipe", padding = 10)
AITC_HS AITCTM F_HS HS HS_F_16 HS_M_16 M_HS
Forest 4 2 22 147 0 0 3
Green urban areas 2 0 0 595 0 0 10
Natural grasslands 0 66 3 62 0 0 9
Non-irrigated arable land 80 81 358 1267 24 90 36
Pastures, meadows and other permanent grasslands under agricultural use 60 0 227 264 0 42 0
Vineyards 0 1 374 375 0 0 0
Code
# df %>% datatable(options = list(pageLength = 8))

3 Earthworms data

3.1 Total abundance

Sppression des valeurs aberrantes

  • The database therefore changes from 4204 to 4163 observations.

3.2 Total biomass

Sppression des valeurs aberrantes

  • The database therefore changes from 4163 to 4143 observations.

3.3 Total taxonomic richness

Total richness calculation method

    1. Removal of columns with only NA and/or only 0
    1. Merging sub-species with their species
Voici la liste des sp a fusionnée: 
Code
# function pour fusion les sous especes a leurs especes
sp_identique = function(df, nom_sous_espece, nom_espece ) {
  
  # Si les deux colonnes existent dans df
  if (nom_sous_espece %in% names(df) && nom_espece %in% names(df)) {
    
    # Addition des valeurs des deux colonnes et stockage du résultat dans nom_espece
    # df[[nom_espece]] = df[[nom_sous_espece]] + df[[nom_espece]]
    df[[nom_espece]] = rowSums(df[,c(nom_sous_espece,nom_espece)], na.rm = TRUE)
    

    df[[nom_sous_espece]] = NULL
  
  # Si nom_espece n'existe pas dans df mais nom_sous_espece existe
  } else if (nom_sous_espece %in% names(df) && !(nom_espece %in% names(df))) {
    
    # on renome nom_sous_espece par nom_espece
    names(df)[names(df) == nom_sous_espece] <- nom_espece
    df[[nom_sous_espece]] = NULL
  }
  
  return(df)
}


for (i in 1:nrow(df_sp_sous_sp)){
 
landworm_sp = sp_identique(df = landworm_sp,nom_sous_espece = df_sp_sous_sp[i,"col_sp_origines"],
                  nom_espece = df_sp_sous_sp[i,"col_sp_concatener"]) 
}
    1. Identify columns beginning with AB_
    1. Deletion of AB_ columns that are not species
Code
# On supprimme les colonnes AB_ qui ne sont pas des espèces dans le calcule
ab_supprimee =  c("AB_AD",
                  "AB_JV",
                  "AB_SA",
                  "AB_STAD_X",
                  "AB_indéterminable",
                  "AB_Indéterminable",
                  "AB_indéterminable_endogeic",
                  "AB_tot",
                  "AB_Indéterminable_epigeic",
                  "AB_indéterminable_endogeic",
                  "AB_Ep.X",
                  "AB_vide",
                  "AB_Ep.X1",
                  "AB_Ep.X2",
                  "AB_A.X",
                  "AB_Adult",
                  "AB_cocon",
                  "AB_indéterminé",
                  "AB_Juvenile",
                  "AB_Sub.adult",
                  "AB_Indéterminé")

colonnes_AB <- colonnes_AB[!colonnes_AB %in% ab_supprimee]

cat("A ce stade, la liste des especes est: \n")
A ce stade, la liste des especes est: 
Code
df= data.frame(colonnes_AB)
rownames(df) = NULL
DT::datatable(df, options = list(pageLength = 5))
    1. Calculate richness by assigning 1 to each column if the value is different from 0 and NA

A ce stade, la richesse varie de:

Code
# On calcule la richesse en attribiant 1 à chaque colonne si la valeur est différent de 0 et de NA
landworm_sp$Richesse_tot <- 0
landworm_sp$Richesse_tot <- rowSums(!is.na(landworm_sp[colonnes_AB]) & landworm_sp[colonnes_AB] != 0)
#sum (is.na(landworm_sp$Richesse_tot) )
summary(landworm_sp$Richesse_tot)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  0.000   3.000   5.000   4.923   7.000  14.000 
    1. Décremmentation de la richesse

Voir code:

Code
# Apres calcule richesse ------------------------------------------------------
sp_prorata <- function(df, sp_x, sp) {
  
  name_col <- names(df)
  
  #  si la colonne sp_x est présente
  if (sp_x %in% name_col) {
    
    # pour chaque ligne 
    for (i in 1:nrow(df)) {
      
      #  si les valeurs de la colonne sp_x ne sont ni 0 ni NA
      if (!is.na(df[i, sp_x]) && df[i, sp_x] != 0) {
        
        # Si la somme des ab des sp est différente de 0
        if (rowSums(df[i, sp], na.rm = TRUE) != 0) {
          # La richesse diminue de 1
          df[i, "Richesse_tot"] <- df[i, "Richesse_tot"] - 1
        }
      }
    }
  }
  
  return(df)
}

# Pour AB_Allolobophora_sp ------------------------------------------
sp_x = "AB_Allolobophora_sp"
sp=c("AB_Allolobophora_burgondiae", 
     "AB_Allolobophora_chlorotica",         
     "AB_Aporrectodea_cupulifera",  
     "AB_Aporrectodea_icterica",    
     "AB_Aporrectodea_limicola",    
     "AB_Aporrectodea_rosea")
# df= landworm_sp[, c(sp_x,sp, "Richesse_tot")]
# df=df[!df$AB_Allolobophora_sp == 0,]
# df=df[! is.na(df$AB_Allolobophora_sp),]
# df$som = rowSums(df[,sp], na.rm=TRUE)
# View(df)
# 
# dff <- sp_prorata(landworm_sp, sp_x, sp)
# dff= dff[, c(sp_x,sp, "Richesse_tot")]
# dff=dff[!dff$AB_Allolobophora_sp == 0,]
# dff=dff[! is.na(dff$AB_Allolobophora_sp),]
# dff$som = rowSums(dff[,sp], na.rm=TRUE)
# View(dff)
landworm_sp <- sp_prorata(landworm_sp, sp_x, sp)


# Pour AB_Aporrectodea_indéterminable ------------------------------------------
sp_x = "AB_Aporrectodea_indéterminable"
sp=c("AB_Allolobophora_burgondiae", 
     "AB_Allolobophora_chlorotica",
     "AB_Aporrectodea_cupulifera",
     "AB_Aporrectodea_icterica",
     "AB_Aporrectodea_limicola",
     "AB_Aporrectodea_rosea")
landworm_sp <- sp_prorata(landworm_sp, sp_x, sp)


# Pour AB_Aporrectodea_sp ------------------------------------------
sp_x = "AB_Aporrectodea_sp"
sp=c("AB_Aporrectodea_giardi",  
     "AB_Aporrectodea_longa",   
     "AB_Aporrectodea_nocturna",
     "AB_Aporrectodea_ripicola",
     "AB_Aporrectodea_longa/giardi")
landworm_sp <- sp_prorata(landworm_sp, sp_x, sp)



# Pour AB_Dendrobaena_sp ------------------------------------------
# Tous les "Dendrobaena" et "Dendrodrilus_rubidus" + Satchellius mammalis
sp_x = "AB_Dendrobaena_sp"
sp=c("AB_Dendrobaena_alpina",     
     "AB_Dendrobaena_attemsi",
     "AB_Dendrobaena_cognettii",
     "AB_Dendrobaena_hortensis",      
     "AB_Dendrobaena_octaedra",
     "AB_Dendrodrilus_rubidus",
     "AB_Satchellius_mammalis")
landworm_sp <- sp_prorata(landworm_sp, sp_x, sp)


# Pour AB_Eisenia_sp ------------------------------------------
# Tous les Eisenia possible
sp_x = "AB_Eisenia_sp"
sp=c("AB_Eisenia_andrei",
     "AB_Eisenia_fetida",
     "AB_Eisenia_veneta",             
     "AB_Eiseniella_tetraedra")
landworm_sp <- sp_prorata(landworm_sp, sp_x, sp)


# Pour AB_Lumbricus_sp  ------------------------------------------
# Tous les Lumbricus sauf Lumbricus castaneus ?
sp_x = "AB_Lumbricus_sp"
sp=c("AB_Lumbricus_centralis",       
"AB_Lumbricus_festivus",         
"AB_Lumbricus_friendi",          
"AB_Lumbricus_herculeus",        
"AB_Lumbricus_meliboeus",        
"AB_Lumbricus_rubellus" ,        
"AB_Lumbricus_terrestris",
"AB_Lumbricus_friendi/centralis")
landworm_sp <- sp_prorata(landworm_sp, sp_x, sp)



# Pour AB_Microscolex_sp  ------------------------------------------
# Tous les AB_Microscolex
sp_x = "AB_Microscolex_sp"
sp=c("AB_Microscolex_dubius",         
"AB_Microscolex_phosphoreus")
landworm_sp <- sp_prorata(landworm_sp, sp_x, sp)



# Pour AB_Octolasion_sp  ------------------------------------------
# Tous les AB_Octolasion
sp_x = "AB_Octolasion_sp"
sp=c("AB_Octodrilus_complanatus",    
"AB_Octolasion_cyaneum",         
"AB_Octolasion_lacteum")
landworm_sp <- sp_prorata(landworm_sp, sp_x, sp)



# Pour AB_Pheretima_indéterminable  ------------------------------------------
# AB_Pheritima_Diffringens
sp_x = "AB_Pheretima_indéterminable"
sp=c("AB_Pheritima_Diffringens", "AB_Pheritima_Diffringens")
landworm_sp <- sp_prorata(landworm_sp, sp_x, sp)



# Pour AB_Prosellodrilus_sp  ------------------------------------------
# Tous les Prosellodrilus
sp_x = "AB_Prosellodrilus_sp"
sp=c("AB_Prosellodrilus_amplisetosus",
"AB_Prosellodrilus_fragilis",    
"AB_Prosellodrilus_occidentalis",
"AB_Prosellodrilus_praticola",   
"AB_Prosellodrilus_pyrenaicus")
landworm_sp <- sp_prorata(landworm_sp, sp_x, sp)



# Pour AB_Scherotheca_sp  ------------------------------------------
# Tous les AB_Scherotheca_sp
sp_x = "AB_Scherotheca_sp"
sp=c("AB_Satchellius_mammalis",       
"AB_Scherotheca_aquitana",       
"AB_Scherotheca_dinoscolex",     
"AB_Scherotheca_nivicola",       
"AB_Scherotheca_porotheca",      
"AB_Scherotheca_rhodana",       
"AB_Scherotheca_savignyi")
landworm_sp <- sp_prorata(landworm_sp, sp_x, sp)
    1. Rules 3: Si le Owner de la parcelle est DC ou GP alors si Aporrectodea_trapezoides et/ou Aporrectodea_tuberculata sont presentent dans la parcelle alors si Aporrectodea_caliginosa est presente aussi dans la parcelle alors la richesse diminue de 1.
Code
# dff <- data.frame(
#   owner = c("DC", "GP", "XX", "DC"),
#   AB_Aporrectodea_trapezoides = c(1, NA, 0, 2),
#   AB_Aporrectodea_tuberculata = c(NA, 0, 1, 3),
#   AB_Aporrectodea_caliginosa = c(2, 1, NA, 4),
#   Richesse_tot = c(10, 20, 30, 40)
# )
# df <- dff

df=landworm_sp
for (i in 1:nrow(df)) {
  # Si owner est "DC" ou "GP"
  if (df[i, "owner"] %in% c("DC", "GP")) {
    
    # si AB_Aporrectodea_trapezoides ou AB_Aporrectodea_tuberculata sont présents et non nuls
    if ((!is.na(df[i, "AB_Aporrectodea_trapezoides"]) && 
         df[i, "AB_Aporrectodea_trapezoides"] != 0) 
        || 
        (!is.na(df[i, "AB_Aporrectodea_tuberculata"]) && 
         df[i, "AB_Aporrectodea_tuberculata"] != 0)) {
      
      #  si AB_Aporrectodea_caliginosa est présente et non nulle
      if (!is.na(df[i, "AB_Aporrectodea_caliginosa"]) && 
          df[i, "AB_Aporrectodea_caliginosa"] != 0) {
        
        # La richesse diminue de 1 pour cette ligne
        df[i, "Richesse_tot"] <- df[i, "Richesse_tot"] - 1
      }
    }
  }
}

landworm_sp = df
    1. Rules 4: Si la Richesse_tot est superieur à 2 et que AB_Lumbricidae et/ou AB_Oligochaeta_so est presente (differente de 0 et de NA), la Richesse_tot dimunue de -1
    1. Verifications

Sppression des valeurs aberrantes

  • The database therefore changes from 4143 to 4143 observations.

Il n y a pas de valeurs aberant. Cependant, il y a 11 parcelles qui ont une richesse superieur a 12, a garder ou a supprimer ?

Details des parcelles a forte richesse (sup à 12):

I remove the 11 plots with richness greater than 12.

  • The database therefore changes from 4143 to 4132 observations.

4 Branch 1

Saving the database after applying the previous filters.

At this stage, the database has 4132 rows and 260 columns without the climatic and pedological variables.

5 Soil data extraction and adding to landWorm

5.1 The source database

5.2 Extraction et fusion

Code
# Ajout variables du soil  (voir script Add_soil_dats.R)
soil_datas <- read.xlsx("datas/soil_datas.xlsx" , sheet = "Sheet 1")
soil_datas <- subset(soil_datas, select = -c(Programme,Annee,Date_Prelevement,ID_Site,gps_x, gps_y))


rows_not_in_soil_datas <- anti_join(landworm, soil_datas, by = "ID_2")
rows_not_in_landworm <- anti_join(soil_datas,landworm, by = "ID_2")

# plot(soil_datas$ID_2, landworm$ID_2)

merged_df <- merge(landworm, soil_datas, by = "ID_2")
ids_not_matching <- anti_join( merged_df,landworm, by = "ID_2")

# colSums(is.na(soil_datas))

nn = nrow(merged_df)

merged_df = drop_na(merged_df,names(soil_datas))

p_sans_pedo = nn - nrow(merged_df)

landworm = merged_df

196 plots have at least one NA in the pedological variables, so I am removing them, and the database goes from 4132 to 3936.

5.3 Sand

Extracted values (g/kg, 0 - 30 cm)

  • The database therefore changes from 3936 to 3934 observations.

5.4 Silt

Extracted values (g/kg, 0 - 30 cm)

Sppression des valeurs aberrantes

  • The database therefore changes from 3934 to 3934 observations.

5.5 Clay

Extracted values (g/kg, 0 - 30 cm)

Sppression des valeurs aberrantes

  • The database therefore changes from 3934 to 3934 observations.

5.6 Soil organic carbone (g/kg)

  • The database therefore changes from 3934 to 3912 observations.

5.7 pH (H2O)

  • The database therefore changes from 3912 to 3912 observations.

5.8 Phosphore (P, mg/kg)

Sppression des valeurs aberrantes

  • The database therefore changes from 3912 to 3912 observations.

5.9 Azote (N, g/kg)

Sppression des valeurs aberrantes

  • The database therefore changes from 3912 to 3901 observations.

5.10 Potassium (K, mg/kg)

  • The database therefore changes from 3901 to 3891 observations.

5.11 C/N

  • The database therefore changes from 3891 to 3887 observations.

5.12 Carbonates de calcium (CaCO3, g/kg)

Sppression des valeurs aberrantes

  • The database therefore changes from 3887 to 3882 observations.

6 Climate data extraction and adding to landWorm

6.1 The source database (CHELSA V2)

Code
# Lire le fichier Excel
chemin_fichier_excel <- "C:/Users/diall/Downloads/datas/ODMAP.xlsx"
climat <- read.xlsx(chemin_fichier_excel, sheet = "climat")

# Fusions des cellules des colonnes avec des éléments dupliqués
for (col in names(climat)) {
  climat[[col]] <- ifelse(duplicated(climat[[col]]), "", climat[[col]])
}

# Affichage du tableau avec kableExtra et centrage du contenu des cellules
# kableExtra::kable(climat)

For each of the following variables, I calculate the mean and the standard deviation at 3 months, 6 months, and 120 months (30 variables in total).

Data available only until 2018/2019.

6.2 Extraction method

See file Script 1 DownloadChelsaData

See file Script 2 AddchelsaData

Steps:

  1. Obtain a vector of “month_Year” periods:

    • Extract the unique periods (month_year).
  2. Initialize the necessary columns:

    • Define the periods of interest (“3_months”, “6_months”, “120_months”).
    • Define the variables of interest (“pet”, “pr”, “tas”, “tasmax”, “tasmin”).
    • Initialize columns for means and standard deviations for each period and each variable.
  3. Main loop through periods and variables:

    • Iterate over each variable of interest.
    • Iterate over each “month_Year” period.
      • Load the rasters corresponding to the 120-month period.
      • Extract raster data for the specified GPS points.
      • Convert units according to the variable.
      • Select sub-periods of 3, 6, and 120 months.
      • Calculate means and standard deviations for each period.
  4. Save the updated data

  5. Merging database and climat database

Code
# Ajout variables du climate  (voir script Add_climate_dats.R)
climate_datas <- read.xlsx("datas/climate_datas.xlsx" , sheet = "Sheet 1")
climate_datas <- subset(climate_datas, select = -c(Programme,Annee,Date_Prelevement,ID_Site,gps_x, gps_y))


rows_not_in_climate_datas <- anti_join(landworm, climate_datas, by = "ID_2")
rows_not_in_landworm <- anti_join(climate_datas,landworm, by = "ID_2")

# plot(climate_datas$ID_2, landworm$ID_2)

merged_df <- merge(landworm, climate_datas, by = "ID_2")
ids_not_matching <- anti_join( merged_df,landworm, by = "ID_2")

# colSums(is.na(climate_datas))

nn = nrow(merged_df)

merged_df = drop_na(merged_df,names(climate_datas))

p_sans_climat = nn - nrow(merged_df)

landworm = merged_df

0 plots have at least one NA in the climate variables, so I am removing them, and the database goes from 3882 to 3882.

6.3 Average annual air temperature (°C) = tas

::: {.panel-tabset}

6.4 1. At 3 months

  • Mean

The database therefore changes from 3882 to 3876 observations.

  • Standard deviation

  • The database therefore changes from 3876 to 3876 observations.

6.5 2. At 6 months

  • Mean

  • The database therefore changes from 3876 to 3876 observations.

  • Standard deviation

  • The database therefore changes from 3876 to 3876 observations.

6.6 3. At 120 months

  • Mean

  • The database therefore changes from 3876 to 3870 observations.

  • Standard deviation

  • The database therefore changes from 3870 to 3870 observations.

:::

6.7 Mean daily maximum 2m air temperature (°C) = tasmax

  • Mean

The database therefore changes from 3870 to 3849 observations.

  • Standard deviation

  • The database therefore changes from 3849 to 3849 observations.
  • Mean

  • The database therefore changes from 3849 to 3849 observations.

  • Standard deviation

  • The database therefore changes from 3849 to 3849 observations.
  • Mean

  • The database therefore changes from 3849 to 3849 observations.

  • Standard deviation

  • The database therefore changes from 3849 to 3849 observations.

6.8 Mean daily maximum 2m air temperature (°C) = tasmin

  • Mean

The database therefore changes from 3849 to 3849 observations.

  • Standard deviation

  • The database therefore changes from 3849 to 3849 observations.
  • Mean

  • The database therefore changes from 3849 to 3849 observations.

  • Standard deviation

  • The database therefore changes from 3849 to 3849 observations.
  • Mean

  • The database therefore changes from 3849 to 3847 observations.

  • Standard deviation

  • The database therefore changes from 3847 to 3847 observations.

6.9 Annual precipitation (mm/month) = pr

  • Mean

The database therefore changes from 3847 to 3826 observations.

  • Standard deviation

  • The database therefore changes from 3826 to 3822 observations.
  • Mean

  • The database therefore changes from 3822 to 3803 observations.

  • Standard deviation

  • The database therefore changes from 3803 to 3803 observations.
  • Mean

  • The database therefore changes from 3803 to 3793 observations.

  • Standard deviation

  • The database therefore changes from 3793 to 3784 observations.

6.10 Potential evapotranspiration (mm/month) => pet

  • Mean

The database therefore changes from 3784 to 3784 observations.

  • Standard deviation

  • The database therefore changes from 3784 to 3784 observations.
  • Mean

  • The database therefore changes from 3784 to 3550 observations.

  • Standard deviation

  • The database therefore changes from 3550 to 3545 observations.
  • Mean

  • The database therefore changes from 3545 to 3523 observations.

  • Standard deviation

  • The database therefore changes from 3523 to 3521 observations.

7 Exploratory analysis

Data set reduction:

  • 3 Response variables: Abundance, biomass, and richness

  • 41 Explanatory variables (including 30 climatic, 10 pedological, and land use)

Code
id_col=c("ID_2","Programme","Annee","ID_Site","Protocole")

vdt_col=c("AB_tot", "BM_tot", "Richesse_tot")

os_and_gps = c('clcm_lvl3','gps_x', 'gps_y')

soils_variables = c("sable.0_30","limon.0_30","argile.0_30",'jrc_pH_H2O','N','P','K','CN', 'CaCO3',"c_orga_0_a_30")

climates_variables = c("tas_mean_3","tas_mean_6","tas_mean_120",
                      "tas_sd_3","tas_sd_6","tas_sd_120",
                      
                      "tasmax_mean_3","tasmax_mean_6","tasmax_mean_120",
                      "tasmax_sd_3","tasmax_sd_6","tasmax_sd_120",
                      
                      "tasmin_mean_3","tasmin_mean_6","tasmin_mean_120",
                      "tasmin_sd_3","tasmin_sd_6","tasmin_sd_120",
                      
                      "pr_mean_3","pr_mean_6","pr_mean_120",
                      "pr_sd_3","pr_sd_6","pr_sd_120",
                      
                      "pet_mean_3","pet_mean_6","pet_mean_120",
                      "pet_sd_3","pet_sd_6","pet_sd_120")

landworm_explo = landworm[,c(id_col,vdt_col,os_and_gps,soils_variables,climates_variables)]

landworm_explo <- landworm_explo %>%
  dplyr::rename(
    Sand = sable.0_30,
    Silt = limon.0_30,
    Clay = argile.0_30,
    pH = jrc_pH_H2O,
    C_org = c_orga_0_a_30,
    LC_ = clcm_lvl3,
    Long = gps_x ,
    Lat =  gps_y
  )



cl_original <- levels(landworm_explo$LC_)
new_cl <- c("For","Gua", "Nag", "Nial", "Pmo", "Viny")
landworm_explo$LC_ <- factor(landworm_explo$LC_, levels = cl_original, labels = new_cl)



soils_variables = c("Sand","Silt","Clay",'pH','N','P','K','CN', 'CaCO3',"C_org")
variables_factor = c("LC_")
all_predictors = c(variables_factor,soils_variables,climates_variables)

all_predictors_num = all_predictors[!all_predictors %in% c(variables_factor)]

Il reste donc 3521 lignes et 51 colonnes.

7.1 Correlation matrix

  • All predictors

  • Correlation < |0.07| = 0

Séparation entre les variables pédologiques et climatiques

a). Variables pédologiques

Conclusion: je supprime Silt et je garde Sand. De même je suprime CaCO3 et je garde pH.

b). Variables climatiques:

On enlève d’abord les variables sd:

pet_sd_120, pet_sd_3, pr_sd_120, pr_sd_6, tasmin_sd_120, tasmin_sd_6, tasmin_sd_3, tasmax_sd_120, tasmax_sd_6, tasmax_sd_3

Maintenat on enleve les variables qui ont le plus de correlation

On enleve donc:

pet_mean_3, pet_mean_120, tas_mean_3, tasmin_mean_3, tasmin_mean_6, tasmin_mean_120, tasmax_mean_6, tasmax_mean_3, tasmax_mean_120, pr_mean_6

Synthese

Il reste donc 18 variables explicatives non correlee

=> 22 variables supprimees

7.2 VIF

No variable from the 18 input variables has collinearity problem. 

The linear correlation coefficients ranges between: 
min correlation ( pr_sd_3 ~ P ):  -0.0006721748 
max correlation ( tas_sd_6 ~ tas_mean_6 ):  0.6810085 

---------- VIFs of the remained variables -------- 
      Variables      VIF
1          Sand 2.352594
2          Clay 3.059920
3            pH 2.802590
4             N 2.796920
5             P 2.056419
6             K 1.389062
7            CN 2.948456
8         C_org 1.847050
9    tas_mean_6 7.077602
10 tas_mean_120 3.282206
11     tas_sd_3 3.032422
12     tas_sd_6 8.558721
13   tas_sd_120 3.927902
14    pr_mean_3 3.725966
15  pr_mean_120 2.572277
16      pr_sd_3 2.058873
17   pet_mean_6 5.740856
18     pet_sd_6 5.842996

7.3 Relationship between the abundance and the predictors

- AB_tot & LC_

-  AB_tot & Sand

-  AB_tot & Silt

-  AB_tot & Clay

-  AB_tot & pH

-  AB_tot & N

-  AB_tot & P

-  AB_tot & K

-  AB_tot & CN

-  AB_tot & CaCO3

-  AB_tot & C_org

-  AB_tot & tas_mean_3

-  AB_tot & tas_mean_6

-  AB_tot & tas_mean_120

-  AB_tot & tas_sd_3

-  AB_tot & tas_sd_6

-  AB_tot & tas_sd_120

-  AB_tot & tasmax_mean_3

-  AB_tot & tasmax_mean_6

-  AB_tot & tasmax_mean_120

-  AB_tot & tasmax_sd_3

-  AB_tot & tasmax_sd_6

-  AB_tot & tasmax_sd_120

-  AB_tot & tasmin_mean_3

-  AB_tot & tasmin_mean_6

-  AB_tot & tasmin_mean_120

-  AB_tot & tasmin_sd_3

-  AB_tot & tasmin_sd_6

-  AB_tot & tasmin_sd_120

-  AB_tot & pr_mean_3

-  AB_tot & pr_mean_6

-  AB_tot & pr_mean_120

-  AB_tot & pr_sd_3

-  AB_tot & pr_sd_6

-  AB_tot & pr_sd_120

-  AB_tot & pet_mean_3

-  AB_tot & pet_mean_6

-  AB_tot & pet_mean_120

-  AB_tot & pet_sd_3

-  AB_tot & pet_sd_6

-  AB_tot & pet_sd_120

7.4 Relationship between the biomass and the predictors

- BM_tot & LC_

-  BM_tot & Sand

-  BM_tot & Silt

-  BM_tot & Clay

-  BM_tot & pH

-  BM_tot & N

-  BM_tot & P

-  BM_tot & K

-  BM_tot & CN

-  BM_tot & CaCO3

-  BM_tot & C_org

-  BM_tot & tas_mean_3

-  BM_tot & tas_mean_6

-  BM_tot & tas_mean_120

-  BM_tot & tas_sd_3

-  BM_tot & tas_sd_6

-  BM_tot & tas_sd_120

-  BM_tot & tasmax_mean_3

-  BM_tot & tasmax_mean_6

-  BM_tot & tasmax_mean_120

-  BM_tot & tasmax_sd_3

-  BM_tot & tasmax_sd_6

-  BM_tot & tasmax_sd_120

-  BM_tot & tasmin_mean_3

-  BM_tot & tasmin_mean_6

-  BM_tot & tasmin_mean_120

-  BM_tot & tasmin_sd_3

-  BM_tot & tasmin_sd_6

-  BM_tot & tasmin_sd_120

-  BM_tot & pr_mean_3

-  BM_tot & pr_mean_6

-  BM_tot & pr_mean_120

-  BM_tot & pr_sd_3

-  BM_tot & pr_sd_6

-  BM_tot & pr_sd_120

-  BM_tot & pet_mean_3

-  BM_tot & pet_mean_6

-  BM_tot & pet_mean_120

-  BM_tot & pet_sd_3

-  BM_tot & pet_sd_6

-  BM_tot & pet_sd_120

7.5 Relationship between the richness and the predictors

- Richesse_tot & LC_

-  Richesse_tot & Sand

-  Richesse_tot & Silt

-  Richesse_tot & Clay

-  Richesse_tot & pH

-  Richesse_tot & N

-  Richesse_tot & P

-  Richesse_tot & K

-  Richesse_tot & CN

-  Richesse_tot & CaCO3

-  Richesse_tot & C_org

-  Richesse_tot & tas_mean_3

-  Richesse_tot & tas_mean_6

-  Richesse_tot & tas_mean_120

-  Richesse_tot & tas_sd_3

-  Richesse_tot & tas_sd_6

-  Richesse_tot & tas_sd_120

-  Richesse_tot & tasmax_mean_3

-  Richesse_tot & tasmax_mean_6

-  Richesse_tot & tasmax_mean_120

-  Richesse_tot & tasmax_sd_3

-  Richesse_tot & tasmax_sd_6

-  Richesse_tot & tasmax_sd_120

-  Richesse_tot & tasmin_mean_3

-  Richesse_tot & tasmin_mean_6

-  Richesse_tot & tasmin_mean_120

-  Richesse_tot & tasmin_sd_3

-  Richesse_tot & tasmin_sd_6

-  Richesse_tot & tasmin_sd_120

-  Richesse_tot & pr_mean_3

-  Richesse_tot & pr_mean_6

-  Richesse_tot & pr_mean_120

-  Richesse_tot & pr_sd_3

-  Richesse_tot & pr_sd_6

-  Richesse_tot & pr_sd_120

-  Richesse_tot & pet_mean_3

-  Richesse_tot & pet_mean_6

-  Richesse_tot & pet_mean_120

-  Richesse_tot & pet_sd_3

-  Richesse_tot & pet_sd_6

-  Richesse_tot & pet_sd_120

7.6 Total abundance distributions


  • Transformation sqrt

lamda = 0.3




7.7 Total biomass distributions


  • Transformation sqrt

lamda = 0.4




7.8 Total taxonomic richness distributions


  • Transformation sqrt

lamda = 0.6




7.9 Standarization

  • Transformation sqrt de l’abondance et de la biomasse

  • Transformation centrée reduite des prédicteurs

8 Modeling

8.1 Predictors



All predictors: => RF, GBM et ANN


 [1] "LC_"             "Sand"            "Silt"            "Clay"           
 [5] "pH"              "N"               "P"               "K"              
 [9] "CN"              "CaCO3"           "C_org"           "tas_mean_3"     
[13] "tas_mean_6"      "tas_mean_120"    "tas_sd_3"        "tas_sd_6"       
[17] "tas_sd_120"      "tasmax_mean_3"   "tasmax_mean_6"   "tasmax_mean_120"
[21] "tasmax_sd_3"     "tasmax_sd_6"     "tasmax_sd_120"   "tasmin_mean_3"  
[25] "tasmin_mean_6"   "tasmin_mean_120" "tasmin_sd_3"     "tasmin_sd_6"    
[29] "tasmin_sd_120"   "pr_mean_3"       "pr_mean_6"       "pr_mean_120"    
[33] "pr_sd_3"         "pr_sd_6"         "pr_sd_120"       "pet_mean_3"     
[37] "pet_mean_6"      "pet_mean_120"    "pet_sd_3"        "pet_sd_6"       
[41] "pet_sd_120"     


No correlation: => GLM et GAM |0.7|


 [1] "LC_"          "Sand"         "Clay"         "pH"           "N"           
 [6] "P"            "K"            "CN"           "C_org"        "tas_mean_6"  
[11] "tas_mean_120" "tas_sd_3"     "tas_sd_6"     "tas_sd_120"   "pr_mean_3"   
[16] "pr_mean_120"  "pr_sd_3"      "pet_mean_6"   "pet_sd_6"    

See Correlation matrix

8.2 Variable selection Process


Initialization


Model creation and evaluation Loop

For each iteration i up to length(variables) - 1:

  1. Model creation

    • We fit a model using all available predictor variables.
  2. Calculation of variable importance

    • We use the permutation feature importance algorithm based on Fisher, Rudin, and Dominici (2018):
      • Input: model \(\hat{F}\), variables \(X\), target \(y\), error measure \(L(y, \hat{F}(X))\)
      • Estimate the original model error \(L_{original}\).
      • For each variable \(j\):
        1. Permute \(j\) in \(X\) to obtain \(X_{perm}\), thus breaking the association between \(j\) and \(y\).
        2. Estimate the error \(L_{perm}\) with \(X_{perm}\).
        3. Calculate the importance \(FI_j = L_{perm} / L_{original}\).
      • Sort the variables by decreasing FI.
  3. Determination of the least important variable

  4. Predictions on Test Data

  5. Calculation of performance metrics

    • RMSE, adjusted for training, for testing, and MAE.
  6. Update of Train and Test datasets

    • Remove the least important variable from train & test.

Recording of Results

8.3 Data preparation

Code
# AB_tot --------------------------------------------------------------------------
df_mod_AB_tot = data_deep[,c("AB_tot",predictor_deep, LC_levels)]
df_mod_AB_tot = drop_na(df_mod_AB_tot)
df_mod_AB_tot = droplevels(df_mod_AB_tot)


# Partition
# set.seed(123)
# ind <- sample(2, nrow(df_mod_AB_tot), replace = T, prob = c(.8, .2))
# AB_tot_train <- df_mod_AB_tot[ind==1,]
# AB_tot_test <- df_mod_AB_tot[ind==2,]

set.seed(42)
split <- rsample::initial_split(df_mod_AB_tot,prop = 0.8, strata = "AB_tot")
AB_tot_train <- rsample::training(split)
AB_tot_test <- rsample::testing(split)

write.csv2(x =AB_tot_train,file = "datas/landW/AB_tot_train.csv", row.names = FALSE)
write.csv2(x =AB_tot_test,file = "datas/landW/AB_tot_test.csv", row.names = FALSE)
# 
# AB_tot_train = read.csv2("datas/landW/AB_tot_train.csv")
# AB_tot_test = read.csv2("datas/landW/AB_tot_test.csv")
# df_mod_AB_tot = rbind(AB_tot_train,AB_tot_test)

AB_tot_train = as.data.frame(AB_tot_train)
AB_tot_test = as.data.frame(AB_tot_test)

df <- data.frame(y =AB_tot_train[,"AB_tot"])
abundance_dist_train = ggplot(df, aes(x=y)) +
geom_histogram( fill="#69b3a2", color="#e9ecef", bins=30, alpha=2) +
  geom_density(fill="black", alpha=0.2) +
  theme_gray() +
  labs(title="Abundance: Train", x="Value", y="Density") +
  theme(plot.title = element_text(hjust = 0.5))
# ggsave("Results/landW/abundance_dist_train.png", plot = abundance_dist_train, dpi = 300,width = 3,height = 2)

df <- data.frame(y =AB_tot_test[,"AB_tot"])
abundance_dist_test = ggplot(df, aes(x=y)) +
geom_histogram( fill="#69b3a2", color="#e9ecef", bins=30, alpha=2) +
  geom_density(fill="black", alpha=0.2) +
  theme_gray() +
  labs(title="Abundance: Test", x="Value", y="Density") +
  theme(plot.title = element_text(hjust = 0.5))
# ggsave("Results/landW/abundance_dist_test.png", plot = abundance_dist_test, dpi = 300,width = 3,height = 2)

# Distrvitbution de var rep dans train et de test: est ce homogene ?
abundance_dist_train_and_test = ggarrange(abundance_dist_train, abundance_dist_test,
                          labels = c('(a)', '(b)'),
                          common.legend = TRUE,
                          legend = 'right'
)


ggsave("Results/landW/abundance_dist_train_and_test.png", plot = abundance_dist_train_and_test, dpi = 300,height = 2,width = 4)

Abundance

  • Data partition (3521, 48):

    • train data (80 %) = 2816, 48

    • test data (20 %) = 705, 48

Biomasse

  • Data partition (2304, 48):

    • train data (80 %) = 1840, 48

    • test data (20 %) = 464, 48

Richness

  • Data partition (3521, 48):

    • train data (80 %) = 2816, 48

    • test data (20 %) = 705, 48

8.4 GLM

Code
GLM <- function(var_rep, df_app, df_valid,family = 'gaussian'){
  
  
  var_predicteurs = names(df_app[,-1])
 
  df_app = df_app[,c(var_rep,var_predicteurs)]
  df_valid = df_valid[,c(var_rep,var_predicteurs)]
  
  formula <- substitute(var_rep ~ ., list(var_rep = as.name(var_rep)))
  
  
  # entrainement du modele sur le jeu d'entrainement
  modelglm<-glm(formula,family = family ,data = df_app)
  
  # Prediction sur le jeu de validation
  pred.GLM<-predict(modelglm,newdata=as.data.frame(df_valid[,var_predicteurs]))
  
  # Calcul du RMSE pour évaluer la qualite du modele
  rmse <- round (sqrt(mean((df_valid[,var_rep] - pred.GLM)^2,na.rm=TRUE)),2)
  
  
 # Calcul du R² ajusté pour train
  R_adj_train <- calcule_R2(df_app[,var_rep],  predict(modelglm, newdata=df_app))
  n_train <- nrow(df_app)
  p_train <- ncol(df_app) - 1
  r_adj_train <- 1 - ((1 - R_adj_train) * (n_train - 1) / (n_train - p_train - 1))
  
  # Calcul du R² 
  # R_adj_test <-calcule_R2(df_valid[,var_rep],pred.GLM)
  # n_test <- nrow(df_valid)
  # p_test <- ncol(df_valid) - 1
  # r_adj_test <- 1 - ((1 - R_adj_test) * (n_test - 1) / (n_test - p_test - 1))
  
  res <- rms::lrm(df_valid[,var_rep]  ~ pred.GLM, x= TRUE, y = TRUE)
  res = res$stats
  r_adj_test = round (res[["R2"]],2)
  
  MAE <- mean(abs(pred.GLM - df_valid[,var_rep]),na.rm=TRUE)
  
  # Round results
  rmse <- round(rmse, 2)
  r_adj_train <- round(r_adj_train, 2)
  r_adj_test <- round(r_adj_test, 2)
  MAE <- round(MAE, 2)
  
  # output
  results_df <- data.frame(Algorithms = "GLM",
                         Response_variables = var_rep,
                         R2_adjusted_train = r_adj_train,
                         R2_adjusted_test = r_adj_test,
                         RMSE = rmse,
                         MAE = MAE)
    
  
  results <- list(RMSE = rmse, R_adj_train = r_adj_train, R_adj_test = r_adj_test, MAE = MAE, model = modelglm,predit = pred.GLM, df = results_df)
  return(results)
}
  • Gaussian distribution

8.5 GAM

Code
GAM <- function(var_rep, df_app, df_valid, family = 'gaussian',method = "REML", interaction = FALSE){
  
  var_predicteurs = names(df_app[,-1])
  
  
  if (var_rep == "AB_tot"){ 

  modelgam<-gam(AB_tot ~ s(CaCO3) + s(gps_x) + s(N) + s(tasmax) + s(gps_y) + s(clay) + s(silt) + s(P) + s(bio12) + clcm_lvl3f + clcm_lvl3gua + clcm_lvl3ng + clcm_lvl3nial + clcm_lvl3p + clcm_lvl3v,
        family=family,method = method,data = df_app)
  
  }
  
  
  
  
  if (var_rep == "BM_tot"){ 

  modelgam<-gam(BM_tot ~ s(CaCO3) + s(gps_x) + s(N) + s(tasmax) + s(gps_y) + s(clay) + s(silt) + s(P) + s(bio12) + clcm_lvl3f + clcm_lvl3gua + clcm_lvl3ng + clcm_lvl3nial + clcm_lvl3p + clcm_lvl3v,
        family=family,method = method,data = df_app)
  
    
  }
  
  
  
  if(var_rep == "Richesse_tot"){ 
    
  modelgam<-gam(Richesse_tot ~ s(CaCO3) + s(gps_x) + s(N) + s(tasmax) + s(gps_y) + s(clay) + s(silt) + s(P) + s(bio12) + clcm_lvl3f + clcm_lvl3gua + clcm_lvl3ng + clcm_lvl3nial + clcm_lvl3p + clcm_lvl3v ,
        family=family,method = method,data = df_app)
   
  }
  
  
  # Prediction sur le jeu de validation
  pred.GAM <- predict(modelgam,newdata=as.data.frame(df_valid[,var_predicteurs]))
  
  # Calcul du RMSE pour évaluer la qualite du modele
  rmse <- sqrt(mean((df_valid[,var_rep] - pred.GAM)^2,na.rm=TRUE))

  
# Calcul du R² ajusté pour train
  R_adj_train <- calcule_R2(df_app[,var_rep],  predict(modelgam, newdata=df_app))
  n_train <- nrow(df_app)
  p_train <- ncol(df_app) - 1
  r_adj_train <- 1 - ((1 - R_adj_train) * (n_train - 1) / (n_train - p_train - 1))
  
  # Calcul du R² ajusté pour test
  # R_adj_test <-calcule_R2(df_valid[,var_rep],pred.GAM)
  # n_test <- nrow(df_valid)
  # p_test <- ncol(df_valid) - 1
  # r_adj_test <- 1 - ((1 - R_adj_test) * (n_test - 1) / (n_test - p_test - 1))
    res <- rms::lrm(df_valid[,var_rep]  ~ pred.GAM, x= TRUE, y = TRUE)
  res = res$stats
  r_adj_test = round (res[["R2"]],2)
  

  # Calcule le MAE
  MAE <- mean(abs(pred.GAM - df_valid[,var_rep]))
  
  # Round results
  rmse <- round(rmse, 2)
  r_adj_train <- round(r_adj_train, 2)
  r_adj_test <- round(r_adj_test, 2)
  MAE <- round(MAE, 2)
  
  
  # output
  results_df <- data.frame(Algorithms = "GAM",
                         Response_variables = var_rep,
                         R2_adjusted_train = r_adj_train,
                         R2_adjusted_test = r_adj_test,
                         RMSE = rmse,
                         MAE = MAE)
  
  
  results <- list(RMSE = rmse, R_adj_train = r_adj_train, R_adj_test = r_adj_test, MAE = MAE, model = modelgam, predit = pred.GAM, df = results_df)
  
  return(results)
}
  • Family = gaussian

  • Link function = identity

  • Method = REML

  • Tuning

8.6 RF

  • Default model
  • RF model tuning by grid

  • ntree = \(100,300,500,700,900,1000,1300,1500,1700,2000\)

  • mtry = \(2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24\)

  • maxnodes = \(10 , 20, 30, 40, 50, 60, 70, 80, 90, 100\)

Total number of models = \(ntree * mtry * maxnode = 960\)

  • Validation of models on test data
Code
ForetAlea <- function(var_rep, df_app, df_valid, mtry, ntree, maxnodes) {
  
  set.seed(1863)
  col_posi <- which(names(df_app) == var_rep)
  ForeVDT <- randomForest::randomForest(df_app[-col_posi], df_app[[col_posi]], mtry = mtry, ntree = ntree, maxnodes = maxnodes)
  
  # Prediction on the validation dataset
  col_posi <- which(names(df_valid) == var_rep)
  pred.RF <- predict(ForeVDT, newdata = df_valid[, -col_posi])
  
  # Calculate RMSE to evaluate model quality
  rmse <- sqrt(mean((df_valid[, col_posi] - pred.RF)^2))
  
  
  # Calcul du R² ajusté pour train
  R_adj_train <- calcule_R2(df_app[,var_rep],  predict(ForeVDT, newdata=df_app))
  n_train <- nrow(df_app)
  p_train <- ncol(df_app) - 1
  r_adj_train <- 1 - ((1 - R_adj_train) * (n_train - 1) / (n_train - p_train - 1))
  
  # Calcul du R² ajusté pour test
  # R_adj_test <-calcule_R2(df_valid[,col_posi],pred.RF)
  # n_test <- nrow(df_valid)
  # p_test <- ncol(df_valid) - 1
  # r_adj_test <- 1 - ((1 - R_adj_test) * (n_test - 1) / (n_test - p_test - 1))
  res <- rms::lrm(df_valid[,var_rep]  ~ pred.RF, x= TRUE, y = TRUE)
  res = res$stats
  r_adj_test = round (res[["R2"]],2)
  
  # Calculate MAE
  MAE <- mean(abs(pred.RF - df_valid[, col_posi]))
  
  # Round results
  rmse <- round(rmse, 2)
  r_adj_train <- round(r_adj_train, 2)
  r_adj_test <- round(r_adj_test, 2)
  MAE <- round(MAE, 2)
  
    # output
  results_df <- data.frame(Algorithms = "RF",
                         Response_variables = var_rep,
                         R2_adjusted_train = r_adj_train,
                         R2_adjusted_test = r_adj_test,
                         RMSE = rmse,
                         MAE = MAE)
  
  
  results <- list(RMSE = rmse, R_adj_train = r_adj_train, R_adj_test = r_adj_test, MAE = MAE, model = ForeVDT, predit = pred.RF, df = results_df)
  
  return(results)
}

8.7 GBM

  • Default model

  • GBM model tuning by grid

  • n.trees = \(1000, 1500, 1700, 2000, 3000\)

  • shrinkage = \(0.01, 0.02, 0.05, 0.001, 0.002, 0.005\)

  • interaction.depth = \(3, 5, 6, 8, 10\)

  • n.minobsinnode = \(2, 5, 10, 30, 50, 70\)

Total number of models = \(n.trees * shrinkage * interaction.depth * n.minobsinnode = 900\)

  • Validation of models on test data
Code
GBM <- function(var_rep, df_app, df_valid,distribution = 'gaussian',n.trees ,shrinkage,interaction.depth,n.minobsinnode){

  formula <- substitute(var_rep ~ ., list(var_rep = as.name(var_rep)))

  Gradboost<-gbm(formula, data = df_app,
    distribution = distribution, 
    n.trees = n.trees,
    shrinkage = shrinkage,
    interaction.depth = interaction.depth,
    n.minobsinnode = n.minobsinnode) 
  
  # Prediction sur le jeu de validation :
   col_posi <- which(names(df_valid) == var_rep)
  prev.GBM<-predict(Gradboost,newdata=as.data.frame(df_valid[,-col_posi]))
 
  # Calcul du RMSE pour évaluer la qualité du modele
  rmse <- sqrt(mean((df_valid[,var_rep] - prev.GBM)^2))


# Calcul du R² ajusté pour train
  R_adj_train <- calcule_R2(df_app[,var_rep],  predict(Gradboost, newdata=df_app))
  n_train <- nrow(df_app)
  p_train <- ncol(df_app) - 1
  r_adj_train <- 1 - ((1 - R_adj_train) * (n_train - 1) / (n_train - p_train - 1))
  
  # Calcul du R² ajusté pour test
  # R_adj_test <-calcule_R2(df_valid[,col_posi],prev.GBM)
  # n_test <- nrow(df_valid)
  # p_test <- ncol(df_valid) - 1
  # r_adj_test <- 1 - ((1 - R_adj_test) * (n_test - 1) / (n_test - p_test - 1))
    res <- rms::lrm(df_valid[,var_rep]  ~ prev.GBM, x= TRUE, y = TRUE)
  res = res$stats
  r_adj_test = round (res[["R2"]],2)
  

  # calcule MAE
  MAE <- mean(abs(prev.GBM - df_valid[,col_posi])) 
  
    
    # Round results
  rmse <- round(rmse, 2)
  r_adj_train <- round(r_adj_train, 2)
  r_adj_test <- round(r_adj_test, 2)
  MAE <- round(MAE, 2)
  
  
      # output
  results_df <- data.frame(Algorithms = "GBM",
                         Response_variables = var_rep,
                         R2_adjusted_train = r_adj_train,
                         R2_adjusted_test = r_adj_test,
                         RMSE = rmse,
                         MAE = MAE)
  
  
  results <- list(RMSE = rmse, R_adj_train = r_adj_train, R_adj_test = r_adj_test, MAE = MAE, model = Gradboost, predit = prev.GBM, df = results_df)
  
  
  return(results)
}

8.8 ANN

8.9 Compilation

  • ANN AB_tot
  • ANN BM_tot
  • ANN Richesse_tot

Summary of Compilations

  • For GLM and GAM, there were 19 explanatory variables => \(18*5*3*2 = 540\)

  • For RF, GBM, and ANN, there were 41 variables => \(40*5*3*3 = 1800\)

Excluding optimization models

9 R² ajustés

9.1 R² dans les modèles de régression non linéaire

“Le R² ajusté est utile pour les modèles de régression linéaire, il est moins pertinent pour les modèles de machine learning en raison de la complexité des modèle.”

La formule du coefficient de détermination \(R^2\) est :

\[\begin{equation} R^2 = 1 - \frac{\sum_{i=1}^{n} (y_i - \hat{y}_i)^2}{\sum_{i=1}^{n} (y_i - \bar{y})^2} \end{equation}\]

où :

  • $ y_i $ est la valeur observée,
  • $ _i $ est la valeur prédite par le modèle,
  • $ {y} $ est la moyenne des valeurs observées,
  • $ n $ est le nombre total d’observations.

La formule du \(R²\) ajusté est :

\[\begin{equation} R^2_{\text{ajusté}} = 1 - \left( \frac{(1 - R^2) \cdot (n - 1)}{n - k - 1} \right) \end{equation}\]

où :

  • \(R²\) est le coefficient de détermination,
  • \(n\) est le nombre total d’observations,
  • \(k\) est le nombre de prédicteurs (variables indépendantes) dans le modèle.

Justificatifs

See towardsdatascience.com

“La procédure d’ajustement du modèle des models non linéaires ne repose pas sur la minimisation progressive de la somme des carrés des erreurs résiduelles (RSS) et, par conséquent, le modèle ajusté de manière optimale pourrait avoir une somme des carrés résiduels supérieure à la somme totale des carrés. Cela signifie que R² pour de tels modèles peut être une quantité négative. En tant que tel, R² n’est pas une mesure utile de la qualité de l’ajustement pour la plupart des modèles non linéaires.”

“Les modèles non linéaires utilisent souvent des techniques d’ajustement de modèle telles que l’estimation de vraisemblance maximale (MLE) qui ne minimisent pas nécessairement la somme des carrés résiduels (RSS). Ainsi, étant donné deux modèles non linéaires ajustés à l’aide de MLE, celui qui présente la meilleure qualité d’ajustement peut se révéler avoir un R² ou un R² ajusté plus faible. Une autre conséquence de ce fait est que l’ajout de variables de régression aux modèles non linéaires peut réduire le R². Globalement, le R² ou le R² ajusté ne doivent pas être utilisés pour juger de la qualité d’ajustement d’un modèle de régression non linéaire.”

“De nombreux modèles de régression non linéaires n’utilisent pas la technique d’estimation des moindres carrés ordinaires pour ajuster le modèle.”

10 LandWorm results: total abundance

Abundance: best models

10.1 Abundance: GLM

The best model of GLM is :

\(mod_11 = glm(AB_tot ~ pet_sd_6 + tas_sd_6 + pr_mean_120 + P + Clay + \\ Sand + CN + pH + LC_, df = data)\)

10.2 Abundance: GAM

The best model of gam is:

\(mod_7 = gam(AB_tot ~ tas_sd_6 + tas_sd_3 + Sand + tas_sd_120 +\\ K + pet_sd_6 + pH + CN + P + tas_mean_6 + pr_mean_3 + tas_mean_120 + LC_, df = data)\)

10.3 Abundance: RF

The best model of RF is:

\(mod_28 = rf(AB_tot ~ CN + pr_mean_6 + pet_mean_3 + K + tasmin_sd_6 + \\ tasmin_mean_120 + tasmin_mean_6 + Clay + pr_sd_120 + tas_mean_120 + CaCO3 + \\tas_sd_120 + pH + 4LC_ , df = data)\)

10.4 Abundance: GBM

The best model of GBM is:

\(mod_17 = gbm(AB_tot ~ tasmin_sd_6 + tasmax_mean_6 + Sand + pr_mean_120 + tas_mean_6 +\\ pet_sd_3 + CaCO3 + tasmin_mean_120 + CN + K + tasmax_mean_120 + pet_sd_6 + pr_sd_6 +\\ P + Silt + tas_sd_6 + pr_mean_6 + pet_mean_3 + tasmin_mean_6 + pr_sd_120 + Clay + pH + \\ tas_mean_120 + pr_mean_3 + LC_, df = data)\)

10.5 Abundance: ANN

The best model of ANN is:

\(mod_27 = ann(AB_tot ~ pr_sd_6 + pH + P + pr_sd_120 + K + tasmin_sd_3 + \\ pet_sd_3 + pr_sd_3 + pet_mean_120 + tas_sd_120 + CaCO3 + Silt + tasmax_mean_120 + \\ tasmin_sd_120 + LC_ , df = data)\)

10.6 Abundance: best models

Models Rep.var R2_test RMSE MAE
GLM Abundance 0.31 5.65 4.52
GAM Abundance 0.36 5.43 4.28
RF Abundance 0.46 5.00 3.87
GBM Abundance 0.36 5.55 4.39
ANN Abundance 0.33 6.20 4.72

Meta models

Models Rep.var R2_test RMSE MAE
averaging Abundance 0.44 5.29 4.15
Stacking Abundance 0.47 4.94 3.83


GLM

\(mod = glm(AB_tot ~ pet_sd_6 + tas_sd_6 + pr_mean_120 + P + Clay + \\ Sand + CN + pH + LC_, df = data)\)


GAM

\(mod = gam(AB_tot ~ tas_sd_6 + tas_sd_3 + Sand + tas_sd_120 +\\ K + pet_sd_6 + pH + CN + P + tas_mean_6 + pr_mean_3 + tas_mean_120 + LC_, df = data)\)


RF

\(mod = rf(AB_tot ~ CN + pr_mean_6 + pet_mean_3 + K + tasmin_sd_6 + \\ tasmin_mean_120 + tasmin_mean_6 + Clay + pr_sd_120 + tas_mean_120 + CaCO3 + \\tas_sd_120 + pH + 4LC_ , df = data)\)


GBM

\(mod= gbm(AB_tot ~ tasmin_sd_6 + tasmax_mean_6 + Sand + pr_mean_120 + tas_mean_6 +\\ pet_sd_3 + CaCO3 + tasmin_mean_120 + CN + K + tasmax_mean_120 + pet_sd_6 + pr_sd_6 +\\ P + Silt + tas_sd_6 + pr_mean_6 + pet_mean_3 + tasmin_mean_6 + pr_sd_120 + Clay + pH + \\ tas_mean_120 + pr_mean_3 + LC_, df = data)\)


ANN

\(mod = ann(AB_tot ~ pr_sd_6 + pH + P + pr_sd_120 + K + tasmin_sd_3 + \\ pet_sd_3 + pr_sd_3 + pet_mean_120 + tas_sd_120 + CaCO3 + Silt + tasmax_mean_120 + \\ tasmin_sd_120 + LC_ , df = data)\)



Conclusion: le meillleur model pour l’abondance est le RF

11 LandWorm results: total biomass

Biomass: best models

11.1 Biomass: GLM

The best model of GLM is :

\(mod_6 = glm(BM_tot ~ N + pet_mean_6 + pet_sd_6 + tas_sd_120 + P + Sand + \\ Clay + C_org + tas_sd_6 + tas_sd_3 + pr_sd_3 + CN + tas_mean_6 + pH + \\ pr_mean_120 + LC_, df = data)\)

11.2 Biomass: GAM

The best model of gam is:

\(mod_5 = gam(BM_tot ~ P + CN + K + C_org + Sand + tas_sd_3 + pet_sd_6 + pr_sd_3 + tas_sd_6 + tas_mean_120 + pr_mean_3 + pH + pr_mean_120 + LC_ + LC_ , df = data)\)

11.3 Biomass: RF

The best model of RF is:

\(mod_34 = rf(BM_tot ~ pr_mean_120 + Sand + pH + pet_sd_120 + tasmax_sd_3 + LC_ + tasmin_sd_120 + pet_mean_3 , df = data)\)

11.4 Biomass: GBM

The best model of GBM is:

\(mod_25 = gbm(BM_tot ~ pr_mean_6 + pet_sd_6 + tasmax_sd_3 + tasmin_mean_120 + \\P + pr_mean_120 + pr_sd_3 + pH + pr_sd_120 + tasmax_sd_6 + pet_mean_3 + Silt + Sand +\\ tasmax_mean_6 + pr_mean_3 + LC_ + tasmin_sd_3 , df = data)\)

11.5 Biomass: ANN

The best model of ANN is:

\(mod_15 = ann(BM_tot ~ pr_mean_3 + tasmax_mean_120 + pet_mean_120 + pr_sd_6 + \\Sand + tasmin_sd_6 + pet_sd_120 + K + Silt + tasmax_mean_6 + tasmin_sd_3 +\\ pet_sd_6 + pet_mean_3 + tas_mean_6 + tasmax_mean_3 + P + C_org + CN + tas_sd_3 + pr_sd_3 + \\tasmin_sd_120 + pet_sd_3 + tasmin_mean_3 + pr_mean_6 + pr_mean_120 + pH + LC_, df = data)\)

11.6 Biomass: best models

Models Rep.var R2_test RMSE MAE
GLM Biomass 0.39 3.10 2.46
GAM Biomass 0.49 2.86 2.24
RF Biomass 0.56 2.62 2.08
GBM Biomass 0.47 2.91 2.31
ANN Biomass 0.50 3.05 2.35

Meta models

Models Rep.var R2_test RMSE MAE
Averaging Biomass 0.54 2.74 2.15
Stacking Biomass 0.57 2.58 2.05


GLM

\(mod = glm(BM_tot ~ N + pet_mean_6 + pet_sd_6 + tas_sd_120 + P + Sand + \\ Clay + C_org + tas_sd_6 + tas_sd_3 + pr_sd_3 + CN + tas_mean_6 + pH + \\ pr_mean_120 + LC_, df = data)\)


GAM

\(mod = gam(BM_tot ~ P + CN + K + C_org + Sand + tas_sd_3 + pet_sd_6 + pr_sd_3 + tas_sd_6 + tas_mean_120 + pr_mean_3 + pH + pr_mean_120 + LC_ + LC_ , df = data)\)


RF

\(mod = rf(BM_tot ~ pr_mean_120 + Sand + pH + pet_sd_120 + tasmax_sd_3 + LC_ + tasmin_sd_120 + pet_mean_3 , df = data)\)


GBM

\(mod= gbm(BM_tot ~ pr_mean_6 + pet_sd_6 + tasmax_sd_3 + tasmin_mean_120 + \\P + pr_mean_120 + pr_sd_3 + pH + pr_sd_120 + tasmax_sd_6 + pet_mean_3 + Silt + Sand +\\ tasmax_mean_6 + pr_mean_3 + LC_ + tasmin_sd_3 , df = data)\)


ANN

\(mod = ann(BM_tot ~ pr_mean_3 + tasmax_mean_120 + pet_mean_120 + pr_sd_6 + \\Sand + tasmin_sd_6 + pet_sd_120 + K + Silt + tasmax_mean_6 + tasmin_sd_3 +\\ pet_sd_6 + pet_mean_3 + tas_mean_6 + tasmax_mean_3 + P + C_org + CN + tas_sd_3 + pr_sd_3 + \\tasmin_sd_120 + pet_sd_3 + tasmin_mean_3 + pr_mean_6 + pr_mean_120 + pH + LC_, df = data)\)



Conclusion: le meillleur model pour la biomasse est le RF

12 LandWorm results: total Richness

Richness: best models

12.1 Richness: GLM

The best model of GLM is :

\(mod_7 = glm(Richesse_tot ~ N + pr_sd_3 + tas_mean_120 + CN + P + tas_sd_6 + pet_sd_6 + pet_mean_6 + tas_mean_6 + tas_sd_120 + pr_mean_120 + pH + LC_, df = data)\)

12.2 Richness: GAM

The best model of gam is:

\(mod_4 = gam(Richesse_tot ~ Sand + K + pr_sd_3 + tas_sd_3 + CN + tas_sd_120 +\\ P + tas_sd_6 + tas_mean_120 + pet_sd_6 + pet_mean_6 + N + tas_mean_6 + pr_mean_120 +\\ pH + LC_ , df = data)\)

12.3 Richness: RF

The best model of RF is:

\(mod_24 = rf(Richesse_tot ~ pr_sd_3 + tasmin_sd_120 + pr_sd_120 + tasmin_sd_6 + K + tas_mean_120 + pr_mean_6 + Silt + P + CaCO3 + pr_mean_120 + tasmax_sd_3 + tas_sd_6 + tas_sd_120 + tasmax_sd_120 + pH + pet_sd_120 + LC_, df = data)\)

12.4 Richness: GBM

The best model of GBM is:

\(mod_9 = gbm(Richesse_tot ~ pr_mean_3 + Clay + tasmax_mean_6 + pr_mean_120 + tasmax_sd_120 + tasmin_sd_6 + CaCO3 + pr_sd_6 + N + K + pr_mean_6 + pr_sd_120 + Sand + tasmin_sd_3 + P + pH + pet_sd_120 + LC_, df = data)\)

12.5 Richness: ANN

The best model of ANN is:

\(mod_33 = ann(Richesse_tot ~ P + tasmin_sd_120 + pr_sd_3 + tasmin_sd_3 + pr_sd_120 + pr_mean_6 + Silt + pH + LC_, df = data)\)

12.6 Richness: best models

Models Rep.var R2_test RMSE MAE
GLM Richness 0.37 1.92 1.54
GAM Richness 0.48 1.75 1.38
RF Richness 0.57 1.61 1.24
GBM Richness 0.43 1.85 1.48
ANN Richness 0.44 2.13 1.65

Meta models

Models Rep.var R2_test RMSE MAE
Averaging Richness 0.52 1.73 1.36
Stacking Richness 0.58 1.58 1.23


GLM

\(mod = glm(Richesse_tot ~ N + pr_sd_3 + tas_mean_120 + CN + P + tas_sd_6 + pet_sd_6 + pet_mean_6 + tas_mean_6 + tas_sd_120 + pr_mean_120 + pH + LC_, df = data)\)


GAM

\(mod = gam(BM_tot ~ P + CN + K + C_org + Sand + tas_sd_3 + pet_sd_6 + pr_sd_3 + tas_sd_6 + tas_mean_120 + pr_mean_3 + pH + pr_mean_120 + LC_ + LC_ , df = data)\)


RF

\(mod = rf(Richesse_tot ~ pr_sd_3 + tasmin_sd_120 + pr_sd_120 + tasmin_sd_6 + K + tas_mean_120 + pr_mean_6 + Silt + P + CaCO3 + pr_mean_120 + tasmax_sd_3 + tas_sd_6 + tas_sd_120 + tasmax_sd_120 + pH + pet_sd_120 + LC_, df = data)\)


GBM

\(mod= gbm(Richesse_tot ~ pr_mean_3 + Clay + tasmax_mean_6 + pr_mean_120 + tasmax_sd_120 + tasmin_sd_6 + CaCO3 + pr_sd_6 + N + K + pr_mean_6 + pr_sd_120 + Sand + tasmin_sd_3 + P + pH + pet_sd_120 + LC_, df = data)\)


ANN

\(mod = ann(Richesse_tot ~ P + tasmin_sd_120 + pr_sd_3 + tasmin_sd_3 + pr_sd_120 + pr_mean_6 + Silt + pH + LC_, df = data)\)



Conclusion: le meillleur model pour la Richesse_tot est le RF

13 Best of the best models

13.1 Performances

Models Rep.var R2_adj_train R2_test RMSE MAE
RF Abundance 0.83 0.46 5.00 3.87
RF Biomass 0.84 0.56 2.62 2.08
RF Richness 0.87 0.57 1.61 1.24

13.2 Models


Abundance

\(mod = rf(AB_tot ~ CN + pr_mean_6 + pet_mean_3 + K + tasmin_sd_6 + \\ tasmin_mean_120 + tasmin_mean_6 + Clay + pr_sd_120 + tas_mean_120 + CaCO3 + \\tas_sd_120 + pH + 4LC_ , df = data)\)


Biomass

\(mod = rf(BM_tot ~ pr_mean_120 + Sand + pH + pet_sd_120 + tasmax_sd_3 + \\ LC_ + tasmin_sd_120 + pet_mean_3 , df = data)\)


Richness

\(mod = rf(Richesse_tot ~ pr_sd_3 + tasmin_sd_120 + pr_sd_120 + tasmin_sd_6 +\\ K + tas_mean_120 + pr_mean_6 + Silt + P + CaCO3 + pr_mean_120 + tasmax_sd_3 +\\ tas_sd_6 + tas_sd_120 + tasmax_sd_120 + pH + pet_sd_120 + LC_, df = data)\)

13.3 Optimisation des bests models


____________Befor_____________

Models Rep.var R2_adj_train R2_test RMSE MAE
RF Abundance 0.83 0.46 5.00 3.87
RF Biomass 0.84 0.56 2.62 2.08
RF Richness 0.87 0.57 1.61 1.24


____________After_____________

Models Rep.var R2_adj_train R2_test RMSE MAE
RF Abundance 0.82 0.46 4.99 3.87
RF Biomass 0.85 0.56 2.62 2.08
RF Richness 0.87 0.57 1.59 1.23

13.4 Graphes

13.5 Comparaison

14 Importance des variables

14.1 Abundance

14.2 Biomass

14.3 Richness

15 Prediction and mapping

Reading layer `france_shapefile_sans_corse' from data source 
  `C:\Users\diall\OneDrive\Bureau\M2_MODE\stage_abdou_m2\R_Stage_M2\cartographie\france_shapefile_sans_corse.shp' 
  using driver `ESRI Shapefile'
Simple feature collection with 12 features and 4 fields
Geometry type: MULTIPOLYGON
Dimension:     XY
Bounding box:  xmin: -5.141277 ymin: 42.33292 xmax: 8.232497 ymax: 51.08899
Geodetic CRS:  +proj=longlat +datum=WGS84 +no_defs

15.1 Extraction

Datas preparations

  • Land use

  • Soils predictors

  • Climate predictors

15.2 Import best models and predictions

Abundance

Code
# mod_AB_tot = readRDS("cartographie/best_mod/RF_mod_AB_tot.RDS")
# 
# AB_tot_pred<- predict(mod_AB_tot, newdata = predictors_trans)
# AB_tot_pred = round(as.numeric(AB_tot_pred))
# # summary(AB_tot_pred)
# AB_tot_pred = data.frame(Abundance = AB_tot_pred, Longitude = predictors$gps_x, Latitude = predictors$gps_y)
# 
# write.csv(x =AB_tot_pred,file = "cartographie/AB_tot_pred.csv",row.names = FALSE)
# 
# saveRDS(AB_tot_pred, "cartographie/AB_tot_pred.rds")
# 
# AB_tot_pred = readRDS("cartographie/AB_tot_pred.rds")

Biomass

Code
# mod_BM_tot = readRDS("cartographie/best_mod/RF_mod_BM_tot.RDS")
# 
# BM_tot_pred<- predict(mod_BM_tot, newdata = predictors_trans)
# BM_tot_pred = round(as.numeric(BM_tot_pred))
# # summary(BM_tot_pred)
# BM_tot_pred = data.frame(Biomass = BM_tot_pred, Longitude = predictors$gps_x, Latitude = predictors$gps_y)
# 
# write.csv(x =BM_tot_pred,file = "cartographie/BM_tot_pred.csv",row.names = FALSE)
# 
# saveRDS(BM_tot_pred, "cartographie/BM_tot_pred.rds")

# BM_tot_pred = readRDS("cartographie/BM_tot_pred.rds")

Richness

Code
# mod_Richesse_tot = readRDS("cartographie/best_mod/RF_mod_Richesse_tot.RDS")
# 
# Richesse_tot_pred<- predict(mod_Richesse_tot, newdata = predictors_trans)
# Richesse_tot_pred = round(as.numeric(Richesse_tot_pred))
# # summary(Richesse_tot_pred)
# Richesse_tot_pred = data.frame(Richness = Richesse_tot_pred, Longitude = predictors$gps_x, Latitude = predictors$gps_y)
# 
# write.csv(x =Richesse_tot_pred,file = "cartographie/Richesse_tot_pred.csv",row.names = FALSE)
# 
# saveRDS(Richesse_tot_pred, "cartographie/Richesse_tot_pred.rds")
# 
# Richesse_tot_pred = readRDS("cartographie/Richesse_tot_pred.rds")

15.3 Interpolation

Sampling

  • 800 m regular grid

  • Stores gps coordinates

Fig. Interpolation

15.4 Abundance

15.5 Biomass

15.6 Richness

15.7 Approximate earthworm diversity

15.8 Mapview

See Mapping earthworm prediction in France

15.9 Co-kriging

16 Writing

16.1 Cate M&M

16.2 Carte CLC

16.3 Redaction results

17 Questions

Quelles dates de prevelement pour la prediction

17.1 Additional information





All the material from my internship, including scripts and datasets, is available on my GitHub and my Rpubs.

Thank you for your attention