LandWorm 2.0

Modeling and predicting earthworm diversity and distribution in France

Abdourahmane Diallo

2024-07-22

Setting

Packages

Code

  library(tidyverse)
  # library(glme)
  # library(lsmeans)
  # library(agricolae)
  # library(RVAideMemoire)
  library(corrplot)
  # library(emmeans)
  library(lme4)
  library(multcomp)
  library(MASS)
  # library(R2WinBUGS)
  library(arm)
  # library(performance)
  # library(AER)
  # library(AICcmodavg)
  # library(MuMIn)
  library(ade4)
  library(Hmisc)
  library(labdsv)
  library(vegan)
  library(cowplot)
  library(ggpubr)
  library(rstatix)
  library(patchwork)
  library(multcompView)
  library(ggsignif)
  library(grid)
  library(FactoMineR)
  library(factoextra)
  library(explore)
  library(ggrepel)
  library(naniar)
  library(outliers)
  library(leaps)
  library(fastDummies)
  library(caret) # pour l'entrainement des models
  library(mgcv)
  library(ggeffects)
  library(gratia)
  library(GGally) # pour ggpair
  # library(caTools)
  # library(rpart)
  # library(rpart.plot)
  library(openxlsx)
  library(readxl)
  library(leaflet) # pour la carto
  library(quarto)
  library(raster)
  library(knitr)
  library(kableExtra)
  library(stringr)
  library(plotly)
  # library(PerformanceAnalytics)
  # library(usdm)
  library(vcd) # pour la distribution des var reponse
  library(prospectr)# pour split data avec kenSton()
  # library(glmnet)
  library(randomForest)
  # library(doParallel)
  library(gbm)
  library(kernlab)
  # library(e1071)
  library(ggforce)
  library(keras)
  library(tensorflow)
  library(neuralnet)
  # library(parallel)
  library(iml) # pour l'interpretabilité des models https://cran.r-project.org/web/packages/iml/vignettes/intro.html
  library(stats)
  # library(Boruta) # importance des predicteurs
  library(bestNormalize)
  library(rmarkdown)
  library(DT)
  library(gtExtras) # pour la
  library(reshape2)
  # library(mapview)
  library(sf)
  library(ggplot2)
  library(maptools)
  library(ggsn)
  library(spThin)
  library(sp)
  library(gstat)
  library(rms)

Functions

Code

## Identification des NA dans un df -----------------------------------------------
taux_completion<-
  function(df, afficher_zero_percent = FALSE, seuil, trie=FALSE) {
    # Calcule du pourcentage de NA dans le dataframe
    pourcentage_total <-
      round(sum(is.na(df)) / (nrow(df) * ncol(df)) * 100, 1)
    
    # Calcule du pourcentage de NA par colonne
    pourcentage_colonnes <- round(colMeans(is.na(df)) * 100, 1)
    
    # Creation d'un dataframe résultat avec deux colonnes
    result <-
      data.frame(
        Variables = names(df),
        CR = pourcentage_colonnes,
        row.names = NULL
      )
    
    if (afficher_zero_percent) {
      result <- result[result$CR == 0, ]
      result$CR = 100 -result$CR
    } else {
      result <- result[result$CR > 0, ]
      result$CR = 100 -result$CR
      
    }
    
    result <- rbind(result, c("Total", pourcentage_total))
    #result <- rbind(result, c("Total", paste0(pourcentage_total, "")))
    
    result <- result[, c("Variables", "CR")]
    result$CR = as.numeric(result$CR)
    result$CR = round(result$CR,1)
    if (trie){
      result = result %>% arrange(desc(CR))
    }
    result$CR = paste0(result$CR,"%")
    
    return(result)
  }
# Converssion des colonne en num ou factor-----------------------------------------------
conv_col <- function (data, columns_to_convert, to_types) {
  if (to_types == "numeric") {
    # Conversion des colonnes en numeric
    for (col in columns_to_convert) {
      data[, col] <- as.numeric(data[, col])
    }
  } else {
    # Conversion des colonnes en facteurs
    for (col in columns_to_convert) {
      data[, col] <- as.factor(data[, col])
    }
  }
  return(data)
}
#data_converted <- conv_col(data, names(data [, c(1, 3)]), "factor")

# exploration graphiques des variables numeriques -----------------------------------------------
explo_num <- function(nom_col, titre, df = landworm, ligne_col = c(2, 2),mini = min(df[[nom_col]]), maxi=max(df[[nom_col]]) ) {
  par(mfrow = ligne_col)
  
  df[complete.cases(df[[nom_col]]), ]
  df <- df %>%dplyr::filter(!is.na(df[[nom_col]]))
  df[[nom_col]] = as.numeric(df[[nom_col]])
  # Boxplot
  boxplot(df[[nom_col]], col = 'blue', ylab = titre, ylim = c(mini, maxi))
  # Cleveland plot
  # dotchart(df[[nom_col]], pch = 16, col = 'blue', xlab = titre)
  plot(df[[nom_col]], pch = 16, col = 'blue', ylab = titre)
  # Histogram
  hist(df[[nom_col]], col = 'blue', xlab = titre, main = "")
  # Quantile-Quantile plot
  qqnorm(df[[nom_col]], pch = 16, col = 'blue', xlab = '')
  qqline(df[[nom_col]], col = 'red') 
}

# Extraction des predictors + moyennes -----------------------------------------------

extraction <- function(nom_col, tif_file_path, df = landworm, conv = 1) {
  #df <- df %>%filter(!is.na(gps_x) & !is.na(gps_y))
  raster_data <- raster(tif_file_path)
  
  # Création d'un dataframe pour stocker les valeurs extraites
  df_interne <- data.frame(gps_x = df$gps_x, gps_y = df$gps_y)
  proj4Str <- "+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs"
  # Transformer les coordonnées GPS en système de coordonnées du raster
  gps_coords_sp <- SpatialPoints(df_interne, proj4string = CRS(proj4Str))
  gps_coords_proj <- spTransform(gps_coords_sp, crs(raster_data))
  
  # Extraction des valeurs du raster 
  values <- raster::extract(raster_data, gps_coords_proj)
  
  # Ajout des valeurs extraites comme nouvelles colonnes a df
  #df_save = data.frame()
  #df_save[[nom_col]] <- values / conv
  
  df[[nom_col]] <- values / conv
  
  return(df)
}

# la moyenne des predictores -----------------------------------------------
moyenne_val_extrct <- function(nom_col, vec_col, df=landworm) {
  df[[nom_col]] <- rowMeans(as.matrix(df[, vec_col, drop = FALSE]), na.rm = TRUE)
  df[[nom_col]] = round(df[[nom_col]],1)
  return(as.data.frame(df))
}


# tests de corrélation avec un seuil -----------------------------------------------
cor_function_seuil <- function(data, seuil,affiche=FALSE) {
  # Création d'un vecteur pour stocker les paires de variables corrélées
  variables_corr <- c()
  
  # Boucle pour tester la corrélation entre chaque paire de variables
  for (i in 1:(ncol(data) - 1)) {
    for (j in (i + 1):ncol(data)) {
      # Calcul de la corrélation entre les variables i et j
      cor_value <- stats::cor(data[, i], data[, j], use = "na.or.complete")
      
      # Stockage du résultat dans le vecteur si supérieur au seuil
      if (cor_value >= seuil | cor_value <= -seuil) {
        if(affiche){
        cat(
          "***",
          colnames(data)[i],
          "  __est correlee a__  ",
          colnames(data)[j],
          "avec un R =",
          cor_value,
          "\n \n \n"
        )
      }
        
        variables_corr <-
          c(variables_corr, colnames(data)[i], colnames(data)[j])
      }
    }
  }
  
  return(variables_corr)
}


# tests de valeurs aberant -----------------------------------------------
test_grub <- function(data, variable, direction = "maxi") {
  
  if (direction == "maxi") { 
    repeat {
      #le test de Grubbs
      test_aberrant <- grubbs.test(data[[variable]], opposite = FALSE)
      
      # Obtenir la p-valeur du test
      p.value <- test_aberrant$p.value
      # Si la p-valeur est inférieure au seuil de 0.05, on supprime la valeur aberrante
      if (p.value < 0.05) {
        max_value <- max(data[[variable]],na.rm=TRUE)
        data <- subset(data, data[[variable]] != max_value | is.na(data[[variable]]))
      } else {
        # S'il n'y a plus de valeurs aberrantes, sortir de la boucle
        break
      }
    }
  }
  
  
  if (direction == "mini") { 
    repeat {
      test_aberrant <- grubbs.test(data[[variable]], opposite = TRUE)
      # Obtenir la p-valeur du test
      p.value <- test_aberrant$p.value
      # Si la p-valeur est inférieure au seuil de 0.05, on supprime la valeur aberrante
      if (p.value < 0.05) {
        min_value <- min(data[[variable]],na.rm=TRUE)
        data <- subset(data, data[[variable]] != min_value | is.na(data[[variable]]))
      } else {
        # S'il n'y a plus de valeurs aberrantes, sortir de la boucle
        break
      }
    }
  }
  
  
  return(data)
}




# boxplote -----------------------------------------------
plot_boxplot <-function(donnee,
           x_col,y_col,x_label,y_label,title,legend_title,
           couleurs,
           affiche_point = TRUE,
           ymin = min(donnee[[y_col]]),
           ymax = 1.2 * max(donnee[[y_col]])) {
    
  graphe <-ggplot(donnee,
             aes_string(
               x = x_col,
               y = y_col,
               colour = x_col
             )) +
  geom_boxplot(
        outlier.shape = NA,
        outlier.colour = "black",
        alpha = 0.20,
        size = 1.5 
      ) +
  labs(title = title,x = x_label,y = y_label) +
  scale_color_manual(values = couleurs, name = legend_title) +
  theme_classic(base_size = 12, base_family = "Arial") +
  theme(axis.text = element_text(size = 10),
        axis.title.y = element_text(
          vjust = 5, size = 12, face = "bold"),
        axis.title.x = element_text(face = "bold"),
        axis.ticks.length = unit(0.2, "cm"),
        legend.position = "none",  # Cette ligne supprime la legende
        #legend.position = "right",
        legend.text = element_text(size = 10),
        legend.title = element_text(size = 12, face = "bold"),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        plot.title = element_text(size = 14, face = "bold"),
        plot.margin = unit(c(0.5, 0.5, 0.5, 0.5), "cm")
      )
    if (affiche_point) {
      graphe <-
        graphe + geom_jitter(position = position_jitter(seed = 0.5), size = 0.8)
    }
    
    if (y_col %in% names(donnee)) {
      graphe <- graphe +
        coord_cartesian(ylim = c(ymin, ymax))
    }
  
    graphe = graphe + stat_summary(
      fun.y = mean,
      geom = "point",
      shape = 15,
      size = 1.5,
      col = "black",
      fill = "black"
    )
    
    return(graphe)
}



#pour le  pairwise.t.test() -----------------------------------------------------
tri.to.squ <- function(x) {
  rn <- row.names(x)
  cn <- colnames(x)
  an <- unique(c(cn, rn))
  myval <- x[!is.na(x)]
  mymat <-
    matrix(
      1,
      nrow = length(an),
      ncol = length(an),
      dimnames = list(an, an)
    )
  for (ext in 1:length(cn))
  {
    for (int in 1:length(rn))
    {
      if (is.na(x[row.names(x) == rn[int], colnames(x) == cn[ext]]))
        next
      mymat[row.names(mymat) == rn[int], colnames(mymat) == cn[ext]] <-
        x[row.names(x) == rn[int], colnames(x) == cn[ext]]
      mymat[row.names(mymat) == cn[ext], colnames(mymat) == rn[int]] <-
        x[row.names(x) == rn[int], colnames(x) == cn[ext]]
    }
  }
  return(mymat)
}



# Selection interaction -------------------------------
select_inter <- function(response_var, df, explanatory_vars) {
  results <- data.frame()
  combinations <- combn(explanatory_vars, 2, simplify = FALSE)

  for(i in seq_along(combinations)) {

    formula <- as.formula(paste(response_var, "~", paste(combinations[[i]], collapse = "*")))
    model <- gam(formula, data = df)
    r_squared <- summary(model)$r.sq
    aic <- AIC(model)
    results <- rbind(results, data.frame("variables" = paste0(combinations[[i]], collapse = ".inter."), 
                                         "r_squared" = r_squared, 
                                 "aic" = aic))
  }
  return(results)
}

# Comparaion betwen predtited and observed -----------------------------------
plot_comp = function (df,ylabel, title_class, legende = TRUE,plotly = FALSE,xlabel = "observations",title=""){ 

  
  p = ggplot(df, aes(x = observation)) + 
  #graph representant observed
  geom_point(aes(y = Observed, color = "Observed values")) +
  geom_line(aes(y = Observed, color = "Observed values")) + 
  
  #graph representant  preticted
  geom_point(aes(y = Predicted, color="Predicted values")) +
  geom_line(aes(y = Predicted, color="Predicted values")) + 
  # ggtitle(title)
  theme(plot.title = element_text(hjust = 0.5)) + 
  labs(title = title,x=xlabel, y=ylabel, color = "Legend :") + 
  ylim(min(c(min(df$Predicted), min(df$Observed))),
            max(c(max(df$Predicted), max(df$Observed)))+1  ) +
    
  scale_color_manual(values = c("Observed values"='red', "Predicted values"='green')) +
  annotate("text", x = 8, y =  max(c(max(df$Predicted), max(df$Observed)))+1, 
           label = title_class, col = "black", size = 3)

  
  if (!legende) {
    p <- p + theme(legend.position = "none")
  }
  
  if(plotly){
    p = ggplotly(p)
  }

return (p)

}


# Calcul R² -----------------------------------
# calcule_R2 = function(x, y) {cor(x, y)^2}
calcule_R2 <- function(y_true, y_pred) {
  sse <- sum((y_true - y_pred)^2)
  sst <- sum((y_true - mean(y_true))^2)
  r_squared <- 1 - (sse / sst)
  return(r_squared)
}



# Fonction d'exploration graphique des relations entre les vd et les predictors -----------------------------------
nuage_point <- function(df, var_rep, predicteur, titre_x, titre_y) {
  # Convertir les variables en numeriques
  df[[predicteur]] <- as.numeric(df[[predicteur]])
  df[[var_rep]] <- as.numeric(df[[var_rep]])
  
  df= df[,c(predicteur,var_rep)]
  df = drop_na(df)
  # correlation entre les deux variables
  correlation <- round(cor(df[[predicteur]], df[[var_rep]]), 3)
  
  #  le graphique
  g <- ggplot(df, aes(x = !!rlang::sym(predicteur), y = !!rlang::sym(var_rep))) +
    geom_point() + # Ajout des points
    geom_smooth(method = "lm", se = TRUE, color = "red") + 
    labs(
      # title = paste("Relationship between",var_rep, "and", predicteur),
         subtitle = paste0(" r = ", correlation),
         x = titre_x, 
         y = titre_y) +
    theme_classic()   
  
  return(g)
}

box_plot <- function(df, var_rep, predicteur, titre_x, titre_y) {
  g <- ggplot(df, aes(x = !!rlang::sym(predicteur), y = !!rlang::sym(var_rep))) +
    geom_boxplot() + # Ajout des boxplots
    labs(
      # title = paste("Boxplot of", var_rep, "by", predicteur),
         x = titre_x, 
         y = titre_y) +
    theme_classic()   
  
  print(g)
}

1 Database import

Import of database LandWorm_dataset_site_01_07_2024.xlsx (february 22, 2024)

The database contains 8143 rows and 479 columns

Columns with only NAs

To make the dataset easier to read, I identify and remove all columns with only NA:

[1] "thermal_weed_control" "amdmt_orga_freq"      "amdmt_orga_names"    
[4] "amdmt_orga_qtty"      "amdmt_calcic"         "amdmt_calcic_names"  
[7] "amdmt_calcic_qtty"

The database contains 8143 rows and 472 columns

1.1 Protocols (avant séléction)

List of protocols available on the database ( 22 levels)

Code

landworm$Protocole = as.factor(landworm$Protocole)
df <- as.data.frame(summary(landworm$Protocole))
colnames(df) <- c("Numbers")
DT::datatable(df, options = list(pageLength = 5))

Code

rm("df")

Selection of protocols: 16

The database therefore changes from 8143 to 6303 observations.

1.2 Data selection: LandWorm

On séléctionne tout sauf NA

	Numbers
cp	468
dc	4324
gp	299
mh	855
sg	357

Code

df_suivi = landworm
n_line = nrow(df_suivi)

landworm <- subset(landworm, owner %in% c("dc","cp","gp","sg", "mh"))
landworm$owner=droplevels(landworm$owner)

The database therefore changes from 6303 to 6303 observations.

2 Database exploration

CR = Completion rate

2.1 Complete columns

Code

df_col=taux_completion(landworm,TRUE,trie=FALSE)
df_col = df_col[df_col$Variables != "Total",]
#print("table")
kable(df_col, caption = "", col.width = c("75%", "25%"))

	Variables	CR
72	ID	100%
73	Protocole	100%
75	owner	100%
76	AB_tot	100%

Code

# cat(                                                    )
# head(landworm[, "ID"])

2.2 Non-complete columns

2.3 Focus on GPS coordinates

There is 266 NA in GPS_X
There is 269 NA in GPS_Y

Code

df_suivi = landworm
n_line = nrow(df_suivi)

# landworm$gps_x <- as.numeric(gsub("[^0-9.-]", "", landworm$gps_x))
# landworm$gps_y <- as.numeric(gsub("[^0-9.-]", "", landworm$gps_y))
landworm <- landworm[complete.cases(landworm$gps_x, landworm$gps_y), ]
landworm <- landworm %>%dplyr::filter(!is.na(gps_x) & !is.na(gps_y))
#sum(is.na(landworm$gps_x))
#sum(is.na(landworm$gps_y))

We delete the NA lines in the GPS coordinates
The database therefore changes from 6303 to 6034 observations.

2.4 Cartography

Code

df_suivi = landworm
n_line = nrow(df_suivi)

df_coord <- landworm[, c("gps_x", "gps_y")] %>% mutate(gps_x = as.numeric(gps_x),gps_y = as.numeric(gps_y))

df_coord$num_ligne <- seq(nrow(df_coord))
carte <- leaflet(df_coord) %>%
  addTiles() %>%
  addCircleMarkers(lng = ~gps_x, lat = ~gps_y, radius = 0.8, fillOpacity = 0.8, fillColor = "blue")
carte

We delete points outside France (22)
The database therefore changes from 6034 to 6012 observations.

2.5 Focus on years

Cleaning the Annee column

CR of Annee = 98.9% (35 levels)

We remove all the years before 1990 and the NA ( 0 observations)

Code

# sum(is.na(landworm$Annee))
landworm <- landworm %>%dplyr::filter(!is.na(Annee))# on enleve les NA
annes_omit= c("1821", "1960", "1978", "1982", "1983", "1984", "1986", "1988", "1989", "1905") # annee sup
landworm <- landworm[!landworm$Annee %in% annes_omit, ]
landworm=droplevels(landworm)
summary_df <- data.frame(Numbers = summary(landworm$Annee))
summary_df$Annee = rownames(summary_df)
summary_df = summary_df[,c("Annee", "Numbers")]
rownames(summary_df) = NULL
summary_df = summary_df %>% arrange(Annee)
summary_df %>% datatable(options = list(pageLength = 10))

Code

rm("summary_df")

The database therefore changes from 6012 to 6010 observations.

2.6 Focus on clcm_lvl1

CR of clcm_lvl1 = 84.2% (7 levels)

Code

landworm$clcm_lvl1= as.factor(landworm$clcm_lvl1)
summary_df <- as.data.frame(summary(landworm$clcm_lvl1))
colnames(summary_df) <- c("Numbers")
kable(summary_df,padding = 5)

	Numbers
1_Naturel	1
2_Agricole	3
Agricultural areas	3706
Artificial surfaces	834
Forest and semi natural areas	541
Territoires agricoles	1
Wetlands	30
NA's	894

Merging levels

Code

levels(landworm$clcm_lvl1)[levels(landworm$clcm_lvl1) == "1_Naturel"] <- "Forest and semi natural areas"
levels(landworm$clcm_lvl1)[levels(landworm$clcm_lvl1) == "2_Agricole"] <- "Agricultural areas"

levels(landworm$clcm_lvl1)[levels(landworm$clcm_lvl1) == "Territoires agricoles"] <- "Agricultural areas"

landworm$clcm_lvl1= as.factor(landworm$clcm_lvl1)
summary_df <- as.data.frame(summary(landworm$clcm_lvl1))
colnames(summary_df) <- c("Numbers")
kable(summary_df,padding = 5)

	Numbers
Forest and semi natural areas	542
Agricultural areas	3710
Artificial surfaces	834
Wetlands	30
NA's	894

Update code_clcm_lvl1

Code

#landworm$code_clcm_lvl1 = as.factor(landworm$code_clcm_lvl1)

landworm$code_clcm_lvl1 <- ifelse(landworm$clcm_lvl1 == "Forest and semi natural areas", 3, landworm$code_clcm_lvl1)

landworm$code_clcm_lvl1 <- ifelse(landworm$clcm_lvl1 == "Agricultural areas", 2, landworm$code_clcm_lvl1)

For the moment, we will keep the NA of clcm_lvl1

2.7 Focus on clcm_lvl2

CR of clcm_lvl2 = 84.2% (13 levels)

Code

landworm$clcm_lvl2= as.factor(landworm$clcm_lvl2)
summary_df <- as.data.frame(summary(landworm$clcm_lvl2))
colnames(summary_df) <- c("Numbers")
summary_df %>% datatable(options = list(pageLength = 5))

Merging levels

Code

levels(landworm$clcm_lvl2)[levels(landworm$clcm_lvl2) == "21_Agricole ouvert"] <- "Arable land"

landworm$clcm_lvl2= as.factor(landworm$clcm_lvl2)
summary_df <- as.data.frame(summary(landworm$clcm_lvl2))
colnames(summary_df) <- c("Numbers")
# kable(summary_df,padding = 5)
summary_df %>% datatable(options = list(pageLength = 10), rownames = TRUE)

Update code_clcm_lvl2

Code

landworm$code_clcm_lvl2 <- ifelse(landworm$clcm_lvl2 == "Arable land", 21, landworm$code_clcm_lvl2)

2.8 Focus on clcm_lvl3

CR of clcm_lvl3 = 78.5% (29 levels)

Code

df_suivi = landworm
n_line = nrow(df_suivi)

landworm$clcm_lvl3= as.factor(landworm$clcm_lvl3)
summary_df <- as.data.frame(summary(landworm$clcm_lvl3))
colnames(summary_df) <- c("Numbers")
# kable(summary_df,padding = 5)
summary_df %>% datatable(options = list(pageLength = 10), rownames = TRUE)

2.9 Land use selection (clcm_lvl3)

Broad-leaved forest
Indetermined forest
Mixed forest
Urban forest and woodland
Pastures, meadows and other permanent grasslands under agricultural use
Non-irrigated arable land
Vineyards
Green urban areas
Natural grasslands

Code

# select_os= c("Broad-leaved forest", "Coniferous forest", "Mixed forest", 
# "Pastures, meadows and other permanent grasslands under agricultural use", "Non-irrigated arable land", "Vineyards","Green urban areas","Natural grasslands")

select_os= c("Broad-leaved forest", "Mixed forest", "Indetermined forest", 
             "Urban forest and woodland", "Pastures, meadows and other permanent grasslands under agricultural use", "Non-irrigated arable land", "Vineyards","Green urban areas","Natural grasslands")


landworm <- landworm[landworm$clcm_lvl3 %in% select_os, ]
landworm=droplevels(landworm)
landworm$clcm_lvl3 = as.factor(landworm$clcm_lvl3)
summary_df <- as.data.frame(summary(landworm$clcm_lvl3))
colnames(summary_df) <- c("Numbers")
summary_df %>% datatable(options = list(pageLength = 5))

We merge the 4 types of forest:
- Broad-leaved forest
- Indetermined forest
- Mixed forest
- Urban forest and woodland

The database therefore changes from 6010 to 4204 observations.

2.10 Focus on protocols (après séléction)

List of protocols available on the database ( 7 levels)

Code

landworm$Protocole = as.factor(landworm$Protocole)
summary_df <- as.data.frame(summary(landworm$Protocole))
colnames(summary_df) <- c("Numbers")
summary_df %>% datatable(options = list(pageLength = 5))

Merging levels :
- F_HS $=$ F_HS $+$ FHS
- HS $=$ HS $+$ hand sorting

Code

levels(landworm$Protocole)[levels(landworm$Protocole) == "FHS"] <- "F_HS"
levels(landworm$Protocole)[levels(landworm$Protocole) == "hand sorting"] <- "HS"
landworm$Protocole = as.factor(landworm$Protocole)
summary_df <- as.data.frame(summary(landworm$Protocole))
colnames(summary_df) <- c("Numbers")
summary_df %>% datatable(options = list(pageLength = 5))

2.11 Land use & protocol overview

Code

# kable (table(landworm$clcm_lvl1, landworm$Protocole,exclude = NULL), align = "c", format = "pipe", padding = 10)
# kable (table(landworm$clcm_lvl2, landworm$Protocole,exclude = NULL), align = "c", format = "pipe", padding = 10)
df = table(landworm$clcm_lvl3, landworm$Protocole,exclude = NULL)
# df = as.data.frame(df)
kable (df, align = "c", format = "pipe", padding = 10)

	AITC_HS	AITCTM	F_HS	HS	HS_F_16	HS_M_16	M_HS
Forest	4	2	22	147	0	0	3
Green urban areas	2	0	0	595	0	0	10
Natural grasslands	0	66	3	62	0	0	9
Non-irrigated arable land	80	81	358	1267	24	90	36
Pastures, meadows and other permanent grasslands under agricultural use	60	0	227	264	0	42	0
Vineyards	0	1	374	375	0	0	0

Code

# df %>% datatable(options = list(pageLength = 8))

3 Earthworms data

3.1 Total abundance

Sppression des valeurs aberrantes

The database therefore changes from 4204 to 4163 observations.

3.2 Total biomass

Sppression des valeurs aberrantes

The database therefore changes from 4163 to 4143 observations.

3.3 Total taxonomic richness

Total richness calculation method

1. Removal of columns with only NA and/or only 0

1. Merging sub-species with their species

Voici la liste des sp a fusionnée:

Code

# function pour fusion les sous especes a leurs especes
sp_identique = function(df, nom_sous_espece, nom_espece ) {
  
  # Si les deux colonnes existent dans df
  if (nom_sous_espece %in% names(df) && nom_espece %in% names(df)) {
    
    # Addition des valeurs des deux colonnes et stockage du résultat dans nom_espece
    # df[[nom_espece]] = df[[nom_sous_espece]] + df[[nom_espece]]
    df[[nom_espece]] = rowSums(df[,c(nom_sous_espece,nom_espece)], na.rm = TRUE)
    

    df[[nom_sous_espece]] = NULL
  
  # Si nom_espece n'existe pas dans df mais nom_sous_espece existe
  } else if (nom_sous_espece %in% names(df) && !(nom_espece %in% names(df))) {
    
    # on renome nom_sous_espece par nom_espece
    names(df)[names(df) == nom_sous_espece] <- nom_espece
    df[[nom_sous_espece]] = NULL
  }
  
  return(df)
}


for (i in 1:nrow(df_sp_sous_sp)){
 
landworm_sp = sp_identique(df = landworm_sp,nom_sous_espece = df_sp_sous_sp[i,"col_sp_origines"],
                  nom_espece = df_sp_sous_sp[i,"col_sp_concatener"]) 
}

1. Identify columns beginning with AB_

1. Deletion of AB_ columns that are not species

Code

# On supprimme les colonnes AB_ qui ne sont pas des espèces dans le calcule
ab_supprimee =  c("AB_AD",
                  "AB_JV",
                  "AB_SA",
                  "AB_STAD_X",
                  "AB_indéterminable",
                  "AB_Indéterminable",
                  "AB_indéterminable_endogeic",
                  "AB_tot",
                  "AB_Indéterminable_epigeic",
                  "AB_indéterminable_endogeic",
                  "AB_Ep.X",
                  "AB_vide",
                  "AB_Ep.X1",
                  "AB_Ep.X2",
                  "AB_A.X",
                  "AB_Adult",
                  "AB_cocon",
                  "AB_indéterminé",
                  "AB_Juvenile",
                  "AB_Sub.adult",
                  "AB_Indéterminé")

colonnes_AB <- colonnes_AB[!colonnes_AB %in% ab_supprimee]

cat("A ce stade, la liste des especes est: \n")

A ce stade, la liste des especes est:

Code

df= data.frame(colonnes_AB)
rownames(df) = NULL
DT::datatable(df, options = list(pageLength = 5))

1. Calculate richness by assigning 1 to each column if the value is different from 0 and NA

A ce stade, la richesse varie de:

Code

# On calcule la richesse en attribiant 1 à chaque colonne si la valeur est différent de 0 et de NA
landworm_sp$Richesse_tot <- 0
landworm_sp$Richesse_tot <- rowSums(!is.na(landworm_sp[colonnes_AB]) & landworm_sp[colonnes_AB] != 0)
#sum (is.na(landworm_sp$Richesse_tot) )
summary(landworm_sp$Richesse_tot)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  0.000   3.000   5.000   4.923   7.000  14.000

1. Décremmentation de la richesse

Voir code:

Code

# Apres calcule richesse ------------------------------------------------------
sp_prorata <- function(df, sp_x, sp) {
  
  name_col <- names(df)
  
  #  si la colonne sp_x est présente
  if (sp_x %in% name_col) {
    
    # pour chaque ligne 
    for (i in 1:nrow(df)) {
      
      #  si les valeurs de la colonne sp_x ne sont ni 0 ni NA
      if (!is.na(df[i, sp_x]) && df[i, sp_x] != 0) {
        
        # Si la somme des ab des sp est différente de 0
        if (rowSums(df[i, sp], na.rm = TRUE) != 0) {
          # La richesse diminue de 1
          df[i, "Richesse_tot"] <- df[i, "Richesse_tot"] - 1
        }
      }
    }
  }
  
  return(df)
}

# Pour AB_Allolobophora_sp ------------------------------------------
sp_x = "AB_Allolobophora_sp"
sp=c("AB_Allolobophora_burgondiae", 
     "AB_Allolobophora_chlorotica",         
     "AB_Aporrectodea_cupulifera",  
     "AB_Aporrectodea_icterica",    
     "AB_Aporrectodea_limicola",    
     "AB_Aporrectodea_rosea")
# df= landworm_sp[, c(sp_x,sp, "Richesse_tot")]
# df=df[!df$AB_Allolobophora_sp == 0,]
# df=df[! is.na(df$AB_Allolobophora_sp),]
# df$som = rowSums(df[,sp], na.rm=TRUE)
# View(df)
# 
# dff <- sp_prorata(landworm_sp, sp_x, sp)
# dff= dff[, c(sp_x,sp, "Richesse_tot")]
# dff=dff[!dff$AB_Allolobophora_sp == 0,]
# dff=dff[! is.na(dff$AB_Allolobophora_sp),]
# dff$som = rowSums(dff[,sp], na.rm=TRUE)
# View(dff)
landworm_sp <- sp_prorata(landworm_sp, sp_x, sp)


# Pour AB_Aporrectodea_indéterminable ------------------------------------------
sp_x = "AB_Aporrectodea_indéterminable"
sp=c("AB_Allolobophora_burgondiae", 
     "AB_Allolobophora_chlorotica",
     "AB_Aporrectodea_cupulifera",
     "AB_Aporrectodea_icterica",
     "AB_Aporrectodea_limicola",
     "AB_Aporrectodea_rosea")
landworm_sp <- sp_prorata(landworm_sp, sp_x, sp)


# Pour AB_Aporrectodea_sp ------------------------------------------
sp_x = "AB_Aporrectodea_sp"
sp=c("AB_Aporrectodea_giardi",  
     "AB_Aporrectodea_longa",   
     "AB_Aporrectodea_nocturna",
     "AB_Aporrectodea_ripicola",
     "AB_Aporrectodea_longa/giardi")
landworm_sp <- sp_prorata(landworm_sp, sp_x, sp)



# Pour AB_Dendrobaena_sp ------------------------------------------
# Tous les "Dendrobaena" et "Dendrodrilus_rubidus" + Satchellius mammalis
sp_x = "AB_Dendrobaena_sp"
sp=c("AB_Dendrobaena_alpina",     
     "AB_Dendrobaena_attemsi",
     "AB_Dendrobaena_cognettii",
     "AB_Dendrobaena_hortensis",      
     "AB_Dendrobaena_octaedra",
     "AB_Dendrodrilus_rubidus",
     "AB_Satchellius_mammalis")
landworm_sp <- sp_prorata(landworm_sp, sp_x, sp)


# Pour AB_Eisenia_sp ------------------------------------------
# Tous les Eisenia possible
sp_x = "AB_Eisenia_sp"
sp=c("AB_Eisenia_andrei",
     "AB_Eisenia_fetida",
     "AB_Eisenia_veneta",             
     "AB_Eiseniella_tetraedra")
landworm_sp <- sp_prorata(landworm_sp, sp_x, sp)


# Pour AB_Lumbricus_sp  ------------------------------------------
# Tous les Lumbricus sauf Lumbricus castaneus ?
sp_x = "AB_Lumbricus_sp"
sp=c("AB_Lumbricus_centralis",       
"AB_Lumbricus_festivus",         
"AB_Lumbricus_friendi",          
"AB_Lumbricus_herculeus",        
"AB_Lumbricus_meliboeus",        
"AB_Lumbricus_rubellus" ,        
"AB_Lumbricus_terrestris",
"AB_Lumbricus_friendi/centralis")
landworm_sp <- sp_prorata(landworm_sp, sp_x, sp)



# Pour AB_Microscolex_sp  ------------------------------------------
# Tous les AB_Microscolex
sp_x = "AB_Microscolex_sp"
sp=c("AB_Microscolex_dubius",         
"AB_Microscolex_phosphoreus")
landworm_sp <- sp_prorata(landworm_sp, sp_x, sp)



# Pour AB_Octolasion_sp  ------------------------------------------
# Tous les AB_Octolasion
sp_x = "AB_Octolasion_sp"
sp=c("AB_Octodrilus_complanatus",    
"AB_Octolasion_cyaneum",         
"AB_Octolasion_lacteum")
landworm_sp <- sp_prorata(landworm_sp, sp_x, sp)



# Pour AB_Pheretima_indéterminable  ------------------------------------------
# AB_Pheritima_Diffringens
sp_x = "AB_Pheretima_indéterminable"
sp=c("AB_Pheritima_Diffringens", "AB_Pheritima_Diffringens")
landworm_sp <- sp_prorata(landworm_sp, sp_x, sp)



# Pour AB_Prosellodrilus_sp  ------------------------------------------
# Tous les Prosellodrilus
sp_x = "AB_Prosellodrilus_sp"
sp=c("AB_Prosellodrilus_amplisetosus",
"AB_Prosellodrilus_fragilis",    
"AB_Prosellodrilus_occidentalis",
"AB_Prosellodrilus_praticola",   
"AB_Prosellodrilus_pyrenaicus")
landworm_sp <- sp_prorata(landworm_sp, sp_x, sp)



# Pour AB_Scherotheca_sp  ------------------------------------------
# Tous les AB_Scherotheca_sp
sp_x = "AB_Scherotheca_sp"
sp=c("AB_Satchellius_mammalis",       
"AB_Scherotheca_aquitana",       
"AB_Scherotheca_dinoscolex",     
"AB_Scherotheca_nivicola",       
"AB_Scherotheca_porotheca",      
"AB_Scherotheca_rhodana",       
"AB_Scherotheca_savignyi")
landworm_sp <- sp_prorata(landworm_sp, sp_x, sp)

1. Rules 3: Si le Owner de la parcelle est DC ou GP alors si Aporrectodea_trapezoides et/ou Aporrectodea_tuberculata sont presentent dans la parcelle alors si Aporrectodea_caliginosa est presente aussi dans la parcelle alors la richesse diminue de 1.

Code

# dff <- data.frame(
#   owner = c("DC", "GP", "XX", "DC"),
#   AB_Aporrectodea_trapezoides = c(1, NA, 0, 2),
#   AB_Aporrectodea_tuberculata = c(NA, 0, 1, 3),
#   AB_Aporrectodea_caliginosa = c(2, 1, NA, 4),
#   Richesse_tot = c(10, 20, 30, 40)
# )
# df <- dff

df=landworm_sp
for (i in 1:nrow(df)) {
  # Si owner est "DC" ou "GP"
  if (df[i, "owner"] %in% c("DC", "GP")) {
    
    # si AB_Aporrectodea_trapezoides ou AB_Aporrectodea_tuberculata sont présents et non nuls
    if ((!is.na(df[i, "AB_Aporrectodea_trapezoides"]) && 
         df[i, "AB_Aporrectodea_trapezoides"] != 0) 
        || 
        (!is.na(df[i, "AB_Aporrectodea_tuberculata"]) && 
         df[i, "AB_Aporrectodea_tuberculata"] != 0)) {
      
      #  si AB_Aporrectodea_caliginosa est présente et non nulle
      if (!is.na(df[i, "AB_Aporrectodea_caliginosa"]) && 
          df[i, "AB_Aporrectodea_caliginosa"] != 0) {
        
        # La richesse diminue de 1 pour cette ligne
        df[i, "Richesse_tot"] <- df[i, "Richesse_tot"] - 1
      }
    }
  }
}

landworm_sp = df

1. Rules 4: Si la Richesse_tot est superieur à 2 et que AB_Lumbricidae et/ou AB_Oligochaeta_so est presente (differente de 0 et de NA), la Richesse_tot dimunue de -1

1. Verifications

Sppression des valeurs aberrantes

The database therefore changes from 4143 to 4143 observations.

Il n y a pas de valeurs aberant. Cependant, il y a 11 parcelles qui ont une richesse superieur a 12, a garder ou a supprimer ?

Details des parcelles a forte richesse (sup à 12):

I remove the 11 plots with richness greater than 12.

The database therefore changes from 4143 to 4132 observations.

4 Branch 1

Saving the database after applying the previous filters.

At this stage, the database has 4132 rows and 260 columns without the climatic and pedological variables.

5 Soil data extraction and adding to landWorm

5.1 The source database

5.2 Extraction et fusion

Code

# Ajout variables du soil  (voir script Add_soil_dats.R)
soil_datas <- read.xlsx("datas/soil_datas.xlsx" , sheet = "Sheet 1")
soil_datas <- subset(soil_datas, select = -c(Programme,Annee,Date_Prelevement,ID_Site,gps_x, gps_y))


rows_not_in_soil_datas <- anti_join(landworm, soil_datas, by = "ID_2")
rows_not_in_landworm <- anti_join(soil_datas,landworm, by = "ID_2")

# plot(soil_datas$ID_2, landworm$ID_2)

merged_df <- merge(landworm, soil_datas, by = "ID_2")
ids_not_matching <- anti_join( merged_df,landworm, by = "ID_2")

# colSums(is.na(soil_datas))

nn = nrow(merged_df)

merged_df = drop_na(merged_df,names(soil_datas))

p_sans_pedo = nn - nrow(merged_df)

landworm = merged_df

196 plots have at least one NA in the pedological variables, so I am removing them, and the database goes from 4132 to 3936.

5.3 Sand

Extracted values (g/kg, 0 - 30 cm)

The database therefore changes from 3936 to 3934 observations.

5.4 Silt

Extracted values (g/kg, 0 - 30 cm)

Sppression des valeurs aberrantes

The database therefore changes from 3934 to 3934 observations.

5.5 Clay

Extracted values (g/kg, 0 - 30 cm)

Sppression des valeurs aberrantes

The database therefore changes from 3934 to 3934 observations.

5.6 Soil organic carbone (g/kg)

The database therefore changes from 3934 to 3912 observations.

5.7 pH (H2O)

The database therefore changes from 3912 to 3912 observations.

5.8 Phosphore (P, mg/kg)

Sppression des valeurs aberrantes

The database therefore changes from 3912 to 3912 observations.

5.9 Azote (N, g/kg)

Sppression des valeurs aberrantes

The database therefore changes from 3912 to 3901 observations.

5.10 Potassium (K, mg/kg)

The database therefore changes from 3901 to 3891 observations.

5.11 C/N

The database therefore changes from 3891 to 3887 observations.

5.12 Carbonates de calcium (CaCO3, g/kg)

Sppression des valeurs aberrantes

The database therefore changes from 3887 to 3882 observations.

6 Climate data extraction and adding to landWorm

6.1 The source database (CHELSA V2)

Code

# Lire le fichier Excel
chemin_fichier_excel <- "C:/Users/diall/Downloads/datas/ODMAP.xlsx"
climat <- read.xlsx(chemin_fichier_excel, sheet = "climat")

# Fusions des cellules des colonnes avec des éléments dupliqués
for (col in names(climat)) {
  climat[[col]] <- ifelse(duplicated(climat[[col]]), "", climat[[col]])
}

# Affichage du tableau avec kableExtra et centrage du contenu des cellules
# kableExtra::kable(climat)

For each of the following variables, I calculate the mean and the standard deviation at 3 months, 6 months, and 120 months (30 variables in total).

Data available only until 2018/2019.

6.2 Extraction method

See file Script 1 DownloadChelsaData

See file Script 2 AddchelsaData

Steps:

Obtain a vector of “month_Year” periods:
- Extract the unique periods (month_year).
Initialize the necessary columns:
- Define the periods of interest (“3_months”, “6_months”, “120_months”).
- Define the variables of interest (“pet”, “pr”, “tas”, “tasmax”, “tasmin”).
- Initialize columns for means and standard deviations for each period and each variable.
Main loop through periods and variables:
- Iterate over each variable of interest.
- Iterate over each “month_Year” period.
  - Load the rasters corresponding to the 120-month period.
  - Extract raster data for the specified GPS points.
  - Convert units according to the variable.
  - Select sub-periods of 3, 6, and 120 months.
  - Calculate means and standard deviations for each period.
Save the updated data
Merging database and climat database

Code

# Ajout variables du climate  (voir script Add_climate_dats.R)
climate_datas <- read.xlsx("datas/climate_datas.xlsx" , sheet = "Sheet 1")
climate_datas <- subset(climate_datas, select = -c(Programme,Annee,Date_Prelevement,ID_Site,gps_x, gps_y))


rows_not_in_climate_datas <- anti_join(landworm, climate_datas, by = "ID_2")
rows_not_in_landworm <- anti_join(climate_datas,landworm, by = "ID_2")

# plot(climate_datas$ID_2, landworm$ID_2)

merged_df <- merge(landworm, climate_datas, by = "ID_2")
ids_not_matching <- anti_join( merged_df,landworm, by = "ID_2")

# colSums(is.na(climate_datas))

nn = nrow(merged_df)

merged_df = drop_na(merged_df,names(climate_datas))

p_sans_climat = nn - nrow(merged_df)

landworm = merged_df

0 plots have at least one NA in the climate variables, so I am removing them, and the database goes from 3882 to 3882.

6.3 Average annual air temperature (°C) = tas

::: {.panel-tabset}

6.4 1. At 3 months

Mean

The database therefore changes from 3882 to 3876 observations.

Standard deviation

The database therefore changes from 3876 to 3876 observations.

6.5 2. At 6 months

Mean

The database therefore changes from 3876 to 3876 observations.
Standard deviation

The database therefore changes from 3876 to 3876 observations.

6.6 3. At 120 months

Mean

The database therefore changes from 3876 to 3870 observations.
Standard deviation

The database therefore changes from 3870 to 3870 observations.

:::

6.7 Mean daily maximum 2m air temperature (°C) = tasmax

1. At 3 months
2. At 6 months
3. At 120 months

Mean

The database therefore changes from 3870 to 3849 observations.

Standard deviation

The database therefore changes from 3849 to 3849 observations.

Mean

The database therefore changes from 3849 to 3849 observations.
Standard deviation

The database therefore changes from 3849 to 3849 observations.

Mean

The database therefore changes from 3849 to 3849 observations.
Standard deviation

The database therefore changes from 3849 to 3849 observations.

6.8 Mean daily maximum 2m air temperature (°C) = tasmin

1. At 3 months
2. At 6 months
3. At 120 months

Mean

The database therefore changes from 3849 to 3849 observations.

Standard deviation

The database therefore changes from 3849 to 3849 observations.

Mean

The database therefore changes from 3849 to 3849 observations.
Standard deviation

The database therefore changes from 3849 to 3849 observations.

Mean

The database therefore changes from 3849 to 3847 observations.
Standard deviation

The database therefore changes from 3847 to 3847 observations.

Mean

The database therefore changes from 3847 to 3826 observations.

Standard deviation

The database therefore changes from 3826 to 3822 observations.

Mean

The database therefore changes from 3822 to 3803 observations.
Standard deviation

The database therefore changes from 3803 to 3803 observations.

Mean

The database therefore changes from 3803 to 3793 observations.
Standard deviation

The database therefore changes from 3793 to 3784 observations.

6.10 Potential evapotranspiration (mm/month) => pet

1. At 3 months
2. At 6 months
3. At 120 months

Mean

The database therefore changes from 3784 to 3784 observations.

Standard deviation

The database therefore changes from 3784 to 3784 observations.

Mean

The database therefore changes from 3784 to 3550 observations.
Standard deviation

The database therefore changes from 3550 to 3545 observations.

Mean

The database therefore changes from 3545 to 3523 observations.
Standard deviation

The database therefore changes from 3523 to 3521 observations.

7 Exploratory analysis

Data set reduction:

3 Response variables: Abundance, biomass, and richness
41 Explanatory variables (including 30 climatic, 10 pedological, and land use)

Code

id_col=c("ID_2","Programme","Annee","ID_Site","Protocole")

vdt_col=c("AB_tot", "BM_tot", "Richesse_tot")

os_and_gps = c('clcm_lvl3','gps_x', 'gps_y')

soils_variables = c("sable.0_30","limon.0_30","argile.0_30",'jrc_pH_H2O','N','P','K','CN', 'CaCO3',"c_orga_0_a_30")

climates_variables = c("tas_mean_3","tas_mean_6","tas_mean_120",
                      "tas_sd_3","tas_sd_6","tas_sd_120",
                      
                      "tasmax_mean_3","tasmax_mean_6","tasmax_mean_120",
                      "tasmax_sd_3","tasmax_sd_6","tasmax_sd_120",
                      
                      "tasmin_mean_3","tasmin_mean_6","tasmin_mean_120",
                      "tasmin_sd_3","tasmin_sd_6","tasmin_sd_120",
                      
                      "pr_mean_3","pr_mean_6","pr_mean_120",
                      "pr_sd_3","pr_sd_6","pr_sd_120",
                      
                      "pet_mean_3","pet_mean_6","pet_mean_120",
                      "pet_sd_3","pet_sd_6","pet_sd_120")

landworm_explo = landworm[,c(id_col,vdt_col,os_and_gps,soils_variables,climates_variables)]

landworm_explo <- landworm_explo %>%
  dplyr::rename(
    Sand = sable.0_30,
    Silt = limon.0_30,
    Clay = argile.0_30,
    pH = jrc_pH_H2O,
    C_org = c_orga_0_a_30,
    LC_ = clcm_lvl3,
    Long = gps_x ,
    Lat =  gps_y
  )



cl_original <- levels(landworm_explo$LC_)
new_cl <- c("For","Gua", "Nag", "Nial", "Pmo", "Viny")
landworm_explo$LC_ <- factor(landworm_explo$LC_, levels = cl_original, labels = new_cl)



soils_variables = c("Sand","Silt","Clay",'pH','N','P','K','CN', 'CaCO3',"C_org")
variables_factor = c("LC_")
all_predictors = c(variables_factor,soils_variables,climates_variables)

all_predictors_num = all_predictors[!all_predictors %in% c(variables_factor)]

Il reste donc 3521 lignes et 51 colonnes.

7.1 Correlation matrix

All predictors

Correlation < |0.07| = 0

Séparation entre les variables pédologiques et climatiques

a). Variables pédologiques

Conclusion: je supprime Silt et je garde Sand. De même je suprime CaCO3 et je garde pH.

b). Variables climatiques:

On enlève d’abord les variables sd:

pet_sd_120, pet_sd_3, pr_sd_120, pr_sd_6, tasmin_sd_120, tasmin_sd_6, tasmin_sd_3, tasmax_sd_120, tasmax_sd_6, tasmax_sd_3

Maintenat on enleve les variables qui ont le plus de correlation

On enleve donc:

pet_mean_3, pet_mean_120, tas_mean_3, tasmin_mean_3, tasmin_mean_6, tasmin_mean_120, tasmax_mean_6, tasmax_mean_3, tasmax_mean_120, pr_mean_6

Synthese

Il reste donc 18 variables explicatives non correlee

=> 22 variables supprimees

7.2 VIF

No variable from the 18 input variables has collinearity problem. 

The linear correlation coefficients ranges between: 
min correlation ( pr_sd_3 ~ P ):  -0.0006721748 
max correlation ( tas_sd_6 ~ tas_mean_6 ):  0.6810085 

---------- VIFs of the remained variables -------- 
      Variables      VIF
1          Sand 2.352594
2          Clay 3.059920
3            pH 2.802590
4             N 2.796920
5             P 2.056419
6             K 1.389062
7            CN 2.948456
8         C_org 1.847050
9    tas_mean_6 7.077602
10 tas_mean_120 3.282206
11     tas_sd_3 3.032422
12     tas_sd_6 8.558721
13   tas_sd_120 3.927902
14    pr_mean_3 3.725966
15  pr_mean_120 2.572277
16      pr_sd_3 2.058873
17   pet_mean_6 5.740856
18     pet_sd_6 5.842996

7.3 Relationship between the abundance and the predictors

- AB_tot & LC_

-  AB_tot & Sand

-  AB_tot & Silt

-  AB_tot & Clay

-  AB_tot & pH

-  AB_tot & N

-  AB_tot & P

-  AB_tot & K

-  AB_tot & CN

-  AB_tot & CaCO3

-  AB_tot & C_org

-  AB_tot & tas_mean_3

-  AB_tot & tas_mean_6

-  AB_tot & tas_mean_120

-  AB_tot & tas_sd_3

-  AB_tot & tas_sd_6

-  AB_tot & tas_sd_120

-  AB_tot & tasmax_mean_3

-  AB_tot & tasmax_mean_6

-  AB_tot & tasmax_mean_120

-  AB_tot & tasmax_sd_3

-  AB_tot & tasmax_sd_6

-  AB_tot & tasmax_sd_120

-  AB_tot & tasmin_mean_3

-  AB_tot & tasmin_mean_6

-  AB_tot & tasmin_mean_120

-  AB_tot & tasmin_sd_3

-  AB_tot & tasmin_sd_6

-  AB_tot & tasmin_sd_120

-  AB_tot & pr_mean_3

-  AB_tot & pr_mean_6

-  AB_tot & pr_mean_120

-  AB_tot & pr_sd_3

-  AB_tot & pr_sd_6

-  AB_tot & pr_sd_120

-  AB_tot & pet_mean_3

-  AB_tot & pet_mean_6

-  AB_tot & pet_mean_120

-  AB_tot & pet_sd_3

-  AB_tot & pet_sd_6

-  AB_tot & pet_sd_120

7.4 Relationship between the biomass and the predictors

- BM_tot & LC_

-  BM_tot & Sand

-  BM_tot & Silt

-  BM_tot & Clay

-  BM_tot & pH

-  BM_tot & N

-  BM_tot & P

-  BM_tot & K

-  BM_tot & CN

-  BM_tot & CaCO3

-  BM_tot & C_org

-  BM_tot & tas_mean_3

-  BM_tot & tas_mean_6

-  BM_tot & tas_mean_120

-  BM_tot & tas_sd_3

-  BM_tot & tas_sd_6

-  BM_tot & tas_sd_120

-  BM_tot & tasmax_mean_3

-  BM_tot & tasmax_mean_6

-  BM_tot & tasmax_mean_120

-  BM_tot & tasmax_sd_3

-  BM_tot & tasmax_sd_6

-  BM_tot & tasmax_sd_120

-  BM_tot & tasmin_mean_3

-  BM_tot & tasmin_mean_6

-  BM_tot & tasmin_mean_120

-  BM_tot & tasmin_sd_3

-  BM_tot & tasmin_sd_6

-  BM_tot & tasmin_sd_120

-  BM_tot & pr_mean_3

-  BM_tot & pr_mean_6

-  BM_tot & pr_mean_120

-  BM_tot & pr_sd_3

-  BM_tot & pr_sd_6

-  BM_tot & pr_sd_120

-  BM_tot & pet_mean_3

-  BM_tot & pet_mean_6

-  BM_tot & pet_mean_120

-  BM_tot & pet_sd_3

-  BM_tot & pet_sd_6

-  BM_tot & pet_sd_120

7.5 Relationship between the richness and the predictors

- Richesse_tot & LC_

-  Richesse_tot & Sand

-  Richesse_tot & Silt

-  Richesse_tot & Clay

-  Richesse_tot & pH

-  Richesse_tot & N

-  Richesse_tot & P

-  Richesse_tot & K

-  Richesse_tot & CN

-  Richesse_tot & CaCO3

-  Richesse_tot & C_org

-  Richesse_tot & tas_mean_3

-  Richesse_tot & tas_mean_6

-  Richesse_tot & tas_mean_120

-  Richesse_tot & tas_sd_3

-  Richesse_tot & tas_sd_6

-  Richesse_tot & tas_sd_120

-  Richesse_tot & tasmax_mean_3

-  Richesse_tot & tasmax_mean_6

-  Richesse_tot & tasmax_mean_120

-  Richesse_tot & tasmax_sd_3

-  Richesse_tot & tasmax_sd_6

-  Richesse_tot & tasmax_sd_120

-  Richesse_tot & tasmin_mean_3

-  Richesse_tot & tasmin_mean_6

-  Richesse_tot & tasmin_mean_120

-  Richesse_tot & tasmin_sd_3

-  Richesse_tot & tasmin_sd_6

-  Richesse_tot & tasmin_sd_120

-  Richesse_tot & pr_mean_3

-  Richesse_tot & pr_mean_6

-  Richesse_tot & pr_mean_120

-  Richesse_tot & pr_sd_3

-  Richesse_tot & pr_sd_6

-  Richesse_tot & pr_sd_120

-  Richesse_tot & pet_mean_3

-  Richesse_tot & pet_mean_6

-  Richesse_tot & pet_mean_120

-  Richesse_tot & pet_sd_3

-  Richesse_tot & pet_sd_6

-  Richesse_tot & pet_sd_120

7.6 Total abundance distributions

Transformation sqrt

lamda = 0.3

7.7 Total biomass distributions

Transformation sqrt

lamda = 0.4

7.8 Total taxonomic richness distributions

Transformation sqrt

lamda = 0.6

7.9 Standarization

Transformation sqrt de l’abondance et de la biomasse
Transformation centrée reduite des prédicteurs

8 Modeling

8.1 Predictors

All predictors: => RF, GBM et ANN

 [1] "LC_"             "Sand"            "Silt"            "Clay"           
 [5] "pH"              "N"               "P"               "K"              
 [9] "CN"              "CaCO3"           "C_org"           "tas_mean_3"     
[13] "tas_mean_6"      "tas_mean_120"    "tas_sd_3"        "tas_sd_6"       
[17] "tas_sd_120"      "tasmax_mean_3"   "tasmax_mean_6"   "tasmax_mean_120"
[21] "tasmax_sd_3"     "tasmax_sd_6"     "tasmax_sd_120"   "tasmin_mean_3"  
[25] "tasmin_mean_6"   "tasmin_mean_120" "tasmin_sd_3"     "tasmin_sd_6"    
[29] "tasmin_sd_120"   "pr_mean_3"       "pr_mean_6"       "pr_mean_120"    
[33] "pr_sd_3"         "pr_sd_6"         "pr_sd_120"       "pet_mean_3"     
[37] "pet_mean_6"      "pet_mean_120"    "pet_sd_3"        "pet_sd_6"       
[41] "pet_sd_120"

No correlation: => GLM et GAM |0.7|

 [1] "LC_"          "Sand"         "Clay"         "pH"           "N"           
 [6] "P"            "K"            "CN"           "C_org"        "tas_mean_6"  
[11] "tas_mean_120" "tas_sd_3"     "tas_sd_6"     "tas_sd_120"   "pr_mean_3"   
[16] "pr_mean_120"  "pr_sd_3"      "pet_mean_6"   "pet_sd_6"

See Correlation matrix

8.2 Variable selection Process

Initialization

Model creation and evaluation Loop

For each iteration i up to length(variables) - 1:

Model creation
- We fit a model using all available predictor variables.
Calculation of variable importance
- We use the permutation feature importance algorithm based on Fisher, Rudin, and Dominici (2018):
  - Input: model $\hat{F}$, variables $X$, target $y$, error measure $L(y, \hat{F}(X))$
  - Estimate the original model error $L_{original}$.
  - For each variable $j$:
    1. Permute $j$ in $X$ to obtain $X_{perm}$, thus breaking the association between $j$ and $y$.
    2. Estimate the error $L_{perm}$ with $X_{perm}$.
    3. Calculate the importance $FI_j = L_{perm} / L_{original}$.
  - Sort the variables by decreasing FI.
Determination of the least important variable
Predictions on Test Data
Calculation of performance metrics
- RMSE, adjusted R² for training, R² for testing, and MAE.
Update of Train and Test datasets
- Remove the least important variable from train & test.

Recording of Results

8.3 Data preparation

Code

# AB_tot --------------------------------------------------------------------------
df_mod_AB_tot = data_deep[,c("AB_tot",predictor_deep, LC_levels)]
df_mod_AB_tot = drop_na(df_mod_AB_tot)
df_mod_AB_tot = droplevels(df_mod_AB_tot)


# Partition
# set.seed(123)
# ind <- sample(2, nrow(df_mod_AB_tot), replace = T, prob = c(.8, .2))
# AB_tot_train <- df_mod_AB_tot[ind==1,]
# AB_tot_test <- df_mod_AB_tot[ind==2,]

set.seed(42)
split <- rsample::initial_split(df_mod_AB_tot,prop = 0.8, strata = "AB_tot")
AB_tot_train <- rsample::training(split)
AB_tot_test <- rsample::testing(split)

write.csv2(x =AB_tot_train,file = "datas/landW/AB_tot_train.csv", row.names = FALSE)
write.csv2(x =AB_tot_test,file = "datas/landW/AB_tot_test.csv", row.names = FALSE)
# 
# AB_tot_train = read.csv2("datas/landW/AB_tot_train.csv")
# AB_tot_test = read.csv2("datas/landW/AB_tot_test.csv")
# df_mod_AB_tot = rbind(AB_tot_train,AB_tot_test)

AB_tot_train = as.data.frame(AB_tot_train)
AB_tot_test = as.data.frame(AB_tot_test)

df <- data.frame(y =AB_tot_train[,"AB_tot"])
abundance_dist_train = ggplot(df, aes(x=y)) +
geom_histogram( fill="#69b3a2", color="#e9ecef", bins=30, alpha=2) +
  geom_density(fill="black", alpha=0.2) +
  theme_gray() +
  labs(title="Abundance: Train", x="Value", y="Density") +
  theme(plot.title = element_text(hjust = 0.5))
# ggsave("Results/landW/abundance_dist_train.png", plot = abundance_dist_train, dpi = 300,width = 3,height = 2)

df <- data.frame(y =AB_tot_test[,"AB_tot"])
abundance_dist_test = ggplot(df, aes(x=y)) +
geom_histogram( fill="#69b3a2", color="#e9ecef", bins=30, alpha=2) +
  geom_density(fill="black", alpha=0.2) +
  theme_gray() +
  labs(title="Abundance: Test", x="Value", y="Density") +
  theme(plot.title = element_text(hjust = 0.5))
# ggsave("Results/landW/abundance_dist_test.png", plot = abundance_dist_test, dpi = 300,width = 3,height = 2)

# Distrvitbution de var rep dans train et de test: est ce homogene ?
abundance_dist_train_and_test = ggarrange(abundance_dist_train, abundance_dist_test,
                          labels = c('(a)', '(b)'),
                          common.legend = TRUE,
                          legend = 'right'
)


ggsave("Results/landW/abundance_dist_train_and_test.png", plot = abundance_dist_train_and_test, dpi = 300,height = 2,width = 4)

Abundance

Data partition (3521, 48):
- train data (80 %) = 2816, 48
- test data (20 %) = 705, 48

Biomasse

Data partition (2304, 48):
- train data (80 %) = 1840, 48
- test data (20 %) = 464, 48

Richness

Data partition (3521, 48):
- train data (80 %) = 2816, 48
- test data (20 %) = 705, 48

8.4 GLM

Code

GLM <- function(var_rep, df_app, df_valid,family = 'gaussian'){
  
  
  var_predicteurs = names(df_app[,-1])
 
  df_app = df_app[,c(var_rep,var_predicteurs)]
  df_valid = df_valid[,c(var_rep,var_predicteurs)]
  
  formula <- substitute(var_rep ~ ., list(var_rep = as.name(var_rep)))
  
  
  # entrainement du modele sur le jeu d'entrainement
  modelglm<-glm(formula,family = family ,data = df_app)
  
  # Prediction sur le jeu de validation
  pred.GLM<-predict(modelglm,newdata=as.data.frame(df_valid[,var_predicteurs]))
  
  # Calcul du RMSE pour évaluer la qualite du modele
  rmse <- round (sqrt(mean((df_valid[,var_rep] - pred.GLM)^2,na.rm=TRUE)),2)
  
  
 # Calcul du R² ajusté pour train
  R_adj_train <- calcule_R2(df_app[,var_rep],  predict(modelglm, newdata=df_app))
  n_train <- nrow(df_app)
  p_train <- ncol(df_app) - 1
  r_adj_train <- 1 - ((1 - R_adj_train) * (n_train - 1) / (n_train - p_train - 1))
  
  # Calcul du R² 
  # R_adj_test <-calcule_R2(df_valid[,var_rep],pred.GLM)
  # n_test <- nrow(df_valid)
  # p_test <- ncol(df_valid) - 1
  # r_adj_test <- 1 - ((1 - R_adj_test) * (n_test - 1) / (n_test - p_test - 1))
  
  res <- rms::lrm(df_valid[,var_rep]  ~ pred.GLM, x= TRUE, y = TRUE)
  res = res$stats
  r_adj_test = round (res[["R2"]],2)
  
  MAE <- mean(abs(pred.GLM - df_valid[,var_rep]),na.rm=TRUE)
  
  # Round results
  rmse <- round(rmse, 2)
  r_adj_train <- round(r_adj_train, 2)
  r_adj_test <- round(r_adj_test, 2)
  MAE <- round(MAE, 2)
  
  # output
  results_df <- data.frame(Algorithms = "GLM",
                         Response_variables = var_rep,
                         R2_adjusted_train = r_adj_train,
                         R2_adjusted_test = r_adj_test,
                         RMSE = rmse,
                         MAE = MAE)
    
  
  results <- list(RMSE = rmse, R_adj_train = r_adj_train, R_adj_test = r_adj_test, MAE = MAE, model = modelglm,predit = pred.GLM, df = results_df)
  return(results)
}

Gaussian distribution

8.5 GAM

Code

GAM <- function(var_rep, df_app, df_valid, family = 'gaussian',method = "REML", interaction = FALSE){
  
  var_predicteurs = names(df_app[,-1])
  
  
  if (var_rep == "AB_tot"){ 

  modelgam<-gam(AB_tot ~ s(CaCO3) + s(gps_x) + s(N) + s(tasmax) + s(gps_y) + s(clay) + s(silt) + s(P) + s(bio12) + clcm_lvl3f + clcm_lvl3gua + clcm_lvl3ng + clcm_lvl3nial + clcm_lvl3p + clcm_lvl3v,
        family=family,method = method,data = df_app)
  
  }
  
  
  
  
  if (var_rep == "BM_tot"){ 

  modelgam<-gam(BM_tot ~ s(CaCO3) + s(gps_x) + s(N) + s(tasmax) + s(gps_y) + s(clay) + s(silt) + s(P) + s(bio12) + clcm_lvl3f + clcm_lvl3gua + clcm_lvl3ng + clcm_lvl3nial + clcm_lvl3p + clcm_lvl3v,
        family=family,method = method,data = df_app)
  
    
  }
  
  
  
  if(var_rep == "Richesse_tot"){ 
    
  modelgam<-gam(Richesse_tot ~ s(CaCO3) + s(gps_x) + s(N) + s(tasmax) + s(gps_y) + s(clay) + s(silt) + s(P) + s(bio12) + clcm_lvl3f + clcm_lvl3gua + clcm_lvl3ng + clcm_lvl3nial + clcm_lvl3p + clcm_lvl3v ,
        family=family,method = method,data = df_app)
   
  }
  
  
  # Prediction sur le jeu de validation
  pred.GAM <- predict(modelgam,newdata=as.data.frame(df_valid[,var_predicteurs]))
  
  # Calcul du RMSE pour évaluer la qualite du modele
  rmse <- sqrt(mean((df_valid[,var_rep] - pred.GAM)^2,na.rm=TRUE))

  
# Calcul du R² ajusté pour train
  R_adj_train <- calcule_R2(df_app[,var_rep],  predict(modelgam, newdata=df_app))
  n_train <- nrow(df_app)
  p_train <- ncol(df_app) - 1
  r_adj_train <- 1 - ((1 - R_adj_train) * (n_train - 1) / (n_train - p_train - 1))
  
  # Calcul du R² ajusté pour test
  # R_adj_test <-calcule_R2(df_valid[,var_rep],pred.GAM)
  # n_test <- nrow(df_valid)
  # p_test <- ncol(df_valid) - 1
  # r_adj_test <- 1 - ((1 - R_adj_test) * (n_test - 1) / (n_test - p_test - 1))
    res <- rms::lrm(df_valid[,var_rep]  ~ pred.GAM, x= TRUE, y = TRUE)
  res = res$stats
  r_adj_test = round (res[["R2"]],2)
  

  # Calcule le MAE
  MAE <- mean(abs(pred.GAM - df_valid[,var_rep]))
  
  # Round results
  rmse <- round(rmse, 2)
  r_adj_train <- round(r_adj_train, 2)
  r_adj_test <- round(r_adj_test, 2)
  MAE <- round(MAE, 2)
  
  
  # output
  results_df <- data.frame(Algorithms = "GAM",
                         Response_variables = var_rep,
                         R2_adjusted_train = r_adj_train,
                         R2_adjusted_test = r_adj_test,
                         RMSE = rmse,
                         MAE = MAE)
  
  
  results <- list(RMSE = rmse, R_adj_train = r_adj_train, R_adj_test = r_adj_test, MAE = MAE, model = modelgam, predit = pred.GAM, df = results_df)
  
  return(results)
}

Family = gaussian
Link function = identity
Method = REML
Tuning

8.6 RF

Default model

RF model tuning by grid
ntree = $100,300,500,700,900,1000,1300,1500,1700,2000$
mtry = $2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24$
maxnodes = $10 , 20, 30, 40, 50, 60, 70, 80, 90, 100$

Total number of models = $ntree * mtry * maxnode = 960$

Validation of models on test data

Code

ForetAlea <- function(var_rep, df_app, df_valid, mtry, ntree, maxnodes) {
  
  set.seed(1863)
  col_posi <- which(names(df_app) == var_rep)
  ForeVDT <- randomForest::randomForest(df_app[-col_posi], df_app[[col_posi]], mtry = mtry, ntree = ntree, maxnodes = maxnodes)
  
  # Prediction on the validation dataset
  col_posi <- which(names(df_valid) == var_rep)
  pred.RF <- predict(ForeVDT, newdata = df_valid[, -col_posi])
  
  # Calculate RMSE to evaluate model quality
  rmse <- sqrt(mean((df_valid[, col_posi] - pred.RF)^2))
  
  
  # Calcul du R² ajusté pour train
  R_adj_train <- calcule_R2(df_app[,var_rep],  predict(ForeVDT, newdata=df_app))
  n_train <- nrow(df_app)
  p_train <- ncol(df_app) - 1
  r_adj_train <- 1 - ((1 - R_adj_train) * (n_train - 1) / (n_train - p_train - 1))
  
  # Calcul du R² ajusté pour test
  # R_adj_test <-calcule_R2(df_valid[,col_posi],pred.RF)
  # n_test <- nrow(df_valid)
  # p_test <- ncol(df_valid) - 1
  # r_adj_test <- 1 - ((1 - R_adj_test) * (n_test - 1) / (n_test - p_test - 1))
  res <- rms::lrm(df_valid[,var_rep]  ~ pred.RF, x= TRUE, y = TRUE)
  res = res$stats
  r_adj_test = round (res[["R2"]],2)
  
  # Calculate MAE
  MAE <- mean(abs(pred.RF - df_valid[, col_posi]))
  
  # Round results
  rmse <- round(rmse, 2)
  r_adj_train <- round(r_adj_train, 2)
  r_adj_test <- round(r_adj_test, 2)
  MAE <- round(MAE, 2)
  
    # output
  results_df <- data.frame(Algorithms = "RF",
                         Response_variables = var_rep,
                         R2_adjusted_train = r_adj_train,
                         R2_adjusted_test = r_adj_test,
                         RMSE = rmse,
                         MAE = MAE)
  
  
  results <- list(RMSE = rmse, R_adj_train = r_adj_train, R_adj_test = r_adj_test, MAE = MAE, model = ForeVDT, predit = pred.RF, df = results_df)
  
  return(results)
}

8.7 GBM

Default model
GBM model tuning by grid
n.trees = $1000, 1500, 1700, 2000, 3000$
shrinkage = $0.01, 0.02, 0.05, 0.001, 0.002, 0.005$
interaction.depth = $3, 5, 6, 8, 10$
n.minobsinnode = $2, 5, 10, 30, 50, 70$

Total number of models = $n.trees * shrinkage * interaction.depth * n.minobsinnode = 900$

Validation of models on test data

Code

GBM <- function(var_rep, df_app, df_valid,distribution = 'gaussian',n.trees ,shrinkage,interaction.depth,n.minobsinnode){

  formula <- substitute(var_rep ~ ., list(var_rep = as.name(var_rep)))

  Gradboost<-gbm(formula, data = df_app,
    distribution = distribution, 
    n.trees = n.trees,
    shrinkage = shrinkage,
    interaction.depth = interaction.depth,
    n.minobsinnode = n.minobsinnode) 
  
  # Prediction sur le jeu de validation :
   col_posi <- which(names(df_valid) == var_rep)
  prev.GBM<-predict(Gradboost,newdata=as.data.frame(df_valid[,-col_posi]))
 
  # Calcul du RMSE pour évaluer la qualité du modele
  rmse <- sqrt(mean((df_valid[,var_rep] - prev.GBM)^2))


# Calcul du R² ajusté pour train
  R_adj_train <- calcule_R2(df_app[,var_rep],  predict(Gradboost, newdata=df_app))
  n_train <- nrow(df_app)
  p_train <- ncol(df_app) - 1
  r_adj_train <- 1 - ((1 - R_adj_train) * (n_train - 1) / (n_train - p_train - 1))
  
  # Calcul du R² ajusté pour test
  # R_adj_test <-calcule_R2(df_valid[,col_posi],prev.GBM)
  # n_test <- nrow(df_valid)
  # p_test <- ncol(df_valid) - 1
  # r_adj_test <- 1 - ((1 - R_adj_test) * (n_test - 1) / (n_test - p_test - 1))
    res <- rms::lrm(df_valid[,var_rep]  ~ prev.GBM, x= TRUE, y = TRUE)
  res = res$stats
  r_adj_test = round (res[["R2"]],2)
  

  # calcule MAE
  MAE <- mean(abs(prev.GBM - df_valid[,col_posi])) 
  
    
    # Round results
  rmse <- round(rmse, 2)
  r_adj_train <- round(r_adj_train, 2)
  r_adj_test <- round(r_adj_test, 2)
  MAE <- round(MAE, 2)
  
  
      # output
  results_df <- data.frame(Algorithms = "GBM",
                         Response_variables = var_rep,
                         R2_adjusted_train = r_adj_train,
                         R2_adjusted_test = r_adj_test,
                         RMSE = rmse,
                         MAE = MAE)
  
  
  results <- list(RMSE = rmse, R_adj_train = r_adj_train, R_adj_test = r_adj_test, MAE = MAE, model = Gradboost, predit = prev.GBM, df = results_df)
  
  
  return(results)
}

8.8 ANN

8.9 Compilation

ANN AB_tot

ANN BM_tot

ANN Richesse_tot

Summary of Compilations

For GLM and GAM, there were 19 explanatory variables => $18*5*3*2 = 540$
For RF, GBM, and ANN, there were 41 variables => $40*5*3*3 = 1800$

Excluding optimization models

9 R² ajustés

9.1 R² dans les modèles de régression non linéaire

“Le R² ajusté est utile pour les modèles de régression linéaire, il est moins pertinent pour les modèles de machine learning en raison de la complexité des modèle.”

La formule du coefficient de détermination $R^2$ est :

\[\begin{equation} R^2 = 1 - \frac{\sum_{i=1}^{n} (y_i - \hat{y}_i)^2}{\sum_{i=1}^{n} (y_i - \bar{y})^2} \end{equation}\]

où :

$ y_i $ est la valeur observée,
$ _i $ est la valeur prédite par le modèle,
$ {y} $ est la moyenne des valeurs observées,
$ n $ est le nombre total d’observations.

La formule du $R²$ ajusté est :

\[\begin{equation} R^2_{\text{ajusté}} = 1 - \left( \frac{(1 - R^2) \cdot (n - 1)}{n - k - 1} \right) \end{equation}\]

où :

$R²$ est le coefficient de détermination,
$n$ est le nombre total d’observations,
$k$ est le nombre de prédicteurs (variables indépendantes) dans le modèle.

Justificatifs

See towardsdatascience.com

“La procédure d’ajustement du modèle des models non linéaires ne repose pas sur la minimisation progressive de la somme des carrés des erreurs résiduelles (RSS) et, par conséquent, le modèle ajusté de manière optimale pourrait avoir une somme des carrés résiduels supérieure à la somme totale des carrés. Cela signifie que R² pour de tels modèles peut être une quantité négative. En tant que tel, R² n’est pas une mesure utile de la qualité de l’ajustement pour la plupart des modèles non linéaires.”

“Les modèles non linéaires utilisent souvent des techniques d’ajustement de modèle telles que l’estimation de vraisemblance maximale (MLE) qui ne minimisent pas nécessairement la somme des carrés résiduels (RSS). Ainsi, étant donné deux modèles non linéaires ajustés à l’aide de MLE, celui qui présente la meilleure qualité d’ajustement peut se révéler avoir un R² ou un R² ajusté plus faible. Une autre conséquence de ce fait est que l’ajout de variables de régression aux modèles non linéaires peut réduire le R². Globalement, le R² ou le R² ajusté ne doivent pas être utilisés pour juger de la qualité d’ajustement d’un modèle de régression non linéaire.”

“De nombreux modèles de régression non linéaires n’utilisent pas la technique d’estimation des moindres carrés ordinaires pour ajuster le modèle.”

10 LandWorm results: total abundance

Abundance: best models

10.1 Abundance: GLM

The best model of GLM is :

$mod_11 = glm(AB_tot ~ pet_sd_6 + tas_sd_6 + pr_mean_120 + P + Clay + \\ Sand + CN + pH + LC_, df = data)$

10.2 Abundance: GAM

The best model of gam is:

$mod_7 = gam(AB_tot ~ tas_sd_6 + tas_sd_3 + Sand + tas_sd_120 +\\ K + pet_sd_6 + pH + CN + P + tas_mean_6 + pr_mean_3 + tas_mean_120 + LC_, df = data)$

10.3 Abundance: RF

The best model of RF is:

$mod_28 = rf(AB_tot ~ CN + pr_mean_6 + pet_mean_3 + K + tasmin_sd_6 + \\ tasmin_mean_120 + tasmin_mean_6 + Clay + pr_sd_120 + tas_mean_120 + CaCO3 + \\tas_sd_120 + pH + 4LC_ , df = data)$

10.4 Abundance: GBM

The best model of GBM is:

$mod_17 = gbm(AB_tot ~ tasmin_sd_6 + tasmax_mean_6 + Sand + pr_mean_120 + tas_mean_6 +\\ pet_sd_3 + CaCO3 + tasmin_mean_120 + CN + K + tasmax_mean_120 + pet_sd_6 + pr_sd_6 +\\ P + Silt + tas_sd_6 + pr_mean_6 + pet_mean_3 + tasmin_mean_6 + pr_sd_120 + Clay + pH + \\ tas_mean_120 + pr_mean_3 + LC_, df = data)$

10.5 Abundance: ANN

The best model of ANN is:

$mod_27 = ann(AB_tot ~ pr_sd_6 + pH + P + pr_sd_120 + K + tasmin_sd_3 + \\ pet_sd_3 + pr_sd_3 + pet_mean_120 + tas_sd_120 + CaCO3 + Silt + tasmax_mean_120 + \\ tasmin_sd_120 + LC_ , df = data)$

10.6 Abundance: best models

Models	Rep.var	R2_test	RMSE	MAE
GLM	Abundance	0.31	5.65	4.52
GAM	Abundance	0.36	5.43	4.28
RF	Abundance	0.46	5.00	3.87
GBM	Abundance	0.36	5.55	4.39
ANN	Abundance	0.33	6.20	4.72

Meta models

Models	Rep.var	R2_test	RMSE	MAE
averaging	Abundance	0.44	5.29	4.15
Stacking	Abundance	0.47	4.94	3.83

GLM

$mod = glm(AB_tot ~ pet_sd_6 + tas_sd_6 + pr_mean_120 + P + Clay + \\ Sand + CN + pH + LC_, df = data)$

GAM

$mod = gam(AB_tot ~ tas_sd_6 + tas_sd_3 + Sand + tas_sd_120 +\\ K + pet_sd_6 + pH + CN + P + tas_mean_6 + pr_mean_3 + tas_mean_120 + LC_, df = data)$

$mod = rf(AB_tot ~ CN + pr_mean_6 + pet_mean_3 + K + tasmin_sd_6 + \\ tasmin_mean_120 + tasmin_mean_6 + Clay + pr_sd_120 + tas_mean_120 + CaCO3 + \\tas_sd_120 + pH + 4LC_ , df = data)$

GBM

$mod= gbm(AB_tot ~ tasmin_sd_6 + tasmax_mean_6 + Sand + pr_mean_120 + tas_mean_6 +\\ pet_sd_3 + CaCO3 + tasmin_mean_120 + CN + K + tasmax_mean_120 + pet_sd_6 + pr_sd_6 +\\ P + Silt + tas_sd_6 + pr_mean_6 + pet_mean_3 + tasmin_mean_6 + pr_sd_120 + Clay + pH + \\ tas_mean_120 + pr_mean_3 + LC_, df = data)$

ANN

$mod = ann(AB_tot ~ pr_sd_6 + pH + P + pr_sd_120 + K + tasmin_sd_3 + \\ pet_sd_3 + pr_sd_3 + pet_mean_120 + tas_sd_120 + CaCO3 + Silt + tasmax_mean_120 + \\ tasmin_sd_120 + LC_ , df = data)$

Conclusion: le meillleur model pour l’abondance est le RF

11 LandWorm results: total biomass

Biomass: best models

11.1 Biomass: GLM

The best model of GLM is :

$mod_6 = glm(BM_tot ~ N + pet_mean_6 + pet_sd_6 + tas_sd_120 + P + Sand + \\ Clay + C_org + tas_sd_6 + tas_sd_3 + pr_sd_3 + CN + tas_mean_6 + pH + \\ pr_mean_120 + LC_, df = data)$

11.2 Biomass: GAM

The best model of gam is:

$mod_5 = gam(BM_tot ~ P + CN + K + C_org + Sand + tas_sd_3 + pet_sd_6 + pr_sd_3 + tas_sd_6 + tas_mean_120 + pr_mean_3 + pH + pr_mean_120 + LC_ + LC_ , df = data)$

11.3 Biomass: RF

The best model of RF is:

$mod_34 = rf(BM_tot ~ pr_mean_120 + Sand + pH + pet_sd_120 + tasmax_sd_3 + LC_ + tasmin_sd_120 + pet_mean_3 , df = data)$

11.4 Biomass: GBM

The best model of GBM is:

$mod_25 = gbm(BM_tot ~ pr_mean_6 + pet_sd_6 + tasmax_sd_3 + tasmin_mean_120 + \\P + pr_mean_120 + pr_sd_3 + pH + pr_sd_120 + tasmax_sd_6 + pet_mean_3 + Silt + Sand +\\ tasmax_mean_6 + pr_mean_3 + LC_ + tasmin_sd_3 , df = data)$

11.5 Biomass: ANN

The best model of ANN is:

$mod_15 = ann(BM_tot ~ pr_mean_3 + tasmax_mean_120 + pet_mean_120 + pr_sd_6 + \\Sand + tasmin_sd_6 + pet_sd_120 + K + Silt + tasmax_mean_6 + tasmin_sd_3 +\\ pet_sd_6 + pet_mean_3 + tas_mean_6 + tasmax_mean_3 + P + C_org + CN + tas_sd_3 + pr_sd_3 + \\tasmin_sd_120 + pet_sd_3 + tasmin_mean_3 + pr_mean_6 + pr_mean_120 + pH + LC_, df = data)$

11.6 Biomass: best models

Models	Rep.var	R2_test	RMSE	MAE
GLM	Biomass	0.39	3.10	2.46
GAM	Biomass	0.49	2.86	2.24
RF	Biomass	0.56	2.62	2.08
GBM	Biomass	0.47	2.91	2.31
ANN	Biomass	0.50	3.05	2.35

Meta models

Models	Rep.var	R2_test	RMSE	MAE
Averaging	Biomass	0.54	2.74	2.15
Stacking	Biomass	0.57	2.58	2.05

GLM

$mod = glm(BM_tot ~ N + pet_mean_6 + pet_sd_6 + tas_sd_120 + P + Sand + \\ Clay + C_org + tas_sd_6 + tas_sd_3 + pr_sd_3 + CN + tas_mean_6 + pH + \\ pr_mean_120 + LC_, df = data)$

GAM

$mod = gam(BM_tot ~ P + CN + K + C_org + Sand + tas_sd_3 + pet_sd_6 + pr_sd_3 + tas_sd_6 + tas_mean_120 + pr_mean_3 + pH + pr_mean_120 + LC_ + LC_ , df = data)$

$mod = rf(BM_tot ~ pr_mean_120 + Sand + pH + pet_sd_120 + tasmax_sd_3 + LC_ + tasmin_sd_120 + pet_mean_3 , df = data)$

GBM

$mod= gbm(BM_tot ~ pr_mean_6 + pet_sd_6 + tasmax_sd_3 + tasmin_mean_120 + \\P + pr_mean_120 + pr_sd_3 + pH + pr_sd_120 + tasmax_sd_6 + pet_mean_3 + Silt + Sand +\\ tasmax_mean_6 + pr_mean_3 + LC_ + tasmin_sd_3 , df = data)$

ANN

$mod = ann(BM_tot ~ pr_mean_3 + tasmax_mean_120 + pet_mean_120 + pr_sd_6 + \\Sand + tasmin_sd_6 + pet_sd_120 + K + Silt + tasmax_mean_6 + tasmin_sd_3 +\\ pet_sd_6 + pet_mean_3 + tas_mean_6 + tasmax_mean_3 + P + C_org + CN + tas_sd_3 + pr_sd_3 + \\tasmin_sd_120 + pet_sd_3 + tasmin_mean_3 + pr_mean_6 + pr_mean_120 + pH + LC_, df = data)$

Conclusion: le meillleur model pour la biomasse est le RF

12 LandWorm results: total Richness

Richness: best models

12.1 Richness: GLM

The best model of GLM is :

$mod_7 = glm(Richesse_tot ~ N + pr_sd_3 + tas_mean_120 + CN + P + tas_sd_6 + pet_sd_6 + pet_mean_6 + tas_mean_6 + tas_sd_120 + pr_mean_120 + pH + LC_, df = data)$

12.2 Richness: GAM

The best model of gam is:

$mod_4 = gam(Richesse_tot ~ Sand + K + pr_sd_3 + tas_sd_3 + CN + tas_sd_120 +\\ P + tas_sd_6 + tas_mean_120 + pet_sd_6 + pet_mean_6 + N + tas_mean_6 + pr_mean_120 +\\ pH + LC_ , df = data)$

12.3 Richness: RF

The best model of RF is:

$mod_24 = rf(Richesse_tot ~ pr_sd_3 + tasmin_sd_120 + pr_sd_120 + tasmin_sd_6 + K + tas_mean_120 + pr_mean_6 + Silt + P + CaCO3 + pr_mean_120 + tasmax_sd_3 + tas_sd_6 + tas_sd_120 + tasmax_sd_120 + pH + pet_sd_120 + LC_, df = data)$

12.4 Richness: GBM

The best model of GBM is:

$mod_9 = gbm(Richesse_tot ~ pr_mean_3 + Clay + tasmax_mean_6 + pr_mean_120 + tasmax_sd_120 + tasmin_sd_6 + CaCO3 + pr_sd_6 + N + K + pr_mean_6 + pr_sd_120 + Sand + tasmin_sd_3 + P + pH + pet_sd_120 + LC_, df = data)$

12.5 Richness: ANN

The best model of ANN is:

$mod_33 = ann(Richesse_tot ~ P + tasmin_sd_120 + pr_sd_3 + tasmin_sd_3 + pr_sd_120 + pr_mean_6 + Silt + pH + LC_, df = data)$

12.6 Richness: best models

Models	Rep.var	R2_test	RMSE	MAE
GLM	Richness	0.37	1.92	1.54
GAM	Richness	0.48	1.75	1.38
RF	Richness	0.57	1.61	1.24
GBM	Richness	0.43	1.85	1.48
ANN	Richness	0.44	2.13	1.65

Meta models

Models	Rep.var	R2_test	RMSE	MAE
Averaging	Richness	0.52	1.73	1.36
Stacking	Richness	0.58	1.58	1.23

GLM

$mod = glm(Richesse_tot ~ N + pr_sd_3 + tas_mean_120 + CN + P + tas_sd_6 + pet_sd_6 + pet_mean_6 + tas_mean_6 + tas_sd_120 + pr_mean_120 + pH + LC_, df = data)$

GAM

$mod = gam(BM_tot ~ P + CN + K + C_org + Sand + tas_sd_3 + pet_sd_6 + pr_sd_3 + tas_sd_6 + tas_mean_120 + pr_mean_3 + pH + pr_mean_120 + LC_ + LC_ , df = data)$

$mod = rf(Richesse_tot ~ pr_sd_3 + tasmin_sd_120 + pr_sd_120 + tasmin_sd_6 + K + tas_mean_120 + pr_mean_6 + Silt + P + CaCO3 + pr_mean_120 + tasmax_sd_3 + tas_sd_6 + tas_sd_120 + tasmax_sd_120 + pH + pet_sd_120 + LC_, df = data)$

GBM

$mod= gbm(Richesse_tot ~ pr_mean_3 + Clay + tasmax_mean_6 + pr_mean_120 + tasmax_sd_120 + tasmin_sd_6 + CaCO3 + pr_sd_6 + N + K + pr_mean_6 + pr_sd_120 + Sand + tasmin_sd_3 + P + pH + pet_sd_120 + LC_, df = data)$

ANN

$mod = ann(Richesse_tot ~ P + tasmin_sd_120 + pr_sd_3 + tasmin_sd_3 + pr_sd_120 + pr_mean_6 + Silt + pH + LC_, df = data)$

Conclusion: le meillleur model pour la Richesse_tot est le RF

13 Best of the best models

13.1 Performances

Models	Rep.var	R2_adj_train	R2_test	RMSE	MAE
RF	Abundance	0.83	0.46	5.00	3.87
RF	Biomass	0.84	0.56	2.62	2.08
RF	Richness	0.87	0.57	1.61	1.24

13.2 Models

Abundance

$mod = rf(AB_tot ~ CN + pr_mean_6 + pet_mean_3 + K + tasmin_sd_6 + \\ tasmin_mean_120 + tasmin_mean_6 + Clay + pr_sd_120 + tas_mean_120 + CaCO3 + \\tas_sd_120 + pH + 4LC_ , df = data)$

Biomass

$mod = rf(BM_tot ~ pr_mean_120 + Sand + pH + pet_sd_120 + tasmax_sd_3 + \\ LC_ + tasmin_sd_120 + pet_mean_3 , df = data)$

Richness

$mod = rf(Richesse_tot ~ pr_sd_3 + tasmin_sd_120 + pr_sd_120 + tasmin_sd_6 +\\ K + tas_mean_120 + pr_mean_6 + Silt + P + CaCO3 + pr_mean_120 + tasmax_sd_3 +\\ tas_sd_6 + tas_sd_120 + tasmax_sd_120 + pH + pet_sd_120 + LC_, df = data)$

13.3 Optimisation des bests models

____________Befor_____________

Models	Rep.var	R2_adj_train	R2_test	RMSE	MAE
RF	Abundance	0.83	0.46	5.00	3.87
RF	Biomass	0.84	0.56	2.62	2.08
RF	Richness	0.87	0.57	1.61	1.24

____________After_____________

Models	Rep.var	R2_adj_train	R2_test	RMSE	MAE
RF	Abundance	0.82	0.46	4.99	3.87
RF	Biomass	0.85	0.56	2.62	2.08
RF	Richness	0.87	0.57	1.59	1.23

13.4 Graphes

13.5 Comparaison

14 Importance des variables

14.1 Abundance

Predictors importances
Predictors effects
Predictors interactions

14.2 Biomass

Predictors importances
Predictors effects
Predictors interactions

14.3 Richness

Predictors importances
Predictors effects
Predictors interactions

15 Prediction and mapping

Reading layer `france_shapefile_sans_corse' from data source 
  `C:\Users\diall\OneDrive\Bureau\M2_MODE\stage_abdou_m2\R_Stage_M2\cartographie\france_shapefile_sans_corse.shp' 
  using driver `ESRI Shapefile'
Simple feature collection with 12 features and 4 fields
Geometry type: MULTIPOLYGON
Dimension:     XY
Bounding box:  xmin: -5.141277 ymin: 42.33292 xmax: 8.232497 ymax: 51.08899
Geodetic CRS:  +proj=longlat +datum=WGS84 +no_defs

15.1 Extraction

Datas preparations

Land use
Soils predictors
Climate predictors

15.2 Import best models and predictions

Abundance

Code

# mod_AB_tot = readRDS("cartographie/best_mod/RF_mod_AB_tot.RDS")
# 
# AB_tot_pred<- predict(mod_AB_tot, newdata = predictors_trans)
# AB_tot_pred = round(as.numeric(AB_tot_pred))
# # summary(AB_tot_pred)
# AB_tot_pred = data.frame(Abundance = AB_tot_pred, Longitude = predictors$gps_x, Latitude = predictors$gps_y)
# 
# write.csv(x =AB_tot_pred,file = "cartographie/AB_tot_pred.csv",row.names = FALSE)
# 
# saveRDS(AB_tot_pred, "cartographie/AB_tot_pred.rds")
# 
# AB_tot_pred = readRDS("cartographie/AB_tot_pred.rds")

Biomass

Code

# mod_BM_tot = readRDS("cartographie/best_mod/RF_mod_BM_tot.RDS")
# 
# BM_tot_pred<- predict(mod_BM_tot, newdata = predictors_trans)
# BM_tot_pred = round(as.numeric(BM_tot_pred))
# # summary(BM_tot_pred)
# BM_tot_pred = data.frame(Biomass = BM_tot_pred, Longitude = predictors$gps_x, Latitude = predictors$gps_y)
# 
# write.csv(x =BM_tot_pred,file = "cartographie/BM_tot_pred.csv",row.names = FALSE)
# 
# saveRDS(BM_tot_pred, "cartographie/BM_tot_pred.rds")

# BM_tot_pred = readRDS("cartographie/BM_tot_pred.rds")

Richness

Code

# mod_Richesse_tot = readRDS("cartographie/best_mod/RF_mod_Richesse_tot.RDS")
# 
# Richesse_tot_pred<- predict(mod_Richesse_tot, newdata = predictors_trans)
# Richesse_tot_pred = round(as.numeric(Richesse_tot_pred))
# # summary(Richesse_tot_pred)
# Richesse_tot_pred = data.frame(Richness = Richesse_tot_pred, Longitude = predictors$gps_x, Latitude = predictors$gps_y)
# 
# write.csv(x =Richesse_tot_pred,file = "cartographie/Richesse_tot_pred.csv",row.names = FALSE)
# 
# saveRDS(Richesse_tot_pred, "cartographie/Richesse_tot_pred.rds")
# 
# Richesse_tot_pred = readRDS("cartographie/Richesse_tot_pred.rds")

15.3 Interpolation

Sampling

800 m regular grid
Stores gps coordinates

Fig. Interpolation

15.4 Abundance

15.5 Biomass

15.6 Richness

15.7 Approximate earthworm diversity

15.8 Mapview

See Mapping earthworm prediction in France

15.9 Co-kriging

16 Writing

16.1 Cate M&M

16.2 Carte CLC

16.3 Redaction results

17 Questions

Quelles dates de prevelement pour la prediction

17.1 Additional information

All the material from my internship, including scripts and datasets, is available on my GitHub and my Rpubs.

Thank you for your attention