Check data: TERRE INOVIA V1

Abdou Diallo

2025-02-28

1 Sitting

Function

Code
# Converssion des colonne en num ou factor-----------------------------------------------
conv_col <- function (data, columns_to_convert, to_types, format = "%Y-%m-%d") {
  if (to_types == "numeric") {
    # Conversion des colonnes en numeric
    for (col in columns_to_convert) {
      data[, col] <- as.numeric(data[, col])
    }
  } 
  if (to_types == "factor") {
    # Conversion des colonnes en facteurs
    for (col in columns_to_convert) {
      data[, col] <- as.factor(data[, col])
    }
  }
  
  if (to_types == "character") {
    # Conversion des colonnes en facteurs
    for (col in columns_to_convert) {
      data[, col] <- as.character(data[, col])
    }
  }

  if (to_types == "date") {
    format = format
    # Conversion des colonnes en facteurs
    for (col in columns_to_convert) {
      data[, col] <- as.Date(data[, col], format = format)
      data[, col] <- as.character(data[, col])
    }
  }
  
  
  return(data)
}
#data_converted <- conv_col(data, names(data [, c(1, 3)]), "factor")


# calcule de % NA -----------------------------------------------
percent_na <- function(df, col) {
  if (!col %in% colnames(df)) {
    stop("❌ La colonne spécifiée n'existe pas dans le dataframe.")
  }
  
  na_count <- sum(is.na(df[[col]]))  # Nombre de NA
  total <- nrow(df)  # Nombre total de lignes
  
  percent <- (na_count / total) * 100  # Calcul du %
  return( paste (round(percent, 2)," %" ))# Retourne un résultat arrondi
}



## Identification des NA dans un df -----------------------------------------------
taux_completion<-
  function(df, afficher_zero_percent = FALSE, seuil, trie=FALSE) {
    # Calcule du pourcentage de NA dans le dataframe
    pourcentage_total <-
      round(sum(is.na(df)) / (nrow(df) * ncol(df)) * 100, 1)
    
    # Calcule du pourcentage de NA par colonne
    pourcentage_colonnes <- round(colMeans(is.na(df)) * 100, 1)
    
    # Creation d'un dataframe résultat avec deux colonnes
    result <-
      data.frame(
        Variables = names(df),
        CR = pourcentage_colonnes,
        row.names = NULL
      )
    
    if (afficher_zero_percent) {
      result <- result[result$CR == 0, ]
      result$CR = 100 -result$CR
    } else {
      result <- result[result$CR > 0, ]
      result$CR = 100 -result$CR
      
    }
    
    result <- rbind(result, c("Total", pourcentage_total))
    #result <- rbind(result, c("Total", paste0(pourcentage_total, "")))
    
    result <- result[, c("Variables", "CR")]
    result$CR = as.numeric(result$CR)
    result$CR = round(result$CR,1)
    if (trie){
      result = result %>% arrange(desc(CR))
    }
    result$CR = paste0(result$CR,"%")
    
    return(result)
  }


# exploration graphiques des variables numeriques -----------------------------------------------
explo_num <- function(nom_col, titre, df = landworm, ligne_col = c(1, 2),mini = min(df[[nom_col]]), maxi=max(df[[nom_col]]) ) {
  par(mfrow = ligne_col)
  
  df[complete.cases(df[[nom_col]]), ]
  df <- df %>%dplyr::filter(!is.na(df[[nom_col]]))
  df[[nom_col]] = as.numeric(df[[nom_col]])
  # Boxplot
  # boxplot(df[[nom_col]], col = 'blue', ylab = titre, ylim = c(mini, maxi))
  # Cleveland plot
  # dotchart(df[[nom_col]], pch = 16, col = 'blue', xlab = titre)
  plot(df[[nom_col]], pch = 16, col = 'blue', ylab = titre)
  # Histogram
  hist(df[[nom_col]], col = 'blue', xlab = titre, main = "")
  # Quantile-Quantile plot
  # qqnorm(df[[nom_col]], pch = 16, col = 'blue', xlab = '')
  # qqline(df[[nom_col]], col = 'red') 
}

# Extraction des predictors + moyennes -----------------------------------------------

extraction <- function(nom_col, tif_file_path, df = landworm, conv = 1) {
  #df <- df %>%filter(!is.na(gps_x) & !is.na(gps_y))
  raster_data <- raster(tif_file_path)
  
  # Création d'un dataframe pour stocker les valeurs extraites
  df_interne <- data.frame(gps_x = df$gps_x, gps_y = df$gps_y)
  proj4Str <- "+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs"
  # Transformer les coordonnées GPS en systeme de coordonnées du raster
  gps_coords_sp <- SpatialPoints(df_interne, proj4string = CRS(proj4Str))
  gps_coords_proj <- spTransform(gps_coords_sp, crs(raster_data))
  
  # Extraction des valeurs du raster 
  values <- raster::extract(raster_data, gps_coords_proj)
  
  # Ajout des valeurs extraites comme nouvelles colonnes a df
  #df_save = data.frame()
  #df_save[[nom_col]] <- values / conv
  
  df[[nom_col]] <- values / conv
  
  return(df)
}

# la moyenne des predictores -----------------------------------------------
moyenne_val_extrct <- function(nom_col, vec_col, df=landworm) {
  df[[nom_col]] <- rowMeans(as.matrix(df[, vec_col, drop = FALSE]), na.rm = TRUE)
  df[[nom_col]] = round(df[[nom_col]],1)
  return(as.data.frame(df))
}


# tests de corrélation avec un seuil -----------------------------------------------
cor_function_seuil <- function(data, seuil,affiche=FALSE) {
  # Création d'un vecteur pour stocker les paires de variables corrélées
  variables_corr <- c()
  
  # Boucle pour tester la corrélation entre chaque paire de variables
  for (i in 1:(ncol(data) - 1)) {
    for (j in (i + 1):ncol(data)) {
      # Calcul de la corrélation entre les variables i et j
      cor_value <- stats::cor(data[, i], data[, j], use = "na.or.complete")
      
      # Stockage du résultat dans le vecteur si supérieur au seuil
      if (cor_value >= seuil | cor_value <= -seuil) {
        if(affiche){
        cat(
          "***",
          colnames(data)[i],
          "  __est correlee a__  ",
          colnames(data)[j],
          "avec un R =",
          cor_value,
          "\n \n \n"
        )
      }
        
        variables_corr <-
          c(variables_corr, colnames(data)[i], colnames(data)[j])
      }
    }
  }
  
  return(variables_corr)
}


# tests de valeurs aberant -----------------------------------------------
test_grub <- function(data, variable, direction = "maxi") {
  
  if (direction == "maxi") { 
    repeat {
      #le test de Grubbs
      test_aberrant <- grubbs.test(data[[variable]], opposite = FALSE)
      
      # Obtenir la p-valeur du test
      p.value <- test_aberrant$p.value
      # Si la p-valeur est inférieure au seuil de 0.05, on supprime la valeur aberrante
      if (p.value < 0.05) {
        max_value <- max(data[[variable]],na.rm=TRUE)
        data <- subset(data, data[[variable]] != max_value | is.na(data[[variable]]))
      } else {
        # S'il n'y a plus de valeurs aberrantes, sortir de la boucle
        break
      }
    }
  }
  
  
  if (direction == "mini") { 
    repeat {
      test_aberrant <- grubbs.test(data[[variable]], opposite = TRUE)
      # Obtenir la p-valeur du test
      p.value <- test_aberrant$p.value
      # Si la p-valeur est inférieure au seuil de 0.05, on supprime la valeur aberrante
      if (p.value < 0.05) {
        min_value <- min(data[[variable]],na.rm=TRUE)
        data <- subset(data, data[[variable]] != min_value | is.na(data[[variable]]))
      } else {
        # S'il n'y a plus de valeurs aberrantes, sortir de la boucle
        break
      }
    }
  }
  
  
  return(data)
}




# boxplote -----------------------------------------------
plot_boxplot <-function(donnee,
           x_col,y_col,x_label,y_label,title,legend_title,
           couleurs,
           affiche_point = TRUE,
           ymin = min(donnee[[y_col]]),
           ymax = 1.2 * max(donnee[[y_col]])) {
    
  graphe <-ggplot(donnee,
             aes_string(
               x = x_col,
               y = y_col,
               colour = x_col
             )) +
  geom_boxplot(
        outlier.shape = NA,
        outlier.colour = "black",
        alpha = 0.20,
        size = 1.5 
      ) +
  labs(title = title,x = x_label,y = y_label) +
  scale_color_manual(values = couleurs, name = legend_title) +
  theme_classic(base_size = 12, base_family = "Arial") +
  theme(axis.text = element_text(size = 10),
        axis.title.y = element_text(
          vjust = 5, size = 12, face = "bold"),
        axis.title.x = element_text(face = "bold"),
        axis.ticks.length = unit(0.2, "cm"),
        legend.position = "none",  # Cette ligne supprime la legende
        #legend.position = "right",
        legend.text = element_text(size = 10),
        legend.title = element_text(size = 12, face = "bold"),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        plot.title = element_text(size = 14, face = "bold"),
        plot.margin = unit(c(0.5, 0.5, 0.5, 0.5), "cm")
      )
    if (affiche_point) {
      graphe <-
        graphe + geom_jitter(position = position_jitter(seed = 0.5), size = 0.8)
    }
    
    if (y_col %in% names(donnee)) {
      graphe <- graphe +
        coord_cartesian(ylim = c(ymin, ymax))
    }
  
    graphe = graphe + stat_summary(
      fun.y = mean,
      geom = "point",
      shape = 15,
      size = 1.5,
      col = "black",
      fill = "black"
    )
    
    return(graphe)
}



#pour le  pairwise.t.test() -----------------------------------------------------
tri.to.squ <- function(x) {
  rn <- row.names(x)
  cn <- colnames(x)
  an <- unique(c(cn, rn))
  myval <- x[!is.na(x)]
  mymat <-
    matrix(
      1,
      nrow = length(an),
      ncol = length(an),
      dimnames = list(an, an)
    )
  for (ext in 1:length(cn))
  {
    for (int in 1:length(rn))
    {
      if (is.na(x[row.names(x) == rn[int], colnames(x) == cn[ext]]))
        next
      mymat[row.names(mymat) == rn[int], colnames(mymat) == cn[ext]] <-
        x[row.names(x) == rn[int], colnames(x) == cn[ext]]
      mymat[row.names(mymat) == cn[ext], colnames(mymat) == rn[int]] <-
        x[row.names(x) == rn[int], colnames(x) == cn[ext]]
    }
  }
  return(mymat)
}



# Selection interaction -------------------------------
select_inter <- function(response_var, df, explanatory_vars) {
  results <- data.frame()
  combinations <- combn(explanatory_vars, 2, simplify = FALSE)

  for(i in seq_along(combinations)) {

    formula <- as.formula(paste(response_var, "~", paste(combinations[[i]], collapse = "*")))
    model <- gam(formula, data = df)
    r_squared <- summary(model)$r.sq
    aic <- AIC(model)
    results <- rbind(results, data.frame("variables" = paste0(combinations[[i]], collapse = ".inter."), 
                                         "r_squared" = r_squared, 
                                 "aic" = aic))
  }
  return(results)
}

# Comparaion betwen predtited and observed -----------------------------------
plot_comp = function (df,ylabel, title_class, legende = TRUE,plotly = FALSE,xlabel = "observations",title=""){ 

  
  p = ggplot(df, aes(x = observation)) + 
  #graph representant observed
  geom_point(aes(y = Observed, color = "Observed values")) +
  geom_line(aes(y = Observed, color = "Observed values")) + 
  
  #graph representant  preticted
  geom_point(aes(y = Predicted, color="Predicted values")) +
  geom_line(aes(y = Predicted, color="Predicted values")) + 
  # ggtitle(title)
  theme(plot.title = element_text(hjust = 0.5)) + 
  labs(title = title,x=xlabel, y=ylabel, color = "Legend :") + 
  ylim(min(c(min(df$Predicted), min(df$Observed))),
            max(c(max(df$Predicted), max(df$Observed)))+1  ) +
    
  scale_color_manual(values = c("Observed values"='red', "Predicted values"='green')) +
  annotate("text", x = 8, y =  max(c(max(df$Predicted), max(df$Observed)))+1, 
           label = title_class, col = "black", size = 3)

  
  if (!legende) {
    p <- p + theme(legend.position = "none")
  }
  
  if(plotly){
    p = ggplotly(p)
  }

return (p)

}


# Calcul R² -----------------------------------
# calcule_R2 = function(x, y) {cor(x, y)^2}
calcule_R2 <- function(y_true, y_pred) {
  sse <- sum((y_true - y_pred)^2)
  sst <- sum((y_true - mean(y_true))^2)
  r_squared <- 1 - (sse / sst)
  return(r_squared)
}



# Fonction d'exploration graphique des relations entre les vd et les predictors -----------------------------------
nuage_point <- function(df, var_rep, predicteur, titre_x, titre_y) {
  # Convertir les variables en numeriques
  df[[predicteur]] <- as.numeric(df[[predicteur]])
  df[[var_rep]] <- as.numeric(df[[var_rep]])
  
  df= df[,c(predicteur,var_rep)]
  df = drop_na(df)
  # correlation entre les deux variables
  correlation <- round(cor(df[[predicteur]], df[[var_rep]]), 3)
  
  #  le graphique
  g <- ggplot(df, aes(x = !!rlang::sym(predicteur), y = !!rlang::sym(var_rep))) +
    geom_point() + # Ajout des points
    geom_smooth(method = "lm", se = TRUE, color = "red") + 
    labs(
      # title = paste("Relationship between",var_rep, "and", predicteur),
         subtitle = paste0(" r = ", correlation),
         x = titre_x, 
         y = titre_y) +
    theme_classic()   
  
  return(g)
}

box_plot <- function(df, var_rep, predicteur, titre_x, titre_y) {
  g <- ggplot(df, aes(x = !!rlang::sym(predicteur), y = !!rlang::sym(var_rep))) +
    geom_boxplot() + # Ajout des boxplots
    labs(
      # title = paste("Boxplot of", var_rep, "by", predicteur),
         x = titre_x, 
         y = titre_y) +
    theme_classic()   
  
  print(g)
}

2 Exploration

2.1 Data base import

  • Import of bddbase Fichier_pois hiver variétés 2017-2018_post_complet_28072020.xlsx version V1 (february 21, 2025)
Code
# Chargement des données (Modifier le fichier si besoin)
path ="~/azodyn_check/AZODYN_optileg/data/Fichier_pois hiver variétés 2017-2018_post_complet_28072020.xlsx"
bdd <- readxl::read_excel(path,sheet = "data_modif")
bdd = as.data.frame(bdd)
  • The bddbase contains 286 rows and 43 columns
Code
tags$div(
  tags$h4("Title: Overview of data", style = "text-align: center;"), 
  DT::datatable(bdd, options = list(pageLength = 5))
)

Title: Overview of data

  • Define the type of variable
Code
bdd <- conv_col(data = bdd, columns_to_convert = names(bdd),to_types = "character") 

all_colums= c("Codes_essai_Labkey", "Organisme", "lieu", "Code_postal", "Code_station_meteo", "Commune_station_meteo", "Type_de_sol", "Precedent", "an", 
"sem_t", "cond", "geno", "psol_cm", "ru_mm", "sem_j", "dens_sem_g_m2", 
"lev_j", "df_j", "drg_j", "ff_j", "mat_j", "rec_j", "surf_m2", 
"h2o_pc", "rdt14_q_ha", "pb", "tp_pc", "pmg14_g", "np__m2", "etfru_nb", 
"asc_j", "asc_pc", "ht_ff", "ht_rec", "Herbicides_1", "Fongicides_1", 
"Fongicides_2", "Fongicides_3", "Insecticides_1", "Insecticides_2", 
"Insecticides_3", "Insecticides_4", "Chlorose")

empty_columns <- colnames(bdd)[colSums(is.na(bdd)) == nrow(bdd)]

date_columns <- c("sem_j","lev_j", "df_j","ff_j","rec_j")
bdd <- conv_col(data = bdd, columns_to_convert = date_columns,to_types = "date")

factor_colums = c("Codes_essai_Labkey", "Organisme", "lieu", "Code_postal", 
                  "Code_station_meteo", "Commune_station_meteo", "Type_de_sol","Precedent","an","geno"
                  )

bdd <- conv_col(data = bdd, columns_to_convert = factor_colums,to_types = "factor")

numeric_colums = c("psol_cm","ru_mm","dens_sem_g_m2","rdt14_q_ha","tp_pc", "pmg14_g", "np__m2")
bdd <- conv_col(data = bdd, columns_to_convert = numeric_colums,to_types = "numeric") 


# Verification
# length(all_colums) == length(empty_columns) + length(date_columns) + length(factor_colums) + length(numeric_colums)

# str(bdd)

cat("Verification the type of the colums")
Verification the type of the colums
Code
str(bdd)
'data.frame':   286 obs. of  43 variables:
 $ Codes_essai_Labkey   : Factor w/ 25 levels "B17VCE02004",..: 14 14 14 14 14 14 14 14 14 14 ...
 $ Organisme            : Factor w/ 18 levels "APVA","AXEREAL",..: 17 17 17 17 17 17 17 17 17 17 ...
 $ lieu                 : Factor w/ 23 levels "AMIFONTAINE",..: 14 14 14 14 14 14 14 14 14 14 ...
 $ Code_postal          : Factor w/ 22 levels "10200","10210",..: 6 6 6 6 6 6 6 6 6 6 ...
 $ Code_station_meteo   : Factor w/ 18 levels "1001","1002",..: 5 5 5 5 5 5 5 5 5 5 ...
 $ Commune_station_meteo: Factor w/ 18 levels "Angers","Boigneville",..: 6 6 6 6 6 6 6 6 6 6 ...
 $ Type_de_sol          : Factor w/ 16 levels "Argile","Argile_limono_sableuse",..: 13 13 13 13 13 13 13 13 13 13 ...
 $ Precedent            : Factor w/ 4 levels "Blé_dur","Blé_tendre_d'hiver",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ an                   : Factor w/ 2 levels "2017","2018": 1 1 1 1 1 1 1 1 1 1 ...
 $ sem_t                : chr  NA NA NA NA ...
 $ cond                 : chr  NA NA NA NA ...
 $ geno                 : Factor w/ 16 levels "AVIRON","BALLTRAP",..: 1 2 3 4 5 7 8 9 10 11 ...
 $ psol_cm              : num  NA NA NA NA NA NA NA NA NA NA ...
 $ ru_mm                : num  150 150 150 150 150 150 150 150 150 150 ...
 $ sem_j                : chr  "2016-11-21" "2016-11-21" "2016-11-21" "2016-11-21" ...
 $ dens_sem_g_m2        : num  NA NA NA NA NA NA NA NA NA NA ...
 $ lev_j                : chr  "2016-12-12" "2016-12-12" "2016-12-12" "2016-12-14" ...
 $ df_j                 : chr  "2017-04-15" "2017-04-15" "2017-04-14" "2017-04-12" ...
 $ drg_j                : chr  NA NA NA NA ...
 $ ff_j                 : chr  NA NA NA NA ...
 $ mat_j                : chr  NA NA NA NA ...
 $ rec_j                : chr  NA NA NA NA ...
 $ surf_m2              : chr  NA NA NA NA ...
 $ h2o_pc               : chr  NA NA NA NA ...
 $ rdt14_q_ha           : num  66.5 66.8 69.2 58.1 67.7 ...
 $ pb                   : chr  NA NA NA NA ...
 $ tp_pc                : num  21.8 20.8 20.6 20 21.2 ...
 $ pmg14_g              : num  182 183 161 194 183 ...
 $ np__m2               : num  60.7 60 59.7 52.7 59.7 ...
 $ etfru_nb             : chr  NA NA NA NA ...
 $ asc_j                : chr  NA NA NA NA ...
 $ asc_pc               : chr  NA NA NA NA ...
 $ ht_ff                : chr  NA NA NA NA ...
 $ ht_rec               : chr  NA NA NA NA ...
 $ Herbicides_1         : chr  NA NA NA NA ...
 $ Fongicides_1         : chr  NA NA NA NA ...
 $ Fongicides_2         : chr  NA NA NA NA ...
 $ Fongicides_3         : chr  NA NA NA NA ...
 $ Insecticides_1       : chr  NA NA NA NA ...
 $ Insecticides_2       : chr  NA NA NA NA ...
 $ Insecticides_3       : chr  NA NA NA NA ...
 $ Insecticides_4       : chr  NA NA NA NA ...
 $ Chlorose_            : chr  NA NA NA NA ...

2.2 Complete columns

CR = completion rate

Code
df_col=taux_completion(bdd,TRUE,trie=FALSE)
df_col = df_col[df_col$Variables != "Total",]
tags$div(
  tags$h4("Title: Complete columns", style = "text-align: center;"), 
  DT::datatable(df_col, options = list(pageLength = 5))
)

Title: Complete columns

2.3 Non-complete columns

CR = completion rate

Code
df_col= taux_completion(bdd,FALSE,trie = TRUE)
df_col = df_col[df_col$Variables != "Total",]
tags$div(
  tags$h4("Title: Non complete columns", style = "text-align: center;"), 
  DT::datatable(df_col, options = list(pageLength = 5))
)

Title: Non complete columns

2.4 Figure

Code
# Comptage des Valeurs_Manquantes
na_counts <- colSums(is.na(bdd))
na_percentage <- na_counts / nrow(bdd) * 100

# Création d'un bddframe des Valeurs_Manquantes
na_df <- data.frame(Variable = names(na_counts), 
                    Valeurs_Manquantes = na_counts, Manquant = na_percentage)

# Séparation des colonnes entierement vides et partiellement manquantes
na_partial <- na_df %>% filter(`Manquant` > 0 & `Manquant` < 100)
na_full <- na_df %>% filter(`Manquant` == 100)

### 🔍 Affichage des colonnes partiellement manquantes  
# if(nrow(na_partial) > 0) {
  # cat("🔸 Colonnes avec des Valeurs_Manquantes partielles :\n")
  # kable(na_partial) %>%
  #   kable_styling(bootstrap_options = c("striped", "hover"))
# } else {
#   cat("✅ Aucune colonne partiellement manquante !")
# }

### 🔴 Affichage des colonnes avec 100% de Valeurs_Manquantes  
# if(nrow(na_full) > 0) {
  # cat("\n\n🚨 Colonnes avec 100% de Valeurs_Manquantes :\n")
  # kable(na_full) %>%
  #   kable_styling(bootstrap_options = c("striped", "hover", "bordered"))
# } else {
#   cat("\n✅ Aucune colonne entierement vide !")
# }

# 📊 Visualisation des Valeurs_Manquantes (colonnes partiellement manquantes)
if(nrow(na_partial) > 0) {
  cat("🔸 Colonnes avec des Valeurs_Manquantes partielles :")
  plot_missing(bdd %>% dplyr::select(any_of(na_partial$Variable)))
}
🔸 Colonnes avec des Valeurs_Manquantes partielles :

Code
# 🔴 Visualisation des colonnes entierement vides
if(nrow(na_full) > 0) {
  cat("\n\n⚠️ Graphique des colonnes entierement vides :")
  plot_missing(bdd %>% dplyr::select(any_of(na_full$Variable)))
}


⚠️ Graphique des colonnes entierement vides :

2.5 Descriptive statistics

Factors colums

Code
# Statistiques pour les variables numériques
df <- bdd[, (colnames(bdd) %in% c(factor_colums))]
df  = droplevels(df)

# Résumé des données
summary_df <- summary(df)

# Convertir le résumé en data frame
summary_df <- as.data.frame.matrix(summary_df)
row.names(summary_df) =NULL

# Créer une table bien formatée avec kableExtra
kable(summary_df, format = "html") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
Codes_essai_Labkey Organisme lieu Code_postal Code_station_meteo Commune_station_meteo Type_de_sol Precedent an geno
B17VCE49016: 13 FNAMS : 36 AMIFONTAINE : 23 2190 : 23 5150 : 42 Vatry : 42 Argile : 32 Blé_dur : 26 2017:181 BALLTRAP: 25
B17VCE53017: 13 Vivescia : 29 BRAIN_SUR_L_AUTHION : 23 49800 : 23 5247 : 24 Chaumont: 24 Limon : 32 Blé_tendre_d'hiver: 74 2018:105 FASTE : 25
B17VCE81019: 13 Terrena : 26 Antoigne : 13 18340 : 22 4901 : 23 Angers : 23 argilo_calcaire_profond: 24 Maïs_Fourrage : 13 NA FLOKON : 25
B17VPI31004: 13 Coop_agricole_de_Juniville: 23 CONDOM : 13 31450 : 13 1002 : 21 Troyes : 21 Craie : 23 Orge_d_hiver : 32 NA FRESNEL : 25
B17VPI32003: 13 AXEREAL : 22 LABASTIDE_ST_GEORGES : 13 32100 : 13 1801 : 20 Bourges : 20 Sable_argileux : 23 NA's :141 NA FROSEN : 25
B18VCE49017: 13 Ets_Laboulet : 13 MONTESQUIEU_LAURAGAIS: 13 49260 : 13 3190 : 13 Condom : 13 (Other) :127 NA NA FURIOUS : 25
(Other) :208 (Other) :137 (Other) :188 (Other):179 (Other):143 (Other) :143 NA's : 25 NA NA (Other) :136

Numeric colums

Code
data_min_mean_max = bdd [, numeric_colums]

df = data.frame(Colums = character(),Min=numeric(), Mean = numeric(), SD_ = numeric(),
                Max = numeric())

for (i in names(data_min_mean_max)){
  
 df_2 = data.frame(Predictors = i,
                  Min= round (min(data_min_mean_max[,i], na.rm=TRUE), 3),
                  Mean = round (mean(data_min_mean_max[,i], na.rm=TRUE),3), 
                  SD_ = round (sd(data_min_mean_max[,i], na.rm=TRUE),3),
                  Max = round (max(data_min_mean_max[,i], na.rm=TRUE),3)
                            )
 df= rbind (df, df_2)
}
tags$div(
  tags$h4("Title: Non complete columns", style = "text-align: center;"), 
  DT::datatable(df, options = list(pageLength = 5))
)

Title: Non complete columns

2.6 Histogrammes (numeric colums)

Code
for (i in 1:length(numeric_colums)) {
  cat ("For the colums : ", numeric_colums [i], "\n")
  print(summary(bdd[,numeric_colums [i]]))
explo_num(nom_col = numeric_colums [i], titre = numeric_colums [i], df = bdd)
}
For the colums :  psol_cm 
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
    100     100     100     100     100     100     273 

For the colums :  ru_mm 
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   80.0   100.0   100.0   121.2   150.0   150.0 

For the colums :  dens_sem_g_m2 
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
   80.0    80.0   100.0   101.6   110.0   120.0     243 

For the colums :  rdt14_q_ha 
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
  27.54   41.77   48.83   50.25   59.53   78.28       5 

For the colums :  tp_pc 
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
  16.00   20.80   22.10   21.75   22.90   25.70      23 

For the colums :  pmg14_g 
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
  126.6   174.9   190.3   191.8   210.0   276.0      46 

For the colums :  np__m2 
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
  36.00   58.00   63.33   67.39   76.50  120.00     165 

2.7 Boxplots pour les colonnes factorielles

Code
# Boucle sur toutes les colonnes catégorielles
for (col in factor_colums) {
  cat("For the column:", col, "\n")
  
  # Création du graphique ggplot
  p <- ggplot(bdd, aes(x = .data[[col]])) +
    geom_bar(fill = "steelblue", color = "black", width = 0.5) +  # Largeur réduite
    labs(
         x = col, y = "Nombre d'occurrences") +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))  # Rotation des labels
  
  print(p)  # Afficher le graphique
}
For the column: Codes_essai_Labkey 

For the column: Organisme 

For the column: lieu 

For the column: Code_postal 

For the column: Code_station_meteo 

For the column: Commune_station_meteo 

For the column: Type_de_sol 

For the column: Precedent 

For the column: an 

For the column: geno 

2.8 Matrice de corrélation

Na missign values!

Code
df = bdd[,numeric_colums]
cor_df <- round(cor(df,use='pairwise.complete.obs'), 2)

#reshape2::melt the data frame
melted_cor <- reshape2::melt(cor_df)

#create correlation heatmap
p = ggplot(data = melted_cor, aes(x = Var1, y = Var2, fill = value)) + 
  geom_tile() +
  geom_text(aes(label = round(value, 2)), size = 2) +
  scale_fill_gradient2(low = "blue", high = "red", mid = "white", 
                       midpoint = 0, limit = c(-1, 1), name = "Correlation") +
  theme(axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        panel.background = element_blank(),
        axis.text.x = element_text(angle = 45, hjust = 1, size = 8),
        axis.text.y = element_text(size = 8))

# p <- ggplotly(p)
p

2.9 Focus on the Organisme

Code
rm("summary_df")
bdd$Organisme= as.factor(bdd$Organisme)
summary_df <- as.data.frame(summary(bdd$Organisme))
colnames(summary_df) <- c("Numbers")
summary_df %>% datatable(options = list(pageLength = 5))

2.10 Focus on the years

Code
bdd$an= as.factor(bdd$an)
summary_df <- as.data.frame(summary(bdd$an))
colnames(summary_df) <- c("Numbers")
summary_df %>% datatable(options = list(pageLength = 5))

Onlys two years

back to the Synthesis

3 Exploration: Site

3.1 Focus on the site

Code
bdd = conv_col(bdd,c("lieu", "Code_postal", "Code_station_meteo", "Commune_station_meteo"),"factor")
# Créer un résumé des occurrences des lieux
summary_df <- as.data.frame(summary(bdd$lieu))
colnames(summary_df) <- c("Sites_numbers")

#  colonnes Code_postal, Region et Departement
summary_df <- summary_df %>%
  mutate(Lieu = rownames(summary_df)) %>%  
  left_join(bdd %>% dplyr::select(lieu, Code_postal, Code_station_meteo, Commune_station_meteo) %>% distinct(),  
            by = c("Lieu" = "lieu"))

summary_df = summary_df[, c("Lieu", "Sites_numbers", "Code_postal", "Code_station_meteo", "Commune_station_meteo")]

tags$div(
  tags$h4("Title: Site locations ", style = "text-align: center;"), 
  DT::datatable(summary_df, options = list(pageLength = 5))
)

Title: Site locations

back to the Synthesis

3.2 Focus on the Code postale

Code
bdd$Code_postal= as.factor(bdd$Code_postal)
summary_df <- as.data.frame(summary(bdd$Code_postal))
colnames(summary_df) <- c("Numbers")
summary_df %>% datatable(options = list(pageLength = 5))

Only 22 Code postal for 23 Lieu: Plaimpied Givaudins and CROSSES have the same Code postal \(=\) 18340

3.3 Interactive Map

Code
# bdd$Code_postal
df = bdd %>% filter(nchar(as.character(Code_postal)) < 5)
# View(df)
df = bdd
# df$Code_postal

df <- df %>% mutate(Code_postal = if_else(Code_postal == "2190", "02190", Code_postal))
df <- df %>% mutate(Code_postal = if_else(Code_postal == "8300", "08300", Code_postal))
df <- df %>% mutate(Code_postal = stringr::str_replace_all(Code_postal, " ", ""))

df$Code_postal = as.character(df$Code_postal)
# df$Code_postal
df$Code_postal = as.numeric(df$Code_postal)

# Géocodage des lieu
bdd$lieu =as.character(bdd$lieu)
geo_lieu <- bdd %>%
  distinct(lieu) %>%  # Supprime les doublons
  tidygeocoder::geocode(address = lieu, method = "osm")  # OpenStreetMap pour obtenir latitude & longitude

# Fusionner avec les données originales
bdd_geo <- bdd %>% dplyr::left_join(geo_lieu, by = "lieu")

bdd$lieu =as.factor(bdd$lieu)
Code
# Charger la carte de la France
france_map <- map_data("france")

df_coord <- bdd_geo[, c("long", "lat")]

df_coord$num_ligne <- seq(nrow(df_coord))
carte <- leaflet(df_coord) %>%
  addTiles() %>%
  addCircleMarkers(lng = ~long, lat = ~lat, radius = 0.8, fillOpacity = 0.8, fillColor = "blue")
carte

3.4 Scientific map

Code
# Carte
bdd = bdd_geo
df = bdd
df = df[,c("long", "lat")]
df <- st_as_sf(df, coords = c("long", "lat"))
st_crs(df) =  "+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0"

carte_obs = ggplot() +
  geom_polygon(data = france_map, aes(x = long, y = lat, group = group), 
               fill = "lightgray", color = "black",size = 0.2) +
  geom_point(data = bdd_geo, aes(x = long, y = lat), 
             color = "red", size = 0.4) +
  labs(title = "", x = 'Longitude', y = 'Latitude',
       # title = "Map of France",
       # subtitle = '',
       caption = 'Source: XXXX, 2025')+
  # ggsn::scalebar(location = "bottomleft", dist = 100, dist_unit = "km") +
  # cowplot::theme_cowplot()+

  # geom_text(data = bdd_geo, aes(x = long, y = lat, label = lieu), 
  #           vjust = -0.5, size = 1.5,colour = 'blue',) +  # Ajout des labels

  theme(panel.grid.major = element_line(color = gray(.5),
                                        linetype = 'dashed',
                                        size = 0.1),
        panel.grid.minor = element_blank(),
        panel.background = element_rect(fill=NA,color = 'black'),
        panel.ontop = TRUE,
        axis.text = element_text(size = 8),
        axis.title.y = element_text(vjust = 5, size = 10, face = "bold"),
        axis.title.x = element_text(size = 10, face = "bold",vjust = -1),
        axis.ticks.length = unit(0.2, "cm"),
        plot.caption = element_text(vjust = -1.5),
        plot.margin = unit(c(0.5, 0.5, 0.5, 0.5), "cm"))

# carte_obs = carte_obs +
#   # blank() +
#   ggsn::north(landworm_sf) +
#   ggsn::scalebar(landworm_sf, dist = 50, dist_unit = "km",
#            transform = TRUE, model = "WGS84",location = "bottomleft", st.size = 1)

ggsave("~/azodyn_check/AZODYN_optileg/outputs/carte_obs.svg", 
       plot = carte_obs,
       ,height = 6,width = 6
       )

ggsave("check_data_files/figure-revealjs/map fig-1.svg", 
       plot = carte_obs,
       ,height = 6,width = 6
       )


# carte_obs

42.5 45.0 47.5 50.0 -5 0 5 10 Longitude Latitude Source: XXXX, 2025

Add north and the scalebar!

4 Exploration: Soils

4.1 Focus on the soil

Code
bdd$Type_de_sol= as.factor(bdd$Type_de_sol)
summary_df <- as.data.frame(summary(bdd$Type_de_sol))
colnames(summary_df) = c("Numbers")
tags$div(
  tags$h4("Title: Soils types ", style = "text-align: center;"), 
  DT::datatable(summary_df, options = list(pageLength = 5))
)

Title: Soils types

- Merging levels

Code
levels(bdd$Type_de_sol)[levels(bdd$Type_de_sol) == "argilo_calcaire"] <- "Argilo_calcaire"
levels(bdd$Type_de_sol)[levels(bdd$Type_de_sol) == "argilo_calcaire_profond"] <- "Argilo_calcaire_profond"
levels(bdd$Type_de_sol)[levels(bdd$Type_de_sol) == "limon_argileux"] <- "Limon_argileux"
levels(bdd$Type_de_sol)[levels(bdd$Type_de_sol) == "Limon_argilo_sableux"] <- "Limon_argileux_sableux"


bdd$Type_de_sol= as.factor(bdd$Type_de_sol)
summary_df <- as.data.frame(summary(bdd$Type_de_sol))
colnames(summary_df) = c("Numbers")
tags$div(
  tags$h4("Title: Soils types ", style = "text-align: center;"), 
  DT::datatable(summary_df, options = list(pageLength = 5))
)

Title: Soils types

The available water capacity in the soil is:

Code
plot(as.factor(bdd$ru_mm), xlab = "Values", ylab = "Numbers")

Thesoil depth is:

Code
summary( as.factor(bdd$psol_cm))
 100 NA's 
  13  273 
Code
plot(as.factor(bdd$psol_cm), xlab = "Values", ylab = "Numbers")

back to the Synthesis

5 Exploration: plant

5.1 Variables pertinentes

geno = variete

sem_j=date semis

dens_sem_g_m2 = densité semis grains/m2

lev_j = date levée

df_j = date debut flo

ff_j = fin flo

rec_j = date recolte

rdt14_q_ha = rendement 14% humidité

tp_pc = taux proteines grain

pmg14_g = PMG à 14%hum

np__m2 = nb_pieds_m2

5.2 Focus ont the genotype

Code
df = bdd
pourc_na = percent_na(df,"geno")
df$geno= as.factor(df$geno)
summary_df <- as.data.frame(summary(df$geno))
colnames(summary_df) = c("Numbers")
tags$div(
  tags$h4("Title: Genotypes types ", style = "text-align: center;"), 
  DT::datatable(summary_df, options = list(pageLength = 5))
)

Title: Genotypes types

0 % de NA

back to the Synthesis

5.3 Focus on sem_j=date semis

Code
df = bdd
pourc_na = percent_na(df,"sem_j")
df$sem_j= as.factor(df$sem_j)
summary_df <- as.data.frame(summary(df$sem_j))
colnames(summary_df) = c("Numbers")

summary_df$Dates = as.Date (row.names(summary_df))
summary_df$Annee <- year(summary_df$Dates)  # Extraction de l'année
row.names(summary_df) = NULL
summary_df = summary_df[, c("Annee","Dates", "Numbers")]

tags$div(
  tags$h4("Title: sem_j types ", style = "text-align: center;"), 
  DT::datatable(summary_df, options = list(pageLength = 5))
)

Title: sem_j types

7.69 % de NA

5.4 Focus on dens_sem_g_m2 = densité semis grains/m2

Code
df = bdd
pourc_na = percent_na(df,"dens_sem_g_m2")
df$dens_sem_g_m2= as.factor(df$dens_sem_g_m2)
summary_df <- as.data.frame(summary(df$dens_sem_g_m2))
colnames(summary_df) = c("Numbers")
tags$div(
  tags$h4("Title: dens_sem_g_m2 types ", style = "text-align: center;"), 
  DT::datatable(summary_df, options = list(pageLength = 5))
)

Title: dens_sem_g_m2 types

84.97 % de NA

5.5 Focus on lev_j = date levée

Code
df = bdd
pourc_na = percent_na(df,"lev_j")
df$lev_j= as.factor(df$lev_j)
summary_df <- as.data.frame(summary(df$lev_j))
colnames(summary_df) = c("Numbers")

summary_df$Dates = as.Date (row.names(summary_df))
summary_df$Annee <- year(summary_df$Dates)  # Extraction de l'année
row.names(summary_df) = NULL
summary_df = summary_df[, c("Annee","Dates", "Numbers")]

tags$div(
  tags$h4("Title: lev_j types ", style = "text-align: center;"), 
  DT::datatable(summary_df, options = list(pageLength = 5))
)

Title: lev_j types

Uniquement année de récolte 2017

95.45 % de NA

5.6 Focus on df_j = date debut flo

Code
df = bdd
pourc_na = percent_na(df,"df_j")
df$df_j= as.factor(df$df_j)
summary_df <- as.data.frame(summary(df$df_j))
colnames(summary_df) = c("Numbers")

summary_df$Dates = as.Date (row.names(summary_df))
summary_df$Annee <- year(summary_df$Dates)  # Extraction de l'année
row.names(summary_df) = NULL
summary_df = summary_df[, c("Annee","Dates", "Numbers")]

tags$div(
  tags$h4("Title: df_j types ", style = "text-align: center;"), 
  DT::datatable(summary_df, options = list(pageLength = 5))
)

Title: df_j types

9.79 % de NA

5.7 Focus ont the rec_j = date recolte

Code
df = bdd
pourc_na = percent_na(df,"rec_j")
df$rec_j= as.factor(df$rec_j)
summary_df <- as.data.frame(summary(df$rec_j))
colnames(summary_df) = c("Numbers")

summary_df$Dates = as.Date (row.names(summary_df))
summary_df$Annee <- year(summary_df$Dates)  # Extraction de l'année
row.names(summary_df) = NULL
summary_df = summary_df[, c("Annee","Dates", "Numbers")]

tags$div(
  tags$h4("Title: rec_j types ", style = "text-align: center;"), 
  DT::datatable(summary_df, options = list(pageLength = 5))
)

Title: rec_j types

33.92 % de NA

5.8 Focus ont the rdt14_q_ha = rendement 14% humidité

Code
df = bdd
pourc_na = percent_na(df,"rdt14_q_ha")
print(summary(bdd[,"rdt14_q_ha"]))
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
  27.54   41.77   48.83   50.25   59.53   78.28       5 
Code
hist(bdd[,"rdt14_q_ha"])

Code
ggplot(bdd, aes(x = 1, y = rdt14_q_ha, color = as.factor(an))) +
  geom_jitter(width = 0.2, size = 2, alpha = 0.7) +
  labs(title = "Yield distribution by year",
       x = "",
       y = "Yield (q/ha)",
       color = "Years") +
  theme_minimal() +
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())

1.75 % de NA

5.9 Focus ont the tp_pc = taux proteines grain

Code
df = bdd
pourc_na = percent_na(df,"tp_pc")
print(summary(bdd[,"tp_pc"]))
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
  16.00   20.80   22.10   21.75   22.90   25.70      23 
Code
hist(bdd[,"tp_pc"])

Code
ggplot(bdd, aes(x = 1, y = tp_pc, color = as.factor(an))) +
  geom_jitter(width = 0.2, size = 2, alpha = 0.7) +
  labs(title = "Grain protein content by year",
       x = "",
       y = "Grain protein content",
       color = "Years") +
  theme_minimal() +
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())

8.04 % de NA

5.10 Focus ont the pmg14_g = PMG à 14%hum

Code
df = bdd
pourc_na = percent_na(df,"pmg14_g")
print(summary(bdd[,"pmg14_g"]))
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
  126.6   174.9   190.3   191.8   210.0   276.0      46 
Code
hist(bdd[,"pmg14_g"])

Code
ggplot(bdd, aes(x = 1, y = pmg14_g, color = as.factor(an))) +
  geom_jitter(width = 0.2, size = 2, alpha = 0.7) +
  labs(title = "Average weight of a grain at 14% humidity by year",
       x = "",
       y = "PMG at 14%",
       color = "Years") +
  theme_minimal() +
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())

16.08 % de NA

5.11 Focus ont the np__m2 = nb_pieds_m2

Code
df = bdd
pourc_na = percent_na(df,"np__m2")
print(summary(bdd[,"np__m2"]))
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
  36.00   58.00   63.33   67.39   76.50  120.00     165 
Code
hist(bdd[,"np__m2"])

Code
ggplot(bdd, aes(x = 1, y = np__m2, color = as.factor(an))) +
  geom_jitter(width = 0.2, size = 2, alpha = 0.7) +
  labs(title = "nb_pieds_m2",
       x = "",
       y = "nb_pieds_m2",
       color = "Years") +
  theme_minimal() +
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())

57.69 % de NA

6 Synthesis

1 situation = lieu x année x variété x autre pratique culturale + commentaires

Code
nbr_lieu = length(unique(bdd$lieu))
nbr_annee = length(unique(bdd$an))
nbr_geno = length(unique(bdd$geno))
tot_situation = nbr_lieu * nbr_annee * nbr_geno

df = table(df$lieu, df$geno, df$an)
# df
nbr_sol = length(unique(bdd$Type_de_sol))