chemin_fichier_excel = "C:/Users/diall/Downloads/datas/LandWorm_dataset_site_V1.9.xlsx"
bdd <- read.xlsx(chemin_fichier_excel, sheet = "Sheet1")
bdd$Programme=as.factor(bdd$Programme)
sort(levels(bdd$Programme))
## [1] "#Vers2022" "AF" "AF-SMART-SO"
## [4] "AgrHys" "AgrInnov" "AGRIVAIR-VITTEL"
## [7] "Beaujolais-SICAREX" "BFF-Brami" "BFF-Brami-Bioferme"
## [10] "Bio1" "BIO2" "Bio2_ANDRA"
## [13] "biobio" "BioGeoSTIB" "Breizh Sukr"
## [16] "CA 45" "CA_41" "CA_45"
## [19] "CAPAMIN" "CEREMA" "Cerema / ARB"
## [22] "CN" "DATASOL" "Dephy"
## [25] "Dephy Bio" "DIAMS" "DYNABIO"
## [28] "ECLAS" "ECO_IMPACT" "GARGAMEL"
## [31] "HOP!" "INPN" "JASSUR"
## [34] "Kerguéhennec" "LaCage" "Life-PTD"
## [37] "Macon-Davayé" "MONS" "OASYS"
## [40] "OBL" "Optim'yst" "OPVT"
## [43] "OPVT_BZH" "OPVT_IDF" "Orchamp"
## [46] "PSDR" "RESACOR" "RESCAPE"
## [49] "REVA" "RMQS" "RMQS_BioDiv"
## [52] "SBT-ENI-M" "SBT-ENI-TB" "SHR"
## [55] "SOERE-ACBB-Lusignan" "SOERE-ACBB-Theix" "SOERE-PRO-EFELE-PROs"
## [58] "SOERE-PRO-EFELE-TS-MO" "SOERE PRO" "SOFIA"
## [61] "SoilMan-Farm Network" "SOLIVITI" "Sols de Bretagne"
## [64] "SUSTAIN reseau" "thèse MH" "thèse NC"
## [67] "TIGA" "TRAMBIOSOIL" "TSE"
## [70] "Vadebio" "VADEBIO" "VADETOX-Biotechnosol"
## [73] "VIFLORHUM" "VineDivers" "VitiEcobioSol-CIBC"
## [76] "VITTEL - AGRIVAIR" "ZAA" "ZAA_HR"
levels(bdd$Programme)[levels(bdd$Programme) == "CA 45"] <- "CA_45"
bdd$Annee= as.factor(bdd$Annee)
levels(bdd$Annee)
## [1] "1821" "1960" "1978" "1982" "1983"
## [6] "1984" "1986" "1988" "1989" "1990"
## [11] "1991" "1992" "1993" "1994" "1995"
## [16] "1996" "1997" "1998" "1999" "2000"
## [21] "2001" "2002" "2003" "2003_04" "2004"
## [26] "2005" "2005_05" "2006" "2006_04" "2007"
## [31] "2007_04" "2008" "2008_05" "2009" "2009_04"
## [36] "2010" "2010_03" "2010_04" "2011" "2011_03"
## [41] "2011_05" "2011_10" "2011_11" "2012" "2012_03"
## [46] "2012_04" "2012_11" "2013" "2013_03" "2013_04"
## [51] "2013_11" "2014" "2014_03" "2014_04" "2014_10"
## [56] "2015" "2015_11" "2016" "2016_03" "2016_04"
## [61] "2017" "2017.04" "2017.11" "2018" "2018.03"
## [66] "2018.04" "2019" "2019.03" "2019.04" "2019/10"
## [71] "2020" "2020-11-27" "2020-12-11" "2020_02" "2021"
## [76] "2021-01-14" "2021-03-02" "2021-03-17" "2021-03-30" "2021-04-14"
## [81] "2021-05-06" "2021-05-11" "2021-05-26" "2021-06-08" "2021-06-22"
## [86] "2021-06-23" "2021-07-27" "2021-09-09" "2021-10-14" "2021-10-30"
## [91] "2021-11-04" "2021-11-23" "2021-11-25" "2021-11-29" "2021-12-09"
## [96] "2021-12-14" "2021.03" "2021.04" "2021_03" "2021_04"
## [101] "2021_10" "2022" "2022-01-11" "2022-01-25" "2022-01-26"
## [106] "2022-02-08" "2022-02-22" "2022-03-22" "2022_02" "2022_03"
## [111] "2022_09" "2022_10" "2023" "2023_03"
bdd$Annee <- gsub("^(\\d{4}).*$", "\\1", bdd$Annee) # on prend uniquement les 04 premier chiffre
bdd$Annee= as.factor(bdd$Annee)
# bdd$Date_Prelevement
filtre_date <- subset(bdd, !is.na(Date_Prelevement) & !grepl("^\\d{2}/\\d{2}/\\d{4}$", Date_Prelevement))$Date_Prelevement
filtre_date[c(1,5,10,79,101,112,195,240,243,454,473,475,485,627,655,680)]
## [1] "2022-04-19"
## [2] "42818"
## [3] "lundi 9 mars 2020"
## [4] "2020-10-26"
## [5] "18/03/15"
## [6] "43889"
## [7] "29/032021"
## [8] "23/02/2021 (bloc 1 à 4)\n24/02/2021 ( bloc 5 et 6)"
## [9] "08/03/2021\n09/03/2021"
## [10] "2020-02-06"
## [11] "?"
## [12] "2019"
## [13] "2021-03-01"
## [14] "13 et 15/04/2015"
## [15] "2021-04-06"
## [16] "2022-03-25"
bdd$clcm_lvl1= as.factor(bdd$clcm_lvl1)
summary_df <- as.data.frame(summary(bdd$clcm_lvl1))
colnames(summary_df) <- c("Numbers")
kable(summary_df,padding = 5)
Numbers | |
---|---|
1_Naturel | 1 |
2_Agricole | 3 |
Agricultural areas | 4939 |
Artificial surfaces | 956 |
Forest and semi natural areas | 587 |
Wetlands | 37 |
NA’s | 1496 |
levels(bdd$clcm_lvl1)[levels(bdd$clcm_lvl1) == "1_Naturel"] <- "Forest and semi natural areas"
levels(bdd$clcm_lvl1)[levels(bdd$clcm_lvl1) == "2_Agricole"] <- "Agricultural areas"
bdd$clcm_lvl1= as.factor(bdd$clcm_lvl1)
summary_df <- as.data.frame(summary(bdd$clcm_lvl1))
colnames(summary_df) <- c("Numbers")
kable(summary_df,padding = 5)
Numbers | |
---|---|
Forest and semi natural areas | 588 |
Agricultural areas | 4942 |
Artificial surfaces | 956 |
Wetlands | 37 |
NA’s | 1496 |
bdd$code_clcm_lvl1 <- ifelse(bdd$clcm_lvl1 == "Forest and semi natural areas", 3, bdd$code_clcm_lvl1)
bdd$code_clcm_lvl1 <- ifelse(bdd$clcm_lvl1 == "Agricultural areas", 2, bdd$code_clcm_lvl1)
bdd$clcm_lvl2= as.factor(bdd$clcm_lvl2)
summary_df <- as.data.frame(summary(bdd$clcm_lvl2))
colnames(summary_df) <- c("Numbers")
kable(summary_df,padding = 8)
Numbers | |
---|---|
21_Agricole ouvert | 3 |
Arable land | 2915 |
Artificial, non-agricultural vegetated areas | 750 |
Forests | 273 |
Heterogeneous agricultural areas | 223 |
Industrial, commercial and transport units | 181 |
Inland wetlands | 37 |
Mine, dump and construction sites | 25 |
Open spaces with little or no vegetation | 9 |
Pastures | 729 |
Permanent crops | 1072 |
Scrub and/or herbaceous vegetation associations | 305 |
NA’s | 1497 |
levels(bdd$clcm_lvl2)[levels(bdd$clcm_lvl2) == "21_Agricole ouvert"] <- "Arable land"
bdd$clcm_lvl2= as.factor(bdd$clcm_lvl2)
summary_df <- as.data.frame(summary(bdd$clcm_lvl2))
colnames(summary_df) <- c("Numbers")
kable(summary_df,padding = 5)
Numbers | |
---|---|
Arable land | 2918 |
Artificial, non-agricultural vegetated areas | 750 |
Forests | 273 |
Heterogeneous agricultural areas | 223 |
Industrial, commercial and transport units | 181 |
Inland wetlands | 37 |
Mine, dump and construction sites | 25 |
Open spaces with little or no vegetation | 9 |
Pastures | 729 |
Permanent crops | 1072 |
Scrub and/or herbaceous vegetation associations | 305 |
NA’s | 1497 |
bdd$code_clcm_lvl2 <- ifelse(bdd$clcm_lvl2 == "Arable land", 21, bdd$code_clcm_lvl2)
bdd$clcm_lvl3= as.factor(bdd$clcm_lvl3)
summary_df <- as.data.frame(summary(bdd$clcm_lvl3))
colnames(summary_df) <- c("Numbers")
kable(summary_df,padding = 5)
Numbers | |
---|---|
Agro-forestry areas | 117 |
Airports | 44 |
Beaches, dunes, sands | 1 |
Broad-leaved forest | 29 |
Complex cultivation patterns | 88 |
Coniferous forest | 4 |
Construction sites | 21 |
Fruit trees and berry plantations | 18 |
Green urban areas | 676 |
Humid zone | 1 |
Indetermined forest | 2 |
Industrial or commercial units and public facilities | 12 |
Mixed forest | 88 |
Moors and heathland | 13 |
Natural grasslands | 144 |
Non-irrigated arable land | 2840 |
Other arable land | 72 |
Other artificial, non-agricultural vegetated areas | 13 |
Other heterogeneous agricultural areas | 18 |
Other mine, dump and construction sites | 4 |
Other scrub and/or herbaceous vegetation associations | 8 |
Pastures, meadows and other permanent grasslands under agricultural use | 521 |
Road and rail networks and associated land | 124 |
Sport and leisure facilities | 59 |
Transitional woodland-shrub | 14 |
Vineyards | 1044 |
NA’s | 2044 |
bdd$Protocole = as.factor(bdd$Protocole)
summary_df <- as.data.frame(summary(bdd$Protocole))
colnames(summary_df) <- c("Numbers")
kable(summary_df,padding = 5)
Numbers | |
---|---|
AITC_HS | 231 |
AITCTM | 227 |
F | 51 |
F_HS | 970 |
FHS | 46 |
hand sorting | 357 |
HS | 3281 |
HS_16 | 216 |
HS_4 | 396 |
HS_biodiv | 32 |
HS_biodiv? | 2 |
HS_F_16 | 24 |
HS_M_16 | 151 |
HS_M_25 | 29 |
HSAITC_16 | 1 |
HSAITC_4 | 123 |
HSAITC_6.25 | 56 |
HSAITC_7.95775385 | 10 |
M | 1238 |
M_HS | 70 |
Unknown | 320 |
visual_hunting | 188 |
Merging levels :
levels(bdd$Protocole)[levels(bdd$Protocole) == "hand sorting"] <- "HS"
bdd$Protocole = as.factor(bdd$Protocole)
summary_df <- as.data.frame(summary(bdd$Protocole))
colnames(summary_df) <- c("Numbers")
#kable(summary_df,padding = 5)
summary(bdd$AB_tot)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.00 22.67 96.00 173.28 250.00 2875.00 320
#bdd <- subset(bdd, AB_tot <= 800)
par(mfrow=c(1,2))
# Boxplot
boxplot(bdd$AB_tot, col = 'blue', ylab = "Total abundance")
# Cleveland plot
dotchart(bdd$AB_tot, pch = 16, col = 'blue', xlab = "Total abundance")
Quelques parcelles ont des valeurs un peu douteux
summary(bdd$BM_tot)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.00 16.97 53.81 90.89 107.13 19962.47 4282
#bdd <- subset(bdd, AB_tot <= 800)
par(mfrow=c(1,2))
# Boxplot
boxplot(bdd$BM_tot, col = 'blue', ylab = "Total biomass")
# Cleveland plot
dotchart(bdd$BM_tot, pch = 16, col = 'blue', xlab = "Total biomass")
Quelques parcelles ont des valeurs un peu douteux
bdd$sand=as.numeric(bdd$sand)
summary(bdd$sand)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.15 13.90 22.77 29.45 42.80 96.20 6001
#bdd <- subset(bdd, AB_tot <= 800)
par(mfrow=c(1,2))
# Boxplot
boxplot(bdd$sand, col = 'blue', ylab = "sand")
# Cleveland plot
dotchart(bdd$sand, pch = 16, col = 'blue', xlab = "sand")
bdd$clay=as.numeric(bdd$clay)
summary(bdd$clay)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.02 19.10 82.00 139.07 224.75 667.00 3469
#bdd <- subset(bdd, AB_tot <= 800)
par(mfrow=c(1,2))
# Boxplot
boxplot(bdd$clay, col = 'blue', ylab = "clay")
# Cleveland plot
dotchart(bdd$clay, pch = 16, col = 'blue', xlab = "clay")
A verifier ?, beaucoup de valeur sont superieur a 100
bdd$silt=as.numeric(bdd$silt)
summary(bdd$silt)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.05 30.15 42.60 44.23 59.00 81.20 6001
#bdd <- subset(bdd, AB_tot <= 800)
par(mfrow=c(1,2))
# Boxplot
boxplot(bdd$silt, col = 'blue', ylab = "silt")
# Cleveland plot
dotchart(bdd$silt, pch = 16, col = 'blue', xlab = "silt")
# les valeurs de la colonne ph_eau qui ne sont pas des chiffres ou supérieures à 14
valeurs_bizarres <- subset(bdd, !(grepl("^\\d*\\.?\\d+$", ph_eau) | as.numeric(ph_eau) > 14 & is.na(ph_eau)))$ph_eau
valeurs_bizarres
## [1] ">8" ">9" "Entre 5 et 6" "Entre 6 et 7"
## [5] "Entre 6 et 7" "de 6 à 6,9" "de 6 à 6,9" "de 6 à 6,9"
## [9] "je ne sais pas" "je ne sais pas" "Entre 6 et 7" "entre 5 et 6"
## [13] "<5" "je ne sais pas" "je ne sais pas" "5 à 5,9"
## [17] "DE 7 A 7,9" "DE 5 A 5,9" "DE 6 A 6,9"
df_comp=bdd
df_comp <- df_comp[!grepl("[^0-9.]", df_comp$ph_eau), ]
df_comp$ph_eau <- as.numeric(df_comp$ph_eau)
Il y a 19 parcelles qui ont des pH qui ne sont pas des chiffre. Dans mon cas, j’ai enlever ces 19 lignes.
summary(as.numeric(bdd$ph_eau))
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 2.09 6.50 7.60 18.35 8.11 44140.00 4042
df_comp = df_comp[!df_comp$ph_eau== 44140.00,]
Une parcelle est à \(44140.00\).
par(mfrow=c(1,2))
# Boxplot
boxplot(df_comp$ph_eau, col = 'blue', ylab = "ph_eau")
# Cleveland plot
dotchart(df_comp$ph_eau, pch = 16, col = 'blue', xlab = "ph_eau")
summary(as.numeric(bdd$c_tot))
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.00 12.60 23.06 34.30 51.26 162.04 7148
#plot(as.numeric(bdd$c_tot))
par(mfrow=c(1,2))
# Boxplot
boxplot(as.numeric(bdd$c_tot), col = 'blue', ylab = "c_tot")
# Cleveland plot
dotchart(as.numeric(bdd$c_tot), pch = 16, col = 'blue', xlab = "c_tot")
Valeurs superieurs à 100 ?, unités ?,…
summary(as.numeric(bdd$c_org))
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.520 9.553 14.300 18.220 23.632 156.200 4659
#plot(as.numeric(bdd$c_org))
par(mfrow=c(1,2))
# Boxplot
boxplot(as.numeric(bdd$c_org), col = 'blue', ylab = "c_org")
# Cleveland plot
dotchart(as.numeric(bdd$c_org), pch = 16, col = 'blue', xlab = "c_org")
Valeurs superieurs à 100 ?, unités ?,…
summary(as.numeric(bdd$om))
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.00 16.14 24.65 33.36 40.26 631.00 4132
#plot(as.numeric(bdd$om))
par(mfrow=c(1,2))
# Boxplot
boxplot(as.numeric(bdd$om), col = 'blue', ylab = "om")
# Cleveland plot
dotchart(as.numeric(bdd$om), pch = 16, col = 'blue', xlab = "om")
Valeurs superieurs à 100 ?, unités ?,…
# Check des doublons
duplicated_rows <- subset(bdd, duplicated(bdd[, c("ID", "AB_tot", "BM_tot")]) | duplicated(bdd[, c("ID", "AB_tot", "BM_tot")], fromLast = TRUE))