# Load packages
library(readr)
library(dplyr)
library(ggplot2)
library(lubridate)
library(DataExplorer)
library(tidyverse)
library(sf)
library(rnaturalearth)
library(lme4)
library(emmeans)
library(tidyr)
library(broom)
library(readxl)
CONV_CA_Ratio <- read_csv(
"D:/Mes Donnees/OAF_CIRAD/CA_MoU/data/processed/CONV_CA_Ratio.csv",
col_types = cols(
IQR_weed1_complexity = col_character()
)
)
# Add slope to CAONV_CA_Ratio
slope <- read_excel("data/raw/slope.xlsx")
slope$Farm_ID <- sub("_[A-Za-z]$", "", slope$Farm_ID)
slope <- slope %>%
distinct(Farm_ID, .keep_all = TRUE)
colnames(slope)[colnames(slope) == "Farm_ID"] <- "IQR_block"
library(dplyr)
CONV_CA_Ratio <- CONV_CA_Ratio %>%
left_join(slope %>% dplyr::select(IQR_block, Slope),
by = "IQR_block")
weeds <- read_excel("data/raw/weeds.xlsx")
library(dplyr)
We used OAF_AEZ (hereafter IQF_environment) for the initial evaluation and characterization of Conservation Agriculture (CA) effects. IQF_environment captures key biophysical and agronomic drivers of yield variation (e.g., moisture regime, elevation, and temperature) while maintaining a parsimonious structure with only four categories. Compared with the national AEZ classification (IQF_agzone), which includes a larger number of categories, IQF_environment provides greater statistical power to detect environmental main effects and interactions in models assessing overall CA performance. IQF_environment is also closely aligned with crop-specific agronomic realities, including maize adaptation zones (highland vs. mid-altitude) and bean growth habits (bush vs. climbing), making it well suited for characterizing general treatment responses across environments.
In subsequent driver analyses, where the objective was to better explain environmental heterogeneity in yield responses, we incorporated IQF_agzone. Although more complex, IQF_agzone explained a larger proportion of variance in yield ratio (marginal R² = 0.117 vs. 0.060; conditional R² = 0.255 vs. 0.228) and showed stronger zone-related effect sizes. Therefore, it provided finer environmental differentiation when modeling drivers of performance variability. In summary, IQF_environment was retained for the initial CA effect assessment to maximize statistical power and interpretability, while IQF_agzone was incorporated in driver analyses to capture more detailed environmental heterogeneity.
# ================================
# Compare zoning systems (fit2)
# ================================
library(lme4)
library(car)
library(effectsize)
library(performance)
# ---- Fit models ----
# IQF_environment (OAF_AEZ)
fit2_conv_env <- lmer(
log(Y_ratio + 1) ~ crop + year + crop:year + IQF_environment +
crop:IQF_environment +
year:IQF_environment +
year:crop:IQF_environment +
(1 | IQR_block),
data = CONV_CA_Ratio
)
# IQF_agzone
fit2_conv_agzone <- lmer(
log(Y_ratio + 1) ~ crop + year + crop:year + IQF_agzone +
crop:IQF_agzone +
year:IQF_agzone +
year:crop:IQF_agzone +
(1 | IQR_block),
data = CONV_CA_Ratio
)
# ---- Model comparison ----
cat("\n=== AIC / BIC ===\n")
##
## === AIC / BIC ===
print(AIC(fit2_conv_env, fit2_conv_agzone))
## df AIC
## fit2_conv_env 18 -1873.847
## fit2_conv_agzone 43 -1832.201
print(BIC(fit2_conv_env, fit2_conv_agzone))
## df BIC
## fit2_conv_env 18 -1766.859
## fit2_conv_agzone 43 -1576.618
# ---- Type III ANOVA ----
cat("\n=== ANOVA: Environment ===\n")
##
## === ANOVA: Environment ===
anova_env <- Anova(fit2_conv_env, type = 3)
print(anova_env)
## Analysis of Deviance Table (Type III Wald chisquare tests)
##
## Response: log(Y_ratio + 1)
## Chisq Df Pr(>Chisq)
## (Intercept) 17.9908 1 2.220e-05 ***
## crop 21.1971 1 4.144e-06 ***
## year 4.2321 1 0.039667 *
## IQF_environment 26.0883 3 9.140e-06 ***
## crop:year 21.7159 1 3.162e-06 ***
## crop:IQF_environment 16.1789 3 0.001042 **
## year:IQF_environment 25.0575 3 1.502e-05 ***
## crop:year:IQF_environment 15.9107 3 0.001183 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cat("\n=== ANOVA: AgZone ===\n")
##
## === ANOVA: AgZone ===
anova_agz <- Anova(fit2_conv_agzone, type = 3)
print(anova_agz)
## Analysis of Deviance Table (Type III Wald chisquare tests)
##
## Response: log(Y_ratio + 1)
## Chisq Df Pr(>Chisq)
## (Intercept) 30.0357 1 4.242e-08 ***
## crop 8.2825 1 0.004003 **
## year 16.5780 1 4.669e-05 ***
## IQF_agzone 54.2814 10 4.300e-08 ***
## crop:year 8.4376 1 0.003675 **
## crop:IQF_agzone 11.5932 9 0.237229
## year:IQF_agzone 53.6382 9 2.212e-08 ***
## crop:year:IQF_agzone 11.4518 9 0.246008
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# ---- Partial Eta² ----
cat("\n=== Partial Eta²: Environment ===\n")
##
## === Partial Eta²: Environment ===
eta_env <- eta_squared(fit2_conv_env)
print(eta_env)
## # Effect Size for ANOVA (Type III)
##
## Parameter | Eta2 (partial) | 95% CI
## ---------------------------------------------------------
## crop | 0.03 | [0.02, 1.00]
## year | 0.01 | [0.01, 1.00]
## IQF_environment | 0.01 | [0.01, 1.00]
## crop:year | 0.03 | [0.02, 1.00]
## crop:IQF_environment | 6.59e-03 | [0.00, 1.00]
## year:IQF_environment | 0.01 | [0.01, 1.00]
## crop:year:IQF_environment | 6.47e-03 | [0.00, 1.00]
##
## - One-sided CIs: upper bound fixed at [1.00].
cat("\n=== Partial Eta²: AgZone ===\n")
##
## === Partial Eta²: AgZone ===
eta_agz <- eta_squared(fit2_conv_agzone)
print(eta_agz)
## # Effect Size for ANOVA (Type III)
##
## Parameter | Eta2 (partial) | 95% CI
## ----------------------------------------------------
## crop | 0.02 | [0.01, 1.00]
## year | 4.49e-03 | [0.00, 1.00]
## IQF_agzone | 0.04 | [0.02, 1.00]
## crop:year | 0.02 | [0.01, 1.00]
## crop:IQF_agzone | 4.98e-03 | [0.00, 1.00]
## year:IQF_agzone | 0.04 | [0.02, 1.00]
## crop:year:IQF_agzone | 4.92e-03 | [0.00, 1.00]
##
## - One-sided CIs: upper bound fixed at [1.00].
# ---- R² ----
cat("\n=== R²: Environment ===\n")
##
## === R²: Environment ===
r2_env <- r2(fit2_conv_env)
print(r2_env)
## # R2 for Mixed Models
##
## Conditional R2: 0.228
## Marginal R2: 0.060
cat("\n=== R²: AgZone ===\n")
##
## === R²: AgZone ===
r2_agz <- r2(fit2_conv_agzone)
print(r2_agz)
## # R2 for Mixed Models
##
## Conditional R2: 0.255
## Marginal R2: 0.117
# ---- Diagnostics (optional) ----
par(mfrow = c(1,2))
plot(fit2_conv_env)
qqnorm(resid(fit2_conv_env)); qqline(resid(fit2_conv_env))
Description of the analysis: Linear mixed-effects model fitted to the log-transformed yield ratio (log(Y_ratio + 1)) comparing Conservation Agriculture (CA) to Conventional Agriculture (CONV). The model included:Fixed effects: crop, year, IQF_environment, and all two- and three-way interactions Random effect: IQR_block The treatment variable (Treat_code) was excluded in this model (fit2_conv) Model performance was compared to a simpler model (fit1_conv) using BIC. The summary of fixed effects and significance tests were obtained via ANOVA with Satterthwaite approximation.
Description of the results: On the analysis of the log-transformed CA:CONV yield ratio, there was a significant effect of crop, year, and environment, as well as their two- and three-way interactions.
# Model 1: no log transform, no Treat_code
fit1_conv <- lmer(
Y_ratio ~ crop + year + IQR_Season + IQF_environment +
IQR_Season:IQF_environment +
(1 | IQR_block),
data = CONV_CA_Ratio
)
# Model 2: log transform, no Treat_code
fit2_conv <- lmer(
log(Y_ratio + 1) ~ crop + year + crop:year + IQF_environment +
crop:IQF_environment +
year:IQF_environment +
year:crop:IQF_environment +
year:crop +
(1 | IQR_block),
data = CONV_CA_Ratio
)
# Model comparison
BIC(fit1_conv, fit2_conv)
# View model summaries
summary(fit1_conv)
## Linear mixed model fit by REML ['lmerMod']
## Formula:
## Y_ratio ~ crop + year + IQR_Season + IQF_environment + IQR_Season:IQF_environment +
## (1 | IQR_block)
## Data: CONV_CA_Ratio
##
## REML criterion at convergence: 1872.5
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.5822 -0.5427 -0.0954 0.4009 9.8223
##
## Random effects:
## Groups Name Variance Std.Dev.
## IQR_block (Intercept) 0.01826 0.1351
## Residual 0.09644 0.3105
## Number of obs: 2818, groups: IQR_block, 565
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 2.75394 1.07417 2.564
## cropMaize -0.09917 0.08647 -1.147
## year -0.07516 0.04296 -1.749
## IQR_Season23B -0.09980 0.09560 -1.044
## IQR_Season24A -0.24542 0.05258 -4.668
## IQR_Season24B -0.34972 0.06028 -5.802
## IQF_environmentMid_Low_Dry -0.02494 0.03979 -0.627
## IQF_environmentMid_Low_Wet -0.01160 0.04143 -0.280
## IQF_environmentTransition 0.01562 0.04527 0.345
## IQR_Season23B:IQF_environmentMid_Low_Dry -0.24710 0.05346 -4.622
## IQR_Season24A:IQF_environmentMid_Low_Dry 0.26474 0.05466 4.843
## IQR_Season24B:IQF_environmentMid_Low_Dry 0.19055 0.05409 3.523
## IQR_Season25A:IQF_environmentMid_Low_Dry -0.06885 0.09277 -0.742
## IQR_Season25B:IQF_environmentMid_Low_Dry -0.01160 0.05507 -0.211
## IQR_Season23B:IQF_environmentMid_Low_Wet -0.06178 0.05539 -1.115
## IQR_Season24A:IQF_environmentMid_Low_Wet 0.24877 0.05706 4.360
## IQR_Season24B:IQF_environmentMid_Low_Wet 0.15245 0.05622 2.712
## IQR_Season25A:IQF_environmentMid_Low_Wet 0.06260 0.09438 0.663
## IQR_Season25B:IQF_environmentMid_Low_Wet 0.05454 0.05756 0.948
## IQR_Season23B:IQF_environmentTransition -0.10401 0.06086 -1.709
## IQR_Season24A:IQF_environmentTransition 0.17929 0.06288 2.851
## IQR_Season24B:IQF_environmentTransition 0.15105 0.06199 2.437
## IQR_Season25A:IQF_environmentTransition 0.03154 0.09673 0.326
## IQR_Season25B:IQF_environmentTransition 0.04978 0.06287 0.792
## fit warnings:
## fixed-effect model matrix is rank deficient so dropping 2 columns / coefficients
summary(fit2_conv)
## Linear mixed model fit by REML ['lmerMod']
## Formula: log(Y_ratio + 1) ~ crop + year + crop:year + IQF_environment +
## crop:IQF_environment + year:IQF_environment + year:crop:IQF_environment +
## year:crop + (1 | IQR_block)
## Data: CONV_CA_Ratio
##
## REML criterion at convergence: -1909.8
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.2810 -0.5815 -0.0355 0.4949 6.5777
##
## Random effects:
## Groups Name Variance Std.Dev.
## IQR_block (Intercept) 0.005424 0.07365
## Residual 0.024837 0.15760
## Number of obs: 2818, groups: IQR_block, 565
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 1.10145 0.25968 4.242
## cropMaize 2.18031 0.47356 4.604
## year -0.02227 0.01083 -2.057
## IQF_environmentMid_Low_Dry -1.71564 0.33938 -5.055
## IQF_environmentMid_Low_Wet -0.79508 0.35516 -2.239
## IQF_environmentTransition -0.98950 0.38866 -2.546
## cropMaize:year -0.09316 0.01999 -4.660
## cropMaize:IQF_environmentMid_Low_Dry 0.43874 0.56254 0.780
## cropMaize:IQF_environmentMid_Low_Wet -1.24660 0.58205 -2.142
## cropMaize:IQF_environmentTransition -0.88133 0.61635 -1.430
## year:IQF_environmentMid_Low_Dry 0.07026 0.01415 4.965
## year:IQF_environmentMid_Low_Wet 0.03404 0.01482 2.298
## year:IQF_environmentTransition 0.04245 0.01621 2.619
## cropMaize:year:IQF_environmentMid_Low_Dry -0.01417 0.02366 -0.599
## cropMaize:year:IQF_environmentMid_Low_Wet 0.05497 0.02448 2.245
## cropMaize:year:IQF_environmentTransition 0.03960 0.02589 1.530
# Fixed effects significance
anova(fit2_conv)
# Residual diagnostics
plot(fit2_conv)
qqnorm(resid(fit2_conv)); qqline(resid(fit2_conv))
Description of the analysis: Used eemeans from the mixed-effects model to summarize the overall CA:CONV yield ratio, and to assess the effects of crop, year, and environment. Results were back-transformed for interpretation on the original yield ratio scale. Interaction effects were explored and visualized.
Description of the results: Conventional outperforms CA as a whole experimental average; overall CA:CONV ratio = 0.787 (eemeans for the same model shown above). Advantage of CONV was similar for maize and beans (t-tests of individual parameter estimates p=0.7). Although there was a significant effect of year, there was no trend for CA to perform better from year 1 to 3. Also, there was a clear effect of the environment with transition and mid-low-wet being more favourable for CA
overall_emm <- emmeans(fit2_conv, ~ 1, type = "response")
print(overall_emm)
## 1 response SE df lower.CL upper.CL
## overall 0.779 0.0081 580 0.763 0.795
##
## Results are averaged over the levels of: crop, IQF_environment
## Degrees-of-freedom method: kenward-roger
## Confidence level used: 0.95
## Intervals are back-transformed from the log(mu + 1) scale
library(emmeans)
# Compute emmeans for crop within year × environment
emm_crop_year_env <- emmeans(fit2_conv, ~ crop | year * IQF_environment, type = "response")
emm_df <- as.data.frame(emm_crop_year_env)
ggplot(emm_df, aes(x = IQF_environment, y = response, color = crop)) +
geom_point(position = position_dodge(0.4), size = 3) +
geom_errorbar(aes(ymin = lower.CL, ymax = upper.CL),
position = position_dodge(0.4), width = 0.2) +
facet_wrap(~ year, nrow = 1) +
labs(
title = "Estimated Yield Ratio (CA / CONV) by Crop and Environment",
x = "Environment",
y = "Estimated Yield Ratio (CA:CT)",
color = "Crop"
) +
theme_minimal(base_size = 14) +
theme(
axis.text.x = element_text(size = 8, angle = 45, hjust = 1)
)
## Does the % wins of CA increse progresibely from year 1 to 3?
Separated by crop
library(dplyr)
library(ggplot2)
library(broom)
library(tidyr)
# 1. Prepare data
trend_data <- CONV_CA_Ratio %>%
filter(
is.finite(Y_ratio),
year %in% c("23", "24", "25"),
IQR_SeasAB %in% c("A", "B")
) %>%
mutate(
year = as.numeric(year),
crop_type = ifelse(IQR_SeasAB == "A", "Maize", "Beans")
)
# 2. Function to compute slope counts
get_slope_label <- function(df) {
# One slope per block
slopes <- df %>%
group_by(IQR_block) %>%
filter(n_distinct(year) >= 2) %>%
summarise(slope = coef(lm(Y_ratio ~ year))[2], .groups = "drop") %>%
mutate(
direction = case_when(
slope > 0 ~ "↑ Positive",
slope < 0 ~ "↓ Negative",
TRUE ~ "= Flat"
)
) %>%
count(direction) %>%
complete(direction, fill = list(n = 0))
# Format label text
paste0(
"↑ Positive: ", slopes$n[slopes$direction == "↑ Positive"], "\n",
"↓ Negative: ", slopes$n[slopes$direction == "↓ Negative"], "\n",
"= Flat: ", slopes$n[slopes$direction == "= Flat"]
)
}
# Labels
label_maize <- get_slope_label(trend_data %>% filter(crop_type == "Maize"))
label_beans <- get_slope_label(trend_data %>% filter(crop_type == "Beans"))
# Common y-limit helper
y_max <- max(trend_data$Y_ratio, na.rm = TRUE)
# 3. MAIZE plot
p_maize <- ggplot(
trend_data %>% filter(crop_type == "Maize"),
aes(x = year, y = Y_ratio)
) +
geom_point(alpha = 0.35, size = 2) +
# Block-level linear trends
geom_smooth(
aes(group = IQR_block),
method = "lm",
se = FALSE,
color = "steelblue",
linewidth = 0.6
) +
# Global trend
geom_smooth(
method = "lm",
se = FALSE,
color = "red",
linewidth = 1.3
) +
# Annotation
annotate(
"label",
x = 23.05,
y = y_max * 0.95,
label = label_maize,
hjust = 0,
vjust = 1,
size = 4.5,
fontface = "bold",
fill = "white",
label.size = 0.4
) +
scale_x_continuous(breaks = c(23, 24, 25)) +
labs(
title = "Maize",
x = "Year",
y = "Yield Ratio (CA:CT)"
) +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", hjust = 0.5, size = 16)
)
# 4. BEANS plot
p_beans <- ggplot(
trend_data %>% filter(crop_type == "Beans"),
aes(x = year, y = Y_ratio)
) +
geom_point(alpha = 0.35, size = 2) +
# Block-level linear trends
geom_smooth(
aes(group = IQR_block),
method = "lm",
se = FALSE,
color = "darkorange",
linewidth = 0.6
) +
# Global trend
geom_smooth(
method = "lm",
se = FALSE,
color = "red",
linewidth = 1.3
) +
# Annotation
annotate(
"label",
x = 23.05,
y = y_max * 0.95,
label = label_beans,
hjust = 0,
vjust = 1,
size = 4.5,
fontface = "bold",
fill = "white",
label.size = 0.4
) +
scale_x_continuous(breaks = c(23, 24, 25)) +
labs(
title = "Beans",
x = "Year",
y = "Yield Ratio (CA:CT)"
) +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", hjust = 0.5, size = 16)
)
# 5. Print plots
print(p_maize)
print(p_beans)
Description of the analysis: We first classified each block into performance typologies (Very Poor to Good) based on its CA:CONV yield ratio across seasons.For each block, we calculated its average relative yield, identified whether it ever experienced a yield penalty greater than 5%, and compiled its full sequence of seasonal performances. Based on these indicators, each farm was grouped into one of four long-term performance types: -Good: no season < 0.95 -Acceptable: Average ≥ 0.95 but at least one season < 0.95 -Poor: Average between 0.85 and 0.95 -Very Poor: Average ≤ 0.85 These thresholds were defined arbitrarily but intentionally, informed by field experience and our understanding of what yield reductions farmers may find acceptable if longer-term soil-health and economic benefits of CA are demonstrated and communicated clearly. Importantly, this typology is not final—it will be refined and validated through farmer consultations to ensure the categories and cut-off points align with farmers’ own perceptions of acceptable and unacceptable performance. We then visualized the distribution of these typologies by environment and district. Finally, we summarized and plotted the average relative yield per block, aggregated by district, to highlight areas where CA performed relatively better.
Description of the results: Even if we look at the district level, the best districts have a poor CA performance on average. The district where CA performed better (as an average from the 6 seasons) was Gisagara, and even there, the average CA:CONV ratio was 0.95 and the % of plots where CA was => to CONV was 35%. Similarly only aroun 35% of th eplots in Gisagara can be defined as with CA having acceptable yield, the rest had poor or very poor.
Based on the relation among farmers “preference” and relative yield of the season
## Based on the focus groups discussions
# --- Sort data by Y_ratio ---
CONV_CA_Ratio <- CONV_CA_Ratio[order(CONV_CA_Ratio$Y_ratio), ]
# --- Create groups of 15 consecutive observations ---
CONV_CA_Ratio$group10 <- rep(1:ceiling(nrow(CONV_CA_Ratio)/15), each = 15)[1:nrow(CONV_CA_Ratio)]
# --- Aggregate to get group means ---
library(dplyr)
grouped_data <- CONV_CA_Ratio %>%
group_by(group10) %>%
summarise(
mean_Y_ratio = mean(Y_ratio, na.rm = TRUE),
successes = sum(DQ_future_adoption, na.rm = TRUE),
n = n(),
.groups = "drop"
)
grouped_data$failures <- grouped_data$n - grouped_data$successes
grouped_data$mean_DQ <- grouped_data$successes / grouped_data$n
# --- Logistic regression ---
model <- glm(
cbind(successes, failures) ~ mean_Y_ratio,
family = binomial,
data = grouped_data
)
# --- Compute McFadden pseudo R² ---
null_model <- glm(
cbind(successes, failures) ~ 1,
family = binomial,
data = grouped_data
)
R2_McFadden <- 1 - (as.numeric(logLik(model)) /
as.numeric(logLik(null_model)))
R2_McFadden
## [1] 0.2263066
# --- Compute Y_ratio where predicted probability = 0.7 ---
beta0 <- coef(model)[1]
beta1 <- coef(model)[2]
target_p <- 0.7
logit_target <- log(target_p / (1 - target_p))
Y_ratio_07 <- (logit_target - beta0) / beta1
# --- Create smooth prediction curve ---
newdata <- data.frame(
mean_Y_ratio = seq(min(grouped_data$mean_Y_ratio),
max(grouped_data$mean_Y_ratio),
length.out = 200)
)
newdata$pred <- predict(model, newdata, type = "response")
# --- Plot ---
plot(grouped_data$mean_Y_ratio,
grouped_data$mean_DQ,
pch = 16,
xlab = "Mean CA:CONV Yield Ratio (groups of 15)",
ylab = "Proportion of future Adoption",
ylim = c(0,1))
lines(newdata$mean_Y_ratio,
newdata$pred,
col = "blue",
lwd = 2)
# --- Reference lines ---
abline(h = 0.7, col = "red", lty = 2)
abline(v = Y_ratio_07, col = "darkgreen", lwd = 2)
# --- Annotations ---
text(min(grouped_data$mean_Y_ratio), 0.76,
labels = "Proportion = 0.7",
col = "red",
pos = 4)
text(Y_ratio_07, 0.2,
labels = paste0("Y_ratio = ", round(Y_ratio_07, 3)),
pos = 4,
col = "darkgreen")
# --- Add R² to plot ---
text(max(grouped_data$mean_Y_ratio),
0.95,
labels = paste0("McFadden R² = ",
round(R2_McFadden, 3)),
pos = 2)
### Smae but 2-segments model instead of sigmoidela
## Based on the focus groups discussions
library(dplyr)
library(segmented)
# --- Sort data by Y_ratio ---
CONV_CA_Ratio <- CONV_CA_Ratio[order(CONV_CA_Ratio$Y_ratio), ]
# --- Create groups of 15 ---
CONV_CA_Ratio$group15 <- rep(1:ceiling(nrow(CONV_CA_Ratio)/20), each = 20)[1:nrow(CONV_CA_Ratio)]
# --- Aggregate ---
grouped_data <- CONV_CA_Ratio %>%
group_by(group15) %>%
summarise(
mean_Y_ratio = mean(Y_ratio, na.rm = TRUE),
mean_DQ = mean(DQ_future_adoption, na.rm = TRUE),
.groups = "drop"
)
# --- Fit models ---
lm_model <- lm(mean_DQ ~ mean_Y_ratio, data = grouped_data)
seg_model <- segmented(lm_model,
seg.Z = ~ mean_Y_ratio,
npsi = 1)
# --- Extract breakpoint ---
breakpoint <- seg_model$psi[2]
# --- Predicted Y at breakpoint ---
Y_at_breakpoint <- predict(seg_model,
newdata = data.frame(mean_Y_ratio = breakpoint))
# --- Compute R² ---
R2 <- summary(seg_model)$r.squared
# --- Define NEW target dynamically ---
target <- Y_at_breakpoint - 0.15
# --- Prediction grid ---
newdata <- data.frame(
mean_Y_ratio = seq(min(grouped_data$mean_Y_ratio),
max(grouped_data$mean_Y_ratio),
length.out = 500)
)
newdata$pred <- predict(seg_model, newdata)
# --- Compute Yield Ratio where predicted proportion = target ---
idx <- which.min(abs(newdata$pred - target))
Y_ratio_target <- newdata$mean_Y_ratio[idx]
# --- Plot ---
plot(grouped_data$mean_Y_ratio,
grouped_data$mean_DQ,
pch = 16,
xlab = "Mean CA:CT Yield Ratio (groups of 15)",
ylab = "Proportion of future Adoption",
ylim = c(0,1))
lines(newdata$mean_Y_ratio,
newdata$pred,
col = "blue",
lwd = 2)
# --- Breakpoint ---
abline(v = breakpoint,
col = "darkgreen",
lwd = 2,
lty = 2)
abline(h = Y_at_breakpoint,
col = "darkgreen",
lty = 3)
points(breakpoint, Y_at_breakpoint,
pch = 19, col = "darkgreen")
# --- New dynamic target ---
abline(h = target,
col = "red",
lty = 2)
abline(v = Y_ratio_target,
col = "red",
lwd = 2)
points(Y_ratio_target, target,
pch = 19, col = "red")
# --- Annotations ---
text(breakpoint, 0.0,
labels = paste0("Breakpoint = ",
round(breakpoint, 3)),
col = "darkgreen",
pos = 4)
text(breakpoint, Y_at_breakpoint + 0.05,
labels = paste0("Adoption at Break = ",
round(Y_at_breakpoint, 3)),
col = "darkgreen",
pos = 4)
text(Y_ratio_target, 0.2,
labels = paste0("Yield Ratio (Break − 0.15) = ",
round(Y_ratio_target, 3)),
col = "red",
pos = 4)
text(max(grouped_data$mean_Y_ratio),
0.95,
labels = paste0("R² = ",
round(R2, 3)),
pos = 2)
### Same as avobe but only for farmers that perceived erosion reduction
by CA AND have large slope >5
## Based on the focus groups discussions
library(dplyr)
library(segmented)
# --- Filter to DG_erosion_reduction_rate == 1 ---
CONV_subset <- CONV_CA_Ratio %>%
filter(Slope > 4) %>%
filter(DQ_erosion_reduction_rate == 1)
# --- Sort data by Y_ratio ---
CONV_subset <- CONV_subset[order(CONV_subset$Y_ratio), ]
# --- Create groups of 20 ---
CONV_subset$group20 <- rep(1:ceiling(nrow(CONV_subset)/20), each = 20)[1:nrow(CONV_subset)]
# --- Aggregate ---
grouped_data <- CONV_subset %>%
group_by(group20) %>%
summarise(
mean_Y_ratio = mean(Y_ratio, na.rm = TRUE),
mean_DQ = mean(DQ_future_adoption, na.rm = TRUE),
.groups = "drop"
)
# --- Fit models ---
lm_model <- lm(mean_DQ ~ mean_Y_ratio, data = grouped_data)
seg_model <- segmented(lm_model,
seg.Z = ~ mean_Y_ratio,
npsi = 1)
# --- Extract breakpoint ---
breakpoint <- seg_model$psi[2]
# --- Predicted Y at breakpoint ---
Y_at_breakpoint <- predict(
seg_model,
newdata = data.frame(mean_Y_ratio = breakpoint)
)
# --- Compute R² ---
R2 <- summary(seg_model)$r.squared
# --- Define dynamic target ---
target <- Y_at_breakpoint - 0.15
# --- Prediction grid ---
newdata <- data.frame(
mean_Y_ratio = seq(min(grouped_data$mean_Y_ratio),
max(grouped_data$mean_Y_ratio),
length.out = 500)
)
newdata$pred <- predict(seg_model, newdata)
# --- Compute Yield Ratio where predicted proportion = target ---
idx <- which.min(abs(newdata$pred - target))
Y_ratio_target <- newdata$mean_Y_ratio[idx]
# --- Plot ---
plot(grouped_data$mean_Y_ratio,
grouped_data$mean_DQ,
pch = 16,
xlab = "Mean CA:CONV Yield Ratio (DG_erosion_reduction_rate = 1)",
ylab = "Proportion of future Adoption",
ylim = c(0,1))
lines(newdata$mean_Y_ratio,
newdata$pred,
col = "blue",
lwd = 2)
# --- Breakpoint ---
abline(v = breakpoint,
col = "darkgreen",
lwd = 2,
lty = 2)
abline(h = Y_at_breakpoint,
col = "darkgreen",
lty = 3)
points(breakpoint, Y_at_breakpoint,
pch = 19, col = "darkgreen")
# --- Target line ---
abline(h = target,
col = "red",
lty = 2)
abline(v = Y_ratio_target,
col = "red",
lwd = 2)
points(Y_ratio_target, target,
pch = 19, col = "red")
# --- Annotations ---
text(breakpoint, 0.05,
labels = paste0("Breakpoint = ",
round(breakpoint, 3)),
col = "darkgreen",
pos = 4)
text(breakpoint, Y_at_breakpoint + 0.05,
labels = paste0("Adoption at Break = ",
round(Y_at_breakpoint, 3)),
col = "darkgreen",
pos = 4)
text(Y_ratio_target, 0.15,
labels = paste0("Yield Ratio (Break − 0.15) = ",
round(Y_ratio_target, 3)),
col = "red",
pos = 4)
text(max(grouped_data$mean_Y_ratio),
0.95,
labels = paste0("R² = ",
round(R2, 3)),
pos = 2)
## Based on the focus groups discussions
library(dplyr)
library(ggplot2)
# --- Create slope categories ---
CONV_CA_Ratio <- CONV_CA_Ratio %>%
mutate(
slope_cat = case_when(
Slope <= 5 ~ "Low (≤5)",
Slope > 5 & Slope <= 15 ~ "Mid (5–15)",
Slope > 15 ~ "High (>15)"
)
)
# --- Sort within slope category ---
CONV_CA_Ratio <- CONV_CA_Ratio %>%
arrange(slope_cat, Y_ratio)
# --- Create groups of 15 within each slope category ---
CONV_CA_Ratio <- CONV_CA_Ratio %>%
group_by(slope_cat) %>%
mutate(group15 = ceiling(row_number() / 20)) %>%
ungroup()
# --- Aggregate ---
grouped_data <- CONV_CA_Ratio %>%
group_by(slope_cat, group15) %>%
summarise(
mean_Y_ratio = mean(Y_ratio, na.rm = TRUE),
successes = sum(DQ_future_adoption, na.rm = TRUE),
n = n(),
.groups = "drop"
) %>%
mutate(
failures = n - successes,
mean_DQ = successes / n
)
# --- Fit logistic model per slope category ---
models <- grouped_data %>%
group_by(slope_cat) %>%
group_map(~ glm(cbind(successes, failures) ~ mean_Y_ratio,
family = binomial,
data = .x))
names(models) <- unique(grouped_data$slope_cat)
# --- Compute threshold Y_ratio where p = 0.7 for each slope ---
thresholds <- lapply(models, function(m) {
beta0 <- coef(m)[1]
beta1 <- coef(m)[2]
logit_target <- log(0.7 / (1 - 0.7))
(logit_target - beta0) / beta1
})
threshold_df <- data.frame(
slope_cat = names(thresholds),
Y_ratio_07 = unlist(thresholds),
prop = 0.7
)
# --- Plot with facets ---
ggplot(grouped_data, aes(x = mean_Y_ratio, y = mean_DQ)) +
geom_point() +
stat_smooth(method = "glm",
method.args = list(family = "binomial"),
se = FALSE,
color = "blue") +
geom_hline(yintercept = 0.7, color = "red", linetype = "dashed") +
geom_vline(data = threshold_df,
aes(xintercept = Y_ratio_07),
color = "darkgreen",
linewidth = 1) +
geom_point(data = threshold_df,
aes(x = Y_ratio_07, y = prop),
inherit.aes = FALSE,
color = "black") +
geom_text(data = threshold_df,
aes(x = Y_ratio_07, y = 0.15,
label = paste0("Y_ratio = ",
round(Y_ratio_07, 3))),
color = "darkgreen",
inherit.aes = FALSE) +
facet_wrap(~ slope_cat) +
labs(
x = "Mean Y_ratio (groups of 15)",
y = "Proportion DG_future_adoption"
) +
ylim(0,1) +
theme_minimal()
### Another way
## Based on the focus groups discussions
library(dplyr)
library(segmented)
# --- Sort data by Slope ---
CONV_CA_Ratio <- CONV_CA_Ratio[order(CONV_CA_Ratio$Slope), ]
# --- Create groups of 20 ---
CONV_CA_Ratio$group20 <- rep(1:ceiling(nrow(CONV_CA_Ratio)/20), each = 20)[1:nrow(CONV_CA_Ratio)]
# --- Aggregate ---
grouped_data <- CONV_CA_Ratio %>%
group_by(group20) %>%
summarise(
mean_Slope = mean(Slope, na.rm = TRUE),
mean_DQ = mean(DQ_future_adoption, na.rm = TRUE),
.groups = "drop"
)
# --- Fit models ---
lm_model <- lm(mean_DQ ~ mean_Slope, data = grouped_data)
seg_model <- segmented(lm_model,
seg.Z = ~ mean_Slope,
npsi = 1)
# --- Extract breakpoint (Slope value) ---
breakpoint <- seg_model$psi[2]
# --- Predicted Y at breakpoint ---
Y_at_breakpoint <- predict(
seg_model,
newdata = data.frame(mean_Slope = breakpoint)
)
# --- Compute R² ---
R2 <- summary(seg_model)$r.squared
# --- Define dynamic target ---
target <- Y_at_breakpoint - 0.15
# --- Prediction grid ---
newdata <- data.frame(
mean_Slope = seq(min(grouped_data$mean_Slope),
max(grouped_data$mean_Slope),
length.out = 500)
)
newdata$pred <- predict(seg_model, newdata)
# --- Compute Slope where predicted proportion = target ---
idx <- which.min(abs(newdata$pred - target))
Slope_target <- newdata$mean_Slope[idx]
# --- Plot ---
plot(grouped_data$mean_Slope,
grouped_data$mean_DQ,
pch = 16,
xlab = "Mean Slope (groups of 20)",
ylab = "Proportion of future Adoption",
ylim = c(0,1))
lines(newdata$mean_Slope,
newdata$pred,
col = "blue",
lwd = 2)
# --- Breakpoint ---
abline(v = breakpoint,
col = "darkgreen",
lwd = 2,
lty = 2)
abline(h = Y_at_breakpoint,
col = "darkgreen",
lty = 3)
points(breakpoint, Y_at_breakpoint,
pch = 19, col = "darkgreen")
# --- Target line ---
abline(h = target,
col = "red",
lty = 2)
abline(v = Slope_target,
col = "red",
lwd = 2)
points(Slope_target, target,
pch = 19, col = "red")
# --- Annotations ---
text(breakpoint, 0.05,
labels = paste0("Breakpoint (Slope) = ",
round(breakpoint, 3)),
col = "darkgreen",
pos = 4)
text(breakpoint, Y_at_breakpoint + 0.05,
labels = paste0("Adoption at Break = ",
round(Y_at_breakpoint, 3)),
col = "darkgreen",
pos = 4)
text(Slope_target, 0.15,
labels = paste0("Slope (Break − 0.15) = ",
round(Slope_target, 3)),
col = "red",
pos = 4)
text(max(grouped_data$mean_Slope),
0.95,
labels = paste0("R² = ",
round(R2, 3)),
pos = 2)
# --- Sort data by Crop and Y_ratio ---
CONV_CA_Ratio <- CONV_CA_Ratio[order(CONV_CA_Ratio$crop,
CONV_CA_Ratio$Y_ratio), ]
library(dplyr)
# --- Create groups of 10 consecutive observations WITHIN each Crop ---
CONV_CA_Ratio <- CONV_CA_Ratio %>%
group_by(crop) %>%
mutate(group10 = ceiling(row_number() / 10)) %>%
ungroup()
# --- Aggregate to get group means per Crop ---
grouped_data <- CONV_CA_Ratio %>%
group_by(crop, group10) %>%
summarise(
mean_Y_ratio = mean(Y_ratio, na.rm = TRUE),
successes = sum(DQ_future_adoption, na.rm = TRUE),
n = n(),
.groups = "drop"
)
grouped_data$failures <- grouped_data$n - grouped_data$successes
grouped_data$mean_DQ <- grouped_data$successes / grouped_data$n
# --- Logistic regression including Crop ---
model <- glm(
cbind(successes, failures) ~ mean_Y_ratio + crop,
family = binomial,
data = grouped_data
)
summary(model)
##
## Call:
## glm(formula = cbind(successes, failures) ~ mean_Y_ratio + crop,
## family = binomial, data = grouped_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.04297 0.12410 -8.404 <2e-16 ***
## mean_Y_ratio 2.06496 0.15242 13.548 <2e-16 ***
## cropMaize -0.13291 0.08141 -1.633 0.103
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 658.55 on 282 degrees of freedom
## Residual deviance: 429.21 on 280 degrees of freedom
## AIC: 1114.3
##
## Number of Fisher Scoring iterations: 5
# --- Create prediction grid for each Crop ---
newdata <- expand.grid(
mean_Y_ratio = seq(min(grouped_data$mean_Y_ratio),
max(grouped_data$mean_Y_ratio),
length.out = 200),
crop = unique(grouped_data$crop)
)
newdata$pred <- predict(model, newdata, type = "response")
# --- Plot ---
cols <- rainbow(length(unique(grouped_data$crop)))
crop_levels <- unique(grouped_data$crop)
plot(NULL,
xlim = range(grouped_data$mean_Y_ratio),
ylim = c(0, 1),
xlab = "Mean Y_ratio (groups of 10)",
ylab = "Proportion DQ_future_adoption")
for(i in seq_along(crop_levels)) {
crop_i <- crop_levels[i]
# Points
points(
grouped_data$mean_Y_ratio[grouped_data$crop == crop_i],
grouped_data$mean_DQ[grouped_data$crop == crop_i],
col = cols[i],
pch = 16
)
# Logistic curve
lines(
newdata$mean_Y_ratio[newdata$crop == crop_i],
newdata$pred[newdata$crop == crop_i],
col = cols[i],
lwd = 2
)
}
legend("topleft",
legend = crop_levels,
col = cols,
pch = 16,
lwd = 2)
library(dplyr)
library(tidyr)
# --- Create slope categories and remove NA ---
erosion_table <- CONV_CA_Ratio %>%
filter(
!is.na(DQ_erosion_reduction_rate),
!is.na(Slope),
!is.na(IQR_Season)
) %>%
mutate(
slope_cat = case_when(
Slope <= 5 ~ "Low",
Slope > 5 & Slope <= 15 ~ "Mid",
Slope > 15 ~ "High"
)
) %>%
group_by(IQR_Season, slope_cat) %>%
summarise(
N = n(),
perc = mean(DQ_erosion_reduction_rate) * 100,
.groups = "drop"
) %>%
mutate(
value = paste0(round(perc, 1), "% (", N, ")")
) %>%
dplyr::select(IQR_Season, slope_cat, value) %>%
pivot_wider(
names_from = slope_cat,
values_from = value
) %>%
arrange(IQR_Season)
erosion_table
library(dplyr)
library(tidyr)
# Step 1: Define target seasons
seasons <- c("23A", "23B", "24A", "24B", "25A", "25B")
# Step 2: Compute number of valid seasons per block
blocks_with_enough_seasons <- CONV_CA_Ratio %>%
filter(IQR_Season %in% seasons, is.finite(Y_ratio)) %>%
group_by(IQR_block) %>%
summarise(n_valid_seasons = n_distinct(IQR_Season), .groups = "drop") %>%
filter(n_valid_seasons >= 4)
# Step 3: Filter main dataset to include only blocks with ≥4 seasons
filtered_yield <- CONV_CA_Ratio %>%
filter(IQR_block %in% blocks_with_enough_seasons$IQR_block,
IQR_Season %in% seasons,
is.finite(Y_ratio))
# Step 4: Prepare wide-format Relative Yield table
RY_wide <- filtered_yield %>%
group_by(IQR_block, IQR_Season) %>%
summarise(Y_ratio = mean(Y_ratio, na.rm = TRUE), .groups = "drop") %>%
pivot_wider(
names_from = IQR_Season,
values_from = Y_ratio,
names_prefix = "Y_ratio_"
)
# Step 5: Compute performance typology
Perf_Type <- filtered_yield %>%
group_by(IQR_block) %>%
summarise(
n_seasons = n_distinct(IQR_Season),
any_below_0.95 = any(Y_ratio < 0.95),
mean_ratio = mean(Y_ratio, na.rm = TRUE),
.groups = "drop"
) %>%
mutate(
CA_typology = case_when(
!any_below_0.95 ~ "Good",
any_below_0.95 & mean_ratio >= 0.95 ~ "Acceptable",
any_below_0.95 & mean_ratio < 0.95 & mean_ratio > 0.85 ~ "Poor",
mean_ratio <= 0.85 ~ "Very Poor"
)
)
# Step 6 (fixed): Get block-level metadata using fallback from multiple seasons
block_meta <- CONV_CA_Ratio %>%
filter(IQR_Season %in% c("23A", "24A", "25A")) %>%
arrange(IQR_block, match(IQR_Season, c("23A", "24A", "25A"))) %>% # Prioritize 23A > 24A > 25A
distinct(IQR_block, .keep_all = TRUE) %>%
dplyr::select(IQR_block, IQF_environment, IQF_region, IQF_District)
# Step 7: Join all into Perf_Type
Perf_Type <- Perf_Type %>%
left_join(RY_wide, by = "IQR_block") %>%
left_join(block_meta, by = "IQR_block")
library(dplyr)
library(tidyr)
# Step 1: Define target seasons
seasons <- c("23A", "23B", "24A", "24B", "25A", "25B")
# Step 2: Compute number of valid seasons per block
blocks_with_enough_seasons <- CONV_CA_Ratio %>%
filter(IQR_Season %in% seasons, is.finite(Y_ratio)) %>%
group_by(IQR_block) %>%
summarise(n_valid_seasons = n_distinct(IQR_Season), .groups = "drop") %>%
filter(n_valid_seasons >= 4)
# Step 3: Filter main dataset to include only blocks with ≥4 seasons
filtered_yield <- CONV_CA_Ratio %>%
filter(IQR_block %in% blocks_with_enough_seasons$IQR_block,
IQR_Season %in% seasons,
is.finite(Y_ratio))
# Step 4: Prepare wide-format Relative Yield table
RY_wide <- filtered_yield %>%
group_by(IQR_block, IQR_Season) %>%
summarise(Y_ratio = mean(Y_ratio, na.rm = TRUE), .groups = "drop") %>%
pivot_wider(
names_from = IQR_Season,
values_from = Y_ratio,
names_prefix = "Y_ratio_"
)
# Step 5: Compute performance typology
Perf_Type_09 <- filtered_yield %>%
group_by(IQR_block) %>%
summarise(
n_seasons = n_distinct(IQR_Season),
any_below_0.9 = any(Y_ratio < 0.9),
mean_ratio = mean(Y_ratio, na.rm = TRUE),
.groups = "drop"
) %>%
mutate(
CA_typology_09 = case_when(
!any_below_0.9 ~ "Good",
any_below_0.9 & mean_ratio >= 0.9 ~ "Acceptable",
any_below_0.9 & mean_ratio < 0.9 & mean_ratio > 0.8 ~ "Poor",
mean_ratio <= 0.8 ~ "Very Poor"
)
)
# Step 6 (fixed): Get block-level metadata using fallback from multiple seasons
block_meta <- CONV_CA_Ratio %>%
filter(IQR_Season %in% c("23A", "24A", "25A")) %>%
arrange(IQR_block, match(IQR_Season, c("23A", "24A", "25A"))) %>% # Prioritize 23A > 24A > 25A
distinct(IQR_block, .keep_all = TRUE) %>%
dplyr::select(IQR_block, IQF_environment, IQF_region, IQF_District)
# Step 7: Join all into Perf_Type
Perf_Type_09 <- Perf_Type_09 %>%
left_join(RY_wide, by = "IQR_block") %>%
left_join(block_meta, by = "IQR_block")
# Set factor levels for consistent order
Perf_Type$CA_typology <- factor(Perf_Type$CA_typology,
levels = c("Very Poor", "Poor", "Acceptable", "Good"))
ggplot(Perf_Type, aes(x = CA_typology, fill = CA_typology)) +
geom_bar(
aes(y = after_stat(..count.. / tapply(..count.., PANEL, sum)[PANEL] * 100)),
position = "stack"
) +
facet_wrap(~ IQF_environment) +
labs(
title = "CA-Performance Typology Distribution by OAF-AEZ",
x = "Performance Typology",
y = "Percentage of Performance Typologies"
) +
scale_fill_manual(values = c(
"Very Poor" = "#e41a1c",
"Poor" = "#ff7f00",
"Acceptable" = "#377eb8",
"Good" = "#4daf4a"
)) +
theme_minimal() +
theme(
legend.position = "none",
strip.text = element_text(face = "bold")
)
# Set factor levels for consistent order
Perf_Type$CA_typology <- factor(Perf_Type$CA_typology,
levels = c("Very Poor", "Poor", "Acceptable", "Good"))
ggplot(Perf_Type, aes(x = CA_typology, fill = CA_typology)) +
geom_bar(
aes(y = after_stat(..count.. / tapply(..count.., PANEL, sum)[PANEL] * 100)),
position = "stack"
) +
facet_wrap(~ IQF_District) +
labs(
title = "CA-Performance Typology Distribution by District",
x = "Performance Typology",
y = "Percentage of Performance Typologies"
) +
scale_fill_manual(values = c(
"Very Poor" = "#e41a1c",
"Poor" = "#ff7f00",
"Acceptable" = "#377eb8",
"Good" = "#4daf4a"
)) +
theme_minimal() +
theme(
legend.position = "none",
strip.text = element_text(face = "bold")
)
library(dplyr)
library(ggplot2)
library(glue)
# 1. Mean Y_ratio per block (across seasons)
block_means <- CONV_CA_Ratio %>%
filter(is.finite(Y_ratio)) %>%
group_by(IQR_block, IQF_District) %>%
summarise(mean_Y_ratio = mean(Y_ratio, na.rm = TRUE), .groups = "drop")
# 2. Summary for annotations
district_summary <- block_means %>%
group_by(IQF_District) %>%
summarise(
avg_ratio = round(mean(mean_Y_ratio, na.rm = TRUE), 2),
n_total = n(),
n_ge1 = sum(mean_Y_ratio >= 1, na.rm = TRUE),
percent_ge1 = round(100 * n_ge1 / n_total)
) %>%
mutate(
label = glue("≥1: {percent_ge1}%\nµ = {avg_ratio}"),
y_pos = 1.5
)
# 3. Plot
ggplot(block_means, aes(x = IQF_District, y = mean_Y_ratio)) +
geom_boxplot(outlier.shape = NA, fill = "gray90") +
geom_jitter(width = 0.2, alpha = 0.5, color = "steelblue", size = 2) +
geom_text(
data = district_summary,
aes(x = IQF_District, y = y_pos, label = label),
inherit.aes = FALSE,
size = 2,
vjust = 0
) +
labs(
title = "Distribution of Mean Y_ratio per Block by District",
x = "District",
y = "Mean Y_ratio per Block (across seasons)"
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(face = "bold", hjust = 0.5)
)
Description of the analysis: We analyzed the block-level average performance of Conservation Agriculture (CA) relative to Conventional Agriculture (CONV) by calculating the mean CA:CONV yield ratio per block across all seasons (i.e., 3 years = 6 seasons, 3 maize + 3 beans). A cumulative distribution plot was used to visualize how many blocks reached or exceeded yield parity (Y_ratio ≥ 1). Additionally, density plots by agro-environment visualized how CA performance varied across zones.
Description of the results: As we showed previously, when examining the average performance of CA at the country, environment, or district level, results are discouraging, suggesting that a blanket recommendation of this package is unlikely to generate broad impact or adoption, even if we target the best environmetn or district. However, when zooming in to the plot level, there are a substantial number of cases where CA either outperforms CONV or has a similar performance. Specifically, CA outperforms CONV in 18% of blocks, and in 21% of cases achieves a yield ratio close to or above acceptable thresholds (Shown before in the figure of Performance Typologies)
library(dplyr)
library(ggplot2)
# 1. Summarize data to block level
block_data <- CONV_CA_Ratio %>%
filter(is.finite(Y_ratio)) %>%
group_by(IQR_block) %>%
summarise(
avg_Y_ratio = mean(Y_ratio, na.rm = TRUE),
IQF_environment = first(IQF_environment), # to retain for optional grouping
.groups = "drop"
) %>%
arrange(avg_Y_ratio) %>%
mutate(cum_percent = 100 * (row_number() / n()))
# 2. Plot cumulative distribution
ggplot(block_data, aes(x = avg_Y_ratio, y = cum_percent)) +
geom_point(size = 2, alpha = 0.7, color = "darkorange") +
geom_vline(xintercept = 1, linetype = "dashed", color = "blue") +
scale_x_continuous(limits = c(0, 2.5)) +
scale_y_continuous(breaks = seq(0, 100, by = 10)) +
labs(
x = "Block-Level Average Yield Ratio (CA / CONV)",
y = "Cumulative % of Blocks",
title = "Cumulative Distribution of Relative Yield (CA vs CONV)",
subtitle = "Each point is a block's average over all seasons; Dashed line = yield parity",
) +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", hjust = 0.5),
plot.subtitle = element_text(size = 10, hjust = 0.5)
)
##Visual distribution of CONV advantage vs CA_Yield
library(ggplot2)
ggplot(CONV_CA_Ratio, aes(x = Y_ratio, fill = IQF_environment)) +
geom_density(alpha = 0.3) +
geom_vline(xintercept = 1, linetype = "dashed") +
labs(title = "Density of Relative Yield (CA / CONV) by Agzone")
We start with the database of yield drivers that was already processed in the scrips: “data_mm_block” but we change the relative
data_mm_block <- read_csv("D:/Mes Donnees/OAF_CIRAD/CA_MoU/data/processed/data_mm_block.csv")
drivers_data <- data_mm_block %>%
mutate(
CA_typology = fct_drop(CA_typology),
IQR_block = factor(IQR_block),
IQR_TF_tubura_client = factor(IQR_TF_tubura_client),
IQR_plot_fert_quality = factor(IQR_plot_fert_quality),
IQR_soil_texture = fct_na_value_to_level(factor(IQR_soil_texture), "Unknown"),
IQR_soil_color = fct_na_value_to_level(factor(IQR_soil_color), "Unknown"),
IQF_position_slope = fct_na_value_to_level(factor(IQF_position_slope), "Unknown"),
Weed_species_A_combined = fct_na_value_to_level(factor(Weed_species_A_combined), "Unknown")
)
# We add the variable slope to drivers_data
drivers_data <- drivers_data %>%
left_join(
slope %>% dplyr::select(IQR_block, Slope),
by = "IQR_block"
)
str(drivers_data)
## tibble [473 × 19] (S3: tbl_df/tbl/data.frame)
## $ IQR_block : chr [1:473] "102_10" "102_11" "102_12" "102_13" ...
## $ ICR_N_perc_23A : num [1:473] 0.21 0.17 0.17 0.08 0.11 0.19 0.22 0.31 0.39 0.37 ...
## $ ICR_Org_C_avg : num [1:473] 2.01 1.75 2.16 1.68 1.99 ...
## $ ICR_K_perc_23A : num [1:473] 0.75 0.32 0.57 0.41 0.42 0.55 1.75 1.39 2.09 1.25 ...
## $ ICR_Avail_P_avg : num [1:473] 11.7 11.5 22 60.6 35.6 ...
## $ ICR_arable_land_avg : num [1:473] 40.4 56.4 45.8 54 28.2 54.6 51.8 47.8 60.2 38.2 ...
## $ ICF_Alt_avg : num [1:473] 1636 1293 1141 1207 1145 ...
## $ ICR_rainfall_avg : num [1:473] 3.96 3.97 3.97 3.97 3.97 ...
## $ Comp_amount_corr_quali : num [1:473] 5.17 4.39 4.42 4.73 4.9 ...
## $ ICF_planting_date : num [1:473] 8.2 9.4 6.75 12.4 5.2 2.8 6.4 4.4 5.2 6.8 ...
## $ IQR_TF_tubura_client : Factor w/ 3 levels "---","0","1": 3 3 2 3 2 3 3 3 3 3 ...
## $ IQR_plot_fert_quality : Factor w/ 4 levels "---","high","low_quality",..: 4 2 4 2 4 4 4 2 4 2 ...
## $ IQR_soil_texture : Factor w/ 4 levels "Coarse","Fine",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ IQR_soil_color : Factor w/ 6 levels "#N/A","black",..: 2 2 2 5 2 5 2 2 5 2 ...
## $ IQF_position_slope : Factor w/ 7 levels "---","Backslope",..: 4 2 2 2 4 2 6 2 2 2 ...
## $ Weed_species_A_combined: Factor w/ 5 levels "—","Broadleaf",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ CA_typology : Factor w/ 3 levels "Acceptable","Poor",..: 3 1 3 2 1 3 1 1 2 2 ...
## $ IQF_environment : chr [1:473] "Mid_Low_Wet" "Mid_Low_Wet" "Mid_Low_Wet" "Mid_Low_Wet" ...
## $ Slope : num [1:473] 15 15 15 20 12 14 15 8 10 8 ...
# 1. Remove 'CA_typology' column from drivers_data
drivers_data <- drivers_data |> dplyr::select(-CA_typology)
# 2. Calculate average Y_ratio by IQR_block from CONV_CA_Ratio
# First, ensure grouping by both IQR_block and IQR_season, then average per IQR_block
avg_Y_ratio <- CONV_CA_Ratio |>
dplyr::group_by(IQR_block, IQR_Season) |>
dplyr::summarise(Y_ratio = mean(Y_ratio, na.rm = TRUE), .groups = "drop") |>
dplyr::group_by(IQR_block) |>
dplyr::summarise(avg_Y_ratio = mean(Y_ratio, na.rm = TRUE), .groups = "drop")
# 3. Merge avg_Y_ratio into drivers_data by IQR_block
drivers_data <- drivers_data |>
dplyr::left_join(avg_Y_ratio, by = "IQR_block")
This is the relative yield for maize and beans averaged per block, but exluding 2023 as the cover crop (mucuna) was too competitive and was later changed to hemp
library(dplyr)
# 2. Compute average Y_ratio for year 24 and 25, grouped by IQR_block
avg_Y_ratio_2425 <- CONV_CA_Ratio %>%
filter(year %in% c(24, 25)) %>% # Keep only year 24 and 25
group_by(IQR_block, IQR_Season) %>% # Group by block and season
summarise(Y_ratio = mean(Y_ratio, na.rm = TRUE), .groups = "drop") %>%
group_by(IQR_block) %>%
summarise(Y_ratio_2425 = mean(Y_ratio, na.rm = TRUE), .groups = "drop") # Final average per block
# 3. Merge this into drivers_data by IQR_block
drivers_data <- drivers_data %>%
left_join(avg_Y_ratio_2425, by = "IQR_block")
This is the relative yield for maize only, averaged for 3 years
library(dplyr)
# 2. Compute average Y_ratio for year == 25, grouped by IQR_block
Y_ratio_maize <- CONV_CA_Ratio %>%
filter(crop == "Maize") %>% # Keep only year 25
group_by(IQR_block, IQR_Season) %>% # Group by block and season
summarise(Y_ratio = mean(Y_ratio, na.rm = TRUE), .groups = "drop") %>%
group_by(IQR_block) %>%
summarise(Y_ratio_maize = mean(Y_ratio, na.rm = TRUE), .groups = "drop") # Final average per block
# 3. Merge this into drivers_data by IQR_block
drivers_data <- drivers_data %>%
left_join(Y_ratio_maize, by = "IQR_block")
library(dplyr)
# Merge CA_typology from Perf_Type using IQR_block as the key
drivers_data <- drivers_data |>
left_join(Perf_Type |>
dplyr::select(IQR_block, CA_typology),
by = "IQR_block")
drivers_data <- drivers_data |>
left_join(Perf_Type_09 |>
dplyr::select(IQR_block, CA_typology_09),
by = "IQR_block")
#Explore, visually the protential drivers and the relatio nwith Y_ration of the blocks
Variables we have so far (after filtering some out and re-grouping cathegories with low N: ICR_Org_C_avg num: ICR_K_perc_23A num: ICR_Avail_P_avg num: ICR_arable_land_avg num: ICF_Alt_avg num: ICR_rainfall_avg num: Comp_amount_corr_quali num: ICF_planting_date num: IQR_TF_tubura_client Factor: IQR_plot_fert_quality Factor: IQR_soil_texture Factor w/ 4 levels: IQR_soil_color Factor w/ 6 levels: IQF_position_slope Factor w/ 7 levels: Weed_species_A_combined: Factor w/ 5 levels: IQF_environment chr “Mid_Low_Wet” “Mid_Low_Wet” “Mid_Low_Wet” “Mid_Low_Wet” Output variables: avg_Y_ratio num: CA_typology factor:
Notes: when we look at all the database pooled together, there is no relation at all among SOC and performance of the CA. Howeever, where we split the analysis be Environment, then an interation becomes clear, where there is no relation in mid altitude and the relation is quite significant and positive in Highland and sligtly negative in transition. Conclusion. SOC should remain for the drivers analysis
library(ggplot2)
# 1. Fit linear regression model
model <- lm(avg_Y_ratio ~ ICR_Org_C_avg, data = drivers_data)
# 2. Extract p-value from model summary
p_val <- summary(model)$coefficients[2, 4] # p-value for slope
# 3. Create plot
ggplot(drivers_data, aes(x = ICR_Org_C_avg, y = avg_Y_ratio)) +
geom_point(alpha = 0.6, color = "steelblue") +
geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
labs(
title = "ICR_Org_C_avg vs. avg_Y_ratio",
x = "Organic Carbon (%)",
y = "Average Yield Ratio"
) +
annotate("text",
x = Inf, y = Inf, hjust = 1.1, vjust = 1.5,
label = paste0("p = ", signif(p_val, 3)),
size = 5, color = "black") +
theme_minimal()
### What if we facet by environment? to see if interactions are masking
the effect
library(ggplot2)
library(dplyr)
library(purrr)
library(tidyr)
#----------------------------------------------
# 1. Fit separate linear models per environment
#----------------------------------------------
stats_df <- drivers_data %>%
filter(!is.na(ICR_Org_C_avg), !is.na(avg_Y_ratio), !is.na(IQF_environment)) %>%
group_by(IQF_environment) %>%
nest() %>%
mutate(
model = map(data, ~ lm(avg_Y_ratio ~ ICR_Org_C_avg, data = .x)),
p_value = map_dbl(model, ~ summary(.x)$coefficients[2, 4]),
p_label = paste0("p = ", signif(p_value, 3))
) %>%
dplyr::select(IQF_environment, p_label)
#----------------------------------------------
# 2. Prepare annotation positions (adjust if needed)
#----------------------------------------------
# Choose fixed position for annotation
label_y <- max(drivers_data$avg_Y_ratio, na.rm = TRUE) * 0.95
label_x <- max(drivers_data$ICR_Org_C_avg, na.rm = TRUE) * 0.95
stats_df <- stats_df %>%
mutate(x = label_x, y = label_y)
#----------------------------------------------
# 3. Plot: points, lm line, facet, p-values
#----------------------------------------------
ggplot(drivers_data, aes(x = ICR_Org_C_avg, y = avg_Y_ratio)) +
geom_point(alpha = 0.6, color = "steelblue") +
geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
geom_text(
data = stats_df,
aes(x = x, y = y, label = p_label),
inherit.aes = FALSE,
size = 5,
hjust = 1,
vjust = 1
) +
facet_wrap(~ IQF_environment) +
labs(
title = "ICR_Org_C_avg vs. avg_Y_ratio",
subtitle = "Linear regressions and p-values shown per environment",
x = "Organic Carbon (%)",
y = "Average Yield Ratio"
) +
theme_minimal()
library(ggplot2)
library(dplyr)
library(multcompView)
# 1. Fit ANOVA model
model_anova <- aov(ICR_Org_C_avg ~ CA_typology, data = drivers_data)
# 2. Tukey HSD post-hoc test
tukey <- TukeyHSD(model_anova)
# 3. Extract group letters
letters_df <- multcompLetters(tukey$CA_typology[,"p adj"])$Letters
letters_df <- data.frame(CA_typology = names(letters_df), Letters = letters_df)
# 4. Calculate group means
means_df <- drivers_data %>%
group_by(CA_typology) %>%
summarise(mean_val = mean(ICR_Org_C_avg, na.rm = TRUE), .groups = "drop") %>%
mutate(mean_label = paste0("mean = ", round(mean_val, 2)))
# 5. Combine letters and means
plot_labels <- left_join(letters_df, means_df, by = "CA_typology")
# 6. Create boxplot with letters and means at fixed Y positions
ggplot(drivers_data, aes(x = CA_typology, y = ICR_Org_C_avg)) +
geom_boxplot(outlier.shape = NA, fill = "lightgray", color = "black") +
geom_jitter(width = 0.2, alpha = 0.6, color = "steelblue", size = 1.5) +
geom_text(data = plot_labels, aes(x = CA_typology, y = 7.5, label = Letters),
fontface = "bold", size = 5, color = "darkred") +
geom_text(data = plot_labels, aes(x = CA_typology, y = 7, label = mean_label),
fontface = "plain", size = 4, color = "black") +
labs(
title = "ICR_Org_C_avg by CA_typology",
subtitle = "Letters indicate significant differences (Tukey HSD)",
x = "CA Typology",
y = "Organic Carbon (%)"
) +
theme_minimal()
### What if we facet by Environment? myve some interactions masking the
effect?
library(ggplot2)
library(dplyr)
library(multcompView)
library(tidyr)
library(purrr)
#---------------------------------------------------------
# 1. Function to run ANOVA + Tukey + extract letters
#---------------------------------------------------------
get_letters <- function(df) {
mod <- aov(ICR_Org_C_avg ~ CA_typology, data = df)
tuk <- TukeyHSD(mod)
lets <- multcompLetters(tuk$CA_typology[, "p adj"])$Letters
data.frame(
CA_typology = names(lets),
Letters = lets,
stringsAsFactors = FALSE
)
}
#---------------------------------------------------------
# 2. Run statistics PER ENVIRONMENT
#---------------------------------------------------------
stats_df <- drivers_data %>%
filter(!is.na(CA_typology)) %>%
group_by(IQF_environment) %>%
nest() %>%
mutate(
letters = map(data, get_letters),
means = map(data, ~ .x %>%
group_by(CA_typology) %>%
summarise(mean_val = mean(ICR_Org_C_avg, na.rm = TRUE),
.groups = "drop"))
) %>%
unnest(letters) %>%
left_join(
unnest(dplyr::select(., IQF_environment, means)),
by = c("IQF_environment", "CA_typology")
) %>%
mutate(mean_label = paste0("mean = ", round(mean_val, 2)))
#---------------------------------------------------------
# 3. Plot (letters + means fixed at Y)
#---------------------------------------------------------
ggplot(drivers_data, aes(x = CA_typology, y = ICR_Org_C_avg)) +
geom_boxplot(outlier.shape = NA, fill = "lightgray") +
geom_jitter(width = 0.2, alpha = 0.6, color = "steelblue", size = 1.4) +
# Letters
geom_text(
data = stats_df,
aes(x = CA_typology, y = 9.5, label = Letters),
inherit.aes = FALSE,
fontface = "bold",
size = 3,
color = "darkred"
) +
# Means
geom_text(
data = stats_df,
aes(x = CA_typology, y = 8.25, label = mean_label),
inherit.aes = FALSE,
size = 3
) +
facet_wrap(~ IQF_environment) +
labs(
title = "ICR_Org_C_avg by CA_typology (per environment)",
subtitle = "Letters = Tukey HSD within each environment",
x = "CA Typology",
y = "Organic Carbon (%)"
) +
theme_minimal()
## CA Performance as affected by ICR_K_perc_23A ### with avg_Y_ratio
Notes: there is some minimal relation in highland. When we look at the
correlation among SOC and K, its relatively high in highland. So we
could just use SOC in the drivers analysis and exclude K Conclusion:
Keep SCO but explcude K
library(ggplot2)
library(dplyr)
library(purrr)
library(tidyr)
#----------------------------------------------
# 1. Fit separate linear models per environment
#----------------------------------------------
stats_df <- drivers_data %>%
filter(!is.na(ICR_K_perc_23A), !is.na(avg_Y_ratio), !is.na(IQF_environment)) %>%
group_by(IQF_environment) %>%
nest() %>%
mutate(
model = map(data, ~ lm(avg_Y_ratio ~ ICR_K_perc_23A, data = .x)),
p_value = map_dbl(model, ~ summary(.x)$coefficients[2, 4]),
p_label = paste0("p = ", signif(p_value, 3))
) %>%
dplyr::select(IQF_environment, p_label)
#----------------------------------------------
# 2. Prepare annotation positions
#----------------------------------------------
label_y <- max(drivers_data$avg_Y_ratio, na.rm = TRUE) * 0.95
label_x <- max(drivers_data$ICR_K_perc_23A, na.rm = TRUE) * 0.95
stats_df <- stats_df %>%
mutate(x = label_x, y = label_y)
#----------------------------------------------
# 3. Plot: points, regression line, facet, p-values
#----------------------------------------------
ggplot(drivers_data, aes(x = ICR_K_perc_23A, y = avg_Y_ratio)) +
geom_point(alpha = 0.6, color = "steelblue") +
geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
geom_text(
data = stats_df,
aes(x = x, y = y, label = p_label),
inherit.aes = FALSE,
size = 5,
hjust = 1,
vjust = 1
) +
facet_wrap(~ IQF_environment) +
labs(
title = "ICR_K_perc_23A vs. avg_Y_ratio",
subtitle = "Linear regressions and p-values shown per environment",
x = "Potassium (%) in 23A",
y = "Average Yield Ratio"
) +
theme_minimal()
### with CA_typology
library(ggplot2)
library(dplyr)
library(multcompView)
library(tidyr)
library(purrr)
#---------------------------------------------------------
# 1. Function to run ANOVA + Tukey + extract letters
#---------------------------------------------------------
get_letters <- function(df) {
mod <- aov(ICR_K_perc_23A ~ CA_typology, data = df)
tuk <- TukeyHSD(mod)
lets <- multcompLetters(tuk$CA_typology[, "p adj"])$Letters
data.frame(
CA_typology = names(lets),
Letters = lets,
stringsAsFactors = FALSE
)
}
#---------------------------------------------------------
# 2. Run statistics PER ENVIRONMENT
#---------------------------------------------------------
stats_df <- drivers_data %>%
filter(!is.na(CA_typology), !is.na(ICR_K_perc_23A)) %>%
group_by(IQF_environment) %>%
nest() %>%
mutate(
letters = map(data, get_letters),
means = map(data, ~ .x %>%
group_by(CA_typology) %>%
summarise(mean_val = mean(ICR_K_perc_23A, na.rm = TRUE),
.groups = "drop"))
) %>%
unnest(letters) %>%
left_join(
unnest(dplyr::select(., IQF_environment, means)),
by = c("IQF_environment", "CA_typology")
) %>%
mutate(mean_label = paste0("mean = ", round(mean_val, 2)))
#---------------------------------------------------------
# 3. Plot (letters + means fixed at Y)
#---------------------------------------------------------
ggplot(drivers_data, aes(x = CA_typology, y = ICR_K_perc_23A)) +
geom_boxplot(outlier.shape = NA, fill = "lightgray") +
geom_jitter(width = 0.2, alpha = 0.6, color = "steelblue", size = 1.4) +
# Letters
geom_text(
data = stats_df,
aes(x = CA_typology, y = 3.2, label = Letters),
inherit.aes = FALSE,
fontface = "bold",
size = 3,
color = "darkred"
) +
# Means
geom_text(
data = stats_df,
aes(x = CA_typology, y = 2.8, label = mean_label),
inherit.aes = FALSE,
size = 3
) +
facet_wrap(~ IQF_environment) +
labs(
title = "ICR_K_perc_23A by CA_typology (per environment)",
subtitle = "Letters = Tukey HSD within each environment",
x = "CA Typology",
y = "Potassium (%) in 23A"
) +
theme_minimal()
## Since SOC semms to have some explanatory power and K very little,
lets see the correlation among them Notes: there is quite a correlation
among K and SOC in highland
library(ggplot2)
library(dplyr)
library(purrr)
library(tidyr)
#---------------------------------------------------------
# 1. Fit linear models per environment
#---------------------------------------------------------
correlation_stats <- drivers_data %>%
filter(!is.na(ICR_Org_C_avg), !is.na(ICR_K_perc_23A), !is.na(IQF_environment)) %>%
group_by(IQF_environment) %>%
nest() %>%
mutate(
model = map(data, ~ lm(ICR_K_perc_23A ~ ICR_Org_C_avg, data = .x)),
p_value = map_dbl(model, ~ summary(.x)$coefficients[2, 4]),
r_squared = map_dbl(model, ~ summary(.x)$r.squared),
stat_label = paste0("p = ", signif(p_value, 3),
"\nR", "\u00B2", " = ", round(r_squared, 2))
)
#---------------------------------------------------------
# 2. Set annotation position
#---------------------------------------------------------
label_y <- max(drivers_data$ICR_K_perc_23A, na.rm = TRUE) * 0.95
label_x <- max(drivers_data$ICR_Org_C_avg, na.rm = TRUE) * 0.95
correlation_stats <- correlation_stats %>%
mutate(x = label_x, y = label_y)
#---------------------------------------------------------
# 3. Plot: scatter + regression + p + R² + facet
#---------------------------------------------------------
ggplot(drivers_data, aes(x = ICR_Org_C_avg, y = ICR_K_perc_23A)) +
geom_point(alpha = 0.6, color = "steelblue", size = 1.5) +
geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
geom_text(
data = correlation_stats,
aes(x = x, y = y, label = stat_label),
inherit.aes = FALSE,
hjust = 1, vjust = 1,
size = 4
) +
facet_wrap(~ IQF_environment) +
labs(
title = "Correlation between Organic Carbon and Potassium (23A)",
subtitle = "Linear regression per environment with p-values and R\u00B2",
x = "ICR_Org_C_avg (Organic Carbon %)",
y = "ICR_K_perc_23A (Potassium %)"
) +
theme_minimal()
### What if instead of contineous, we define deffitient or non
defficiennt
library(dplyr)
library(ggplot2)
library(segmented)
# ===============================
# 1) Average env_index per block across seasons
# ===============================
env_block_avg <- CONV_CA_Ratio %>%
filter(!is.na(IQR_block), !is.na(env_index)) %>%
group_by(IQR_block) %>%
summarise(
env_index_avg = mean(env_index, na.rm = TRUE),
.groups = "drop"
)
# ===============================
# 2) Ensure drivers_data is block-level for K
# ===============================
k_block <- drivers_data %>%
filter(!is.na(IQR_block), !is.na(ICR_K_perc_23A)) %>%
group_by(IQR_block) %>%
summarise(
ICR_K_perc_23A = mean(ICR_K_perc_23A, na.rm = TRUE),
.groups = "drop"
)
# ===============================
# 3) Join datasets
# ===============================
plot_data <- env_block_avg %>%
left_join(k_block, by = "IQR_block") %>%
filter(!is.na(ICR_K_perc_23A))
cat("Blocks used:", nrow(plot_data), "\n")
## Blocks used: 459
# ===============================
# 4) Fit linear model
# ===============================
lm_model <- lm(env_index_avg ~ ICR_K_perc_23A,
data = plot_data)
# ===============================
# 5) Fit segmented (breakpoint) model
# ===============================
seg_model <- segmented(
lm_model,
seg.Z = ~ ICR_K_perc_23A,
psi = list(ICR_K_perc_23A = median(plot_data$ICR_K_perc_23A, na.rm = TRUE))
)
summary(seg_model)
##
## ***Regression Model with Segmented Relationship(s)***
##
## Call:
## segmented.lm(obj = lm_model, seg.Z = ~ICR_K_perc_23A, psi = list(ICR_K_perc_23A = median(plot_data$ICR_K_perc_23A,
## na.rm = TRUE)))
##
## Estimated Break-Point(s):
## Est. St.Err
## psi1.ICR_K_perc_23A 0.3 0.045
##
## Coefficients of the linear terms:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.71785 0.06459 11.115 < 2e-16 ***
## ICR_K_perc_23A 1.06639 0.31448 3.391 0.000757 ***
## U1.ICR_K_perc_23A -1.00761 0.31623 -3.186 NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2824 on 455 degrees of freedom
## Multiple R-Squared: 0.08489, Adjusted R-squared: 0.07885
##
## Boot restarting based on 6 samples. Last fit:
## Convergence attained in 1 iterations (rel. change 1.5678e-07)
# Extract breakpoint
breakpoint <- seg_model$psi[2]
cat("Estimated Breakpoint:", round(breakpoint, 3), "\n")
## Estimated Breakpoint: 0.3
# Add predictions
plot_data$predicted <- predict(seg_model)
# ===============================
# 6) Plot
# ===============================
ggplot(plot_data,
aes(x = ICR_K_perc_23A,
y = env_index_avg)) +
geom_point(alpha = 0.7,
color = "steelblue",
size = 2) +
geom_line(aes(y = predicted),
color = "red",
size = 1.2) +
geom_vline(xintercept = breakpoint,
linetype = "dashed",
color = "darkgreen",
size = 1) +
labs(
title = "Breakpoint Model: Environmental Index vs Potassium (%)",
subtitle = paste0("Estimated breakpoint ≈ ",
round(breakpoint, 3)),
x = "ICR_K_perc_23A",
y = "Mean env_index across seasons"
) +
theme_minimal(base_size = 14)
Based on litterature: VLow: <0.2 meq/100g Low: >=0.2 and < 0.5
meq/100g. Opt: =>0.5meq/100g. https://link.springer.com/article/10.1186/s40066-018-0165-5#:~:text=Exchangeable%20K,-The%20soil%20exchangeable&text=The%20range%20of%20values%20recorded,their%20soil%20exchangeable%20K%2C%20respectively.
Then adjusted to our findings: VLow: <0.3 meq/100g Low: >=0.23 and < 0.56 meq/100g. Opt: =>0.6meq/100g.
library(dplyr)
drivers_data <- drivers_data %>%
mutate(
K_factor = case_when(
ICR_K_perc_23A < 0.3 ~ "VLow",
ICR_K_perc_23A >= 0.3 & ICR_K_perc_23A < 0.6 ~ "Low",
ICR_K_perc_23A >= 0.6 ~ "Opt",
TRUE ~ NA_character_
),
K_factor = factor(K_factor,
levels = c("VLow", "Low", "Opt"),
ordered = TRUE)
)
library(ggplot2)
library(dplyr)
# 1️⃣ Filter valid data
df_filt <- drivers_data %>%
filter(
!is.na(avg_Y_ratio),
!is.na(K_factor)
) %>%
droplevels()
# 2️⃣ Run ANOVA
anova_model <- aov(avg_Y_ratio ~ K_factor, data = df_filt)
anova_p <- summary(anova_model)[[1]][["Pr(>F)"]][1]
# 3️⃣ Run Kruskal-Wallis test
kruskal_p <- kruskal.test(avg_Y_ratio ~ K_factor, data = df_filt)$p.value
# 4️⃣ Build annotation label
test_label <- paste0(
"ANOVA p = ", signif(anova_p, 3),
"\nKruskal p = ", signif(kruskal_p, 3)
)
# 5️⃣ Calculate group means
mean_df <- df_filt %>%
group_by(K_factor) %>%
summarise(
mean_val = mean(avg_Y_ratio, na.rm = TRUE),
.groups = "drop"
) %>%
mutate(mean_label = paste0("mean = ", round(mean_val, 2)))
# 6️⃣ Plot
ggplot(df_filt, aes(x = K_factor, y = avg_Y_ratio)) +
geom_boxplot(outlier.shape = NA, fill = "lightgray") +
geom_jitter(width = 0.2, alpha = 0.6, color = "steelblue") +
# Means under each box
geom_text(
data = mean_df,
aes(
x = K_factor,
y = min(df_filt$avg_Y_ratio, na.rm = TRUE) * 0.95,
label = mean_label
),
inherit.aes = FALSE,
size = 4
) +
# P-values annotation
annotate(
"text",
x = Inf, y = Inf,
hjust = 1.1, vjust = 1.5,
label = test_label,
size = 5
) +
labs(
title = "avg_Y_ratio by K_factor",
subtitle = "ANOVA and Kruskal–Wallis test results shown",
x = "Potassium Level (K_factor)",
y = "Average Yield Ratio"
) +
theme_minimal()
NOtes: the values sees high in general and not relation with the yield of the plots (environmental index calculated as the ) Conclusions: poor
library(ggplot2)
library(dplyr)
library(purrr)
library(tidyr)
#----------------------------------------------
# 1. Fit separate linear models per environment
#----------------------------------------------
stats_df <- drivers_data %>%
filter(!is.na(ICR_Avail_P_avg), !is.na(avg_Y_ratio), !is.na(IQF_environment)) %>%
group_by(IQF_environment) %>%
nest() %>%
mutate(
model = map(data, ~ lm(avg_Y_ratio ~ ICR_Avail_P_avg, data = .x)),
p_value = map_dbl(model, ~ summary(.x)$coefficients[2, 4]),
p_label = paste0("p = ", signif(p_value, 3))
) %>%
dplyr::select(IQF_environment, p_label)
#----------------------------------------------
# 2. Prepare annotation positions
#----------------------------------------------
label_y <- max(drivers_data$avg_Y_ratio, na.rm = TRUE) * 0.95
label_x <- max(drivers_data$ICR_Avail_P_avg, na.rm = TRUE) * 0.95
stats_df <- stats_df %>%
mutate(x = label_x, y = label_y)
#----------------------------------------------
# 3. Plot: points, regression line, facet, p-values
#----------------------------------------------
ggplot(drivers_data, aes(x = ICR_Avail_P_avg, y = avg_Y_ratio)) +
geom_point(alpha = 0.6, color = "steelblue") +
geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
geom_text(
data = stats_df,
aes(x = x, y = y, label = p_label),
inherit.aes = FALSE,
size = 5,
hjust = 1,
vjust = 1
) +
facet_wrap(~ IQF_environment) +
labs(
title = "ICR_Avail_P_avg vs. avg_Y_ratio",
subtitle = "Linear regressions and p-values shown per environment",
x = "Available Phosphorus (ppm)",
y = "Average Yield Ratio"
) +
theme_minimal()
###ICR_Avail_P_avg by CA_typology
library(ggplot2)
library(dplyr)
library(multcompView)
library(tidyr)
library(purrr)
#---------------------------------------------------------
# 1. Function to run ANOVA + Tukey + extract letters
#---------------------------------------------------------
get_letters <- function(df) {
mod <- aov(ICR_Avail_P_avg ~ CA_typology, data = df)
tuk <- TukeyHSD(mod)
lets <- multcompLetters(tuk$CA_typology[, "p adj"])$Letters
data.frame(
CA_typology = names(lets),
Letters = lets,
stringsAsFactors = FALSE
)
}
#---------------------------------------------------------
# 2. Run statistics PER ENVIRONMENT
#---------------------------------------------------------
stats_df <- drivers_data %>%
filter(!is.na(CA_typology), !is.na(ICR_Avail_P_avg)) %>%
group_by(IQF_environment) %>%
nest() %>%
mutate(
letters = map(data, get_letters),
means = map(data, ~ .x %>%
group_by(CA_typology) %>%
summarise(mean_val = mean(ICR_Avail_P_avg, na.rm = TRUE),
.groups = "drop"))
) %>%
unnest(letters) %>%
left_join(
unnest(dplyr::select(., IQF_environment, means)),
by = c("IQF_environment", "CA_typology")
) %>%
mutate(mean_label = paste0("mean = ", round(mean_val, 2)))
#---------------------------------------------------------
# 3. Plot (letters + means fixed at Y)
#---------------------------------------------------------
ggplot(drivers_data, aes(x = CA_typology, y = ICR_Avail_P_avg)) +
geom_boxplot(outlier.shape = NA, fill = "lightgray") +
geom_jitter(width = 0.2, alpha = 0.6, color = "steelblue", size = 1.4) +
# Letters
geom_text(
data = stats_df,
aes(x = CA_typology, y = 110, label = Letters),
inherit.aes = FALSE,
fontface = "bold",
size = 3,
color = "darkred"
) +
# Means
geom_text(
data = stats_df,
aes(x = CA_typology, y = 100, label = mean_label),
inherit.aes = FALSE,
size = 3
) +
facet_wrap(~ IQF_environment) +
labs(
title = "ICR_Avail_P_avg by CA_typology (per environment)",
subtitle = "Letters = Tukey HSD within each environment",
x = "CA Typology",
y = "Available Phosphorus (ppm)"
) +
theme_minimal()
###Correlation: ICR_Org_C_avg vs. ICR_Avail_P_avg
First we check the values. I suspect they are toon high
library(dplyr)
library(ggplot2)
drivers_data_clean <- drivers_data %>%
filter(!is.na(ICR_Avail_P_avg))
ggplot(drivers_data_clean,
aes(x = ICR_Avail_P_avg)) +
stat_ecdf(geom = "step", size = 1.2, color = "blue") +
scale_x_continuous(
breaks = seq(
floor(min(drivers_data_clean$ICR_Avail_P_avg, na.rm = TRUE)),
ceiling(max(drivers_data_clean$ICR_Avail_P_avg, na.rm = TRUE)),
by = 15
)
) +
labs(
title = "Cumulative Distribution of Available P",
x = "ICR_Avail_P_avg",
y = "Cumulative Proportion"
) +
theme_minimal()
they still seem high, lets see the relation with the evironmetal index
calculated before. This is an index of the yield of all the seasons in
the block as compared to plots in the same seasson-district
library(dplyr)
library(ggplot2)
library(segmented)
# ===============================
# 1) Average env_index per block across seasons
# ===============================
env_block_avg <- CONV_CA_Ratio %>%
filter(!is.na(IQR_block), !is.na(env_index)) %>%
group_by(IQR_block) %>%
summarise(
env_index_avg = mean(env_index, na.rm = TRUE),
.groups = "drop"
)
# ===============================
# 2) Ensure drivers_data is block-level for P
# ===============================
p_block <- drivers_data %>%
filter(!is.na(IQR_block), !is.na(ICR_Avail_P_avg)) %>%
group_by(IQR_block) %>%
summarise(
ICR_Avail_P_avg = mean(ICR_Avail_P_avg, na.rm = TRUE),
.groups = "drop"
)
# ===============================
# 3) Join and restrict to P ≤ 70
# ===============================
plot_data <- env_block_avg %>%
left_join(p_block, by = "IQR_block") %>%
filter(!is.na(ICR_Avail_P_avg),
ICR_Avail_P_avg <= 70)
cat("Blocks used:", nrow(plot_data), "\n")
## Blocks used: 442
# ===============================
# 4) Fit linear model
# ===============================
lm_model <- lm(env_index_avg ~ ICR_Avail_P_avg,
data = plot_data)
# ===============================
# 5) Fit segmented (breakpoint) model
# ===============================
seg_model <- segmented(
lm_model,
seg.Z = ~ ICR_Avail_P_avg,
psi = list(ICR_Avail_P_avg = 30) # initial guess
)
summary(seg_model)
##
## ***Regression Model with Segmented Relationship(s)***
##
## Call:
## segmented.lm(obj = lm_model, seg.Z = ~ICR_Avail_P_avg, psi = list(ICR_Avail_P_avg = 30))
##
## Estimated Break-Point(s):
## Est. St.Err
## psi1.ICR_Avail_P_avg 4.829 0.999
##
## Coefficients of the linear terms:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.48109 0.32650 1.473 0.141
## ICR_Avail_P_avg 0.10610 0.08338 1.272 0.204
## U1.ICR_Avail_P_avg -0.10491 0.08339 -1.258 NA
##
## Residual standard error: 0.2971 on 438 degrees of freedom
## Multiple R-Squared: 0.01508, Adjusted R-squared: 0.008334
##
## Boot restarting based on 8 samples. Last fit:
## Convergence attained in 1 iterations (rel. change 0)
# Extract breakpoint
breakpoint <- seg_model$psi[2]
cat("Estimated Breakpoint:", round(breakpoint, 2), "\n")
## Estimated Breakpoint: 4.83
# Add predictions
plot_data$predicted <- predict(seg_model)
# ===============================
# 6) Plot
# ===============================
ggplot(plot_data,
aes(x = ICR_Avail_P_avg,
y = env_index_avg)) +
geom_point(alpha = 0.7,
color = "steelblue",
size = 2) +
geom_line(aes(y = predicted),
color = "red",
size = 1.2) +
geom_vline(xintercept = breakpoint,
linetype = "dashed",
color = "darkgreen",
size = 1) +
scale_x_continuous(breaks = seq(0, 70, by = 10)) +
labs(
title = "Breakpoint Model: Environmental Index vs Available P",
subtitle = paste0("Estimated breakpoint ≈ ",
round(breakpoint, 2)),
x = "ICR_Avail_P_avg",
y = "Mean env_index across seasons"
) +
theme_minimal(base_size = 14)
### Add the fariable P_factor to drivers_data
library(dplyr)
drivers_data <- drivers_data %>%
mutate(
P_factor = case_when(
ICR_Avail_P_avg < 5 ~ "low",
ICR_Avail_P_avg >= 5 & ICR_Avail_P_avg < 15 ~ "mid",
ICR_Avail_P_avg >= 15 ~ "high",
TRUE ~ NA_character_
),
P_factor = factor(P_factor,
levels = c("low", "mid", "high"))
)
library(ggplot2)
library(dplyr)
library(purrr)
library(tidyr)
#---------------------------------------------------------
# 1. Fit linear models per environment
#---------------------------------------------------------
correlation_stats <- drivers_data %>%
filter(!is.na(ICR_Org_C_avg), !is.na(ICR_Avail_P_avg), !is.na(IQF_environment)) %>%
group_by(IQF_environment) %>%
nest() %>%
mutate(
model = map(data, ~ lm(ICR_Avail_P_avg ~ ICR_Org_C_avg, data = .x)),
p_value = map_dbl(model, ~ summary(.x)$coefficients[2, 4]),
r_squared = map_dbl(model, ~ summary(.x)$r.squared),
stat_label = paste0("p = ", signif(p_value, 3),
"\nR", "\u00B2", " = ", round(r_squared, 2))
)
#---------------------------------------------------------
# 2. Set annotation position
#---------------------------------------------------------
label_y <- max(drivers_data$ICR_Avail_P_avg, na.rm = TRUE) * 0.95
label_x <- max(drivers_data$ICR_Org_C_avg, na.rm = TRUE) * 0.95
correlation_stats <- correlation_stats %>%
mutate(x = label_x, y = label_y)
#---------------------------------------------------------
# 3. Plot: scatter + regression + p + R² + facet
#---------------------------------------------------------
ggplot(drivers_data, aes(x = ICR_Org_C_avg, y = ICR_Avail_P_avg)) +
geom_point(alpha = 0.6, color = "steelblue", size = 1.5) +
geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
geom_text(
data = correlation_stats,
aes(x = x, y = y, label = stat_label),
inherit.aes = FALSE,
hjust = 1, vjust = 1,
size = 4
) +
facet_wrap(~ IQF_environment) +
labs(
title = "Correlation between Organic Carbon and Available Phosphorus",
subtitle = "Linear regression per environment with p-values and R\u00B2",
x = "ICR_Org_C_avg (Organic Carbon %)",
y = "ICR_Avail_P_avg (Phosphorus ppm)"
) +
theme_minimal()
### Now we do cathegoris rathen than num Very Low/Critical Range: =<5
ppm Deficient (Critical Level): 5- 10 ppm Low to Moderate Range: 10-15
ppm Optimal/Sufficient Range: > 15 ppm
NOtes: no correlation when pooled together but a trant in 2 out of 4 environments. Moreover it is a hosehold variable that is ususally used to explain perfomrance of practices Conclusions: Keep it for drivers analysis
library(ggplot2)
# 1. Fit linear regression model
model <- lm(avg_Y_ratio ~ ICR_arable_land_avg, data = drivers_data)
# 2. Extract p-value from model summary
p_val <- summary(model)$coefficients[2, 4] # p-value for slope
# 3. Create plot
ggplot(drivers_data, aes(x = ICR_arable_land_avg, y = avg_Y_ratio)) +
geom_point(alpha = 0.6, color = "steelblue") +
geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
labs(
title = "ICR_arable_land_avg vs. avg_Y_ratio",
x = "Arable Land (%)",
y = "Average Yield Ratio"
) +
annotate("text",
x = Inf, y = Inf, hjust = 1.1, vjust = 1.5,
label = paste0("p = ", signif(p_val, 3)),
size = 5, color = "black") +
theme_minimal()
### In facet by environment
library(ggplot2)
library(dplyr)
library(purrr)
library(tidyr)
#----------------------------------------------
# 1. Fit separate linear models per environment
#----------------------------------------------
stats_df <- drivers_data %>%
filter(!is.na(ICR_arable_land_avg), !is.na(avg_Y_ratio), !is.na(IQF_environment)) %>%
group_by(IQF_environment) %>%
nest() %>%
mutate(
model = map(data, ~ lm(avg_Y_ratio ~ ICR_arable_land_avg, data = .x)),
p_value = map_dbl(model, ~ summary(.x)$coefficients[2, 4]),
p_label = paste0("p = ", signif(p_value, 3))
) %>%
dplyr::select(IQF_environment, p_label)
#----------------------------------------------
# 2. Prepare annotation positions
#----------------------------------------------
label_y <- max(drivers_data$avg_Y_ratio, na.rm = TRUE) * 0.95
label_x <- max(drivers_data$ICR_arable_land_avg, na.rm = TRUE) * 0.95
stats_df <- stats_df %>%
mutate(x = label_x, y = label_y)
#----------------------------------------------
# 3. Plot
#----------------------------------------------
ggplot(drivers_data, aes(x = ICR_arable_land_avg, y = avg_Y_ratio)) +
geom_point(alpha = 0.6, color = "steelblue") +
geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
geom_text(
data = stats_df,
aes(x = x, y = y, label = p_label),
inherit.aes = FALSE,
size = 5,
hjust = 1,
vjust = 1
) +
facet_wrap(~ IQF_environment) +
labs(
title = "ICR_arable_land_avg vs. avg_Y_ratio",
subtitle = "Linear regressions and p-values shown per environment",
x = "Arable land (%)",
y = "Average Yield Ratio"
) +
theme_minimal()
###ICR_arable_land_avg by CA_typology
library(ggplot2)
library(dplyr)
library(multcompView)
library(tidyr)
library(purrr)
get_letters <- function(df) {
mod <- aov(ICR_arable_land_avg ~ CA_typology, data = df)
tuk <- TukeyHSD(mod)
lets <- multcompLetters(tuk$CA_typology[, "p adj"])$Letters
data.frame(
CA_typology = names(lets),
Letters = lets,
stringsAsFactors = FALSE
)
}
stats_df <- drivers_data %>%
filter(!is.na(CA_typology), !is.na(ICR_arable_land_avg)) %>%
group_by(IQF_environment) %>%
nest() %>%
mutate(
letters = map(data, get_letters),
means = map(data, ~ .x %>%
group_by(CA_typology) %>%
summarise(mean_val = mean(ICR_arable_land_avg, na.rm = TRUE),
.groups = "drop"))
) %>%
unnest(letters) %>%
left_join(
unnest(dplyr::select(., IQF_environment, means)),
by = c("IQF_environment", "CA_typology")
) %>%
mutate(mean_label = paste0("mean = ", round(mean_val, 1)))
ggplot(drivers_data, aes(x = CA_typology, y = ICR_arable_land_avg)) +
geom_boxplot(outlier.shape = NA, fill = "lightgray") +
geom_jitter(width = 0.2, alpha = 0.6, color = "steelblue", size = 1.4) +
geom_text(
data = stats_df,
aes(x = CA_typology, y = 90, label = Letters),
inherit.aes = FALSE,
fontface = "bold",
size = 3,
color = "darkred"
) +
geom_text(
data = stats_df,
aes(x = CA_typology, y = 82, label = mean_label),
inherit.aes = FALSE,
size = 3
) +
facet_wrap(~ IQF_environment) +
labs(
title = "ICR_arable_land_avg by CA_typology (per environment)",
subtitle = "Letters = Tukey HSD within each environment",
x = "CA Typology",
y = "Arable land (%)"
) +
theme_minimal()
###Correlation: ICR_Org_C_avg vs ICR_arable_land_avg
library(ggplot2)
library(dplyr)
library(purrr)
library(tidyr)
correlation_stats <- drivers_data %>%
filter(!is.na(ICR_Org_C_avg), !is.na(ICR_arable_land_avg), !is.na(IQF_environment)) %>%
group_by(IQF_environment) %>%
nest() %>%
mutate(
model = map(data, ~ lm(ICR_arable_land_avg ~ ICR_Org_C_avg, data = .x)),
p_value = map_dbl(model, ~ summary(.x)$coefficients[2, 4]),
r_squared = map_dbl(model, ~ summary(.x)$r.squared),
stat_label = paste0("p = ", signif(p_value, 3),
"\nR", "\u00B2", " = ", round(r_squared, 2))
)
label_y <- max(drivers_data$ICR_arable_land_avg, na.rm = TRUE) * 0.95
label_x <- max(drivers_data$ICR_Org_C_avg, na.rm = TRUE) * 0.95
correlation_stats <- correlation_stats %>%
mutate(x = label_x, y = label_y)
ggplot(drivers_data, aes(x = ICR_Org_C_avg, y = ICR_arable_land_avg)) +
geom_point(alpha = 0.6, color = "steelblue", size = 1.5) +
geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
geom_text(
data = correlation_stats,
aes(x = x, y = y, label = stat_label),
inherit.aes = FALSE,
hjust = 1, vjust = 1,
size = 4
) +
facet_wrap(~ IQF_environment) +
labs(
title = "Correlation between Organic Carbon and Arable Land",
subtitle = "Linear regression per environment with p-values and R\u00B2",
x = "ICR_Org_C_avg (Organic Carbon %)",
y = "ICR_arable_land_avg (Arable land %)"
) +
theme_minimal()
## CA Performance as affected by ICF_Alt_avg
NOtes: there seems to be no correlation when pooled together and even less when split by environment. Environment clasiffication is mainly based on altitude so if we are using em=nvironments as driver, altitude can be eliminated Conclusions: Do not use further in drivers analysis
###All together
library(ggplot2)
# 1. Fit linear regression model
model <- lm(avg_Y_ratio ~ ICF_Alt_avg, data = drivers_data)
# 2. Extract p-value from model summary
p_val <- summary(model)$coefficients[2, 4] # p-value for slope
# 3. Create plot
ggplot(drivers_data, aes(x = ICF_Alt_avg, y = avg_Y_ratio)) +
geom_point(alpha = 0.6, color = "steelblue") +
geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
labs(
title = "ICF_Alt_avg vs. avg_Y_ratio",
x = "Altitude (%)",
y = "Average Yield Ratio"
) +
annotate("text",
x = Inf, y = Inf, hjust = 1.1, vjust = 1.5,
label = paste0("p = ", signif(p_val, 3)),
size = 5, color = "black") +
theme_minimal()
library(ggplot2)
library(dplyr)
library(purrr)
library(tidyr)
#----------------------------------------------
# 1. Fit separate linear models per environment
#----------------------------------------------
stats_df <- drivers_data %>%
filter(!is.na(ICF_Alt_avg), !is.na(avg_Y_ratio), !is.na(IQF_environment)) %>%
group_by(IQF_environment) %>%
nest() %>%
mutate(
model = map(data, ~ lm(avg_Y_ratio ~ ICF_Alt_avg, data = .x)),
p_value = map_dbl(model, ~ summary(.x)$coefficients[2, 4]),
p_label = paste0("p = ", signif(p_value, 3))
) %>%
dplyr::select(IQF_environment, p_label)
#----------------------------------------------
# 2. Annotation positions
#----------------------------------------------
label_y <- max(drivers_data$avg_Y_ratio, na.rm = TRUE) * 0.95
label_x <- max(drivers_data$ICF_Alt_avg, na.rm = TRUE) * 0.95
stats_df <- stats_df %>%
mutate(x = label_x, y = label_y)
#----------------------------------------------
# 3. Plot
#----------------------------------------------
ggplot(drivers_data, aes(x = ICF_Alt_avg, y = avg_Y_ratio)) +
geom_point(alpha = 0.6, color = "steelblue") +
geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
geom_text(
data = stats_df,
aes(x = x, y = y, label = p_label),
inherit.aes = FALSE,
size = 5,
hjust = 1,
vjust = 1
) +
facet_wrap(~ IQF_environment) +
labs(
title = "ICF_Alt_avg vs. avg_Y_ratio",
subtitle = "Linear regressions and p-values per environment",
x = "Altitude (m)",
y = "Average Yield Ratio"
) +
theme_minimal()
###ICF_Alt_avg by CA_typology
library(ggplot2)
library(dplyr)
library(multcompView)
library(tidyr)
library(purrr)
# Function for letters
get_letters <- function(df) {
mod <- aov(ICF_Alt_avg ~ CA_typology, data = df)
tuk <- TukeyHSD(mod)
lets <- multcompLetters(tuk$CA_typology[, "p adj"])$Letters
data.frame(
CA_typology = names(lets),
Letters = lets,
stringsAsFactors = FALSE
)
}
# Run stats by environment
stats_df <- drivers_data %>%
filter(!is.na(CA_typology), !is.na(ICF_Alt_avg)) %>%
group_by(IQF_environment) %>%
nest() %>%
mutate(
letters = map(data, get_letters),
means = map(data, ~ .x %>%
group_by(CA_typology) %>%
summarise(mean_val = mean(ICF_Alt_avg, na.rm = TRUE),
.groups = "drop"))
) %>%
unnest(letters) %>%
left_join(
unnest(dplyr::select(., IQF_environment, means)),
by = c("IQF_environment", "CA_typology")
) %>%
mutate(mean_label = paste0("mean = ", round(mean_val, 0)))
# Plot
ggplot(drivers_data, aes(x = CA_typology, y = ICF_Alt_avg)) +
geom_boxplot(outlier.shape = NA, fill = "lightgray") +
geom_jitter(width = 0.2, alpha = 0.6, color = "steelblue", size = 1.4) +
geom_text(
data = stats_df,
aes(x = CA_typology, y = 2000, label = Letters),
inherit.aes = FALSE,
fontface = "bold",
size = 3,
color = "darkred"
) +
geom_text(
data = stats_df,
aes(x = CA_typology, y = 1800, label = mean_label),
inherit.aes = FALSE,
size = 3
) +
facet_wrap(~ IQF_environment) +
labs(
title = "ICF_Alt_avg by CA_typology (per environment)",
subtitle = "Tukey HSD: significant differences by letter",
x = "CA Typology",
y = "Altitude (m)"
) +
theme_minimal()
NOtes: Rain seems to partially explain CA performance, with a cuadratice-model response when pooled all environmetns together, with 4 mm per day as optimumn. However, when split by environments, linear models can fit as good Conclusions: Keep, consider cuadratic relation and not linear if not grouped within environments ###Simple regression: ICR_rainfall_avg vs avg_Y_ratio
library(ggplot2)
# 1. Fit linear regression model
model <- lm(avg_Y_ratio ~ ICR_rainfall_avg, data = drivers_data)
# 2. Extract p-value and R-squared from model summary
p_val <- summary(model)$coefficients[2, 4]
r2 <- summary(model)$r.squared
# 3. Create plot
ggplot(drivers_data, aes(x = ICR_rainfall_avg, y = avg_Y_ratio)) +
geom_point(alpha = 0.6, color = "steelblue") +
geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
labs(
title = "ICR_rainfall_avg vs. avg_Y_ratio",
x = "Rainfall (avg, mm)",
y = "Average Yield Ratio"
) +
annotate("text",
x = Inf, y = Inf, hjust = 1.1, vjust = 1.5,
label = paste0("p = ", signif(p_val, 3),
"\nR^2 = ", round(r2, 2)),
size = 5, color = "black") +
theme_minimal()
### What wbout tieh a cuadratic model?
library(ggplot2)
# 1. Fit quadratic model
model <- lm(avg_Y_ratio ~ ICR_rainfall_avg + I(ICR_rainfall_avg^2),
data = drivers_data)
# 2. Extract p-value and R²
p_val <- summary(model)$coefficients["I(ICR_rainfall_avg^2)", 4]
r2 <- summary(model)$r.squared
# 3. Create plot with p and R²
ggplot(drivers_data, aes(x = ICR_rainfall_avg, y = avg_Y_ratio)) +
geom_point(alpha = 0.6, color = "steelblue") +
geom_smooth(
method = "lm",
formula = y ~ x + I(x^2),
se = TRUE,
color = "darkred",
linetype = "dashed"
) +
labs(
title = "ICR_rainfall_avg vs. avg_Y_ratio (Quadratic)",
x = "Rainfall (avg, mm)",
y = "Average Yield Ratio"
) +
annotate("text",
x = Inf, y = Inf, hjust = 1.1, vjust = 1.5,
label = paste0("p = ", signif(p_val, 3),
"\nR^2 = ", round(r2, 2)),
size = 5)
###Faceted regression: ICR_rainfall_avg vs avg_Y_ratio by environment
library(ggplot2)
library(dplyr)
library(purrr)
library(tidyr)
# 1. Fit separate linear models per environment
stats_df <- drivers_data %>%
filter(!is.na(ICR_rainfall_avg), !is.na(avg_Y_ratio), !is.na(IQF_environment)) %>%
group_by(IQF_environment) %>%
nest() %>%
mutate(
model = map(data, ~ lm(avg_Y_ratio ~ ICR_rainfall_avg, data = .x)),
p_value = map_dbl(model, ~ summary(.x)$coefficients[2, 4]),
p_label = paste0("p = ", signif(p_value, 3))
) %>%
dplyr::select(IQF_environment, p_label)
# 2. Annotation positions
label_y <- max(drivers_data$avg_Y_ratio, na.rm = TRUE) * 0.95
label_x <- max(drivers_data$ICR_rainfall_avg, na.rm = TRUE) * 0.95
stats_df <- stats_df %>%
mutate(x = label_x, y = label_y)
# 3. Plot
ggplot(drivers_data, aes(x = ICR_rainfall_avg, y = avg_Y_ratio)) +
geom_point(alpha = 0.6, color = "steelblue") +
geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
geom_text(
data = stats_df,
aes(x = x, y = y, label = p_label),
inherit.aes = FALSE,
size = 5,
hjust = 1,
vjust = 1
) +
facet_wrap(~ IQF_environment) +
labs(
title = "ICR_rainfall_avg vs. avg_Y_ratio",
subtitle = "Linear regressions and p-values per environment",
x = "Rainfall (avg, mm)",
y = "Average Yield Ratio"
) +
theme_minimal()
### same as avobe but with cuadratic model
library(ggplot2)
library(dplyr)
library(purrr)
library(tidyr)
# 1. Fit quadratic models per environment
stats_df <- drivers_data %>%
filter(!is.na(ICR_rainfall_avg), !is.na(avg_Y_ratio), !is.na(IQF_environment)) %>%
group_by(IQF_environment) %>%
nest() %>%
mutate(
model = map(data, ~ lm(avg_Y_ratio ~ ICR_rainfall_avg + I(ICR_rainfall_avg^2), data = .x)),
p_value = map_dbl(model, ~ summary(.x)$coefficients["I(ICR_rainfall_avg^2)", 4]),
r_squared = map_dbl(model, ~ summary(.x)$r.squared),
# IMPORTANT: use R^2 (not R²)
stat_label = paste0(
"p = ", signif(p_value, 3),
"\nR^2 = ", round(r_squared, 2)
)
) %>%
dplyr::select(IQF_environment, stat_label)
stats_df <- stats_df %>%
mutate(x = label_x, y = label_y)
# 3. Plot
ggplot(drivers_data, aes(x = ICR_rainfall_avg, y = avg_Y_ratio)) +
geom_point(alpha = 0.6, color = "steelblue") +
geom_smooth(
method = "lm",
formula = y ~ x + I(x^2),
se = TRUE,
color = "darkred",
linetype = "dashed"
) +
geom_text(
data = stats_df,
aes(x = x, y = y, label = stat_label),
inherit.aes = FALSE,
size = 5,
hjust = 1,
vjust = 1
) +
facet_wrap(~ IQF_environment) +
labs(
title = "ICR_rainfall_avg vs. avg_Y_ratio (Quadratic)",
subtitle = "Quadratic regressions per environment with p-values and R^2",
x = "Rainfall (avg, mm)",
y = "Average Yield Ratio"
)
###Boxplot: ICR_rainfall_avg by CA_typology (with Tukey HSD letters)
library(ggplot2)
library(dplyr)
library(multcompView)
library(tidyr)
library(purrr)
# Function to extract Tukey letters
get_letters <- function(df) {
mod <- aov(ICR_rainfall_avg ~ CA_typology, data = df)
tuk <- TukeyHSD(mod)
lets <- multcompLetters(tuk$CA_typology[, "p adj"])$Letters
data.frame(
CA_typology = names(lets),
Letters = lets,
stringsAsFactors = FALSE
)
}
# Run stats by environment
stats_df <- drivers_data %>%
filter(!is.na(CA_typology), !is.na(ICR_rainfall_avg)) %>%
group_by(IQF_environment) %>%
nest() %>%
mutate(
letters = map(data, get_letters),
means = map(data, ~ .x %>%
group_by(CA_typology) %>%
summarise(mean_val = mean(ICR_rainfall_avg, na.rm = TRUE),
.groups = "drop"))
) %>%
unnest(letters) %>%
left_join(
unnest(dplyr::select(., IQF_environment, means)),
by = c("IQF_environment", "CA_typology")
) %>%
mutate(mean_label = paste0("mean = ", round(mean_val, 2)))
# Plot
ggplot(drivers_data, aes(x = CA_typology, y = ICR_rainfall_avg)) +
geom_boxplot(outlier.shape = NA, fill = "lightgray") +
geom_jitter(width = 0.2, alpha = 0.6, color = "steelblue", size = 1.4) +
geom_text(
data = stats_df,
aes(x = CA_typology, y = 7, label = Letters),
inherit.aes = FALSE,
fontface = "bold",
size = 3,
color = "darkred"
) +
geom_text(
data = stats_df,
aes(x = CA_typology, y = 6.2, label = mean_label),
inherit.aes = FALSE,
size = 3
) +
facet_wrap(~ IQF_environment) +
labs(
title = "ICR_rainfall_avg by CA_typology (per environment)",
subtitle = "Tukey HSD: significant differences by letter",
x = "CA Typology",
y = "Rainfall (avg, mm)"
) +
theme_minimal()
## CA Performance as affected by Comp_amount_corr_quali
NOtes: It seems to have a significant relation with both Y_relative and typology. Wether pooled all together of per environment, but when split by environment, all show the sam esign of trend, but is clear and significant only for mid-dry and transition Conclusions: Keep for the drivers analysis ###Simple regression: Comp_amount_corr_quali vs avg_Y_ratio
library(ggplot2)
# Filter data
df_filt <- drivers_data %>%
dplyr::filter(!is.na(Comp_amount_corr_quali),
!is.na(avg_Y_ratio),
Comp_amount_corr_quali <= 15)
# 1. Fit linear regression model
model <- lm(avg_Y_ratio ~ Comp_amount_corr_quali, data = df_filt)
# 2. Extract p-value and R^2
p_val <- summary(model)$coefficients[2, 4]
r2 <- summary(model)$r.squared
# 3. Create plot
ggplot(df_filt, aes(x = Comp_amount_corr_quali, y = avg_Y_ratio)) +
geom_point(alpha = 0.6, color = "steelblue") +
geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
labs(
title = "Comp_amount_corr_quali vs. avg_Y_ratio",
x = "Compost amount (corrected, ≤15)",
y = "Average Yield Ratio"
) +
annotate("text",
x = Inf, y = Inf, hjust = 1.1, vjust = 1.5,
label = paste0("p = ", signif(p_val, 3),
"\nR^2 = ", round(r2, 2)),
size = 5) +
theme_minimal()
###Linear regression per environment (facet + R² + p, compost ≤ 15)
library(ggplot2)
library(dplyr)
library(purrr)
library(tidyr)
df_filt <- drivers_data %>%
dplyr::filter(!is.na(Comp_amount_corr_quali),
!is.na(avg_Y_ratio),
!is.na(IQF_environment),
Comp_amount_corr_quali <= 15)
#----------------------------------------------
# 1. Fit models per environment
#----------------------------------------------
stats_df <- df_filt %>%
group_by(IQF_environment) %>%
nest() %>%
mutate(
model = map(data, ~ lm(avg_Y_ratio ~ Comp_amount_corr_quali, data = .x)),
p_value = map_dbl(model, ~ summary(.x)$coefficients[2, 4]),
r_squared = map_dbl(model, ~ summary(.x)$r.squared),
stat_label = paste0("p = ", signif(p_value, 3),
"\nR^2 = ", round(r_squared, 2))
) %>%
dplyr::select(IQF_environment, stat_label)
#----------------------------------------------
# 2. Annotation positions
#----------------------------------------------
label_y <- max(df_filt$avg_Y_ratio, na.rm = TRUE) * 0.95
label_x <- max(df_filt$Comp_amount_corr_quali, na.rm = TRUE) * 0.95
stats_df <- stats_df %>%
mutate(x = label_x, y = label_y)
#----------------------------------------------
# 3. Plot
#----------------------------------------------
ggplot(df_filt, aes(x = Comp_amount_corr_quali, y = avg_Y_ratio)) +
geom_point(alpha = 0.6, color = "steelblue") +
geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
geom_text(
data = stats_df,
aes(x = x, y = y, label = stat_label),
inherit.aes = FALSE,
size = 5,
hjust = 1,
vjust = 1
) +
facet_wrap(~ IQF_environment) +
labs(
title = "Comp_amount_corr_quali vs. avg_Y_ratio",
subtitle = "Linear regressions per environment (compost ≤ 15)",
x = "Compost amount (corrected)",
y = "Average Yield Ratio"
) +
theme_minimal()
###Boxplot by CA_typology (Tukey HSD, compost ≤ 15)
library(ggplot2)
library(dplyr)
library(multcompView)
library(tidyr)
library(purrr)
df_filt <- drivers_data %>%
dplyr::filter(!is.na(CA_typology),
!is.na(Comp_amount_corr_quali),
!is.na(IQF_environment),
Comp_amount_corr_quali <= 15)
get_letters <- function(df) {
mod <- aov(Comp_amount_corr_quali ~ CA_typology, data = df)
tuk <- TukeyHSD(mod)
lets <- multcompLetters(tuk$CA_typology[, "p adj"])$Letters
data.frame(
CA_typology = names(lets),
Letters = lets,
stringsAsFactors = FALSE
)
}
stats_df <- df_filt %>%
group_by(IQF_environment) %>%
nest() %>%
mutate(
letters = map(data, get_letters),
means = map(data, ~ .x %>%
group_by(CA_typology) %>%
summarise(mean_val = mean(Comp_amount_corr_quali, na.rm = TRUE),
.groups = "drop"))
) %>%
unnest(letters) %>%
left_join(
unnest(dplyr::select(., IQF_environment, means)),
by = c("IQF_environment", "CA_typology")
) %>%
mutate(mean_label = paste0("mean = ", round(mean_val, 2)))
ggplot(df_filt, aes(x = CA_typology, y = Comp_amount_corr_quali)) +
geom_boxplot(outlier.shape = NA, fill = "lightgray") +
geom_jitter(width = 0.2, alpha = 0.6, color = "steelblue", size = 1.4) +
geom_text(
data = stats_df,
aes(x = CA_typology,
y = max(df_filt$Comp_amount_corr_quali, na.rm = TRUE) * 0.95,
label = Letters),
inherit.aes = FALSE,
fontface = "bold",
size = 3,
color = "darkred"
) +
geom_text(
data = stats_df,
aes(x = CA_typology,
y = max(df_filt$Comp_amount_corr_quali, na.rm = TRUE) * 0.85,
label = mean_label),
inherit.aes = FALSE,
size = 3
) +
facet_wrap(~ IQF_environment) +
labs(
title = "Comp_amount_corr_quali by CA_typology (per environment)",
subtitle = "Tukey HSD (compost ≤ 15)",
x = "CA Typology",
y = "Compost amount (corrected)"
) +
theme_minimal()
## CA Performance as affected by ICF_planting_date
NOtes: Conclusions: ###Simple regression: ICR_rainfall_avg ICF_planting_date
library(ggplot2)
library(dplyr)
# Filter data
df_filt <- drivers_data %>%
dplyr::filter(!is.na(ICF_planting_date),
!is.na(avg_Y_ratio))
# 1. Fit linear regression model
model <- lm(avg_Y_ratio ~ ICF_planting_date, data = df_filt)
# 2. Extract p-value and R^2
p_val <- summary(model)$coefficients[2, 4]
r2 <- summary(model)$r.squared
# 3. Create plot
ggplot(df_filt, aes(x = ICF_planting_date, y = avg_Y_ratio)) +
geom_point(alpha = 0.6, color = "steelblue") +
geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
labs(
title = "ICF_planting_date vs. avg_Y_ratio",
x = "Planting date",
y = "Average Yield Ratio"
) +
annotate("text",
x = Inf, y = Inf, hjust = 1.1, vjust = 1.5,
label = paste0("p = ", signif(p_val, 3),
"\nR^2 = ", round(r2, 2)),
size = 5) +
theme_minimal()
###Linear regression per environment (facet + R^2 + p)
library(ggplot2)
library(dplyr)
library(purrr)
library(tidyr)
df_filt <- drivers_data %>%
dplyr::filter(!is.na(ICF_planting_date),
!is.na(avg_Y_ratio),
!is.na(IQF_environment))
#----------------------------------------------
# 1. Fit models per environment
#----------------------------------------------
stats_df <- df_filt %>%
group_by(IQF_environment) %>%
nest() %>%
mutate(
model = map(data, ~ lm(avg_Y_ratio ~ ICF_planting_date, data = .x)),
p_value = map_dbl(model, ~ summary(.x)$coefficients[2, 4]),
r_squared = map_dbl(model, ~ summary(.x)$r.squared),
stat_label = paste0("p = ", signif(p_value, 3),
"\nR^2 = ", round(r_squared, 2))
) %>%
dplyr::select(IQF_environment, stat_label)
#----------------------------------------------
# 2. Annotation positions
#----------------------------------------------
label_y <- max(df_filt$avg_Y_ratio, na.rm = TRUE) * 0.95
label_x <- max(df_filt$ICF_planting_date, na.rm = TRUE) * 0.95
stats_df <- stats_df %>%
mutate(x = label_x, y = label_y)
#----------------------------------------------
# 3. Plot
#----------------------------------------------
ggplot(df_filt, aes(x = ICF_planting_date, y = avg_Y_ratio)) +
geom_point(alpha = 0.6, color = "steelblue") +
geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
geom_text(
data = stats_df,
aes(x = x, y = y, label = stat_label),
inherit.aes = FALSE,
size = 5,
hjust = 1,
vjust = 1
) +
facet_wrap(~ IQF_environment) +
labs(
title = "ICF_planting_date vs. avg_Y_ratio",
subtitle = "Linear regressions per environment",
x = "Planting date",
y = "Average Yield Ratio"
) +
theme_minimal()
###Boxplot by CA_typology (Tukey HSD)
library(ggplot2)
library(dplyr)
library(multcompView)
library(tidyr)
library(purrr)
df_filt <- drivers_data %>%
dplyr::filter(!is.na(CA_typology),
!is.na(ICF_planting_date),
!is.na(IQF_environment))
get_letters <- function(df) {
mod <- aov(ICF_planting_date ~ CA_typology, data = df)
tuk <- TukeyHSD(mod)
lets <- multcompLetters(tuk$CA_typology[, "p adj"])$Letters
data.frame(
CA_typology = names(lets),
Letters = lets,
stringsAsFactors = FALSE
)
}
stats_df <- df_filt %>%
group_by(IQF_environment) %>%
nest() %>%
mutate(
letters = map(data, get_letters),
means = map(data, ~ .x %>%
group_by(CA_typology) %>%
summarise(mean_val = mean(ICF_planting_date, na.rm = TRUE),
.groups = "drop"))
) %>%
unnest(letters) %>%
left_join(
unnest(dplyr::select(., IQF_environment, means)),
by = c("IQF_environment", "CA_typology")
) %>%
mutate(mean_label = paste0("mean = ", round(mean_val, 1)))
ggplot(df_filt, aes(x = CA_typology, y = ICF_planting_date)) +
geom_boxplot(outlier.shape = NA, fill = "lightgray") +
geom_jitter(width = 0.2, alpha = 0.6, color = "steelblue", size = 1.4) +
geom_text(
data = stats_df,
aes(x = CA_typology,
y = max(df_filt$ICF_planting_date, na.rm = TRUE) * 0.95,
label = Letters),
inherit.aes = FALSE,
fontface = "bold",
size = 3,
color = "darkred"
) +
geom_text(
data = stats_df,
aes(x = CA_typology,
y = max(df_filt$ICF_planting_date, na.rm = TRUE) * 0.85,
label = mean_label),
inherit.aes = FALSE,
size = 3
) +
facet_wrap(~ IQF_environment) +
labs(
title = "ICF_planting_date by CA_typology (per environment)",
subtitle = "Tukey HSD",
x = "CA Typology",
y = "Planting date"
) +
theme_minimal()
###How does delay in planting relate to rain over the season?
library(ggplot2)
library(dplyr)
library(tidyr)
library(purrr)
# 1. Filter valid data
df_filt <- drivers_data %>%
dplyr::filter(
!is.na(ICF_planting_date),
!is.na(ICR_rainfall_avg),
!is.na(IQF_environment)
)
# 2. Fit linear model per environment
stats_df <- df_filt %>%
group_by(IQF_environment) %>%
nest() %>%
mutate(
model = map(data, ~ lm(ICR_rainfall_avg ~ ICF_planting_date, data = .x)),
p_value = map_dbl(model, ~ summary(.x)$coefficients[2, 4]),
r_squared = map_dbl(model, ~ summary(.x)$r.squared),
stat_label = paste0("p = ", signif(p_value, 3),
"\nR^2 = ", round(r_squared, 2))
) %>%
dplyr::select(IQF_environment, stat_label)
# 3. Annotation position
label_y <- max(df_filt$ICR_rainfall_avg, na.rm = TRUE) * 0.95
label_x <- max(df_filt$ICF_planting_date, na.rm = TRUE) * 0.95
stats_df <- stats_df %>%
mutate(x = label_x, y = label_y)
# 4. Plot
ggplot(df_filt, aes(x = ICF_planting_date, y = ICR_rainfall_avg)) +
geom_point(alpha = 0.6, color = "steelblue", size = 1.5) +
geom_smooth(method = "lm", se = TRUE, color = "darkred", linetype = "dashed") +
geom_text(
data = stats_df,
aes(x = x, y = y, label = stat_label),
inherit.aes = FALSE,
size = 4,
hjust = 1,
vjust = 1
) +
facet_wrap(~ IQF_environment) +
labs(
title = "ICF_planting_date vs. ICR_rainfall_avg",
subtitle = "Linear regression per environment (p-value and R^2 shown)",
x = "Planting date",
y = "Rainfall average (mm)"
) +
theme_minimal()
library(ggplot2)
library(dplyr)
# Filter data
df_filt <- drivers_data %>%
filter(!is.na(Slope),
!is.na(avg_Y_ratio))
# 1. Fit linear regression model
model <- lm(avg_Y_ratio ~ Slope, data = df_filt)
# 2. Extract p-value and R^2
p_val <- summary(model)$coefficients[2, 4]
r2 <- summary(model)$r.squared
# 3. Create plot
ggplot(df_filt, aes(x = Slope, y = avg_Y_ratio)) +
geom_point(alpha = 0.6, color = "steelblue") +
geom_smooth(method = "lm", se = TRUE,
color = "darkred", linetype = "dashed") +
labs(
title = "Slope vs. avg_Y_ratio",
x = "Slope",
y = "Average Yield Ratio"
) +
annotate("text",
x = Inf, y = Inf,
hjust = 1.1, vjust = 1.5,
label = paste0("p = ", signif(p_val, 3),
"\nR^2 == ", round(r2, 2)),
parse = TRUE,
size = 5) +
theme_minimal()
###Linear regression per environment using Slope
library(ggplot2)
library(dplyr)
library(purrr)
library(tidyr)
df_filt <- drivers_data %>%
filter(!is.na(Slope),
!is.na(avg_Y_ratio),
!is.na(IQF_environment))
#----------------------------------------------
# 1. Fit models per environment
#----------------------------------------------
stats_df <- df_filt %>%
group_by(IQF_environment) %>%
nest() %>%
mutate(
model = map(data, ~ lm(avg_Y_ratio ~ Slope, data = .x)),
p_value = map_dbl(model, ~ summary(.x)$coefficients[2, 4]),
r_squared = map_dbl(model, ~ summary(.x)$r.squared),
stat_label = paste0("p = ", signif(p_value, 3),
"\nR^2 = ", round(r_squared, 2))
) %>%
dplyr::select(IQF_environment, stat_label)
#----------------------------------------------
# 2. Annotation positions
#----------------------------------------------
label_y <- max(df_filt$avg_Y_ratio, na.rm = TRUE) * 0.95
label_x <- max(df_filt$Slope, na.rm = TRUE) * 0.95
stats_df <- stats_df %>%
mutate(x = label_x, y = label_y)
#----------------------------------------------
# 3. Plot
#----------------------------------------------
ggplot(df_filt, aes(x = Slope, y = avg_Y_ratio)) +
geom_point(alpha = 0.6, color = "steelblue") +
geom_smooth(method = "lm", se = TRUE,
color = "darkred", linetype = "dashed") +
geom_text(
data = stats_df,
aes(x = x, y = y, label = stat_label),
inherit.aes = FALSE,
size = 5,
hjust = 1,
vjust = 1
) +
facet_wrap(~ IQF_environment) +
labs(
title = "Slope vs. avg_Y_ratio",
subtitle = "Linear regressions per environment",
x = "Slope",
y = "Average Yield Ratio"
) +
theme_minimal()
### Now slopes per CA_typology
library(ggplot2)
library(dplyr)
library(multcompView)
library(tidyr)
library(purrr)
df_filt <- drivers_data %>%
filter(!is.na(CA_typology),
!is.na(Slope),
!is.na(IQF_environment))
#----------------------------------------------
# Function to extract Tukey letters
#----------------------------------------------
get_letters <- function(df) {
mod <- aov(Slope ~ CA_typology, data = df)
tuk <- TukeyHSD(mod)
lets <- multcompLetters(tuk$CA_typology[, "p adj"])$Letters
data.frame(
CA_typology = names(lets),
Letters = lets,
stringsAsFactors = FALSE
)
}
#----------------------------------------------
# Compute letters and means
#----------------------------------------------
stats_df <- df_filt %>%
group_by(IQF_environment) %>%
nest() %>%
mutate(
letters = map(data, get_letters),
means = map(data, ~ .x %>%
group_by(CA_typology) %>%
summarise(mean_val = mean(Slope, na.rm = TRUE),
.groups = "drop"))
) %>%
unnest(letters) %>%
left_join(
unnest(dplyr::select(., IQF_environment, means)),
by = c("IQF_environment", "CA_typology")
) %>%
mutate(mean_label = paste0("mean = ", round(mean_val, 2)))
#----------------------------------------------
# Plot
#----------------------------------------------
ggplot(df_filt, aes(x = CA_typology, y = Slope)) +
geom_boxplot(outlier.shape = NA, fill = "lightgray") +
geom_jitter(width = 0.2, alpha = 0.6,
color = "steelblue", size = 1.4) +
geom_text(
data = stats_df,
aes(x = CA_typology,
y = max(df_filt$Slope, na.rm = TRUE) * 0.95,
label = Letters),
inherit.aes = FALSE,
fontface = "bold",
size = 3,
color = "darkred"
) +
geom_text(
data = stats_df,
aes(x = CA_typology,
y = max(df_filt$Slope, na.rm = TRUE) * 0.85,
label = mean_label),
inherit.aes = FALSE,
size = 3
) +
facet_wrap(~ IQF_environment) +
labs(
title = "Slope by CA_typology (per environment)",
subtitle = "ANOVA + Tukey HSD",
x = "CA Typology",
y = "Slope"
) +
theme_minimal()
### What about slope cathegory and yield ratio
# First we add clope_catto drivers data
library(dplyr)
lookup_slope <- CONV_CA_Ratio %>%
dplyr::select(IQR_block, slope_cat) %>%
distinct(IQR_block, .keep_all = TRUE)
drivers_data <- drivers_data %>%
left_join(lookup_slope, by = "IQR_block")
## Then boxes of avrage Y ratio of CA per slope cathegories by agzone
library(ggplot2)
# Make sure slope_cat is categorical
drivers_data$slope_cat <- as.factor(drivers_data$slope_cat)
ggplot(drivers_data, aes(x = slope_cat, y = avg_Y_ratio, fill = slope_cat)) +
geom_boxplot(alpha = 0.7) +
facet_wrap(~ IQF_environment) +
theme_minimal() +
theme(
legend.position = "none",
strip.text = element_text(face = "bold")
) +
labs(
x = "Slope Category",
y = "Average Y Ratio",
title = "Average Y Ratio by Slope Category across Environments"
)
##Factorial variables ## CA Performance as affected by IQR_TF_tubura_client NOtes: Conclusions: ###IQR_TF_tubura_client with Y_ratio
library(ggplot2)
library(dplyr)
# 1. Filter valid data
df_filt <- drivers_data %>%
dplyr::filter(
!is.na(avg_Y_ratio),
!is.na(IQR_TF_tubura_client),
!IQR_TF_tubura_client %in% c("no data", "---", "", "NA")
) %>%
mutate(IQR_TF_tubura_client = droplevels(IQR_TF_tubura_client)) # drop unused levels
# 2. Run ANOVA
anova_model <- aov(avg_Y_ratio ~ IQR_TF_tubura_client, data = df_filt)
anova_p <- summary(anova_model)[[1]][["Pr(>F)"]][1]
# 3. Run Kruskal-Wallis test
kruskal_test <- kruskal.test(avg_Y_ratio ~ IQR_TF_tubura_client, data = df_filt)
kruskal_p <- kruskal_test$p.value
# 4. Build annotation label
test_label <- paste0(
"ANOVA p = ", signif(anova_p, 3),
"\nKruskal p = ", signif(kruskal_p, 3)
)
# 5. Calculate group means
mean_df <- df_filt %>%
group_by(IQR_TF_tubura_client) %>%
summarise(mean_val = mean(avg_Y_ratio, na.rm = TRUE), .groups = "drop") %>%
mutate(mean_label = paste0("mean = ", round(mean_val, 2)))
# 6. Plot boxplot with p-values and means
ggplot(df_filt, aes(x = IQR_TF_tubura_client, y = avg_Y_ratio)) +
geom_boxplot(outlier.shape = NA, fill = "lightgray") +
geom_jitter(width = 0.2, alpha = 0.6, color = "steelblue") +
# Means under each box
geom_text(
data = mean_df,
aes(x = IQR_TF_tubura_client,
y = min(df_filt$avg_Y_ratio, na.rm = TRUE) * 0.95,
label = mean_label),
inherit.aes = FALSE,
size = 4
) +
# P-value annotation
annotate("text",
x = Inf, y = Inf,
hjust = 1.1, vjust = 1.5,
label = test_label,
size = 5) +
labs(
title = "avg_Y_ratio by IQR_TF_tubura_client",
subtitle = "ANOVA and Kruskal–Wallis test results shown",
x = "Tubura client status",
y = "Average Yield Ratio"
) +
theme_minimal()
###IQR_TF_tubura_client with CA_typology
library(ggplot2)
library(dplyr)
library(scales)
# Clean and prepare data
df_bar <- drivers_data %>%
filter(
!is.na(CA_typology),
!is.na(IQR_TF_tubura_client),
!CA_typology %in% c("no data", "---", "", "NA"),
!IQR_TF_tubura_client %in% c("no data", "---", "", "NA")
)
# Calculate N per category of explanatory variable
n_labels <- df_bar %>%
group_by(IQR_TF_tubura_client) %>%
summarise(n = n(), .groups = "drop")
# Plot with N annotations
ggplot(df_bar, aes(x = IQR_TF_tubura_client, fill = CA_typology)) +
geom_bar(position = "fill", color = "black") +
geom_text(
data = n_labels,
aes(x = IQR_TF_tubura_client, y = 1.02, label = paste0("N = ", n)),
inherit.aes = FALSE,
size = 4
) +
scale_y_continuous(labels = percent_format(), expand = expansion(mult = c(0, 0.1))) +
labs(
title = "CA_typology distribution by Tubura client status",
subtitle = "Proportions with total sample size per group",
x = "IQR_TF_tubura_client",
y = "Proportion of CA Typology",
fill = "CA Typology"
) +
theme_minimal()
# Build contingency table
tab <- table(df_bar$IQR_TF_tubura_client, df_bar$CA_typology)
# Run chi-squared test
chi_test <- chisq.test(tab)
# Print results
chi_test
##
## Pearson's Chi-squared test
##
## data: tab
## X-squared = NaN, df = 6, p-value = NA
# Fisher’s Exact Test (for categorical association)
fisher_test <- fisher.test(tab)
# Print results
fisher_test
##
## Fisher's Exact Test for Count Data
##
## data: tab
## p-value = 0.673
## alternative hypothesis: two.sided
NOtes: Conclusions: ###IQR_plot_fert_quality with Y_ratio
library(ggplot2)
library(dplyr)
library(multcompView)
# 1. Filter and clean data
df_filt <- drivers_data %>%
filter(
!is.na(avg_Y_ratio),
!is.na(IQR_plot_fert_quality),
!IQR_plot_fert_quality %in% c("no data", "---", "", "NA")
) %>%
mutate(IQR_plot_fert_quality = factor(IQR_plot_fert_quality, levels = c("low_quality", "medium", "high")))
# 2. Run ANOVA
anova_model <- aov(avg_Y_ratio ~ IQR_plot_fert_quality, data = df_filt)
anova_p <- summary(anova_model)[[1]][["Pr(>F)"]][1]
# 3. Tukey HSD and Letters
tukey <- TukeyHSD(anova_model)
letters <- multcompLetters(tukey$`IQR_plot_fert_quality`[, "p adj"])$Letters
letters_df <- data.frame(
IQR_plot_fert_quality = names(letters),
Letters = letters,
stringsAsFactors = FALSE
)
# 4. Kruskal-Wallis test
kruskal_p <- kruskal.test(avg_Y_ratio ~ IQR_plot_fert_quality, data = df_filt)$p.value
test_label <- paste0(
"ANOVA p = ", signif(anova_p, 3),
"\nKruskal p = ", signif(kruskal_p, 3)
)
# 5. Group means
means_df <- df_filt %>%
group_by(IQR_plot_fert_quality) %>%
summarise(mean_val = mean(avg_Y_ratio, na.rm = TRUE), .groups = "drop") %>%
mutate(mean_label = paste0("mean = ", round(mean_val, 2)))
# 6. Combine means and letters
labels_df <- left_join(means_df, letters_df, by = "IQR_plot_fert_quality")
# 7. Plot
ggplot(df_filt, aes(x = IQR_plot_fert_quality, y = avg_Y_ratio)) +
geom_boxplot(outlier.shape = NA, fill = "lightgray") +
geom_jitter(width = 0.2, alpha = 0.6, color = "steelblue") +
# Add Tukey letters above boxes
geom_text(
data = labels_df,
aes(x = IQR_plot_fert_quality, y = max(df_filt$avg_Y_ratio, na.rm = TRUE) * 1.02, label = Letters),
inherit.aes = FALSE,
fontface = "bold", size = 5, color = "darkred"
) +
# Add means below boxes
geom_text(
data = labels_df,
aes(x = IQR_plot_fert_quality, y = min(df_filt$avg_Y_ratio, na.rm = TRUE) * 0.95, label = mean_label),
inherit.aes = FALSE,
size = 4
) +
# Add test label
annotate("text",
x = Inf, y = Inf,
hjust = 1.1, vjust = 1.5,
label = test_label,
size = 5) +
labs(
title = "avg_Y_ratio by IQR_plot_fert_quality",
subtitle = "Tukey HSD letters, means, and statistical tests",
x = "Plot Fertility Quality",
y = "Average Yield Ratio"
) +
theme_minimal()
###IQR_plot_fert_quality with CA_typology
library(ggplot2)
library(dplyr)
library(scales)
# 1. Clean and prepare data
df_bar <- drivers_data %>%
filter(
!is.na(CA_typology),
!is.na(IQR_plot_fert_quality),
!CA_typology %in% c("no data", "---", "", "NA"),
!IQR_plot_fert_quality %in% c("no data", "---", "", "NA")
) %>%
mutate(
IQR_plot_fert_quality = factor(IQR_plot_fert_quality,
levels = c("high", "medium", "low_quality")) # Desired order
)
# 2. Calculate N per category
n_labels <- df_bar %>%
group_by(IQR_plot_fert_quality) %>%
summarise(n = n(), .groups = "drop")
# 3. Plot
ggplot(df_bar, aes(x = IQR_plot_fert_quality, fill = CA_typology)) +
geom_bar(position = "fill", color = "black") +
geom_text(
data = n_labels,
aes(x = IQR_plot_fert_quality, y = 1.02, label = paste0("N = ", n)),
inherit.aes = FALSE,
size = 4
) +
scale_y_continuous(labels = percent_format(), expand = expansion(mult = c(0, 0.1))) +
labs(
title = "CA_typology distribution by Fertilizer Quality",
subtitle = "Proportions with total sample size per group",
x = "IQR_plot_fert_quality",
y = "Proportion of CA Typology",
fill = "CA Typology"
) +
theme_minimal()
# 4. Chi-squared test
tab <- table(df_bar$IQR_plot_fert_quality, df_bar$CA_typology)
chi_test <- chisq.test(tab)
chi_test
##
## Pearson's Chi-squared test
##
## data: tab
## X-squared = 3.8727, df = 6, p-value = 0.6939
library(dplyr)
library(ggplot2)
fert_data <- drivers_data %>%
filter(!is.na(IQR_plot_fert_quality)) %>%
mutate(
IQR_plot_fert_quality = droplevels(IQR_plot_fert_quality)
)
## Paste the env_index
library(dplyr)
env_block_avg <- CONV_CA_Ratio %>%
filter(!is.na(IQR_block),
!is.na(env_index)) %>%
group_by(IQR_block) %>%
summarise(
env_index_avg = mean(env_index, na.rm = TRUE),
.groups = "drop"
)
fert_data <- drivers_data %>%
filter(!is.na(IQR_plot_fert_quality)) %>%
mutate(
IQR_plot_fert_quality = droplevels(IQR_plot_fert_quality)
) %>%
left_join(env_block_avg, by = "IQR_block")
cat("Rows in fert_data:", nrow(fert_data), "\n")
## Rows in fert_data: 471
summary(fert_data$env_index_avg)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.3312 0.8015 0.9828 0.9967 1.1471 2.4047
# Check variables relation
# Environmetal index
library(dplyr)
library(ggplot2)
# ---------------------------------------
# 1️⃣ Ensure env_index_avg exists
# ---------------------------------------
fert_data <- fert_data %>%
filter(!is.na(IQR_plot_fert_quality),
!is.na(env_index_avg))
# ---------------------------------------
# 2️⃣ Summary statistics
# ---------------------------------------
summary_env <- fert_data %>%
group_by(IQR_plot_fert_quality) %>%
summarise(
mean_val = mean(env_index_avg, na.rm = TRUE),
N = n(),
y_pos = max(env_index_avg, na.rm = TRUE) * 1.05,
label = paste0("Mean = ", round(mean_val, 2),
"\nN = ", N),
.groups = "drop"
)
# ---------------------------------------
# 3️⃣ Plot
# ---------------------------------------
ggplot(fert_data,
aes(x = IQR_plot_fert_quality,
y = env_index_avg,
fill = IQR_plot_fert_quality)) +
geom_boxplot(alpha = 0.6) +
geom_jitter(width = 0.15,
alpha = 0.3,
color = "black") +
geom_text(data = summary_env,
aes(x = IQR_plot_fert_quality,
y = y_pos,
label = label),
inherit.aes = FALSE,
size = 4) +
labs(
title = "Environmental Index vs Farmer Perceived Soil Fertility",
x = "Farmer Perceived Soil Fertility",
y = "Average Environmental Index (across seasons)"
) +
theme_minimal(base_size = 14) +
theme(
legend.position = "none",
plot.title = element_text(face = "bold", hjust = 0.5)
)
# SOC
library(dplyr)
library(ggplot2)
fert_data <- drivers_data %>%
filter(!is.na(IQR_plot_fert_quality),
!is.na(ICR_Org_C_avg))
summary_OC <- fert_data %>%
group_by(IQR_plot_fert_quality) %>%
summarise(
mean_val = mean(ICR_Org_C_avg, na.rm = TRUE),
N = n(),
y_pos = max(ICR_Org_C_avg, na.rm = TRUE) * 1.05,
label = paste0("Mean=", round(mean_val, 2),
"\nN=", N),
.groups = "drop"
)
# Plot
ggplot(fert_data,
aes(x = IQR_plot_fert_quality,
y = ICR_Org_C_avg,
fill = IQR_plot_fert_quality)) +
geom_boxplot(alpha = 0.6) +
geom_jitter(width = 0.15, alpha = 0.3) +
geom_text(data = summary_OC,
aes(x = IQR_plot_fert_quality,
y = y_pos,
label = label),
inherit.aes = FALSE,
size = 4) +
labs(
title = "Organic Carbon vs Farmer Perception",
x = "Farmer Perceived Soil Fertility",
y = "Organic Carbon"
) +
theme_minimal() +
theme(legend.position = "none")
##P
# Summary
summary_P <- fert_data %>%
group_by(IQR_plot_fert_quality) %>%
summarise(
mean_val = mean(ICR_Avail_P_avg, na.rm = TRUE),
N = n(),
y_pos = max(ICR_Avail_P_avg, na.rm = TRUE) * 1.05,
label = paste0("Mean=", round(mean_val, 1),
"\nN=", N),
.groups = "drop"
)
ggplot(fert_data,
aes(x = IQR_plot_fert_quality,
y = ICR_Avail_P_avg,
fill = IQR_plot_fert_quality)) +
geom_boxplot(alpha = 0.6) +
geom_jitter(width = 0.15, alpha = 0.3) +
geom_text(data = summary_P,
aes(x = IQR_plot_fert_quality,
y = y_pos,
label = label),
inherit.aes = FALSE,
size = 4) +
theme_minimal() +
theme(legend.position = "none")
##K
summary_K <- fert_data %>%
group_by(IQR_plot_fert_quality) %>%
summarise(
mean_val = mean(ICR_K_perc_23A, na.rm = TRUE),
N = n(),
y_pos = max(ICR_K_perc_23A, na.rm = TRUE) * 1.05,
label = paste0("Mean = ", round(mean_val, 3),
"\nN = ", N),
.groups = "drop"
)
# ---------------------------------------
# 3️⃣ Plot
# ---------------------------------------
ggplot(fert_data,
aes(x = IQR_plot_fert_quality,
y = ICR_K_perc_23A,
fill = IQR_plot_fert_quality)) +
geom_boxplot(alpha = 0.6) +
geom_jitter(width = 0.15,
alpha = 0.3,
color = "black") +
geom_text(data = summary_K,
aes(x = IQR_plot_fert_quality,
y = y_pos,
label = label),
inherit.aes = FALSE,
size = 4) +
labs(
title = "Potassium (ICR_K_perc_23A) vs Farmer Perceived Soil Fertility",
x = "Farmer Perceived Soil Fertility",
y = "Potassium (%)"
) +
theme_minimal(base_size = 14) +
theme(
legend.position = "none",
plot.title = element_text(face = "bold", hjust = 0.5)
)
# Soil texture
ggplot(fert_data,
aes(x = IQR_plot_fert_quality,
fill = IQR_soil_texture)) +
geom_bar(position = "fill") +
labs(
title = "Soil Texture Distribution by Farmer Perception",
x = "Farmer Perception",
y = "Proportion",
fill = "Soil Texture"
) +
theme_minimal()
chisq.test(table(fert_data$IQR_plot_fert_quality,
fert_data$IQR_soil_texture))
##
## Pearson's Chi-squared test
##
## data: table(fert_data$IQR_plot_fert_quality, fert_data$IQR_soil_texture)
## X-squared = NaN, df = 9, p-value = NA
# Soil color
ggplot(fert_data,
aes(x = IQR_plot_fert_quality,
fill = IQR_soil_color)) +
geom_bar(position = "fill") +
labs(
title = "Soil Color by Farmer Perception",
y = "Proportion"
) +
theme_minimal()
# Slope position
ggplot(fert_data,
aes(x = IQR_plot_fert_quality,
fill = IQF_position_slope)) +
geom_bar(position = "fill") +
labs(
title = "Slope Position by Farmer Perception",
y = "Proportion"
) +
theme_minimal()
## CA Performance as affected by IQR_soil_texture NOtes: Conclusions:
###IQR_soil_texture with Y_ratio
library(ggplot2)
library(dplyr)
library(multcompView)
# 1. Filter and clean the data
df_filt <- drivers_data %>%
filter(
!is.na(avg_Y_ratio),
!is.na(IQR_soil_texture),
!IQR_soil_texture %in% c("no data", "---", "", "NA")
) %>%
mutate(IQR_soil_texture = droplevels(IQR_soil_texture))
# Optional: Reorder texture levels manually (if desired)
# Example order: "sand", "loam", "clay", etc.
df_filt <- df_filt %>%
mutate(IQR_soil_texture = factor(IQR_soil_texture, levels = c("Fine", "Mix", "Coarse")))
# 2. Run ANOVA
anova_model <- aov(avg_Y_ratio ~ IQR_soil_texture, data = df_filt)
anova_p <- summary(anova_model)[[1]][["Pr(>F)"]][1]
# 3. Run Kruskal-Wallis test
kruskal_p <- kruskal.test(avg_Y_ratio ~ IQR_soil_texture, data = df_filt)$p.value
# 4. Tukey HSD and compact letter display
tukey <- TukeyHSD(anova_model)
letters <- multcompLetters(tukey$`IQR_soil_texture`[, "p adj"])$Letters
letters_df <- data.frame(IQR_soil_texture = names(letters), Letters = letters)
# 5. Compute group means
means_df <- df_filt %>%
group_by(IQR_soil_texture) %>%
summarise(mean_val = mean(avg_Y_ratio, na.rm = TRUE), .groups = "drop") %>%
mutate(mean_label = paste0("mean = ", round(mean_val, 2)))
# 6. Combine means + letters
labels_df <- left_join(means_df, letters_df, by = "IQR_soil_texture")
# 7. Test label
test_label <- paste0(
"ANOVA p = ", signif(anova_p, 3),
"\nKruskal p = ", signif(kruskal_p, 3)
)
# 8. Plot
ggplot(df_filt, aes(x = IQR_soil_texture, y = avg_Y_ratio)) +
geom_boxplot(outlier.shape = NA, fill = "lightgray") +
geom_jitter(width = 0.2, alpha = 0.6, color = "steelblue") +
# Letters above box
geom_text(
data = labels_df,
aes(x = IQR_soil_texture, y = max(df_filt$avg_Y_ratio, na.rm = TRUE) * 1.02, label = Letters),
inherit.aes = FALSE,
fontface = "bold", size = 5, color = "darkred"
) +
# Means below box
geom_text(
data = labels_df,
aes(x = IQR_soil_texture, y = min(df_filt$avg_Y_ratio, na.rm = TRUE) * 0.95, label = mean_label),
inherit.aes = FALSE,
size = 4
) +
# Statistical tests label
annotate("text",
x = Inf, y = Inf,
hjust = 1.1, vjust = 1.5,
label = test_label,
size = 5) +
labs(
title = "avg_Y_ratio by IQR_soil_texture",
subtitle = "Tukey HSD letters, group means, ANOVA and Kruskal–Wallis p-values",
x = "Soil Texture",
y = "Average Yield Ratio"
) +
theme_minimal()
###IQR_soil_texture with CA_typology
library(ggplot2)
library(dplyr)
library(scales)
# 1. Clean and prepare data
df_bar <- drivers_data %>%
filter(
!is.na(CA_typology),
!is.na(IQR_soil_texture),
!CA_typology %in% c("no data", "---", "", "NA"),
!IQR_soil_texture %in% c("no data", "---", "", "NA")
) %>%
mutate(IQR_soil_texture = droplevels(as.factor(IQR_soil_texture))) %>%
mutate(IQR_soil_texture = factor(IQR_soil_texture ,
levels = c("Fine", "Mix", "Coarse")))
# Desired order
# 2. Calculate N per level of explanatory variable
n_labels <- df_bar %>%
group_by(IQR_soil_texture) %>%
summarise(n = n(), .groups = "drop")
# 3. Bar plot
ggplot(df_bar, aes(x = IQR_soil_texture, fill = CA_typology)) +
geom_bar(position = "fill", color = "black") +
geom_text(
data = n_labels,
aes(x = IQR_soil_texture, y = 1.02, label = paste0("N = ", n)),
inherit.aes = FALSE,
size = 4
) +
scale_y_continuous(labels = percent_format(), expand = expansion(mult = c(0, 0.1))) +
labs(
title = "CA_typology distribution by Soil Texture",
subtitle = "Proportions with total sample size per texture class",
x = "IQR_soil_texture",
y = "Proportion of CA Typology",
fill = "CA Typology"
) +
theme_minimal()
# 4. Statistical tests
tab <- table(df_bar$IQR_soil_texture, df_bar$CA_typology)
# Chi-squared test
chi_test <- chisq.test(tab)
chi_test
##
## Pearson's Chi-squared test
##
## data: tab
## X-squared = 6.0014, df = 6, p-value = 0.423
NOtes: Conclusions: ###IQR_soil_color with Y_ratio
library(ggplot2)
library(dplyr)
library(multcompView)
# 1. Filter and clean the data
df_filt <- drivers_data %>%
filter(
!is.na(avg_Y_ratio),
!is.na(IQR_soil_color),
!IQR_soil_color %in% c("no data", "---", "", "#N/A")
) %>%
mutate(IQR_soil_color = droplevels(IQR_soil_color))
# 2. Run ANOVA
anova_model <- aov(avg_Y_ratio ~ IQR_soil_color, data = df_filt)
anova_p <- summary(anova_model)[[1]][["Pr(>F)"]][1]
# 3. Kruskal–Wallis test
kruskal_p <- kruskal.test(avg_Y_ratio ~ IQR_soil_color, data = df_filt)$p.value
# 4. Tukey HSD and letters
tukey <- TukeyHSD(anova_model)
letters <- multcompLetters(tukey$`IQR_soil_color`[, "p adj"])$Letters
letters_df <- data.frame(IQR_soil_color = names(letters), Letters = letters)
# 5. Group means
means_df <- df_filt %>%
group_by(IQR_soil_color) %>%
summarise(mean_val = mean(avg_Y_ratio, na.rm = TRUE), .groups = "drop") %>%
mutate(mean_label = paste0("mean = ", round(mean_val, 2)))
# 6. Combine letters and means
labels_df <- left_join(means_df, letters_df, by = "IQR_soil_color")
# 7. Test label
test_label <- paste0(
"ANOVA p = ", signif(anova_p, 3),
"\nKruskal p = ", signif(kruskal_p, 3)
)
# 8. Plot
ggplot(df_filt, aes(x = IQR_soil_color, y = avg_Y_ratio)) +
geom_boxplot(outlier.shape = NA, fill = "lightgray") +
geom_jitter(width = 0.2, alpha = 0.6, color = "steelblue") +
# Letters above boxes
geom_text(
data = labels_df,
aes(x = IQR_soil_color, y = max(df_filt$avg_Y_ratio, na.rm = TRUE) * 1.02, label = Letters),
inherit.aes = FALSE,
fontface = "bold", size = 5, color = "darkred"
) +
# Means below boxes
geom_text(
data = labels_df,
aes(x = IQR_soil_color, y = min(df_filt$avg_Y_ratio, na.rm = TRUE) * 0.95, label = mean_label),
inherit.aes = FALSE,
size = 4
) +
annotate("text",
x = Inf, y = Inf,
hjust = 1.1, vjust = 1.5,
label = test_label,
size = 5) +
labs(
title = "avg_Y_ratio by IQR_soil_color",
subtitle = "Tukey HSD letters, group means, ANOVA and Kruskal–Wallis p-values",
x = "Soil Color",
y = "Average Yield Ratio"
) +
theme_minimal()
### Soil color facet by environment
library(ggplot2)
library(dplyr)
library(multcompView)
# -------------------------------------------------
# 1️⃣ Filter and clean
# -------------------------------------------------
df_filt <- drivers_data %>%
filter(
!is.na(avg_Y_ratio),
!is.na(IQR_soil_color),
!is.na(IQF_environment),
!IQR_soil_color %in% c("no data", "---", "", "#N/A")
) %>%
mutate(IQR_soil_color = droplevels(factor(IQR_soil_color)))
# -------------------------------------------------
# 2️⃣ Run ANOVA + Kruskal + Tukey per environment
# -------------------------------------------------
results_env <- df_filt %>%
group_by(IQF_environment) %>%
group_modify(~{
# Count number of soil color groups
n_groups <- length(unique(.x$IQR_soil_color))
# Compute group means (always)
means_df <- .x %>%
group_by(IQR_soil_color) %>%
summarise(
mean_val = mean(avg_Y_ratio, na.rm = TRUE),
.groups = "drop"
) %>%
mutate(mean_label = paste0("mean = ", round(mean_val, 2)))
# If fewer than 2 groups → skip tests
if (n_groups < 2) {
means_df$Letters <- ""
means_df$anova_p <- NA
means_df$kruskal_p <- NA
return(means_df)
}
# ---------------- ANOVA ----------------
anova_model <- aov(avg_Y_ratio ~ IQR_soil_color, data = .x)
anova_p <- summary(anova_model)[[1]][["Pr(>F)"]][1]
# ---------------- Kruskal ----------------
kruskal_p <- kruskal.test(avg_Y_ratio ~ IQR_soil_color, data = .x)$p.value
# ---------------- Tukey (robust) ----------------
tukey <- TukeyHSD(anova_model)
tukey_mat <- tukey$IQR_soil_color
if (is.null(tukey_mat) || nrow(tukey_mat) == 0) {
means_df$Letters <- ""
} else {
tukey_vec <- tukey_mat[, "p adj"]
# Ensure names exist
if (is.null(names(tukey_vec))) {
names(tukey_vec) <- rownames(tukey_mat)
}
letters <- multcompLetters(tukey_vec)$Letters
letters_df <- data.frame(
IQR_soil_color = names(letters),
Letters = letters,
stringsAsFactors = FALSE
)
means_df <- left_join(means_df, letters_df,
by = "IQR_soil_color")
}
means_df$anova_p <- anova_p
means_df$kruskal_p <- kruskal_p
means_df
}) %>%
ungroup()
# -------------------------------------------------
# 3️⃣ Create p-value labels per environment
# -------------------------------------------------
test_labels <- results_env %>%
distinct(IQF_environment, anova_p, kruskal_p) %>%
mutate(
test_label = paste0(
"ANOVA p = ", signif(anova_p, 3),
"\nKruskal p = ", signif(kruskal_p, 3)
)
)
# -------------------------------------------------
# 4️⃣ Plot
# -------------------------------------------------
ggplot(df_filt, aes(x = IQR_soil_color, y = avg_Y_ratio)) +
geom_boxplot(outlier.shape = NA, fill = "lightgray") +
geom_jitter(width = 0.2, alpha = 0.6, color = "steelblue") +
# Tukey Letters
geom_text(
data = results_env,
aes(
x = IQR_soil_color,
y = max(df_filt$avg_Y_ratio, na.rm = TRUE) * 1.05,
label = Letters
),
inherit.aes = FALSE,
fontface = "bold",
size = 5,
color = "darkred"
) +
# Means
geom_text(
data = results_env,
aes(
x = IQR_soil_color,
y = min(df_filt$avg_Y_ratio, na.rm = TRUE) * 0.95,
label = mean_label
),
inherit.aes = FALSE,
size = 4
) +
# P-values
geom_text(
data = test_labels,
aes(x = Inf, y = Inf, label = test_label),
hjust = 1.1,
vjust = 1.5,
inherit.aes = FALSE,
size = 4
) +
facet_wrap(~ IQF_environment) +
labs(
title = "avg_Y_ratio by IQR_soil_color",
subtitle = "Facet by Environment | Tukey letters + ANOVA + Kruskal–Wallis",
x = "Soil Color",
y = "Average Yield Ratio"
) +
theme_minimal()
###IQR_soil_color with CA_typology
library(ggplot2)
library(dplyr)
library(scales)
# 1. Clean data
df_bar <- drivers_data %>%
filter(
!is.na(CA_typology),
!is.na(IQR_soil_color),
!CA_typology %in% c("no data", "---", "", "NA"),
!IQR_soil_color %in% c("no data", "---", "", "#N/A")
)
# 2. Calculate N per category
n_labels <- df_bar %>%
group_by(IQR_soil_color) %>%
summarise(n = n(), .groups = "drop")
# 3. Plot
ggplot(df_bar, aes(x = IQR_soil_color, fill = CA_typology)) +
geom_bar(position = "fill", color = "black") +
geom_text(
data = n_labels,
aes(x = IQR_soil_color, y = 1.02, label = paste0("N = ", n)),
inherit.aes = FALSE,
size = 4
) +
scale_y_continuous(labels = percent_format(), expand = expansion(mult = c(0, 0.1))) +
labs(
title = "CA_typology distribution by Soil Color",
subtitle = "Proportional bars with total sample size",
x = "Soil Color",
y = "Proportion of CA Typology",
fill = "CA Typology"
) +
theme_minimal()
# 4. Chi-squared and Fisher’s test
tab <- table(df_bar$IQR_soil_color, df_bar$CA_typology)
# Chi-squared test
chi_test <- chisq.test(tab)
print(chi_test)
##
## Pearson's Chi-squared test
##
## data: tab
## X-squared = NaN, df = 15, p-value = NA
# Fisher’s Exact Test (with simulation to avoid memory issues)
fisher_test <- fisher.test(tab, simulate.p.value = TRUE, B = 1e5)
print(fisher_test)
##
## Fisher's Exact Test for Count Data with simulated p-value (based on
## 1e+05 replicates)
##
## data: tab
## p-value = 0.6496
## alternative hypothesis: two.sided
NOtes: Conclusions: ###IQF_position_slope with Y_ratio
library(ggplot2)
library(dplyr)
library(multcompView)
# 1. Filter and clean data
df_filt <- drivers_data %>%
filter(
!is.na(avg_Y_ratio),
!is.na(IQF_position_slope),
!IQF_position_slope %in% c("no data", "---", "", "NA")
) %>%
mutate(IQF_position_slope = droplevels(IQF_position_slope))
# 2. ANOVA
anova_model <- aov(avg_Y_ratio ~ IQF_position_slope, data = df_filt)
anova_p <- summary(anova_model)[[1]][["Pr(>F)"]][1]
# 3. Kruskal-Wallis test
kruskal_p <- kruskal.test(avg_Y_ratio ~ IQF_position_slope, data = df_filt)$p.value
# 4. Tukey HSD and group letters
tukey <- TukeyHSD(anova_model)
letters <- multcompLetters(tukey$`IQF_position_slope`[, "p adj"])$Letters
letters_df <- data.frame(IQF_position_slope = names(letters), Letters = letters)
# 5. Group means
means_df <- df_filt %>%
group_by(IQF_position_slope) %>%
summarise(mean_val = mean(avg_Y_ratio, na.rm = TRUE), .groups = "drop") %>%
mutate(mean_label = paste0("mean = ", round(mean_val, 2)))
# 6. Combine means + letters
labels_df <- left_join(means_df, letters_df, by = "IQF_position_slope")
# 7. Statistical test label
test_label <- paste0(
"ANOVA p = ", signif(anova_p, 3),
"\nKruskal p = ", signif(kruskal_p, 3)
)
# 8. Plot
ggplot(df_filt, aes(x = IQF_position_slope, y = avg_Y_ratio)) +
geom_boxplot(outlier.shape = NA, fill = "lightgray") +
geom_jitter(width = 0.2, alpha = 0.6, color = "steelblue") +
# Tukey HSD letters (above)
geom_text(
data = labels_df,
aes(x = IQF_position_slope, y = max(df_filt$avg_Y_ratio, na.rm = TRUE) * 1.02, label = Letters),
inherit.aes = FALSE,
fontface = "bold", size = 5, color = "darkred"
) +
# Means (below)
geom_text(
data = labels_df,
aes(x = IQF_position_slope, y = min(df_filt$avg_Y_ratio, na.rm = TRUE) * 0.95, label = mean_label),
inherit.aes = FALSE,
size = 4
) +
annotate("text",
x = Inf, y = Inf,
hjust = 1.1, vjust = 1.5,
label = test_label,
size = 5) +
labs(
title = "avg_Y_ratio by IQF_position_slope",
subtitle = "Tukey HSD letters, group means, ANOVA and Kruskal–Wallis p-values",
x = "Slope Position",
y = "Average Yield Ratio"
) +
theme_minimal()
###Facet by environment
library(ggplot2)
library(dplyr)
library(multcompView)
# -------------------------------------------------
# 1️⃣ Filter and clean
# -------------------------------------------------
df_filt <- drivers_data %>%
filter(
!is.na(avg_Y_ratio),
!is.na(IQF_position_slope),
!is.na(IQF_environment),
!IQF_position_slope %in% c("no data", "---", "", "NA")
) %>%
mutate(IQF_position_slope = droplevels(factor(IQF_position_slope)))
# -------------------------------------------------
# 2️⃣ Run ANOVA + Kruskal + Tukey per environment
# -------------------------------------------------
results_env <- df_filt %>%
group_by(IQF_environment) %>%
group_modify(~{
n_groups <- length(unique(.x$IQF_position_slope))
# Means
means_df <- .x %>%
group_by(IQF_position_slope) %>%
summarise(
mean_val = mean(avg_Y_ratio, na.rm = TRUE),
.groups = "drop"
) %>%
mutate(mean_label = paste0("mean = ", round(mean_val, 2)))
if (n_groups < 2) {
means_df$Letters <- ""
means_df$anova_p <- NA
means_df$kruskal_p <- NA
return(means_df)
}
# ANOVA
anova_model <- aov(avg_Y_ratio ~ IQF_position_slope, data = .x)
anova_p <- summary(anova_model)[[1]][["Pr(>F)"]][1]
# Kruskal
kruskal_p <- kruskal.test(avg_Y_ratio ~ IQF_position_slope, data = .x)$p.value
# Tukey (robust)
tukey <- TukeyHSD(anova_model)
tukey_mat <- tukey$IQF_position_slope
if (is.null(tukey_mat) || nrow(tukey_mat) == 0) {
means_df$Letters <- ""
} else {
tukey_vec <- tukey_mat[, "p adj"]
if (is.null(names(tukey_vec))) {
names(tukey_vec) <- rownames(tukey_mat)
}
letters <- multcompLetters(tukey_vec)$Letters
letters_df <- data.frame(
IQF_position_slope = names(letters),
Letters = letters,
stringsAsFactors = FALSE
)
means_df <- left_join(means_df, letters_df,
by = "IQF_position_slope")
}
means_df$anova_p <- anova_p
means_df$kruskal_p <- kruskal_p
means_df
}) %>%
ungroup()
# -------------------------------------------------
# 3️⃣ Create p-value labels per environment
# -------------------------------------------------
test_labels <- results_env %>%
distinct(IQF_environment, anova_p, kruskal_p) %>%
mutate(
test_label = paste0(
"ANOVA p = ", signif(anova_p, 3),
"\nKruskal p = ", signif(kruskal_p, 3)
)
)
# -------------------------------------------------
# 4️⃣ Plot
# -------------------------------------------------
ggplot(df_filt, aes(x = IQF_position_slope, y = avg_Y_ratio)) +
geom_boxplot(outlier.shape = NA, fill = "lightgray") +
geom_jitter(width = 0.2, alpha = 0.6, color = "steelblue") +
# Tukey Letters
geom_text(
data = results_env,
aes(
x = IQF_position_slope,
y = max(df_filt$avg_Y_ratio, na.rm = TRUE) * 1.05,
label = Letters
),
inherit.aes = FALSE,
fontface = "bold",
size = 5,
color = "darkred"
) +
# Means
geom_text(
data = results_env,
aes(
x = IQF_position_slope,
y = min(df_filt$avg_Y_ratio, na.rm = TRUE) * 0.95,
label = mean_label
),
inherit.aes = FALSE,
size = 4
) +
# P-values
geom_text(
data = test_labels,
aes(x = Inf, y = Inf, label = test_label),
hjust = 1.1,
vjust = 1.5,
inherit.aes = FALSE,
size = 4
) +
facet_wrap(~ IQF_environment) +
labs(
title = "avg_Y_ratio by IQF_position_slope",
subtitle = "Facet by Environment | Tukey letters + ANOVA + Kruskal–Wallis",
x = "Slope Position",
y = "Average Yield Ratio"
) +
theme_minimal()
###IQF_position_slope with CA_typology
library(ggplot2)
library(dplyr)
library(scales)
# 1. Clean and prepare data
df_bar <- drivers_data %>%
filter(
!is.na(CA_typology),
!is.na(IQF_position_slope),
!CA_typology %in% c("no data", "---", "", "NA"),
!IQF_position_slope %in% c("no data", "---", "", "NA")
)
# 2. Calculate total N per slope category
n_labels <- df_bar %>%
group_by(IQF_position_slope) %>%
summarise(n = n(), .groups = "drop")
# 3. Plot proportion bar chart
ggplot(df_bar, aes(x = IQF_position_slope, fill = CA_typology)) +
geom_bar(position = "fill", color = "black") +
geom_text(
data = n_labels,
aes(x = IQF_position_slope, y = 1.02, label = paste0("N = ", n)),
inherit.aes = FALSE,
size = 4
) +
scale_y_continuous(labels = percent_format(), expand = expansion(mult = c(0, 0.1))) +
labs(
title = "CA_typology distribution by Position on Slope",
subtitle = "Proportions and total N per slope category",
x = "Slope Position",
y = "Proportion of CA Typology",
fill = "CA Typology"
) +
theme_minimal()
# 4. Run statistical tests
tab <- table(df_bar$IQF_position_slope, df_bar$CA_typology)
# Chi-squared test
chi_test <- chisq.test(tab)
print(chi_test)
##
## Pearson's Chi-squared test
##
## data: tab
## X-squared = NaN, df = 18, p-value = NA
# Fisher’s Exact Test with simulation fallback
fisher_test <- fisher.test(tab, simulate.p.value = TRUE, B = 1e5)
print(fisher_test)
##
## Fisher's Exact Test for Count Data with simulated p-value (based on
## 1e+05 replicates)
##
## data: tab
## p-value = 0.1501
## alternative hypothesis: two.sided
NOtes: Conclusions: ###Weed_species_A_combined with Y_ratio
library(ggplot2)
library(dplyr)
library(multcompView)
# 1. Filter and clean data
df_filt <- drivers_data %>%
filter(
!is.na(avg_Y_ratio),
!is.na(Weed_species_A_combined),
!Weed_species_A_combined %in% c("no data", "-", "—", "NA")
) %>%
mutate(Weed_species_A_combined = droplevels(Weed_species_A_combined))
# 2. ANOVA
anova_model <- aov(avg_Y_ratio ~ Weed_species_A_combined, data = df_filt)
anova_p <- summary(anova_model)[[1]][["Pr(>F)"]][1]
# 3. Kruskal–Wallis
kruskal_p <- kruskal.test(avg_Y_ratio ~ Weed_species_A_combined, data = df_filt)$p.value
# 4. Tukey HSD and group letters
tukey <- TukeyHSD(anova_model)
letters <- multcompLetters(tukey$`Weed_species_A_combined`[, "p adj"])$Letters
letters_df <- data.frame(Weed_species_A_combined = names(letters), Letters = letters)
# 5. Group means
means_df <- df_filt %>%
group_by(Weed_species_A_combined) %>%
summarise(mean_val = mean(avg_Y_ratio, na.rm = TRUE), .groups = "drop") %>%
mutate(mean_label = paste0("mean = ", round(mean_val, 2)))
# 6. Combine means + letters
labels_df <- left_join(means_df, letters_df, by = "Weed_species_A_combined")
# 7. Combine test labels
test_label <- paste0(
"ANOVA p = ", signif(anova_p, 3),
"\nKruskal p = ", signif(kruskal_p, 3)
)
# 8. Plot
ggplot(df_filt, aes(x = Weed_species_A_combined, y = avg_Y_ratio)) +
geom_boxplot(outlier.shape = NA, fill = "lightgray") +
geom_jitter(width = 0.2, alpha = 0.6, color = "steelblue") +
# Letters above
geom_text(
data = labels_df,
aes(x = Weed_species_A_combined, y = max(df_filt$avg_Y_ratio, na.rm = TRUE) * 1.02, label = Letters),
inherit.aes = FALSE,
fontface = "bold", size = 5, color = "darkred"
) +
# Means below
geom_text(
data = labels_df,
aes(x = Weed_species_A_combined, y = min(df_filt$avg_Y_ratio, na.rm = TRUE) * 0.95, label = mean_label),
inherit.aes = FALSE,
size = 4
) +
annotate("text",
x = Inf, y = Inf,
hjust = 1.1, vjust = 1.5,
label = test_label,
size = 5) +
labs(
title = "avg_Y_ratio by Weed_species_A_combined",
subtitle = "Tukey HSD letters, group means, ANOVA and Kruskal–Wallis p-values",
x = "Weed Species (Combined)",
y = "Average Yield Ratio"
) +
theme_minimal()
###Weed_species_A_combined with CA_typology
library(ggplot2)
library(dplyr)
library(scales)
# 1. Filter and clean data
df_bar <- drivers_data %>%
filter(
!is.na(CA_typology),
!is.na(Weed_species_A_combined),
!CA_typology %in% c("no data", "---", "", "NA"),
!Weed_species_A_combined %in% c("no data", "—", "", "NA")
)
# 2. Total N per weed category
n_labels <- df_bar %>%
group_by(Weed_species_A_combined) %>%
summarise(n = n(), .groups = "drop")
# 3. Proportional bar plot
ggplot(df_bar, aes(x = Weed_species_A_combined, fill = CA_typology)) +
geom_bar(position = "fill", color = "black") +
geom_text(
data = n_labels,
aes(x = Weed_species_A_combined, y = 1.02, label = paste0("N = ", n)),
inherit.aes = FALSE,
size = 4
) +
scale_y_continuous(labels = percent_format(), expand = expansion(mult = c(0, 0.1))) +
labs(
title = "CA_typology distribution by Dominant Weed Species",
subtitle = "Stacked bars show % CA_typology, with N on top",
x = "Weed Species Group (A combined)",
y = "Proportion of CA Typology",
fill = "CA Typology"
) +
theme_minimal()
# 4. Statistical tests
tab <- table(df_bar$Weed_species_A_combined, df_bar$CA_typology)
# Chi-squared test
chi_test <- chisq.test(tab)
print(chi_test)
##
## Pearson's Chi-squared test
##
## data: tab
## X-squared = NaN, df = 12, p-value = NA
# Fisher’s Exact Test with simulated p-value (for large tables)
fisher_test <- fisher.test(tab, simulate.p.value = TRUE, B = 1e5)
print(fisher_test)
##
## Fisher's Exact Test for Count Data with simulated p-value (based on
## 1e+05 replicates)
##
## data: tab
## p-value = 0.3741
## alternative hypothesis: two.sided
NOtes: Conclusions: ###IQF_environment with Y_ratio
library(ggplot2)
library(dplyr)
library(multcompView)
# 1. Filter and clean data
df_filt <- drivers_data %>%
filter(
!is.na(avg_Y_ratio),
!is.na(IQF_environment),
!IQF_environment %in% c("no data", "---", "", "NA")
) %>%
mutate(IQF_environment = droplevels(factor(IQF_environment)))
df_filt <- df_filt %>%
mutate(IQF_environment = factor(IQF_environment, levels = c("Highland", "Transition", "Mid_Low_Wet", "Mid_Low_Dry")))
# 2. ANOVA
anova_model <- aov(avg_Y_ratio ~ IQF_environment, data = df_filt)
anova_p <- summary(anova_model)[[1]][["Pr(>F)"]][1]
# 3. Kruskal-Wallis test
kruskal_p <- kruskal.test(avg_Y_ratio ~ IQF_environment, data = df_filt)$p.value
# 4. Tukey HSD + Letters
tukey <- TukeyHSD(anova_model)
letters <- multcompLetters(tukey$`IQF_environment`[, "p adj"])$Letters
letters_df <- data.frame(IQF_environment = names(letters), Letters = letters)
# 5. Group means
means_df <- df_filt %>%
group_by(IQF_environment) %>%
summarise(mean_val = mean(avg_Y_ratio, na.rm = TRUE), .groups = "drop") %>%
mutate(mean_label = paste0("mean = ", round(mean_val, 2)))
# 6. Combine letters + means
labels_df <- left_join(means_df, letters_df, by = "IQF_environment")
# 7. Test summary
test_label <- paste0(
"ANOVA p = ", signif(anova_p, 3),
"\nKruskal p = ", signif(kruskal_p, 3)
)
# 8. Plot
ggplot(df_filt, aes(x = IQF_environment, y = avg_Y_ratio)) +
geom_boxplot(outlier.shape = NA, fill = "lightgray") +
geom_jitter(width = 0.2, alpha = 0.6, color = "steelblue") +
# Letters above
geom_text(
data = labels_df,
aes(x = IQF_environment, y = max(df_filt$avg_Y_ratio, na.rm = TRUE) * 1.02, label = Letters),
inherit.aes = FALSE,
fontface = "bold", size = 5, color = "darkred"
) +
# Means below
geom_text(
data = labels_df,
aes(x = IQF_environment, y = min(df_filt$avg_Y_ratio, na.rm = TRUE) * 0.95, label = mean_label),
inherit.aes = FALSE,
size = 4
) +
annotate("text",
x = Inf, y = Inf,
hjust = 1.1, vjust = 1.5,
label = test_label,
size = 5) +
labs(
title = "avg_Y_ratio by IQF_environment",
subtitle = "Tukey HSD letters, group means, ANOVA and Kruskal–Wallis p-values",
x = "Environment Type",
y = "Average Yield Ratio"
) +
theme_minimal()
###IQF_environment with CA_typology
library(ggplot2)
library(dplyr)
library(scales)
# 1. Clean data
df_bar <- drivers_data %>%
filter(
!is.na(CA_typology),
!is.na(IQF_environment),
!CA_typology %in% c("no data", "---", "", "NA"),
!IQF_environment %in% c("no data", "---", "", "NA")
)
df_bar <- df_bar %>%
mutate(IQF_environment = factor(IQF_environment, levels = c("Highland", "Transition", "Mid_Low_Wet", "Mid_Low_Dry")))
# 2. Total N per environment
n_labels <- df_bar %>%
group_by(IQF_environment) %>%
summarise(n = n(), .groups = "drop")
# 3. Proportional bar plot
ggplot(df_bar, aes(x = IQF_environment, fill = CA_typology)) +
geom_bar(position = "fill", color = "black") +
geom_text(
data = n_labels,
aes(x = IQF_environment, y = 1.02, label = paste0("N = ", n)),
inherit.aes = FALSE,
size = 4
) +
scale_y_continuous(labels = percent_format(), expand = expansion(mult = c(0, 0.1))) +
labs(
title = "CA_typology Distribution by Environment",
subtitle = "Proportional bars with total N per environment",
x = "IQF Environment",
y = "Proportion of CA Typology",
fill = "CA Typology"
) +
theme_minimal()
# 4. Statistical tests
tab <- table(df_bar$IQF_environment, df_bar$CA_typology)
# Chi-squared test
chi_test <- chisq.test(tab)
print(chi_test)
##
## Pearson's Chi-squared test
##
## data: tab
## X-squared = 25.835, df = 9, p-value = 0.002174
# Fisher’s Exact Test with simulation
fisher_test <- fisher.test(tab, simulate.p.value = TRUE, B = 1e5)
print(fisher_test)
##
## Fisher's Exact Test for Count Data with simulated p-value (based on
## 1e+05 replicates)
##
## data: tab
## p-value = 0.00761
## alternative hypothesis: two.sided
Total residues, Termite insidence, couch grass as weed and Erosion reduction ### Exploration of Total green residues at harvest variable
library(dplyr)
# Step 1: Calculate total unique blocks per season
block_summary <- CONV_CA_Ratio %>%
group_by(IQR_Season) %>%
summarise(
total_blocks = n_distinct(IQR_block),
blocks_with_residues_gt0 = n_distinct(IQR_block[ICR_total_residues_tn_ha > 0]),
blocks_with_residues_nonNA = n_distinct(IQR_block[!is.na(ICR_total_residues_tn_ha)]),
.groups = "drop"
) %>%
mutate(
perc_gt0 = 100 * blocks_with_residues_gt0 / total_blocks,
perc_nonNA = 100 * blocks_with_residues_nonNA / total_blocks
)
# View the result
print(block_summary)
## # A tibble: 6 × 6
## IQR_Season total_blocks blocks_with_residues…¹ blocks_with_residues…² perc_gt0
## <chr> <int> <int> <int> <dbl>
## 1 23A 538 537 536 99.8
## 2 23B 499 234 499 46.9
## 3 24A 449 419 449 93.3
## 4 24B 463 461 463 99.6
## 5 25A 428 427 428 99.8
## 6 25B 441 441 441 100
## # ℹ abbreviated names: ¹blocks_with_residues_gt0, ²blocks_with_residues_nonNA
## # ℹ 1 more variable: perc_nonNA <dbl>
library(dplyr)
residue_summary <- CONV_CA_Ratio %>%
group_by(IQR_Season) %>%
summarise(
count = sum(!is.na(ICR_total_residues_tn_ha)),
min = min(ICR_total_residues_tn_ha, na.rm = TRUE),
q25 = quantile(ICR_total_residues_tn_ha, 0.25, na.rm = TRUE),
median = median(ICR_total_residues_tn_ha, na.rm = TRUE),
mean = mean(ICR_total_residues_tn_ha, na.rm = TRUE),
q75 = quantile(ICR_total_residues_tn_ha, 0.75, na.rm = TRUE),
max = max(ICR_total_residues_tn_ha, na.rm = TRUE),
sd = sd(ICR_total_residues_tn_ha, na.rm = TRUE)
) %>%
arrange(IQR_Season)
print(residue_summary)
## # A tibble: 6 × 9
## IQR_Season count min q25 median mean q75 max sd
## <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 23A 536 3 11.7 15.9 16.7 20.0 139. 8.37
## 2 23B 499 0 0 0 0.374 0.670 2.82 0.520
## 3 24A 449 0 8 16.3 17.1 24.5 189. 13.8
## 4 24B 463 0 0.6 1.10 1.87 1.86 124. 5.94
## 5 25A 428 0 12.0 20.2 23.4 32.0 141. 15.8
## 6 25B 441 0.0472 0.675 1.06 1.70 1.73 174. 8.27
library(ggplot2)
ggplot(CONV_CA_Ratio, aes(x = IQR_Season, y = ICR_total_residues_tn_ha)) +
geom_boxplot(outlier.color = "red", fill = "skyblue") +
labs(title = "Distribution of Total Residues by Season",
y = "ICR_total_residues_tn_ha", x = "IQR_Season") +
theme_minimal()
library(dplyr)
block_summary <- CONV_CA_Ratio %>%
group_by(IQR_Season) %>%
summarise(
total_blocks = n_distinct(IQR_block),
blocks_with_mulch_gt0 = n_distinct(IQR_block[ICR_mulch_perc_planti > 0]),
blocks_with_mulch_nonNA = n_distinct(IQR_block[!is.na(ICR_mulch_perc_planti)]),
.groups = "drop"
) %>%
mutate(
perc_gt0 = 100 * blocks_with_mulch_gt0 / total_blocks,
perc_nonNA = 100 * blocks_with_mulch_nonNA / total_blocks
)
print(block_summary)
## # A tibble: 6 × 6
## IQR_Season total_blocks blocks_with_mulch_gt0 blocks_with_mulch_nonNA perc_gt0
## <chr> <int> <int> <int> <dbl>
## 1 23A 538 0 538 0
## 2 23B 499 479 499 96.0
## 3 24A 449 374 449 83.3
## 4 24B 463 446 463 96.3
## 5 25A 428 308 426 72.0
## 6 25B 441 430 429 97.5
## # ℹ 1 more variable: perc_nonNA <dbl>
library(dplyr)
mulch_summary <- CONV_CA_Ratio %>%
group_by(IQR_Season) %>%
summarise(
count = sum(!is.na(ICR_mulch_perc_planti)),
min = min(ICR_mulch_perc_planti, na.rm = TRUE),
q25 = quantile(ICR_mulch_perc_planti, 0.25, na.rm = TRUE),
median = median(ICR_mulch_perc_planti, na.rm = TRUE),
mean = mean(ICR_mulch_perc_planti, na.rm = TRUE),
q75 = quantile(ICR_mulch_perc_planti, 0.75, na.rm = TRUE),
max = max(ICR_mulch_perc_planti, na.rm = TRUE),
sd = sd(ICR_mulch_perc_planti, na.rm = TRUE)
) %>%
arrange(IQR_Season)
print(mulch_summary)
## # A tibble: 6 × 9
## IQR_Season count min q25 median mean q75 max sd
## <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 23A 538 0 0 0 0 0 0 0
## 2 23B 499 0 46.5 65 56.2 75 162 28.2
## 3 24A 449 0 15 30 28.4 40 100 22.4
## 4 24B 463 0 30 50 49.6 70 100 26.9
## 5 25A 426 0 0 20 22.5 35 80 20.3
## 6 25B 429 5 40 60 57.3 75 100 22.0
library(ggplot2)
ggplot(CONV_CA_Ratio, aes(x = IQR_Season, y = ICR_mulch_perc_planti)) +
geom_boxplot(outlier.color = "red", fill = "skyblue") +
labs(title = "Distribution of Mulch per Plant by Season",
y = "ICR_mulch_per_planti",
x = "IQR_Season") +
theme_minimal()
model <- lm(Y_ratio ~ ICR_mulch_perc_planti,
data = CONV_CA_Ratio,
na.action = na.omit)
summary(model)
##
## Call:
## lm(formula = Y_ratio ~ ICR_mulch_perc_planti, data = CONV_CA_Ratio,
## na.action = na.omit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.7573 -0.2142 -0.0365 0.1512 3.3184
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.8018102 0.0101032 79.362 <2e-16 ***
## ICR_mulch_perc_planti 0.0003911 0.0002177 1.796 0.0725 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3523 on 2802 degrees of freedom
## (14 observations deleted due to missingness)
## Multiple R-squared: 0.00115, Adjusted R-squared: 0.000794
## F-statistic: 3.227 on 1 and 2802 DF, p-value: 0.07252
library(ggplot2)
library(ggplot2)
library(dplyr)
library(ggplot2)
data_filt <- CONV_CA_Ratio %>%
filter(year != 23)
ggplot(data_filt,
aes(x = ICR_mulch_perc_planti,
y = Y_ratio,
color = crop)) +
geom_point(alpha = 0.6) +
geom_smooth(method = "lm", se = TRUE) +
labs(
title = "Yield Ratio vs Mulch Percentage per Plant by Crop (Year 23 removed)",
x = "ICR_mulch_perc_planti",
y = "Y_ratio",
color = "Crop"
) +
theme_minimal()
### Is there an optimum much cover? Enough to reach close to plateau NO,
it gous up utill 100% coverage
library(dplyr)
library(ggplot2)
library(segmented)
library(purrr)
# 1️⃣ Clean data (remove year 23 if desired)
data_filt <- CONV_CA_Ratio %>%
filter(year != 23,
!is.na(Y_ratio),
!is.na(ICR_mulch_perc_planti),
!is.na(crop))
# 2️⃣ Fit segmented model per crop
seg_models <- data_filt %>%
group_by(crop) %>%
group_modify(~{
# Linear model first
lm_mod <- lm(Y_ratio ~ ICR_mulch_perc_planti, data = .x)
# Segmented model (starting guess at 30%)
seg_mod <- tryCatch(
segmented(lm_mod,
seg.Z = ~ ICR_mulch_perc_planti,
psi = list(ICR_mulch_perc_planti = 30)),
error = function(e) NULL
)
if (is.null(seg_mod)) {
.x$pred <- predict(lm_mod)
.x$breakpoint <- NA
} else {
.x$pred <- predict(seg_mod)
.x$breakpoint <- seg_mod$psi[2]
}
.x
}) %>%
ungroup()
# 3️⃣ Extract breakpoint per crop
breakpoints <- seg_models %>%
group_by(crop) %>%
summarise(breakpoint = first(breakpoint), .groups = "drop")
print(breakpoints)
## # A tibble: 2 × 2
## crop breakpoint
## <chr> <dbl>
## 1 Beans 15.0
## 2 Maize 30.0
# 4️⃣ Plot with facet by crop
ggplot(seg_models,
aes(x = ICR_mulch_perc_planti,
y = Y_ratio)) +
geom_point(alpha = 0.4, color = "steelblue") +
geom_line(aes(y = pred),
color = "red",
size = 1.2) +
facet_wrap(~ crop, scales = "free") +
labs(
title = "Yield Ratio vs Mulch Percentage",
subtitle = "Segmented (Plateau) Model per Crop",
x = "Mulch (%)",
y = "Y_ratio"
) +
theme_minimal(base_size = 14)
We do average of season 24A,B and 25A and B then by Blick ID
library(dplyr)
# 1. Calculate block-level averages
mulch_block_avg <- CONV_CA_Ratio %>%
filter(IQR_Season %in% c("24A", "24B", "25A", "25B")) %>%
group_by(IQR_block) %>%
summarise(
avg_ICR_mulch_perc_planti = as.numeric(
mean(ICR_mulch_perc_planti, na.rm = TRUE)
),
.groups = "drop"
)
# 2. Join into drivers_data
drivers_data <- drivers_data %>%
left_join(mulch_block_avg, by = "IQR_block")
library(dplyr)
block_summary <- CONV_CA_Ratio %>%
group_by(IQR_Season) %>%
summarise(
total_blocks = n_distinct(IQR_block),
blocks_with_mulch_gt0 = n_distinct(IQR_block[ICR_mulch_thicknesscm_planting > 0]),
blocks_with_mulch_nonNA = n_distinct(IQR_block[!is.na(ICR_mulch_thicknesscm_planting)]),
.groups = "drop"
) %>%
mutate(
perc_gt0 = 100 * blocks_with_mulch_gt0 / total_blocks,
perc_nonNA = 100 * blocks_with_mulch_nonNA / total_blocks
)
print(block_summary)
## # A tibble: 6 × 6
## IQR_Season total_blocks blocks_with_mulch_gt0 blocks_with_mulch_nonNA perc_gt0
## <chr> <int> <int> <int> <dbl>
## 1 23A 538 0 538 0
## 2 23B 499 1 0 0.200
## 3 24A 449 323 449 71.9
## 4 24B 463 1 0 0.216
## 5 25A 428 1 0 0.234
## 6 25B 441 1 0 0.227
## # ℹ 1 more variable: perc_nonNA <dbl>
library(ggplot2)
ggplot(CONV_CA_Ratio,
aes(x = IQR_Season,
y = ICR_mulch_thicknesscm_planting)) +
geom_boxplot(outlier.color = "red", fill = "skyblue") +
labs(
title = "Distribution of Mulch Thickness (cm) at Planting by Season",
y = "ICR_mulch_thicknesscm_planting",
x = "IQR_Season"
) +
theme_minimal()
library(dplyr)
library(ggplot2)
data_filt <- CONV_CA_Ratio %>%
filter(!IQR_Season %in% c("23A", "23B", "24B", "25A", "25B"))
ggplot(data_filt,
aes(x = ICR_mulch_thicknesscm_planting,
y = Y_ratio,
color = crop)) +
geom_point(alpha = 0.6) +
geom_smooth(method = "lm", se = TRUE) +
labs(
title = "Yield Ratio vs Mulch Thickness (cm) at Planting by Crop (Year 23 removed)",
x = "ICR_mulch_thicknesscm_planting",
y = "Y_ratio",
color = "Crop"
) +
theme_minimal()
data_filt2 <- CONV_CA_Ratio %>%
filter(!IQR_Season %in% c("23A", "23B", "24B", "25A", "25B"))
ggplot(data_filt2,
aes(x = ICR_mulch_thicknesscm_planting,
y = ICR_mulch_perc_planti,
color = crop)) +
geom_point(alpha = 0.6) +
geom_smooth(method = "lm", se = TRUE) +
labs(
title = "% Soil mulched vs Mulch Thickness (cm) at Planting by Crop (Year 23 removed)",
x = "ICR_mulch_thicknesscm_planting",
y = "% Soil mulched",
color = "Crop"
) +
theme_minimal()
library(dplyr)
block_summary <- CONV_CA_Ratio %>%
group_by(IQR_Season) %>%
summarise(
total_blocks = n_distinct(IQR_block),
blocks_with_termite_gt0 = n_distinct(IQR_block[ICR_Termite_incidence > 0]),
blocks_with_termite_nonNA = n_distinct(IQR_block[!is.na(ICR_Termite_incidence)]),
.groups = "drop"
) %>%
mutate(
perc_gt0 = 100 * blocks_with_termite_gt0 / total_blocks,
perc_nonNA = 100 * blocks_with_termite_nonNA / total_blocks
)
print(block_summary)
## # A tibble: 6 × 6
## IQR_Season total_blocks blocks_with_termite_…¹ blocks_with_termite_…² perc_gt0
## <chr> <int> <int> <int> <dbl>
## 1 23A 538 0 538 0
## 2 23B 499 1 16 0.200
## 3 24A 449 1 449 0.223
## 4 24B 463 75 463 16.2
## 5 25A 428 2 428 0.467
## 6 25B 441 1 0 0.227
## # ℹ abbreviated names: ¹blocks_with_termite_gt0, ²blocks_with_termite_nonNA
## # ℹ 1 more variable: perc_nonNA <dbl>
termite_summary <- CONV_CA_Ratio %>%
group_by(IQR_Season) %>%
summarise(
count = sum(!is.na(ICR_Termite_incidence)),
min = min(ICR_Termite_incidence, na.rm = TRUE),
q25 = quantile(ICR_Termite_incidence, 0.25, na.rm = TRUE),
median = median(ICR_Termite_incidence, na.rm = TRUE),
mean = mean(ICR_Termite_incidence, na.rm = TRUE),
q75 = quantile(ICR_Termite_incidence, 0.75, na.rm = TRUE),
max = max(ICR_Termite_incidence, na.rm = TRUE),
sd = sd(ICR_Termite_incidence, na.rm = TRUE)
) %>%
arrange(IQR_Season)
print(termite_summary)
## # A tibble: 6 × 9
## IQR_Season count min q25 median mean q75 max sd
## <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 23A 538 0 0 0 0 0 0 0
## 2 23B 16 0 0 0 0 0 0 0
## 3 24A 449 0 0 0 0.00548 0 2.46 0.116
## 4 24B 463 0 0 0 0.162 0 1 0.369
## 5 25A 428 0 0 0 0.0444 0 15 0.750
## 6 25B 0 Inf NA NA NaN NA -Inf NA
There is not clear relation
library(dplyr)
library(ggplot2)
# -------------------------------------------------
# 1️⃣ Clean main dataset FIRST
# -------------------------------------------------
CONV_CA_Ratio_clean <- CONV_CA_Ratio %>%
filter(
!is.na(Y_ratio),
!is.na(IQR_Season),
!is.na(IQR_block)
) %>%
mutate(
ICR_Termite_incidence = as.numeric(ICR_Termite_incidence),
ICR_Termite_incidence = ifelse(is.na(ICR_Termite_incidence), 0, ICR_Termite_incidence),
ICR_mulch_perc_planti = as.numeric(ICR_mulch_perc_planti)
)
# -------------------------------------------------
# 2️⃣ Apply 24B termite replacement by block
# -------------------------------------------------
termite_24B <- CONV_CA_Ratio_clean %>%
filter(IQR_Season == "24B") %>%
dplyr::select(IQR_block, ICR_24B = ICR_Termite_incidence)
plot_data <- CONV_CA_Ratio_clean %>%
left_join(termite_24B, by = "IQR_block") %>%
mutate(
ICR_Termite_incidence = ifelse(
IQR_Season == "24B",
ICR_Termite_incidence,
ICR_24B
),
# 🔴 CRITICAL FIX
ICR_Termite_incidence = coalesce(ICR_Termite_incidence, 0),
Termite_group = ifelse(ICR_Termite_incidence > 0,
"Termite > 0",
"Termite = 0"),
Termite_group = factor(Termite_group,
levels = c("Termite = 0", "Termite > 0"))
) %>%
dplyr::select(IQR_block, IQR_Season, Y_ratio,
ICR_Termite_incidence, Termite_group, ICR_mulch_perc_planti)
# -------------------------------------------------
# 3️⃣ Summary statistics
# -------------------------------------------------
summary_stats <- plot_data %>%
group_by(IQR_Season, Termite_group) %>%
summarise(
mean_Y = mean(Y_ratio),
N = n(),
.groups = "drop"
)
# -------------------------------------------------
# 4️⃣ T-tests per season
# -------------------------------------------------
t_test_results <- plot_data %>%
group_by(IQR_Season) %>%
summarise(
p_value = tryCatch(
t.test(Y_ratio ~ Termite_group)$p.value,
error = function(e) NA_real_
),
y_pos = max(Y_ratio) * 1.05,
.groups = "drop"
)
# -------------------------------------------------
# 5️⃣ Plot
# -------------------------------------------------
ggplot(plot_data,
aes(x = Termite_group,
y = Y_ratio,
fill = Termite_group)) +
geom_boxplot(alpha = 0.7, outlier.shape = NA) +
geom_jitter(width = 0.15, alpha = 0.3) +
geom_text(data = summary_stats,
aes(y = mean_Y,
label = paste0("Mean=", round(mean_Y,2),
"\nN=", N)),
vjust = -1,
size = 3) +
geom_text(data = t_test_results,
aes(x = 1.5,
y = y_pos,
label = paste0("p = ", signif(p_value, 3))),
inherit.aes = FALSE,
size = 4) +
facet_wrap(~ IQR_Season) +
theme_minimal(base_size = 14) +
theme(
legend.position = "none",
strip.text = element_text(face = "bold")
) +
labs(
title = "Y_ratio by Termite Incidence Across Seasons",
x = "Termite Incidence",
y = "Y_ratio"
)
Consistently, plots with termite have more mulch. Then while mulch related positively with CA performance, this may mask the negative effect of termites
library(dplyr)
library(ggplot2)
# -------------------------------------------------
# 1️⃣ Clean dataset
# -------------------------------------------------
CONV_CA_Ratio_clean <- CONV_CA_Ratio %>%
filter(
!is.na(ICR_mulch_perc_planti),
!is.na(IQR_Season),
!is.na(IQR_block)
) %>%
mutate(
ICR_Termite_incidence = as.numeric(ICR_Termite_incidence),
ICR_Termite_incidence = ifelse(is.na(ICR_Termite_incidence), 0, ICR_Termite_incidence),
ICR_mulch_perc_planti = as.numeric(ICR_mulch_perc_planti)
)
# -------------------------------------------------
# 2️⃣ Apply 24B termite replacement
# -------------------------------------------------
termite_24B <- CONV_CA_Ratio_clean %>%
filter(IQR_Season == "24B") %>%
dplyr::select(IQR_block, ICR_24B = ICR_Termite_incidence)
plot_data <- CONV_CA_Ratio_clean %>%
left_join(termite_24B, by = "IQR_block") %>%
mutate(
ICR_Termite_incidence = ifelse(
IQR_Season == "24B",
ICR_Termite_incidence,
ICR_24B
),
ICR_Termite_incidence = coalesce(ICR_Termite_incidence, 0),
Termite_group = ifelse(ICR_Termite_incidence > 0,
"Termite > 0",
"Termite = 0"),
Termite_group = factor(Termite_group,
levels = c("Termite = 0", "Termite > 0"))
) %>%
dplyr::select(IQR_block, IQR_Season,
ICR_mulch_perc_planti,
Termite_group)
# -------------------------------------------------
# 3️⃣ Summary statistics (Mulch)
# -------------------------------------------------
summary_stats <- plot_data %>%
group_by(IQR_Season, Termite_group) %>%
summarise(
mean_mulch = mean(ICR_mulch_perc_planti),
N = n(),
.groups = "drop"
)
# -------------------------------------------------
# 4️⃣ T-tests per season (Mulch)
# -------------------------------------------------
t_test_results <- plot_data %>%
group_by(IQR_Season) %>%
summarise(
p_value = tryCatch(
t.test(ICR_mulch_perc_planti ~ Termite_group)$p.value,
error = function(e) NA_real_
),
y_pos = max(ICR_mulch_perc_planti) * 1.05,
.groups = "drop"
)
# -------------------------------------------------
# 5️⃣ Plot
# -------------------------------------------------
ggplot(plot_data,
aes(x = Termite_group,
y = ICR_mulch_perc_planti,
fill = Termite_group)) +
geom_boxplot(alpha = 0.7, outlier.shape = NA) +
geom_jitter(width = 0.15, alpha = 0.3) +
geom_text(data = summary_stats,
aes(y = mean_mulch,
label = paste0("Mean=", round(mean_mulch,1),
"\nN=", N)),
vjust = -1,
size = 3) +
geom_text(data = t_test_results,
aes(x = 1.5,
y = y_pos,
label = paste0("p = ", signif(p_value, 3))),
inherit.aes = FALSE,
size = 4) +
facet_wrap(~ IQR_Season) +
theme_minimal(base_size = 14) +
theme(
legend.position = "none",
strip.text = element_text(face = "bold")
) +
labs(
title = "Mulch Percentage at Planting by Termite Incidence",
x = "Termite Incidence",
y = "Mulch at Planting (%)"
)
## Dive into the problematic weed “couchgrass; Digitaria abyssinica”
This weed is very competitive and hard to erradicate by hand, so very
complicated in CA systems. Current weed variable
“Weed_species_A_combined” only divides into broad leaf, grasses and
whitch weed. So now we will digg more into couch grass specifically
First lets explore when was this variable recorded NOtes: not well recorded in 23A
Extract the precence of couch grass in each block from ca_raw
library(dplyr)
library(stringr)
ca_raw <- read_csv("D:/Mes Donnees/OAF_CIRAD/CA_MoU/data/raw/ca_raw.csv")
library(dplyr)
result <- ca_raw %>%
# Keep only Treat_code C
filter(Treat_code == "C") %>%
# Detect Couch grass (any spelling variant)
mutate(Couch_Presence = ifelse(
str_detect(IQR_Weedtype1_specie,
regex("Couch_grass|couchgrass|Couch grass", ignore_case = FALSE)),
1, 0
)) %>%
# Group by block and season
group_by(IQR_block, IQR_Season) %>%
# If it appears at least once → 1
summarise(Couch_Presence = max(Couch_Presence, na.rm = TRUE),
.groups = "drop")
result
library(dplyr)
season_percentages <- result %>%
group_by(IQR_Season) %>%
summarise(
pct_0 = mean(Couch_Presence == 0, na.rm = TRUE) * 100,
pct_1 = mean(Couch_Presence == 1, na.rm = TRUE) * 100,
pct_negInf = mean(Couch_Presence == -Inf, na.rm = TRUE) * 100,
.groups = "drop"
)
season_percentages
library(dplyr)
library(tidyr)
couch_block_table <- result %>%
# Keep only relevant seasons
filter(IQR_Season %in% c("23A", "23B", "24A", "25A", "25B")) %>%
# Pivot to wide format
pivot_wider(
names_from = IQR_Season,
values_from = Couch_Presence,
names_prefix = "Couch_"
) %>%
# Replace missing with 0 (if a block had no observation that season)
mutate(across(starts_with("Couch_"), ~replace_na(., 0))) %>%
# Create consistency variable
mutate(
Couch_Consit = ifelse(
rowSums(across(starts_with("Couch_"))) >= 3,
1, 0
)
)
couch_block_table
library(dplyr)
percent_ones <- couch_block_table %>%
summarise(
across(
-IQR_block,
~ mean(. == 1, na.rm = TRUE) * 100
)
)
percent_ones
library(dplyr)
CONV_CA_Ratio <- CONV_CA_Ratio %>%
left_join(
couch_block_table %>%
dplyr::select(IQR_block, Couch_Consit),
by = "IQR_block"
)
Now we evaluate the relation among having couch grass (at least 2 seasons to make it more robust) with the overall average Y_ratio of a
library(dplyr)
library(ggplot2)
library(ggpubr)
plot_data <- CONV_CA_Ratio %>%
mutate(Couch_Consit = factor(Couch_Consit, levels = c(0, 1)))
# per-facet positions
y_positions <- plot_data %>%
group_by(IQF_environment) %>%
summarise(
y_max = max(Y_ratio, na.rm = TRUE),
y_label = y_max * 0.8,
y_pval = y_max * 0.6,
.groups = "drop"
)
# summary labels (N, Mean, %>=1)
summary_labels <- plot_data %>%
group_by(IQF_environment, Couch_Consit) %>%
summarise(
N = n(),
Mean = mean(Y_ratio, na.rm = TRUE),
Pct_ge1 = mean(Y_ratio >= 1, na.rm = TRUE) * 100,
label = paste0(
"N = ", N,
"\nMean = ", round(Mean, 2),
"\n≥1 = ", round(Pct_ge1, 1), "%"
),
.groups = "drop"
) %>%
left_join(y_positions, by = "IQF_environment")
# p-values per environment
pvals <- plot_data %>%
group_by(IQF_environment) %>%
summarise(p = tryCatch(t.test(Y_ratio ~ Couch_Consit)$p.value, error = function(e) NA_real_),
.groups = "drop") %>%
left_join(y_positions, by = "IQF_environment") %>%
mutate(
group1 = "0",
group2 = "1",
y.position = y_pval,
p.signif = case_when(
is.na(p) ~ "NA",
p <= 0.001 ~ "***",
p <= 0.01 ~ "**",
p <= 0.05 ~ "*",
TRUE ~ "ns"
)
)
ggplot(plot_data, aes(x = Couch_Consit, y = Y_ratio, fill = Couch_Consit)) +
geom_boxplot(alpha = 0.7, outlier.shape = NA) +
geom_text(
data = summary_labels,
aes(x = Couch_Consit, y = y_label, label = label),
inherit.aes = FALSE,
size = 3.2
) +
stat_pvalue_manual(
pvals,
label = "p.signif",
xmin = "group1",
xmax = "group2",
y.position = "y.position",
tip.length = 0.01
) +
facet_wrap(~ IQF_environment, scales = "free_y") +
labs(x = "Couch Consistency", y = "Y_ratio") +
coord_cartesian(clip = "off") +
theme_bw() +
theme(
legend.position = "none",
strip.text = element_text(face = "bold"),
plot.margin = ggplot2::margin(10, 10, 30, 10)
)
## now lets add couch grass precence to drivers
library(dplyr)
drivers_data <- drivers_data %>%
left_join(
couch_block_table %>%
dplyr::select(IQR_block, Couch_Consit),
by = "IQR_block"
)
library(dplyr)
library(tidyr)
unique_weeds <- weeds_data %>%
pivot_longer(
cols = starts_with("Weed_"),
names_to = "Weed_number",
values_to = "Weed_name"
) %>%
filter(!is.na(Weed_name)) %>%
distinct(Weed_name) %>%
arrange(Weed_name)
unique_weeds
library(dplyr)
weeds_data <- weeds_data %>%
rowwise() %>%
mutate(
N_weeds = n_distinct(
c_across(starts_with("Weed_"))[
!c_across(starts_with("Weed_")) %in% c(NA, "Other", "--")
]
)
) %>%
ungroup()
library(ggplot2)
library(dplyr)
ggplot(weeds_data, aes(x = N_weeds)) +
geom_bar(fill = "steelblue", color = "black") +
facet_wrap(~ IQR_Season) +
theme_bw() +
labs(
x = "Number of Unique Weeds (N_weeds)",
y = "Frequency (Number of Plots)",
title = "Frequency Distribution of Weed Richness by Season"
)
### NOw we add two new variables that is the count of broad leaf and of
grasses
library(dplyr)
library(tidyr)
weed_counts <- weeds_data %>%
mutate(row_id = row_number()) %>% # unique row ID
pivot_longer(
cols = starts_with("Weed_"),
values_to = "Weed_name"
) %>%
filter(
!is.na(Weed_name),
Weed_name != "Other",
Weed_name != "--"
) %>%
left_join(
weeds %>% dplyr::select(Name_Trial, Category),
by = c("Weed_name" = "Name_Trial")
)
weed_summary <- weed_counts %>%
group_by(row_id) %>%
summarise(
N_Broadleaf = sum(Category == "Broadleaf", na.rm = TRUE),
N_Grass = sum(Category == "Grass", na.rm = TRUE),
.groups = "drop"
)
weeds_data <- weeds_data %>%
mutate(row_id = row_number()) %>%
left_join(weed_summary, by = "row_id") %>%
dplyr::select(-row_id)
library(dplyr)
weeds_data <- weeds_data %>%
mutate(
CouchGrass = as.integer(
rowSums(across(starts_with("Weed_"), ~ . == "Couch_grass"),
na.rm = TRUE) > 0
)
)
1 if 1 N_weeds = 0 1.5 if N_Broadleaf = 1 & N_Grass = 0 2 if N_Broadleaf = >2 & N_Grass = 0 3 if N_Broadleaf = 0 & N_Grass = 1 & CouchGrass is = 0 3.5 if N_Broadleaf > 0 & N_Grass = 1 & CouchGrass is = 0 4 N_Grass > 1 & CouchGrass is = 0 6 if N_Broadleaf = 0 & N_Grass = 1 & CouchGrass is = 1 6.5 if N_Broadleaf > 0 & N_Grass = 1 & CouchGrass is = 1 8 if N_Grass > 1 & CouchGrass is = 1 ### Same but for which weed
library(dplyr)
weeds_data <- weeds_data %>%
mutate(
Whitch_weed = as.integer(
rowSums(across(starts_with("Weed_"), ~ . == "Witchweed"),
na.rm = TRUE) > 0
)
)
library(dplyr)
weeds_data <- weeds_data %>%
mutate(
Weed_Overall = case_when(
# 1) Sin malezas
N_weeds == 0 ~ 1,
# Solo broadleaf, sin grass
N_Broadleaf == 1 & N_Grass == 0 ~ 1.5,
N_Broadleaf >= 2 & N_Grass == 0 ~ 2,
# Grass = 1, sin Couch grass
N_Broadleaf == 0 & N_Grass == 1 & CouchGrass == 0 ~ 3,
N_Broadleaf > 0 & N_Grass == 1 & CouchGrass == 0 ~ 3.5,
# Grass > 1, sin Couch grass
N_Grass > 1 & CouchGrass == 0 ~ 4,
# Grass = 1, con Couch grass
N_Broadleaf == 0 & N_Grass == 1 & CouchGrass == 1 ~ 6,
N_Broadleaf > 0 & N_Grass == 1 & CouchGrass == 1 ~ 6.5,
# Grass > 1, con Couch grass
N_Grass > 1 & CouchGrass == 1 ~ 8,
# Si algo no entra en ninguna regla (por ejemplo N_Grass==0 & N_Broadleaf==0 pero N_weeds>0)
TRUE ~ NA_real_
)
)
library(dplyr)
weeds_data <- weeds_data %>%
filter(
IQR_Season != "23A",
!is.na(Weed_1)
)
library(dplyr)
# 1) Average Weed_Overall per block (from weeds_data)
weed_overall_block <- weeds_data %>%
group_by(IQR_block) %>%
summarise(
Weed_Overall_avg = mean(Weed_Overall, na.rm = TRUE),
.groups = "drop"
)
# 2) Paste into CONV_CA_Ratio (keeps only IQR_block in CONV_CA_Ratio)
CONV_CA_Ratio <- CONV_CA_Ratio %>%
left_join(weed_overall_block, by = "IQR_block")
library(ggpubr)
ggplot(CONV_CA_Ratio,
aes(x = Weed_Overall_avg, y = Y_ratio)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE) +
stat_regline_equation() +
stat_cor(method = "pearson") +
facet_wrap(~ IQF_environment) +
theme_bw()
library(dplyr)
drivers_data <- drivers_data %>%
left_join(
CONV_CA_Ratio %>%
dplyr::select(IQR_block, Weed_Overall_avg) %>%
distinct(IQR_block, .keep_all = TRUE),
by = "IQR_block"
)
if the weed is in al least on of the seasons we include it in the block
library(dplyr)
whitchweed_block <- weeds_data %>%
group_by(IQR_block) %>%
summarise(
Whitch_weed = as.integer(any(Whitch_weed == 1, na.rm = TRUE)),
.groups = "drop"
)
whitchweed_block
library(dplyr)
drivers_data <- drivers_data %>%
left_join(
whitchweed_block %>%
distinct(IQR_block, .keep_all = TRUE),
by = "IQR_block"
)
library(dplyr)
block_summary <- CONV_CA_Ratio %>%
group_by(IQR_Season) %>%
summarise(
total_blocks = n_distinct(IQR_block),
blocks_with_sh_residues_gt0 = n_distinct(IQR_block[ICR_sh_residues_tn_ha > 0]),
blocks_with_sh_residues_nonNA = n_distinct(IQR_block[!is.na(ICR_sh_residues_tn_ha)]),
.groups = "drop"
) %>%
mutate(
perc_gt0 = 100 * blocks_with_sh_residues_gt0 / total_blocks,
perc_nonNA = 100 * blocks_with_sh_residues_nonNA / total_blocks
)
print(block_summary)
## # A tibble: 6 × 6
## IQR_Season total_blocks blocks_with_sh_resid…¹ blocks_with_sh_resid…² perc_gt0
## <chr> <int> <int> <int> <dbl>
## 1 23A 538 0 538 0
## 2 23B 499 1 0 0.200
## 3 24A 449 303 449 67.5
## 4 24B 463 199 463 43.0
## 5 25A 428 278 428 65.0
## 6 25B 441 1 0 0.227
## # ℹ abbreviated names: ¹blocks_with_sh_residues_gt0,
## # ²blocks_with_sh_residues_nonNA
## # ℹ 1 more variable: perc_nonNA <dbl>
sh_residue_summary <- CONV_CA_Ratio %>%
group_by(IQR_Season) %>%
summarise(
count = sum(!is.na(ICR_sh_residues_tn_ha)),
min = min(ICR_sh_residues_tn_ha, na.rm = TRUE),
q25 = quantile(ICR_sh_residues_tn_ha, 0.25, na.rm = TRUE),
median = median(ICR_sh_residues_tn_ha, na.rm = TRUE),
mean = mean(ICR_sh_residues_tn_ha, na.rm = TRUE),
q75 = quantile(ICR_sh_residues_tn_ha, 0.75, na.rm = TRUE),
max = max(ICR_sh_residues_tn_ha, na.rm = TRUE),
sd = sd(ICR_sh_residues_tn_ha, na.rm = TRUE)
) %>%
arrange(IQR_Season)
print(sh_residue_summary)
## # A tibble: 6 × 9
## IQR_Season count min q25 median mean q75 max sd
## <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 23A 538 0 0 0 0 0 0 0
## 2 23B 0 Inf NA NA NaN NA -Inf NA
## 3 24A 449 0 0 3.21 4.57 7.23 33.1 5.21
## 4 24B 463 0 0 0 0.785 0.695 12 1.63
## 5 25A 428 0 0 1.01 3.96 4.61 120 9.07
## 6 25B 0 Inf NA NA NaN NA -Inf NA
library(ggplot2)
ggplot(CONV_CA_Ratio,
aes(x = IQR_Season,
y = ICR_sh_residues_tn_ha)) +
geom_boxplot(outlier.color = "red", fill = "skyblue") +
labs(
title = "Distribution of SH Residues (t/ha) by Season",
y = "ICR_sh_residues_tn_ha",
x = "IQR_Season"
) +
theme_minimal()
model <- lm(Y_ratio ~ ICR_sh_residues_tn_ha,
data = CONV_CA_Ratio,
na.action = na.omit)
summary(model)
##
## Call:
## lm(formula = Y_ratio ~ ICR_sh_residues_tn_ha, data = CONV_CA_Ratio,
## na.action = na.omit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.73520 -0.19663 -0.02359 0.15158 2.93672
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.8013764 0.0079983 100.193 <2e-16 ***
## ICR_sh_residues_tn_ha -0.0002543 0.0013606 -0.187 0.852
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3217 on 1876 degrees of freedom
## (940 observations deleted due to missingness)
## Multiple R-squared: 1.862e-05, Adjusted R-squared: -0.0005144
## F-statistic: 0.03494 on 1 and 1876 DF, p-value: 0.8518
library(dplyr)
library(ggplot2)
data_filt <- CONV_CA_Ratio %>%
filter(!IQR_Season %in% c("23A", "23B", "24B", "25B"))
ggplot(data_filt,
aes(x = ICR_sh_residues_tn_ha,
y = Y_ratio,
color = crop)) +
geom_point(alpha = 0.6) +
geom_smooth(method = "lm", se = TRUE) +
labs(
title = "Yield Ratio vs SH Residues (t/ha) by Crop (Year 23 removed)",
x = "ICR_sh_residues_tn_ha",
y = "Y_ratio",
color = "Crop"
) +
theme_minimal()
library(dplyr)
# --------------------------------------------------
# 1️⃣ Create unique block-level agzone table
# --------------------------------------------------
agzone_block <- CONV_CA_Ratio %>%
filter(!is.na(IQR_block), !is.na(IQF_agzone)) %>%
group_by(IQR_block) %>%
summarise(
IQF_agzone = first(IQF_agzone),
.groups = "drop"
)
# Optional check (recommended once)
# Verify there is only ONE agzone per block
check_agzone <- CONV_CA_Ratio %>%
group_by(IQR_block) %>%
summarise(n_agzone = n_distinct(IQF_agzone)) %>%
filter(n_agzone > 1)
print(check_agzone) # Should return 0 rows
## # A tibble: 21 × 2
## IQR_block n_agzone
## <chr> <int>
## 1 102_10 2
## 2 110_10 2
## 3 115_11 2
## 4 116_10 2
## 5 116_3 2
## 6 132_6 2
## 7 133_1 2
## 8 16_10 2
## 9 22_10 2
## 10 31_10 2
## # ℹ 11 more rows
# --------------------------------------------------
# 2️⃣ Join into drivers_data
# --------------------------------------------------
drivers_data <- drivers_data %>%
left_join(agzone_block, by = "IQR_block")
# --------------------------------------------------
# 3️⃣ Verify no duplication occurred
# --------------------------------------------------
cat("Rows in drivers_data:", nrow(drivers_data), "\n")
## Rows in drivers_data: 473
write.csv(drivers_data, "D:/Mes Donnees/OAF_CIRAD/CA_MoU/data/processed/drivers_data.csv", row.names = FALSE)
We used alternative reponse variables; 1) avg_Y_ratio: the yield ration of CA:CONV averaged across seasons for each plot (farmer). So each farm has one value. 2) Y_ratio_2425: like 1 but we exluded the year 2023. 4) CA_typology: this is the response variable that was created based on what we speculate could be an acceptable yield perfromance across the 6 seasos for CA: Acceptable: Average ≥ 0.95 but at least one season < 0.95 // Poor: Average between 0.85 and 0.95 // Very Poor: Average ≤ 0.85
Database:drivers_data Kept variables: Couch_Grass, ICR_Org_C_avg num: ICR_Avail_P_avg num: ICR_arable_land_avg num: ICR_rainfall_avg num: Comp_amount_corr_quali num: ICF_planting_date num: IQR_TF_tubura_client Factor: IQR_plot_fert_quality Factor: IQR_soil_texture Factor w/ 4 levels: IQR_soil_color Factor w/ 6 levels: IQF_position_slope Factor w/ 7 levels: Weed_species_A_combined: Factor w/ 5 levels: IQF_environment chr “Mid_Low_Wet” “Mid_Low_Wet” “Mid_Low_Wet” “Mid_Low_Wet” avg_ICR_mulch_perc_planti,
RF description:
RF results: Effects on CA:CONV ration averaged along seasons. If we use all the variables we have a very good fit; % Var explained: 58. However, some of those variables would be difficult to measure or predict at scale: soil lab results (N, K, O, OC), planting dates and rains. So we remove those to see the capacity of the model and still provides high presition: % Var explained: 51. Using same list of variables on the last (removing soil test planting dates and rains) but expluding yields from 2023 A and B (protocols changes significantly after this year); % Var explained: 54.64. Looking only at maize, with the reduced list of variables: % Var explained: 44
Effects on Yield performance type (very poor, poor or acceptable) It is clear that we can effectively target plots with good chance of success of CA (81% success rate with th elimited list of “easy to measure drivers”). Sure, we only target 12% of the plots (that is about 25% of the farmers since farmers have 2.5 plots on average). We need to check the % of farmers with acceptable performance with other practices (ferlizer, seed, lime) and se how close we are. We can then regulate how restrictive we are to find a balance among Targeted plots vs. Success rate
Using all the variables: Total Test Cases: 130 Actual Acceptable (%): 22.31 % Farmers Targeted (%): 11.54 % Precision Acceptable (%): 86.67 % Recall Acceptable (%): 44.83 %
Using the easy to record ones: Total Test Cases: 130 Actual Acceptable (%): 22.31 % Farmers Targeted (%): 12.31 % Precision Acceptable (%): 81.25 % Recall Acceptable (%): 44.83 %
# ---------------------------
# Load required packages
# ---------------------------
library(randomForest)
library(dplyr)
# ---------------------------
# Step 1: Select predictors and response
# ---------------------------
rf_data <- drivers_data %>%
dplyr::select(
ICR_N_perc_23A,
Couch_Consit,
Whitch_weed,
#ICR_K_perc_23A,
K_factor,
ICF_Alt_avg,
avg_Y_ratio,
ICR_Org_C_avg,
ICR_Avail_P_avg,
#P_factor,
ICR_arable_land_avg,
ICR_rainfall_avg,
Comp_amount_corr_quali,
ICF_planting_date,
Slope,
#IQR_TF_tubura_client,
#IQR_plot_fert_quality,
IQR_soil_texture,
#IQR_soil_color,
#IQF_position_slope,
#Weed_species_A_combined,
#Weed_Overall_avg,
avg_ICR_mulch_perc_planti,
IQF_agzone,
IQF_environment
) %>%
mutate(across(where(is.character), as.factor))
# ---------------------------
# Step 2: Train-test split (85/15)
# ---------------------------
set.seed(42)
n <- nrow(rf_data)
train_indices <- sample(seq_len(n), size = 0.9 * n)
train_data <- rf_data[train_indices, ]
test_data <- rf_data[-train_indices, ]
# ---------------------------
# Step 3: Remove missing values from training set
# ---------------------------
train_data_clean <- train_data %>%
filter(complete.cases(.))
# Check missing values
na_check <- colSums(is.na(train_data_clean))
print(na_check)
## ICR_N_perc_23A Couch_Consit Whitch_weed
## 0 0 0
## K_factor ICF_Alt_avg avg_Y_ratio
## 0 0 0
## ICR_Org_C_avg ICR_Avail_P_avg ICR_arable_land_avg
## 0 0 0
## ICR_rainfall_avg Comp_amount_corr_quali ICF_planting_date
## 0 0 0
## Slope IQR_soil_texture avg_ICR_mulch_perc_planti
## 0 0 0
## IQF_agzone IQF_environment
## 0 0
# ---------------------------
# Step 4: Fit Random Forest Model
# ---------------------------
set.seed(123)
rf_model <- randomForest(
avg_Y_ratio ~ .,
data = train_data_clean,
importance = TRUE,
mtry = 6,
ntree = 1000
)
# ---------------------------
# Step 5: Model Summary & Importance
# ---------------------------
print(rf_model)
##
## Call:
## randomForest(formula = avg_Y_ratio ~ ., data = train_data_clean, importance = TRUE, mtry = 6, ntree = 1000)
## Type of random forest: regression
## Number of trees: 1000
## No. of variables tried at each split: 6
##
## Mean of squared residuals: 0.02915448
## % Var explained: 22.48
# Variable importance
randomForest::importance(rf_model)
## %IncMSE IncNodePurity
## ICR_N_perc_23A 5.478877 0.77136654
## Couch_Consit 13.877696 0.46494732
## Whitch_weed 3.761944 0.09099631
## K_factor 8.515755 0.29039145
## ICF_Alt_avg 20.517271 1.40886559
## ICR_Org_C_avg 15.196437 1.14089425
## ICR_Avail_P_avg 9.326057 1.44706298
## ICR_arable_land_avg 8.138493 1.20997879
## ICR_rainfall_avg 31.073456 1.35473086
## Comp_amount_corr_quali 10.940014 1.34074224
## ICF_planting_date 15.314794 1.23020654
## Slope 8.927881 0.59760106
## IQR_soil_texture 4.237955 0.21121077
## avg_ICR_mulch_perc_planti 11.372327 1.01379768
## IQF_agzone 37.099021 1.83227429
## IQF_environment 10.002677 0.26255174
# Plot importance (MSE Increase)
varImpPlot(rf_model, type = 1, main = "Variable Importance (MSE Increase)")
## RF with outpot variable the average Y_ratio_2425 on each block
# ---------------------------
# Load required packages
# ---------------------------
library(randomForest)
library(dplyr)
# ---------------------------
# Step 1: Select predictors and response
# ---------------------------
rf_data <- drivers_data %>%
dplyr::select(
#ICR_N_perc_23A,
Couch_Consit,
#ICR_K_perc_23A,
K_factor,
ICF_Alt_avg,
Y_ratio_2425,
ICR_Org_C_avg,
ICR_Avail_P_avg,
#P_factor,
ICR_arable_land_avg,
ICR_rainfall_avg,
Comp_amount_corr_quali,
ICF_planting_date,
Slope,
#slope_cat.y,
#IQR_TF_tubura_client,
#IQR_plot_fert_quality,
IQR_soil_texture,
#IQR_soil_color,
#IQF_position_slope,
#Weed_species_A_combined,
avg_ICR_mulch_perc_planti,
IQF_agzone,
IQF_environment
) %>%
mutate(across(where(is.character), as.factor))
# ---------------------------
# Step 2: Train-test split (85/15)
# ---------------------------
set.seed(42)
n <- nrow(rf_data)
train_indices <- sample(seq_len(n), size = 0.9 * n)
train_data <- rf_data[train_indices, ]
test_data <- rf_data[-train_indices, ]
# ---------------------------
# Step 3: Remove missing values from training set
# ---------------------------
train_data_clean <- train_data %>%
filter(complete.cases(.))
# Check missing values
na_check <- colSums(is.na(train_data_clean))
print(na_check)
## Couch_Consit K_factor ICF_Alt_avg
## 0 0 0
## Y_ratio_2425 ICR_Org_C_avg ICR_Avail_P_avg
## 0 0 0
## ICR_arable_land_avg ICR_rainfall_avg Comp_amount_corr_quali
## 0 0 0
## ICF_planting_date Slope IQR_soil_texture
## 0 0 0
## avg_ICR_mulch_perc_planti IQF_agzone IQF_environment
## 0 0 0
# ---------------------------
# Step 4: Fit Random Forest Model
# ---------------------------
set.seed(123)
rf_model <- randomForest(
Y_ratio_2425 ~ .,
data = train_data_clean,
importance = TRUE,
ntree = 1000
)
# ---------------------------
# Step 5: Model Summary & Importance
# ---------------------------
print(rf_model)
##
## Call:
## randomForest(formula = Y_ratio_2425 ~ ., data = train_data_clean, importance = TRUE, ntree = 1000)
## Type of random forest: regression
## Number of trees: 1000
## No. of variables tried at each split: 4
##
## Mean of squared residuals: 0.03994959
## % Var explained: 25.73
# Variable importance
randomForest::importance(rf_model)
## %IncMSE IncNodePurity
## Couch_Consit 9.511177 0.4126721
## K_factor 7.867600 0.4409009
## ICF_Alt_avg 19.688306 2.0427041
## ICR_Org_C_avg 13.575540 1.5316198
## ICR_Avail_P_avg 7.718358 2.4426599
## ICR_arable_land_avg 12.364430 2.0911293
## ICR_rainfall_avg 22.983407 1.4899004
## Comp_amount_corr_quali 8.700705 1.8078609
## ICF_planting_date 14.269789 1.8932166
## Slope 12.765729 1.2326012
## IQR_soil_texture 5.821194 0.3596684
## avg_ICR_mulch_perc_planti 13.679202 1.6956865
## IQF_agzone 31.513485 2.6277166
## IQF_environment 12.478549 0.5847764
# Plot importance (MSE Increase)
varImpPlot(rf_model, type = 1, main = "Variable Importance (MSE Increase)")
Modulate sensitivity and presition
library(dplyr)
library(randomForest)
library(ggplot2)
# ======================
# CONTROL PANEL
# ======================
train_frac <- 0.85 # training fraction (try 0.7–0.9)
acceptable_weight <- 1 # try: 1, 2, 3, 5
accept_threshold <- 0.7 # try: 0.5–0.8
set.seed(42)
# ======================
# Step 1: Prepare RF dataset
# ======================
rf_data <- drivers_data %>%
dplyr::select(
#ICR_N_perc_23A,
#ICR_K_perc_23A,
#K_factor,
Couch_Consit,
#ICF_Alt_avg,
CA_typology,
#ICR_Org_C_avg,
#ICR_Avail_P_avg,
#P_factor,
#ICR_arable_land_avg,
#ICR_rainfall_avg,
Comp_amount_corr_quali,
ICF_planting_date,
Slope,
#IQR_TF_tubura_client,
#IQR_plot_fert_quality,
IQR_soil_texture,
IQR_soil_color,
IQF_position_slope,
#Weed_species_A_combined,
#Weed_Overall_avg,
avg_ICR_mulch_perc_planti,
IQF_agzone,
#IQF_environment
) %>%
mutate(
across(where(is.character), as.factor),
# Combine Good + Acceptable
CA_typology = as.character(CA_typology),
CA_typology = ifelse(CA_typology == "Good", "Acceptable", CA_typology),
CA_typology = factor(CA_typology,
levels = c("Very Poor", "Poor", "Acceptable"))
)
cat("Total RF rows:", nrow(rf_data), "\n")
## Total RF rows: 473
# ======================
# Step 2: Train / Test split
# ======================
n <- nrow(rf_data)
train_indices <- sample(seq_len(n), size = train_frac * n)
train_data <- rf_data[train_indices, ]
test_data <- rf_data[-train_indices, ]
# ======================
# Step 3: Remove rows with NA
# ======================
train_data_clean <- train_data %>% filter(complete.cases(.))
test_data_clean <- test_data %>% filter(complete.cases(.))
cat("Training cases:", nrow(train_data_clean), "\n")
## Training cases: 400
cat("Test cases:", nrow(test_data_clean), "\n")
## Test cases: 70
# ======================
# Step 4: Random Forest (TRAIN ONLY)
# ======================
class_weights <- c(
"Very Poor" = 1,
"Poor" = 1,
"Acceptable" = acceptable_weight
)
set.seed(123)
rf_model <- randomForest(
CA_typology ~ .,
data = train_data_clean,
ntree = 1000,
importance = TRUE,
classwt = class_weights
)
print(rf_model)
##
## Call:
## randomForest(formula = CA_typology ~ ., data = train_data_clean, ntree = 1000, importance = TRUE, classwt = class_weights)
## Type of random forest: classification
## Number of trees: 1000
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 44.75%
## Confusion matrix:
## Very Poor Poor Acceptable class.error
## Very Poor 201 18 13 0.1336207
## Poor 66 6 12 0.9285714
## Acceptable 57 13 14 0.8333333
# Variable importance
varImpPlot(rf_model, type = 1, main = "Variable Importance (MDA)")
# ======================
# Step 5: Predict on INDEPENDENT TEST SET (probabilities)
# ======================
prob_test <- predict(rf_model, newdata = test_data_clean, type = "prob")
pred_test <- ifelse(
prob_test[, "Acceptable"] > accept_threshold,
"Acceptable",
colnames(prob_test)[apply(prob_test, 1, which.max)]
)
pred_test <- factor(pred_test, levels = levels(test_data_clean$CA_typology))
conf_mat <- table(
Actual = test_data_clean$CA_typology,
Predicted = pred_test
)
print(conf_mat)
## Predicted
## Actual Very Poor Poor Acceptable
## Very Poor 35 7 2
## Poor 10 4 0
## Acceptable 8 2 2
# ======================
# Step 6: Metrics (TEST DATA)
# ======================
total_cases <- sum(conf_mat)
actual_acceptable <- sum(conf_mat["Acceptable", ])
predicted_acceptable <- sum(conf_mat[, "Acceptable"])
true_acceptable <- conf_mat["Acceptable", "Acceptable"]
perc_actual_acceptable <- actual_acceptable / total_cases * 100
perc_farmers_targeted <- predicted_acceptable / total_cases * 100
precision_acceptable <- true_acceptable / predicted_acceptable * 100
recall_acceptable <- true_acceptable / actual_acceptable * 100
# ======================
# Variable Importance Report
# ======================
importance_mat <- randomForest::importance(rf_model, type = 1)
importance_df <- as.data.frame(importance_mat)
importance_df$Variable <- rownames(importance_df)
importance_df <- importance_df %>%
arrange(desc(MeanDecreaseAccuracy))
print(importance_df)
## MeanDecreaseAccuracy Variable
## IQF_agzone 23.867107 IQF_agzone
## Comp_amount_corr_quali 11.904357 Comp_amount_corr_quali
## Couch_Consit 11.091370 Couch_Consit
## IQR_soil_color 10.269540 IQR_soil_color
## avg_ICR_mulch_perc_planti 10.089181 avg_ICR_mulch_perc_planti
## Slope 7.888358 Slope
## IQR_soil_texture 7.542038 IQR_soil_texture
## IQF_position_slope 7.434393 IQF_position_slope
## ICF_planting_date 6.505443 ICF_planting_date
# ======================
# Per-class Variable Importance Report
# ======================
imp_full <- as.data.frame(randomForest::importance(rf_model))
imp_full$Variable <- rownames(imp_full)
# Reorder columns for readability
imp_full <- imp_full %>%
dplyr::select(Variable, everything())
print(imp_full)
## Variable Very Poor Poor
## Couch_Consit Couch_Consit 12.791466 -1.2491152
## Comp_amount_corr_quali Comp_amount_corr_quali 14.155458 -4.1272110
## ICF_planting_date ICF_planting_date 4.663397 0.4991434
## Slope Slope 7.517825 -0.8686574
## IQR_soil_texture IQR_soil_texture 5.085424 0.9724830
## IQR_soil_color IQR_soil_color 9.357470 3.4593709
## IQF_position_slope IQF_position_slope 6.987894 -1.5381527
## avg_ICR_mulch_perc_planti avg_ICR_mulch_perc_planti 13.961329 -3.5958032
## IQF_agzone IQF_agzone 20.960818 6.9210858
## Acceptable MeanDecreaseAccuracy MeanDecreaseGini
## Couch_Consit 2.712832 11.091370 8.583545
## Comp_amount_corr_quali 5.976400 11.904357 51.272968
## ICF_planting_date 5.474326 6.505443 47.399878
## Slope 5.248742 7.888358 30.511996
## IQR_soil_texture 6.383877 7.542038 10.119882
## IQR_soil_color 3.386498 10.269540 14.758805
## IQF_position_slope 5.680219 7.434393 21.607852
## avg_ICR_mulch_perc_planti -1.110002 10.089181 44.824044
## IQF_agzone 8.517459 23.867107 34.404995
# Optional: export
# write.csv(importance_df, "variable_importance.csv", row.names = FALSE)
# ======================
# Step 7: Print summary
# ======================
cat("\n================ TEST SET MODEL SUMMARY ================\n")
##
## ================ TEST SET MODEL SUMMARY ================
cat("Train fraction:", train_frac, "\n")
## Train fraction: 0.85
cat("Acceptable weight:", acceptable_weight, "\n")
## Acceptable weight: 1
cat("Acceptable threshold:", accept_threshold, "\n\n")
## Acceptable threshold: 0.7
cat("Total Test Cases:", total_cases, "\n")
## Total Test Cases: 70
cat("Actual Acceptable (%):", round(perc_actual_acceptable, 2), "%\n")
## Actual Acceptable (%): 17.14 %
cat("Farmers Targeted (%):", round(perc_farmers_targeted, 2), "%\n")
## Farmers Targeted (%): 5.71 %
cat("Precision Acceptable (%):", round(precision_acceptable, 2), "%\n")
## Precision Acceptable (%): 50 %
cat("Recall Acceptable (%):", round(recall_acceptable, 2), "%\n")
## Recall Acceptable (%): 16.67 %
cat("======================================================\n")
## ======================================================
library(randomForest)
# Extract tree number 1
tree_1 <- getTree(rf_model, k = 2, labelVar = TRUE)
head(tree_1)
library(rpart)
library(rpart.plot)
tree_model <- rpart(
CA_typology ~ .,
data = train_data_clean,
method = "class"
)
rpart.plot(tree_model)
library(dplyr)
library(randomForest)
library(ggplot2)
# ======================
# CONTROL PANEL
# ======================
train_frac <- 0.85 # training fraction (try 0.7–0.9)
acceptable_weight <- 0.7 # try: 1, 2, 3, 5
accept_threshold <- 0.7 # try: 0.5–0.8
set.seed(42)
# ======================
# Step 1: Prepare RF dataset
# ======================
rf_data <- drivers_data %>%
dplyr::select(
#ICR_N_perc_23A,
#ICR_K_perc_23A,
#K_factor,
#Couch_Grass,
Couch_Consit,
#Whitch_weed,
#ICF_Alt_avg,
CA_typology_09,
#ICR_Org_C_avg,
#ICR_Avail_P_avg,
#P_factor,
#ICR_arable_land_avg,
#ICR_rainfall_avg,
Comp_amount_corr_quali,
ICF_planting_date,
Slope,
#slope_cat.y,
#IQR_TF_tubura_client,
#IQR_plot_fert_quality,
IQR_soil_texture,
IQR_soil_color,
IQF_position_slope,
#Weed_species_A_combined,
#Weed_Overall_avg,
avg_ICR_mulch_perc_planti,
IQF_agzone,
#IQF_environment
) %>%
mutate(
across(where(is.character), as.factor),
# Combine Good + Acceptable
CA_typology_09 = as.character(CA_typology_09),
CA_typology_09 = ifelse(CA_typology_09 == "Good", "Acceptable", CA_typology_09),
CA_typology_09 = factor(CA_typology_09,
levels = c("Very Poor", "Poor", "Acceptable"))
)
cat("Total RF rows:", nrow(rf_data), "\n")
## Total RF rows: 473
# ======================
# Step 2: Train / Test split
# ======================
n <- nrow(rf_data)
train_indices <- sample(seq_len(n), size = train_frac * n)
train_data <- rf_data[train_indices, ]
test_data <- rf_data[-train_indices, ]
# ======================
# Step 3: Remove rows with NA
# ======================
train_data_clean <- train_data %>% filter(complete.cases(.))
test_data_clean <- test_data %>% filter(complete.cases(.))
cat("Training cases:", nrow(train_data_clean), "\n")
## Training cases: 400
cat("Test cases:", nrow(test_data_clean), "\n")
## Test cases: 70
# ======================
# Step 4: Random Forest (TRAIN ONLY)
# ======================
class_weights <- c(
"Very Poor" = 1,
"Poor" = 1,
"Acceptable" = acceptable_weight
)
set.seed(123)
rf_model <- randomForest(
CA_typology_09 ~ .,
data = train_data_clean,
ntree = 1000,
importance = TRUE,
classwt = class_weights
)
print(rf_model)
##
## Call:
## randomForest(formula = CA_typology_09 ~ ., data = train_data_clean, ntree = 1000, importance = TRUE, classwt = class_weights)
## Type of random forest: classification
## Number of trees: 1000
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 45.75%
## Confusion matrix:
## Very Poor Poor Acceptable class.error
## Very Poor 159 7 33 0.2010050
## Poor 38 13 28 0.8354430
## Acceptable 61 16 45 0.6311475
# Variable importance
varImpPlot(rf_model, type = 1, main = "Variable Importance (MDA)")
# ======================
# Step 5: Predict on INDEPENDENT TEST SET (probabilities)
# ======================
prob_test <- predict(rf_model, newdata = test_data_clean, type = "prob")
pred_test <- ifelse(
prob_test[, "Acceptable"] > accept_threshold,
"Acceptable",
colnames(prob_test)[apply(prob_test, 1, which.max)]
)
pred_test <- factor(pred_test, levels = levels(test_data_clean$CA_typology_09))
conf_mat <- table(
Actual = test_data_clean$CA_typology_09,
Predicted = pred_test
)
print(conf_mat)
## Predicted
## Actual Very Poor Poor Acceptable
## Very Poor 28 3 5
## Poor 8 4 2
## Acceptable 9 4 7
# ======================
# Step 6: Metrics (TEST DATA)
# ======================
total_cases <- sum(conf_mat)
actual_acceptable <- sum(conf_mat["Acceptable", ])
predicted_acceptable <- sum(conf_mat[, "Acceptable"])
true_acceptable <- conf_mat["Acceptable", "Acceptable"]
perc_actual_acceptable <- actual_acceptable / total_cases * 100
perc_farmers_targeted <- predicted_acceptable / total_cases * 100
precision_acceptable <- true_acceptable / predicted_acceptable * 100
recall_acceptable <- true_acceptable / actual_acceptable * 100
# ======================
# Variable Importance Report
# ======================
importance_mat <- randomForest::importance(rf_model, type = 1)
importance_df <- as.data.frame(importance_mat)
importance_df$Variable <- rownames(importance_df)
importance_df <- importance_df %>%
arrange(desc(MeanDecreaseAccuracy))
print(importance_df)
## MeanDecreaseAccuracy Variable
## IQF_agzone 27.268516 IQF_agzone
## Comp_amount_corr_quali 14.786359 Comp_amount_corr_quali
## ICF_planting_date 12.593533 ICF_planting_date
## Couch_Consit 11.781759 Couch_Consit
## avg_ICR_mulch_perc_planti 10.725758 avg_ICR_mulch_perc_planti
## IQF_position_slope 10.198440 IQF_position_slope
## Slope 9.174468 Slope
## IQR_soil_color 6.316761 IQR_soil_color
## IQR_soil_texture 3.402108 IQR_soil_texture
# ======================
# Per-class Variable Importance Report
# ======================
imp_full <- as.data.frame(randomForest::importance(rf_model))
imp_full$Variable <- rownames(imp_full)
# Reorder columns for readability
imp_full <- imp_full %>%
dplyr::select(Variable, everything())
print(imp_full)
## Variable Very Poor Poor
## Couch_Consit Couch_Consit 13.838714 -3.284431
## Comp_amount_corr_quali Comp_amount_corr_quali 18.523836 -1.123834
## ICF_planting_date ICF_planting_date 9.651886 6.674355
## Slope Slope 7.073606 4.898445
## IQR_soil_texture IQR_soil_texture 2.562901 -3.602057
## IQR_soil_color IQR_soil_color 4.549651 2.037030
## IQF_position_slope IQF_position_slope 5.843769 3.963753
## avg_ICR_mulch_perc_planti avg_ICR_mulch_perc_planti 14.740756 4.611885
## IQF_agzone IQF_agzone 25.731796 10.768075
## Acceptable MeanDecreaseAccuracy MeanDecreaseGini
## Couch_Consit 4.314735 11.781759 8.688993
## Comp_amount_corr_quali 2.484765 14.786359 51.234172
## ICF_planting_date 5.872505 12.593533 46.299473
## Slope 3.494318 9.174468 30.118431
## IQR_soil_texture 4.886926 3.402108 8.544853
## IQR_soil_color 3.786981 6.316761 13.800649
## IQF_position_slope 7.618268 10.198440 21.184165
## avg_ICR_mulch_perc_planti -4.427283 10.725758 44.354002
## IQF_agzone 4.433662 27.268516 36.120029
# Optional: export
# write.csv(importance_df, "variable_importance.csv", row.names = FALSE)
# ======================
# Step 7: Print summary
# ======================
cat("\n================ TEST SET MODEL SUMMARY ================\n")
##
## ================ TEST SET MODEL SUMMARY ================
cat("Train fraction:", train_frac, "\n")
## Train fraction: 0.85
cat("Acceptable weight:", acceptable_weight, "\n")
## Acceptable weight: 0.7
cat("Acceptable threshold:", accept_threshold, "\n\n")
## Acceptable threshold: 0.7
cat("Total Test Cases:", total_cases, "\n")
## Total Test Cases: 70
cat("Actual Acceptable (%):", round(perc_actual_acceptable, 2), "%\n")
## Actual Acceptable (%): 28.57 %
cat("Farmers Targeted (%):", round(perc_farmers_targeted, 2), "%\n")
## Farmers Targeted (%): 20 %
cat("Precision Acceptable (%):", round(precision_acceptable, 2), "%\n")
## Precision Acceptable (%): 50 %
cat("Recall Acceptable (%):", round(recall_acceptable, 2), "%\n")
## Recall Acceptable (%): 35 %
cat("======================================================\n")
## ======================================================
library(randomForest)
# Extract tree number 1
tree_1 <- getTree(rf_model, k = 2, labelVar = TRUE)
head(tree_1)
library(rpart)
library(rpart.plot)
tree_model <- rpart(
CA_typology_09 ~ .,
data = train_data_clean,
method = "class"
)
rpart.plot(tree_model)
jpeg("D:/Mes Donnees/OAF_CIRAD/CA_MoU/figures/regression_tree.jpg",
width = 2000,
height = 1500,
res = 300)
rpart.plot(tree_model)
dev.off()
## png
## 2
We will compare the farms on the top of performance (30% best average CA:CONV across the 6 seasons) with the lowest performer (30% bottom) for the different variables. Specially for the variables that have more clear impact: agzone mulch compost planting date Soil texture Rain